diff --git a/benchmarks/build_dataset/seed_tts_design/en/meta.lst b/benchmarks/build_dataset/seed_tts_design/en/meta.lst new file mode 100644 index 00000000000..7e364c2e517 --- /dev/null +++ b/benchmarks/build_dataset/seed_tts_design/en/meta.lst @@ -0,0 +1,20 @@ +vd001|||The quick brown fox jumps over the lazy dog.|A warm, friendly female voice with a slight American Midwest accent, speaking at a moderate pace with natural inflection. +vd002|||Welcome to the future of text-to-speech synthesis.|A deep, authoritative male news anchor voice, clear and professional with a measured cadence. +vd003|||The sunset painted the sky in brilliant shades of orange and pink.|A gentle elderly female voice, soft and wise, with a slight Southern American accent. +vd004|||Scientists have discovered a new species of deep-sea creature.|A young male voice with an Australian accent, curious and enthusiastic. +vd005|||Breaking news: a major climate summit opens today in Geneva.|A crisp female newsreader voice, neutral accent, confident and precise. +vd006|||In the beginning, there was darkness and silence across the universe.|A rich, dramatic bass male narrator voice, slow and deeply resonant. +vd007|||Come closer, I have something important to tell you.|A soft, intimate female voice, slightly whispery, warm and gentle. +vd008|||And they're off! The horses race toward the first turn at incredible speed.|An energetic male sports commentator, fast-paced and excited. +vd009|||Once upon a time, in a land far away, lived a very clever fox.|A light, playful voice with childlike enthusiasm, bright and clear. +vd010|||The ancient manuscript reveals secrets hidden for a thousand years.|A wise, measured elderly male voice, slow and deliberate, British English accent. +vd011|||Good evening, ladies and gentlemen, and welcome to our show.|A sophisticated female voice with a slight French accent speaking English, elegant and refined. +vd012|||System initialized. Running diagnostics. 
All systems nominal.|A clear, precise robotic-sounding voice, neutral and monotone with slight synthetic quality. +vd013|||I hear what you are saying, and it is completely understandable to feel that way.|A warm, empathetic female therapist voice, calm and reassuring, unhurried pace. +vd014|||Attention all units: proceed to grid reference seven-seven-alpha.|A firm, authoritative military male voice, clipped and commanding. +vd015|||Oh my goodness, you have to try this amazing new recipe I just found!|An enthusiastic, bubbly female voice, high energy and friendly. +vd016|||Dude, the waves were totally amazing out there today. Super happy about it!|A relaxed male voice with a California accent, casual and laid-back. +vd017|||The quarterly results exceed expectations across all major metrics.|A sharp, businesslike female voice, confident and efficient, fast-paced delivery. +vd018|||Chapter one. The morning sun filtered gently through the forest canopy.|A smooth, rich male audiobook narrator voice, expressive and engaging. +vd019|||To be or not to be, that is the question.|A theatrical female voice, dramatic and expressive, stage projection quality. +vd020|||And that is all for tonight. Stay well out there, everyone.|A warm, velvety male late-night radio DJ voice, smooth and intimate. diff --git a/benchmarks/build_dataset/seed_tts_smoke/en/meta.lst b/benchmarks/build_dataset/seed_tts_smoke/en/meta.lst new file mode 100644 index 00000000000..afe4bc8abcd --- /dev/null +++ b/benchmarks/build_dataset/seed_tts_smoke/en/meta.lst @@ -0,0 +1,20 @@ +smoke001|||The quick brown fox jumps over the lazy dog near the riverbank at sunset. +smoke002|||Welcome to the future of text-to-speech synthesis in production systems. +smoke003|||Yesterday the team finished rolling out the new authentication flow. +smoke004|||She walked carefully across the wet cobblestones, careful not to slip. +smoke005|||The conference call is scheduled for nine in the morning, Pacific time. 
+smoke006|||Please remember to save your work before closing the editor. +smoke007|||Two plus two equals four, but five hundred and forty three digits is long. +smoke008|||I would like a coffee with oat milk and a chocolate croissant please. +smoke009|||The library closes at eight on weekdays and six on Saturdays. +smoke010|||During the Renaissance, art and science flourished in European cities. +smoke011|||He whispered the secret word so quietly that no one else could hear. +smoke012|||Our flight departs from gate twenty three at eleven fifteen. +smoke013|||The storm knocked out power for six hours, but the backup generator kicked in. +smoke014|||Reading a good book on a rainy afternoon is one of life's great pleasures. +smoke015|||When the kettle whistled, she poured the hot water over the fresh tea leaves. +smoke016|||The algorithm runs in linear time, which is a big improvement over the previous approach. +smoke017|||In the distance, the mountains were shrouded in thick morning fog. +smoke018|||Our company reported record revenue for the fourth quarter of the fiscal year. +smoke019|||She explained the new policy in detail during the staff meeting this morning. +smoke020|||The children laughed and played in the garden until the sun began to set. diff --git a/benchmarks/qwen3-tts/README.md b/benchmarks/qwen3-tts/README.md deleted file mode 100644 index a1c2ebe12ff..00000000000 --- a/benchmarks/qwen3-tts/README.md +++ /dev/null @@ -1,107 +0,0 @@ -# Qwen3-TTS Benchmark - -Benchmarks for Qwen3-TTS text-to-speech models, comparing vLLM-Omni streaming serving against HuggingFace Transformers offline inference. - -## Prerequisites - -```bash -pip install matplotlib aiohttp soundfile numpy tqdm -pip install qwen_tts # for HF baseline -``` - -## Quick Start - -Run the full benchmark (vllm-omni + HF baseline) with a single command: - -```bash -cd benchmarks/qwen3-tts -bash run_benchmark.sh -``` - -Results (JSON + PNG plots) are saved to `results/`. 
- -### Common options - -```bash -# Only vllm-omni (skip HF baseline) -bash run_benchmark.sh --async-only - -# Only HF baseline -bash run_benchmark.sh --hf-only - -# Use a different model (e.g. 1.7B) -MODEL=Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice bash run_benchmark.sh --async-only - -# Use a Voice Clone model -MODEL=Qwen/Qwen3-TTS-12Hz-1.7B-Base TASK_TYPE=Base bash run_benchmark.sh --async-only - -# Use batch size 16 for higher throughput -BATCH_SIZE=16 bash run_benchmark.sh --async-only - -# Custom GPU, prompt count, concurrency levels -GPU_DEVICE=1 NUM_PROMPTS=20 CONCURRENCY="1 4" bash run_benchmark.sh -``` - -## Manual Steps - -### 1) Start the vLLM-Omni server - -```bash -CUDA_VISIBLE_DEVICES=0 python -m vllm_omni.entrypoints.cli.main serve \ - "Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice" \ - --omni --host 127.0.0.1 --port 8000 \ - --deploy-config vllm_omni/deploy/qwen3_tts.yaml \ - --stage-overrides '{"0":{"max_num_seqs":1,"gpu_memory_utilization":0.3,"max_num_batched_tokens":512},"1":{"max_num_seqs":1,"gpu_memory_utilization":0.3,"max_num_batched_tokens":8192}}' \ - --trust-remote-code -``` - -### 2) Run online serving benchmark - -```bash -python benchmarks/qwen3-tts/vllm_omni/bench_tts_serve.py \ - --port 8000 \ - --num-prompts 50 \ - --max-concurrency 1 4 10 \ - --config-name "async_chunk" \ - --result-dir results/ -``` - -### 3) Run HuggingFace baseline - -```bash -python benchmarks/qwen3-tts/transformers/bench_tts_hf.py \ - --model "Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice" \ - --num-prompts 50 \ - --gpu-device 0 \ - --result-dir results/ -``` - -### 4) Generate comparison plots - -```bash -python benchmarks/qwen3-tts/plot_results.py \ - --results results/bench_async_chunk_*.json results/bench_hf_transformers_*.json \ - --labels "vllm-omni" "hf_transformers" \ - --output results/comparison.png -``` - -## Batch-size presets - -The bench script loads the bundled production deploy (`vllm_omni/deploy/qwen3_tts.yaml`) and layers per-stage budgets on top via 
`--stage-overrides`, driven by the `BATCH_SIZE` env var. Each batch size picks compatible per-stage `max_num_seqs`, `max_num_batched_tokens`, and `gpu_memory_utilization` defaults: - -| `BATCH_SIZE` | Description | -|:--:|-------------| -| `1` (default) | Single-request processing (lowest latency) | -| `4` | Moderate-throughput concurrent processing | -| `16` | High-throughput concurrent processing | - -The 2-stage pipeline (Talker -> Code2Wav) runs with `async_chunk` streaming enabled via the prod deploy; the `SharedMemoryConnector` streams codec frames (25-frame chunks with 25-frame context overlap) between stages. - -The model is specified via the CLI `--model` flag (or `MODEL` env var), so the same bench script works for both the 0.6B and 1.7B model variants. - -## Metrics - -- **TTFP (Time to First Audio Packet)**: Time from request to first audio chunk (streaming latency) -- **E2E (End-to-End Latency)**: Total time from request to complete audio response -- **RTF (Real-Time Factor)**: E2E latency / audio duration. RTF < 1.0 means faster-than-real-time synthesis -- **Throughput**: Total audio seconds generated per wall-clock second diff --git a/benchmarks/qwen3-tts/plot_results.py b/benchmarks/qwen3-tts/plot_results.py deleted file mode 100644 index e750101e324..00000000000 --- a/benchmarks/qwen3-tts/plot_results.py +++ /dev/null @@ -1,254 +0,0 @@ -"""Plot Qwen3-TTS benchmark results. 
- -Generates comparison bar charts similar to the async_chunk design doc: -- TTFP (Time-to-First-Packet) across concurrency levels -- E2E latency across concurrency levels -- RTF (Real-Time Factor) across concurrency levels - -Usage: - # Compare two configs (async_chunk vs no_async_chunk): - python plot_results.py \ - --results results/bench_async_chunk_*.json results/bench_no_async_chunk_*.json \ - --labels "async_chunk" "no_async_chunk" \ - --output results/qwen3_tts_benchmark.png - - # Single config: - python plot_results.py \ - --results results/bench_async_chunk_*.json \ - --labels "async_chunk" \ - --output results/qwen3_tts_benchmark.png -""" - -import argparse -import json -from pathlib import Path - -import matplotlib.pyplot as plt -import numpy as np - - -def load_results(result_files: list[str]) -> list[list[dict]]: - """Load benchmark results from JSON files.""" - all_results = [] - for f in result_files: - with open(f) as fh: - data = json.load(fh) - all_results.append(data) - return all_results - - -def plot_comparison( - all_results: list[list[dict]], - labels: list[str], - output_path: str, - title_prefix: str = "Qwen3-TTS", -): - """Generate comparison bar charts.""" - n_configs = len(all_results) - - # Collect concurrency levels present in ALL configs (skip missing data) - all_concurrencies = [set(r["concurrency"] for r in results) for results in all_results] - concurrencies = sorted(set.union(*all_concurrencies)) - - # Build data arrays, using None for missing concurrency levels - ttfp_data = {label: [] for label in labels} - e2e_data = {label: [] for label in labels} - rtf_data = {label: [] for label in labels} - throughput_data = {label: [] for label in labels} - - for results, label in zip(all_results, labels): - conc_map = {r["concurrency"]: r for r in results} - for c in concurrencies: - r = conc_map.get(c) - ttfp_data[label].append(r["mean_ttfp_ms"] if r else None) - e2e_data[label].append(r["mean_e2e_ms"] if r else None) - 
rtf_data[label].append(r["mean_rtf"] if r else None) - throughput_data[label].append(r["audio_throughput"] if r else None) - - fig, axes = plt.subplots(2, 2, figsize=(14, 10)) - fig.suptitle(f"{title_prefix} Performance Benchmark", fontsize=16, fontweight="bold") - - x = np.arange(len(concurrencies)) - width = 0.35 if n_configs == 2 else 0.5 - if n_configs > 1: - offsets = np.linspace(-width / 2 * (n_configs - 1), width / 2 * (n_configs - 1), n_configs) - else: - offsets = [0] - - colors = ["#2196F3", "#FF5722", "#4CAF50", "#FFC107"] - - def plot_metric(ax, data_dict, ylabel, title, fmt=".1f"): - bars = [] - for i, (label, values) in enumerate(data_dict.items()): - # Replace None with 0 for plotting, but track which are missing - plot_values = [v if v is not None else 0 for v in values] - color = colors[i % len(colors)] - bar = ax.bar(x + offsets[i], plot_values, width, label=label, color=color, alpha=0.85) - bars.append(bar) - # Add value labels on bars (skip None/missing data) - max_val = max((v for v in values if v is not None), default=1) - for rect, val in zip(bar, values): - if val is not None and val > 0: - ax.text( - rect.get_x() + rect.get_width() / 2, - rect.get_height() + max_val * 0.02, - f"{val:{fmt}}", - ha="center", - va="bottom", - fontsize=9, - fontweight="bold", - ) - ax.set_xlabel("Concurrency", fontsize=12) - ax.set_ylabel(ylabel, fontsize=12) - ax.set_title(title, fontsize=13, fontweight="bold") - ax.set_xticks(x) - ax.set_xticklabels([str(c) for c in concurrencies]) - ax.legend(fontsize=10) - ax.grid(axis="y", alpha=0.3) - ax.set_axisbelow(True) - - plot_metric(axes[0, 0], ttfp_data, "TTFP (ms)", "Time to First Audio Packet (TTFP)") - plot_metric(axes[0, 1], e2e_data, "E2E Latency (ms)", "End-to-End Latency (E2E)") - plot_metric(axes[1, 0], rtf_data, "RTF", "Real-Time Factor (RTF)", fmt=".3f") - plot_metric(axes[1, 1], throughput_data, "Audio-sec / Wall-sec", "Audio Throughput", fmt=".2f") - - plt.tight_layout() - plt.savefig(output_path, 
dpi=150, bbox_inches="tight") - print(f"Plot saved to {output_path}") - plt.close() - - -def plot_single_summary(results: list[dict], label: str, output_path: str): - """Generate a single-config summary with percentile breakdown.""" - concurrencies = [r["concurrency"] for r in results] - - fig, axes = plt.subplots(1, 3, figsize=(16, 5)) - fig.suptitle(f"Qwen3-TTS Benchmark - {label}", fontsize=15, fontweight="bold") - - # TTFP breakdown - ax = axes[0] - means = [r["mean_ttfp_ms"] for r in results] - medians = [r["median_ttfp_ms"] for r in results] - p90s = [r["p90_ttfp_ms"] for r in results] - p99s = [r["p99_ttfp_ms"] for r in results] - x = np.arange(len(concurrencies)) - w = 0.2 - ax.bar(x - 1.5 * w, means, w, label="mean", color="#2196F3") - ax.bar(x - 0.5 * w, medians, w, label="median", color="#4CAF50") - ax.bar(x + 0.5 * w, p90s, w, label="p90", color="#FF9800") - ax.bar(x + 1.5 * w, p99s, w, label="p99", color="#F44336") - ax.set_xticks(x) - ax.set_xticklabels([str(c) for c in concurrencies]) - ax.set_xlabel("Concurrency") - ax.set_ylabel("TTFP (ms)") - ax.set_title("Time to First Audio Packet") - ax.legend(fontsize=9) - ax.grid(axis="y", alpha=0.3) - - # E2E breakdown - ax = axes[1] - means = [r["mean_e2e_ms"] for r in results] - medians = [r["median_e2e_ms"] for r in results] - p90s = [r["p90_e2e_ms"] for r in results] - p99s = [r["p99_e2e_ms"] for r in results] - ax.bar(x - 1.5 * w, means, w, label="mean", color="#2196F3") - ax.bar(x - 0.5 * w, medians, w, label="median", color="#4CAF50") - ax.bar(x + 0.5 * w, p90s, w, label="p90", color="#FF9800") - ax.bar(x + 1.5 * w, p99s, w, label="p99", color="#F44336") - ax.set_xticks(x) - ax.set_xticklabels([str(c) for c in concurrencies]) - ax.set_xlabel("Concurrency") - ax.set_ylabel("E2E Latency (ms)") - ax.set_title("End-to-End Latency") - ax.legend(fontsize=9) - ax.grid(axis="y", alpha=0.3) - - # RTF - ax = axes[2] - means = [r["mean_rtf"] for r in results] - medians = [r["median_rtf"] for r in results] - 
ax.bar(x - 0.15, means, 0.3, label="mean", color="#2196F3") - ax.bar(x + 0.15, medians, 0.3, label="median", color="#4CAF50") - ax.set_xticks(x) - ax.set_xticklabels([str(c) for c in concurrencies]) - ax.set_xlabel("Concurrency") - ax.set_ylabel("RTF") - ax.set_title("Real-Time Factor") - ax.legend(fontsize=9) - ax.grid(axis="y", alpha=0.3) - - plt.tight_layout() - plt.savefig(output_path, dpi=150, bbox_inches="tight") - print(f"Plot saved to {output_path}") - plt.close() - - -def print_comparison_table(all_results: list[list[dict]], labels: list[str]): - """Print a markdown-formatted comparison table.""" - concurrencies = sorted(set(r["concurrency"] for r in all_results[0])) - - print("\n## Benchmark Results\n") - header = "| Metric | Concurrency |" - sep = "| --- | --- |" - for label in labels: - header += f" {label} |" - sep += " --- |" - print(header) - print(sep) - - for metric, key, fmt in [ - ("TTFP (ms)", "mean_ttfp_ms", ".1f"), - ("E2E (ms)", "mean_e2e_ms", ".1f"), - ("RTF", "mean_rtf", ".3f"), - ("Throughput (audio-s/s)", "audio_throughput", ".2f"), - ]: - for c in concurrencies: - row = f"| {metric} | {c} |" - for results in all_results: - conc_map = {r["concurrency"]: r for r in results} - val = conc_map.get(c, {}).get(key, 0) - row += f" {val:{fmt}} |" - print(row) - - # Improvement calculation (only if 2 configs) - if len(all_results) == 2: - print(f"\n## Improvement ({labels[0]} vs {labels[1]})\n") - print("| Metric | Concurrency | Improvement |") - print("| --- | --- | --- |") - for metric, key in [("TTFP", "mean_ttfp_ms"), ("E2E", "mean_e2e_ms"), ("RTF", "mean_rtf")]: - for c in concurrencies: - m0 = {r["concurrency"]: r for r in all_results[0]} - m1 = {r["concurrency"]: r for r in all_results[1]} - v0 = m0.get(c, {}).get(key, 0) - v1 = m1.get(c, {}).get(key, 0) - if v1 > 0: - pct = (v1 - v0) / v1 * 100 - print(f"| {metric} | {c} | {pct:+.1f}% |") - - -def parse_args(): - parser = argparse.ArgumentParser(description="Plot Qwen3-TTS benchmark 
results") - parser.add_argument( - "--results", type=str, nargs="+", required=True, help="Path(s) to result JSON files (one per config)" - ) - parser.add_argument( - "--labels", type=str, nargs="+", required=True, help="Labels for each config (must match --results count)" - ) - parser.add_argument("--output", type=str, default="results/qwen3_tts_benchmark.png", help="Output image path") - parser.add_argument("--title", type=str, default="Qwen3-TTS", help="Title prefix for the plot") - return parser.parse_args() - - -if __name__ == "__main__": - args = parse_args() - assert len(args.results) == len(args.labels), "--results and --labels must have the same count" - - all_results = load_results(args.results) - print_comparison_table(all_results, args.labels) - - Path(args.output).parent.mkdir(parents=True, exist_ok=True) - - if len(all_results) == 1: - plot_single_summary(all_results[0], args.labels[0], args.output) - else: - plot_comparison(all_results, args.labels, args.output, title_prefix=args.title) diff --git a/benchmarks/qwen3-tts/results/.gitignore b/benchmarks/qwen3-tts/results/.gitignore deleted file mode 100644 index 5b6759ef717..00000000000 --- a/benchmarks/qwen3-tts/results/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -# Benchmark results are machine-specific - do not commit -* -!.gitignore diff --git a/benchmarks/qwen3-tts/run_benchmark.sh b/benchmarks/qwen3-tts/run_benchmark.sh deleted file mode 100755 index 8c3e46903ca..00000000000 --- a/benchmarks/qwen3-tts/run_benchmark.sh +++ /dev/null @@ -1,285 +0,0 @@ -#!/bin/bash -# Qwen3-TTS Benchmark Runner -# -# Compares vllm-omni streaming serving vs HuggingFace transformers offline inference. -# Produces JSON results and comparison plots. 
-# -# Usage: -# # Full comparison (vllm-omni + HF): -# bash run_benchmark.sh -# -# # Only vllm-omni async_chunk config: -# bash run_benchmark.sh --async-only -# -# # Only HuggingFace baseline: -# bash run_benchmark.sh --hf-only -# -# # vllm-omni only (skip HF): -# bash run_benchmark.sh --skip-hf -# -# # Custom settings: -# GPU_DEVICE=1 NUM_PROMPTS=20 CONCURRENCY="1 4" bash run_benchmark.sh -# -# # Use 1.7B model: -# MODEL=Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice bash run_benchmark.sh --async-only -# -# # Use Voice Clone model -# MODEL=Qwen/Qwen3-TTS-12Hz-1.7B-Base TASK_TYPE=Base bash run_benchmark.sh --async-only -# -# # Use batch_size=4: -# BATCH_SIZE=4 bash run_benchmark.sh --async-only -# -# Environment variables: -# GPU_DEVICE - GPU index to use (default: 0) -# NUM_PROMPTS - Number of prompts per concurrency level (default: 50) -# CONCURRENCY - Space-separated concurrency levels (default: "1 4 10") -# MODEL - Model name (default: Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice) -# PORT - Server port (default: 8000) -# BATCH_SIZE - Per-stage ``max_num_seqs`` for both talker and code2wav (default: 1) -# GPU_MEM_TALKER - gpu_memory_utilization for talker stage (default: 0.3 at bs=1, else 0.2) -# GPU_MEM_CODE2WAV - gpu_memory_utilization for code2wav stage (default: 0.3 at bs=1, else 0.2) -# TASK_TYPE - Task type: CustomVoice, VoiceDesign, Base (default: CustomVoice) - -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -PROJECT_ROOT="$(cd "${SCRIPT_DIR}/../.." 
&& pwd)" - -# Defaults -GPU_DEVICE="${GPU_DEVICE:-0}" -NUM_PROMPTS="${NUM_PROMPTS:-50}" -CONCURRENCY="${CONCURRENCY:-1 4 10}" -MODEL="${MODEL:-Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice}" -PORT="${PORT:-8000}" -BATCH_SIZE="${BATCH_SIZE:-1}" -DEFAULT_MEM=$([ "${BATCH_SIZE}" = "1" ] && echo "0.3" || echo "0.2") -GPU_MEM_TALKER="${GPU_MEM_TALKER:-${DEFAULT_MEM}}" -GPU_MEM_CODE2WAV="${GPU_MEM_CODE2WAV:-${DEFAULT_MEM}}" -NUM_WARMUPS="${NUM_WARMUPS:-3}" -DEPLOY_CONFIG="vllm_omni/deploy/qwen3_tts.yaml" -RESULT_DIR="${SCRIPT_DIR}/results" -TIMESTAMP="$(date +%Y%m%d_%H%M%S)" -TASK_TYPE="${TASK_TYPE:-CustomVoice}" - -# Build --stage-overrides JSON from BATCH_SIZE + GPU_MEM_*. -STAGE_OVERRIDES=$( - BATCH_SIZE="${BATCH_SIZE}" \ - GPU_MEM_TALKER="${GPU_MEM_TALKER}" \ - GPU_MEM_CODE2WAV="${GPU_MEM_CODE2WAV}" \ - python - <<'PYEOF' -import json, os -bs = int(os.environ["BATCH_SIZE"]) -mem_t = float(os.environ["GPU_MEM_TALKER"]) -mem_c = float(os.environ["GPU_MEM_CODE2WAV"]) -# Prefill budget grows with batch size on both stages. 
-talker_batched = 512 if bs <= 4 else 4096 -code2wav_batched = 8192 if bs <= 4 else 32768 -print(json.dumps({ - "0": {"max_num_seqs": bs, "gpu_memory_utilization": mem_t, "max_num_batched_tokens": talker_batched}, - "1": {"max_num_seqs": bs, "gpu_memory_utilization": mem_c, "max_num_batched_tokens": code2wav_batched}, -})) -PYEOF -) - -# Parse args -RUN_ASYNC=true -RUN_HF=true -for arg in "$@"; do - case "$arg" in - --async-only) RUN_HF=false ;; - --hf-only) RUN_ASYNC=false ;; - --skip-hf) RUN_HF=false ;; - esac -done - -mkdir -p "${RESULT_DIR}" - -echo "============================================================" -echo " Qwen3-TTS Benchmark" -echo "============================================================" -echo " GPU: ${GPU_DEVICE}" -echo " Model: ${MODEL}" -echo " Prompts: ${NUM_PROMPTS}" -echo " Concurrency: ${CONCURRENCY}" -echo " Port: ${PORT}" -echo " Deploy config: ${DEPLOY_CONFIG}" -echo " Batch size: ${BATCH_SIZE}" -echo " GPU mem T/C: ${GPU_MEM_TALKER} / ${GPU_MEM_CODE2WAV}" -echo " Results: ${RESULT_DIR}" -echo " Task type: ${TASK_TYPE}" -echo "============================================================" - -# Start server and wait for it to be ready -start_server() { - local config_name="$1" - local log_file="${RESULT_DIR}/server_${config_name}_${TIMESTAMP}.log" - - echo "" - echo "Starting server with config: ${config_name}" - echo " Deploy config: ${DEPLOY_CONFIG}" - echo " Stage overrides: ${STAGE_OVERRIDES}" - echo " Log file: ${log_file}" - - VLLM_WORKER_MULTIPROC_METHOD=spawn \ - CUDA_VISIBLE_DEVICES="${GPU_DEVICE}" \ - python -m vllm_omni.entrypoints.cli.main serve "${MODEL}" \ - --omni \ - --host 127.0.0.1 \ - --port "${PORT}" \ - --deploy-config "${DEPLOY_CONFIG}" \ - --stage-overrides "${STAGE_OVERRIDES}" \ - --stage-init-timeout 120 \ - --trust-remote-code \ - --disable-log-stats \ - > "${log_file}" 2>&1 & - - SERVER_PID=$! - echo " Server PID: ${SERVER_PID}" - - # Wait for server to be ready - echo " Waiting for server to be ready..." 
- local max_wait=300 - local waited=0 - while [ ${waited} -lt ${max_wait} ]; do - if curl -sf "http://127.0.0.1:${PORT}/v1/models" > /dev/null 2>&1; then - echo " Server is ready! (waited ${waited}s)" - return 0 - fi - # Check if process is still alive - if ! kill -0 ${SERVER_PID} 2>/dev/null; then - echo " ERROR: Server process died. Check log: ${log_file}" - tail -20 "${log_file}" - return 1 - fi - sleep 2 - waited=$((waited + 2)) - done - - echo " ERROR: Server did not start within ${max_wait}s. Check log: ${log_file}" - kill ${SERVER_PID} 2>/dev/null || true - return 1 -} - -# Stop the server -stop_server() { - if [ -n "${SERVER_PID:-}" ]; then - echo " Stopping server (PID: ${SERVER_PID})..." - kill ${SERVER_PID} 2>/dev/null || true - wait ${SERVER_PID} 2>/dev/null || true - # Kill any remaining child processes on the port - local pids - pids=$(lsof -ti:${PORT} 2>/dev/null || true) - if [ -n "${pids}" ]; then - echo " Cleaning up remaining processes on port ${PORT}..." - echo "${pids}" | xargs kill -9 2>/dev/null || true - fi - echo " Server stopped." 
- SERVER_PID="" - fi -} - -# Cleanup on exit -trap 'stop_server' EXIT - -# Run benchmark for a given config -run_bench() { - local config_name="$1" - - echo "" - echo "============================================================" - echo " Benchmarking: ${config_name}" - echo "============================================================" - - start_server "${config_name}" - - # Convert concurrency string to args - local conc_args="" - for c in ${CONCURRENCY}; do - conc_args="${conc_args} ${c}" - done - - cd "${PROJECT_ROOT}" - python "${SCRIPT_DIR}/vllm_omni/bench_tts_serve.py" \ - --host 127.0.0.1 \ - --port "${PORT}" \ - --num-prompts "${NUM_PROMPTS}" \ - --max-concurrency ${conc_args} \ - --num-warmups "${NUM_WARMUPS}" \ - --config-name "${config_name}" \ - --result-dir "${RESULT_DIR}" \ - --task-type "${TASK_TYPE}" - - stop_server - - # Allow GPU memory to settle - sleep 5 -} - -# Run vllm-omni benchmark -if [ "${RUN_ASYNC}" = true ]; then - run_bench "async_chunk" -fi - -# Run HuggingFace baseline benchmark -if [ "${RUN_HF}" = true ]; then - echo "" - echo "============================================================" - echo " Benchmarking: HuggingFace transformers (offline)" - echo "============================================================" - - cd "${PROJECT_ROOT}" - python "${SCRIPT_DIR}/transformers/bench_tts_hf.py" \ - --model "${MODEL}" \ - --num-prompts "${NUM_PROMPTS}" \ - --num-warmups "${NUM_WARMUPS}" \ - --gpu-device "${GPU_DEVICE}" \ - --config-name "hf_transformers" \ - --result-dir "${RESULT_DIR}" \ - --task-type "${TASK_TYPE}" - - # Allow GPU memory to settle - sleep 5 -fi - -# Plot results -echo "" -echo "============================================================" -echo " Generating plots..." 
-echo "============================================================" - -RESULT_FILES="" -LABELS="" - -if [ "${RUN_ASYNC}" = true ]; then - ASYNC_FILE=$(ls -t "${RESULT_DIR}"/bench_async_chunk_*.json 2>/dev/null | head -1) - if [ -n "${ASYNC_FILE}" ]; then - RESULT_FILES="${ASYNC_FILE}" - LABELS="async_chunk" - fi -fi - -if [ "${RUN_HF}" = true ]; then - HF_FILE=$(ls -t "${RESULT_DIR}"/bench_hf_transformers_*.json 2>/dev/null | head -1) - if [ -n "${HF_FILE}" ]; then - if [ -n "${RESULT_FILES}" ]; then - RESULT_FILES="${RESULT_FILES} ${HF_FILE}" - LABELS="${LABELS} hf_transformers" - else - RESULT_FILES="${HF_FILE}" - LABELS="hf_transformers" - fi - fi -fi - -if [ -n "${RESULT_FILES}" ]; then - python "${SCRIPT_DIR}/plot_results.py" \ - --results ${RESULT_FILES} \ - --labels ${LABELS} \ - --output "${RESULT_DIR}/qwen3_tts_benchmark_${TIMESTAMP}.png" -fi - -echo "" -echo "============================================================" -echo " Benchmark complete!" -echo " Results: ${RESULT_DIR}" -echo "============================================================" diff --git a/benchmarks/qwen3-tts/transformers/bench_tts_hf.py b/benchmarks/qwen3-tts/transformers/bench_tts_hf.py deleted file mode 100644 index ed04ee264c4..00000000000 --- a/benchmarks/qwen3-tts/transformers/bench_tts_hf.py +++ /dev/null @@ -1,301 +0,0 @@ -"""Benchmark Qwen3-TTS using HuggingFace transformers (qwen_tts library). - -Measures E2E latency, RTF, and audio duration for offline (non-serving) inference. -Results are saved in the same JSON format as bench_tts_serve.py for unified plotting. 
- -Usage: - python bench_tts_hf.py \ - --model Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice \ - --num-prompts 50 \ - --num-warmups 3 \ - --gpu-device 0 \ - --result-dir results/ -""" - -import argparse -import json -import time -from dataclasses import asdict, dataclass, field -from datetime import datetime -from pathlib import Path - -import numpy as np -import soundfile as sf -import torch - -PROMPTS = [ - "Hello, welcome to the voice synthesis benchmark test.", - "She said she would be here by noon, but nobody showed up.", - "The quick brown fox jumps over the lazy dog near the riverbank.", - "I can't believe how beautiful the sunset looks from up here on the mountain.", - "Please remember to bring your identification documents to the appointment tomorrow morning.", - "Have you ever wondered what it would be like to travel through time and visit ancient civilizations?", - "The restaurant on the corner serves the best pasta I have ever tasted in my entire life.", - "After the meeting, we should discuss the quarterly results and plan for the next phase.", - "Learning a new language takes patience, practice, and a genuine curiosity about other cultures.", - "The train leaves at half past seven, so we need to arrive at the station before then.", - "Could you please turn down the music a little bit, I'm trying to concentrate on my work.", - "It was a dark and stormy night when the old lighthouse keeper heard a knock at the door.", -] - -REF_AUDIO = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-TTS-Repo/clone_2.wav" -REF_TEXT = "Okay. Yeah. I resent you. I love you. I respect you. But you know what? You blew it! And thanks to you." -INSTRUCT = "Speak in an incredulous tone, but with a hint of panic beginning to creep into your voice." 
- - -@dataclass -class BenchmarkResult: - config_name: str = "" - concurrency: int = 1 # always 1 for offline - num_prompts: int = 0 - completed: int = 0 - failed: int = 0 - duration_s: float = 0.0 - # TTFP stats - not applicable for HF offline, set to E2E for compatibility - mean_ttfp_ms: float = 0.0 - median_ttfp_ms: float = 0.0 - std_ttfp_ms: float = 0.0 - p90_ttfp_ms: float = 0.0 - p95_ttfp_ms: float = 0.0 - p99_ttfp_ms: float = 0.0 - # E2E stats (ms) - mean_e2e_ms: float = 0.0 - median_e2e_ms: float = 0.0 - std_e2e_ms: float = 0.0 - p90_e2e_ms: float = 0.0 - p95_e2e_ms: float = 0.0 - p99_e2e_ms: float = 0.0 - # RTF stats - mean_rtf: float = 0.0 - median_rtf: float = 0.0 - std_rtf: float = 0.0 - p99_rtf: float = 0.0 - # Audio stats - mean_audio_duration_s: float = 0.0 - total_audio_duration_s: float = 0.0 - audio_throughput: float = 0.0 - request_throughput: float = 0.0 - # Per-request details - per_request: list = field(default_factory=list) - - -def generate_audio(model, prompt: str, args): - if args.task_type == "Base": - return model.generate_voice_clone( - text=prompt, - language=args.language, - ref_audio=REF_AUDIO, - ref_text=REF_TEXT, - ) - - if args.task_type == "VoiceDesign": - return model.generate_voice_design( - text=prompt, - language=args.language, - instruct=INSTRUCT, - ) - - return model.generate_custom_voice( - text=prompt, - language=args.language, - speaker=args.voice, - ) - - -def run_benchmark(args): - from qwen_tts import Qwen3TTSModel - - device = f"cuda:{args.gpu_device}" - print(f"Loading model: {args.model} on {device}") - model = Qwen3TTSModel.from_pretrained( - args.model, - device_map=device, - dtype=torch.bfloat16, - ) - print("Model loaded.") - - # Build prompt list - prompts = [PROMPTS[i % len(PROMPTS)] for i in range(args.num_prompts)] - - # Warmup - if args.num_warmups > 0: - print(f"Warming up with {args.num_warmups} requests...") - for i in range(args.num_warmups): - p = PROMPTS[i % len(PROMPTS)] - wavs, sr = 
generate_audio(model, p, args) - # Sync GPU - torch.cuda.synchronize(device) - print("Warmup done.") - - # Benchmark - print(f"Running {args.num_prompts} requests sequentially...") - e2e_times = [] - rtfs = [] - audio_durations = [] - per_request = [] - failed = 0 - - audio_dir = None - if args.save_audio: - audio_dir = Path(args.result_dir) / "audio_hf" - audio_dir.mkdir(parents=True, exist_ok=True) - - total_start = time.perf_counter() - - for i, prompt in enumerate(prompts): - try: - torch.cuda.synchronize(device) - st = time.perf_counter() - - wavs, sr = generate_audio(model, prompt, args) - - torch.cuda.synchronize(device) - elapsed = time.perf_counter() - st - - # Compute audio duration - audio_samples = wavs[0] - if isinstance(audio_samples, torch.Tensor): - audio_samples = audio_samples.cpu().numpy() - audio_dur = len(audio_samples) / sr - - rtf = elapsed / audio_dur if audio_dur > 0 else 0.0 - - e2e_times.append(elapsed) - rtfs.append(rtf) - audio_durations.append(audio_dur) - per_request.append( - { - "e2e_ms": elapsed * 1000, - "ttfp_ms": elapsed * 1000, # no streaming, TTFP = E2E - "rtf": rtf, - "audio_duration_s": audio_dur, - "prompt": prompt, - } - ) - - if audio_dir: - sf.write(str(audio_dir / f"output_{i:04d}.wav"), audio_samples, sr) - - if (i + 1) % 10 == 0 or i == 0: - print( - f" [{i + 1}/{args.num_prompts}] e2e={elapsed * 1000:.0f}ms rtf={rtf:.3f} audio={audio_dur:.2f}s" - ) - - except Exception as e: - print(f" [{i + 1}/{args.num_prompts}] FAILED: {e}") - failed += 1 - - total_duration = time.perf_counter() - total_start - completed = len(e2e_times) - - # Compute stats - result = BenchmarkResult( - config_name=args.config_name, - concurrency=1, - num_prompts=args.num_prompts, - completed=completed, - failed=failed, - duration_s=total_duration, - ) - - if e2e_times: - e2e_ms = [t * 1000 for t in e2e_times] - - result.mean_e2e_ms = float(np.mean(e2e_ms)) - result.median_e2e_ms = float(np.median(e2e_ms)) - result.std_e2e_ms = 
float(np.std(e2e_ms)) - result.p90_e2e_ms = float(np.percentile(e2e_ms, 90)) - result.p95_e2e_ms = float(np.percentile(e2e_ms, 95)) - result.p99_e2e_ms = float(np.percentile(e2e_ms, 99)) - - # For HF offline, TTFP = E2E (no streaming) - result.mean_ttfp_ms = result.mean_e2e_ms - result.median_ttfp_ms = result.median_e2e_ms - result.std_ttfp_ms = result.std_e2e_ms - result.p90_ttfp_ms = result.p90_e2e_ms - result.p95_ttfp_ms = result.p95_e2e_ms - result.p99_ttfp_ms = result.p99_e2e_ms - - result.mean_rtf = float(np.mean(rtfs)) - result.median_rtf = float(np.median(rtfs)) - result.std_rtf = float(np.std(rtfs)) - result.p99_rtf = float(np.percentile(rtfs, 99)) - - result.mean_audio_duration_s = float(np.mean(audio_durations)) - result.total_audio_duration_s = float(np.sum(audio_durations)) - result.audio_throughput = result.total_audio_duration_s / total_duration - result.request_throughput = completed / total_duration - result.per_request = per_request - - # Print summary in standardized performance template - W = 50 - print("") - print(f"{'=' * W}") - print(f"{'Serving Benchmark Result':^{W}}") - print(f"{'=' * W}") - print(f"{'Successful requests:':<40}{completed:<10}") - print(f"{'Failed requests:':<40}{failed:<10}") - print(f"{'Maximum request concurrency:':<40}{1:<10}") - print(f"{'Benchmark duration (s):':<40}{total_duration:<10.2f}") - print(f"{'Request throughput (req/s):':<40}{result.request_throughput:<10.2f}") - print(f"{'-' * W}") - print(f"{'End-to-end Latency':^{W}}") - print(f"{'-' * W}") - print(f"{'Mean E2EL (ms):':<40}{result.mean_e2e_ms:<10.2f}") - print(f"{'Median E2EL (ms):':<40}{result.median_e2e_ms:<10.2f}") - print(f"{'P99 E2EL (ms):':<40}{result.p99_e2e_ms:<10.2f}") - print(f"{'=' * W}") - print(f"{'Audio Result':^{W}}") - print(f"{'=' * W}") - print(f"{'Total audio duration generated (s):':<40}{result.total_audio_duration_s:<10.2f}") - print(f"{'Audio throughput (audio duration/s):':<40}{result.audio_throughput:<10.2f}") - print(f"{'-' * 
W}") - print(f"{'Time to First Packet':^{W}}") - print(f"{'-' * W}") - print(f"{'Mean AUDIO_TTFP (ms):':<40}{result.mean_ttfp_ms:<10.2f}") - print(f"{'Median AUDIO_TTFP (ms):':<40}{result.median_ttfp_ms:<10.2f}") - print(f"{'P99 AUDIO_TTFP (ms):':<40}{result.p99_ttfp_ms:<10.2f}") - print(f"{'-' * W}") - print(f"{'Real Time Factor':^{W}}") - print(f"{'-' * W}") - print(f"{'Mean AUDIO_RTF:':<40}{result.mean_rtf:<10.3f}") - print(f"{'Median AUDIO_RTF:':<40}{result.median_rtf:<10.3f}") - print(f"{'P99 AUDIO_RTF:':<40}{result.p99_rtf:<10.3f}") - print(f"{'=' * W}") - print("") - - # Save results (as a list with single concurrency=1 entry, matching serve format) - result_dir = Path(args.result_dir) - result_dir.mkdir(parents=True, exist_ok=True) - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - result_file = result_dir / f"bench_{args.config_name}_{timestamp}.json" - - with open(result_file, "w") as f: - json.dump([asdict(result)], f, indent=2) - print(f"Results saved to {result_file}") - - return result - - -def parse_args(): - parser = argparse.ArgumentParser(description="Qwen3-TTS HuggingFace Benchmark") - parser.add_argument( - "--model", type=str, default="Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice", help="HuggingFace model name or path" - ) - parser.add_argument("--num-prompts", type=int, default=50) - parser.add_argument("--num-warmups", type=int, default=3) - parser.add_argument("--gpu-device", type=int, default=0) - parser.add_argument("--voice", type=str, default="Vivian") - parser.add_argument("--language", type=str, default="English") - parser.add_argument("--task-type", type=str, default="CustomVoice", choices=["CustomVoice", "VoiceDesign", "Base"]) - parser.add_argument( - "--config-name", type=str, default="hf_transformers", help="Label for this config (used in filenames)" - ) - parser.add_argument("--result-dir", type=str, default="results") - parser.add_argument("--save-audio", action="store_true", help="Save generated audio files") - return 
parser.parse_args() - - -if __name__ == "__main__": - args = parse_args() - run_benchmark(args) diff --git a/benchmarks/qwen3-tts/vllm_omni/bench_async_chunk.py b/benchmarks/qwen3-tts/vllm_omni/bench_async_chunk.py deleted file mode 100644 index 3497ae82152..00000000000 --- a/benchmarks/qwen3-tts/vllm_omni/bench_async_chunk.py +++ /dev/null @@ -1,301 +0,0 @@ -"""Benchmark comparing async_chunk on vs off for Qwen3-TTS. - -Measures TTFP (Time-to-First-Packet), E2E latency, and RTF across -concurrency levels for both async_chunk modes. Saves results as JSON. - -Usage: - # Run against a server already serving with a given config: - python bench_async_chunk.py \ - --host 127.0.0.1 --port 8000 \ - --config-name async_chunk_on \ - --num-prompts 50 \ - --max-concurrency 1 10 \ - --result-dir results/ -""" - -import argparse -import asyncio -import json -import time -from dataclasses import asdict, dataclass, field -from datetime import datetime -from pathlib import Path - -import aiohttp -import numpy as np -from tqdm.asyncio import tqdm - -PROMPTS = [ - "Hello, welcome to the voice synthesis benchmark test.", - "She said she would be here by noon, but nobody showed up.", - "The quick brown fox jumps over the lazy dog near the riverbank.", - "I can't believe how beautiful the sunset looks from up here on the mountain.", - "Please remember to bring your identification documents to the appointment tomorrow morning.", - "Have you ever wondered what it would be like to travel through time and visit ancient civilizations?", - "The restaurant on the corner serves the best pasta I have ever tasted in my entire life.", - "After the meeting, we should discuss the quarterly results and plan for the next phase.", - "Learning a new language takes patience, practice, and a genuine curiosity about other cultures.", - "The train leaves at half past seven, so we need to arrive at the station before then.", - "Could you please turn down the music a little bit, I'm trying to concentrate on 
my work.", - "It was a dark and stormy night when the old lighthouse keeper heard a knock at the door.", -] - - -@dataclass -class RequestResult: - success: bool = False - ttfp: float = 0.0 - e2e: float = 0.0 - audio_bytes: int = 0 - audio_duration: float = 0.0 - rtf: float = 0.0 - prompt: str = "" - error: str = "" - - -@dataclass -class BenchmarkResult: - config_name: str = "" - concurrency: int = 0 - num_prompts: int = 0 - completed: int = 0 - failed: int = 0 - duration_s: float = 0.0 - mean_ttfp_ms: float = 0.0 - median_ttfp_ms: float = 0.0 - std_ttfp_ms: float = 0.0 - p90_ttfp_ms: float = 0.0 - p95_ttfp_ms: float = 0.0 - p99_ttfp_ms: float = 0.0 - mean_e2e_ms: float = 0.0 - median_e2e_ms: float = 0.0 - std_e2e_ms: float = 0.0 - p90_e2e_ms: float = 0.0 - p95_e2e_ms: float = 0.0 - p99_e2e_ms: float = 0.0 - mean_rtf: float = 0.0 - median_rtf: float = 0.0 - std_rtf: float = 0.0 - mean_audio_duration_s: float = 0.0 - total_audio_duration_s: float = 0.0 - audio_throughput: float = 0.0 - request_throughput: float = 0.0 - per_request: list = field(default_factory=list) - - -def pcm_bytes_to_duration(num_bytes: int, sample_rate: int = 24000, sample_width: int = 2) -> float: - return num_bytes / sample_width / sample_rate - - -async def send_tts_request( - session: aiohttp.ClientSession, - api_url: str, - prompt: str, - voice: str = "vivian", - language: str = "English", - stream: bool = True, - pbar: tqdm | None = None, -) -> RequestResult: - payload = { - "input": prompt, - "voice": voice, - "language": language, - "stream": stream, - "response_format": "pcm", - } - - result = RequestResult(prompt=prompt) - st = time.perf_counter() - - try: - async with session.post(api_url, json=payload) as response: - if response.status != 200: - result.error = f"HTTP {response.status}: {await response.text()}" - return result - - first_chunk = True - total_bytes = 0 - - async for chunk in response.content.iter_any(): - if first_chunk and len(chunk) > 0: - result.ttfp = 
time.perf_counter() - st - first_chunk = False - total_bytes += len(chunk) - - result.e2e = time.perf_counter() - st - result.audio_bytes = total_bytes - result.audio_duration = pcm_bytes_to_duration(total_bytes) - if result.audio_duration > 0: - result.rtf = result.e2e / result.audio_duration - result.success = True - - except Exception as e: - result.error = str(e) - result.e2e = time.perf_counter() - st - - if pbar: - pbar.update(1) - return result - - -async def run_benchmark( - host: str, - port: int, - num_prompts: int, - max_concurrency: int, - num_warmups: int = 3, - voice: str = "vivian", - language: str = "English", - stream: bool = True, -) -> BenchmarkResult: - api_url = f"http://{host}:{port}/v1/audio/speech" - - connector = aiohttp.TCPConnector(limit=max_concurrency, limit_per_host=max_concurrency, keepalive_timeout=60) - session = aiohttp.ClientSession(connector=connector, timeout=aiohttp.ClientTimeout(total=600)) - - if num_warmups > 0: - print(f" Warming up with {num_warmups} requests...") - warmup_tasks = [ - send_tts_request(session, api_url, PROMPTS[i % len(PROMPTS)], voice, language, stream) - for i in range(num_warmups) - ] - await asyncio.gather(*warmup_tasks) - print(" Warmup done.") - - request_prompts = [PROMPTS[i % len(PROMPTS)] for i in range(num_prompts)] - - print(f" Running {num_prompts} requests with concurrency={max_concurrency}...") - semaphore = asyncio.Semaphore(max_concurrency) - pbar = tqdm(total=num_prompts, desc=f" concurrency={max_concurrency}") - - async def limited_request(prompt): - async with semaphore: - return await send_tts_request(session, api_url, prompt, voice, language, stream, pbar) - - start_time = time.perf_counter() - tasks = [asyncio.create_task(limited_request(p)) for p in request_prompts] - results: list[RequestResult] = await asyncio.gather(*tasks) - duration = time.perf_counter() - start_time - pbar.close() - - await session.close() - - successful = [r for r in results if r.success] - failed = [r for r in 
results if not r.success] - - bench = BenchmarkResult( - concurrency=max_concurrency, - num_prompts=num_prompts, - completed=len(successful), - failed=len(failed), - duration_s=duration, - ) - - if successful: - ttfps = [r.ttfp * 1000 for r in successful] - e2es = [r.e2e * 1000 for r in successful] - rtfs = [r.rtf for r in successful] - audio_durs = [r.audio_duration for r in successful] - - bench.mean_ttfp_ms = float(np.mean(ttfps)) - bench.median_ttfp_ms = float(np.median(ttfps)) - bench.std_ttfp_ms = float(np.std(ttfps)) - bench.p90_ttfp_ms = float(np.percentile(ttfps, 90)) - bench.p95_ttfp_ms = float(np.percentile(ttfps, 95)) - bench.p99_ttfp_ms = float(np.percentile(ttfps, 99)) - - bench.mean_e2e_ms = float(np.mean(e2es)) - bench.median_e2e_ms = float(np.median(e2es)) - bench.std_e2e_ms = float(np.std(e2es)) - bench.p90_e2e_ms = float(np.percentile(e2es, 90)) - bench.p95_e2e_ms = float(np.percentile(e2es, 95)) - bench.p99_e2e_ms = float(np.percentile(e2es, 99)) - - bench.mean_rtf = float(np.mean(rtfs)) - bench.median_rtf = float(np.median(rtfs)) - bench.std_rtf = float(np.std(rtfs)) - - bench.mean_audio_duration_s = float(np.mean(audio_durs)) - bench.total_audio_duration_s = float(np.sum(audio_durs)) - bench.audio_throughput = bench.total_audio_duration_s / duration - bench.request_throughput = len(successful) / duration - - bench.per_request = [ - { - "ttfp_ms": r.ttfp * 1000, - "e2e_ms": r.e2e * 1000, - "rtf": r.rtf, - "audio_duration_s": r.audio_duration, - "prompt": r.prompt, - } - for r in successful - ] - - print(f"\n{'=' * 60}") - print(f" Concurrency: {max_concurrency} | Completed: {bench.completed} | Failed: {bench.failed}") - print(f" Duration: {duration:.2f}s | Throughput: {bench.request_throughput:.2f} req/s") - print( - f" TTFP (ms): mean={bench.mean_ttfp_ms:.1f} median={bench.median_ttfp_ms:.1f}" - f" p90={bench.p90_ttfp_ms:.1f} p99={bench.p99_ttfp_ms:.1f}" - ) - print( - f" E2E (ms): mean={bench.mean_e2e_ms:.1f} median={bench.median_e2e_ms:.1f}" 
- f" p90={bench.p90_e2e_ms:.1f} p99={bench.p99_e2e_ms:.1f}" - ) - print(f" RTF: mean={bench.mean_rtf:.3f} median={bench.median_rtf:.3f}") - print(f" Throughput: {bench.audio_throughput:.2f} audio-sec/wall-sec") - print(f"{'=' * 60}\n") - - if failed: - for r in failed[:3]: - print(f" [ERROR] {r.error[:200]}") - - return bench - - -async def main(args): - all_results = [] - - for concurrency in args.max_concurrency: - result = await run_benchmark( - host=args.host, - port=args.port, - num_prompts=args.num_prompts, - max_concurrency=concurrency, - num_warmups=args.num_warmups, - voice=args.voice, - language=args.language, - stream=args.stream, - ) - result.config_name = args.config_name - all_results.append(asdict(result)) - - result_dir = Path(args.result_dir) - result_dir.mkdir(parents=True, exist_ok=True) - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - result_file = result_dir / f"bench_{args.config_name}_{timestamp}.json" - - with open(result_file, "w") as f: - json.dump(all_results, f, indent=2) - print(f"Results saved to {result_file}") - - -def parse_args(): - parser = argparse.ArgumentParser(description="Qwen3-TTS async_chunk benchmark client") - parser.add_argument("--host", type=str, default="127.0.0.1") - parser.add_argument("--port", type=int, default=8000) - parser.add_argument("--num-prompts", type=int, default=50) - parser.add_argument("--max-concurrency", type=int, nargs="+", default=[1, 10]) - parser.add_argument("--num-warmups", type=int, default=3) - parser.add_argument("--voice", type=str, default="vivian") - parser.add_argument("--language", type=str, default="English") - parser.add_argument("--stream", action="store_true", default=True) - parser.add_argument("--no-stream", dest="stream", action="store_false") - parser.add_argument("--config-name", type=str, default="async_chunk_on") - parser.add_argument("--result-dir", type=str, default="results") - return parser.parse_args() - - -if __name__ == "__main__": - args = parse_args() - 
asyncio.run(main(args)) diff --git a/benchmarks/qwen3-tts/vllm_omni/bench_tts_serve.py b/benchmarks/qwen3-tts/vllm_omni/bench_tts_serve.py deleted file mode 100644 index 96b904b0174..00000000000 --- a/benchmarks/qwen3-tts/vllm_omni/bench_tts_serve.py +++ /dev/null @@ -1,371 +0,0 @@ -"""Benchmark client for Qwen3-TTS via /v1/audio/speech endpoint. - -Measures TTFP (Time-to-First-Packet), E2E latency, and RTF (Real-Time Factor) -across configurable concurrency levels. Saves results as JSON for plotting. - -Usage: - python bench_tts_serve.py \ - --host 127.0.0.1 --port 8000 \ - --num-prompts 50 \ - --max-concurrency 1 4 10 \ - --result-dir results/ -""" - -import argparse -import asyncio -import json -import time -from dataclasses import asdict, dataclass, field -from datetime import datetime -from pathlib import Path - -import aiohttp -import numpy as np -from tqdm.asyncio import tqdm - -PROMPTS = [ - "Hello, welcome to the voice synthesis benchmark test.", - "She said she would be here by noon, but nobody showed up.", - "The quick brown fox jumps over the lazy dog near the riverbank.", - "I can't believe how beautiful the sunset looks from up here on the mountain.", - "Please remember to bring your identification documents to the appointment tomorrow morning.", - "Have you ever wondered what it would be like to travel through time and visit ancient civilizations?", - "The restaurant on the corner serves the best pasta I have ever tasted in my entire life.", - "After the meeting, we should discuss the quarterly results and plan for the next phase.", - "Learning a new language takes patience, practice, and a genuine curiosity about other cultures.", - "The train leaves at half past seven, so we need to arrive at the station before then.", - "Could you please turn down the music a little bit, I'm trying to concentrate on my work.", - "It was a dark and stormy night when the old lighthouse keeper heard a knock at the door.", -] -REF_AUDIO = 
"https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-TTS-Repo/clone_2.wav" -REF_TEXT = "Okay. Yeah. I resent you. I love you. I respect you. But you know what? You blew it! And thanks to you." -INSTRUCT = "Speak in an incredulous tone, but with a hint of panic beginning to creep into your voice." - - -@dataclass -class RequestResult: - success: bool = False - ttfp: float = 0.0 # Time to first audio packet (seconds) - e2e: float = 0.0 # End-to-end latency (seconds) - audio_bytes: int = 0 # Total audio bytes received - audio_duration: float = 0.0 # Audio duration in seconds (estimated from PCM) - rtf: float = 0.0 # Real-time factor = e2e / audio_duration - prompt: str = "" - error: str = "" - - -@dataclass -class BenchmarkResult: - config_name: str = "" - concurrency: int = 0 - num_prompts: int = 0 - completed: int = 0 - failed: int = 0 - duration_s: float = 0.0 - # TTFP stats (ms) - mean_ttfp_ms: float = 0.0 - median_ttfp_ms: float = 0.0 - std_ttfp_ms: float = 0.0 - p90_ttfp_ms: float = 0.0 - p95_ttfp_ms: float = 0.0 - p99_ttfp_ms: float = 0.0 - # E2E stats (ms) - mean_e2e_ms: float = 0.0 - median_e2e_ms: float = 0.0 - std_e2e_ms: float = 0.0 - p90_e2e_ms: float = 0.0 - p95_e2e_ms: float = 0.0 - p99_e2e_ms: float = 0.0 - # RTF stats - mean_rtf: float = 0.0 - median_rtf: float = 0.0 - std_rtf: float = 0.0 - p99_rtf: float = 0.0 - # Audio stats - mean_audio_duration_s: float = 0.0 - total_audio_duration_s: float = 0.0 - audio_throughput: float = 0.0 # audio_duration / wall_time - request_throughput: float = 0.0 # requests / second - # Per-request details - per_request: list = field(default_factory=list) - - -def pcm_bytes_to_duration(num_bytes: int, sample_rate: int = 24000, sample_width: int = 2) -> float: - """Convert raw PCM byte count to duration in seconds.""" - num_samples = num_bytes / sample_width - return num_samples / sample_rate - - -def create_payload( - prompt: str, task_type: str = "CustomVoice", voice: str = "vivian", language: str = "English" -) -> 
dict: - payload = { - "input": prompt, - "language": language, - "stream": True, - "response_format": "pcm", - "task_type": task_type, - } - - if task_type == "Base": - payload["ref_audio"] = REF_AUDIO - payload["ref_text"] = REF_TEXT - elif task_type == "CustomVoice": - payload["voice"] = voice - elif task_type == "VoiceDesign": - payload["instructions"] = INSTRUCT - - return payload - - -async def send_tts_request( - session: aiohttp.ClientSession, - api_url: str, - prompt: str, - task_type: str = "CustomVoice", - voice: str = "vivian", - language: str = "English", - pbar: tqdm | None = None, -) -> RequestResult: - """Send a streaming TTS request and measure latency metrics.""" - payload = create_payload(prompt, task_type, voice, language) - - result = RequestResult(prompt=prompt) - st = time.perf_counter() - - try: - async with session.post(api_url, json=payload) as response: - if response.status != 200: - result.error = f"HTTP {response.status}: {await response.text()}" - result.success = False - return result - - first_chunk = True - total_bytes = 0 - - async for chunk in response.content.iter_any(): - if first_chunk and len(chunk) > 0: - result.ttfp = time.perf_counter() - st - first_chunk = False - total_bytes += len(chunk) - - result.e2e = time.perf_counter() - st - result.audio_bytes = total_bytes - result.audio_duration = pcm_bytes_to_duration(total_bytes) - - if result.audio_duration > 0: - result.rtf = result.e2e / result.audio_duration - result.success = True - - except Exception as e: - result.error = str(e) - result.success = False - result.e2e = time.perf_counter() - st - - if pbar: - pbar.update(1) - return result - - -async def run_benchmark( - host: str, - port: int, - num_prompts: int, - max_concurrency: int, - num_warmups: int = 3, - task_type: str = "CustomVoice", - voice: str = "vivian", - language: str = "English", -) -> BenchmarkResult: - """Run benchmark at a given concurrency level.""" - api_url = f"http://{host}:{port}/v1/audio/speech" - 
- connector = aiohttp.TCPConnector( - limit=max_concurrency, - limit_per_host=max_concurrency, - keepalive_timeout=60, - ) - session = aiohttp.ClientSession( - connector=connector, - timeout=aiohttp.ClientTimeout(total=600), - ) - - # Warmup - if num_warmups > 0: - print(f" Warming up with {num_warmups} requests...") - warmup_tasks = [] - for i in range(num_warmups): - prompt = PROMPTS[i % len(PROMPTS)] - warmup_tasks.append(send_tts_request(session, api_url, prompt, task_type, voice, language)) - await asyncio.gather(*warmup_tasks) - print(" Warmup done.") - - # Build request list - request_prompts = [PROMPTS[i % len(PROMPTS)] for i in range(num_prompts)] - - # Run benchmark - print(f" Running {num_prompts} requests with concurrency={max_concurrency}...") - semaphore = asyncio.Semaphore(max_concurrency) - pbar = tqdm(total=num_prompts, desc=f" concurrency={max_concurrency}") - - async def limited_request(prompt): - async with semaphore: - return await send_tts_request(session, api_url, prompt, task_type, voice, language, pbar) - - start_time = time.perf_counter() - tasks = [asyncio.create_task(limited_request(p)) for p in request_prompts] - results: list[RequestResult] = await asyncio.gather(*tasks) - duration = time.perf_counter() - start_time - pbar.close() - - await session.close() - - # Compute stats - successful = [r for r in results if r.success] - failed = [r for r in results if not r.success] - - bench = BenchmarkResult( - concurrency=max_concurrency, - num_prompts=num_prompts, - completed=len(successful), - failed=len(failed), - duration_s=duration, - ) - - if successful: - ttfps = [r.ttfp * 1000 for r in successful] # convert to ms - e2es = [r.e2e * 1000 for r in successful] - rtfs = [r.rtf for r in successful] - audio_durs = [r.audio_duration for r in successful] - - bench.mean_ttfp_ms = float(np.mean(ttfps)) - bench.median_ttfp_ms = float(np.median(ttfps)) - bench.std_ttfp_ms = float(np.std(ttfps)) - bench.p90_ttfp_ms = float(np.percentile(ttfps, 90)) 
- bench.p95_ttfp_ms = float(np.percentile(ttfps, 95)) - bench.p99_ttfp_ms = float(np.percentile(ttfps, 99)) - - bench.mean_e2e_ms = float(np.mean(e2es)) - bench.median_e2e_ms = float(np.median(e2es)) - bench.std_e2e_ms = float(np.std(e2es)) - bench.p90_e2e_ms = float(np.percentile(e2es, 90)) - bench.p95_e2e_ms = float(np.percentile(e2es, 95)) - bench.p99_e2e_ms = float(np.percentile(e2es, 99)) - - bench.mean_rtf = float(np.mean(rtfs)) - bench.median_rtf = float(np.median(rtfs)) - bench.std_rtf = float(np.std(rtfs)) - bench.p99_rtf = float(np.percentile(rtfs, 99)) - - bench.mean_audio_duration_s = float(np.mean(audio_durs)) - bench.total_audio_duration_s = float(np.sum(audio_durs)) - bench.audio_throughput = bench.total_audio_duration_s / duration - bench.request_throughput = len(successful) / duration - - bench.per_request = [ - { - "ttfp_ms": r.ttfp * 1000, - "e2e_ms": r.e2e * 1000, - "rtf": r.rtf, - "audio_duration_s": r.audio_duration, - "prompt": r.prompt, - } - for r in successful - ] - - # Print summary in standardized performance template - W = 50 - print("") - print(f"{'=' * W}") - print(f"{'Serving Benchmark Result':^{W}}") - print(f"{'=' * W}") - print(f"{'Successful requests:':<40}{bench.completed:<10}") - print(f"{'Failed requests:':<40}{bench.failed:<10}") - print(f"{'Maximum request concurrency:':<40}{max_concurrency:<10}") - print(f"{'Benchmark duration (s):':<40}{duration:<10.2f}") - print(f"{'Request throughput (req/s):':<40}{bench.request_throughput:<10.2f}") - print(f"{'-' * W}") - print(f"{'End-to-end Latency':^{W}}") - print(f"{'-' * W}") - print(f"{'Mean E2EL (ms):':<40}{bench.mean_e2e_ms:<10.2f}") - print(f"{'Median E2EL (ms):':<40}{bench.median_e2e_ms:<10.2f}") - print(f"{'P99 E2EL (ms):':<40}{bench.p99_e2e_ms:<10.2f}") - print(f"{'=' * W}") - print(f"{'Audio Result':^{W}}") - print(f"{'=' * W}") - print(f"{'Total audio duration generated (s):':<40}{bench.total_audio_duration_s:<10.2f}") - print(f"{'Audio throughput (audio 
duration/s):':<40}{bench.audio_throughput:<10.2f}") - print(f"{'-' * W}") - print(f"{'Time to First Packet':^{W}}") - print(f"{'-' * W}") - print(f"{'Mean AUDIO_TTFP (ms):':<40}{bench.mean_ttfp_ms:<10.2f}") - print(f"{'Median AUDIO_TTFP (ms):':<40}{bench.median_ttfp_ms:<10.2f}") - print(f"{'P99 AUDIO_TTFP (ms):':<40}{bench.p99_ttfp_ms:<10.2f}") - print(f"{'-' * W}") - print(f"{'Real Time Factor':^{W}}") - print(f"{'-' * W}") - print(f"{'Mean AUDIO_RTF:':<40}{bench.mean_rtf:<10.3f}") - print(f"{'Median AUDIO_RTF:':<40}{bench.median_rtf:<10.3f}") - print(f"{'P99 AUDIO_RTF:':<40}{bench.p99_rtf:<10.3f}") - print(f"{'=' * W}") - print("") - - if failed: - for r in failed[:3]: - print(f" [ERROR] {r.error[:200]}") - - return bench - - -async def main(args): - all_results = [] - - for concurrency in args.max_concurrency: - result = await run_benchmark( - host=args.host, - port=args.port, - num_prompts=args.num_prompts, - max_concurrency=concurrency, - num_warmups=args.num_warmups, - task_type=args.task_type, - voice=args.voice, - language=args.language, - ) - result.config_name = args.config_name - all_results.append(asdict(result)) - - # Save results - result_dir = Path(args.result_dir) - result_dir.mkdir(parents=True, exist_ok=True) - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - result_file = result_dir / f"bench_{args.config_name}_{timestamp}.json" - - with open(result_file, "w") as f: - json.dump(all_results, f, indent=2) - print(f"Results saved to {result_file}") - - return all_results - - -def parse_args(): - parser = argparse.ArgumentParser(description="Qwen3-TTS Benchmark Client") - parser.add_argument("--host", type=str, default="127.0.0.1") - parser.add_argument("--port", type=int, default=8000) - parser.add_argument("--num-prompts", type=int, default=50, help="Number of prompts per concurrency level") - parser.add_argument( # noqa: E501 - "--max-concurrency", type=int, nargs="+", default=[1, 4, 10], help="Concurrency levels to test" - ) - 
parser.add_argument("--num-warmups", type=int, default=3) - parser.add_argument("--task-type", type=str, default="CustomVoice", choices=["CustomVoice", "VoiceDesign", "Base"]) - parser.add_argument("--voice", type=str, default="vivian") - parser.add_argument("--language", type=str, default="English") - parser.add_argument( - "--config-name", type=str, default="async_chunk", help="Label for this config (used in filenames)" - ) - parser.add_argument("--result-dir", type=str, default="results") - return parser.parse_args() - - -if __name__ == "__main__": - args = parse_args() - asyncio.run(main(args)) diff --git a/benchmarks/qwen3-tts/vllm_omni/plot_async_chunk.py b/benchmarks/qwen3-tts/vllm_omni/plot_async_chunk.py deleted file mode 100644 index dd03d9626d9..00000000000 --- a/benchmarks/qwen3-tts/vllm_omni/plot_async_chunk.py +++ /dev/null @@ -1,249 +0,0 @@ -"""Plot TTFP comparison: async_chunk off vs on. - -Generates a bar chart with improvement arrows, matching the Qwen3-Omni -async_chunk benchmark figure style. 
- -Usage: - python plot_async_chunk.py \ - --off results/bench_async_chunk_off_*.json \ - --on results/bench_async_chunk_on_*.json \ - --output results/qwen3_tts_async_chunk_ttfp.png - - # Also supports E2E and RTF metrics: - python plot_async_chunk.py \ - --off results/bench_async_chunk_off_*.json \ - --on results/bench_async_chunk_on_*.json \ - --metric e2e \ - --output results/qwen3_tts_async_chunk_e2e.png -""" - -import argparse -import json -from pathlib import Path - -import matplotlib.pyplot as plt -import numpy as np - -METRIC_CONFIG = { - "ttfp": { - "key": "mean_ttfp_ms", - "ylabel": "TTFP (s)", - "title": "TTFP (Time to First Audio Packet) - Qwen3-TTS, by concurrency", - "to_seconds": True, - }, - "e2e": { - "key": "mean_e2e_ms", - "ylabel": "E2E (s)", - "title": "E2E Latency - Qwen3-TTS, by concurrency", - "to_seconds": True, - }, - "rtf": { - "key": "mean_rtf", - "ylabel": "RTF", - "title": "Real-Time Factor - Qwen3-TTS, by concurrency", - "to_seconds": False, - }, -} - - -def load_results(path: str) -> list[dict]: - with open(path) as f: - return json.load(f) - - -def plot_ttfp_comparison( - off_results: list[dict], - on_results: list[dict], - metric: str, - output_path: str, - title_override: str | None = None, -): - cfg = METRIC_CONFIG[metric] - key = cfg["key"] - to_seconds = cfg["to_seconds"] - - off_map = {r["concurrency"]: r for r in off_results} - on_map = {r["concurrency"]: r for r in on_results} - concurrencies = sorted(set(off_map.keys()) & set(on_map.keys())) - - off_vals = [] - on_vals = [] - for c in concurrencies: - v_off = off_map[c][key] - v_on = on_map[c][key] - if to_seconds: - v_off /= 1000.0 - v_on /= 1000.0 - off_vals.append(v_off) - on_vals.append(v_on) - - fig, ax = plt.subplots(figsize=(8, 6)) - - x = np.arange(len(concurrencies)) - width = 0.3 - - ax.bar(x - width / 2, off_vals, width, label="async_chunk off", color="#87CEEB", edgecolor="none") - ax.bar(x + width / 2, on_vals, width, label="async_chunk on", color="#FFF8DC", 
edgecolor="#DDD8B8") - - # Draw improvement arrows and labels - for i in range(len(concurrencies)): - v_off = off_vals[i] - v_on = on_vals[i] - if v_on > 0: - improvement = v_off / v_on - else: - improvement = float("inf") - - # Arrow from top of off-bar to top of on-bar - arrow_start_x = x[i] - width / 2 - arrow_start_y = v_off * 0.95 - arrow_end_x = x[i] + width / 2 - arrow_end_y = v_on * 1.05 - - ax.annotate( - "", - xy=(arrow_end_x, arrow_end_y), - xytext=(arrow_start_x, arrow_start_y), - arrowprops=dict(arrowstyle="->", color="red", lw=1.5), - ) - - # Improvement label - label_x = (arrow_start_x + arrow_end_x) / 2 - label_y = arrow_start_y + (v_off - v_on) * 0.15 - ax.text( - label_x, - label_y, - f"{improvement:.1f}x improvement", - ha="center", - va="bottom", - fontsize=10, - color="red", - fontweight="bold", - ) - - title = title_override or cfg["title"] - ax.set_title(title, fontsize=13, fontweight="bold") - ax.set_ylabel(cfg["ylabel"], fontsize=12) - ax.set_xlabel("Max concurrency", fontsize=12) - ax.set_xticks(x) - ax.set_xticklabels([str(c) for c in concurrencies]) - ax.set_yscale("log") - ax.legend(loc="upper left", fontsize=11) - ax.grid(axis="y", alpha=0.3, linestyle="--") - ax.set_axisbelow(True) - - plt.tight_layout() - Path(output_path).parent.mkdir(parents=True, exist_ok=True) - plt.savefig(output_path, dpi=150, bbox_inches="tight") - print(f"Plot saved to {output_path}") - plt.close() - - -def plot_all_metrics(off_results: list[dict], on_results: list[dict], output_path: str): - """Generate a 1x3 subplot with TTFP, E2E, and RTF comparisons.""" - off_map = {r["concurrency"]: r for r in off_results} - on_map = {r["concurrency"]: r for r in on_results} - concurrencies = sorted(set(off_map.keys()) & set(on_map.keys())) - - fig, axes = plt.subplots(1, 3, figsize=(18, 6)) - fig.suptitle("Qwen3-TTS: async_chunk on vs off", fontsize=15, fontweight="bold") - - for ax, metric in zip(axes, ["ttfp", "e2e", "rtf"]): - cfg = METRIC_CONFIG[metric] - key = 
cfg["key"] - to_seconds = cfg["to_seconds"] - - off_vals = [] - on_vals = [] - for c in concurrencies: - v_off = off_map[c][key] - v_on = on_map[c][key] - if to_seconds: - v_off /= 1000.0 - v_on /= 1000.0 - off_vals.append(v_off) - on_vals.append(v_on) - - x = np.arange(len(concurrencies)) - width = 0.3 - ax.bar(x - width / 2, off_vals, width, label="async_chunk off", color="#87CEEB") - ax.bar(x + width / 2, on_vals, width, label="async_chunk on", color="#FFF8DC", edgecolor="#DDD8B8") - - for i in range(len(concurrencies)): - if on_vals[i] > 0: - improvement = off_vals[i] / on_vals[i] - ax.annotate( - "", - xy=(x[i] + width / 2, on_vals[i] * 1.05), - xytext=(x[i] - width / 2, off_vals[i] * 0.95), - arrowprops=dict(arrowstyle="->", color="red", lw=1.5), - ) - label_y = off_vals[i] * 0.85 - ax.text(x[i], label_y, f"{improvement:.1f}x", ha="center", fontsize=10, color="red", fontweight="bold") - - ax.set_title(cfg["title"].split(" - ")[0], fontsize=12, fontweight="bold") - ax.set_ylabel(cfg["ylabel"], fontsize=11) - ax.set_xlabel("Max concurrency", fontsize=11) - ax.set_xticks(x) - ax.set_xticklabels([str(c) for c in concurrencies]) - if metric != "rtf": - ax.set_yscale("log") - ax.legend(fontsize=9) - ax.grid(axis="y", alpha=0.3, linestyle="--") - ax.set_axisbelow(True) - - plt.tight_layout() - Path(output_path).parent.mkdir(parents=True, exist_ok=True) - plt.savefig(output_path, dpi=150, bbox_inches="tight") - print(f"Plot saved to {output_path}") - plt.close() - - -def print_table(off_results: list[dict], on_results: list[dict]): - off_map = {r["concurrency"]: r for r in off_results} - on_map = {r["concurrency"]: r for r in on_results} - concurrencies = sorted(set(off_map.keys()) & set(on_map.keys())) - - print("\n## Benchmark Results: async_chunk off vs on\n") - print("| Metric | Concurrency | async_chunk off | async_chunk on | Improvement |") - print("| --- | --- | --- | --- | --- |") - - for name, key, fmt in [ - ("TTFP (ms)", "mean_ttfp_ms", ".1f"), - ("E2E 
(ms)", "mean_e2e_ms", ".1f"), - ("RTF", "mean_rtf", ".3f"), - ("Throughput", "audio_throughput", ".2f"), - ]: - for c in concurrencies: - v_off = off_map[c].get(key, 0) - v_on = on_map[c].get(key, 0) - if v_on > 0 and key != "audio_throughput": - ratio = f"{v_off / v_on:.1f}x" - elif v_off > 0 and key == "audio_throughput": - ratio = f"{v_on / v_off:.1f}x" - else: - ratio = "N/A" - print(f"| {name} | {c} | {v_off:{fmt}} | {v_on:{fmt}} | {ratio} |") - - -def parse_args(): - parser = argparse.ArgumentParser(description="Plot async_chunk comparison for Qwen3-TTS") - parser.add_argument("--off", type=str, required=True, help="JSON results for async_chunk off") - parser.add_argument("--on", type=str, required=True, help="JSON results for async_chunk on") - parser.add_argument("--metric", type=str, default="ttfp", choices=["ttfp", "e2e", "rtf", "all"]) - parser.add_argument("--output", type=str, default="results/qwen3_tts_async_chunk.png") - parser.add_argument("--title", type=str, default=None, help="Custom title override") - return parser.parse_args() - - -if __name__ == "__main__": - args = parse_args() - off_results = load_results(args.off) - on_results = load_results(args.on) - - print_table(off_results, on_results) - - if args.metric == "all": - plot_all_metrics(off_results, on_results, args.output) - else: - plot_ttfp_comparison(off_results, on_results, args.metric, args.output, args.title) diff --git a/benchmarks/qwen3-tts/vllm_omni/run_async_chunk_benchmark.sh b/benchmarks/qwen3-tts/vllm_omni/run_async_chunk_benchmark.sh deleted file mode 100755 index 0ede359ea37..00000000000 --- a/benchmarks/qwen3-tts/vllm_omni/run_async_chunk_benchmark.sh +++ /dev/null @@ -1,169 +0,0 @@ -#!/bin/bash -# Qwen3-TTS async_chunk on vs off Benchmark -# -# Starts two servers (async_chunk on and off), benchmarks both, -# and generates comparison plots. 
-# -# Usage: -# bash run_async_chunk_benchmark.sh -# -# Environment variables: -# GPU_DEVICE - GPU index (default: 0) -# MODEL - Model path (default: Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice) -# NUM_PROMPTS - Prompts per concurrency level (default: 50) -# CONCURRENCY - Space-separated concurrency levels (default: "1 10") -# PORT_ON - Port for async_chunk on server (default: 8000) -# PORT_OFF - Port for async_chunk off server (default: 8001) - -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -PROJECT_ROOT="$(cd "${SCRIPT_DIR}/../../.." && pwd)" -cd "$PROJECT_ROOT" - -GPU_DEVICE="${GPU_DEVICE:-0}" -MODEL="${MODEL:-Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice}" -NUM_PROMPTS="${NUM_PROMPTS:-50}" -CONCURRENCY="${CONCURRENCY:-1 10}" -NUM_WARMUPS="${NUM_WARMUPS:-3}" -PORT_ON="${PORT_ON:-8000}" -PORT_OFF="${PORT_OFF:-8001}" -RESULT_DIR="${SCRIPT_DIR}/results" -TIMESTAMP="$(date +%Y%m%d_%H%M%S)" - -# The bundled ``vllm_omni/deploy/qwen3_tts.yaml`` is auto-loaded by the model -# registry; no ``--deploy-config`` flag needed on the default (ON) path. -# async_chunk OFF is selected by the ``--no-async-chunk`` CLI flag — -# the single ``qwen3_tts`` pipeline dispatches to the end-to-end codec -# processor when ``deploy.async_chunk`` is false. - -mkdir -p "${RESULT_DIR}" - -echo "============================================================" -echo " Qwen3-TTS async_chunk Benchmark" -echo "============================================================" -echo " GPU: ${GPU_DEVICE}" -echo " Model: ${MODEL}" -echo " Prompts: ${NUM_PROMPTS}" -echo " Concurrency: ${CONCURRENCY}" -echo " Port (on/off): ${PORT_ON} / ${PORT_OFF}" -echo " Results: ${RESULT_DIR}" -echo "============================================================" - -cleanup() { - echo "Cleaning up servers..." 
- kill "$PID_ON" 2>/dev/null || true - kill "$PID_OFF" 2>/dev/null || true - wait "$PID_ON" 2>/dev/null || true - wait "$PID_OFF" 2>/dev/null || true -} -trap cleanup EXIT - -wait_for_server() { - local port=$1 - local name=$2 - local max_wait=300 - local elapsed=0 - echo "Waiting for ${name} server on port ${port}..." - while ! curl -s "http://localhost:${port}/health" >/dev/null 2>&1; do - sleep 5 - elapsed=$((elapsed + 5)) - if [ $elapsed -ge $max_wait ]; then - echo "ERROR: ${name} server failed to start within ${max_wait}s" - exit 1 - fi - done - echo "${name} server ready (${elapsed}s)" -} - -# ---- Phase 1: Start async_chunk ON server ---- -echo "" -echo "[Phase 1] Starting async_chunk ON server on port ${PORT_ON}..." -CUDA_VISIBLE_DEVICES=${GPU_DEVICE} vllm-omni serve "${MODEL}" \ - --host 0.0.0.0 --port "${PORT_ON}" \ - --trust-remote-code --enforce-eager --omni \ - > "${RESULT_DIR}/server_on_${TIMESTAMP}.log" 2>&1 & -PID_ON=$! - -wait_for_server "${PORT_ON}" "async_chunk_on" - -echo "[Phase 1] Benchmarking async_chunk ON..." -# shellcheck disable=SC2086 -python "${SCRIPT_DIR}/bench_async_chunk.py" \ - --host 127.0.0.1 --port "${PORT_ON}" \ - --config-name "async_chunk_on" \ - --num-prompts "${NUM_PROMPTS}" \ - --max-concurrency ${CONCURRENCY} \ - --num-warmups "${NUM_WARMUPS}" \ - --result-dir "${RESULT_DIR}" - -echo "[Phase 1] Stopping async_chunk ON server..." -kill "$PID_ON" 2>/dev/null || true -wait "$PID_ON" 2>/dev/null || true -sleep 5 - -# ---- Phase 2: Start async_chunk OFF server ---- -echo "" -echo "[Phase 2] Starting async_chunk OFF server on port ${PORT_OFF}..." -CUDA_VISIBLE_DEVICES=${GPU_DEVICE} vllm-omni serve "${MODEL}" \ - --no-async-chunk \ - --host 0.0.0.0 --port "${PORT_OFF}" \ - --trust-remote-code --enforce-eager --omni \ - > "${RESULT_DIR}/server_off_${TIMESTAMP}.log" 2>&1 & -PID_OFF=$! - -wait_for_server "${PORT_OFF}" "async_chunk_off" - -echo "[Phase 2] Benchmarking async_chunk OFF (non-streaming)..." 
-# shellcheck disable=SC2086 -python "${SCRIPT_DIR}/bench_async_chunk.py" \ - --host 127.0.0.1 --port "${PORT_OFF}" \ - --config-name "async_chunk_off" \ - --num-prompts "${NUM_PROMPTS}" \ - --max-concurrency ${CONCURRENCY} \ - --num-warmups "${NUM_WARMUPS}" \ - --no-stream \ - --result-dir "${RESULT_DIR}" - -echo "[Phase 2] Stopping async_chunk OFF server..." -kill "$PID_OFF" 2>/dev/null || true -wait "$PID_OFF" 2>/dev/null || true - -# ---- Phase 3: Plot results ---- -echo "" -echo "[Phase 3] Generating plots..." - -# Find the latest result files -RESULT_ON=$(ls -t "${RESULT_DIR}"/bench_async_chunk_on_*.json 2>/dev/null | head -1) -RESULT_OFF=$(ls -t "${RESULT_DIR}"/bench_async_chunk_off_*.json 2>/dev/null | head -1) - -if [ -z "$RESULT_ON" ] || [ -z "$RESULT_OFF" ]; then - echo "ERROR: Could not find result files. Check logs in ${RESULT_DIR}/" - exit 1 -fi - -echo " ON results: ${RESULT_ON}" -echo " OFF results: ${RESULT_OFF}" - -# TTFP comparison (main figure) -python "${SCRIPT_DIR}/plot_async_chunk.py" \ - --off "${RESULT_OFF}" \ - --on "${RESULT_ON}" \ - --metric ttfp \ - --output "${RESULT_DIR}/qwen3_tts_async_chunk_ttfp.png" - -# All metrics comparison -python "${SCRIPT_DIR}/plot_async_chunk.py" \ - --off "${RESULT_OFF}" \ - --on "${RESULT_ON}" \ - --metric all \ - --output "${RESULT_DIR}/qwen3_tts_async_chunk_all.png" - -echo "" -echo "============================================================" -echo " Benchmark complete!" -echo " Results: ${RESULT_DIR}/" -echo " Plots:" -echo " - ${RESULT_DIR}/qwen3_tts_async_chunk_ttfp.png" -echo " - ${RESULT_DIR}/qwen3_tts_async_chunk_all.png" -echo "============================================================" diff --git a/benchmarks/tts/README.md b/benchmarks/tts/README.md new file mode 100644 index 00000000000..9e2fd35b1a5 --- /dev/null +++ b/benchmarks/tts/README.md @@ -0,0 +1,227 @@ +# TTS Universal Benchmark + +A model-agnostic serving benchmark for TTS models in vllm-omni. 
One CLI +(`bench_tts.py`) + one YAML registry (`model_configs.yaml`) drive perf and +quality runs for every registered checkpoint: **Qwen3-TTS** (Base / CustomVoice) +and **VoxCPM2** today, more to come. + +The same three task types — `voice_clone`, `default_voice`, `voice_design` — +are wired into both the manual CLI and the DFX nightly CI matrix +(`tests/dfx/perf/tests/test_tts.json`). + +## Quick start + +### 1. Start the server + +```bash +vllm serve Qwen/Qwen3-TTS-12Hz-1.7B-Base --omni --port 8000 +``` + +The server auto-loads its Deploy YAML from `vllm_omni/deploy/qwen3_tts.yaml` +(Pipeline + Deploy schema introduced in #2383). No `--stage-configs-path` or +`--deploy-config` flag is needed for any registered model. + +### 2. Run the benchmark (`vllm bench serve --omni`) + +The primary, directly-controllable path. Copy-paste one of these and tweak +any bench flag (sampling params, endpoint, extra body, warmups, etc.): + +#### voice_clone (Qwen3-TTS-Base, seed-tts dataset) + +```bash +vllm bench serve --omni \ + --host 127.0.0.1 --port 8000 \ + --model Qwen/Qwen3-TTS-12Hz-1.7B-Base \ + --backend openai-audio-speech \ + --endpoint /v1/audio/speech \ + --dataset-name seed-tts \ + --dataset-path /path/to/seed-tts-eval \ + --seed-tts-locale en \ + --num-prompts 20 --num-warmups 2 \ + --extra-body '{"task_type":"Base"}' \ + --max-concurrency 1 --request-rate inf \ + --percentile-metrics ttft,e2el,audio_rtf,audio_ttfp,audio_duration \ + --save-result --result-dir ./results +``` + +#### default_voice (Qwen3-TTS-CustomVoice, bundled seed_tts_smoke) + +```bash +vllm bench serve --omni \ + --host 127.0.0.1 --port 8000 \ + --model Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice \ + --backend openai-audio-speech \ + --endpoint /v1/audio/speech \ + --dataset-name seed-tts-text \ + --dataset-path benchmarks/build_dataset/seed_tts_smoke \ + --seed-tts-locale en \ + --num-prompts 20 --num-warmups 2 \ + --extra-body '{"voice":"Vivian","language":"English","task_type":"CustomVoice"}' \ + 
--max-concurrency 1 --request-rate inf \ + --percentile-metrics ttft,e2el,audio_rtf,audio_ttfp,audio_duration \ + --save-result --result-dir ./results +``` + +#### voice_design (Qwen3-TTS-CustomVoice, bundled seed_tts_design) + +```bash +vllm bench serve --omni \ + --host 127.0.0.1 --port 8000 \ + --model Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice \ + --backend openai-audio-speech \ + --endpoint /v1/audio/speech \ + --dataset-name seed-tts-design \ + --dataset-path benchmarks/build_dataset/seed_tts_design \ + --seed-tts-locale en \ + --num-prompts 20 --num-warmups 2 \ + --extra-body '{"task_type":"VoiceDesign","language":"English"}' \ + --max-concurrency 1 --request-rate inf \ + --percentile-metrics ttft,e2el,audio_rtf,audio_ttfp,audio_duration \ + --save-result --result-dir ./results +``` + +#### Add WER / SIM / UTMOS to any of the above + +Append `--seed-tts-wer-eval` (and optionally `SEED_TTS_EVAL_DEVICE=cuda:0` +in the env, per PR #2558). This triggers the seed-tts-eval protocol: +Whisper-large-v3 ASR → WER, WavLM embeddings → SIM, balacoon/utmos → UTMOS. + +### 3. 
Convenience wrapper (`bench_tts.py`) + +If you're running the **canonical** configuration for a registered model, +`bench_tts.py` loads the right defaults from `model_configs.yaml` and +emits the exact `vllm bench serve --omni` command above — useful for +concurrency sweeps and multi-task runs: + +```bash +# Smallest smoke — 5 prompts, concurrency=1 +python benchmarks/tts/bench_tts.py \ + --model Qwen/Qwen3-TTS-12Hz-1.7B-Base \ + --task voice_clone \ + --dataset-path /path/to/seed-tts-eval \ + --concurrency 1 --num-prompts 5 \ + --output-dir ./results + +# Full concurrency sweep +python benchmarks/tts/bench_tts.py \ + --model Qwen/Qwen3-TTS-12Hz-1.7B-Base \ + --task voice_clone \ + --dataset-path /path/to/seed-tts-eval \ + --concurrency 1 2 4 8 16 32 \ + --num-prompts 20 \ + --output-dir ./results + +# With WER / SIM / UTMOS quality eval (adds ASR + embedding compute) +python benchmarks/tts/bench_tts.py \ + --model Qwen/Qwen3-TTS-12Hz-1.7B-Base \ + --task voice_clone \ + --dataset-path /path/to/seed-tts-eval \ + --wer-eval \ + --concurrency 4 --num-prompts 200 \ + --output-dir ./results +``` + +### 4. Plot a sweep + +```bash +python benchmarks/tts/plot_results.py \ + --results ./results/*.json \ + --output ./results/curve.png +``` + +Outputs TTFP / RTF / throughput curves (and a markdown table) for every +`(task, concurrency)` combination in the result set. 
+ +## Task types + +| Task | Dataset | Request body | Checkpoints that support it | +|-----------------|-------------------|-----------------------------------------------------|------------------------------------------| +| `voice_clone` | `seed-tts` | `ref_audio` + `ref_text` + `task_type=Base` | `Qwen3-TTS-*-Base`, `VoxCPM2` | +| `default_voice` | `seed-tts-text` | `voice=Vivian` + `task_type=CustomVoice` | `Qwen3-TTS-*-CustomVoice` | +| `voice_design` | `seed-tts-design` | `instructions=<voice description>` + `task_type=VoiceDesign` | `Qwen3-TTS-*-CustomVoice` | + +**`-CustomVoice` checkpoints do NOT ship `speaker_encoder` weights**, so +voice_clone requests raise `ValueError` at model runtime. Use `-Base` for +voice_clone. + +## Adding a new TTS model + +Drop an entry into `model_configs.yaml` — no Python changes required: + +```yaml +models: + <org>/<model-name>: + supported_tasks: [voice_clone] # or default_voice / voice_design + backend: openai-audio-speech # vllm bench serve backend + endpoint: /v1/audio/speech # OpenAI-compatible endpoint + task_extra_body: # merged into every request's body + voice_clone: + task_type: Base +``` + +Then add the model's Deploy YAML under `vllm_omni/deploy/<model>.yaml` +(Pipeline + Deploy schema) and it's immediately benchable. + +## Datasets + +| Dataset | Bundled? 
| Format | Source | +|--------------------|----------|-------------------|----------------------------------------------------------------| +| `seed-tts-design` | ✅ | 5-field meta.lst | `benchmarks/build_dataset/seed_tts_design/en/meta.lst` (20 prompts) | +| `seed_tts_smoke` | ✅ | 4-field meta.lst | `benchmarks/build_dataset/seed_tts_smoke/en/meta.lst` (20 text-only) | +| `seed-tts` | ❌ | 4-field meta.lst + WAVs | Google-Drive: [BytedanceSpeech/seed-tts-eval][seedtts] (~1.2 GB) | +| `seed-tts-text` | ❌ | 4-field meta.lst | Same archive as `seed-tts` (wav column unused) | + +[seedtts]: https://github.com/BytedanceSpeech/seed-tts-eval + +For manual voice_clone / default_voice runs against the full corpus, follow +`benchmarks/build_dataset/download_process_data_seedtts.md` and point +`--dataset-path` at the extracted `seedtts_testset` directory. + +## DFX nightly CI + +`tests/dfx/perf/tests/test_tts.json` wires three perf regimes plus quality: + +| eval_phase | concurrency | purpose | Baseline metrics | +|---------------|-------------|---------------------------------------------------------|-----------------------------------------| +| `latency` | 1 | Single-request TTFP / RTF SLO | `median_audio_ttfp_ms`, `median_audio_rtf` | +| `throughput` | 8 | Codec-batching cliff sentinel (PDF #272 concurrency≥8) | `median_audio_ttfp_ms`, `median_audio_rtf` | +| `quality` | 4 | WER / SIM / UTMOS regression (disabled in CI by default)| `mean_audio_rtf` | + +Why `median_*` for latency/throughput and `mean_*` for quality: latency +distributions have cold-start tails that drag the mean; quality aggregates +over 200 prompts so single-request outliers don't matter. + +Quality entries are `enabled: false` in CI because seed-tts-eval is not +staged in the Buildkite container (matches the precedent in +PR #2558 — quality runs are manual / release-validation, not nightly). 
+ +## Concurrency cliff regression sentinel + +Observed on H20-3e, Qwen3-TTS-1.7B (measured pre-merge on this branch): + +| Task | Model | c=1 | c=4 | **c=8** | c=16 | c=32 | +|---------------|---------------|--------|--------|------------|--------|--------| +| voice_clone | 1.7B-Base | RTF 0.15 / TTFP 165ms | 0.28 / 412ms | **0.49 / 1701ms** | 0.72 / 3355ms | 0.77 / 3772ms | +| voice_design | 1.7B-CustomVoice | RTF 0.08 / TTFP 53ms | 0.11 / 154ms | **0.21 / 872ms** | 0.33 / 1801ms | 0.38 / 1989ms | + +Both models show a **4–6× TTFP jump from c=4 to c=8** while audio throughput +saturates around c=4–8 — the codec-bs=1 bottleneck documented in +vllm-project/vllm-omni#272. The `throughput` CI regime at c=8 is the +sentinel for regressions in this area. + +## File layout + +``` +benchmarks/tts/ +├── README.md (this file) +├── bench_tts.py CLI — serve-mode benchmark driver +├── bench_voxcpm_offline.py CLI — offline VoxCPM benchmark (sync + streaming) +├── plot_results.py Generate per-task / per-concurrency curves +└── model_configs.yaml Model registry (supported tasks + extra body) +``` + +## Related + +- Upstream seed-tts-eval integration: vllm-project/vllm-omni#2558 +- Pipeline + Deploy schema: vllm-project/vllm-omni#2383 +- Concurrency cliff RFC: vllm-project/vllm-omni#272 diff --git a/benchmarks/tts/bench_tts.py b/benchmarks/tts/bench_tts.py new file mode 100644 index 00000000000..ba82b1c9b7b --- /dev/null +++ b/benchmarks/tts/bench_tts.py @@ -0,0 +1,308 @@ +#!/usr/bin/env python3 +"""Universal TTS benchmark CLI for vllm-omni. + +Runs ``vllm bench serve --omni`` with model-aware defaults loaded from +``model_configs.yaml``. Supports Qwen3-TTS, VoxCPM2, and any future TTS +model registered in the config file -- no code changes needed to add models. 
+ +Usage:: + + python benchmarks/tts/bench_tts.py \\ + --model Qwen/Qwen3-TTS-12Hz-1.7B-Base \\ + --task voice_clone \\ + --locale en \\ + --concurrency 1 4 \\ + --num-prompts 20 \\ + --dataset-path /path/to/seed-tts-eval \\ + --host localhost --port 8000 + +See ``--help`` for full option list. +""" + +from __future__ import annotations + +import argparse +import json +import math +import os +import subprocess +import sys +from datetime import datetime +from pathlib import Path +from typing import Any + +import yaml + + +def _vllm_omni_bin() -> str: + """Return the vllm-omni (or vllm) binary co-located with the current Python.""" + bin_dir = Path(sys.executable).parent + for candidate in ("vllm-omni", "vllm"): + p = bin_dir / candidate + if p.is_file(): + return str(p) + return "vllm-omni" # fall back and let the shell resolve it + + +_REPO_ROOT = Path(__file__).resolve().parent.parent.parent +_SCRIPT_DIR = Path(__file__).resolve().parent +_DEFAULT_MODEL_CONFIGS = _SCRIPT_DIR / "model_configs.yaml" + +# Maps task name to the dataset_name used with vllm bench serve +_TASK_TO_DATASET: dict[str, str] = { + "voice_clone": "seed-tts", + "default_voice": "seed-tts-text", + "voice_design": "seed-tts-design", +} + +# Default design dataset path (bundled with the repo) +_DEFAULT_DESIGN_DATASET_PATH = str(_REPO_ROOT / "benchmarks" / "build_dataset" / "seed_tts_design") + + +def load_model_configs(path: Path) -> dict[str, Any]: + """Load model registry from YAML file.""" + with open(path, encoding="utf-8") as f: + data = yaml.safe_load(f) + return data.get("models", {}) + + +def build_bench_args( + *, + host: str, + port: int, + model: str, + task: str, + model_cfg: dict[str, Any], + locale: str, + num_prompts: int, + concurrency: int | None, + dataset_path: str | None, + wer_eval: bool, + output_dir: str | None, + result_filename: str | None, + extra_cli_args: list[str], +) -> list[str]: + """Build the ``vllm bench serve --omni`` command for one (task, concurrency) 
run.""" + dataset_name = _TASK_TO_DATASET[task] + backend: str = model_cfg["backend"] + endpoint: str = model_cfg["endpoint"] + task_extra_body: dict[str, Any] = (model_cfg.get("task_extra_body") or {}).get(task) or {} + + # Resolve dataset path + if dataset_path: + resolved_dataset_path = dataset_path + elif task == "voice_design": + resolved_dataset_path = _DEFAULT_DESIGN_DATASET_PATH + else: + resolved_dataset_path = None + + cmd = [ + _vllm_omni_bin(), + "bench", + "serve", + "--omni", + "--host", + host, + "--port", + str(port), + "--model", + model, + "--backend", + backend, + "--endpoint", + endpoint, + "--dataset-name", + dataset_name, + "--num-prompts", + str(num_prompts), + "--num-warmups", + "2", + "--percentile-metrics", + "ttft,e2el,audio_rtf,audio_ttfp,audio_duration", + ] + + if resolved_dataset_path: + cmd += ["--dataset-path", resolved_dataset_path] + + if locale: + cmd += ["--seed-tts-locale", locale] + + if task_extra_body: + cmd += ["--extra-body", json.dumps(task_extra_body, separators=(",", ":"))] + + if concurrency is not None: + cmd += ["--max-concurrency", str(concurrency), "--request-rate", "inf"] + + if wer_eval: + cmd.append("--seed-tts-wer-eval") + + if output_dir or result_filename: + out_dir = output_dir or "." 
+ os.makedirs(out_dir, exist_ok=True) + cmd += ["--save-result", "--result-dir", out_dir] + if result_filename: + cmd += ["--result-filename", result_filename] + + cmd += extra_cli_args + return cmd + + +def run_one_benchmark(cmd: list[str]) -> dict[str, Any] | None: + """Run a single benchmark subprocess and return parsed JSON result if available.""" + print(f"\n{'=' * 60}") + print("Running:", " ".join(cmd)) + print("=" * 60) + result = subprocess.run(cmd, check=False) + if result.returncode != 0: + print(f"[bench_tts] WARNING: benchmark exited with code {result.returncode}") + return None + # If --save-result was used, find the result file + try: + result_dir_idx = cmd.index("--result-dir") + result_dir = Path(cmd[result_dir_idx + 1]) + if "--result-filename" in cmd: + fname_idx = cmd.index("--result-filename") + result_file = result_dir / cmd[fname_idx + 1] + else: + # find most recently modified json + jsons = sorted(result_dir.glob("result_*.json"), key=lambda p: p.stat().st_mtime) + result_file = jsons[-1] if jsons else None + if result_file and result_file.is_file(): + return json.loads(result_file.read_text(encoding="utf-8")) + except (ValueError, IndexError, OSError): + pass + return None + + +def print_summary_table(results: list[dict[str, Any]]) -> None: + """Print a unified metrics table across all (task, concurrency) runs.""" + if not results: + return + header = ( + f"{'Task':<16} {'Concurrency':>11} {'RTF mean':>10} " + f"{'TTFP (ms)':>10} {'Throughput':>12} {'WER':>7} {'SIM':>7} {'UTMOS':>7}" + ) + print(f"\n{'=' * len(header)}") + print("BENCHMARK SUMMARY") + print("=" * len(header)) + print(header) + print("-" * len(header)) + for r in results: + task = r.get("_task", "?") + conc = r.get("_concurrency", "?") + rtf = r.get("mean_audio_rtf", float("nan")) + ttfp = r.get("mean_audio_ttfp_ms", float("nan")) + throughput = r.get("audio_throughput", float("nan")) + wer = r.get("seed_tts_mean_wer", float("nan")) + sim = r.get("seed_tts_mean_sim", 
float("nan")) + utmos = r.get("seed_tts_mean_utmos", float("nan")) + + def fmt(v: float, digits: int = 3) -> str: + return f"{v:.{digits}f}" if not math.isnan(v) else " n/a" + + print( + f"{task:<16} {str(conc):>11} {fmt(rtf):>10} {fmt(ttfp, 0):>10} " + f"{fmt(throughput):>12} {fmt(wer):>7} {fmt(sim):>7} {fmt(utmos):>7}" + ) + print("=" * len(header)) + + +def main() -> None: + """Entry point for the universal TTS benchmark CLI.""" + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument( + "--model", required=True, help="HuggingFace model ID (e.g. Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice)" + ) + parser.add_argument("--task", default="all", help="Task type: voice_clone | default_voice | voice_design | all") + parser.add_argument("--locale", default="en", choices=["en", "zh"]) + parser.add_argument("--concurrency", type=int, nargs="+", default=[1, 4], metavar="N") + parser.add_argument( + "--num-prompts", + type=int, + nargs="+", + default=[20], + metavar="N", + help="Number of prompts per run. 
If one value, applied to all concurrency levels.", + ) + parser.add_argument( + "--dataset-path", default=None, help="Root of seed-tts-eval dataset (required for voice_clone/default_voice)" + ) + parser.add_argument("--wer-eval", action="store_true", help="Enable WER/SIM/UTMOS quality eval") + parser.add_argument("--output-dir", default=None, help="Directory to save result JSON files") + parser.add_argument("--host", default="localhost") + parser.add_argument("--port", type=int, default=8000) + parser.add_argument("--model-configs", default=str(_DEFAULT_MODEL_CONFIGS), help="Path to model_configs.yaml") + parser.add_argument("extra", nargs=argparse.REMAINDER, help="Extra args passed directly to vllm bench serve") + args = parser.parse_args() + + model_configs = load_model_configs(Path(args.model_configs)) + if args.model not in model_configs: + known = "\n ".join(model_configs.keys()) + print(f"[bench_tts] ERROR: model '{args.model}' not in model_configs.yaml.\nKnown models:\n {known}") + sys.exit(1) + + model_cfg = model_configs[args.model] + supported_tasks: list[str] = model_cfg.get("supported_tasks", []) + + tasks_to_run: list[str] + if args.task == "all": + tasks_to_run = supported_tasks + elif args.task in supported_tasks: + tasks_to_run = [args.task] + else: + print( + f"[bench_tts] ERROR: task '{args.task}' not supported by {args.model}.\nSupported tasks: {supported_tasks}" + ) + sys.exit(1) + + # Align num_prompts list with concurrency list + num_prompts_list: list[int] = args.num_prompts + if len(num_prompts_list) == 1: + num_prompts_list = num_prompts_list * len(args.concurrency) + elif len(num_prompts_list) != len(args.concurrency): + print( + f"[bench_tts] ERROR: --num-prompts ({len(num_prompts_list)} values) must be " + f"length 1 or match --concurrency ({len(args.concurrency)} values)." 
+ ) + sys.exit(1) + + all_results: list[dict[str, Any]] = [] + + for task in tasks_to_run: + for concurrency, num_prompts in zip(args.concurrency, num_prompts_list): + ts = datetime.now().strftime("%Y%m%d-%H%M%S") + result_filename = f"bench_tts_{args.model.replace('/', '_')}_{task}_c{concurrency}_{ts}.json" + cmd = build_bench_args( + host=args.host, + port=args.port, + model=args.model, + task=task, + model_cfg=model_cfg, + locale=args.locale, + num_prompts=num_prompts, + concurrency=concurrency, + dataset_path=args.dataset_path, + wer_eval=args.wer_eval, + output_dir=args.output_dir, + result_filename=result_filename, + extra_cli_args=args.extra or [], + ) + result = run_one_benchmark(cmd) + if result is not None: + result["_task"] = task + result["_concurrency"] = concurrency + all_results.append(result) + # Persist the metadata so plot_results.py can pick it up. + if args.output_dir and result_filename: + result_path = Path(args.output_dir) / result_filename + if result_path.is_file(): + result_path.write_text(json.dumps(result, indent=2), encoding="utf-8") + + print_summary_table(all_results) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/voxcpm/vllm_omni/bench_tts_offline.py b/benchmarks/tts/bench_voxcpm_offline.py similarity index 96% rename from benchmarks/voxcpm/vllm_omni/bench_tts_offline.py rename to benchmarks/tts/bench_voxcpm_offline.py index a3bad3e6928..672b77f1495 100644 --- a/benchmarks/voxcpm/vllm_omni/bench_tts_offline.py +++ b/benchmarks/tts/bench_voxcpm_offline.py @@ -6,6 +6,27 @@ - text-only synthesis - voice cloning - text/clone batch inputs from txt or jsonl + +Usage:: + + # Sync (default voice) + python benchmarks/tts/bench_voxcpm_offline.py \\ + --model /path/to/VoxCPM \\ + --text "Hello world" \\ + --output-dir results/audio/ + + # Streaming (async_chunk) + python benchmarks/tts/bench_voxcpm_offline.py \\ + --model /path/to/VoxCPM \\ + --stage-configs-path vllm_omni/model_executor/stage_configs/voxcpm_async_chunk.yaml 
\\ + --txt-prompts prompts.txt \\ + --output-dir results/audio/ + + # Voice cloning batch via JSONL + python benchmarks/tts/bench_voxcpm_offline.py \\ + --model /path/to/VoxCPM \\ + --jsonl-prompts prompts.jsonl \\ + --output-dir results/audio/ """ from __future__ import annotations @@ -26,7 +47,21 @@ from vllm_omni import AsyncOmni, Omni -REPO_ROOT = Path(__file__).resolve().parents[3] + +def _find_repo_root(start: Path) -> Path: + """Walk up from ``start`` until a repo marker is found. + + Falls back to ``parents[2]`` for backwards compatibility if no marker hits + (which can only happen in unusual checkouts — the tree should always have + pyproject.toml + vllm_omni/ at the top level). + """ + for candidate in [start, *start.parents]: + if (candidate / "pyproject.toml").is_file() and (candidate / "vllm_omni").is_dir(): + return candidate + return start.parents[2] + + +REPO_ROOT = _find_repo_root(Path(__file__).resolve()) DEFAULT_STAGE_ASYNC = REPO_ROOT / "vllm_omni" / "model_executor" / "stage_configs" / "voxcpm_async_chunk.yaml" DEFAULT_STAGE_SYNC = REPO_ROOT / "vllm_omni" / "model_executor" / "stage_configs" / "voxcpm.yaml" @@ -473,9 +508,6 @@ def parse_args(): def _is_streaming_stage_config(stage_configs_path: str) -> bool: cfg_name = Path(stage_configs_path).name.lower() - # Keep routing purely config-path based: - # - voxcpm.yaml => sync - # - voxcpm_async_chunk.yaml => streaming return "async_chunk" in cfg_name diff --git a/benchmarks/tts/model_configs.yaml b/benchmarks/tts/model_configs.yaml new file mode 100644 index 00000000000..83b25370538 --- /dev/null +++ b/benchmarks/tts/model_configs.yaml @@ -0,0 +1,39 @@ +# Universal TTS benchmark model registry. +# Maps HuggingFace model ID → supported tasks + per-task extra body fields. +# To add a new TTS model: add an entry here. No code changes required. 
+# +# The server auto-loads its Deploy YAML from vllm_omni/deploy/<model>.yaml via +# the Pipeline + Deploy schema introduced in #2383, so no stage_config path +# is tracked here. + +models: + # -CustomVoice checkpoints lack speaker_encoder weights, so voice_clone is + # NOT supported (an attempt raises ValueError from _extract_speaker_embedding + # at model runtime). Use -Base for voice_clone. + Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice: + supported_tasks: [default_voice, voice_design] + backend: openai-audio-speech + endpoint: /v1/audio/speech + task_extra_body: + default_voice: + voice: Vivian + language: English + task_type: CustomVoice + voice_design: + task_type: VoiceDesign + language: English + + Qwen/Qwen3-TTS-12Hz-1.7B-Base: + supported_tasks: [voice_clone] + backend: openai-audio-speech + endpoint: /v1/audio/speech + task_extra_body: + voice_clone: + task_type: Base + + openbmb/VoxCPM2: + supported_tasks: [voice_clone] + backend: openai-audio-speech + endpoint: /v1/audio/speech + task_extra_body: + voice_clone: {} diff --git a/benchmarks/tts/plot_results.py b/benchmarks/tts/plot_results.py new file mode 100644 index 00000000000..f19c613209a --- /dev/null +++ b/benchmarks/tts/plot_results.py @@ -0,0 +1,324 @@ +"""Plot universal TTS benchmark results. + +Reads JSON files saved by ``bench_tts.py`` (via ``vllm bench serve --omni``) +and generates comparison bar charts grouped by task type. + +Metrics plotted: +- AUDIO_TTFP (mean audio time-to-first-packet, ms) +- E2EL (mean end-to-end latency, ms) +- Audio RTF (mean real-time factor) +- Audio throughput (audio-seconds / wall-second) + +Quality metrics (WER / SIM / UTMOS) are printed in a table when present. + +Usage:: + + # Single run — one JSON per task, all in results/ + python benchmarks/tts/plot_results.py \\ + --results results/bench_tts_*.json \\ + --output results/tts_benchmark.png + + # Compare two runs (e.g. 
async_chunk on vs off) + python benchmarks/tts/plot_results.py \\ + --results run_a/bench_tts_*.json \\ + --results run_b/bench_tts_*.json \\ + --labels "async_chunk_on" "async_chunk_off" \\ + --output results/comparison.png +""" + +from __future__ import annotations + +import argparse +import json +import math +from pathlib import Path + +import matplotlib.pyplot as plt +import numpy as np + +# --------------------------------------------------------------------------- +# JSON loading +# --------------------------------------------------------------------------- + + +def load_run(paths: list[str]) -> list[dict]: + """Load and merge all JSON files for one run into a flat list of records. + + Each record is expected to have at least ``_concurrency`` (int) and + ``_task`` (str) keys injected by ``bench_tts.py``. Records that come + from a file that contains a list are flattened. + """ + records: list[dict] = [] + for p in paths: + raw = json.loads(Path(p).read_text(encoding="utf-8")) + if isinstance(raw, list): + records.extend(raw) + elif isinstance(raw, dict): + records.append(raw) + return records + + +def _get(record: dict, key: str) -> float: + v = record.get(key, float("nan")) + if v is None or (isinstance(v, float) and math.isnan(v)): + return float("nan") + try: + return float(v) + except (TypeError, ValueError): + return float("nan") + + +# --------------------------------------------------------------------------- +# Plotting helpers +# --------------------------------------------------------------------------- + + +def _bar_group( + ax: plt.Axes, + x: np.ndarray, + data_per_label: dict[str, list[float]], + width: float, + colors: list[str], + ylabel: str, + title: str, + concurrency_labels: list[str], + fmt: str = ".1f", +) -> None: + n = len(data_per_label) + offsets = np.linspace(-(n - 1) * width / 2, (n - 1) * width / 2, n) if n > 1 else [0.0] + + for i, (label, values) in enumerate(data_per_label.items()): + plot_vals = [0.0 if math.isnan(v) else v for 
COLORS = ["#2196F3", "#FF5722", "#4CAF50", "#FFC107", "#9C27B0"]


# ---------------------------------------------------------------------------
# Comparison plot (multiple labels / runs)
# ---------------------------------------------------------------------------


def plot_comparison(
    all_runs: list[list[dict]],
    labels: list[str],
    output_path: str,
    task_filter: str | None = None,
    title_prefix: str = "TTS",
) -> None:
    """Draw one row of four metric panels per task and save the figure.

    The four panels are TTFP, end-to-end latency, RTF and audio throughput.
    Within a panel, bars are grouped by concurrency with one bar per run.

    Args:
        all_runs: One list of result records per run.
        labels: Legend label for each run, paired with *all_runs* by position.
        output_path: Image file to write; missing parent directories are
            created.
        task_filter: When set, only the task with exactly this name is drawn.
        title_prefix: Prefix of the figure title.
    """
    # Determine tasks to plot, in first-seen order.  Records without a
    # ``_task`` key are grouped under "unknown".
    tasks: list[str] = []
    for run in all_runs:
        for r in run:
            t = r.get("_task", "unknown")
            if t not in tasks:
                tasks.append(t)
    if task_filter:
        tasks = [t for t in tasks if t == task_filter]

    n_tasks = len(tasks)
    if n_tasks == 0:
        print("[plot_results] No tasks found in data.")
        return

    # (record key, y-axis label, panel subtitle, value format)
    metrics = [
        ("mean_audio_ttfp_ms", "TTFP (ms)", "Time-to-First-Packet", ".0f"),
        ("mean_e2el_ms", "E2E Latency (ms)", "End-to-End Latency", ".0f"),
        ("mean_audio_rtf", "RTF", "Real-Time Factor (RTF)", ".3f"),
        ("audio_throughput", "audio-s / wall-s", "Audio Throughput", ".2f"),
    ]

    # squeeze=False keeps the axes array 2-D even when there is a single task.
    fig, axes_grid = plt.subplots(n_tasks, len(metrics), figsize=(18, 4.5 * n_tasks), squeeze=False)
    fig.suptitle(f"{title_prefix} Benchmark", fontsize=15, fontweight="bold")

    for row_idx, task in enumerate(tasks):
        # Index each run's records for this task by concurrency.  The same
        # "unknown" default as above is used so those records are actually
        # found, and a null ``_concurrency`` is skipped instead of crashing
        # in ``int()``.
        conc_maps = [
            {
                int(r["_concurrency"]): r
                for r in run
                if r.get("_task", "unknown") == task and r.get("_concurrency") is not None
            }
            for run in all_runs
        ]
        concurrencies = sorted(set().union(*conc_maps))
        x = np.arange(len(concurrencies))
        conc_labels = [str(c) for c in concurrencies]

        axes_row = axes_grid[row_idx]
        for col_idx, (key, ylabel, subtitle, fmt) in enumerate(metrics):
            # A run with no record at some concurrency yields NaN there.
            data_per_label = {
                lbl: [_get(conc_map.get(c, {}), key) for c in concurrencies]
                for lbl, conc_map in zip(labels, conc_maps)
            }
            _bar_group(
                axes_row[col_idx],
                x,
                data_per_label,
                width=0.3 if len(labels) > 1 else 0.5,
                colors=COLORS,
                ylabel=ylabel,
                title=f"{task} — {subtitle}",
                concurrency_labels=conc_labels,
                fmt=fmt,
            )

    fig.tight_layout()
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(output_path, dpi=150, bbox_inches="tight")
    print(f"Plot saved to {output_path}")
    plt.close(fig)
def print_comparison_table(all_runs: list[list[dict]], labels: list[str]) -> None:
    """Print a Markdown comparison table per task to stdout.

    For every task a ``## <task>`` section is printed with one row per
    (metric, concurrency) pair and one value column per run; a missing or
    non-numeric value is shown as ``n/a``.  When exactly two runs are given,
    an "Improvement" table follows for the performance metrics.  Its
    percentage is relative to the second run and is positive when the first
    run is better: lower for TTFP, E2E and RTF, higher for throughput.

    Args:
        all_runs: One list of result records per run.
        labels: Column label for each run, paired with *all_runs* by position.
    """
    # Tasks in first-seen order; records without ``_task`` go under "unknown".
    tasks: list[str] = []
    for run in all_runs:
        for r in run:
            t = r.get("_task", "unknown")
            if t not in tasks:
                tasks.append(t)

    # (row label, record key, value format)
    perf_metrics = [
        ("TTFP (ms)", "mean_audio_ttfp_ms", ".1f"),
        ("E2E (ms)", "mean_e2el_ms", ".1f"),
        ("RTF", "mean_audio_rtf", ".3f"),
        ("Throughput (a-s/s)", "audio_throughput", ".2f"),
    ]
    quality_metrics = [
        ("WER (%)", "seed_tts_mean_wer", ".1f"),
        ("SIM", "seed_tts_mean_sim", ".3f"),
        ("UTMOS", "seed_tts_mean_utmos", ".2f"),
    ]
    # Keys for which a larger value is the better one.
    higher_is_better = {"audio_throughput"}

    for task in tasks:
        # Build the per-run concurrency index once per task.  The "unknown"
        # default matches the task list above, and a null ``_concurrency`` is
        # skipped instead of crashing in ``int()``.
        conc_maps = [
            {
                int(r["_concurrency"]): r
                for r in run
                if r.get("_task", "unknown") == task and r.get("_concurrency") is not None
            }
            for run in all_runs
        ]
        concurrencies = sorted(set().union(*conc_maps))

        print(f"\n## {task}\n")
        col_header = "| Metric | Concurrency |" + "".join(f" {lbl} |" for lbl in labels)
        sep = "| --- | --- |" + " --- |" * len(labels)
        print(col_header)
        print(sep)

        for metric, key, fmt in perf_metrics + quality_metrics:
            for c in concurrencies:
                row = f"| {metric} | {c} |"
                for conc_map in conc_maps:
                    val = _get(conc_map.get(c, {}), key)
                    row += f" {val:{fmt}} |" if not math.isnan(val) else " n/a |"
                print(row)

        # Improvement column (2-run comparison only)
        if len(all_runs) == 2:
            print(f"\n### Improvement ({labels[0]} vs {labels[1]})\n")
            print("| Metric | Concurrency | Change |")
            print("| --- | --- | --- |")
            first_map, second_map = conc_maps
            for metric, key, _ in perf_metrics:
                for c in concurrencies:
                    v0 = _get(first_map.get(c, {}), key)
                    v1 = _get(second_map.get(c, {}), key)
                    # Rows without both values, or with a non-positive
                    # baseline, are left out.
                    if math.isnan(v0) or math.isnan(v1) or v1 <= 0:
                        continue
                    gain = (v0 - v1) if key in higher_is_better else (v1 - v0)
                    pct = gain / v1 * 100
                    print(f"| {metric} | {c} | {pct:+.1f}% |")
def parse_args() -> argparse.Namespace:
    """Parse the command line of the plotting script.

    ``--results`` may be repeated; each occurrence takes one or more files
    and forms one run, so ``args.results`` is a list of lists.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--results",
        type=str,
        nargs="+",
        action="append",
        required=True,
        metavar="FILE",
        help="JSON result file(s) for one run. Repeat --results for multiple runs to compare.",
    )
    parser.add_argument(
        "--labels",
        type=str,
        nargs="+",
        default=None,
        help="Label for each --results group (must match the number of --results groups).",
    )
    parser.add_argument("--output", type=str, default="results/tts_benchmark.png", help="Output image path.")
    parser.add_argument("--title", type=str, default="TTS", help="Title prefix for the plot.")
    parser.add_argument("--task", type=str, default=None, help="Filter to a single task (e.g. voice_clone).")
    return parser.parse_args()


def main() -> None:
    """Load every run, then print the comparison table and write the plot.

    Raises:
        SystemExit: If ``--labels`` is given with a count that differs from
            the number of ``--results`` groups.
    """
    args = parse_args()

    # args.results is a list-of-lists due to action="append"
    all_runs: list[list[dict]] = [load_run(group) for group in args.results]
    n_runs = len(all_runs)

    labels: list[str]
    if args.labels:
        if len(args.labels) != n_runs:
            raise SystemExit(f"--labels count ({len(args.labels)}) must match --results groups ({n_runs})")
        labels = args.labels
    else:
        labels = [f"run{i + 1}" for i in range(n_runs)]

    if args.task:
        # Apply --task to the records themselves so the Markdown table is
        # filtered the same way as the plot.  Records without ``_task`` count
        # as "unknown", matching how the table and plot name them.
        all_runs = [[r for r in run if r.get("_task", "unknown") == args.task] for run in all_runs]

    print_comparison_table(all_runs, labels)
    plot_comparison(all_runs, labels, args.output, task_filter=args.task, title_prefix=args.title)


if __name__ == "__main__":
    main()
/path/to/voxcpm-model \ - --stage-configs-path vllm_omni/model_executor/stage_configs/voxcpm.yaml \ - --text "This is a split-stage VoxCPM synthesis example running on vLLM Omni." \ - --warmup-runs 1 \ - --output-dir benchmarks/voxcpm/results/offline_single -``` - -Streaming offline benchmark: - -```bash -python benchmarks/voxcpm/vllm_omni/bench_tts_offline.py \ - --model /path/to/voxcpm-model \ - --stage-configs-path vllm_omni/model_executor/stage_configs/voxcpm_async_chunk.yaml \ - --text "This is a split-stage VoxCPM streaming example running on vLLM Omni." \ - --warmup-runs 1 \ - --output-dir benchmarks/voxcpm/results/offline_streaming -``` - -Full fixed offline matrix, equivalent to the old `examples/offline_inference/voxcpm/test.py`: - -```bash -python benchmarks/voxcpm/vllm_omni/run_offline_matrix.py \ - --model /path/to/voxcpm-model \ - --ref-audio /path/to/reference.wav \ - --ref-text "The exact transcript spoken in reference.wav." \ - --output-root benchmarks/voxcpm/results/offline_matrix -``` - -The full matrix covers both routes: - -- streaming: `voxcpm_async_chunk.yaml` -- sync: `voxcpm.yaml` - -And these six scenarios under each route: - -- warmup + single TTS -- warmup + single voice cloning -- warmup + batch TTS -- warmup + batch voice cloning -- cold single TTS -- cold single voice cloning - -`bench_tts_offline.py` itself no longer writes `summary.json` / `results.json`; it prints TTFP / RTF inline and saves generated WAV files only. The matrix runner keeps only per-case `run.log`. 
- -## Start the Server - -Async-chunk: - -```bash -vllm serve /path/to/voxcpm-model \ - --stage-configs-path vllm_omni/model_executor/stage_configs/voxcpm_async_chunk.yaml \ - --trust-remote-code \ - --enforce-eager \ - --omni \ - --port 8091 -``` - -Non-streaming: - -```bash -vllm serve /path/to/voxcpm-model \ - --stage-configs-path vllm_omni/model_executor/stage_configs/voxcpm.yaml \ - --trust-remote-code \ - --enforce-eager \ - --omni \ - --port 8091 -``` - -## Run the Benchmark - -```bash -python benchmarks/voxcpm/vllm_omni/bench_tts_serve.py \ - --host 127.0.0.1 \ - --port 8091 \ - --num-prompts 20 \ - --max-concurrency 1 \ - --result-dir /tmp/voxcpm_bench -``` - -Voice cloning benchmark: - -```bash -python benchmarks/voxcpm/vllm_omni/bench_tts_serve.py \ - --host 127.0.0.1 \ - --port 8091 \ - --num-prompts 10 \ - --max-concurrency 1 \ - --ref-audio https://example.com/reference.wav \ - --ref-text "The exact transcript spoken in the reference audio." \ - --result-dir /tmp/voxcpm_clone_bench -``` - -## Notes - -- The benchmark uses `stream=true` and `response_format=pcm` so TTFP is measured from the first audio packet. -- `RTF < 1.0` means the server generates audio faster than real time. -- For `voxcpm_async_chunk.yaml`, keep concurrency at `1`. This matches native VoxCPM streaming more closely. -- Do not benchmark concurrent online streaming on `voxcpm_async_chunk.yaml`; use `voxcpm.yaml` for multi-request throughput runs. -- For the offline matrix mode, `--ref-audio` and `--ref-text` are required because clone cases are part of the fixed coverage set. diff --git a/benchmarks/voxcpm/vllm_omni/bench_tts_serve.py b/benchmarks/voxcpm/vllm_omni/bench_tts_serve.py deleted file mode 100644 index 816df32796d..00000000000 --- a/benchmarks/voxcpm/vllm_omni/bench_tts_serve.py +++ /dev/null @@ -1,283 +0,0 @@ -"""Benchmark VoxCPM via /v1/audio/speech. - -Reports TTFP (time to first packet), E2E latency, and RTF (real-time factor). 
-""" - -from __future__ import annotations - -import argparse -import asyncio -import json -import time -from dataclasses import asdict, dataclass, field -from datetime import datetime -from pathlib import Path - -import aiohttp -import numpy as np -from tqdm.asyncio import tqdm - -DEFAULT_MODEL = "OpenBMB/VoxCPM1.5" -DEFAULT_SAMPLE_RATE = 24000 -PROMPTS = [ - "Hello, welcome to the VoxCPM speech benchmark.", - "This is a short benchmark prompt for online text-to-speech generation.", - "The quick brown fox jumps over the lazy dog near the riverbank.", - "Please remember to bring your identification documents tomorrow morning.", - "Learning a new language takes patience, practice, and curiosity.", - "This benchmark reports TTFP and RTF for the VoxCPM online serving path.", -] - - -@dataclass -class RequestResult: - success: bool = False - ttfp: float = 0.0 - e2e: float = 0.0 - audio_bytes: int = 0 - audio_duration: float = 0.0 - rtf: float = 0.0 - prompt: str = "" - error: str = "" - - -@dataclass -class BenchmarkResult: - concurrency: int = 0 - num_prompts: int = 0 - completed: int = 0 - failed: int = 0 - duration_s: float = 0.0 - mean_ttfp_ms: float = 0.0 - median_ttfp_ms: float = 0.0 - p95_ttfp_ms: float = 0.0 - mean_e2e_ms: float = 0.0 - median_e2e_ms: float = 0.0 - p95_e2e_ms: float = 0.0 - mean_rtf: float = 0.0 - median_rtf: float = 0.0 - p95_rtf: float = 0.0 - total_audio_duration_s: float = 0.0 - request_throughput: float = 0.0 - per_request: list[dict[str, float | str]] = field(default_factory=list) - - -def pcm_bytes_to_duration(num_bytes: int, sample_rate: int = DEFAULT_SAMPLE_RATE, sample_width: int = 2) -> float: - num_samples = num_bytes / sample_width - return num_samples / sample_rate - - -async def send_tts_request( - session: aiohttp.ClientSession, - api_url: str, - *, - model: str, - prompt: str, - ref_audio: str | None, - ref_text: str | None, - pbar: tqdm | None = None, -) -> RequestResult: - payload: dict[str, object] = { - "model": model, - 
"input": prompt, - "stream": True, - "response_format": "pcm", - } - if ref_audio is not None: - payload["ref_audio"] = ref_audio - if ref_text is not None: - payload["ref_text"] = ref_text - - result = RequestResult(prompt=prompt) - started_at = time.perf_counter() - - try: - async with session.post(api_url, json=payload) as response: - if response.status != 200: - result.error = f"HTTP {response.status}: {await response.text()}" - return result - - first_chunk = True - total_bytes = 0 - async for chunk in response.content.iter_any(): - if not chunk: - continue - if first_chunk: - result.ttfp = time.perf_counter() - started_at - first_chunk = False - total_bytes += len(chunk) - - result.e2e = time.perf_counter() - started_at - result.audio_bytes = total_bytes - result.audio_duration = pcm_bytes_to_duration(total_bytes) - if result.audio_duration > 0: - result.rtf = result.e2e / result.audio_duration - result.success = True - except Exception as e: - result.error = str(e) - result.e2e = time.perf_counter() - started_at - - if pbar is not None: - pbar.update(1) - return result - - -async def run_benchmark( - *, - host: str, - port: int, - model: str, - num_prompts: int, - max_concurrency: int, - num_warmups: int, - ref_audio: str | None, - ref_text: str | None, -) -> BenchmarkResult: - api_url = f"http://{host}:{port}/v1/audio/speech" - connector = aiohttp.TCPConnector(limit=max_concurrency, limit_per_host=max_concurrency, keepalive_timeout=60) - timeout = aiohttp.ClientTimeout(total=600) - - async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session: - if num_warmups > 0: - print(f" Warming up with {num_warmups} requests...") - warmup_tasks = [ - send_tts_request( - session, - api_url, - model=model, - prompt=PROMPTS[i % len(PROMPTS)], - ref_audio=ref_audio, - ref_text=ref_text, - ) - for i in range(num_warmups) - ] - await asyncio.gather(*warmup_tasks) - print(" Warmup done.") - - request_prompts = [PROMPTS[i % len(PROMPTS)] for i in 
range(num_prompts)] - semaphore = asyncio.Semaphore(max_concurrency) - pbar = tqdm(total=num_prompts, desc=f" concurrency={max_concurrency}") - - async def limited_request(prompt: str) -> RequestResult: - async with semaphore: - return await send_tts_request( - session, - api_url, - model=model, - prompt=prompt, - ref_audio=ref_audio, - ref_text=ref_text, - pbar=pbar, - ) - - started_at = time.perf_counter() - results = await asyncio.gather(*[asyncio.create_task(limited_request(prompt)) for prompt in request_prompts]) - duration = time.perf_counter() - started_at - pbar.close() - - succeeded = [result for result in results if result.success] - bench = BenchmarkResult( - concurrency=max_concurrency, - num_prompts=num_prompts, - completed=len(succeeded), - failed=len(results) - len(succeeded), - duration_s=duration, - ) - - if not succeeded: - return bench - - ttfps = np.array([result.ttfp * 1000 for result in succeeded], dtype=np.float64) - e2es = np.array([result.e2e * 1000 for result in succeeded], dtype=np.float64) - rtfs = np.array([result.rtf for result in succeeded], dtype=np.float64) - audio_durations = np.array([result.audio_duration for result in succeeded], dtype=np.float64) - - bench.mean_ttfp_ms = float(np.mean(ttfps)) - bench.median_ttfp_ms = float(np.median(ttfps)) - bench.p95_ttfp_ms = float(np.percentile(ttfps, 95)) - bench.mean_e2e_ms = float(np.mean(e2es)) - bench.median_e2e_ms = float(np.median(e2es)) - bench.p95_e2e_ms = float(np.percentile(e2es, 95)) - bench.mean_rtf = float(np.mean(rtfs)) - bench.median_rtf = float(np.median(rtfs)) - bench.p95_rtf = float(np.percentile(rtfs, 95)) - bench.total_audio_duration_s = float(np.sum(audio_durations)) - bench.request_throughput = len(succeeded) / duration if duration > 0 else 0.0 - bench.per_request = [ - { - "prompt": result.prompt, - "ttfp_ms": result.ttfp * 1000, - "e2e_ms": result.e2e * 1000, - "rtf": result.rtf, - "audio_duration_s": result.audio_duration, - } - for result in succeeded - ] - - 
return bench - - -def print_summary(result: BenchmarkResult) -> None: - width = 54 - print("") - print("=" * width) - print(f"{'VoxCPM Serving Benchmark':^{width}}") - print("=" * width) - print(f"concurrency : {result.concurrency}") - print(f"requests : {result.completed}/{result.num_prompts} succeeded") - print(f"wall time (s) : {result.duration_s:.3f}") - print(f"mean TTFP (ms) : {result.mean_ttfp_ms:.2f}") - print(f"p95 TTFP (ms) : {result.p95_ttfp_ms:.2f}") - print(f"mean E2E (ms) : {result.mean_e2e_ms:.2f}") - print(f"p95 E2E (ms) : {result.p95_e2e_ms:.2f}") - print(f"mean RTF : {result.mean_rtf:.3f}") - print(f"p95 RTF : {result.p95_rtf:.3f}") - print(f"request throughput : {result.request_throughput:.2f} req/s") - print("=" * width) - - -async def main_async(args) -> None: - result_dir = Path(args.result_dir) - result_dir.mkdir(parents=True, exist_ok=True) - - all_results: list[BenchmarkResult] = [] - for concurrency in args.max_concurrency: - result = await run_benchmark( - host=args.host, - port=args.port, - model=args.model, - num_prompts=args.num_prompts, - max_concurrency=concurrency, - num_warmups=args.num_warmups, - ref_audio=args.ref_audio, - ref_text=args.ref_text, - ) - print_summary(result) - all_results.append(result) - - payload = { - "model": args.model, - "created_at": datetime.utcnow().isoformat() + "Z", - "results": [asdict(result) for result in all_results], - } - result_path = result_dir / "bench_tts_serve.json" - result_path.write_text(json.dumps(payload, indent=2), encoding="utf-8") - print(f"Saved results to: {result_path}") - - -def parse_args(): - parser = argparse.ArgumentParser(description="Benchmark VoxCPM via /v1/audio/speech") - parser.add_argument("--host", default="127.0.0.1", help="Server host") - parser.add_argument("--port", type=int, default=8091, help="Server port") - parser.add_argument("--model", default=DEFAULT_MODEL, help="Model name or path") - parser.add_argument("--num-prompts", type=int, default=20, help="Number 
of prompts to send") - parser.add_argument("--max-concurrency", type=int, nargs="+", default=[1], help="Concurrency levels to benchmark") - parser.add_argument("--num-warmups", type=int, default=3, help="Warmup request count") - parser.add_argument("--ref-audio", default=None, help="Reference audio URL or data URL for voice cloning") - parser.add_argument("--ref-text", default=None, help="Reference audio transcript for voice cloning") - parser.add_argument("--result-dir", default="results", help="Directory to save benchmark JSON") - return parser.parse_args() - - -if __name__ == "__main__": - asyncio.run(main_async(parse_args())) diff --git a/benchmarks/voxcpm/vllm_omni/run_offline_matrix.py b/benchmarks/voxcpm/vllm_omni/run_offline_matrix.py deleted file mode 100644 index cee46c0f867..00000000000 --- a/benchmarks/voxcpm/vllm_omni/run_offline_matrix.py +++ /dev/null @@ -1,303 +0,0 @@ -"""Run the full offline VoxCPM smoke matrix. - -This script keeps the old `test.py` coverage, but delegates each case to -`bench_tts_offline.py` so the benchmark runner itself stays focused on a -single execution path. -""" - -from __future__ import annotations - -import shlex -import subprocess -import sys -import time -from dataclasses import dataclass -from pathlib import Path - -from vllm.utils.argparse_utils import FlexibleArgumentParser - -REPO_ROOT = Path(__file__).resolve().parents[3] -BENCH_SCRIPT = Path(__file__).with_name("bench_tts_offline.py") -DEFAULT_STAGE_ASYNC = REPO_ROOT / "vllm_omni" / "model_executor" / "stage_configs" / "voxcpm_async_chunk.yaml" -DEFAULT_STAGE_SYNC = REPO_ROOT / "vllm_omni" / "model_executor" / "stage_configs" / "voxcpm.yaml" -DEFAULT_OUTPUT_ROOT = BENCH_SCRIPT.parents[1] / "results" / "offline_matrix" - -SINGLE_TTS_TEXT = "This is a single text-to-speech smoke test for VoxCPM on vLLM Omni." -SINGLE_CLONE_TEXT = "This sentence is synthesized with the cloned voice for validation." 
-BATCH_TTS_TEXTS = [ - "The first batch text-to-speech sample validates sequential batch execution.", - "The second batch text-to-speech sample checks another prompt in the same file.", - "The third batch text-to-speech sample completes the sequential batch path.", -] -BATCH_CLONE_TEXTS = [ - "The first cloned sample validates sequential batch voice cloning.", - "The second cloned sample checks the same reference voice on another prompt.", - "The third cloned sample finishes the shared-reference clone batch path.", -] - - -@dataclass(frozen=True, slots=True) -class ModeSpec: - name: str - stage_config: Path - - -@dataclass(frozen=True, slots=True) -class CaseSpec: - name: str - warmup_runs: int - prompt_kind: str - voice_clone: bool - - -@dataclass(frozen=True, slots=True) -class CaseResult: - mode: str - case: str - returncode: int - elapsed_s: float - output_dir: Path - log_path: Path - - @property - def ok(self) -> bool: - return self.returncode == 0 - - -MODE_SPECS = [ - ModeSpec(name="streaming", stage_config=DEFAULT_STAGE_ASYNC), - ModeSpec(name="sync", stage_config=DEFAULT_STAGE_SYNC), -] - -CASE_SPECS = [ - CaseSpec(name="warmup_single_tts", warmup_runs=1, prompt_kind="single", voice_clone=False), - CaseSpec(name="warmup_single_clone", warmup_runs=1, prompt_kind="single", voice_clone=True), - CaseSpec(name="warmup_batch_tts", warmup_runs=1, prompt_kind="batch", voice_clone=False), - CaseSpec(name="warmup_batch_clone", warmup_runs=1, prompt_kind="batch", voice_clone=True), - CaseSpec(name="cold_single_tts", warmup_runs=0, prompt_kind="single", voice_clone=False), - CaseSpec(name="cold_single_clone", warmup_runs=0, prompt_kind="single", voice_clone=True), -] - - -def _write_lines(path: Path, lines: list[str]) -> None: - path.parent.mkdir(parents=True, exist_ok=True) - path.write_text("\n".join(lines) + "\n", encoding="utf-8") - - -def _prepare_batch_inputs(output_root: Path) -> tuple[Path, Path]: - input_dir = output_root / "inputs" - batch_tts_path = 
input_dir / "batch_tts_prompts.txt" - batch_clone_path = input_dir / "batch_clone_prompts.txt" - _write_lines(batch_tts_path, BATCH_TTS_TEXTS) - _write_lines(batch_clone_path, BATCH_CLONE_TEXTS) - return batch_tts_path, batch_clone_path - - -def _base_command(args, mode: ModeSpec, output_dir: Path) -> list[str]: - cmd = [ - args.python, - str(BENCH_SCRIPT), - "--model", - args.model, - "--stage-configs-path", - str(mode.stage_config), - "--output-dir", - str(output_dir), - "--num-runs", - str(args.num_runs), - "--stage-init-timeout", - str(args.stage_init_timeout), - ] - cmd.append("--log-stats" if args.log_stats else "--no-log-stats") - cmd.extend(["--cfg-value", str(args.cfg_value)]) - cmd.extend(["--inference-timesteps", str(args.inference_timesteps)]) - cmd.extend(["--min-len", str(args.min_len)]) - cmd.extend(["--max-new-tokens", str(args.max_new_tokens)]) - if args.streaming_prefix_len is not None: - cmd.extend(["--streaming-prefix-len", str(args.streaming_prefix_len)]) - if args.enable_profiler: - profiler_dir = Path(args.profiler_dir) if args.profiler_dir is not None else (output_dir / "profiler") - cmd.append("--enable-profiler") - cmd.extend(["--profiler-dir", str(profiler_dir)]) - cmd.extend(["--profiler-wait-seconds", str(args.profiler_wait_seconds)]) - if args.profiler_stages is not None: - cmd.append("--profiler-stages") - cmd.extend(str(stage_id) for stage_id in args.profiler_stages) - return cmd - - -def _build_case_command( - args, - mode: ModeSpec, - case: CaseSpec, - *, - batch_tts_path: Path, - batch_clone_path: Path, - output_dir: Path, -) -> list[str]: - cmd = _base_command(args, mode, output_dir) - cmd.extend(["--warmup-runs", str(case.warmup_runs)]) - if case.prompt_kind == "single": - cmd.extend(["--text", SINGLE_CLONE_TEXT if case.voice_clone else SINGLE_TTS_TEXT]) - else: - cmd.extend(["--txt-prompts", str(batch_clone_path if case.voice_clone else batch_tts_path)]) - if case.voice_clone: - cmd.extend(["--ref-audio", args.ref_audio, 
"--ref-text", args.ref_text]) - return cmd - - -def _run_case( - args, - mode: ModeSpec, - case: CaseSpec, - *, - batch_tts_path: Path, - batch_clone_path: Path, - output_root: Path, -) -> CaseResult: - case_output_dir = output_root / mode.name / case.name - case_output_dir.mkdir(parents=True, exist_ok=True) - case_log_path = case_output_dir / "run.log" - cmd = _build_case_command( - args, - mode, - case, - batch_tts_path=batch_tts_path, - batch_clone_path=batch_clone_path, - output_dir=case_output_dir, - ) - - print() - print("=" * 80) - print(f"[{mode.name}] {case.name}") - print(f"Output directory: {case_output_dir}") - print(shlex.join(cmd)) - - start = time.perf_counter() - with case_log_path.open("w", encoding="utf-8") as log_fp: - process = subprocess.Popen( - cmd, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - text=True, - bufsize=1, - ) - assert process.stdout is not None - for line in process.stdout: - print(line, end="") - log_fp.write(line) - process.wait() - - elapsed_s = time.perf_counter() - start - status = "PASS" if (process.returncode or 0) == 0 else f"FAIL({process.returncode})" - print(f"[{mode.name}] {case.name} -> {status} ({elapsed_s:.2f}s)") - return CaseResult( - mode=mode.name, - case=case.name, - returncode=int(process.returncode or 0), - elapsed_s=elapsed_s, - output_dir=case_output_dir, - log_path=case_log_path, - ) - - -def parse_args(): - parser = FlexibleArgumentParser(description="Run the full offline VoxCPM smoke matrix.") - parser.add_argument("--model", type=str, required=True, help="Local VoxCPM model directory.") - parser.add_argument("--ref-audio", type=str, required=True, help="Reference audio path for clone cases.") - parser.add_argument("--ref-text", type=str, required=True, help="Exact transcript spoken in --ref-audio.") - parser.add_argument("--output-root", type=str, default=str(DEFAULT_OUTPUT_ROOT), help="Root directory for outputs.") - parser.add_argument("--python", type=str, default=sys.executable, 
help="Python executable used to launch cases.") - parser.add_argument("--stage-init-timeout", type=int, default=600, help="Stage initialization timeout in seconds.") - parser.add_argument("--log-stats", dest="log_stats", action="store_true", help="Enable vLLM Omni stats logging.") - parser.add_argument( - "--no-log-stats", - dest="log_stats", - action="store_false", - help="Disable vLLM Omni stats logging.", - ) - parser.set_defaults(log_stats=True) - parser.add_argument("--num-runs", type=int, default=1, help="Number of measured runs per case.") - parser.add_argument("--cfg-value", type=float, default=2.0, help="Classifier-free guidance value for VoxCPM.") - parser.add_argument("--inference-timesteps", type=int, default=10, help="Number of inference timesteps.") - parser.add_argument("--min-len", type=int, default=2, help="Minimum generated token length.") - parser.add_argument("--max-new-tokens", type=int, default=4096, help="Maximum generated token length.") - parser.add_argument( - "--streaming-prefix-len", - type=int, - default=None, - help="Optional VoxCPM streaming window passed to streaming cases.", - ) - parser.add_argument("--enable-profiler", action="store_true", help="Enable torch profiler for each case.") - parser.add_argument( - "--profiler-dir", - type=str, - default=None, - help="Profiler output root. Defaults to /profiler.", - ) - parser.add_argument( - "--profiler-stages", - type=int, - nargs="*", - default=None, - help="Optional stage ids to profile. 
Defaults to all configured stages.", - ) - parser.add_argument( - "--profiler-wait-seconds", - type=float, - default=30.0, - help="Seconds to wait after stopping profiler for traces to flush.", - ) - args = parser.parse_args() - if args.num_runs < 1: - parser.error("--num-runs must be >= 1") - return args - - -def main(args) -> int: - output_root = Path(args.output_root) - output_root.mkdir(parents=True, exist_ok=True) - batch_tts_path, batch_clone_path = _prepare_batch_inputs(output_root) - - print(f"Model: {args.model}") - print(f"Reference audio: {args.ref_audio}") - print(f"Reference text: {args.ref_text}") - print(f"Python: {args.python}") - print(f"Output root: {output_root}") - print(f"Cases: {len(MODE_SPECS) * len(CASE_SPECS)}") - - results: list[CaseResult] = [] - for mode in MODE_SPECS: - for case in CASE_SPECS: - results.append( - _run_case( - args, - mode, - case, - batch_tts_path=batch_tts_path, - batch_clone_path=batch_clone_path, - output_root=output_root, - ) - ) - - failed = [result for result in results if not result.ok] - print() - print("=" * 80) - print("Summary:") - for result in results: - status = "PASS" if result.ok else f"FAIL({result.returncode})" - print(f"- [{result.mode}] {result.case}: {status} ({result.elapsed_s:.2f}s)") - print(f" output_dir={result.output_dir}") - print(f" log={result.log_path}") - - print(f"Passed: {len(results) - len(failed)}/{len(results)}") - if failed: - print("Failed cases:") - for result in failed: - print(f"- [{result.mode}] {result.case}: see {result.log_path}") - return 1 - return 0 - - -if __name__ == "__main__": - raise SystemExit(main(parse_args())) diff --git a/tests/benchmarks/conftest.py b/tests/benchmarks/conftest.py new file mode 100644 index 00000000000..7af6c3f8cb8 --- /dev/null +++ b/tests/benchmarks/conftest.py @@ -0,0 +1,103 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +"""conftest.py for benchmarks unit tests. 
+ +Installs lightweight mock stubs for ``vllm`` (and sub-packages) so the +data-module unit tests can run without a full vLLM installation. Only the +symbols actually imported by +``vllm_omni.benchmarks.data_modules.seed_tts_dataset`` are emulated. +""" + +from __future__ import annotations + +import sys +import types +from dataclasses import dataclass +from typing import Any + + +def _install_vllm_stubs() -> None: + """Register minimal vllm stubs in sys.modules. + + Only installs when real vllm is unavailable. We actively probe the + import because an empty or partial vllm may not yet have imported + the submodules we rely on, and unconditionally registering stubs + would shadow the real package for sibling tests (e.g. + ``tests/benchmarks/metrics/test_metrics.py`` needs the real + ``vllm.benchmarks.serve``). + """ + try: + import vllm.benchmarks.datasets # noqa: F401 + import vllm.tokenizers # noqa: F401 + except ImportError: + pass + else: + return # real vllm available — do not shadow it + if "vllm.benchmarks.datasets" in sys.modules: + return + + # ------------------------------------------------------------------ # + # vllm.benchmarks.datasets # + # ------------------------------------------------------------------ # + @dataclass + class SampleRequest: + prompt: str = "" + prompt_len: int = 0 + expected_output_len: int = 0 + multi_modal_data: Any = None + request_id: str = "" + + class BenchmarkDataset: + def __init__( + self, + dataset_path: str = "", + random_seed: int = 0, + disable_shuffle: bool = False, + **kwargs: Any, + ) -> None: + self.dataset_path = dataset_path + self.random_seed = random_seed + self.disable_shuffle = disable_shuffle + + def maybe_oversample_requests( + self, + out: list, + num_requests: int, + request_id_prefix: str, + no_oversample: bool, + ) -> None: + pass + + # ------------------------------------------------------------------ # + # vllm.tokenizers / vllm.tokenizers.hf # + # 
------------------------------------------------------------------ # + class TokenizerLike: + pass + + def get_cached_tokenizer(t: Any) -> Any: + return t + + # ------------------------------------------------------------------ # + # Wire up sys.modules # + # ------------------------------------------------------------------ # + vllm_mod = types.ModuleType("vllm") + vllm_benchmarks = types.ModuleType("vllm.benchmarks") + vllm_benchmarks_datasets = types.ModuleType("vllm.benchmarks.datasets") + vllm_tokenizers = types.ModuleType("vllm.tokenizers") + vllm_tokenizers_hf = types.ModuleType("vllm.tokenizers.hf") + + vllm_benchmarks_datasets.BenchmarkDataset = BenchmarkDataset # type: ignore[attr-defined] + vllm_benchmarks_datasets.SampleRequest = SampleRequest # type: ignore[attr-defined] + vllm_tokenizers.TokenizerLike = TokenizerLike # type: ignore[attr-defined] + vllm_tokenizers_hf.get_cached_tokenizer = get_cached_tokenizer # type: ignore[attr-defined] + + sys.modules["vllm"] = vllm_mod + sys.modules["vllm.benchmarks"] = vllm_benchmarks + sys.modules["vllm.benchmarks.datasets"] = vllm_benchmarks_datasets + sys.modules["vllm.tokenizers"] = vllm_tokenizers + sys.modules["vllm.tokenizers.hf"] = vllm_tokenizers_hf + + +# Install stubs immediately at collection time (before any test import). 
+_install_vllm_stubs() diff --git a/tests/benchmarks/test_bench_tts_cli.py b/tests/benchmarks/test_bench_tts_cli.py new file mode 100644 index 00000000000..b8a487f80c6 --- /dev/null +++ b/tests/benchmarks/test_bench_tts_cli.py @@ -0,0 +1,139 @@ +"""Tests for the universal benchmarks/tts/bench_tts.py CLI.""" + +from __future__ import annotations + +import json +import sys +from pathlib import Path + +import pytest +import yaml + +# Add benchmarks/tts to path for import +sys.path.insert(0, str(Path(__file__).parent.parent.parent / "benchmarks" / "tts")) +import bench_tts + +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + + +@pytest.fixture() +def model_configs_path(tmp_path: Path) -> Path: + cfg = { + "models": { + "test/ModelA": { + "stage_config": "model_a.yaml", + "supported_tasks": ["voice_clone", "default_voice"], + "backend": "openai-audio-speech", + "endpoint": "/v1/audio/speech", + "task_extra_body": { + "voice_clone": {"task_type": "Base"}, + "default_voice": {"voice": "Vivian", "task_type": "CustomVoice"}, + }, + }, + "test/ModelB": { + "stage_config": "model_b.yaml", + "supported_tasks": ["voice_clone"], + "backend": "openai-audio-speech", + "endpoint": "/v1/audio/speech", + "task_extra_body": {"voice_clone": {}}, + }, + } + } + p = tmp_path / "model_configs.yaml" + p.write_text(yaml.dump(cfg), encoding="utf-8") + return p + + +def test_load_model_configs(model_configs_path: Path) -> None: + configs = bench_tts.load_model_configs(model_configs_path) + assert "test/ModelA" in configs + assert "test/ModelB" in configs + assert configs["test/ModelA"]["supported_tasks"] == ["voice_clone", "default_voice"] + + +def test_build_bench_args_voice_clone(model_configs_path: Path) -> None: + configs = bench_tts.load_model_configs(model_configs_path) + cmd = bench_tts.build_bench_args( + host="localhost", + port=8000, + model="test/ModelA", + task="voice_clone", + model_cfg=configs["test/ModelA"], + locale="en", + num_prompts=10, + concurrency=1, + 
dataset_path="/data/seed-tts", + wer_eval=False, + output_dir=None, + result_filename=None, + extra_cli_args=[], + ) + assert "--dataset-name" in cmd + idx = cmd.index("--dataset-name") + assert cmd[idx + 1] == "seed-tts" + assert "--max-concurrency" in cmd + assert "--extra-body" in cmd + extra_body = json.loads(cmd[cmd.index("--extra-body") + 1]) + assert extra_body.get("task_type") == "Base" + + +def test_build_bench_args_default_voice_has_voice_param(model_configs_path: Path) -> None: + configs = bench_tts.load_model_configs(model_configs_path) + cmd = bench_tts.build_bench_args( + host="localhost", + port=8000, + model="test/ModelA", + task="default_voice", + model_cfg=configs["test/ModelA"], + locale="en", + num_prompts=10, + concurrency=1, + dataset_path="/data/seed-tts", + wer_eval=False, + output_dir=None, + result_filename=None, + extra_cli_args=[], + ) + idx = cmd.index("--dataset-name") + assert cmd[idx + 1] == "seed-tts-text" + extra_body = json.loads(cmd[cmd.index("--extra-body") + 1]) + assert extra_body.get("voice") == "Vivian" + + +def test_build_bench_args_wer_eval_adds_flag(model_configs_path: Path) -> None: + configs = bench_tts.load_model_configs(model_configs_path) + cmd = bench_tts.build_bench_args( + host="localhost", + port=8000, + model="test/ModelA", + task="voice_clone", + model_cfg=configs["test/ModelA"], + locale="en", + num_prompts=10, + concurrency=1, + dataset_path="/data/seed-tts", + wer_eval=True, + output_dir=None, + result_filename=None, + extra_cli_args=[], + ) + assert "--seed-tts-wer-eval" in cmd + + +def test_unsupported_task_exits(model_configs_path: Path, capsys: pytest.CaptureFixture, mocker) -> None: + # ModelB does not support voice_design + mocker.patch.object( + sys, + "argv", + [ + "bench_tts.py", + "--model", + "test/ModelB", + "--task", + "voice_design", + "--model-configs", + str(model_configs_path), + ], + ) + with pytest.raises(SystemExit): + bench_tts.main() diff --git 
a/tests/benchmarks/test_seed_tts_dataset_variants.py b/tests/benchmarks/test_seed_tts_dataset_variants.py new file mode 100644 index 00000000000..9f6fa91b539 --- /dev/null +++ b/tests/benchmarks/test_seed_tts_dataset_variants.py @@ -0,0 +1,159 @@ +"""Tests for SeedTTSTextDataset, SeedTTSTextSampleRequest, SeedTTSDesignDataset, +and SeedTTSDesignSampleRequest. + +vllm stubs are installed by tests/benchmarks/conftest.py before collection. +""" + +from __future__ import annotations + +import importlib.util +import sys +from pathlib import Path + +import pytest + +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + +# Load the data module directly (bypasses vllm_omni.__init__ heavy imports). +_REPO_ROOT = Path(__file__).resolve().parents[2] +_MODULE_PATH = _REPO_ROOT / "vllm_omni" / "benchmarks" / "data_modules" / "seed_tts_dataset.py" +_MODULE_NAME = "vllm_omni.benchmarks.data_modules.seed_tts_dataset" + +if _MODULE_NAME not in sys.modules: + _spec = importlib.util.spec_from_file_location(_MODULE_NAME, _MODULE_PATH) + _mod = importlib.util.module_from_spec(_spec) + sys.modules[_MODULE_NAME] = _mod + _spec.loader.exec_module(_mod) + +from vllm_omni.benchmarks.data_modules.seed_tts_dataset import ( # noqa: E402 + SeedTTSDesignDataset, + SeedTTSDesignSampleRequest, + SeedTTSTextDataset, + SeedTTSTextSampleRequest, +) + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + + +@pytest.fixture() +def seed_tts_root(tmp_path: Path) -> Path: + """Minimal seed-tts-style directory with 5 entries.""" + locale_dir = tmp_path / "en" + locale_dir.mkdir() + wav_dir = locale_dir / "prompt-wavs" + wav_dir.mkdir() + for i in range(5): + (wav_dir / f"utt{i:03d}.wav").write_bytes(b"RIFF\x00\x00\x00\x00WAVE") + meta = "\n".join(f"utt{i:03d}|ref text {i}|prompt-wavs/utt{i:03d}.wav|target text {i}" for i in range(5)) + (locale_dir / "meta.lst").write_text(meta, 
encoding="utf-8") + return tmp_path + + +# --------------------------------------------------------------------------- +# Tests +# --------------------------------------------------------------------------- + + +def test_seed_tts_text_dataset_omits_ref_audio(seed_tts_root, mocker): + ds = SeedTTSTextDataset( + dataset_path=str(seed_tts_root), + random_seed=0, + locale="en", + disable_shuffle=True, + ) + tokenizer = mocker.MagicMock() + tokenizer.encode = lambda text, **kw: [0] * len(text.split()) + requests = ds.sample(tokenizer, num_requests=3) + assert len(requests) == 3 + for req in requests: + assert isinstance(req, SeedTTSTextSampleRequest) + assert req.seed_tts_speech_extra is None or "ref_audio" not in (req.seed_tts_speech_extra or {}) + assert req.seed_tts_ref_wav_path == "" + assert "target text" in req.prompt + + +# --------------------------------------------------------------------------- +# SeedTTSDesignDataset tests +# --------------------------------------------------------------------------- + + +@pytest.fixture() +def seed_tts_design_root(tmp_path: Path) -> Path: + """seed-tts-design directory with 5-field meta.lst entries.""" + locale_dir = tmp_path / "en" + locale_dir.mkdir() + meta = "\n".join( + f"des{i:03d}|||target text {i}|A warm {['female', 'male'][i % 2]} voice with neutral accent." 
for i in range(5) + ) + (locale_dir / "meta.lst").write_text(meta, encoding="utf-8") + return tmp_path + + +def test_seed_tts_design_dataset_has_instructions(seed_tts_design_root, mocker): + ds = SeedTTSDesignDataset( + dataset_path=str(seed_tts_design_root), + random_seed=0, + locale="en", + disable_shuffle=True, + ) + tokenizer = mocker.MagicMock() + tokenizer.encode = lambda text, **kw: [0] * len(text.split()) + requests = ds.sample(tokenizer, num_requests=3) + assert len(requests) == 3 + for req in requests: + assert isinstance(req, SeedTTSDesignSampleRequest) + extra = req.seed_tts_speech_extra or {} + assert "instructions" in extra + assert extra["instructions"], "instructions must be non-empty" + assert extra.get("task_type") == "VoiceDesign" + assert "ref_audio" not in extra + assert req.seed_tts_ref_wav_path == "" + + +def test_seed_tts_design_dataset_rejects_missing_description(seed_tts_design_root, mocker): + """Lines without a voice_description should be skipped.""" + locale_dir = seed_tts_design_root / "en" + (locale_dir / "meta.lst").write_text( + "bad001|||target text without description\nok001|||good target|A clear female voice.\n", + encoding="utf-8", + ) + ds = SeedTTSDesignDataset( + dataset_path=str(seed_tts_design_root), + random_seed=0, + locale="en", + disable_shuffle=True, + ) + tokenizer = mocker.MagicMock() + tokenizer.encode = lambda text, **kw: [0] * len(text.split()) + requests = ds.sample(tokenizer, num_requests=10) + assert len(requests) == 1 # only the valid row + + +def test_attach_sets_seed_tts_row_even_without_extra_body(): + """seed_tts_row=True must be set for SeedTTSTextSampleRequest (no extra body).""" + from vllm_omni.benchmarks.data_modules.seed_tts_dataset import SeedTTSTextSampleRequest + + req = SeedTTSTextSampleRequest( + prompt="hello world", + prompt_len=2, + expected_output_len=100, + multi_modal_data=None, + request_id="test-0", + seed_tts_speech_extra=None, + seed_tts_ref_wav_path="", + ) + assert 
req.seed_tts_speech_extra is None + assert req.seed_tts_ref_wav_path == "" + # The fix ensures that even with speech_extra=None, the function + # sets seed_tts_row=True. We verify the source code has the fix. + import inspect + + import vllm_omni.benchmarks.patch.patch as patch_mod + + src = inspect.getsource(patch_mod._attach_seed_tts_to_request_func_input) + # seed_tts_row must be set BEFORE the 'if not ex: return' check + row_pos = src.index("seed_tts_row") + not_ex_pos = src.index("if not ex:") + assert row_pos < not_ex_pos, "seed_tts_row must be set before 'if not ex: return'" diff --git a/tests/dfx/conftest.py b/tests/dfx/conftest.py index b8edeba9d5e..647195f6087 100644 --- a/tests/dfx/conftest.py +++ b/tests/dfx/conftest.py @@ -82,7 +82,11 @@ def create_test_parameter_mapping(configs: list[dict[str, Any]]) -> dict[str, di "test_name": test_name, "benchmark_params": [], } - mapping[test_name]["benchmark_params"].extend(config["benchmark_params"]) + for entry in config["benchmark_params"]: + # Skip disabled entries + if not entry.get("enabled", True): + continue + mapping[test_name]["benchmark_params"].append(entry) return mapping diff --git a/tests/dfx/perf/scripts/run_benchmark.py b/tests/dfx/perf/scripts/run_benchmark.py index 0de60c6a543..1f10ba99079 100644 --- a/tests/dfx/perf/scripts/run_benchmark.py +++ b/tests/dfx/perf/scripts/run_benchmark.py @@ -329,7 +329,7 @@ def to_list(value, default=None): raise ValueError("The number of prompts does not match the QPS or max_concurrency") args = ["--host", host, "--port", str(port)] - exclude_keys = {"request_rate", "baseline", "num_prompts", "max_concurrency"} + exclude_keys = {"request_rate", "baseline", "num_prompts", "max_concurrency", "task", "enabled", "eval_phase"} for key, value in params.items(): if key in exclude_keys or value is None: diff --git a/tests/dfx/perf/tests/test_runner_metadata.py b/tests/dfx/perf/tests/test_runner_metadata.py new file mode 100644 index 00000000000..1276a847069 --- 
/dev/null +++ b/tests/dfx/perf/tests/test_runner_metadata.py @@ -0,0 +1,79 @@ +"""Tests for DFX runner metadata field exclusion.""" + +import json + +import pytest + +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + + +def test_task_excluded_from_cli_args(): + """'task' field must not become --task CLI arg.""" + params = { + "task": "voice_clone", + "dataset_name": "seed-tts", + "backend": "openai-audio-speech", + "endpoint": "/v1/audio/speech", + "percentile-metrics": "audio_rtf,audio_ttfp", + "baseline": {"mean_audio_rtf": [0.5]}, + } + exclude_keys = {"request_rate", "baseline", "num_prompts", "max_concurrency", "task", "enabled", "eval_phase"} + args = [] + for key, value in params.items(): + if key in exclude_keys or value is None: + continue + arg_name = f"--{key.replace('_', '-')}" + if isinstance(value, bool) and value: + args.append(arg_name) + elif isinstance(value, dict): + args.extend([arg_name, json.dumps(value)]) + elif not isinstance(value, bool): + args.extend([arg_name, str(value)]) + assert "--task" not in args + assert "--enabled" not in args + assert "--dataset-name" in args + + +def test_enabled_false_entry_is_skipped(): + """benchmark_params entry with enabled=false should be skipped.""" + import sys + from pathlib import Path + + sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent)) + from tests.dfx.conftest import create_test_parameter_mapping + + configs = [ + { + "test_name": "test_model", + "server_params": {"model": "some/model"}, + "benchmark_params": [ + { + "task": "voice_clone", + "enabled": True, + "dataset_name": "seed-tts", + "backend": "openai-audio-speech", + "endpoint": "/v1/audio/speech", + "num_prompts": [10], + "max_concurrency": [1], + "percentile-metrics": "audio_rtf", + "baseline": {}, + }, + { + "task": "voice_design", + "enabled": False, + "dataset_name": "seed-tts-design", + "backend": "openai-audio-speech", + "endpoint": "/v1/audio/speech", + "num_prompts": [5], + "max_concurrency": [1], + 
"percentile-metrics": "audio_rtf", + "baseline": {}, + }, + ], + } + ] + mapping = create_test_parameter_mapping(configs) + params = mapping["test_model"]["benchmark_params"] + # Only the enabled=True entry should appear + assert len(params) == 1 + assert params[0].get("task") == "voice_clone" diff --git a/tests/dfx/perf/tests/test_tts.json b/tests/dfx/perf/tests/test_tts.json index 3583b45b4f2..06c9c4d2384 100644 --- a/tests/dfx/perf/tests/test_tts.json +++ b/tests/dfx/perf/tests/test_tts.json @@ -1,34 +1,155 @@ [ - { - "test_name": "test_qwen3_tts", - "server_params": { - "model": "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice" - }, - "benchmark_params": [ - { - "dataset_name": "random", - "backend": "openai-audio-speech", - "endpoint": "/v1/audio/speech", - "num_prompts": [ - 10, - 40 - ], - "max_concurrency": [ - 1, - 4 - ], - "random_input_len": 100, - "random_output_len": 100, - "extra_body": { - "voice": "Vivian", - "language": "English" - }, - "percentile-metrics": "ttft,e2el,audio_rtf,audio_ttfp,audio_duration", - "baseline": { - "mean_audio_ttfp_ms": [6000, 6000], - "mean_audio_rtf": [0.3, 0.3] - } - } - ] - } + { + "test_name": "test_qwen3_tts_base", + "server_params": { + "model": "Qwen/Qwen3-TTS-12Hz-1.7B-Base" + }, + "benchmark_params": [ + { + "task": "voice_clone", + "eval_phase": "latency", + "enabled": false, + "dataset_name": "seed-tts", + "backend": "openai-audio-speech", + "endpoint": "/v1/audio/speech", + "num_prompts": [20], + "max_concurrency": [1], + "seed_tts_locale": "en", + "percentile-metrics": "ttft,e2el,audio_rtf,audio_ttfp,audio_duration", + "baseline": { + "median_audio_ttfp_ms": [350], + "median_audio_rtf": [0.25] + } + }, + { + "task": "voice_clone", + "eval_phase": "throughput", + "enabled": false, + "dataset_name": "seed-tts", + "backend": "openai-audio-speech", + "endpoint": "/v1/audio/speech", + "num_prompts": [80], + "max_concurrency": [8], + "seed_tts_locale": "en", + "percentile-metrics": 
"ttft,e2el,audio_rtf,audio_ttfp,audio_duration", + "baseline": { + "median_audio_ttfp_ms": [3500], + "median_audio_rtf": [0.75], + "audio_throughput": [10.0] + } + }, + { + "task": "voice_clone", + "eval_phase": "quality", + "enabled": false, + "dataset_name": "seed-tts", + "backend": "openai-audio-speech", + "endpoint": "/v1/audio/speech", + "num_prompts": [200], + "max_concurrency": [4], + "seed_tts_locale": "en", + "seed_tts_wer_eval": true, + "percentile-metrics": "ttft,e2el,audio_rtf,audio_ttfp,audio_duration", + "baseline": { + "mean_audio_rtf": [0.45] + } + } + ] + }, + { + "test_name": "test_qwen3_tts_customvoice", + "server_params": { + "model": "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice" + }, + "benchmark_params": [ + { + "task": "default_voice", + "eval_phase": "latency", + "dataset_name": "seed-tts-text", + "backend": "openai-audio-speech", + "endpoint": "/v1/audio/speech", + "dataset_path": "benchmarks/build_dataset/seed_tts_smoke", + "num_prompts": [20], + "max_concurrency": [1], + "seed_tts_locale": "en", + "extra_body": {"voice": "Vivian", "language": "English", "task_type": "CustomVoice"}, + "percentile-metrics": "ttft,e2el,audio_rtf,audio_ttfp,audio_duration", + "baseline": { + "median_audio_ttfp_ms": [150], + "median_audio_rtf": [0.15] + } + }, + { + "task": "default_voice", + "eval_phase": "throughput", + "dataset_name": "seed-tts-text", + "backend": "openai-audio-speech", + "endpoint": "/v1/audio/speech", + "dataset_path": "benchmarks/build_dataset/seed_tts_smoke", + "num_prompts": [80], + "max_concurrency": [8], + "seed_tts_locale": "en", + "extra_body": {"voice": "Vivian", "language": "English", "task_type": "CustomVoice"}, + "percentile-metrics": "ttft,e2el,audio_rtf,audio_ttfp,audio_duration", + "baseline": { + "median_audio_ttfp_ms": [1500], + "median_audio_rtf": [0.30], + "audio_throughput": [30.0] + } + }, + { + "task": "default_voice", + "eval_phase": "quality", + "enabled": false, + "dataset_name": "seed-tts-text", + "backend": 
"openai-audio-speech", + "endpoint": "/v1/audio/speech", + "dataset_path": "benchmarks/build_dataset/seed_tts_smoke", + "num_prompts": [200], + "max_concurrency": [4], + "seed_tts_locale": "en", + "extra_body": {"voice": "Vivian", "language": "English", "task_type": "CustomVoice"}, + "seed_tts_wer_eval": true, + "percentile-metrics": "ttft,e2el,audio_rtf,audio_ttfp,audio_duration", + "baseline": { + "mean_audio_rtf": [0.35] + } + }, + { + "task": "voice_design", + "eval_phase": "latency", + "dataset_name": "seed-tts-design", + "backend": "openai-audio-speech", + "endpoint": "/v1/audio/speech", + "dataset_path": "benchmarks/build_dataset/seed_tts_design", + "num_prompts": [20], + "max_concurrency": [1], + "seed_tts_locale": "en", + "extra_body": {"task_type": "VoiceDesign", "language": "English"}, + "percentile-metrics": "ttft,e2el,audio_rtf,audio_ttfp,audio_duration", + "baseline": { + "median_audio_ttfp_ms": [150], + "median_audio_rtf": [0.15] + } + }, + { + "task": "voice_design", + "eval_phase": "throughput", + "dataset_name": "seed-tts-design", + "backend": "openai-audio-speech", + "endpoint": "/v1/audio/speech", + "dataset_path": "benchmarks/build_dataset/seed_tts_design", + "num_prompts": [80], + "max_concurrency": [8], + "seed_tts_locale": "en", + "extra_body": {"task_type": "VoiceDesign", "language": "English"}, + "percentile-metrics": "ttft,e2el,audio_rtf,audio_ttfp,audio_duration", + "baseline": { + "median_audio_ttfp_ms": [1500], + "median_audio_rtf": [0.35], + "audio_throughput": [25.0] + } + } + ] + } ] diff --git a/vllm_omni/benchmarks/data_modules/seed_tts_dataset.py b/vllm_omni/benchmarks/data_modules/seed_tts_dataset.py index ca6de4cb202..f548a3cb16e 100644 --- a/vllm_omni/benchmarks/data_modules/seed_tts_dataset.py +++ b/vllm_omni/benchmarks/data_modules/seed_tts_dataset.py @@ -252,6 +252,146 @@ def sample( return out +@dataclass +class _SeedTTSDesignRow: + utterance_id: str + target_text: str + voice_description: str + + +def 
_parse_design_meta_line(line: str) -> _SeedTTSDesignRow | None: + """Parse a 5-field design meta.lst line. + + Format: ``utt_id|ref_text|wav_rel|target_text|voice_description`` + + Returns None (with a warning) if the line has fewer than 5 fields or if + voice_description is empty. + """ + line = line.strip() + if not line or line.startswith("#"): + return None + parts = line.split("|") + if len(parts) < 5: + logger.warning("Skipping malformed design meta.lst line (need 5 '|'-fields): %r", line[:120]) + return None + utt_id = parts[0].strip() + target_text = parts[3].strip() + voice_description = parts[4].strip() + if not voice_description: + logger.warning("Skipping design meta.lst line with empty voice_description: %r", line[:120]) + return None + return _SeedTTSDesignRow( + utterance_id=utt_id, + target_text=target_text, + voice_description=voice_description, + ) + + +@dataclass +class SeedTTSDesignSampleRequest(SeedTTSSampleRequest): + """SampleRequest for voice-design TTS (no ref_audio; voice described via natural language). + + The ``seed_tts_speech_extra`` dict carries ``instructions`` (natural-language + voice description, forwarded as-is to the Qwen3-TTS VoiceDesign endpoint) and + ``task_type="VoiceDesign"`` instead of ``ref_audio`` / ``ref_text``. + SIM is skipped (``seed_tts_ref_wav_path`` is empty). + """ + + +class SeedTTSDesignDataset(SeedTTSDataset): + """Seed-TTS prompts for voice-design benchmarking (dataset name: ``seed-tts-design``). + + Loads a 5-field ``meta.lst``:: + + utt_id|ref_text|wav_rel|target_text|voice_description + + and builds requests with ``task_type="VoiceDesign"`` and the natural-language + ``voice_description`` column forwarded via the ``instructions`` field + (the Qwen3-TTS VoiceDesign endpoint's expected key) instead of + ``ref_audio`` / ``ref_text``. Speaker-similarity (SIM) is not computed. + """ + + def load_data(self) -> None: + # Does NOT call super().load_data() — the format is different (5 fields, + # no wav file). 
self._rows is intentionally left empty; the parent + # sample() is fully overridden so an empty self._rows is safe. + meta = self._root / self.locale / "meta.lst" + if not meta.is_file(): + raise FileNotFoundError( + f"Seed-TTS-Design meta not found: {meta}. Expected layout: {self._root}/{self.locale}/meta.lst" + ) + text = meta.read_text(encoding="utf-8") + design_rows: list[_SeedTTSDesignRow] = [] + for line in text.splitlines(): + r = _parse_design_meta_line(line) + if r is not None: + design_rows.append(r) + if not design_rows: + raise ValueError(f"No valid rows in {meta}") + if not self.disable_shuffle: + rng = random.Random(self.random_seed) + rng.shuffle(design_rows) + self._design_rows = design_rows + # Keep self._rows empty — parent sample() is overridden. + self._rows = [] + self.data = self._design_rows + logger.info( + "Loaded Seed-TTS-Design: root=%s locale=%s rows=%d", + self._root, + self.locale, + len(self._design_rows), + ) + + def sample( + self, + tokenizer: TokenizerLike, + num_requests: int, + output_len: int | None = None, + request_id_prefix: str = "", + no_oversample: bool = False, + **kwargs: Any, + ) -> list[SampleRequest]: + if output_len is None: + output_len = self.DEFAULT_OUTPUT_LEN + + tok = get_cached_tokenizer(tokenizer) + lang = "English" if self.locale == "en" else "Chinese" + out: list[SampleRequest] = [] + for i, row in enumerate(self._design_rows): + if len(out) >= num_requests: + break + target = row.target_text + prompt_len = len(tok.encode(target)) + speech_extra: dict[str, Any] = { + "instructions": row.voice_description, + "task_type": "VoiceDesign", + "language": lang, + "max_new_tokens": output_len, + } + out.append( + SeedTTSDesignSampleRequest( + prompt=target, + prompt_len=prompt_len, + expected_output_len=output_len, + multi_modal_data=None, + request_id=f"{request_id_prefix}{i}", + seed_tts_speech_extra=speech_extra, + seed_tts_utterance_id=row.utterance_id, + seed_tts_locale=self.locale, + 
seed_tts_system_prompt=self._system_prompt, + seed_tts_ref_wav_path="", # SIM skipped for voice-design + ) + ) + + logger.info( + "Seed-TTS-Design: built %d requests (asked %d) — no ref_audio (voice design)", + len(out), + num_requests, + ) + self.maybe_oversample_requests(out, num_requests, request_id_prefix, no_oversample) + return out + + def load_seed_tts_dataset( dataset_path: str, random_seed: int = 0, @@ -270,3 +410,65 @@ def load_seed_tts_dataset( system_prompt=system_prompt, **kwargs, ) + + +@dataclass +class SeedTTSTextSampleRequest(SeedTTSSampleRequest): + """SampleRequest for default-voice TTS (no ref_audio, no ref_text). + + The voice param (e.g. ``voice: "Vivian"``) is supplied at request time via + ``--extra-body`` in the benchmark config. SIM is skipped (empty ref_wav_path). + WER and UTMOS are computed normally. + """ + + +class SeedTTSTextDataset(SeedTTSDataset): + """Seed-TTS prompts for default-voice benchmarking (dataset name: ``seed-tts-text``). + + Loads the same ``meta.lst`` as :class:`SeedTTSDataset` but builds requests + WITHOUT ``ref_audio`` / ``ref_text`` body fields. The named voice must be + supplied via ``--extra-body`` in the benchmark config. + Speaker-similarity (SIM) is not computed. 
+ """ + + def sample( + self, + tokenizer: TokenizerLike, + num_requests: int, + output_len: int | None = None, + request_id_prefix: str = "", + no_oversample: bool = False, + **kwargs: Any, + ) -> list[SampleRequest]: + if output_len is None: + output_len = self.DEFAULT_OUTPUT_LEN + + tok = get_cached_tokenizer(tokenizer) + out: list[SampleRequest] = [] + for i, row in enumerate(self._rows): + if len(out) >= num_requests: + break + target = row.target_text + prompt_len = len(tok.encode(target)) + out.append( + SeedTTSTextSampleRequest( + prompt=target, + prompt_len=prompt_len, + expected_output_len=output_len, + multi_modal_data=None, + request_id=f"{request_id_prefix}{i}", + seed_tts_speech_extra=None, # voice supplied via --extra-body in config + seed_tts_utterance_id=row.utterance_id, + seed_tts_locale=self.locale, + seed_tts_system_prompt=self._system_prompt, + seed_tts_ref_wav_path="", # empty → SIM skipped in seed_tts_eval + ) + ) + + logger.info( + "Seed-TTS-Text: built %d requests (asked %d) — no ref_audio (default voice)", + len(out), + num_requests, + ) + self.maybe_oversample_requests(out, num_requests, request_id_prefix, no_oversample) + return out diff --git a/vllm_omni/benchmarks/patch/patch.py b/vllm_omni/benchmarks/patch/patch.py index 41aed094235..bda75ef624d 100644 --- a/vllm_omni/benchmarks/patch/patch.py +++ b/vllm_omni/benchmarks/patch/patch.py @@ -40,7 +40,9 @@ from vllm_omni.benchmarks.data_modules.seed_tts_dataset import ( SEED_TTS_DEFAULT_OMNI_SYSTEM_PROMPT, SeedTTSDataset, + SeedTTSDesignDataset, SeedTTSSampleRequest, + SeedTTSTextDataset, ) get_samples_old = datasets.get_samples @@ -85,23 +87,16 @@ def _attach_daily_omni_to_request_func_input(sample: SampleRequest, rfi: Request def _attach_seed_tts_to_request_func_input(sample: SampleRequest, rfi: RequestFuncInput) -> None: - """Merge Seed-TTS per-row TTS fields (ref_audio, ref_text, task_type, …) into ``extra_body``. 
+ """Merge Seed-TTS per-row TTS fields into ``extra_body`` and mark for PCM capture. - Used by both ``/v1/audio/speech`` and ``/v1/chat/completions`` (flattened into JSON body). - For ``openai-chat-omni``, also sets ``omni_chat_messages`` (system + user) so Qwen3-Omni - follows the same role layout as official TTS / multimodal demos. ``/v1/audio/speech`` ignores - ``messages`` and only uses ``input`` + body fields. - Flags ``openai-chat-omni`` to request audio output and optionally export PCM for WER. + Always sets ``seed_tts_row=True`` on the RequestFuncInput for any + :class:`SeedTTSSampleRequest` subclass (including text-only and design + variants that carry no ``ref_audio``). This enables PCM capture for WER / + UTMOS evaluation even when there is no reference audio. """ if not isinstance(sample, SeedTTSSampleRequest): return - ex = sample.seed_tts_speech_extra - if not ex: - return - base = dict(rfi.extra_body) if rfi.extra_body else {} - base.update(ex) - rfi.extra_body = base - # Used by request funcs to force streaming TTS behavior and to export PCM when WER is on. + # Mark for PCM capture (WER / UTMOS eval) regardless of extra body presence. 
setattr(rfi, "seed_tts_row", True) sys_prompt = (sample.seed_tts_system_prompt or "").strip() or SEED_TTS_DEFAULT_OMNI_SYSTEM_PROMPT setattr( @@ -112,6 +107,12 @@ def _attach_seed_tts_to_request_func_input(sample: SampleRequest, rfi: RequestFu {"role": "user", "content": [{"type": "text", "text": sample.prompt}]}, ], ) + ex = sample.seed_tts_speech_extra + if not ex: + return # voice comes from --extra-body in config; no ref_audio to merge + base = dict(rfi.extra_body) if rfi.extra_body else {} + base.update(ex) + rfi.extra_body = base def _daily_omni_repo_from_args(args) -> str | None: @@ -136,7 +137,7 @@ def get_samples(args, tokenizer): is_daily_omni = args.dataset_name == "daily-omni" or ( args.dataset_name == "hf" and _daily_omni_repo_from_args(args) is not None ) - is_seed_tts = args.dataset_name == "seed-tts" + is_seed_tts = args.dataset_name in ("seed-tts", "seed-tts-text", "seed-tts-design") # Check if we need to handle omni-related backends/datasets is_omni_backend = args.backend in ["openai-chat-omni", "openai-audio-speech", "daily-omni"] @@ -249,7 +250,13 @@ def get_samples(args, tokenizer): "--hf-name for the Hub dataset id." 
) - dataset = SeedTTSDataset( + _cls_map = { + "seed-tts": SeedTTSDataset, + "seed-tts-text": SeedTTSTextDataset, + "seed-tts-design": SeedTTSDesignDataset, + } + DatasetCls = _cls_map[args.dataset_name] + dataset = DatasetCls( dataset_path=repo_id, random_seed=args.seed, locale=getattr(args, "seed_tts_locale", "en"), diff --git a/vllm_omni/entrypoints/cli/benchmark/serve.py b/vllm_omni/entrypoints/cli/benchmark/serve.py index d281432e59b..79d7fbf4ba4 100644 --- a/vllm_omni/entrypoints/cli/benchmark/serve.py +++ b/vllm_omni/entrypoints/cli/benchmark/serve.py @@ -145,7 +145,9 @@ def add_cli_args(cls, parser: argparse.ArgumentParser) -> None: for action in parser._actions: if action.dest == "dataset_name" and action.choices is not None: - extra = [c for c in ("daily-omni", "seed-tts") if c not in action.choices] + extra = [ + c for c in ("daily-omni", "seed-tts", "seed-tts-text", "seed-tts-design") if c not in action.choices + ] if extra: action.choices = list(action.choices) + extra