Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 22 additions & 9 deletions tests/perf/scripts/run_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,8 +107,14 @@ def omni_server(request):
print("OmniServer stopped")


def run_benchmark(args: list, test_name: str, flow, dataset_name: str, num_prompt) -> Any:
"""Generate synthetic image with random values."""
def run_benchmark(
args: list,
test_name: str,
flow,
dataset_name: str,
num_prompt,
) -> Any:
"""Run a single benchmark iteration and return the parsed result JSON."""
current_dt = datetime.now().strftime("%Y%m%d-%H%M%S")
result_filename = f"result_{test_name}_{dataset_name}_{flow}_{num_prompt}_{current_dt}.json"
if "--result-filename" in args:
Expand All @@ -117,10 +123,6 @@ def run_benchmark(args: list, test_name: str, flow, dataset_name: str, num_promp
["vllm", "bench", "serve", "--omni"]
+ args
+ [
"--backend",
"openai-chat-omni",
"--endpoint",
"/v1/chat/completions",
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please check whether removing the hard-coded `--backend` / `--endpoint` arguments here introduces any risk for the omni-series tests.

"--save-result",
"--result-dir",
os.environ.get("BENCHMARK_DIR", "tests"),
Expand Down Expand Up @@ -196,7 +198,10 @@ def benchmark_params(request, omni_server):
total = len(all_params)
print(f"\n Running benchmark {current}/{total} for {test_name}")

return {"test_name": test_name, "params": all_params[param_index]}
return {
"test_name": test_name,
"params": all_params[param_index],
}


def assert_result(result, params, num_prompt):
Expand Down Expand Up @@ -266,14 +271,22 @@ def to_list(value, default=None):
for qps, num_prompt in zip(qps_list, num_prompt_list):
args = args + ["--request-rate", str(qps), "--num-prompts", str(num_prompt)]
result = run_benchmark(
args=args, test_name=test_name, flow=qps, dataset_name=dataset_name, num_prompt=num_prompt
args=args,
test_name=test_name,
flow=qps,
dataset_name=dataset_name,
num_prompt=num_prompt,
)
assert_result(result, params, num_prompt=num_prompt)

# concurrency test
for concurrency, num_prompt in zip(max_concurrency_list, num_prompt_list):
args = args + ["--max-concurrency", str(concurrency), "--num-prompts", str(num_prompt), "--request-rate", "inf"]
result = run_benchmark(
args=args, test_name=test_name, flow=concurrency, dataset_name=dataset_name, num_prompt=num_prompt
args=args,
test_name=test_name,
flow=concurrency,
dataset_name=dataset_name,
num_prompt=num_prompt,
)
assert_result(result, params, num_prompt=num_prompt)
99 changes: 99 additions & 0 deletions tests/perf/stage_configs/qwen3_tts.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
# Stage config for running Qwen3-TTS with 2-stage architecture
# Stage 0: Talker (text -> 8-layer RVQ codec codes)
# Stage 1: Code2Wav (codec codes -> audio waveform)
#
# The following config has been verified on 1x H100-80G GPU.
async_chunk: true
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we use async_chunk by default? Do we need to rename the YAML file?

# Per-stage settings. Two stages are declared below (stage_id 0 and 1).
stage_args:
# Stage 0: autoregressive worker (worker_type: ar) that emits latents
# (engine_output_type: latent) rather than detokenized text (detokenize: false).
- stage_id: 0
stage_type: llm
runtime:
# Same device string as stage 1; the two stages' gpu_memory_utilization
# values below are 0.3 and 0.2 respectively.
devices: "0"
max_batch_size: 4
engine_args:
model_stage: qwen3_tts
model_arch: Qwen3TTSTalkerForConditionalGeneration
hf_overrides:
architectures: [Qwen3TTSTalkerForConditionalGeneration]
worker_type: ar
scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
enforce_eager: false
trust_remote_code: true
async_scheduling: false
enable_prefix_caching: false
engine_output_type: latent
gpu_memory_utilization: 0.3
distributed_executor_backend: "mp"
max_num_batched_tokens: 512
max_model_len: 4096
# NOTE(review): the function name suggests this is the async-chunk hand-off
# from the talker to code2wav — confirm against the referenced module.
custom_process_next_stage_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_tts.talker2code2wav_async_chunk
output_connectors:
# Refers to the connector defined under runtime.connectors below.
to_stage_1: connector_of_shared_memory
default_sampling_params:
temperature: 0.9
top_k: 50
max_tokens: 4096
# Fixed seed (same value as stage 1).
seed: 42
detokenize: false
repetition_penalty: 1.05
stop_token_ids: [2150]

# Stage 1: generation worker (worker_type: generation) that takes stage 0 as
# its input source and produces the final audio output.
- stage_id: 1
stage_type: llm
runtime:
devices: "0"
max_batch_size: 4
engine_args:
model_stage: code2wav
model_arch: Qwen3TTSCode2Wav
hf_overrides:
architectures: [Qwen3TTSCode2Wav]
worker_type: generation
scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler
enforce_eager: true
trust_remote_code: true
async_scheduling: false
enable_prefix_caching: false
engine_output_type: audio
gpu_memory_utilization: 0.2
distributed_executor_backend: "mp"
max_num_batched_tokens: 8192
max_model_len: 32768
# Input comes from stage 0; this stage's output is the final (audio) result.
engine_input_source: [0]
final_output: true
final_output_type: audio
input_connectors:
# Same connector name that stage 0 lists under output_connectors.
from_stage_0: connector_of_shared_memory
tts_args:
max_instructions_length: 500
default_sampling_params:
temperature: 0.0
top_p: 1.0
top_k: -1
max_tokens: 65536
seed: 42
detokenize: true
repetition_penalty: 1.0

# Pipeline-level runtime settings: defaults, connector definitions, and edges.
runtime:
enabled: true
defaults:
window_size: -1
max_inflight: 4

connectors:
# Connector referenced by both stages above (to_stage_1 / from_stage_0).
connector_of_shared_memory:
name: SharedMemoryConnector
extra:
# NOTE(review): units of the wait/sleep values below are not stated in this
# file (the "_s" suffix suggests seconds for the sleep) — confirm.
shm_threshold_bytes: 65536
codec_streaming: true
connector_get_sleep_s: 0.01
connector_get_max_wait_first_chunk: 3000
connector_get_max_wait: 300
codec_chunk_frames: 25
codec_left_context_frames: 25

# Single edge: stage 0 -> stage 1.
edges:
- from: 0
to: 1
window_size: -1
41 changes: 41 additions & 0 deletions tests/perf/tests/test.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
"benchmark_params": [
{
"dataset_name": "random",
"backend": "openai-chat-omni",
"endpoint": "/v1/chat/completions",
"num_prompts": [
10,
40
Expand All @@ -28,6 +30,8 @@
},
{
"dataset_name": "random-mm",
"backend": "openai-chat-omni",
"endpoint": "/v1/chat/completions",
"num_prompts": [
10,
40
Expand Down Expand Up @@ -88,6 +92,8 @@
"benchmark_params": [
{
"dataset_name": "random",
"backend": "openai-chat-omni",
"endpoint": "/v1/chat/completions",
"num_prompts": [
10,
40
Expand All @@ -108,6 +114,8 @@
},
{
"dataset_name": "random-mm",
"backend": "openai-chat-omni",
"endpoint": "/v1/chat/completions",
"num_prompts": [
10,
40
Expand Down Expand Up @@ -140,5 +148,38 @@
}
}
]
},
{
"test_name": "test_qwen3_tts",
"server_params": {
"model": "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice",
"stage_config_name": "qwen3_tts.yaml"
},
"benchmark_params": [
{
"dataset_name": "random",
"backend": "openai-audio-speech",
"endpoint": "/v1/audio/speech",
"num_prompts": [
10,
40
],
"max_concurrency": [
1,
4
],
"random_input_len": 100,
"random_output_len": 100,
"extra_body": {
"voice": "Vivian",
"language": "English"
},
"percentile-metrics": "ttft,e2el,audio_rtf,audio_ttfp,audio_duration",
"baseline": {
"mean_audio_ttfp_ms": 100000,
"mean_audio_rtf": 100000
}
}
]
}
]
69 changes: 31 additions & 38 deletions vllm_omni/benchmarks/patch/patch.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,27 +208,23 @@ async def async_request_openai_chat_omni_completions(
async def async_request_openai_audio_speech(
request_func_input: RequestFuncInput, session: aiohttp.ClientSession, pbar: tqdm | None = None
) -> MixRequestFuncOutput:
"""Non-streaming request to /v1/audio/speech endpoint.
"""Streaming request to /v1/audio/speech endpoint.

The endpoint returns raw audio bytes (e.g. WAV). Pass voice, instructions,
and other TTS-specific fields via ``extra_body``.
Sends ``stream=true`` with ``response_format=pcm`` so the server returns
raw PCM chunks as they are decoded. This allows measuring TTFP (time to
first audio packet) separately from E2EL.
"""
api_url = request_func_input.api_url
_validate_api_url(api_url, "OpenAI Audio Speech API", "audio/speech")

payload = {
"model": request_func_input.model_name if request_func_input.model_name else request_func_input.model,
"input": request_func_input.prompt,
"stream": True,
"response_format": "pcm",
}
_update_payload_common(payload, request_func_input)

response_format = payload.get("response_format", "wav")
if response_format == "pcm":
raise ValueError(
"pcm response format is not supported yet. \
Please use other formats like wav, mp3, etc. instead."
)

headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
Expand All @@ -238,41 +234,38 @@ async def async_request_openai_audio_speech(
output = MixRequestFuncOutput()
output.prompt_len = request_func_input.prompt_len

# PCM format: 16-bit signed, 24 kHz, mono
sample_rate = 24000
sample_width = 2 # 16-bit = 2 bytes
channels = 1

st = time.perf_counter()
output.start_time = st
total_pcm_bytes = 0
try:
async with session.post(url=api_url, json=payload, headers=headers) as response:
if response.status == 200:
audio_bytes = await response.read()
async for chunk in response.content.iter_any():
if not chunk:
continue
timestamp = time.perf_counter()
if output.audio_ttfp == 0.0:
output.audio_ttfp = timestamp - st
output.ttft = output.audio_ttfp
total_pcm_bytes += len(chunk)

end_time = time.perf_counter()
output.latency = end_time - st
# ttft = latency since this is a non-streaming request
# hence there is no distinction between first and last token/audio
output.ttft = output.latency
output.audio_ttfp = output.latency

try:
audio_segment = AudioSegment.from_file(io.BytesIO(audio_bytes))
output.audio_duration = len(audio_segment) / 1000.0
frame_width = audio_segment.frame_width
if frame_width > 0:
output.audio_frames = len(audio_segment.raw_data) // frame_width
else:
output.audio_frames = 0
logger.warning("Audio frame width is zero")
if output.audio_duration > 0:
# rtf = audio_generate_time / audio_duration and
# audio_generate_time = latency since this is a non-streaming request
# so the time to receive last portion of audio is the latency
output.audio_rtf = output.latency / output.audio_duration
else:
output.audio_rtf = 0
logger.warning("Audio duration is zero")
output.success = True
except Exception as e:
output.success = False
output.error = f"Failed to parse audio response: {e}"
logger.error(f"ERROR: Failed to parse audio response: {e}")

total_samples = total_pcm_bytes // (sample_width * channels)
output.audio_duration = total_samples / sample_rate
output.audio_frames = total_samples
if output.audio_duration > 0:
output.audio_rtf = output.latency / output.audio_duration
else:
output.audio_rtf = 0
logger.warning("Audio duration is zero")
output.success = True
else:
output.error = response.reason or ""
output.success = False
Expand Down
Loading