diff --git a/tests/perf/scripts/run_benchmark.py b/tests/perf/scripts/run_benchmark.py index eb2ac535900..ed2fab83e3d 100644 --- a/tests/perf/scripts/run_benchmark.py +++ b/tests/perf/scripts/run_benchmark.py @@ -107,8 +107,14 @@ def omni_server(request): print("OmniServer stopped") -def run_benchmark(args: list, test_name: str, flow, dataset_name: str, num_prompt) -> Any: - """Generate synthetic image with random values.""" +def run_benchmark( + args: list, + test_name: str, + flow, + dataset_name: str, + num_prompt, +) -> Any: + """Run a single benchmark iteration and return the parsed result JSON.""" current_dt = datetime.now().strftime("%Y%m%d-%H%M%S") result_filename = f"result_{test_name}_{dataset_name}_{flow}_{num_prompt}_{current_dt}.json" if "--result-filename" in args: @@ -117,10 +123,6 @@ def run_benchmark(args: list, test_name: str, flow, dataset_name: str, num_promp ["vllm", "bench", "serve", "--omni"] + args + [ - "--backend", - "openai-chat-omni", - "--endpoint", - "/v1/chat/completions", "--save-result", "--result-dir", os.environ.get("BENCHMARK_DIR", "tests"), @@ -196,7 +198,10 @@ def benchmark_params(request, omni_server): total = len(all_params) print(f"\n Running benchmark {current}/{total} for {test_name}") - return {"test_name": test_name, "params": all_params[param_index]} + return { + "test_name": test_name, + "params": all_params[param_index], + } def assert_result(result, params, num_prompt): @@ -266,7 +271,11 @@ def to_list(value, default=None): for qps, num_prompt in zip(qps_list, num_prompt_list): args = args + ["--request-rate", str(qps), "--num-prompts", str(num_prompt)] result = run_benchmark( - args=args, test_name=test_name, flow=qps, dataset_name=dataset_name, num_prompt=num_prompt + args=args, + test_name=test_name, + flow=qps, + dataset_name=dataset_name, + num_prompt=num_prompt, ) assert_result(result, params, num_prompt=num_prompt) @@ -274,6 +283,10 @@ def to_list(value, default=None): for concurrency, num_prompt in zip(max_concurrency_list, num_prompt_list): args = args + ["--max-concurrency", str(concurrency), "--num-prompts", str(num_prompt), "--request-rate", "inf"] result = run_benchmark( - args=args, test_name=test_name, flow=concurrency, dataset_name=dataset_name, num_prompt=num_prompt + args=args, + test_name=test_name, + flow=concurrency, + dataset_name=dataset_name, + num_prompt=num_prompt, ) assert_result(result, params, num_prompt=num_prompt) diff --git a/tests/perf/stage_configs/qwen3_tts.yaml b/tests/perf/stage_configs/qwen3_tts.yaml new file mode 100644 index 00000000000..4ba4e6e83e8 --- /dev/null +++ b/tests/perf/stage_configs/qwen3_tts.yaml @@ -0,0 +1,99 @@ +# Stage config for running Qwen3-TTS with 2-stage architecture +# Stage 0: Talker (text -> 8-layer RVQ codec codes) +# Stage 1: Code2Wav (codec codes -> audio waveform) +# +# The following config has been verified on 1x H100-80G GPU. +async_chunk: true +stage_args: + - stage_id: 0 + stage_type: llm + runtime: + devices: "0" + max_batch_size: 4 + engine_args: + model_stage: qwen3_tts + model_arch: Qwen3TTSTalkerForConditionalGeneration + hf_overrides: + architectures: [Qwen3TTSTalkerForConditionalGeneration] + worker_type: ar + scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler + enforce_eager: false + trust_remote_code: true + async_scheduling: false + enable_prefix_caching: false + engine_output_type: latent + gpu_memory_utilization: 0.3 + distributed_executor_backend: "mp" + max_num_batched_tokens: 512 + max_model_len: 4096 + custom_process_next_stage_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_tts.talker2code2wav_async_chunk + output_connectors: + to_stage_1: connector_of_shared_memory + default_sampling_params: + temperature: 0.9 + top_k: 50 + max_tokens: 4096 + seed: 42 + detokenize: false + repetition_penalty: 1.05 + stop_token_ids: [2150] + + - stage_id: 1 + stage_type: llm + runtime: + devices: "0" + max_batch_size: 4 + engine_args: + model_stage: code2wav + model_arch: Qwen3TTSCode2Wav + hf_overrides: + architectures: [Qwen3TTSCode2Wav] + worker_type: generation + scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler + enforce_eager: true + trust_remote_code: true + async_scheduling: false + enable_prefix_caching: false + engine_output_type: audio + gpu_memory_utilization: 0.2 + distributed_executor_backend: "mp" + max_num_batched_tokens: 8192 + max_model_len: 32768 + engine_input_source: [0] + final_output: true + final_output_type: audio + input_connectors: + from_stage_0: connector_of_shared_memory + tts_args: + max_instructions_length: 500 + default_sampling_params: + temperature: 0.0 + top_p: 1.0 + top_k: -1 + max_tokens: 65536 + seed: 42 + detokenize: true + repetition_penalty: 1.0 + +runtime: + enabled: true + defaults: + window_size: -1 + max_inflight: 4 + + connectors: + connector_of_shared_memory: + name: SharedMemoryConnector + extra: + shm_threshold_bytes: 65536 + codec_streaming: true + connector_get_sleep_s: 0.01 + connector_get_max_wait_first_chunk: 3000 + connector_get_max_wait: 300 + codec_chunk_frames: 25 + codec_left_context_frames: 25 + + edges: + - from: 0 + to: 1 + window_size: -1 diff --git a/tests/perf/tests/test.json b/tests/perf/tests/test.json index 88041c1195d..65ef6588d9e 100644 --- a/tests/perf/tests/test.json +++ b/tests/perf/tests/test.json @@ -8,6 +8,8 @@ "benchmark_params": [ { "dataset_name": "random", + "backend": "openai-chat-omni", + "endpoint": "/v1/chat/completions", "num_prompts": [ 10, 40 @@ -28,6 +30,8 @@ }, { "dataset_name": "random-mm", + "backend": "openai-chat-omni", + "endpoint": "/v1/chat/completions", "num_prompts": [ 10, 40 @@ -88,6 +92,8 @@ "benchmark_params": [ { "dataset_name": "random", + "backend": "openai-chat-omni", + "endpoint": "/v1/chat/completions", "num_prompts": [ 10, 40 @@ -108,6 +114,8 @@ }, { "dataset_name": "random-mm", + "backend": "openai-chat-omni", + "endpoint": "/v1/chat/completions", "num_prompts": [ 10, 40 @@ -140,5 +148,38 @@ } } ] + }, + { + "test_name": "test_qwen3_tts", + "server_params": { + "model": "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice", + "stage_config_name": "qwen3_tts.yaml" + }, + "benchmark_params": [ + { + "dataset_name": "random", + "backend": "openai-audio-speech", + "endpoint": "/v1/audio/speech", + "num_prompts": [ + 10, + 40 + ], + "max_concurrency": [ + 1, + 4 + ], + "random_input_len": 100, + "random_output_len": 100, + "extra_body": { + "voice": "Vivian", + "language": "English" + }, + "percentile-metrics": "ttft,e2el,audio_rtf,audio_ttfp,audio_duration", + "baseline": { + "mean_audio_ttfp_ms": 100000, + "mean_audio_rtf": 100000 + } + } + ] } ] diff --git a/vllm_omni/benchmarks/patch/patch.py b/vllm_omni/benchmarks/patch/patch.py index 78792e81c22..84bcfa93a70 100644 --- a/vllm_omni/benchmarks/patch/patch.py +++ b/vllm_omni/benchmarks/patch/patch.py @@ -208,10 +208,11 @@ async def async_request_openai_chat_omni_completions( async def async_request_openai_audio_speech( request_func_input: RequestFuncInput, session: aiohttp.ClientSession, pbar: tqdm | None = None ) -> MixRequestFuncOutput: - """Non-streaming request to /v1/audio/speech endpoint. + """Streaming request to /v1/audio/speech endpoint. - The endpoint returns raw audio bytes (e.g. WAV). Pass voice, instructions, - and other TTS-specific fields via ``extra_body``. + Sends ``stream=true`` with ``response_format=pcm`` so the server returns + raw PCM chunks as they are decoded. This allows measuring TTFP (time to + first audio packet) separately from E2EL. """ api_url = request_func_input.api_url _validate_api_url(api_url, "OpenAI Audio Speech API", "audio/speech") @@ -219,16 +220,11 @@ async def async_request_openai_audio_speech( payload = { "model": request_func_input.model_name if request_func_input.model_name else request_func_input.model, "input": request_func_input.prompt, + "stream": True, + "response_format": "pcm", } _update_payload_common(payload, request_func_input) - response_format = payload.get("response_format", "wav") - if response_format == "pcm": - raise ValueError( - "pcm response format is not supported yet. \ - Please use other formats like wav, mp3, etc. instead." - ) - headers = { "Content-Type": "application/json", "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", @@ -238,41 +234,38 @@ async def async_request_openai_audio_speech( output = MixRequestFuncOutput() output.prompt_len = request_func_input.prompt_len + # PCM format: 16-bit signed, 24 kHz, mono + sample_rate = 24000 + sample_width = 2 # 16-bit = 2 bytes + channels = 1 + st = time.perf_counter() output.start_time = st + total_pcm_bytes = 0 try: async with session.post(url=api_url, json=payload, headers=headers) as response: if response.status == 200: - audio_bytes = await response.read() + async for chunk in response.content.iter_any(): + if not chunk: + continue + timestamp = time.perf_counter() + if output.audio_ttfp == 0.0: + output.audio_ttfp = timestamp - st + output.ttft = output.audio_ttfp + total_pcm_bytes += len(chunk) + end_time = time.perf_counter() output.latency = end_time - st - # ttft = latency since this is a non-streaming request - # hence there is no distinction between first and last token/audio - output.ttft = output.latency - output.audio_ttfp = output.latency - - try: - audio_segment = AudioSegment.from_file(io.BytesIO(audio_bytes)) - output.audio_duration = len(audio_segment) / 1000.0 - frame_width = audio_segment.frame_width - if frame_width > 0: - output.audio_frames = len(audio_segment.raw_data) // frame_width - else: - output.audio_frames = 0 - logger.warning("Audio frame width is zero") - if output.audio_duration > 0: - # rtf = audio_generate_time / audio_duration and - # audio_generate_time = latency since this is a non-streaming request - # so the time to receive last portion of audio is the latency - output.audio_rtf = output.latency / output.audio_duration - else: - output.audio_rtf = 0 - logger.warning("Audio duration is zero") - output.success = True - except Exception as e: - output.success = False - output.error = f"Failed to parse audio response: {e}" - logger.error(f"ERROR: Failed to parse audio response: {e}") + + total_samples = total_pcm_bytes // (sample_width * channels) + output.audio_duration = total_samples / sample_rate + output.audio_frames = total_samples + if output.audio_duration > 0: + output.audio_rtf = output.latency / output.audio_duration + else: + output.audio_rtf = 0 + logger.warning("Audio duration is zero") + output.success = True else: output.error = response.reason or "" output.success = False