vllm-project · hsliuustc0106 · Mar 11, 2026 · Mar 6, 2026 · Mar 6, 2026 · Mar 6, 2026
@@ -107,8 +107,14 @@ def omni_server(request):
         print("OmniServer stopped")
 
 
-def run_benchmark(args: list, test_name: str, flow, dataset_name: str, num_prompt) -> Any:
-    """Generate synthetic image with random values."""
+def run_benchmark(
+    args: list,
+    test_name: str,
+    flow,
+    dataset_name: str,
+    num_prompt,
+) -> Any:
+    """Run a single benchmark iteration and return the parsed result JSON."""
     current_dt = datetime.now().strftime("%Y%m%d-%H%M%S")
     result_filename = f"result_{test_name}_{dataset_name}_{flow}_{num_prompt}_{current_dt}.json"
     if "--result-filename" in args:
@@ -117,10 +123,6 @@ def run_benchmark(args: list, test_name: str, flow, dataset_name: str, num_promp
         ["vllm", "bench", "serve", "--omni"]
         + args
         + [
-            "--backend",
-            "openai-chat-omni",
-            "--endpoint",
-            "/v1/chat/completions",
             "--save-result",
             "--result-dir",
             os.environ.get("BENCHMARK_DIR", "tests"),
@@ -196,7 +198,10 @@ def benchmark_params(request, omni_server):
     total = len(all_params)
     print(f"\n  Running benchmark {current}/{total} for {test_name}")
 
-    return {"test_name": test_name, "params": all_params[param_index]}
+    return {
+        "test_name": test_name,
+        "params": all_params[param_index],
+    }
 
 
 def assert_result(result, params, num_prompt):
@@ -266,14 +271,22 @@ def to_list(value, default=None):
     for qps, num_prompt in zip(qps_list, num_prompt_list):
         args = args + ["--request-rate", str(qps), "--num-prompts", str(num_prompt)]
         result = run_benchmark(
-            args=args, test_name=test_name, flow=qps, dataset_name=dataset_name, num_prompt=num_prompt
+            args=args,
+            test_name=test_name,
+            flow=qps,
+            dataset_name=dataset_name,
+            num_prompt=num_prompt,
         )
         assert_result(result, params, num_prompt=num_prompt)
 
     # concurrency test
     for concurrency, num_prompt in zip(max_concurrency_list, num_prompt_list):
         args = args + ["--max-concurrency", str(concurrency), "--num-prompts", str(num_prompt), "--request-rate", "inf"]
         result = run_benchmark(
-            args=args, test_name=test_name, flow=concurrency, dataset_name=dataset_name, num_prompt=num_prompt
+            args=args,
+            test_name=test_name,
+            flow=concurrency,
+            dataset_name=dataset_name,
+            num_prompt=num_prompt,
         )
         assert_result(result, params, num_prompt=num_prompt)
@@ -0,0 +1,99 @@
+# Stage config for running Qwen3-TTS with a 2-stage architecture
+# Stage 0: Talker (text -> 8-layer RVQ codec codes)
+# Stage 1: Code2Wav (codec codes -> audio waveform)
+#
+# The following config has been verified on 1x H100-80G GPU.
+async_chunk: true
+stage_args:
+  - stage_id: 0
+    stage_type: llm
+    runtime:
+      devices: "0"
+      max_batch_size: 4
+    engine_args:
+      model_stage: qwen3_tts
+      model_arch: Qwen3TTSTalkerForConditionalGeneration
+      hf_overrides:
+        architectures: [Qwen3TTSTalkerForConditionalGeneration]
+      worker_type: ar
+      scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
+      enforce_eager: false
+      trust_remote_code: true
+      async_scheduling: false
+      enable_prefix_caching: false
+      engine_output_type: latent
+      gpu_memory_utilization: 0.3
+      distributed_executor_backend: "mp"
+      max_num_batched_tokens: 512
+      max_model_len: 4096
+      custom_process_next_stage_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_tts.talker2code2wav_async_chunk
+    output_connectors:
+      to_stage_1: connector_of_shared_memory
+    default_sampling_params:
+      temperature: 0.9
+      top_k: 50
+      max_tokens: 4096
+      seed: 42
+      detokenize: false
+      repetition_penalty: 1.05
+      stop_token_ids: [2150]
+
+  - stage_id: 1
+    stage_type: llm
+    runtime:
+      devices: "0"
+      max_batch_size: 4
+    engine_args:
+      model_stage: code2wav
+      model_arch: Qwen3TTSCode2Wav
+      hf_overrides:
+        architectures: [Qwen3TTSCode2Wav]
+      worker_type: generation
+      scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler
+      enforce_eager: true
+      trust_remote_code: true
+      async_scheduling: false
+      enable_prefix_caching: false
+      engine_output_type: audio
+      gpu_memory_utilization: 0.2
+      distributed_executor_backend: "mp"
+      max_num_batched_tokens: 8192
+      max_model_len: 32768
+    engine_input_source: [0]
+    final_output: true
+    final_output_type: audio
+    input_connectors:
+      from_stage_0: connector_of_shared_memory
+    tts_args:
+      max_instructions_length: 500
+    default_sampling_params:
+      temperature: 0.0
+      top_p: 1.0
+      top_k: -1
+      max_tokens: 65536
+      seed: 42
+      detokenize: true
+      repetition_penalty: 1.0
+
+runtime:
+  enabled: true
+  defaults:
+    window_size: -1
+    max_inflight: 4
+
+  connectors:
+    connector_of_shared_memory:
+      name: SharedMemoryConnector
+      extra:
+        shm_threshold_bytes: 65536
+        codec_streaming: true
+        connector_get_sleep_s: 0.01
+        connector_get_max_wait_first_chunk: 3000
+        connector_get_max_wait: 300
+        codec_chunk_frames: 25
+        codec_left_context_frames: 25
+
+  edges:
+    - from: 0
+      to: 1
+      window_size: -1
@@ -8,6 +8,8 @@
         "benchmark_params": [
             {
                 "dataset_name": "random",
+                "backend": "openai-chat-omni",
+                "endpoint": "/v1/chat/completions",
                 "num_prompts": [
                     10,
                     40
@@ -28,6 +30,8 @@
             },
             {
                 "dataset_name": "random-mm",
+                "backend": "openai-chat-omni",
+                "endpoint": "/v1/chat/completions",
                 "num_prompts": [
                     10,
                     40
@@ -88,6 +92,8 @@
         "benchmark_params": [
             {
                 "dataset_name": "random",
+                "backend": "openai-chat-omni",
+                "endpoint": "/v1/chat/completions",
                 "num_prompts": [
                     10,
                     40
@@ -108,6 +114,8 @@
             },
             {
                 "dataset_name": "random-mm",
+                "backend": "openai-chat-omni",
+                "endpoint": "/v1/chat/completions",
                 "num_prompts": [
                     10,
                     40
@@ -140,5 +148,38 @@
                 }
             }
         ]
+    },
+    {
+        "test_name": "test_qwen3_tts",
+        "server_params": {
+            "model": "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice",
+            "stage_config_name": "qwen3_tts.yaml"
+        },
+        "benchmark_params": [
+            {
+                "dataset_name": "random",
+                "backend": "openai-audio-speech",
+                "endpoint": "/v1/audio/speech",
+                "num_prompts": [
+                    10,
+                    40
+                ],
+                "max_concurrency": [
+                    1,
+                    4
+                ],
+                "random_input_len": 100,
+                "random_output_len": 100,
+                "extra_body": {
+                    "voice": "Vivian",
+                    "language": "English"
+                },
+                "percentile-metrics": "ttft,e2el,audio_rtf,audio_ttfp,audio_duration",
+                "baseline": {
+                    "mean_audio_ttfp_ms": 100000,
+                    "mean_audio_rtf": 100000
+                }
+            }
+        ]
     }
 ]
@@ -208,27 +208,23 @@ async def async_request_openai_chat_omni_completions(
 async def async_request_openai_audio_speech(
     request_func_input: RequestFuncInput, session: aiohttp.ClientSession, pbar: tqdm | None = None
 ) -> MixRequestFuncOutput:
-    """Non-streaming request to /v1/audio/speech endpoint.
+    """Streaming request to /v1/audio/speech endpoint.
 
-    The endpoint returns raw audio bytes (e.g. WAV). Pass voice, instructions,
-    and other TTS-specific fields via ``extra_body``.
+    Sends ``stream=true`` with ``response_format=pcm`` so the server returns
+    raw PCM chunks as they are decoded. This allows measuring TTFP (time to
+    first audio packet) separately from E2EL.
     """
     api_url = request_func_input.api_url
     _validate_api_url(api_url, "OpenAI Audio Speech API", "audio/speech")
 
     payload = {
         "model": request_func_input.model_name if request_func_input.model_name else request_func_input.model,
         "input": request_func_input.prompt,
+        "stream": True,
+        "response_format": "pcm",
     }
     _update_payload_common(payload, request_func_input)
 
-    response_format = payload.get("response_format", "wav")
-    if response_format == "pcm":
-        raise ValueError(
-            "pcm response format is not supported yet. \
-        Please use other formats like wav, mp3, etc. instead."
-        )
-
     headers = {
         "Content-Type": "application/json",
         "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
@@ -238,41 +234,38 @@ async def async_request_openai_audio_speech(
     output = MixRequestFuncOutput()
     output.prompt_len = request_func_input.prompt_len
 
+    # PCM format: 16-bit signed, 24 kHz, mono
+    sample_rate = 24000
+    sample_width = 2  # 16-bit = 2 bytes
+    channels = 1
+
     st = time.perf_counter()
     output.start_time = st
+    total_pcm_bytes = 0
     try:
         async with session.post(url=api_url, json=payload, headers=headers) as response:
             if response.status == 200:
-                audio_bytes = await response.read()
+                async for chunk in response.content.iter_any():
+                    if not chunk:
+                        continue
+                    timestamp = time.perf_counter()
+                    if output.audio_ttfp == 0.0:
+                        output.audio_ttfp = timestamp - st
+                        output.ttft = output.audio_ttfp
+                    total_pcm_bytes += len(chunk)
+
                 end_time = time.perf_counter()
                 output.latency = end_time - st
-                # ttft = latency since this is a non-streaming request
-                # hence there is no distinction between first and last token/audio
-                output.ttft = output.latency
-                output.audio_ttfp = output.latency
-
-                try:
-                    audio_segment = AudioSegment.from_file(io.BytesIO(audio_bytes))
-                    output.audio_duration = len(audio_segment) / 1000.0
-                    frame_width = audio_segment.frame_width
-                    if frame_width > 0:
-                        output.audio_frames = len(audio_segment.raw_data) // frame_width
-                    else:
-                        output.audio_frames = 0
-                        logger.warning("Audio frame width is zero")
-                    if output.audio_duration > 0:
-                        # rtf = audio_generate_time / audio_duration and
-                        # audio_generate_time = latency since this is a non-streaming request
-                        # so the time to receive last portion of audio is the latency
-                        output.audio_rtf = output.latency / output.audio_duration
-                    else:
-                        output.audio_rtf = 0
-                        logger.warning("Audio duration is zero")
-                    output.success = True
-                except Exception as e:
-                    output.success = False
-                    output.error = f"Failed to parse audio response: {e}"
-                    logger.error(f"ERROR: Failed to parse audio response: {e}")
+
+                total_samples = total_pcm_bytes // (sample_width * channels)
+                output.audio_duration = total_samples / sample_rate
+                output.audio_frames = total_samples
+                if output.audio_duration > 0:
+                    output.audio_rtf = output.latency / output.audio_duration
+                else:
+                    output.audio_rtf = 0
+                    logger.warning("Audio duration is zero")
+                output.success = True
             else:
                 output.error = response.reason or ""
                 output.success = False