37 changes: 37 additions & 0 deletions docs/user_guide/examples/offline_inference/qwen3_tts.md
@@ -90,6 +90,43 @@ Examples:
python end2end.py --query-type Base --mode-tag icl
```

## Voice and Language Control

### Supported Voices (CustomVoice)

Predefined speaker voices are set via the `speaker` (or `voice_type`) field in `additional_information`. Available speakers depend on the loaded checkpoint; check `talker_config.spk_id` in the model config for the full list. Common voices include `vivian`, `ryan`, `aiden`, `ethan`, `serena` (case-insensitive).

Pass the speaker name in your request:

```python
additional_information = {
"text": ["你好,我是通义千问"],
"task_type": ["CustomVoice"],
"speaker": ["Vivian"],
"language": ["Chinese"],
}
```

### Supported Languages

The `language` field controls the codec-level language tag. Use `"Auto"` (default) for automatic detection.

Supported values: `Auto`, `Chinese`, `English`, `Japanese`, `Korean`, `German`, `French`, `Russian`, `Portuguese`, `Spanish`, `Italian`.

```python
additional_information = {
"text": ["Hello, nice to meet you."],
"task_type": ["CustomVoice"],
"speaker": ["Aiden"],
"language": ["English"],
}
```

### VoiceDesign and Base

- **VoiceDesign**: Use `instruct` for a natural-language voice description; no `speaker` is needed.
- **Base**: Use `ref_audio` and `ref_text` for voice cloning; `language` is optional. A sketch of both payloads follows.
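
A rough sketch of the corresponding `additional_information` payloads. The field names `instruct`, `ref_audio`, and `ref_text` are assumed to mirror the CustomVoice examples above, and the audio path is a placeholder; verify the exact keys against your checkpoint's processor:

```python
# VoiceDesign: describe the voice in natural language (field name assumed).
additional_information = {
    "text": ["Welcome aboard, and enjoy the ride!"],
    "task_type": ["VoiceDesign"],
    "instruct": ["A warm, low-pitched male voice, speaking slowly and calmly"],
    "language": ["English"],
}

# Base: clone a voice from reference audio and its transcript (path is a placeholder).
additional_information = {
    "text": ["Nice to meet you."],
    "task_type": ["Base"],
    "ref_audio": ["/path/to/reference.wav"],
    "ref_text": ["Transcript of the reference audio."],
}
```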

## Streaming Mode

Add `--streaming` to stream audio chunks progressively via `AsyncOmni` (requires `async_chunk: true` in the stage config):
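
```bash
# A representative command (sketch; flags combined from the examples above):
python end2end.py --query-type Base --mode-tag icl --streaming
```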
4 changes: 0 additions & 4 deletions docs/user_guide/examples/online_serving/qwen2_5_omni.md
@@ -223,10 +223,6 @@ sudo apt install ffmpeg
``````py
--8<-- "examples/online_serving/qwen2_5_omni/gradio_demo.py"
``````
-??? abstract "openai_chat_completion_client_for_multimodal_generation.py"
-    ``````py
-    --8<-- "examples/online_serving/qwen2_5_omni/openai_chat_completion_client_for_multimodal_generation.py"
-    ``````
??? abstract "run_curl_multimodal_generation.sh"
``````sh
--8<-- "examples/online_serving/qwen2_5_omni/run_curl_multimodal_generation.sh"
26 changes: 16 additions & 10 deletions docs/user_guide/examples/online_serving/qwen3_tts.md
@@ -103,13 +103,13 @@ cd examples/online_serving/qwen3_tts
# CustomVoice: Use predefined speaker
python openai_speech_client.py \
--text "你好,我是通义千问" \
-    --voice vivian \
+    --speaker vivian \
--language Chinese

# CustomVoice with style instruction
python openai_speech_client.py \
--text "今天天气真好" \
-    --voice ryan \
+    --speaker ryan \
--instructions "用开心的语气说"

# VoiceDesign: Describe the voice style
@@ -134,7 +134,7 @@ The Python client supports the following command-line arguments:
- `--model` (or `-m`): Model name/path (default: `Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice`)
- `--task-type` (or `-t`): TTS task type. Options: `CustomVoice`, `VoiceDesign`, `Base`
- `--text`: Text to synthesize (required)
-- `--voice`: Speaker/voice name (default: `vivian`). Options: `vivian`, `ryan`, `aiden`, etc.
+- `--speaker`: Speaker name (default: `vivian`). Options: `vivian`, `ryan`, `aiden`, etc.
- `--language`: Language. Options: `Auto`, `Chinese`, `English`, `Japanese`, `Korean`, `German`, `French`, `Russian`, `Portuguese`, `Spanish`, `Italian`
- `--instructions`: Voice style/emotion instructions
- `--ref-audio`: Reference audio file path or URL for voice cloning (Base task)
@@ -150,7 +150,7 @@ curl -X POST http://localhost:8091/v1/audio/speech \
-H "Content-Type: application/json" \
-d '{
"input": "Hello, how are you?",
"voice": "vivian",
"speaker": "vivian",
"language": "English"
}' --output output.wav

@@ -159,7 +159,7 @@ curl -X POST http://localhost:8091/v1/audio/speech \
-H "Content-Type: application/json" \
-d '{
"input": "I am so excited!",
"voice": "vivian",
"speaker": "vivian",
"instructions": "Speak with great enthusiasm"
}' --output excited.wav

@@ -176,7 +176,7 @@ client = OpenAI(base_url="http://localhost:8091/v1", api_key="none")

response = client.audio.speech.create(
model="Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice",
voice="vivian",
speaker="vivian",
input="Hello, how are you?",
)

@@ -192,7 +192,7 @@ response = httpx.post(
"http://localhost:8091/v1/audio/speech",
json={
"input": "Hello, how are you?",
"voice": "vivian",
"speaker": "vivian",
"language": "English",
},
timeout=300.0,
@@ -279,7 +279,7 @@ This endpoint follows the [OpenAI Audio Speech API](https://platform.openai.com/
```json
{
"input": "Text to synthesize",
"voice": "vivian",
"speaker": "vivian",
"response_format": "wav",
"task_type": "CustomVoice",
"language": "Auto",
@@ -297,6 +297,12 @@ This endpoint follows the [OpenAI Audio Speech API](https://platform.openai.com/

Returns binary audio data with appropriate `Content-Type` header (e.g., `audio/wav`).

### Voice and Language (Summary)

- **Speaker**: Use the `speaker` request field to select the speaker (e.g., `vivian`, `ryan`, `aiden`). List available speakers with `GET /v1/audio/voices` (example below).
- **Language**: Use the `language` field for the codec language tag (`Auto`, `Chinese`, `English`, etc.). Default is `Auto` for automatic detection.
- **CustomVoice**: Requires a valid `speaker` from the model's speaker set.
- **VoiceDesign**: Use `instructions` to describe the voice.
- **Base**: Use `ref_audio` and `ref_text` for voice cloning.
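
For example, to list the speakers exposed by a running server (endpoint as noted above, port matching the earlier examples):

```bash
curl http://localhost:8091/v1/audio/voices
```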

## Parameters

### OpenAI Standard Parameters
@@ -305,7 +311,7 @@ Returns binary audio data with appropriate `Content-Type` header (e.g., `audio/w
| ----------------- | ------ | -------------- | ----------------------------------------------------------- |
| `input` | string | **required** | Text to synthesize |
| `model` | string | server's model | Model to use (optional, should match server if specified) |
-| `voice`           | string | "vivian"       | Speaker name (e.g., vivian, ryan, aiden)                     |
+| `speaker`         | string | "vivian"       | Speaker name (e.g., vivian, ryan, aiden)                     |
| `response_format` | string | "wav" | Audio format: wav, mp3, flac, pcm, aac, opus |
| `speed` | float | 1.0 | Playback speed (0.25-4.0, not supported with `stream=true`) |

Expand Down Expand Up @@ -340,7 +346,7 @@ curl -X POST http://localhost:8091/v1/audio/speech \
-H "Content-Type: application/json" \
-d '{
"input": "Hello, how are you?",
"voice": "vivian",
"speaker": "vivian",
"language": "English",
"stream": true,
"response_format": "pcm"
@@ -16,6 +16,14 @@ class QueryResult(NamedTuple):
limit_mm_per_prompt: dict[str, int]


def make_audio_output_filename(request_id: str | None, index: int) -> str:
"""Build a stable output filename using request ID when available."""
if not request_id:
request_id = f"unknown_{index}"
safe_request_id = "".join(ch if (ch.isalnum() or ch in ("-", "_")) else "_" for ch in request_id)
return f"audio_{safe_request_id}_{index}.wav"


def encode_base64_content_from_url(content_url: str) -> str:
"""Encode a content retrieved from a remote url to base64 format."""

@@ -165,6 +173,34 @@ def get_system_prompt():
}


def _parse_csv_arg(value: str | None) -> list[str]:
    """Split a comma-separated CLI value into trimmed, non-empty items."""
    if not value:
return []
return [item.strip() for item in value.split(",") if item.strip()]
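
# Example: _parse_csv_arg("Ethan, Vivian,") -> ["Ethan", "Vivian"]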


def _build_prompt_for_query_type(
    query_type: str,
    custom_prompt: str | None,
    video_path: str | None,
    image_path: str | None,
    audio_path: str | None,
):
    """Build the chat prompt for ``query_type``, forwarding only the media path it needs."""
    query_func = query_map[query_type]
if query_type == "use_video":
return query_func(video_path=video_path, custom_prompt=custom_prompt)
if query_type == "use_image":
return query_func(image_path=image_path, custom_prompt=custom_prompt)
if query_type == "use_audio":
return query_func(audio_path=audio_path, custom_prompt=custom_prompt)
if query_type == "text":
return query_func(custom_prompt=custom_prompt)
if query_type == "use_audio_in_video":
return query_func(video_path=video_path, custom_prompt=custom_prompt)
# use_mixed_modalities / use_multi_audios
return query_func(custom_prompt=custom_prompt)


def get_text_query(custom_prompt: str | None = None):
question = (
custom_prompt or "Explain the system architecture for a scalable audio generation pipeline. Answer in 15 words."
@@ -379,38 +415,44 @@ def run_multimodal_generation(args, client: OpenAI) -> None:
audio_path = getattr(args, "audio_path", None)
custom_prompt = getattr(args, "prompt", None)

-    # Get the query function and call it with appropriate parameters
-    query_func = query_map[args.query_type]
-    if args.query_type == "use_video":
-        prompt = query_func(video_path=video_path, custom_prompt=custom_prompt)
-    elif args.query_type == "use_image":
-        prompt = query_func(image_path=image_path, custom_prompt=custom_prompt)
-    elif args.query_type == "use_audio":
-        prompt = query_func(audio_path=audio_path, custom_prompt=custom_prompt)
-    elif args.query_type == "text":
-        prompt = query_func(custom_prompt=custom_prompt)
-    elif args.query_type == "use_audio_in_video":
-        prompt = query_func(
-            video_path=video_path,
-            custom_prompt=custom_prompt,
-        )
-    else:
-        prompt = query_func()

-    extra_body = {
-        "sampling_params_list": sampling_params_list # Optional, it has a default setting in stage_configs of the corresponding model.
-    }

-    if args.query_type == "use_audio_in_video":
-        extra_body["mm_processor_kwargs"] = {"use_audio_in_video": True}

if args.modalities is not None:
output_modalities = args.modalities.split(",")
else:
output_modalities = None

# Test multiple concurrent completions
num_concurrent_requests = args.num_concurrent_requests
prompt_list = _parse_csv_arg(getattr(args, "prompts", None))
speaker_list = _parse_csv_arg(getattr(args, "speakers", None))

request_payloads = []
for idx in range(num_concurrent_requests):
        # Pick this request's prompt: indexed entry, last entry for overflow, else the single --prompt value.
        if idx < len(prompt_list):
            per_req_prompt = prompt_list[idx]
        elif prompt_list:
            per_req_prompt = prompt_list[-1]
        else:
            per_req_prompt = custom_prompt
        # Pick this request's speaker with the same fallback scheme.
        if idx < len(speaker_list):
            per_req_speaker = speaker_list[idx]
        elif speaker_list:
            per_req_speaker = speaker_list[-1]
        else:
            per_req_speaker = args.speaker
prompt = _build_prompt_for_query_type(
query_type=args.query_type,
custom_prompt=per_req_prompt,
video_path=video_path,
image_path=image_path,
audio_path=audio_path,
)
extra_body = {
# Optional, it has default settings in stage configs.
"sampling_params_list": sampling_params_list
}
if args.query_type == "use_audio_in_video":
extra_body["mm_processor_kwargs"] = {"use_audio_in_video": True}
if per_req_speaker and per_req_speaker.strip():
extra_body["speaker"] = per_req_speaker.strip()
request_payloads.append({"prompt": prompt, "extra_body": extra_body})

with concurrent.futures.ThreadPoolExecutor(max_workers=num_concurrent_requests) as executor:
# Submit multiple completion requests concurrently
@@ -419,14 +461,14 @@
client.chat.completions.create,
messages=[
get_system_prompt(),
-                    prompt,
+                    payload["prompt"],
],
model=model_name,
modalities=output_modalities,
-                extra_body=extra_body,
+                extra_body=payload["extra_body"],
stream=args.stream,
)
-            for _ in range(num_concurrent_requests)
+            for payload in request_payloads
]

# Wait for all requests to complete and collect results
@@ -437,10 +479,11 @@
if not args.stream:
# Verify all completions succeeded
for chat_completion in chat_completions:
request_id = getattr(chat_completion, "id", None)
for choice in chat_completion.choices:
if choice.message.audio:
audio_data = base64.b64decode(choice.message.audio.data)
-                    audio_file_path = f"audio_{count}.wav"
+                    audio_file_path = make_audio_output_filename(request_id=request_id, index=count)
with open(audio_file_path, "wb") as f:
f.write(audio_data)
print(f"Audio saved to {audio_file_path}")
@@ -459,7 +502,8 @@

if getattr(chunk, "modality", None) == "audio" and content:
audio_data = base64.b64decode(content)
-                audio_file_path = f"audio_{count}.wav"
+                request_id = getattr(chunk, "id", None)
+                audio_file_path = make_audio_output_filename(request_id=request_id, index=count)
with open(audio_file_path, "wb") as f:
f.write(audio_data)
print(f"\nAudio saved to {audio_file_path}")
@@ -546,6 +590,30 @@ def parse_args():
default="localhost",
help="Host/IP of the vLLM Omni API server.",
)
parser.add_argument(
"--speaker",
type=str,
default=None,
help="TTS speaker/voice for audio output (e.g. Ethan, Vivian). Passed via extra_body to the talker stage.",
)
parser.add_argument(
"--speakers",
type=str,
default=None,
help=(
"Comma-separated speakers for concurrent requests, e.g. "
"'Ethan,Vivian,Ryan'. Overrides --speaker per request."
),
)
parser.add_argument(
"--prompts",
type=str,
default=None,
help=(
"Comma-separated prompts for concurrent requests. "
"If fewer than --num-concurrent-requests, the last prompt is reused."
),
)
return parser.parse_args()
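
# Example invocation exercising the new flags (script name is a placeholder for
# whichever client entry point defines this parser):
#   python client.py --num-concurrent-requests 3 \
#       --prompts "Describe a sunset,Tell a joke,Count to five" \
#       --speakers "Ethan,Vivian,Ryan"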

