Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
100 commits
Select commit Hold shift + click to select a range
c985bed
refactor the voice cache manager
Mar 23, 2026
e7cd85a
refactor the voice cache manager
Mar 23, 2026
38427a7
Merge branch 'main' of https://github.com/vllm-project/vllm-omni into…
Mar 23, 2026
587bfb2
Merge branch 'main' of https://github.com/vllm-project/vllm-omni into…
Mar 23, 2026
2bcd065
update api server and serving speech with new voicemanager.
Mar 24, 2026
7162276
update api server and serving speech with new voicemanager.
Mar 24, 2026
95159c9
remove old voice cache manager, not used.
Mar 24, 2026
469e3bf
remove old voice cache manager, not used.
Mar 24, 2026
b859bd2
update of the new voice cache manager.
Mar 24, 2026
7af99bc
update of the new voice cache manager.
Mar 24, 2026
7605c5b
only use voice name as hash in the cache and update the talker to inj…
Mar 24, 2026
444578f
only use voice name as hash in the cache and update the talker to inj…
Mar 24, 2026
783f96c
Merge branch 'main' of https://github.com/vllm-project/vllm-omni into…
Mar 24, 2026
2fe7afc
Merge branch 'main' of https://github.com/vllm-project/vllm-omni into…
Mar 24, 2026
10be0fb
Merge branch 'main' of https://github.com/vllm-project/vllm-omni into…
Mar 25, 2026
2de3fbe
Merge branch 'main' of https://github.com/vllm-project/vllm-omni into…
Mar 25, 2026
b7f6cb7
merge branch 'main' of https://github.com/vllm-project/vllm-omni into…
Mar 25, 2026
a52a22a
merge branch 'main' of https://github.com/vllm-project/vllm-omni into…
Mar 25, 2026
a46358a
merge main
Mar 25, 2026
3e64214
merge main
Mar 25, 2026
63e1d57
Merge branch 'main' of https://github.com/vllm-project/vllm-omni into…
Mar 26, 2026
aec41a4
Merge branch 'main' of https://github.com/vllm-project/vllm-omni into…
Mar 26, 2026
04f8148
fix
Mar 26, 2026
367ffe8
fix
Mar 26, 2026
050446e
Merge branch 'main' of https://github.com/vllm-project/vllm-omni into…
Mar 26, 2026
43ed9d9
Merge branch 'main' of https://github.com/vllm-project/vllm-omni into…
Mar 26, 2026
8a9f427
merge
Mar 26, 2026
c5eef1d
merge
Mar 26, 2026
a335dcf
update cache logic
Mar 26, 2026
1ae7e3a
update cache logic
Mar 26, 2026
971f1cc
add tests
Mar 26, 2026
a4bdb76
add tests
Mar 26, 2026
cb087ca
Merge branch 'main' of https://github.com/vllm-project/vllm-omni into…
Mar 26, 2026
061be77
Merge branch 'main' of https://github.com/vllm-project/vllm-omni into…
Mar 26, 2026
f9e37e8
add __init__
Mar 26, 2026
6c12b71
add __init__
Mar 26, 2026
17a8cab
little name change
Mar 26, 2026
622b194
little name change
Mar 26, 2026
e7a772c
add name change
Mar 26, 2026
4f35229
add name change
Mar 26, 2026
8c031f2
Merge branch 'main' of https://github.com/vllm-project/vllm-omni into…
Mar 26, 2026
a8745b6
Merge branch 'main' of https://github.com/vllm-project/vllm-omni into…
Mar 26, 2026
9392a13
fix
Mar 27, 2026
e54a233
fix
Mar 27, 2026
e6d8eee
Merge branch 'main' of https://github.com/vllm-project/vllm-omni into…
Mar 27, 2026
d2be7a5
Merge branch 'main' of https://github.com/vllm-project/vllm-omni into…
Mar 27, 2026
01ef9f5
Merge branch 'main' of https://github.com/vllm-project/vllm-omni into…
Mar 27, 2026
abeac89
Merge branch 'main' of https://github.com/vllm-project/vllm-omni into…
Mar 27, 2026
ed44e69
Merge branch 'main' of https://github.com/vllm-project/vllm-omni into…
Mar 27, 2026
46f1111
Merge branch 'main' of https://github.com/vllm-project/vllm-omni into…
Mar 27, 2026
7896406
fix _build_prompt_embeds
Mar 27, 2026
f0a3438
fix _build_prompt_embeds
Mar 27, 2026
5e35223
merge branch 'main' of https://github.com/vllm-project/vllm-omni into…
Mar 28, 2026
fb1495e
merge branch 'main' of https://github.com/vllm-project/vllm-omni into…
Mar 28, 2026
30fc2b5
update
Mar 28, 2026
f47c25f
update
Mar 28, 2026
479023e
Merge branch 'main' of https://github.com/vllm-project/vllm-omni into…
Mar 29, 2026
cb6fd2b
Merge branch 'main' of https://github.com/vllm-project/vllm-omni into…
Mar 29, 2026
bbd905d
remove metadata_manager and upadte serving_speech
Mar 29, 2026
a88bc00
remove metadata_manager and upadte serving_speech
Mar 29, 2026
2ff255b
update example
Mar 29, 2026
e1b9c72
update example
Mar 29, 2026
2939daa
Merge branch 'main' of https://github.com/vllm-project/vllm-omni into…
Mar 29, 2026
d0aafb1
Merge branch 'main' of https://github.com/vllm-project/vllm-omni into…
Mar 29, 2026
dbf8f95
Merge branch 'main' of https://github.com/vllm-project/vllm-omni into…
Mar 29, 2026
6ee443c
Merge branch 'main' of https://github.com/vllm-project/vllm-omni into…
Mar 29, 2026
e9a097f
update docs and logger
Mar 30, 2026
490591a
update docs and logger
Mar 30, 2026
8ac7cae
Merge branch 'main' of https://github.com/vllm-project/vllm-omni into…
Mar 30, 2026
9712e74
Merge branch 'main' of https://github.com/vllm-project/vllm-omni into…
Mar 30, 2026
47ba2ea
add env var for max number of entries in the cache
Mar 30, 2026
cdfa2e4
add env var for max number of entries in the cache
Mar 30, 2026
eea7a8a
update test
Mar 30, 2026
6ac1f6c
update test
Mar 30, 2026
29d71a7
Merge branch 'main' of https://github.com/vllm-project/vllm-omni into…
Mar 31, 2026
bec6c1a
Merge branch 'main' of https://github.com/vllm-project/vllm-omni into…
Mar 31, 2026
b20cc4c
Merge branch 'main' of https://github.com/vllm-project/vllm-omni into…
Apr 1, 2026
184a5bc
Merge branch 'main' of https://github.com/vllm-project/vllm-omni into…
Apr 1, 2026
8a66dcf
[AutoRound] Add offline quantized `W4A16` model support (#1777)
yiliu30 Apr 2, 2026
770f993
[Perf] Optimize Wan2.2 rotary embedding (#2393)
gcanlin Apr 2, 2026
2464a76
Add VACE support for WAN 2.1 conditional video generation (#1885)
tangbinh Apr 2, 2026
c7e710b
[skip ci][Bugfix] clean useless log (#2450)
R2-Y Apr 2, 2026
242b74c
[Test] Skip tests/e2e/online_serving/test_zimage_expansion.py due to …
zhumingjue138 Apr 2, 2026
773f260
[Feature] add session based audio streaming input (#2208)
Shirley125 Apr 2, 2026
ae19d5e
Update MRoPE config fallback logic (#2278)
vraiti Apr 2, 2026
a221dc8
[Docs] Update docs to use vllm-ascend v0.18.0rc1 (#2453)
gcanlin Apr 3, 2026
c327ee0
[BAGEL] [Feature]: Add `thinking mode` in Bagel multi-stage serving (…
princepride Apr 3, 2026
331563b
[BugFix][FishSpeech] Fix structured voice clone prefill conditioning …
Sy0307 Apr 3, 2026
8e53911
Refactor StageDiffusionClient and StageEngineCoreClient (#2006)
chickeyton Apr 3, 2026
2f3330a
[Perf] Skip Wan2.2 cross attn Ulysses SP (#2459)
gcanlin Apr 3, 2026
1d3cfa4
[Model] Add two stages inference for model LTX-2 distilled. (#2260)
Songrui625 Apr 3, 2026
4186e82
[Cleanup] Replace bare print() with logger and use specific exception…
Lidang-Jiang Apr 3, 2026
f9eb5c5
[Bugfix] Fix Flux2 Dev Guidance (#2433)
alex-jw-brooks Apr 3, 2026
061b470
Merge branch 'main' of https://github.com/vllm-project/vllm-omni into…
Apr 3, 2026
77cbae9
Merge branch 'main' of https://github.com/vllm-project/vllm-omni into…
Apr 3, 2026
8fd5d49
Retrigger CI
linyueqian Apr 3, 2026
7b695c6
Merge branch 'main' into feat/refactor-voice-cache-manager
linyueqian Apr 3, 2026
5c1702d
change tests path
Apr 3, 2026
8e8f323
Merge branch 'feat/refactor-voice-cache-manager' of https://github.co…
Apr 3, 2026
a03e2dc
remove broken test
Apr 3, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 24 additions & 3 deletions docs/serving/speech_api.md
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ Content-Type: application/json
| `instructions` | string | "" | Voice style/emotion instructions |
| `max_new_tokens` | integer | 2048 | Maximum tokens to generate |
| `initial_codec_chunk_frames` | integer | null | Per-request initial chunk size override for TTFA tuning. When null, IC is computed dynamically based on server load. |
| `stream` | bool | false | Stream raw PCM chunks as they are decoded (requires `response_format="pcm"`) |

**Supported languages:** Auto, Chinese, English, Japanese, Korean, German, French, Russian, Portuguese, Spanish, Italian

Expand All @@ -143,9 +144,23 @@ Lists available voices for the loaded model.

```json
{
"voices": ["aiden", "dylan", "eric", "ono_anna", "ryan", "serena", "sohee", "uncle_fu", "vivian"]
"voices": ["aiden", "dylan", "eric", "ono_anna", "ryan", "serena", "sohee", "uncle_fu", "vivian", "custom_voice_1"],
"uploaded_voices": [
{
"name": "custom_voice_1",
"consent": "user_consent_id",
"created_at": 1738660000,
"file_size": 1024000,
"mime_type": "audio/wav",
"ref_text": "The exact transcript of the audio sample.",
"speaker_description": "warm narrator"
}
]
}
```

`uploaded_voices` is always present (empty list when no custom voices have been uploaded). Fields `ref_text` and `speaker_description` are omitted per-entry when not provided at upload time.

```
POST /v1/audio/voices
Content-Type: multipart/form-data
Expand All @@ -161,6 +176,7 @@ Upload a new voice sample for voice cloning in Base task TTS requests.
| `consent` | string | Yes | Consent recording ID |
| `name` | string | Yes | Name for the new voice |
| `ref_text` | string | No | Transcript of the audio. When provided, enables in-context voice cloning (higher quality). Without it, only the speaker embedding is extracted. |
| `speaker_description` | string | No | Free-form description of the voice (e.g. "warm narrator", "energetic presenter"). Stored as metadata and returned in `GET /v1/audio/voices`. |

**Response Example:**

Expand All @@ -172,19 +188,24 @@ Upload a new voice sample for voice cloning in Base task TTS requests.
"consent": "user_consent_id",
"created_at": 1738660000,
"mime_type": "audio/wav",
"file_size": 1024000
"file_size": 1024000,
"ref_text": "The exact transcript of the audio sample.",
"speaker_description": "warm narrator"
}
}
```

Fields `ref_text` and `speaker_description` are omitted when not provided at upload time.

**Usage Example:**

```bash
curl -X POST http://localhost:8091/v1/audio/voices \
-F "audio_sample=@/path/to/voice_sample.wav" \
-F "consent=user_consent_id" \
-F "name=custom_voice_1" \
-F "ref_text=The exact transcript of the audio sample."
-F "ref_text=The exact transcript of the audio sample." \
-F "speaker_description=warm narrator"
```

## Streaming Text Input (WebSocket)
Expand Down
34 changes: 22 additions & 12 deletions docs/user_guide/examples/online_serving/qwen3_tts.md
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ curl -X POST http://localhost:8091/v1/audio/speech \
-H "Content-Type: application/json" \
-d '{
"input": "Hello, how are you?",
"speaker": "vivian",
"voice": "vivian",
"language": "English"
}' --output output.wav

Expand All @@ -168,7 +168,7 @@ curl -X POST http://localhost:8091/v1/audio/speech \
-H "Content-Type: application/json" \
-d '{
"input": "I am so excited!",
"speaker": "vivian",
"voice": "vivian",
"instructions": "Speak with great enthusiasm"
}' --output excited.wav

Expand All @@ -185,7 +185,7 @@ client = OpenAI(base_url="http://localhost:8091/v1", api_key="none")

response = client.audio.speech.create(
model="Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice",
speaker="vivian",
voice="vivian",
input="Hello, how are you?",
)

Expand All @@ -201,7 +201,7 @@ response = httpx.post(
"http://localhost:8091/v1/audio/speech",
json={
"input": "Hello, how are you?",
"speaker": "vivian",
"voice": "vivian",
"language": "English",
},
timeout=300.0,
Expand Down Expand Up @@ -237,12 +237,16 @@ List all available voices/speakers from the loaded model, including both built-i
"consent": "user_consent_id",
"created_at": 1738660000,
"file_size": 1024000,
"mime_type": "audio/wav"
"mime_type": "audio/wav",
"ref_text": "The exact transcript of the audio sample.",
"speaker_description": "warm narrator"
}
]
}
```

Fields `ref_text` and `speaker_description` are omitted per-entry when not provided at upload time.

#### POST /v1/audio/voices

Upload a new voice sample for voice cloning in Base task TTS requests.
Expand All @@ -252,6 +256,7 @@ Upload a new voice sample for voice cloning in Base task TTS requests.
- `consent` (required): Consent recording ID
- `name` (required): Name for the new voice
- `ref_text` (optional): Transcript of the audio. Enables in-context voice cloning (higher quality).
- `speaker_description` (optional): Free-form description of the voice (e.g. "warm narrator", "energetic presenter"). Stored as metadata.

**Response Example:**
```json
Expand All @@ -262,18 +267,23 @@ Upload a new voice sample for voice cloning in Base task TTS requests.
"consent": "user_consent_id",
"created_at": 1738660000,
"mime_type": "audio/wav",
"file_size": 1024000
"file_size": 1024000,
"ref_text": "The exact transcript of the audio sample.",
"speaker_description": "warm narrator"
}
}
```

Fields `ref_text` and `speaker_description` are omitted when not provided at upload time.

**Usage Example:**
```bash
curl -X POST http://localhost:8000/v1/audio/voices \
curl -X POST http://localhost:8091/v1/audio/voices \
-F "audio_sample=@/path/to/voice_sample.wav" \
-F "consent=user_consent_id" \
-F "name=custom_voice_1" \
-F "ref_text=The exact transcript of the audio sample."
-F "ref_text=The exact transcript of the audio sample." \
-F "speaker_description=warm narrator"
```

### Endpoint
Expand All @@ -290,7 +300,7 @@ This endpoint follows the [OpenAI Audio Speech API](https://platform.openai.com/
```json
{
"input": "Text to synthesize",
"speaker": "vivian",
"voice": "vivian",
"response_format": "wav",
"task_type": "CustomVoice",
"language": "Auto",
Expand All @@ -310,7 +320,7 @@ Returns binary audio data with appropriate `Content-Type` header (e.g., `audio/w

### Voice and language (summary)

- **Speaker**: Use the `speaker` request field to select the speaker (e.g., `vivian`, `ryan`, `aiden`). List available speakers with `GET /v1/audio/voices`.
- **Speaker**: Use the `voice` request field to select the speaker (e.g., `vivian`, `ryan`, `aiden`). List available speakers with `GET /v1/audio/voices`.
- **Language**: Use the `language` field for the codec language tag (`Auto`, `Chinese`, `English`, etc.). Default is `Auto` for automatic detection.
- **CustomVoice**: Requires a valid `voice` from the model’s speaker set. **VoiceDesign**: Use `instructions` to describe the voice. **Base**: Use `ref_audio` and `ref_text` for voice cloning.

Expand All @@ -322,7 +332,7 @@ Returns binary audio data with appropriate `Content-Type` header (e.g., `audio/w
| ----------------- | ------ | -------------- | ----------------------------------------------------------- |
| `input` | string | **required** | Text to synthesize |
| `model` | string | server's model | Model to use (optional, should match server if specified) |
| `speaker` | string | "vivian" | Speaker name (e.g., vivian, ryan, aiden) |
| `voice` | string | "vivian" | Speaker name (e.g., vivian, ryan, aiden) |
| `response_format` | string | "wav" | Audio format: wav, mp3, flac, pcm, aac, opus |
| `speed` | float | 1.0 | Playback speed (0.25-4.0, not supported with `stream=true`) |

Expand Down Expand Up @@ -357,7 +367,7 @@ curl -X POST http://localhost:8091/v1/audio/speech \
-H "Content-Type: application/json" \
-d '{
"input": "Hello, how are you?",
"speaker": "vivian",
"voice": "vivian",
"language": "English",
"stream": true,
"response_format": "pcm"
Expand Down
10 changes: 8 additions & 2 deletions examples/online_serving/qwen3_tts/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,7 @@ Upload a new voice sample for voice cloning in Base task TTS requests.
- `consent` (required): Consent recording ID
- `name` (required): Name for the new voice
- `ref_text` (optional): Transcript of the audio. Enables in-context voice cloning (higher quality).
- `speaker_description` (optional): Free-form description of the voice (e.g. "warm narrator", "energetic presenter").

**Response Example:**
```json
Expand All @@ -243,18 +244,23 @@ Upload a new voice sample for voice cloning in Base task TTS requests.
"consent": "user_consent_id",
"created_at": 1738660000,
"mime_type": "audio/wav",
"file_size": 1024000
"file_size": 1024000,
"ref_text": "The exact transcript of the audio sample.",
"speaker_description": "warm narrator"
}
}
```

Fields `ref_text` and `speaker_description` are omitted when not provided at upload time.

**Usage Example:**
```bash
curl -X POST http://localhost:8091/v1/audio/voices \
-F "audio_sample=@/path/to/voice_sample.wav" \
-F "consent=user_consent_id" \
-F "name=custom_voice_1" \
-F "ref_text=The exact transcript of the audio sample."
-F "ref_text=The exact transcript of the audio sample." \
-F "speaker_description=warm narrator"
```

### Endpoint
Expand Down
78 changes: 65 additions & 13 deletions tests/entrypoints/openai_api/test_serving_speech.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,17 +233,20 @@ async def list_voices():
uploaded_voices = []
if hasattr(speech_server, "uploaded_speakers"):
for voice_name, info in speech_server.uploaded_speakers.items():
uploaded_voices.append(
{
"name": info.get("name", voice_name),
"consent": info.get("consent", ""),
"created_at": info.get("created_at", 0),
"file_size": info.get("file_size", 0),
"mime_type": info.get("mime_type", ""),
"embedding_source": info.get("embedding_source", "audio"),
"embedding_dim": info.get("embedding_dim"),
}
)
voice_entry = {
"name": info.get("name", voice_name),
"consent": info.get("consent", ""),
"created_at": info.get("created_at", 0),
"file_size": info.get("file_size", 0),
"mime_type": info.get("mime_type", ""),
"embedding_source": info.get("embedding_source", "audio"),
"embedding_dim": info.get("embedding_dim"),
}
if info.get("ref_text"):
voice_entry["ref_text"] = info["ref_text"]
if info.get("speaker_description"):
voice_entry["speaker_description"] = info["speaker_description"]
uploaded_voices.append(voice_entry)
return {"voices": speakers, "uploaded_voices": uploaded_voices}

app.add_api_route("/v1/audio/voices", list_voices, methods=["GET"])
Expand All @@ -255,15 +258,22 @@ async def upload_voice(
speaker_embedding: str | None = Form(None),
consent: str = Form(...),
name: str = Form(...),
ref_text: str = Form(None),
ref_text: str | None = Form(None),
speaker_description: str | None = Form(None),
):
try:
if speaker_embedding is not None and audio_sample is not None:
raise ValueError("'audio_sample' and 'speaker_embedding' are mutually exclusive")
if speaker_embedding is not None:
result = await speech_server.upload_voice_embedding(speaker_embedding, consent, name)
elif audio_sample is not None:
result = await speech_server.upload_voice(audio_sample, consent, name, ref_text=ref_text)
result = await speech_server.upload_voice(
audio_sample,
consent,
name,
ref_text=ref_text,
speaker_description=speaker_description,
)
else:
raise ValueError("Either 'audio_sample' or 'speaker_embedding' must be provided")
return {"success": True, "voice": result}
Expand Down Expand Up @@ -397,6 +407,44 @@ def test_upload_voice_with_ref_text(self, client, tmp_path):
assert result["voice"].get("ref_text") == "Hello world transcript"
response = client.delete("/v1/audio/voices/test_voice_rt")

def test_upload_voice_with_speaker_description(self, client, tmp_path):
    """Upload a voice with a padded speaker_description and expect it back without the padding."""
    voice_url = "/v1/audio/voices/test_voice_vd"
    # A previous aborted run may have left this voice registered; remove it first.
    client.delete(voice_url)

    form_fields = {
        "consent": "c1",
        "name": "test_voice_vd",
        "speaker_description": " warm, energetic narrator ",
    }
    upload = {"audio_sample": ("test.wav", b"fake audio content" * 1000, "audio/wav")}

    response = client.post("/v1/audio/voices", files=upload, data=form_fields)
    try:
        assert response.status_code == 200
        payload = response.json()
        assert payload["success"] is True
        voice = payload["voice"]
        assert voice["name"] == "test_voice_vd"
        # The expected value has no leading/trailing spaces, unlike the submitted one.
        assert voice.get("speaker_description") == "warm, energetic narrator"
    finally:
        # Always unregister the voice so later tests see a clean state.
        client.delete(voice_url)

def test_upload_voice_speaker_description_in_listing(self, client):
    """Check that an uploaded speaker_description is reported by GET /v1/audio/voices."""
    voice_url = "/v1/audio/voices/test_voice_sd_list"
    # Remove any leftover voice with this name before uploading.
    client.delete(voice_url)

    form_fields = {
        "consent": "c1",
        "name": "test_voice_sd_list",
        "speaker_description": "calm female narrator",
    }
    upload = {"audio_sample": ("test.wav", b"fake audio content" * 1000, "audio/wav")}

    response = client.post("/v1/audio/voices", files=upload, data=form_fields)
    try:
        assert response.status_code == 200

        listing = client.get("/v1/audio/voices").json()
        # Index the uploaded entries by name so the new voice can be looked up directly.
        entries_by_name = {}
        for entry in listing["uploaded_voices"]:
            entries_by_name[entry["name"]] = entry

        assert "test_voice_sd_list" in entries_by_name
        listed_voice = entries_by_name["test_voice_sd_list"]
        assert listed_voice["speaker_description"] == "calm female narrator"
    finally:
        # Always unregister the voice so later tests see a clean state.
        client.delete(voice_url)

def test_upload_voice_file_too_large(self, client):
"""Test voice upload with file exceeding size limit."""
# Create a file larger than 10MB
Expand Down Expand Up @@ -850,6 +898,7 @@ def test_build_tts_params_with_uploaded_voice(self, speech_server):
"file_path": "/tmp/voice_samples/custom_voice_consent_123.wav",
"mime_type": "audio/wav",
"ref_text": None,
"created_at": 1711234567.89,
}
}
speech_server.supported_speakers = {"ryan", "vivian", "custom_voice"}
Expand All @@ -862,6 +911,7 @@ def test_build_tts_params_with_uploaded_voice(self, speech_server):
assert params["ref_audio"] == ["data:audio/wav;base64,ZmFrZWF1ZGlv"]
assert params["x_vector_only_mode"] == [True]
assert params["task_type"] == ["Base"]
assert params["voice_created_at"] == [1711234567.89]
assert "ref_text" not in params

def test_build_tts_params_with_uploaded_voice_ref_text(self, speech_server):
Expand All @@ -872,6 +922,7 @@ def test_build_tts_params_with_uploaded_voice_ref_text(self, speech_server):
"file_path": "/tmp/voice_samples/custom_voice_consent_123.wav",
"mime_type": "audio/wav",
"ref_text": "Hello world transcript",
"created_at": 1711234567.89,
}
}
speech_server.supported_speakers = {"ryan", "vivian", "custom_voice"}
Expand All @@ -885,6 +936,7 @@ def test_build_tts_params_with_uploaded_voice_ref_text(self, speech_server):
assert params["x_vector_only_mode"] == [False]
assert params["task_type"] == ["Base"]
assert params["ref_text"] == ["Hello world transcript"]
assert params["voice_created_at"] == [1711234567.89]

def test_build_tts_params_without_uploaded_voice(self, speech_server):
"""Test _build_tts_params does not auto-set ref_audio for non-uploaded voices."""
Expand Down
Loading
Loading