-
Notifications
You must be signed in to change notification settings - Fork 1k
fix: use ref_audio path for uploaded custom voices in voxtral TTS inference (#2479) #2547
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -1183,6 +1183,29 @@ def _build_voxtral_prompt(self, request: OpenAICreateSpeechRequest) -> dict[str, | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| mistral_tokenizer = cached_tokenizer_from_config(self.engine_client.model_config) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| self._tts_tokenizer = mistral_tokenizer.instruct | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| if voice is not None: | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| # For custom uploaded voices, mistral_common doesn't know the voice name. | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| # Resolve to reference audio data stored at upload time instead. | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| voice_lower = voice.lower() | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| if voice_lower in self.uploaded_speakers: | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| speaker_info = self.uploaded_speakers[voice_lower] | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| file_path = Path(speaker_info["file_path"]) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| if file_path.exists(): | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| with open(file_path, "rb") as f: | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| audio_bytes = f.read() | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| audio_b64 = base64.b64encode(audio_bytes).decode("utf-8") | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| mime_type = speaker_info.get("mime_type", "audio/wav") | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|
Comment on lines
+1191
to
+1196
|
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| file_path = Path(speaker_info["file_path"]) | |
| if file_path.exists(): | |
| with open(file_path, "rb") as f: | |
| audio_bytes = f.read() | |
| audio_b64 = base64.b64encode(audio_bytes).decode("utf-8") | |
| mime_type = speaker_info.get("mime_type", "audio/wav") | |
| mime_type = speaker_info.get("mime_type", "audio/wav") | |
| embedding_source = speaker_info.get("embedding_source") | |
| is_audio_backed = embedding_source == "audio" or mime_type.startswith("audio/") | |
| if not is_audio_backed: | |
| raise ValueError( | |
| f"Uploaded voice '{voice}' is embedding-only and cannot be used as Voxtral " | |
| "reference audio. Please provide an audio-backed uploaded voice or pass ref_audio." | |
| ) | |
| file_path = Path(speaker_info["file_path"]) | |
| if file_path.exists(): | |
| with open(file_path, "rb") as f: | |
| audio_bytes = f.read() | |
| audio_b64 = base64.b64encode(audio_bytes).decode("utf-8") |
Copilot
AI
Apr 7, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This reimplements the same file-read/base64 logic as _get_uploaded_audio_data(), then immediately strips the data-URI prefix. Consider reusing _get_uploaded_audio_data(voice) and then partition(',') (or have a helper that returns raw base64) to avoid duplication and keep behavior consistent across call sites.
Copilot
AI
Apr 7, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Falling through to the voice-name path when the uploaded voice file is missing will still fail for custom voices (the tokenizer doesn’t recognize the name). Instead of falling through, raise a user-facing error indicating the uploaded voice’s reference audio is missing/unreadable (and possibly suggest re-upload).
| if file_path.exists(): | |
| with open(file_path, "rb") as f: | |
| audio_bytes = f.read() | |
| audio_b64 = base64.b64encode(audio_bytes).decode("utf-8") | |
| mime_type = speaker_info.get("mime_type", "audio/wav") | |
| ref_audio = f"data:{mime_type};base64,{audio_b64}" | |
| # Strip data URI prefix for mistral_common | |
| _, _, ref_audio = ref_audio.partition(",") | |
| tokenized = self._tts_tokenizer.encode_speech_request( | |
| SpeechRequest(input=text, ref_audio=ref_audio) | |
| ) | |
| audio = tokenized.audios[0] | |
| return { | |
| "prompt_token_ids": tokenized.tokens, | |
| "multi_modal_data": {"audio": [(audio.audio_array, audio.sampling_rate)]}, | |
| } | |
| # Fall through to voice-name path if file is missing | |
| if not file_path.exists(): | |
| raise ValueError( | |
| f"Reference audio for uploaded voice '{voice}' is missing. " | |
| "Please re-upload the voice sample and try again." | |
| ) | |
| try: | |
| with open(file_path, "rb") as f: | |
| audio_bytes = f.read() | |
| except OSError as e: | |
| raise ValueError( | |
| f"Reference audio for uploaded voice '{voice}' could not be read. " | |
| "Please re-upload the voice sample and try again." | |
| ) from e | |
| audio_b64 = base64.b64encode(audio_bytes).decode("utf-8") | |
| mime_type = speaker_info.get("mime_type", "audio/wav") | |
| ref_audio = f"data:{mime_type};base64,{audio_b64}" | |
| # Strip data URI prefix for mistral_common | |
| _, _, ref_audio = ref_audio.partition(",") | |
| tokenized = self._tts_tokenizer.encode_speech_request( | |
| SpeechRequest(input=text, ref_audio=ref_audio) | |
| ) | |
| audio = tokenized.audios[0] | |
| return { | |
| "prompt_token_ids": tokenized.tokens, | |
| "multi_modal_data": {"audio": [(audio.audio_array, audio.sampling_rate)]}, | |
| } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please add a unit test covering Voxtral inference with an uploaded voice name: ensure
`_build_voxtral_prompt()` resolves the voice to the stored reference audio and calls `encode_speech_request()` with `ref_audio` (not `voice`). This prevents regressions of #2479 and covers the new branch.