Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion examples/online_serving/qwen3_tts/openai_speech_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ def run_tts_generation(args) -> None:
payload = {
"model": args.model,
"input": args.text,
"speaker": args.speaker,
"voice": args.speaker,
"response_format": args.response_format,
}

Expand Down
3 changes: 2 additions & 1 deletion vllm_omni/entrypoints/openai/protocol/audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from typing import Literal

import numpy as np
from pydantic import BaseModel, Field, field_validator, model_validator
from pydantic import AliasChoices, BaseModel, Field, field_validator, model_validator

_MAX_EMBEDDING_DIM = 8192

Expand All @@ -12,6 +12,7 @@ class OpenAICreateSpeechRequest(BaseModel):
model: str | None = None
voice: str | None = Field(
default=None,
validation_alias=AliasChoices("voice", "speaker"),
description="Speaker/voice to use. For Qwen3-TTS: vivian, ryan, aiden, etc.",
)
instructions: str | None = Field(
Expand Down
85 changes: 71 additions & 14 deletions vllm_omni/entrypoints/openai/serving_speech.py
Original file line number Diff line number Diff line change
Expand Up @@ -416,6 +416,44 @@ def _get_uploaded_audio_data(self, voice_name: str) -> str | None:
logger.error(f"Could not read audio file for voice {voice_name}: {e}")
return None

def _get_uploaded_speaker_embedding(self, voice_name: str) -> list[float] | None:
"""Load pre-computed speaker embedding for an uploaded voice.

Returns the embedding as a list of floats, or None if the voice
was not uploaded with an embedding (i.e. it has audio instead).
"""
voice_name_lower = voice_name.lower()
if voice_name_lower not in self.uploaded_speakers:
return None

speaker_info = self.uploaded_speakers[voice_name_lower]
if speaker_info.get("embedding_source") != "direct":
return None

cache_file = speaker_info.get("cache_file")
if not cache_file or not Path(cache_file).exists():
logger.warning("Embedding file not found for voice %s: %s", voice_name, cache_file)
return None

try:
from safetensors.torch import load_file
except ImportError:
logger.error(
"The 'safetensors' package is required to load speaker embeddings. "
"Install it with: pip install safetensors"
)
return None

try:
tensors = load_file(cache_file)
if "speaker_embedding" not in tensors:
logger.warning("Key 'speaker_embedding' not found in %s for voice %s", cache_file, voice_name)
return None
return tensors["speaker_embedding"].squeeze().tolist()
except Exception as e:
logger.error("Could not load embedding for voice %s: %s", voice_name, e)
return None

async def upload_voice(
self, audio_file: UploadFile, consent: str, name: str, *, ref_text: str | None = None
) -> dict:
Expand Down Expand Up @@ -816,11 +854,17 @@ def _validate_qwen_tts_request(self, request: OpenAICreateSpeechRequest) -> str
# voice is not None
voice_lower = request.voice.lower()
if voice_lower in self.uploaded_speakers:
# Check if audio file exists for uploaded speaker
# Check if data file exists for uploaded speaker
speaker_info = self.uploaded_speakers[voice_lower]
file_path = Path(speaker_info["file_path"])
if not file_path.exists():
return f"Audio file for uploaded speaker '{request.voice}' not found on disk"
return f"Data file for uploaded speaker '{request.voice}' not found on disk"
# For embedding-uploaded voices, verify the cache is ready
if speaker_info.get("embedding_source") == "direct":
cache_file = speaker_info.get("cache_file")
if not cache_file or not Path(cache_file).exists():
status = speaker_info.get("cache_status", "unknown")
return f"Speaker embedding for '{request.voice}' is not yet ready (cache_status='{status}')"
else:
# need ref_audio for built-in speaker
if request.ref_audio is None:
Expand Down Expand Up @@ -1056,19 +1100,30 @@ def _build_tts_params(self, request: OpenAICreateSpeechRequest) -> dict[str, Any
# Uploaded voices use task_type="Base" (CustomVoice requires built-in spk_id).
# If ref_text was provided at upload time, use in-context cloning; otherwise x_vector only.
if request.voice.lower() in self.uploaded_speakers and request.ref_audio is None:
audio_data = self._get_uploaded_audio_data(request.voice)
if not audio_data:
raise ValueError(f"Audio file for uploaded voice '{request.voice}' is missing or corrupted")
speaker_info = self.uploaded_speakers[request.voice.lower()]
stored_ref_text = speaker_info.get("ref_text")
params["ref_audio"] = [audio_data]
params["task_type"] = ["Base"]
if stored_ref_text:
params["ref_text"] = [stored_ref_text]
params["x_vector_only_mode"] = [False]
else:

# Check if this voice was uploaded with a pre-computed embedding
embedding = self._get_uploaded_speaker_embedding(request.voice)
if embedding is not None:
params["voice_clone_prompt"] = [{"ref_spk_embedding": embedding}]
params["task_type"] = ["Base"]
params["x_vector_only_mode"] = [True]
logger.info("Auto-set ref_audio for uploaded voice: %s (icl=%s)", request.voice, bool(stored_ref_text))
logger.info("Auto-set speaker_embedding for uploaded voice: %s", request.voice)
else:
audio_data = self._get_uploaded_audio_data(request.voice)
if not audio_data:
raise ValueError(f"Audio file for uploaded voice '{request.voice}' is missing or corrupted")
stored_ref_text = speaker_info.get("ref_text")
params["ref_audio"] = [audio_data]
params["task_type"] = ["Base"]
if stored_ref_text:
params["ref_text"] = [stored_ref_text]
params["x_vector_only_mode"] = [False]
else:
params["x_vector_only_mode"] = [True]
logger.info(
"Auto-set ref_audio for uploaded voice: %s (icl=%s)", request.voice, bool(stored_ref_text)
)

elif params["task_type"][0] == "CustomVoice":
params["speaker"] = ["Vivian"] # Default for CustomVoice
Expand All @@ -1093,7 +1148,7 @@ def _build_tts_params(self, request: OpenAICreateSpeechRequest) -> dict[str, Any
]
# speaker_embedding implies x_vector_only_mode
params["x_vector_only_mode"] = [True]
elif request.x_vector_only_mode is not None:
elif request.x_vector_only_mode is not None and "voice_clone_prompt" not in params:
params["x_vector_only_mode"] = [request.x_vector_only_mode]

# Generation parameters
Expand Down Expand Up @@ -1381,6 +1436,8 @@ async def create_speech(
return error_check_ret

try:
request_id = f"speech-{random_uuid()}"

if self._is_tts:
# Validate TTS parameters
validation_error = self._validate_tts_request(request)
Expand Down
Loading