diff --git a/examples/online_serving/qwen3_tts/openai_speech_client.py b/examples/online_serving/qwen3_tts/openai_speech_client.py
index 4741a47158c..77e13b08ed2 100644
--- a/examples/online_serving/qwen3_tts/openai_speech_client.py
+++ b/examples/online_serving/qwen3_tts/openai_speech_client.py
@@ -71,7 +71,7 @@ def run_tts_generation(args) -> None:
     payload = {
         "model": args.model,
         "input": args.text,
-        "speaker": args.speaker,
+        "voice": args.speaker,
         "response_format": args.response_format,
     }
 
diff --git a/vllm_omni/entrypoints/openai/protocol/audio.py b/vllm_omni/entrypoints/openai/protocol/audio.py
index 9bc58f0094a..12c0b83636c 100644
--- a/vllm_omni/entrypoints/openai/protocol/audio.py
+++ b/vllm_omni/entrypoints/openai/protocol/audio.py
@@ -2,7 +2,7 @@ from typing import Literal
 
 import numpy as np
 
-from pydantic import BaseModel, Field, field_validator, model_validator
+from pydantic import AliasChoices, BaseModel, Field, field_validator, model_validator
 
 _MAX_EMBEDDING_DIM = 8192
 
@@ -12,6 +12,7 @@ class OpenAICreateSpeechRequest(BaseModel):
     model: str | None = None
     voice: str | None = Field(
         default=None,
+        validation_alias=AliasChoices("voice", "speaker"),
         description="Speaker/voice to use. For Qwen3-TTS: vivian, ryan, aiden, etc.",
     )
     instructions: str | None = Field(
diff --git a/vllm_omni/entrypoints/openai/serving_speech.py b/vllm_omni/entrypoints/openai/serving_speech.py
index 59d112fb7f3..42a25292fa4 100644
--- a/vllm_omni/entrypoints/openai/serving_speech.py
+++ b/vllm_omni/entrypoints/openai/serving_speech.py
@@ -416,6 +416,51 @@ def _get_uploaded_audio_data(self, voice_name: str) -> str | None:
             logger.error(f"Could not read audio file for voice {voice_name}: {e}")
             return None
 
+    def _get_uploaded_speaker_embedding(self, voice_name: str) -> list[float] | None:
+        """Load pre-computed speaker embedding for an uploaded voice.
+
+        Args:
+            voice_name: Name of the uploaded voice; lowercased before lookup.
+
+        Returns:
+            The embedding as a flat list of floats, or None if the voice is
+            unknown, was not uploaded with an embedding (i.e. it has audio
+            instead), or the embedding file is missing or cannot be loaded.
+        """
+        voice_name_lower = voice_name.lower()
+        if voice_name_lower not in self.uploaded_speakers:
+            return None
+
+        speaker_info = self.uploaded_speakers[voice_name_lower]
+        if speaker_info.get("embedding_source") != "direct":
+            return None
+
+        cache_file = speaker_info.get("cache_file")
+        if not cache_file or not Path(cache_file).exists():
+            logger.warning("Embedding file not found for voice %s: %s", voice_name, cache_file)
+            return None
+
+        try:
+            from safetensors.torch import load_file
+        except ImportError:
+            logger.error(
+                "The 'safetensors' package is required to load speaker embeddings. "
+                "Install it with: pip install safetensors"
+            )
+            return None
+
+        try:
+            tensors = load_file(cache_file)
+            if "speaker_embedding" not in tensors:
+                logger.warning("Key 'speaker_embedding' not found in %s for voice %s", cache_file, voice_name)
+                return None
+            # reshape(-1) rather than squeeze(): squeezing a one-element tensor yields a
+            # 0-d tensor, and tolist() on that returns a bare float instead of a list.
+            return tensors["speaker_embedding"].reshape(-1).tolist()
+        except Exception as e:
+            logger.error("Could not load embedding for voice %s: %s", voice_name, e)
+            return None
+
     async def upload_voice(
         self, audio_file: UploadFile, consent: str, name: str, *, ref_text: str | None = None
     ) -> dict:
@@ -816,11 +861,17 @@ def _validate_qwen_tts_request(self, request: OpenAICreateSpeechRequest) -> str
                 # voice is not None
                 voice_lower = request.voice.lower()
                 if voice_lower in self.uploaded_speakers:
-                    # Check if audio file exists for uploaded speaker
+                    # Check if data file exists for uploaded speaker
                     speaker_info = self.uploaded_speakers[voice_lower]
                     file_path = Path(speaker_info["file_path"])
                     if not file_path.exists():
-                        return f"Audio file for uploaded speaker '{request.voice}' not found on disk"
+                        return f"Data file for uploaded speaker '{request.voice}' not found on disk"
+                    # For embedding-uploaded voices, verify the cache is ready
+                    if speaker_info.get("embedding_source") == "direct":
+                        cache_file = speaker_info.get("cache_file")
+                        if not cache_file or not Path(cache_file).exists():
+                            status = speaker_info.get("cache_status", "unknown")
+                            return f"Speaker embedding for '{request.voice}' is not yet ready (cache_status='{status}')"
                 else:
                     # need ref_audio for built-in speaker
                     if request.ref_audio is None:
@@ -1056,19 +1107,34 @@ def _build_tts_params(self, request: OpenAICreateSpeechRequest) -> dict[str, Any
             # Uploaded voices use task_type="Base" (CustomVoice requires built-in spk_id).
             # If ref_text was provided at upload time, use in-context cloning; otherwise x_vector only.
             if request.voice.lower() in self.uploaded_speakers and request.ref_audio is None:
-                audio_data = self._get_uploaded_audio_data(request.voice)
-                if not audio_data:
-                    raise ValueError(f"Audio file for uploaded voice '{request.voice}' is missing or corrupted")
                 speaker_info = self.uploaded_speakers[request.voice.lower()]
-                stored_ref_text = speaker_info.get("ref_text")
-                params["ref_audio"] = [audio_data]
-                params["task_type"] = ["Base"]
-                if stored_ref_text:
-                    params["ref_text"] = [stored_ref_text]
-                    params["x_vector_only_mode"] = [False]
-                else:
+
+                # Check if this voice was uploaded with a pre-computed embedding
+                embedding = self._get_uploaded_speaker_embedding(request.voice)
+                if embedding is not None:
+                    params["voice_clone_prompt"] = [{"ref_spk_embedding": embedding}]
+                    params["task_type"] = ["Base"]
                     params["x_vector_only_mode"] = [True]
-                logger.info("Auto-set ref_audio for uploaded voice: %s (icl=%s)", request.voice, bool(stored_ref_text))
+                    logger.info("Auto-set speaker_embedding for uploaded voice: %s", request.voice)
+                elif speaker_info.get("embedding_source") == "direct":
+                    # Embedding-uploaded voice whose embedding failed to load: do not fall through
+                    # to the audio path, which would treat the stored data file as reference audio.
+                    raise ValueError(f"Speaker embedding for uploaded voice '{request.voice}' could not be loaded")
+                else:
+                    audio_data = self._get_uploaded_audio_data(request.voice)
+                    if not audio_data:
+                        raise ValueError(f"Audio file for uploaded voice '{request.voice}' is missing or corrupted")
+                    stored_ref_text = speaker_info.get("ref_text")
+                    params["ref_audio"] = [audio_data]
+                    params["task_type"] = ["Base"]
+                    if stored_ref_text:
+                        params["ref_text"] = [stored_ref_text]
+                        params["x_vector_only_mode"] = [False]
+                    else:
+                        params["x_vector_only_mode"] = [True]
+                    logger.info(
+                        "Auto-set ref_audio for uploaded voice: %s (icl=%s)", request.voice, bool(stored_ref_text)
+                    )
         elif params["task_type"][0] == "CustomVoice":
             params["speaker"] = ["Vivian"]  # Default for CustomVoice
 
@@ -1093,7 +1159,7 @@ def _build_tts_params(self, request: OpenAICreateSpeechRequest) -> dict[str, Any
             ]
             # speaker_embedding implies x_vector_only_mode
             params["x_vector_only_mode"] = [True]
-        elif request.x_vector_only_mode is not None:
+        elif request.x_vector_only_mode is not None and "voice_clone_prompt" not in params:
             params["x_vector_only_mode"] = [request.x_vector_only_mode]
 
         # Generation parameters
@@ -1381,6 +1447,8 @@ async def create_speech(
             return error_check_ret
 
         try:
+            request_id = f"speech-{random_uuid()}"
+
             if self._is_tts:
                 # Validate TTS parameters
                 validation_error = self._validate_tts_request(request)