From 4cd4fa19fe1d0f41486ec83de38323b582d2688b Mon Sep 17 00:00:00 2001 From: reidliu41 Date: Tue, 14 Apr 2026 09:19:06 +0800 Subject: [PATCH 1/2] refactor: normalize speech request handling in serving_speech - add a shared SpeechRequestNormalized helper for speech request canonicalization - centralize uploaded voice resolution for audio and embedding-backed voices - preserve explicit-vs-auto-resolved clone input semantics across validation and generation - route TTS validation, param building, and prepare_speech_generation through the normalized path - add regression tests for uploaded voice normalization and prepare_speech_generation Signed-off-by: reidliu41 --- .../openai_api/test_serving_speech.py | 111 +++++++++ .../entrypoints/openai/serving_speech.py | 224 ++++++++++++------ 2 files changed, 259 insertions(+), 76 deletions(-) diff --git a/tests/entrypoints/openai_api/test_serving_speech.py b/tests/entrypoints/openai_api/test_serving_speech.py index c884120620..2e2c96085f 100644 --- a/tests/entrypoints/openai_api/test_serving_speech.py +++ b/tests/entrypoints/openai_api/test_serving_speech.py @@ -654,6 +654,87 @@ def speech_server(self, mocker: MockerFixture): yield server server.shutdown() + def test_normalize_speech_request_preserves_voice_but_adds_lookup(self, speech_server): + req = OpenAICreateSpeechRequest(input="Hello", voice="Ryan") + + normalized = speech_server._normalize_speech_request(req) + + assert normalized.voice == "Ryan" + assert normalized.voice_lookup == "ryan" + + def test_normalize_speech_request_resolves_uploaded_audio_voice(self, speech_server, mocker: MockerFixture): + speech_server.uploaded_speakers = { + "custom_voice": { + "name": "custom_voice", + "file_path": "/tmp/voice_samples/custom_voice.wav", + "mime_type": "audio/wav", + "embedding_source": "audio", + "ref_text": "Uploaded transcript", + } + } + mock_audio = mocker.patch.object( + speech_server, "_get_uploaded_audio_data", return_value="data:audio/wav;base64,ZmFrZQ==" + ) + req = OpenAICreateSpeechRequest(input="Hello", voice="CUSTOM_VOICE") + + normalized = speech_server._normalize_speech_request(req) + + assert normalized.voice == "CUSTOM_VOICE" + assert normalized.voice_lookup == "custom_voice" + assert normalized.ref_audio == "data:audio/wav;base64,ZmFrZQ==" + assert normalized.ref_text == "Uploaded transcript" + mock_audio.assert_called_once_with("custom_voice") + + def test_normalize_speech_request_resolves_uploaded_embedding_voice(self, speech_server, mocker: MockerFixture): + speech_server.uploaded_speakers = { + "emb_voice": { + "name": "emb_voice", + "file_path": "/tmp/voice_samples/emb_voice.safetensors", + "mime_type": "application/x-safetensors", + "embedding_source": "direct", + "cache_status": "ready", + "cache_file": "/tmp/voice_samples/emb_voice.safetensors", + } + } + mock_embedding = mocker.patch.object( + speech_server, "_get_uploaded_speaker_embedding", return_value=[0.1] * 1024 + ) + req = OpenAICreateSpeechRequest(input="Hello", voice="EMB_VOICE", x_vector_only_mode=False) + + normalized = speech_server._normalize_speech_request(req) + + assert normalized.voice == "EMB_VOICE" + assert normalized.voice_lookup == "emb_voice" + assert normalized.speaker_embedding is not None + assert normalized.x_vector_only_mode is True + mock_embedding.assert_called_once_with("emb_voice") + + def test_normalize_speech_request_does_not_backfill_uploaded_metadata_when_ref_audio_is_explicit( + self, speech_server, mocker: MockerFixture + ): + speech_server.uploaded_speakers = { + "custom_voice": { + "name": "custom_voice", + "file_path": "/tmp/voice_samples/custom_voice.wav", + "mime_type": "audio/wav", + "embedding_source": "audio", + "ref_text": "Uploaded transcript", + } + } + mock_audio = mocker.patch.object(speech_server, "_get_uploaded_audio_data") + req = OpenAICreateSpeechRequest( + input="Hello", + voice="CUSTOM_VOICE", + ref_audio="data:audio/wav;base64,ZXhwbGljaXQ=", + ) + + normalized = speech_server._normalize_speech_request(req) + + assert normalized.ref_audio == "data:audio/wav;base64,ZXhwbGljaXQ=" + assert normalized.ref_text is None + assert normalized.resolved_upload_audio is False + mock_audio.assert_not_called() + def test_is_tts_detection_no_stage(self, speech_server): """Test TTS model detection when no TTS stage exists.""" # Fixture creates server with stage_configs = [] -> _is_tts should be False @@ -2259,6 +2340,36 @@ def test_prepare_speech_generation_awaits_qwen3_tts_async(self, qwen3_tts_server qwen3_tts_server._build_tts_params.assert_called_once() qwen3_tts_server._estimate_prompt_len_async.assert_awaited_once() + def test_prepare_speech_generation_uploaded_audio_voice_preserves_auto_clone_params( + self, qwen3_tts_server, mocker: MockerFixture + ): + qwen3_tts_server.uploaded_speakers = { + "custom_voice": { + "name": "custom_voice", + "file_path": "/tmp/voice_samples/custom_voice.wav", + "mime_type": "audio/wav", + "embedding_source": "audio", + "ref_text": None, + "created_at": 1711234567.89, + } + } + mocker.patch("pathlib.Path.exists", return_value=True) + mocker.patch.object( + qwen3_tts_server, + "_get_uploaded_audio_data", + return_value="data:audio/wav;base64,ZmFrZQ==", + ) + qwen3_tts_server._resolve_ref_audio = mocker.AsyncMock(return_value=([0.1, 0.2, 0.3], 24000)) + qwen3_tts_server._estimate_prompt_len_async = mocker.AsyncMock(return_value=512) + + request = OpenAICreateSpeechRequest(input="hello", voice="CUSTOM_VOICE") + _, _, tts_params = asyncio.run(qwen3_tts_server._prepare_speech_generation(request)) + + assert tts_params["task_type"] == ["Base"] + assert tts_params["x_vector_only_mode"] == [True] + assert tts_params["voice_created_at"] == [1711234567.89] + assert tts_params["ref_audio"] == [[[0.1, 0.2, 0.3], 24000]] + def test_shutdown_is_idempotent(self, mocker: MockerFixture): """Calling shutdown() twice should not raise.""" mocker.patch.object(OmniOpenAIServingSpeech, "_load_supported_speakers", return_value=set()) diff --git a/vllm_omni/entrypoints/openai/serving_speech.py b/vllm_omni/entrypoints/openai/serving_speech.py index 3dc5f595d0..c4fcc084dd 100644 --- a/vllm_omni/entrypoints/openai/serving_speech.py +++ b/vllm_omni/entrypoints/openai/serving_speech.py @@ -8,6 +8,7 @@ import struct import time from concurrent.futures import ThreadPoolExecutor +from dataclasses import dataclass from pathlib import Path from typing import Any @@ -78,6 +79,25 @@ _TTS_MAX_NEW_TOKENS_MAX = 4096 +@dataclass +class SpeechRequestNormalized: + """Canonicalized speech request fields shared by serving paths.""" + + input_text: str + voice: str | None + voice_lookup: str | None + task_type: str | None + language: str | None + instructions: str | None + ref_audio: str | None + ref_text: str | None + speaker_embedding: list[float] | None + x_vector_only_mode: bool | None + uploaded_speaker_info: dict[str, Any] | None = None + resolved_upload_audio: bool = False + resolved_upload_embedding: bool = False + + def _create_wav_header(sample_rate: int, num_channels: int = 1, bits_per_sample: int = 16) -> bytes: """Create a WAV header with placeholder size values for streaming. @@ -512,6 +532,76 @@ def _get_uploaded_speaker_embedding(self, voice_name: str) -> list[float] | None logger.error("Could not load embedding for voice %s: %s", voice_name, e) return None + def _normalize_speech_request(self, request: OpenAICreateSpeechRequest) -> SpeechRequestNormalized: + """Normalize request fields before validation or model adaptation.""" + voice = request.voice + voice_lookup = request.voice.lower() if request.voice is not None else None + task_type = request.task_type + pre_resolved_upload_audio = bool(getattr(request, "_auto_resolved_upload_audio", False)) + pre_resolved_upload_embedding = bool(getattr(request, "_auto_resolved_upload_embedding", False)) + explicit_ref_audio = request.ref_audio is not None and not pre_resolved_upload_audio + explicit_speaker_embedding = request.speaker_embedding is not None and not pre_resolved_upload_embedding + ref_audio = request.ref_audio + ref_text = request.ref_text.strip() if request.ref_text and request.ref_text.strip() else None + speaker_embedding = request.speaker_embedding + x_vector_only_mode = request.x_vector_only_mode + uploaded_speaker_info = self.uploaded_speakers.get(voice_lookup) if voice_lookup is not None else None + resolved_upload_audio = pre_resolved_upload_audio + resolved_upload_embedding = pre_resolved_upload_embedding + + if uploaded_speaker_info is not None and not explicit_ref_audio and not explicit_speaker_embedding: + if ref_text is None: + stored_ref_text = uploaded_speaker_info.get("ref_text") + ref_text = ( + stored_ref_text.strip() if isinstance(stored_ref_text, str) and stored_ref_text.strip() else None + ) + if ref_audio is None and uploaded_speaker_info.get("embedding_source") != "direct": + ref_audio = self._get_uploaded_audio_data(voice_lookup) + resolved_upload_audio = ref_audio is not None + if speaker_embedding is None and uploaded_speaker_info.get("embedding_source") == "direct": + speaker_embedding = self._get_uploaded_speaker_embedding(voice_lookup) + resolved_upload_embedding = speaker_embedding is not None + + if ( + self._tts_model_type in (None, "qwen3_tts") + and task_type is None + and (ref_audio is not None or ref_text is not None or resolved_upload_embedding) + ): + task_type = "Base" + + if speaker_embedding is not None: + x_vector_only_mode = True + + return SpeechRequestNormalized( + input_text=request.input, + voice=voice, + voice_lookup=voice_lookup, + task_type=task_type, + language=request.language, + instructions=request.instructions, + ref_audio=ref_audio, + ref_text=ref_text, + speaker_embedding=speaker_embedding, + x_vector_only_mode=x_vector_only_mode, + uploaded_speaker_info=uploaded_speaker_info, + resolved_upload_audio=resolved_upload_audio, + resolved_upload_embedding=resolved_upload_embedding, + ) + + @staticmethod + def _apply_normalized_speech_request( + request: OpenAICreateSpeechRequest, + normalized: SpeechRequestNormalized, + ) -> None: + """Mutate request so downstream code sees canonicalized fields.""" + request.task_type = normalized.task_type + request.ref_audio = normalized.ref_audio + request.ref_text = normalized.ref_text + request.speaker_embedding = normalized.speaker_embedding + request.x_vector_only_mode = normalized.x_vector_only_mode + object.__setattr__(request, "_auto_resolved_upload_audio", normalized.resolved_upload_audio) + object.__setattr__(request, "_auto_resolved_upload_embedding", normalized.resolved_upload_embedding) + async def upload_voice( self, audio_file: UploadFile, @@ -807,6 +897,9 @@ def _validate_ref_audio_format(self, ref_audio: str) -> str | None: def _validate_voxtral_tts_request(self, request: OpenAICreateSpeechRequest) -> str | None: """Validate Voxtral TTS request parameters. Returns error message or None.""" + normalized = self._normalize_speech_request(request) + self._apply_normalized_speech_request(request, normalized) + if not request.input or not request.input.strip(): return "Input text cannot be empty" @@ -819,9 +912,8 @@ def _validate_voxtral_tts_request(self, request: OpenAICreateSpeechRequest) -> s if fmt_err: return fmt_err - if request.voice is not None: - request.voice = request.voice.lower() - if self.supported_speakers and request.voice not in self.supported_speakers: + if normalized.voice_lookup is not None: + if self.supported_speakers and normalized.voice_lookup not in self.supported_speakers: return f"Invalid speaker '{request.voice}'. Supported: {', '.join(sorted(self.supported_speakers))}" if request.max_new_tokens is not None: @@ -834,15 +926,10 @@ def _validate_voxtral_tts_request(self, request: OpenAICreateSpeechRequest) -> s def _validate_qwen_tts_request(self, request: OpenAICreateSpeechRequest) -> str | None: """Validate Qwen TTS request parameters. Returns error message or None.""" - # Infer Base task when ref_audio or ref_text is provided without explicit task_type. - if request.task_type is None and (request.ref_audio is not None or request.ref_text is not None): - request.task_type = "Base" + normalized = self._normalize_speech_request(request) + self._apply_normalized_speech_request(request, normalized) task_type = request.task_type or "CustomVoice" - # Normalize voice to lowercase for case-insensitive matching - if request.voice is not None: - request.voice = request.voice.lower() - # Validate input is not empty if not request.input or not request.input.strip(): return "Input text cannot be empty" @@ -859,7 +946,7 @@ def _validate_qwen_tts_request(self, request: OpenAICreateSpeechRequest) -> str "Use task_type='Base' with ref_audio/ref_text for voice cloning, " "or use a CustomVoice model." ) - if request.voice is not None and request.voice not in self.supported_speakers: + if normalized.voice_lookup is not None and normalized.voice_lookup not in self.supported_speakers: return f"Invalid voice '{request.voice}'. Supported: {', '.join(sorted(self.supported_speakers))}" # Validate speaker_embedding constraints @@ -903,7 +990,7 @@ def _validate_qwen_tts_request(self, request: OpenAICreateSpeechRequest) -> str # Handle the case where request.voice is NOT None pass # voice is not None - voice_lower = request.voice.lower() + voice_lower = normalized.voice_lookup if voice_lower in self.uploaded_speakers: # Check if data file exists for uploaded speaker speaker_info = self.uploaded_speakers[voice_lower] @@ -963,26 +1050,14 @@ def _validate_fish_tts_request(self, request: OpenAICreateSpeechRequest) -> str Side effect: if request.voice references an uploaded speaker, resolves it to request.ref_audio and request.ref_text for voice cloning. """ + normalized = self._normalize_speech_request(request) + self._apply_normalized_speech_request(request, normalized) + if not request.input or not request.input.strip(): return "Input text cannot be empty" - # Support uploaded voices: auto-resolve voice → ref_audio + ref_text. - if request.voice is not None and request.ref_audio is None: - voice_lower = request.voice.lower() - if voice_lower in self.uploaded_speakers: - speaker_info = self.uploaded_speakers[voice_lower] - file_path = Path(speaker_info["file_path"]) - if not file_path.exists(): - return f"Audio file for uploaded voice '{request.voice}' not found on disk" - audio_data_url = self._get_uploaded_audio_data(voice_lower) - if audio_data_url is None: - return f"Could not load audio for uploaded voice '{request.voice}'" - request.ref_audio = audio_data_url - # Use ref_text from upload metadata if not provided in request. - if not request.ref_text or not request.ref_text.strip(): - upload_ref_text = speaker_info.get("ref_text") - if upload_ref_text and upload_ref_text.strip(): - request.ref_text = upload_ref_text + if normalized.uploaded_speaker_info is not None and request.ref_audio is None and normalized.ref_audio is None: + return f"Could not load audio for uploaded voice '{request.voice}'" if request.ref_audio is not None: fmt_err = self._validate_ref_audio_format(request.ref_audio) @@ -1001,6 +1076,9 @@ def _validate_fish_tts_request(self, request: OpenAICreateSpeechRequest) -> str def _validate_cosyvoice3_request(self, request: OpenAICreateSpeechRequest) -> str | None: """Validate CosyVoice3 request parameters. Returns error message or None.""" + normalized = self._normalize_speech_request(request) + self._apply_normalized_speech_request(request, normalized) + if not request.input or not request.input.strip(): return "Input text cannot be empty" @@ -1169,82 +1247,81 @@ def _build_tts_params(self, request: OpenAICreateSpeechRequest) -> dict[str, Any Processes each parameter if present, skips if not. Values are wrapped in lists as required by the model. """ + normalized = self._normalize_speech_request(request) params: dict[str, Any] = {} # Text content (always required) - params["text"] = [request.input] + params["text"] = [normalized.input_text] # Task type - if request.task_type is not None: - params["task_type"] = [request.task_type] + if normalized.task_type is not None: + params["task_type"] = [normalized.task_type] else: params["task_type"] = ["CustomVoice"] # Language - if request.language is not None: - params["language"] = [request.language] + if normalized.language is not None: + params["language"] = [normalized.language] else: params["language"] = ["Auto"] # Speaker (voice) - if request.voice is not None: - params["speaker"] = [request.voice] + if normalized.voice is not None: + speaker_value = ( + normalized.voice_lookup if normalized.uploaded_speaker_info is not None else normalized.voice + ) + params["speaker"] = [speaker_value] # Uploaded voices use task_type="Base" (CustomVoice requires built-in spk_id). # If ref_text was provided at upload time, use in-context cloning; otherwise x_vector only. - if request.voice.lower() in self.uploaded_speakers and request.ref_audio is None: - speaker_info = self.uploaded_speakers[request.voice.lower()] - - # Check if this voice was uploaded with a pre-computed embedding. - # Populate request.speaker_embedding so the existing code path - # (below) handles voice_clone_prompt and x_vector_only_mode. - embedding = self._get_uploaded_speaker_embedding(request.voice) - if embedding is not None: - request.speaker_embedding = embedding + if normalized.resolved_upload_audio or normalized.resolved_upload_embedding: + speaker_info = normalized.uploaded_speaker_info + + if normalized.speaker_embedding is not None: params["task_type"] = ["Base"] - logger.info("Auto-set speaker_embedding for uploaded voice: %s", request.voice) - else: - audio_data = self._get_uploaded_audio_data(request.voice) - if not audio_data: - raise ValueError(f"Audio file for uploaded voice '{request.voice}' is missing or corrupted") - stored_ref_text = speaker_info.get("ref_text") - params["ref_audio"] = [audio_data] + logger.info("Auto-set speaker_embedding for uploaded voice: %s", normalized.voice) + elif normalized.ref_audio is not None: + params["ref_audio"] = [normalized.ref_audio] params["task_type"] = ["Base"] params["voice_created_at"] = [speaker_info.get("created_at", 0)] - if stored_ref_text: - params["ref_text"] = [stored_ref_text] + if normalized.ref_text: + params["ref_text"] = [normalized.ref_text] params["x_vector_only_mode"] = [False] else: params["x_vector_only_mode"] = [True] logger.info( - "Auto-set ref_audio for uploaded voice: %s (icl=%s)", request.voice, bool(stored_ref_text) + "Auto-set ref_audio for uploaded voice: %s (icl=%s)", + normalized.voice, + bool(normalized.ref_text), ) + else: + raise ValueError(f"Audio file for uploaded voice '{normalized.voice}' is missing or corrupted") elif params["task_type"][0] == "CustomVoice": params["speaker"] = ["Vivian"] # Default for CustomVoice # Instructions for style/emotion control - if request.instructions is not None: - params["instruct"] = [request.instructions] + if normalized.instructions is not None: + params["instruct"] = [normalized.instructions] else: params["instruct"] = [""] # Voice clone: ref_audio resolved in create_speech(), not here. - if request.ref_text is not None: - params["ref_text"] = [request.ref_text] - if request.speaker_embedding is not None: + if normalized.ref_text is not None: + params["ref_text"] = [normalized.ref_text] + if normalized.speaker_embedding is not None: # Store as plain float list (not tensor) so it survives msgspec # serialization through the EngineCore IPC boundary. The talker's # _build_prompt_embeds converts it back to a tensor on the GPU. params["voice_clone_prompt"] = [ { - "ref_spk_embedding": list(request.speaker_embedding), + "ref_spk_embedding": list(normalized.speaker_embedding), } ] # speaker_embedding implies x_vector_only_mode params["x_vector_only_mode"] = [True] - elif request.x_vector_only_mode is not None: - params["x_vector_only_mode"] = [request.x_vector_only_mode] + elif normalized.x_vector_only_mode is not None: + params["x_vector_only_mode"] = [normalized.x_vector_only_mode] # Generation parameters if request.max_new_tokens is not None: @@ -1398,6 +1475,9 @@ async def _prepare_speech_generation( if self.engine_client.errored: raise self.engine_client.dead_error + normalized = self._normalize_speech_request(request) + self._apply_normalized_speech_request(request, normalized) + if self._is_fish_speech: validation_error = self._validate_fish_tts_request(request) if validation_error: @@ -1413,25 +1493,17 @@ async def _prepare_speech_generation( raise ValueError("Input text cannot be empty") tts_params = {} prompt: dict[str, Any] = {"input": request.input} - # Resolve ref_audio: explicit request param or uploaded voice - ref_src = request.ref_audio - if not ref_src and request.voice: - vl = request.voice.lower() - if vl in self.uploaded_speakers: - sp = self.uploaded_speakers[vl] - if sp.get("embedding_source") == "audio": - ref_src = self._get_uploaded_audio_data(request.voice) - if not ref_src: - raise ValueError(f"Audio for voice '{request.voice}' missing") - prompt["ref_text"] = sp.get("ref_text") + ref_src = normalized.ref_audio + if normalized.ref_text: + prompt["ref_text"] = normalized.ref_text + if normalized.uploaded_speaker_info is not None and request.ref_audio is None and ref_src is None: + raise ValueError(f"Audio for voice '{request.voice}' missing") if ref_src: fmt_err = self._validate_ref_audio_format(ref_src) if fmt_err: raise ValueError(fmt_err) wav, sr = await self._resolve_ref_audio(ref_src) prompt["ref_audio"] = (np.asarray(wav, dtype=np.float32), sr) - if request.ref_text: - prompt["ref_text"] = request.ref_text if request.language: prompt["lang"] = request.language if request.instructions: From 032346f8255c92ee28e1ddba98d3826b77d0bf90 Mon Sep 17 00:00:00 2001 From: reidliu41 Date: Fri, 17 Apr 2026 10:09:19 +0800 Subject: [PATCH 2/2] fix: address speech normalization review feedback Reuse the normalized speech request through validation and TTS parameter building to avoid repeated uploaded-voice resolution. Preserve canonical lowercase speaker IDs, restore the specific Fish Speech missing-uploaded-audio error, and keep the latest VoxCPM TTS parameter path compatible with the normalized builder. Signed-off-by: reidliu41 --- .../openai_api/test_serving_speech.py | 23 ++- .../entrypoints/openai/serving_speech.py | 167 +++++++++++------- 2 files changed, 122 insertions(+), 68 deletions(-) diff --git a/tests/entrypoints/openai_api/test_serving_speech.py b/tests/entrypoints/openai_api/test_serving_speech.py index 2e2c96085f..5be4cbb704 100644 --- a/tests/entrypoints/openai_api/test_serving_speech.py +++ b/tests/entrypoints/openai_api/test_serving_speech.py @@ -954,7 +954,7 @@ def test_build_tts_params(self, speech_server): params = speech_server._build_tts_params(req) assert params["text"] == ["Hello"] - assert params["speaker"] == ["Ryan"] + assert params["speaker"] == ["ryan"] assert params["language"] == ["English"] assert params["task_type"] == ["CustomVoice"] @@ -2026,6 +2026,24 @@ def test_validate_tts_request_allows_fish_text_only_batch_items(self, fish_speec assert fish_speech_server._tts_model_type == "fish_tts" assert fish_speech_server._validate_tts_request(OpenAICreateSpeechRequest(input="hello fish")) is None + def test_validate_tts_request_reports_missing_uploaded_fish_audio_file(self, fish_speech_server, mocker): + fish_speech_server.uploaded_speakers = { + "missing_voice": { + "name": "missing_voice", + "file_path": "/tmp/voice_samples/missing_voice.wav", + "mime_type": "audio/wav", + "embedding_source": "audio", + "ref_text": "reference text", + } + } + mocker.patch("pathlib.Path.exists", return_value=False) + + error = fish_speech_server._validate_tts_request( + OpenAICreateSpeechRequest(input="hello fish", voice="missing_voice") + ) + + assert error == "Audio file for uploaded voice 'missing_voice' not found on disk" + def test_prepare_speech_generation_rejects_invalid_fish_max_new_tokens(self, fish_speech_server): with pytest.raises(ValueError, match="max_new_tokens cannot exceed"): asyncio.run( @@ -2354,7 +2372,7 @@ def test_prepare_speech_generation_uploaded_audio_voice_preserves_auto_clone_par } } mocker.patch("pathlib.Path.exists", return_value=True) - mocker.patch.object( + mock_get_audio = mocker.patch.object( qwen3_tts_server, "_get_uploaded_audio_data", return_value="data:audio/wav;base64,ZmFrZQ==", @@ -2369,6 +2387,7 @@ def test_prepare_speech_generation_uploaded_audio_voice_preserves_auto_clone_par assert tts_params["x_vector_only_mode"] == [True] assert tts_params["voice_created_at"] == [1711234567.89] assert tts_params["ref_audio"] == [[[0.1, 0.2, 0.3], 24000]] + mock_get_audio.assert_called_once_with("custom_voice") def test_shutdown_is_idempotent(self, mocker: MockerFixture): """Calling shutdown() twice should not raise.""" diff --git a/vllm_omni/entrypoints/openai/serving_speech.py b/vllm_omni/entrypoints/openai/serving_speech.py index c4fcc084dd..bb79b40915 100644 --- a/vllm_omni/entrypoints/openai/serving_speech.py +++ b/vllm_omni/entrypoints/openai/serving_speech.py @@ -537,17 +537,15 @@ def _normalize_speech_request(self, request: OpenAICreateSpeechRequest) -> Speec voice = request.voice voice_lookup = request.voice.lower() if request.voice is not None else None task_type = request.task_type - pre_resolved_upload_audio = bool(getattr(request, "_auto_resolved_upload_audio", False)) - pre_resolved_upload_embedding = bool(getattr(request, "_auto_resolved_upload_embedding", False)) - explicit_ref_audio = request.ref_audio is not None and not pre_resolved_upload_audio - explicit_speaker_embedding = request.speaker_embedding is not None and not pre_resolved_upload_embedding + explicit_ref_audio = request.ref_audio is not None + explicit_speaker_embedding = request.speaker_embedding is not None ref_audio = request.ref_audio ref_text = request.ref_text.strip() if request.ref_text and request.ref_text.strip() else None speaker_embedding = request.speaker_embedding x_vector_only_mode = request.x_vector_only_mode uploaded_speaker_info = self.uploaded_speakers.get(voice_lookup) if voice_lookup is not None else None - resolved_upload_audio = pre_resolved_upload_audio - resolved_upload_embedding = pre_resolved_upload_embedding + resolved_upload_audio = False + resolved_upload_embedding = False if uploaded_speaker_info is not None and not explicit_ref_audio and not explicit_speaker_embedding: if ref_text is None: @@ -592,15 +590,16 @@ def _normalize_speech_request(self, request: OpenAICreateSpeechRequest) -> Speec def _apply_normalized_speech_request( request: OpenAICreateSpeechRequest, normalized: SpeechRequestNormalized, + *, + apply_clone_inputs: bool = True, ) -> None: """Mutate request so downstream code sees canonicalized fields.""" request.task_type = normalized.task_type - request.ref_audio = normalized.ref_audio request.ref_text = normalized.ref_text - request.speaker_embedding = normalized.speaker_embedding request.x_vector_only_mode = normalized.x_vector_only_mode - object.__setattr__(request, "_auto_resolved_upload_audio", normalized.resolved_upload_audio) - object.__setattr__(request, "_auto_resolved_upload_embedding", normalized.resolved_upload_embedding) + if apply_clone_inputs: + request.ref_audio = normalized.ref_audio + request.speaker_embedding = normalized.speaker_embedding async def upload_voice( self, @@ -873,17 +872,21 @@ def _is_tts_model(self) -> bool: """Check if the current model is a supported TTS model.""" return any(stage.engine_args.model_stage in _TTS_MODEL_STAGES for stage in self.engine_client.stage_configs) - def _validate_tts_request(self, request: OpenAICreateSpeechRequest) -> str | None: + def _validate_tts_request( + self, + request: OpenAICreateSpeechRequest, + normalized: SpeechRequestNormalized | None = None, + ) -> str | None: """Validate TTS request parameters. Returns error message or None.""" if self._tts_model_type == "voxtral_tts": - return self._validate_voxtral_tts_request(request) + return self._validate_voxtral_tts_request(request, normalized) if self._tts_model_type == "fish_tts": - return self._validate_fish_tts_request(request) + return self._validate_fish_tts_request(request, normalized) if self._tts_model_type == "cosyvoice3": - return self._validate_cosyvoice3_request(request) + return self._validate_cosyvoice3_request(request, normalized) if self._tts_model_type == "voxcpm2": return None # VoxCPM2 accepts any text input - return self._validate_qwen_tts_request(request) + return self._validate_qwen_tts_request(request, normalized) def _validate_ref_audio_format(self, ref_audio: str) -> str | None: """Validate ref_audio is a supported URI format. Returns error or None.""" @@ -895,20 +898,25 @@ def _validate_ref_audio_format(self, ref_audio: str) -> str | None: return "ref_audio must be a URL (http/https), base64 data URL (data:...), or file URI (file://...)" return None - def _validate_voxtral_tts_request(self, request: OpenAICreateSpeechRequest) -> str | None: + def _validate_voxtral_tts_request( + self, + request: OpenAICreateSpeechRequest, + normalized: SpeechRequestNormalized | None = None, + ) -> str | None: """Validate Voxtral TTS request parameters. Returns error message or None.""" - normalized = self._normalize_speech_request(request) - self._apply_normalized_speech_request(request, normalized) + if normalized is None: + normalized = self._normalize_speech_request(request) + self._apply_normalized_speech_request(request, normalized, apply_clone_inputs=False) if not request.input or not request.input.strip(): return "Input text cannot be empty" # Voxtral TTS requires either a preset voice or ref_audio for voice cloning. - if request.voice is None and request.ref_audio is None: + if request.voice is None and normalized.ref_audio is None: return "Either 'voice' (preset speaker) or 'ref_audio' (voice cloning) must be provided" - if request.ref_audio is not None: - fmt_err = self._validate_ref_audio_format(request.ref_audio) + if normalized.ref_audio is not None: + fmt_err = self._validate_ref_audio_format(normalized.ref_audio) if fmt_err: return fmt_err @@ -924,11 +932,16 @@ def _validate_voxtral_tts_request(self, request: OpenAICreateSpeechRequest) -> s return None - def _validate_qwen_tts_request(self, request: OpenAICreateSpeechRequest) -> str | None: + def _validate_qwen_tts_request( + self, + request: OpenAICreateSpeechRequest, + normalized: SpeechRequestNormalized | None = None, + ) -> str | None: """Validate Qwen TTS request parameters. Returns error message or None.""" - normalized = self._normalize_speech_request(request) - self._apply_normalized_speech_request(request, normalized) - task_type = request.task_type or "CustomVoice" + if normalized is None: + normalized = self._normalize_speech_request(request) + self._apply_normalized_speech_request(request, normalized, apply_clone_inputs=False) + task_type = normalized.task_type or "CustomVoice" # Validate input is not empty if not request.input or not request.input.strip(): @@ -950,15 +963,15 @@ def _validate_qwen_tts_request(self, request: OpenAICreateSpeechRequest) -> str return f"Invalid voice '{request.voice}'. Supported: {', '.join(sorted(self.supported_speakers))}" # Validate speaker_embedding constraints - if request.speaker_embedding is not None: + if normalized.speaker_embedding is not None: if task_type != "Base": return "'speaker_embedding' is only valid for Base task" - if not request.speaker_embedding: + if not normalized.speaker_embedding: return "'speaker_embedding' must be a non-empty list of floats" # speaker_embedding implies x_vector_only_mode — set it before # Base task validation so callers don't need to pass it explicitly. request.x_vector_only_mode = True - emb_len = len(request.speaker_embedding) + emb_len = len(normalized.speaker_embedding) # ECAPA-TDNN produces 1024-dim (0.6B) or 2048-dim (1.7B) expected_dims = {1024, 2048} if emb_len not in expected_dims: @@ -972,16 +985,16 @@ def _validate_qwen_tts_request(self, request: OpenAICreateSpeechRequest) -> str if task_type == "Base": if request.voice is None: # 1. Ensure a voice source is provided - if request.ref_audio is None and getattr(request, "speaker_embedding", None) is None: + if normalized.ref_audio is None and normalized.speaker_embedding is None: return "Base task requires 'ref_audio' or 'speaker_embedding' for voice cloning" # 2. Validate ref_audio format if it exists (using the helper from main) - if request.ref_audio is not None: - fmt_err = self._validate_ref_audio_format(request.ref_audio) + if normalized.ref_audio is not None: + fmt_err = self._validate_ref_audio_format(normalized.ref_audio) if fmt_err: return fmt_err # 3. Validate text requirements based on the mode - if not getattr(request, "x_vector_only_mode", False): - if not request.ref_text or not request.ref_text.strip(): + if not normalized.x_vector_only_mode: + if not normalized.ref_text: return ( "Base task requires non-empty 'ref_text' (transcript of " "the reference audio) unless 'x_vector_only_mode' is enabled" @@ -1005,16 +1018,14 @@ def _validate_qwen_tts_request(self, request: OpenAICreateSpeechRequest) -> str return f"Speaker embedding for '{request.voice}' is not yet ready (cache_status='{status}')" else: # need ref_audio for built-in speaker - if request.ref_audio is None: + if normalized.ref_audio is None: return ( f"Base task with built-in speaker '{request.voice}' requires 'ref_audio' for voice cloning" ) - fmt_err = self._validate_ref_audio_format(request.ref_audio) + fmt_err = self._validate_ref_audio_format(normalized.ref_audio) if fmt_err: return fmt_err - if not getattr(request, "x_vector_only_mode", False) and ( - not request.ref_text or not request.ref_text.strip() - ): + if not normalized.x_vector_only_mode and not normalized.ref_text: return ( "Base task requires non-empty 'ref_text' (transcript of " "the reference audio) unless 'x_vector_only_mode' is enabled" @@ -1022,9 +1033,9 @@ def _validate_qwen_tts_request(self, request: OpenAICreateSpeechRequest) -> str # Validate cross-parameter dependencies if task_type != "Base": - if request.ref_text is not None: + if normalized.ref_text is not None: return "'ref_text' is only valid for Base task" - if request.x_vector_only_mode is not None: + if normalized.x_vector_only_mode is not None: return "'x_vector_only_mode' is only valid for Base task" # Validate VoiceDesign task requirements @@ -1044,26 +1055,30 @@ def _validate_qwen_tts_request(self, request: OpenAICreateSpeechRequest) -> str return None - def _validate_fish_tts_request(self, request: OpenAICreateSpeechRequest) -> str | None: - """Validate Fish Speech request parameters. Returns error message or None. - - Side effect: if request.voice references an uploaded speaker, resolves - it to request.ref_audio and request.ref_text for voice cloning. - """ - normalized = self._normalize_speech_request(request) - self._apply_normalized_speech_request(request, normalized) + def _validate_fish_tts_request( + self, + request: OpenAICreateSpeechRequest, + normalized: SpeechRequestNormalized | None = None, + ) -> str | None: + """Validate Fish Speech request parameters. Returns error message or None.""" + if normalized is None: + normalized = self._normalize_speech_request(request) + self._apply_normalized_speech_request(request, normalized, apply_clone_inputs=False) if not request.input or not request.input.strip(): return "Input text cannot be empty" - if normalized.uploaded_speaker_info is not None and request.ref_audio is None and normalized.ref_audio is None: + if normalized.uploaded_speaker_info is not None and normalized.ref_audio is None: + file_path = normalized.uploaded_speaker_info.get("file_path") + if file_path and not Path(file_path).exists(): + return f"Audio file for uploaded voice '{request.voice}' not found on disk" return f"Could not load audio for uploaded voice '{request.voice}'" - if request.ref_audio is not None: - fmt_err = self._validate_ref_audio_format(request.ref_audio) + if normalized.ref_audio is not None: + fmt_err = self._validate_ref_audio_format(normalized.ref_audio) if fmt_err: return fmt_err - if not request.ref_text or not request.ref_text.strip(): + if not normalized.ref_text: return "Voice cloning requires 'ref_text' (transcript of the reference audio)" if request.max_new_tokens is not None: @@ -1074,23 +1089,28 @@ def _validate_fish_tts_request(self, request: OpenAICreateSpeechRequest) -> str return None - def _validate_cosyvoice3_request(self, request: OpenAICreateSpeechRequest) -> str | None: + def _validate_cosyvoice3_request( + self, + request: OpenAICreateSpeechRequest, + normalized: SpeechRequestNormalized | None = None, + ) -> str | None: """Validate CosyVoice3 request parameters. Returns error message or None.""" - normalized = self._normalize_speech_request(request) - self._apply_normalized_speech_request(request, normalized) + if normalized is None: + normalized = self._normalize_speech_request(request) + self._apply_normalized_speech_request(request, normalized, apply_clone_inputs=False) if not request.input or not request.input.strip(): return "Input text cannot be empty" # CosyVoice3 requires reference audio for voice cloning - if request.ref_audio is None: + if normalized.ref_audio is None: return "CosyVoice3 requires 'ref_audio' (reference audio for voice cloning)" - fmt_err = self._validate_ref_audio_format(request.ref_audio) + fmt_err = self._validate_ref_audio_format(normalized.ref_audio) if fmt_err: return fmt_err - if not request.ref_text or not request.ref_text.strip(): + if not normalized.ref_text: return "CosyVoice3 requires 'ref_text' (transcript of the reference audio)" if request.max_new_tokens is not None: @@ -1241,13 +1261,30 @@ def _extract_audio_output(res) -> tuple[dict | None, str | None]: key = "audio" if "audio" in mm else ("model_outputs" if "model_outputs" in mm else None) return mm, key - def _build_tts_params(self, request: OpenAICreateSpeechRequest) -> dict[str, Any]: + def _build_tts_params( + self, + request: OpenAICreateSpeechRequest, + normalized: SpeechRequestNormalized | None = None, + ) -> dict[str, Any]: """Build TTS parameters from request. Processes each parameter if present, skips if not. Values are wrapped in lists as required by the model. """ - normalized = self._normalize_speech_request(request) + if self._tts_model_type == "voxcpm": + params: dict[str, Any] = { + "text": [request.input], + "cfg_value": [2.0], + "inference_timesteps": [10], + "min_len": [2], + "max_new_tokens": [request.max_new_tokens or 4096], + } + if request.ref_text is not None: + params["ref_text"] = [request.ref_text] + return params + + if normalized is None: + normalized = self._normalize_speech_request(request) params: dict[str, Any] = {} # Text content (always required) @@ -1267,9 +1304,7 @@ def _build_tts_params(self, request: OpenAICreateSpeechRequest) -> dict[str, Any # Speaker (voice) if normalized.voice is not None: - speaker_value = ( - normalized.voice_lookup if normalized.uploaded_speaker_info is not None else normalized.voice - ) + speaker_value = normalized.voice_lookup if normalized.voice_lookup is not None else normalized.voice params["speaker"] = [speaker_value] # Uploaded voices use task_type="Base" (CustomVoice requires built-in spk_id). @@ -1479,7 +1514,7 @@ async def _prepare_speech_generation( self._apply_normalized_speech_request(request, normalized) if self._is_fish_speech: - validation_error = self._validate_fish_tts_request(request) + validation_error = self._validate_fish_tts_request(request, normalized) if validation_error: raise ValueError(validation_error) ref_audio_data = None @@ -1518,7 +1553,7 @@ async def _prepare_speech_generation( if additional: prompt["additional_information"] = additional elif self._is_tts: - validation_error = self._validate_tts_request(request) + validation_error = self._validate_tts_request(request, normalized) if validation_error: raise ValueError(validation_error) @@ -1529,7 +1564,7 @@ async def _prepare_speech_generation( prompt = await self._build_cosyvoice3_prompt(request) tts_params = {} else: - tts_params = self._build_tts_params(request) + tts_params = self._build_tts_params(request, normalized) # Resolve ref_audio (explicit or auto-set for uploaded voices) # to [[wav_list, sr]] so the model doesn't re-decode base64. ref_audio_source = request.ref_audio