From c0ed56288ef8511b94e4621c3343336e30ec2aaa Mon Sep 17 00:00:00 2001 From: linyueqian Date: Tue, 3 Mar 2026 10:33:39 -0500 Subject: [PATCH 1/2] [Bugfix] Add TTS request validation to prevent engine crashes Signed-off-by: linyueqian --- .../entrypoints/openai/serving_speech.py | 21 +++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/vllm_omni/entrypoints/openai/serving_speech.py b/vllm_omni/entrypoints/openai/serving_speech.py index 196cba0dd16..986e06ace95 100644 --- a/vllm_omni/entrypoints/openai/serving_speech.py +++ b/vllm_omni/entrypoints/openai/serving_speech.py @@ -235,6 +235,9 @@ def _is_tts_model(self) -> bool: def _validate_tts_request(self, request: OpenAICreateSpeechRequest) -> str | None: """Validate TTS request parameters. Returns error message or None.""" + # Infer Base task when ref_audio or ref_text is provided without explicit task_type. + if request.task_type is None and (request.ref_audio is not None or request.ref_text is not None): + request.task_type = "Base" task_type = request.task_type or "CustomVoice" # Normalize voice to lowercase for case-insensitive matching @@ -250,8 +253,14 @@ def _validate_tts_request(self, request: OpenAICreateSpeechRequest) -> str | Non return f"Invalid language '{request.language}'. Supported: {', '.join(sorted(_TTS_LANGUAGES))}" # Validate speaker for CustomVoice task - if task_type == "CustomVoice" and request.voice is not None: - if self.supported_speakers and request.voice not in self.supported_speakers: + if task_type == "CustomVoice": + if not self.supported_speakers: + return ( + "This model does not support CustomVoice task (no speakers configured). " + "Use task_type='Base' with ref_audio/ref_text for voice cloning, " + "or use a CustomVoice model." + ) + if request.voice is not None and request.voice not in self.supported_speakers: return f"Invalid speaker '{request.voice}'. Supported: {', '.join(sorted(self.supported_speakers))}" # Validate Base task requirements @@ -261,6 +270,14 @@ def _validate_tts_request(self, request: OpenAICreateSpeechRequest) -> str | Non # Validate ref_audio format if not (request.ref_audio.startswith(("http://", "https://")) or request.ref_audio.startswith("data:")): return "ref_audio must be a URL (http/https) or base64 data URL (data:...)" + # In-context voice cloning (default) requires non-empty ref_text. + # x_vector_only_mode skips in-context and only uses speaker embedding. + if not request.x_vector_only_mode: + if not request.ref_text or not request.ref_text.strip(): + return ( + "Base task requires non-empty 'ref_text' (transcript of " + "the reference audio) unless 'x_vector_only_mode' is enabled" + ) # Validate cross-parameter dependencies if task_type != "Base": From a0a796fd610ab3605800da68eb6e57c13e936c8f Mon Sep 17 00:00:00 2001 From: linyueqian Date: Tue, 3 Mar 2026 19:09:43 -0500 Subject: [PATCH 2/2] Update and add unit tests for TTS request validation Signed-off-by: linyueqian --- .../openai_api/test_serving_speech.py | 48 ++++++++++++++++--- 1 file changed, 41 insertions(+), 7 deletions(-) diff --git a/tests/entrypoints/openai_api/test_serving_speech.py b/tests/entrypoints/openai_api/test_serving_speech.py index 7af4e078fb8..10dbd3a3b9d 100644 --- a/tests/entrypoints/openai_api/test_serving_speech.py +++ b/tests/entrypoints/openai_api/test_serving_speech.py @@ -349,13 +349,13 @@ def test_validate_tts_request_basic(self, speech_server): req = OpenAICreateSpeechRequest(input="Hello", language="InvalidLang") assert "Invalid language" in speech_server._validate_tts_request(req) - # When no speakers loaded, any voice is accepted (unconstrained) + # CustomVoice on model with no speakers -> rejected req = OpenAICreateSpeechRequest(input="Hello", voice="Invalid") - assert speech_server._validate_tts_request(req) is None + assert "does not support CustomVoice" in speech_server._validate_tts_request(req) - # Valid request - req = OpenAICreateSpeechRequest(input="Hello", voice="Vivian") - assert speech_server._validate_tts_request(req) is None + # CustomVoice without voice on model with no speakers -> also rejected + req = OpenAICreateSpeechRequest(input="Hello") + assert "does not support CustomVoice" in speech_server._validate_tts_request(req) def test_validate_tts_request_task_types(self, speech_server): """Test task-specific validation.""" @@ -367,9 +367,43 @@ def test_validate_tts_request_task_types(self, speech_server): req = OpenAICreateSpeechRequest(input="Hello", task_type="VoiceDesign") assert "instructions" in speech_server._validate_tts_request(req) - # ref_text only for Base + # ref_text without task_type auto-infers Base, then fails on missing ref_audio req = OpenAICreateSpeechRequest(input="Hello", ref_text="text") - assert "Base task" in speech_server._validate_tts_request(req) + assert "ref_audio" in speech_server._validate_tts_request(req) + + def test_validate_tts_request_auto_infer_base(self, speech_server): + """Test auto-inference of Base task when ref_audio/ref_text is provided.""" + # ref_audio without task_type -> infers Base, requires non-empty ref_text + req = OpenAICreateSpeechRequest(input="Hello", ref_audio="data:audio/wav;base64,abc") + result = speech_server._validate_tts_request(req) + assert "ref_text" in result + assert req.task_type == "Base" + + # ref_text without task_type -> infers Base, requires ref_audio + req = OpenAICreateSpeechRequest(input="Hello", ref_text="some text") + result = speech_server._validate_tts_request(req) + assert "ref_audio" in result + assert req.task_type == "Base" + + def test_validate_tts_request_base_empty_ref_text(self, speech_server): + """Empty ref_text on Base task returns 400 instead of crashing engine.""" + req = OpenAICreateSpeechRequest( + input="Hello", task_type="Base", ref_audio="data:audio/wav;base64,abc", ref_text="" + ) + result = speech_server._validate_tts_request(req) + assert "non-empty 'ref_text'" in result + + # x_vector_only_mode bypasses ref_text requirement + req = OpenAICreateSpeechRequest( + input="Hello", task_type="Base", ref_audio="data:audio/wav;base64,abc", ref_text="", x_vector_only_mode=True + ) + assert speech_server._validate_tts_request(req) is None + + def test_validate_tts_request_customvoice_no_speakers(self, speech_server): + """CustomVoice on a model with no speakers returns 400 instead of crashing engine.""" + req = OpenAICreateSpeechRequest(input="Hello", task_type="CustomVoice") + result = speech_server._validate_tts_request(req) + assert "does not support CustomVoice" in result def test_build_tts_params(self, speech_server): """Test TTS parameter building."""