vllm-project · hsliuustc0106 · Mar 4, 2026 · Mar 3, 2026 · Mar 3, 2026 · Mar 4, 2026
@@ -349,13 +349,13 @@ def test_validate_tts_request_basic(self, speech_server):
         req = OpenAICreateSpeechRequest(input="Hello", language="InvalidLang")
         assert "Invalid language" in speech_server._validate_tts_request(req)
 
-        # When no speakers loaded, any voice is accepted (unconstrained)
+        # CustomVoice on model with no speakers -> rejected
         req = OpenAICreateSpeechRequest(input="Hello", voice="Invalid")
-        assert speech_server._validate_tts_request(req) is None
+        assert "does not support CustomVoice" in speech_server._validate_tts_request(req)
 
-        # Valid request
-        req = OpenAICreateSpeechRequest(input="Hello", voice="Vivian")
-        assert speech_server._validate_tts_request(req) is None
+        # CustomVoice without voice on model with no speakers -> also rejected
+        req = OpenAICreateSpeechRequest(input="Hello")
+        assert "does not support CustomVoice" in speech_server._validate_tts_request(req)
 
     def test_validate_tts_request_task_types(self, speech_server):
         """Test task-specific validation."""
@@ -367,9 +367,43 @@ def test_validate_tts_request_task_types(self, speech_server):
         req = OpenAICreateSpeechRequest(input="Hello", task_type="VoiceDesign")
         assert "instructions" in speech_server._validate_tts_request(req)
 
-        # ref_text only for Base
+        # ref_text without task_type auto-infers Base, then fails on missing ref_audio
         req = OpenAICreateSpeechRequest(input="Hello", ref_text="text")
-        assert "Base task" in speech_server._validate_tts_request(req)
+        assert "ref_audio" in speech_server._validate_tts_request(req)
+
+    def test_validate_tts_request_auto_infer_base(self, speech_server):
+        """Omitting task_type while sending ref_audio or ref_text must resolve the request to Base."""
+        # Only ref_audio supplied: the validator sets task_type to Base and reports the missing transcript.
+        audio_only = OpenAICreateSpeechRequest(input="Hello", ref_audio="data:audio/wav;base64,abc")
+        audio_only_error = speech_server._validate_tts_request(audio_only)
+        assert audio_only.task_type == "Base"
+        assert "ref_text" in audio_only_error
+
+        # Only ref_text supplied: Base is inferred again; the returned error is expected to mention ref_audio.
+        text_only = OpenAICreateSpeechRequest(input="Hello", ref_text="some text")
+        text_only_error = speech_server._validate_tts_request(text_only)
+        assert text_only.task_type == "Base"
+        assert "ref_audio" in text_only_error
+
+    def test_validate_tts_request_base_empty_ref_text(self, speech_server):
+        """A Base request with an empty ref_text fails validation unless x_vector_only_mode is enabled."""
+        # Both requests share these fields; they differ only in x_vector_only_mode.
+        base_fields = {"input": "Hello", "task_type": "Base", "ref_audio": "data:audio/wav;base64,abc", "ref_text": ""}
+
+        # Default (in-context cloning) path: the empty transcript yields the "non-empty 'ref_text'" error.
+        in_context = OpenAICreateSpeechRequest(**base_fields)
+        assert "non-empty 'ref_text'" in speech_server._validate_tts_request(in_context)
+
+        # With x_vector_only_mode enabled the ref_text check is skipped and no error is returned.
+        x_vector_only = OpenAICreateSpeechRequest(**base_fields, x_vector_only_mode=True)
+        outcome = speech_server._validate_tts_request(x_vector_only)
+        assert outcome is None
+
+    def test_validate_tts_request_customvoice_no_speakers(self, speech_server):
+        """An explicit CustomVoice request is rejected by validation when no speakers are configured."""
+        # Assumes the speech_server fixture has no supported speakers configured.
+        explicit_custom_voice = OpenAICreateSpeechRequest(input="Hello", task_type="CustomVoice")
+        assert "does not support CustomVoice" in speech_server._validate_tts_request(explicit_custom_voice)
 
     def test_build_tts_params(self, speech_server):
         """Test TTS parameter building."""

@@ -235,6 +235,9 @@ def _is_tts_model(self) -> bool:
 
     def _validate_tts_request(self, request: OpenAICreateSpeechRequest) -> str | None:
         """Validate TTS request parameters. Returns error message or None."""
+        # Infer Base task when ref_audio or ref_text is provided without explicit task_type.
+        if request.task_type is None and (request.ref_audio is not None or request.ref_text is not None):
+            request.task_type = "Base"
         task_type = request.task_type or "CustomVoice"
 
         # Normalize voice to lowercase for case-insensitive matching
@@ -250,8 +253,14 @@ def _validate_tts_request(self, request: OpenAICreateSpeechRequest) -> str | Non
             return f"Invalid language '{request.language}'. Supported: {', '.join(sorted(_TTS_LANGUAGES))}"
 
         # Validate speaker for CustomVoice task
-        if task_type == "CustomVoice" and request.voice is not None:
-            if self.supported_speakers and request.voice not in self.supported_speakers:
+        if task_type == "CustomVoice":
+            if not self.supported_speakers:
+                return (
+                    "This model does not support CustomVoice task (no speakers configured). "
+                    "Use task_type='Base' with ref_audio/ref_text for voice cloning, "
+                    "or use a CustomVoice model."
+                )
+            if request.voice is not None and request.voice not in self.supported_speakers:
                 return f"Invalid speaker '{request.voice}'. Supported: {', '.join(sorted(self.supported_speakers))}"
 
         # Validate Base task requirements
@@ -261,6 +270,14 @@ def _validate_tts_request(self, request: OpenAICreateSpeechRequest) -> str | Non
             # Validate ref_audio format
             if not (request.ref_audio.startswith(("http://", "https://")) or request.ref_audio.startswith("data:")):
                 return "ref_audio must be a URL (http/https) or base64 data URL (data:...)"
+            # In-context voice cloning (default) requires non-empty ref_text.
+            # x_vector_only_mode skips in-context and only uses speaker embedding.
+            if not request.x_vector_only_mode:
+                if not request.ref_text or not request.ref_text.strip():
+                    return (
+                        "Base task requires non-empty 'ref_text' (transcript of "
+                        "the reference audio) unless 'x_vector_only_mode' is enabled"
+                    )
 
         # Validate cross-parameter dependencies
         if task_type != "Base":