Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 41 additions & 7 deletions tests/entrypoints/openai_api/test_serving_speech.py
Original file line number Diff line number Diff line change
Expand Up @@ -349,13 +349,13 @@ def test_validate_tts_request_basic(self, speech_server):
req = OpenAICreateSpeechRequest(input="Hello", language="InvalidLang")
assert "Invalid language" in speech_server._validate_tts_request(req)

# When no speakers loaded, any voice is accepted (unconstrained)
# CustomVoice on model with no speakers -> rejected
req = OpenAICreateSpeechRequest(input="Hello", voice="Invalid")
assert speech_server._validate_tts_request(req) is None
assert "does not support CustomVoice" in speech_server._validate_tts_request(req)

# Valid request
req = OpenAICreateSpeechRequest(input="Hello", voice="Vivian")
assert speech_server._validate_tts_request(req) is None
# CustomVoice without voice on model with no speakers -> also rejected
req = OpenAICreateSpeechRequest(input="Hello")
assert "does not support CustomVoice" in speech_server._validate_tts_request(req)

def test_validate_tts_request_task_types(self, speech_server):
"""Test task-specific validation."""
Expand All @@ -367,9 +367,43 @@ def test_validate_tts_request_task_types(self, speech_server):
req = OpenAICreateSpeechRequest(input="Hello", task_type="VoiceDesign")
assert "instructions" in speech_server._validate_tts_request(req)

# ref_text only for Base
# ref_text without task_type auto-infers Base, then fails on missing ref_audio
req = OpenAICreateSpeechRequest(input="Hello", ref_text="text")
assert "Base task" in speech_server._validate_tts_request(req)
assert "ref_audio" in speech_server._validate_tts_request(req)

def test_validate_tts_request_auto_infer_base(self, speech_server):
    """Check that validation infers the Base task from reference inputs.

    Each request omits ``task_type`` and supplies only one of ``ref_audio``
    or ``ref_text``. Validation is expected to return an error message that
    names the missing counterpart, and to leave ``task_type`` set to
    ``"Base"`` on the request object it was given.
    """
    # Only reference audio supplied: the error should mention ref_text.
    audio_only = OpenAICreateSpeechRequest(input="Hello", ref_audio="data:audio/wav;base64,abc")
    audio_only_error = speech_server._validate_tts_request(audio_only)
    assert "ref_text" in audio_only_error
    assert audio_only.task_type == "Base"

    # Only reference transcript supplied: the error should mention ref_audio.
    text_only = OpenAICreateSpeechRequest(input="Hello", ref_text="some text")
    text_only_error = speech_server._validate_tts_request(text_only)
    assert "ref_audio" in text_only_error
    assert text_only.task_type == "Base"

def test_validate_tts_request_base_empty_ref_text(self, speech_server):
    """Check how an empty ``ref_text`` is validated for the Base task.

    With the default settings an empty transcript must produce an error
    message containing ``non-empty 'ref_text'``. With
    ``x_vector_only_mode=True`` the same request must validate cleanly
    (the validator returns ``None``).
    """
    # Fields shared by both requests; only x_vector_only_mode differs.
    shared_fields = {
        "input": "Hello",
        "task_type": "Base",
        "ref_audio": "data:audio/wav;base64,abc",
        "ref_text": "",
    }

    in_context_request = OpenAICreateSpeechRequest(**shared_fields)
    in_context_error = speech_server._validate_tts_request(in_context_request)
    assert "non-empty 'ref_text'" in in_context_error

    # x_vector_only_mode bypasses the ref_text requirement.
    embedding_only_request = OpenAICreateSpeechRequest(**shared_fields, x_vector_only_mode=True)
    assert speech_server._validate_tts_request(embedding_only_request) is None

def test_validate_tts_request_customvoice_no_speakers(self, speech_server):
    """Check that an explicit CustomVoice request is rejected by validation.

    The returned error message must contain ``does not support CustomVoice``.
    NOTE(review): this relies on the ``speech_server`` fixture having no
    speakers configured -- confirm against the fixture definition.
    """
    custom_voice_request = OpenAICreateSpeechRequest(input="Hello", task_type="CustomVoice")
    assert "does not support CustomVoice" in speech_server._validate_tts_request(custom_voice_request)

def test_build_tts_params(self, speech_server):
"""Test TTS parameter building."""
Expand Down
21 changes: 19 additions & 2 deletions vllm_omni/entrypoints/openai/serving_speech.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,9 @@ def _is_tts_model(self) -> bool:

def _validate_tts_request(self, request: OpenAICreateSpeechRequest) -> str | None:
"""Validate TTS request parameters. Returns error message or None."""
# Infer Base task when ref_audio or ref_text is provided without explicit task_type.
if request.task_type is None and (request.ref_audio is not None or request.ref_text is not None):
Comment thread
linyueqian marked this conversation as resolved.
request.task_type = "Base"
task_type = request.task_type or "CustomVoice"

# Normalize voice to lowercase for case-insensitive matching
Expand All @@ -250,8 +253,14 @@ def _validate_tts_request(self, request: OpenAICreateSpeechRequest) -> str | Non
return f"Invalid language '{request.language}'. Supported: {', '.join(sorted(_TTS_LANGUAGES))}"

# Validate speaker for CustomVoice task
if task_type == "CustomVoice" and request.voice is not None:
if self.supported_speakers and request.voice not in self.supported_speakers:
if task_type == "CustomVoice":
Comment thread
linyueqian marked this conversation as resolved.
if not self.supported_speakers:
return (
"This model does not support CustomVoice task (no speakers configured). "
"Use task_type='Base' with ref_audio/ref_text for voice cloning, "
"or use a CustomVoice model."
)
if request.voice is not None and request.voice not in self.supported_speakers:
return f"Invalid speaker '{request.voice}'. Supported: {', '.join(sorted(self.supported_speakers))}"

# Validate Base task requirements
Expand All @@ -261,6 +270,14 @@ def _validate_tts_request(self, request: OpenAICreateSpeechRequest) -> str | Non
# Validate ref_audio format
if not (request.ref_audio.startswith(("http://", "https://")) or request.ref_audio.startswith("data:")):
return "ref_audio must be a URL (http/https) or base64 data URL (data:...)"
# In-context voice cloning (default) requires non-empty ref_text.
Comment thread
linyueqian marked this conversation as resolved.
# x_vector_only_mode skips in-context and only uses speaker embedding.
if not request.x_vector_only_mode:
if not request.ref_text or not request.ref_text.strip():
return (
"Base task requires non-empty 'ref_text' (transcript of "
"the reference audio) unless 'x_vector_only_mode' is enabled"
)

# Validate cross-parameter dependencies
if task_type != "Base":
Expand Down