Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions tests/entrypoints/openai_api/test_serving_speech.py
Original file line number Diff line number Diff line change
Expand Up @@ -752,6 +752,26 @@ def test_validate_tts_request_base_empty_ref_text(self, speech_server):
)
assert speech_server._validate_tts_request(req) is None

@pytest.mark.parametrize(
    "ref_text",
    [
        pytest.param(None, id="none"),
        pytest.param("", id="empty"),
        pytest.param(" ", id="whitespace"),
    ],
)
def test_validate_base_task_missing_ref_text_returns_400(self, speech_server, ref_text):
    """Regression: a Base-task request lacking usable ref_text is rejected up front.

    The validator must hand back an error (surfaced as a 400) that names
    ``ref_text`` rather than letting the request through to crash EngineCore.
    Covered inputs: ``None``, the empty string, and a whitespace-only string.

    See https://github.com/vllm-project/vllm-omni/pull/2203
    """
    # Base task with reference audio supplied; only ref_text varies per case.
    request_fields = {
        "input": "Hello",
        "task_type": "Base",
        "ref_audio": "data:audio/wav;base64,abc",
        "ref_text": ref_text,
    }
    req = OpenAICreateSpeechRequest(**request_fields)

    error = speech_server._validate_tts_request(req)

    # A None result would mean the request passed validation.
    assert error is not None, f"ref_text={ref_text!r} should be rejected"
    # The returned error must mention the offending field.
    assert "ref_text" in error

def test_validate_tts_request_customvoice_no_speakers(self, speech_server):
"""CustomVoice on a model with no speakers returns 400 instead of crashing engine."""
req = OpenAICreateSpeechRequest(input="Hello", task_type="CustomVoice")
Expand Down
7 changes: 7 additions & 0 deletions vllm_omni/entrypoints/openai/serving_speech.py
Original file line number Diff line number Diff line change
Expand Up @@ -919,6 +919,13 @@ def _validate_qwen_tts_request(self, request: OpenAICreateSpeechRequest) -> str
fmt_err = self._validate_ref_audio_format(request.ref_audio)
if fmt_err:
return fmt_err
if not getattr(request, "x_vector_only_mode", False) and (
not request.ref_text or not request.ref_text.strip()
):
return (
"Base task requires non-empty 'ref_text' (transcript of "
"the reference audio) unless 'x_vector_only_mode' is enabled"
)

# Validate cross-parameter dependencies
if task_type != "Base":
Expand Down
15 changes: 10 additions & 5 deletions vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_talker.py
Original file line number Diff line number Diff line change
Expand Up @@ -1439,11 +1439,16 @@ def _normalize_voice_clone_prompt(raw: object) -> dict[str, object] | None:
)
if ref_ids is None:
ref_text = _as_singleton(info_dict.get("ref_text"))
if not isinstance(ref_text, str) or not ref_text.strip():
raise ValueError("Base in-context voice cloning requires `ref_text` or tokenized `ref_ids`.")
ref_ids = tok(self._build_ref_text(ref_text), return_tensors="pt", padding=False)["input_ids"].to(
device=input_ids.device
)
if isinstance(ref_text, str) and ref_text.strip():
ref_ids = tok(
self._build_ref_text(ref_text),
return_tensors="pt",
padding=False,
)["input_ids"].to(device=input_ids.device)
else:
logger.warning("Base ICL: ref_text/ref_ids missing, falling back to x-vector-only mode.")
in_context_mode = False
if in_context_mode:
icl_input_embed, trailing_text_hidden = self._generate_icl_prompt(
text_id=input_ids[:, 3:-5],
ref_id=ref_ids[:, 3:-2],
Expand Down
Loading