diff --git a/tests/entrypoints/openai_api/test_serving_speech.py b/tests/entrypoints/openai_api/test_serving_speech.py index 554164a59c2..06b6f5c16c1 100644 --- a/tests/entrypoints/openai_api/test_serving_speech.py +++ b/tests/entrypoints/openai_api/test_serving_speech.py @@ -752,6 +752,26 @@ def test_validate_tts_request_base_empty_ref_text(self, speech_server): ) assert speech_server._validate_tts_request(req) is None + @pytest.mark.parametrize( + "ref_text", + [None, "", " "], + ids=["none", "empty", "whitespace"], + ) + def test_validate_base_task_missing_ref_text_returns_400(self, speech_server, ref_text): + """Regression: Base task without ref_text must return 400, not crash EngineCore. + + See https://github.com/vllm-project/vllm-omni/pull/2203 + """ + req = OpenAICreateSpeechRequest( + input="Hello", + task_type="Base", + ref_audio="data:audio/wav;base64,abc", + ref_text=ref_text, + ) + result = speech_server._validate_tts_request(req) + assert result is not None, f"ref_text={ref_text!r} should be rejected" + assert "ref_text" in result + def test_validate_tts_request_customvoice_no_speakers(self, speech_server): """CustomVoice on a model with no speakers returns 400 instead of crashing engine.""" req = OpenAICreateSpeechRequest(input="Hello", task_type="CustomVoice") diff --git a/vllm_omni/entrypoints/openai/serving_speech.py b/vllm_omni/entrypoints/openai/serving_speech.py index 87ef6a4e9b6..52944d50824 100644 --- a/vllm_omni/entrypoints/openai/serving_speech.py +++ b/vllm_omni/entrypoints/openai/serving_speech.py @@ -919,6 +919,13 @@ def _validate_qwen_tts_request(self, request: OpenAICreateSpeechRequest) -> str fmt_err = self._validate_ref_audio_format(request.ref_audio) if fmt_err: return fmt_err + if not getattr(request, "x_vector_only_mode", False) and ( + not request.ref_text or not request.ref_text.strip() + ): + return ( + "Base task requires non-empty 'ref_text' (transcript of " + "the reference audio) unless 'x_vector_only_mode' is enabled" + ) # Validate cross-parameter dependencies if task_type != "Base": diff --git a/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_talker.py b/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_talker.py index f89012ec45d..6b7b688f15a 100644 --- a/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_talker.py +++ b/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_talker.py @@ -1439,11 +1439,16 @@ def _normalize_voice_clone_prompt(raw: object) -> dict[str, object] | None: ) if ref_ids is None: ref_text = _as_singleton(info_dict.get("ref_text")) - if not isinstance(ref_text, str) or not ref_text.strip(): - raise ValueError("Base in-context voice cloning requires `ref_text` or tokenized `ref_ids`.") - ref_ids = tok(self._build_ref_text(ref_text), return_tensors="pt", padding=False)["input_ids"].to( - device=input_ids.device - ) + if isinstance(ref_text, str) and ref_text.strip(): + ref_ids = tok( + self._build_ref_text(ref_text), + return_tensors="pt", + padding=False, + )["input_ids"].to(device=input_ids.device) + else: + logger.warning("Base ICL: ref_text/ref_ids missing, falling back to x-vector-only mode.") + in_context_mode = False + if in_context_mode: icl_input_embed, trailing_text_hidden = self._generate_icl_prompt( text_id=input_ids[:, 3:-5], ref_id=ref_ids[:, 3:-2],