From 5b18ee3c69366ebb0cf1094ecdde879078112fda Mon Sep 17 00:00:00 2001 From: Zhanqiu Hu Date: Wed, 4 Mar 2026 17:32:46 -0500 Subject: [PATCH 1/2] [Bugfix] Fix all-silence TTS output: use float32 for speech tokenizer decoder The speech tokenizer in Qwen3TTSCode2Wav was loaded in bfloat16, causing NaN overflow in the SnakeBeta activation's torch.exp() calls. This produced all-silence PCM output (-32768 for every sample). Change torch_dtype from bfloat16 to float32 to match the upstream Qwen3-TTS tokenizer config (encoder_config.dtype: "float32") and official examples which default to float32. Signed-off-by: Zhanqiu Hu --- vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_code2wav.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_code2wav.py b/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_code2wav.py index 851455bda69..c93d027d79a 100644 --- a/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_code2wav.py +++ b/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_code2wav.py @@ -73,7 +73,7 @@ def _ensure_speech_tokenizer_loaded(self) -> Qwen3TTSTokenizer: tok = Qwen3TTSTokenizer.from_pretrained( speech_tokenizer_dir, - torch_dtype=torch.bfloat16, + torch_dtype=torch.float32, load_feature_extractor=False, ) From 40cc6df49cb644266eb246f68e1be7327c442cc7 Mon Sep 17 00:00:00 2001 From: Zhanqiu Hu Date: Thu, 5 Mar 2026 10:30:58 -0500 Subject: [PATCH 2/2] Add E2E regression test for Base voice-clone (#1663) Serve the model, send a Base voice-clone request, and verify via Whisper that the output is real audio (not silence). Mirrors test_qwen3_omni.py pattern using convert_audio_file_to_text + cosine_similarity_text. Signed-off-by: Zhanqiu Hu --- .../e2e/online_serving/test_qwen3_tts_base.py | 167 ++++++++++++++++++ 1 file changed, 167 insertions(+) create mode 100644 tests/e2e/online_serving/test_qwen3_tts_base.py diff --git a/tests/e2e/online_serving/test_qwen3_tts_base.py b/tests/e2e/online_serving/test_qwen3_tts_base.py new file mode 100644 index 00000000000..9dd662f084d --- /dev/null +++ b/tests/e2e/online_serving/test_qwen3_tts_base.py @@ -0,0 +1,167 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +E2E tests for Qwen3-TTS Base voice-clone model. + +Regression test for #1663: speech tokenizer loaded in bfloat16 produced +all-silence PCM due to NaN overflow in SnakeBeta activation. +""" + +import os + +os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" +os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "0" + +import struct +import tempfile +from pathlib import Path + +import httpx +import pytest + +from tests.conftest import ( + OmniServer, + convert_audio_file_to_text, + cosine_similarity_text, +) +from tests.utils import hardware_test + +MODEL = "Qwen/Qwen3-TTS-12Hz-1.7B-Base" + +# Official Qwen3-TTS reference audio/text from examples/offline_inference/qwen3_tts/end2end.py +REF_AUDIO_URL = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-TTS-Repo/clone_2.wav" +REF_TEXT = "Okay. Yeah. I resent you. I love you. I respect you. But you know what? You blew it! And thanks to you." +SYN_TEXT = "Good one. Okay, fine, I'm just gonna leave this sock monkey here. Goodbye." + + +def get_stage_config(): + return str( + Path(__file__).parent.parent.parent.parent / "vllm_omni" / "model_executor" / "stage_configs" / "qwen3_tts.yaml" + ) + + +@pytest.fixture(scope="module") +def omni_server(): + """Start vLLM-Omni server with Base voice-clone model.""" + stage_config_path = get_stage_config() + + with OmniServer( + MODEL, + [ + "--stage-configs-path", + stage_config_path, + "--stage-init-timeout", + "120", + "--trust-remote-code", + "--enforce-eager", + "--disable-log-stats", + ], + ) as server: + yield server + + +def make_base_speech_request( + host: str, + port: int, + text: str = SYN_TEXT, + ref_text: str = REF_TEXT, + ref_audio: str = REF_AUDIO_URL, + response_format: str = "wav", + timeout: float = 120.0, +) -> httpx.Response: + url = f"http://{host}:{port}/v1/audio/speech" + payload = { + "model": MODEL, + "input": text, + "task_type": "Base", + "ref_text": ref_text, + "ref_audio": ref_audio, + "response_format": response_format, + } + with httpx.Client(timeout=timeout) as client: + return client.post(url, json=payload) + + +def verify_wav_audio(content: bytes) -> bool: + if len(content) < 44: + return False + return content[:4] == b"RIFF" and content[8:12] == b"WAVE" + + +def assert_not_silence(pcm_bytes: bytes): + """Assert PCM16 samples are not all identical (e.g. all-silence).""" + samples = struct.unpack(f"<{len(pcm_bytes) // 2}h", pcm_bytes) + unique = set(samples) + assert len(unique) > 1, ( + f"All-silence detected: {len(samples)} samples, unique values: {unique}. " + "See https://github.com/vllm-project/vllm-omni/issues/1663" + ) + + +MIN_AUDIO_BYTES = 10000 + + +class TestQwen3TTSBaseVoiceClone: + """Regression tests for Base voice-clone (fix #1663).""" + + @pytest.mark.core_model + @pytest.mark.omni + @hardware_test(res={"cuda": "L4"}, num_cards=4) + def test_base_voice_clone_not_silence(self, omni_server) -> None: + """PCM output must contain real audio, not all-silence.""" + response = make_base_speech_request( + host=omni_server.host, + port=omni_server.port, + response_format="pcm", + ) + + assert response.status_code == 200, f"Request failed: {response.text}" + assert len(response.content) > MIN_AUDIO_BYTES, f"Audio too small: {len(response.content)} bytes" + assert_not_silence(response.content) + + @pytest.mark.core_model + @pytest.mark.omni + @hardware_test(res={"cuda": "L4"}, num_cards=4) + def test_base_voice_clone_whisper_transcription(self, omni_server) -> None: + """Whisper must transcribe the output as intelligible speech.""" + response = make_base_speech_request( + host=omni_server.host, + port=omni_server.port, + response_format="wav", + ) + + assert response.status_code == 200, f"Request failed: {response.text}" + assert verify_wav_audio(response.content), "Response is not valid WAV" + assert len(response.content) > MIN_AUDIO_BYTES + + with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f: + f.write(response.content) + wav_path = f.name + + try: + transcript = convert_audio_file_to_text(wav_path) + print(f"Whisper transcript: {transcript}") + assert len(transcript.strip()) > 0, "Whisper returned empty transcript — audio is likely silence" + similarity = cosine_similarity_text(transcript.lower(), SYN_TEXT.lower()) + print(f"Cosine similarity: {similarity:.3f}") + assert similarity > 0.9, ( + f"Transcript doesn't match input: similarity={similarity:.2f}, transcript='{transcript}'" + ) + finally: + os.unlink(wav_path) + + @pytest.mark.core_model + @pytest.mark.omni + @hardware_test(res={"cuda": "L4"}, num_cards=4) + def test_base_voice_clone_wav_format(self, omni_server) -> None: + """WAV response must have valid headers and sufficient size.""" + response = make_base_speech_request( + host=omni_server.host, + port=omni_server.port, + response_format="wav", + ) + + assert response.status_code == 200, f"Request failed: {response.text}" + assert response.headers.get("content-type") == "audio/wav" + assert verify_wav_audio(response.content), "Response is not valid WAV" + assert len(response.content) > MIN_AUDIO_BYTES