From b874b652dad66c0c7d2deac8f5204457b83f7371 Mon Sep 17 00:00:00 2001
From: yenuo26 <410167048@qq.com>
Date: Mon, 16 Mar 2026 14:29:55 +0800
Subject: [PATCH 1/9] add tts test case

Signed-off-by: yenuo26 <410167048@qq.com>
---
 tests/conftest.py                             | 591 ++++++++++++++++--
 tests/e2e/offline_inference/test_qwen3_tts.py |  70 +++
 tests/e2e/online_serving/test_qwen3_tts.py    | 332 +++-------
 .../test_qwen3_tts_expansion.py               | 137 ++++
 4 files changed, 837 insertions(+), 293 deletions(-)
 create mode 100644 tests/e2e/offline_inference/test_qwen3_tts.py
 create mode 100644 tests/e2e/online_serving/test_qwen3_tts_expansion.py

diff --git a/tests/conftest.py b/tests/conftest.py
index 0a3c350d75e..18a2b8718cc 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -4,6 +4,7 @@
 import math
 import os
 import random
+import re
 
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
 # Set CPU device for CI environments without GPU
@@ -706,7 +707,7 @@ def generate_synthetic_image(width: int, height: int, save_to_file: bool = False
 
 
 def preprocess_text(text):
-    import re
+    import opencc
 
     word_to_num = {
         "zero": "0",
@@ -728,6 +729,8 @@ def preprocess_text(text):
 
     text = re.sub(r"[^\w\s]", "", text)
     text = re.sub(r"\s+", " ", text)
+    cc = opencc.OpenCC("t2s")
+    text = cc.convert(text)
     return text.lower().strip()
 
 
@@ -739,6 +742,7 @@ def cosine_similarity_text(text1, text2, n: int = 3):
 
     text1 = preprocess_text(text1)
     text2 = preprocess_text(text2)
+    print(f"cosine similarity text1 is: {text1}, text2 is: {text2}")
 
     ngrams1 = [text1[i : i + n] for i in range(len(text1) - n + 1)]
     ngrams2 = [text2[i : i + n] for i in range(len(text2) - n + 1)]
@@ -764,7 +768,7 @@ def convert_audio_to_text(audio_data):
     Convert base64 encoded audio data to text using speech recognition.
     """
     audio_data = base64.b64decode(audio_data)
-    output_path = f"./test_{int(time.time())}"
+    output_path = f"./test_{int(time.time())}.wav"
     with open(output_path, "wb") as audio_file:
         audio_file.write(audio_data)
 
@@ -773,6 +777,18 @@ def convert_audio_to_text(audio_data):
     return text
 
 
+def _merge_base64_audio_to_segment(base64_list: list[str]):
+    """Merge a list of base64-encoded audio chunks into one pydub AudioSegment."""
+    from pydub import AudioSegment
+
+    merged = None
+    for b64 in base64_list:
+        raw = base64.b64decode(b64.split(",", 1)[-1])
+        seg = AudioSegment.from_file(io.BytesIO(raw))
+        merged = seg if merged is None else merged + seg
+    return merged
+
+
 def _whisper_transcribe_in_current_process(output_path: str) -> str:
     import whisper
 
@@ -807,22 +823,27 @@ def convert_audio_file_to_text(output_path: str) -> str:
         return future.result()
 
 
+def convert_audio_bytes_to_text(raw_bytes: bytes) -> str:
+    """
+    Write raw audio bytes to a temp WAV file suitable for Whisper/ffmpeg.
+    Normalizes with soundfile to PCM_16 WAV when possible to avoid codec issues.
+    Caller must os.unlink the returned path when done.
+    """
+    output_path = f"./test_{int(time.time())}.wav"
+    data, samplerate = sf.read(io.BytesIO(raw_bytes))
+    sf.write(output_path, data, samplerate, format="WAV", subtype="PCM_16")
+    text = convert_audio_file_to_text(output_path)
+    return text
+
+
 def merge_base64_and_convert_to_text(base64_list):
     """
     Merge a list of base64 encoded audio data and convert to text.
     """
-    from pydub import AudioSegment
-
-    merged_audio = None
-    for base64_data in base64_list:
-        audio_data = base64.b64decode(base64_data.split(",", 1)[-1])
-        seg = AudioSegment.from_file(io.BytesIO(audio_data))
-        if merged_audio is None:
-            merged_audio = seg
-        else:
-            merged_audio += seg
-    output_path = f"./test_{int(time.time())}"
+    merged_audio = _merge_base64_audio_to_segment(base64_list)
+    output_path = f"./test_{int(time.time())}.wav"
     merged_audio.export(output_path, format="wav")
+    print(f"audio data is saved: {output_path}")
     text = convert_audio_file_to_text(output_path)
     return text
 
@@ -1003,8 +1024,7 @@ def delete_by_path(config_dict: dict, path: str) -> None:
                                 break
 
                         if target_stage is None:
-                            available_ids = [s.get("stage_id") for s in stage_args if "stage_id" in s]
-                            raise KeyError(f"Stage ID {stage_id} not found, available: {available_ids}")
+                            continue
 
                         # Delete specified paths in this stage
                         for path in delete_paths:
@@ -1265,6 +1285,8 @@ class OmniResponse:
     text_content: str | None = None
     audio_data: list[str] | None = None
     audio_content: str | None = None
+    audio_format: str | None = None
+    audio_bytes: bytes | None = None
     similarity: float | None = None
     e2e_latency: float | None = None
     success: bool = False
@@ -1282,6 +1304,130 @@ class DiffusionResponse:
     error_message: str | None = None
 
 
+def _extract_expected_voice_gender(messages: list[dict[str, Any]] | None) -> str | None:
+    """Parse expected voice gender from system prompt text ('male'/'female') if present."""
+    if not messages:
+        return None
+    for msg in messages:
+        if msg.get("role") != "system":
+            continue
+        content = msg.get("content")
+        if isinstance(content, str):
+            text = content
+        elif isinstance(content, list):
+            text = " ".join(
+                item.get("text", "") for item in content if isinstance(item, dict) and item.get("type") == "text"
+            )
+        else:
+            text = ""
+        m = re.search(r"\buse a (male|female) voice\b", text, flags=re.IGNORECASE)
+        if m:
+            return m.group(1).lower()
+    return None
+
+
+def _get_merged_audio_bytes(audio_data: list[str] | None) -> bytes | None:
+    """Merge base64 audio chunks (stream or single) into one WAV bytes for analysis."""
+    if not audio_data:
+        return None
+    merged = _merge_base64_audio_to_segment(audio_data)
+    buf = io.BytesIO()
+    merged.export(buf, format="wav")
+    return buf.getvalue()
+
+
+def _audio_tensor_to_wav_bytes(audio_tensor: Any, sample_rate: int = 24000) -> bytes:
+    """Convert offline pipeline audio tensor to WAV bytes (e.g. for Qwen3-TTS)."""
+    if hasattr(audio_tensor, "float"):
+        arr = audio_tensor.float().detach().cpu().numpy()
+    else:
+        arr = np.asarray(audio_tensor, dtype=np.float32)
+    if arr.ndim > 1:
+        arr = arr.flatten()
+    buf = io.BytesIO()
+    sf.write(buf, arr, sample_rate, format="WAV", subtype="FLOAT")
+    return buf.getvalue()
+
+
+def _estimate_voice_gender_from_audio(audio_bytes: bytes) -> str:
+    """
+    Estimate voice gender from WAV bytes by fundamental frequency (F0).
+    Returns 'male', 'female', or 'unknown' when ambiguous.
+    Uses multiple windows + consistency check to reduce misclassification.
+    """
+    data, sr = sf.read(io.BytesIO(audio_bytes), dtype="float32", always_2d=True)
+    if data.size == 0:
+        raise ValueError("Empty audio")
+    mono = np.mean(data, axis=1)
+
+    def _estimate_f0(seg: np.ndarray) -> float | None:
+        seg = seg - float(np.mean(seg))
+        if float(np.max(np.abs(seg))) < 1e-3:
+            return None
+        min_lag = max(2, int(sr / 350))
+        max_lag = min(len(seg) // 2, int(sr / 60))
+        if max_lag <= min_lag + 2:
+            return None
+        corr = np.correlate(seg, seg, mode="same")
+        mid = len(corr) // 2
+        valid = corr[mid + min_lag : mid + max_lag + 1]
+        the = float(np.max(valid)) * 0.5
+        peaks: list[tuple[int, float]] = []
+        for i in range(1, len(valid) - 1):
+            v = float(valid[i])
+            if v > the and v >= float(valid[i - 1]) and v >= float(valid[i + 1]):
+                peaks.append((min_lag + i, v))
+        if not peaks:
+            lag = min_lag + int(np.argmax(valid))
+            return float(sr) / float(lag)
+        best_v = max(v for _, v in peaks)
+        close = [(lag, v) for lag, v in peaks if v >= best_v * 0.85]
+        lag = max(lag for lag, _ in close)
+        cand2 = [(lag2, v) for lag2, v in peaks if abs(lag2 - 2 * lag) <= max(2, int(0.03 * (2 * lag)))]
+        if cand2:
+            best2 = max(v for _, v in cand2)
+            if best2 >= best_v * 0.75:
+                lag = max(lag_val for lag_val, _ in cand2)
+        return float(sr) / float(lag)
+
+    win_len = min(int(sr * 0.6), len(mono))
+    if win_len < int(sr * 0.15):
+        raise ValueError("Cannot estimate F0: audio too short")
+    # Use more windows (6) over the duration for a more stable and representative F0 distribution.
+    num_windows = 6
+    step = max(1, (len(mono) - win_len) // max(1, num_windows - 1))
+    starts = [min(i * step, len(mono) - win_len) for i in range(num_windows)]
+    starts = [s for s in starts if s >= 0]
+    f0s = [f for f in (_estimate_f0(mono[s : s + win_len]) for s in starts) if f is not None]
+    if not f0s:
+        raise ValueError("Cannot estimate F0: no voiced segment found")
+    f0s.sort()
+    n = len(f0s)
+    median_f0 = float(f0s[n // 2])
+
+    # Conservative thresholds; only classify when F0 is clearly in one range and windows agree.
+    MALE_THRESH = 102.0  # below this → candidate male
+    FEMALE_THRESH = 198.0  # above this → candidate female
+    low_count = sum(1 for f in f0s if f < MALE_THRESH)
+    high_count = sum(1 for f in f0s if f > FEMALE_THRESH)
+    # Require majority of windows to agree with the median band to avoid harmonic/subharmonic flips.
+    if median_f0 < MALE_THRESH and low_count >= (n + 1) // 2:
+        # Check for possible female misread as male (e.g. 250 Hz → 125 Hz): many high F0s.
+        if high_count >= 2:
+            result = "unknown"
+        else:
+            result = "male"
+    elif median_f0 > FEMALE_THRESH and high_count >= (n + 1) // 2:
+        if low_count >= 2:
+            result = "unknown"
+        else:
+            result = "female"
+    else:
+        result = "unknown"
+    print(f"estimated f0 median: {median_f0:.1f} Hz (min={f0s[0]:.1f}, max={f0s[-1]:.1f}); gender: {result}")
+    return result
+
+
 def assert_omni_response(response: OmniResponse, request_config: dict[str, Any], run_level):
     """
     Validate response results.
@@ -1295,7 +1441,7 @@ def assert_omni_response(response: OmniResponse, request_config: dict[str, Any],
     assert response.success, "The request failed."
     e2e_latency = response.e2e_latency
     if e2e_latency is not None:
-        print(f"the avg e2e latency is: {e2e_latency}")
+        print(f"the e2e latency is: {e2e_latency}")
 
     modalities = request_config.get("modalities", ["text", "audio"])
 
@@ -1330,6 +1476,59 @@ def assert_omni_response(response: OmniResponse, request_config: dict[str, Any],
             print(f"similarity is: {response.similarity}")
 
 
+def assert_audio_speech_response(
+    response: OmniResponse,
+    request_config: dict[str, Any],
+    run_level: str,
+) -> None:
+    """
+    Validate /v1/audio/speech response: success, optional format check, and transcription similarity.
+    """
+    assert response.success, "The request failed."
+
+    req_fmt = request_config.get("response_format")
+
+    if req_fmt and response.audio_format is not None:
+        assert req_fmt in response.audio_format, (
+            f"The response audio format {response.audio_format} don't match the request audio format {req_fmt}"
+        )
+
+    e2e_latency = response.e2e_latency
+    if e2e_latency is not None:
+        print(f"the avg e2e latency is: {e2e_latency}")
+
+    if run_level == "advanced_model":
+        # Text–audio semantic similarity check
+        expected_text = request_config.get("input")
+        if expected_text:
+            transcript = (response.audio_content or "").strip()
+            print(f"audio content is: {transcript}")
+            print(f"input text is: {expected_text}")
+            similarity = cosine_similarity_text(transcript.lower(), expected_text.lower())
+            print(f"Cosine similarity: {similarity:.3f}")
+            assert similarity > 0.9, (
+                f"Transcript doesn't match input: similarity={similarity:.2f}, transcript='{transcript}'"
+            )
+
+        # Voice gender consistency check:
+        # When the estimator returns 'unknown', we treat it as inconclusive and do NOT fail the test.
+        voice = (request_config.get("voice") or "").lower()
+        if voice and response.audio_bytes:
+            estimated_gender = _estimate_voice_gender_from_audio(response.audio_bytes)
+            voice_gender_map = {
+                # adjust this mapping to your actual voice names
+                "serena": "female",
+                "eric": "male",
+            }
+            expected_gender = voice_gender_map.get(voice)
+            if expected_gender is not None:
+                print(f"Estimated voice gender from audio: {estimated_gender} (voice='{voice}')")
+                if estimated_gender != "unknown":
+                    assert estimated_gender == expected_gender, (
+                        f"Voice '{voice}' is expected {expected_gender}, but estimated gender is '{estimated_gender}'"
+                    )
+
+
 def assert_diffusion_response(response: DiffusionResponse, request_config: dict[str, Any], run_level: str = None):
     """
     Validate diffusion response results.
@@ -1551,6 +1750,101 @@ def _process_diffusion_response(self, chat_completion) -> DiffusionResponse:
 
         return result
 
+    def _process_stream_audio_speech_response(self, response) -> OmniResponse:
+        """
+        Process streaming /v1/audio/speech responses into an OmniResponse.
+
+        This mirrors _process_stream_omni_response but operates on low-level
+        audio bytes and produces an OmniResponse with audio_content filled
+        from Whisper transcription.
+        """
+        result = OmniResponse()
+        start_time = time.perf_counter()
+
+        try:
+            # Aggregate all audio bytes from the streaming response.
+            data = bytearray()
+
+            # Preferred OpenAI helper.
+            if hasattr(response, "iter_bytes") and callable(getattr(response, "iter_bytes")):
+                for chunk in response.iter_bytes():
+                    if chunk:
+                        data.extend(chunk)
+            else:
+                # Generic iterable-of-bytes fallback (e.g., generator or list of chunks).
+                try:
+                    iterator = iter(response)
+                except TypeError:
+                    iterator = None
+
+                if iterator is not None:
+                    for chunk in iterator:
+                        if not chunk:
+                            continue
+                        if isinstance(chunk, (bytes, bytearray)):
+                            data.extend(chunk)
+                        elif hasattr(chunk, "data"):
+                            data.extend(chunk.data)  # type: ignore[arg-type]
+                        elif hasattr(chunk, "content"):
+                            data.extend(chunk.content)  # type: ignore[arg-type]
+                        else:
+                            raise TypeError(f"Unsupported stream chunk type: {type(chunk)}")
+                else:
+                    raise TypeError(f"Unsupported audio speech streaming response type: {type(response)}")
+
+            raw_bytes = bytes(data)
+            transcript = convert_audio_bytes_to_text(raw_bytes)
+
+            # Populate OmniResponse.
+            result.audio_bytes = raw_bytes
+            result.audio_content = transcript
+            result.e2e_latency = time.perf_counter() - start_time
+            result.success = True
+            result.audio_format = getattr(response, "response", None)
+            if result.audio_format is not None:
+                result.audio_format = result.audio_format.headers.get("content-type", "")
+
+        except Exception as e:
+            result.error_message = f"Audio speech stream processing error: {str(e)}"
+            print(f"Error: {result.error_message}")
+
+        return result
+
+    def _process_non_stream_audio_speech_response(self, response) -> OmniResponse:
+        """
+        Process non-streaming /v1/audio/speech responses into an OmniResponse.
+
+        This mirrors _process_non_stream_omni_response but for the binary
+        audio payload returned by audio.speech.create.
+        """
+        result = OmniResponse()
+        start_time = time.perf_counter()
+
+        try:
+            # OpenAI non-streaming audio.speech.create returns HttpxBinaryResponseContent (.read() or .content)
+            if hasattr(response, "read") and callable(getattr(response, "read")):
+                raw_bytes = response.read()
+            elif hasattr(response, "content"):
+                raw_bytes = response.content  # type: ignore[assignment]
+            else:
+                raise TypeError(f"Unsupported audio speech response type: {type(response)}")
+
+            transcript = convert_audio_bytes_to_text(raw_bytes)
+
+            result.audio_bytes = raw_bytes
+            result.audio_content = transcript
+            result.e2e_latency = time.perf_counter() - start_time
+            result.success = True
+            result.audio_format = getattr(response, "response", None)
+            if result.audio_format is not None:
+                result.audio_format = result.audio_format.headers.get("content-type", "")
+
+        except Exception as e:
+            result.error_message = f"Audio speech non-stream processing error: {str(e)}"
+            print(f"Error: {result.error_message}")
+
+        return result
+
     def send_omni_request(self, request_config: dict[str, Any], request_num: int = 1) -> list[OmniResponse]:
         """
         Send OpenAI requests.
@@ -1585,32 +1879,132 @@ def send_omni_request(self, request_config: dict[str, Any], request_num: int = 1
             responses.append(response)
 
         else:
-            # Send concurrent requests
+            # Send concurrent requests: run create + process in worker so e2e_latency includes full round-trip.
+            def _one_omni_request():
+                start = time.perf_counter()
+                chat_completion = self.client.chat.completions.create(
+                    model=request_config.get("model"),
+                    messages=request_config.get("messages"),
+                    modalities=modalities,
+                    stream=stream,
+                )
+                if stream:
+                    response = self._process_stream_omni_response(chat_completion)
+                else:
+                    response = self._process_non_stream_omni_response(chat_completion)
+                response.e2e_latency = time.perf_counter() - start
+                return response
+
             with concurrent.futures.ThreadPoolExecutor(max_workers=request_num) as executor:
-                futures = []
+                futures = [executor.submit(_one_omni_request) for _ in range(request_num)]
+                for future in concurrent.futures.as_completed(futures):
+                    response = future.result()
+                    assert_omni_response(response, request_config, run_level=self.run_level)
+                    responses.append(response)
 
-                # Submit all request tasks
-                for _ in range(request_num):
-                    future = executor.submit(
-                        self.client.chat.completions.create,
-                        model=request_config.get("model"),
-                        messages=request_config.get("messages"),
-                        modalities=modalities,
-                        stream=stream,
-                    )
-                    futures.append(future)
+        return responses
 
-                # Process completed tasks
-                for future in concurrent.futures.as_completed(futures):
-                    chat_completion = future.result()
+    def send_audio_speech_request(self, request_config: dict[str, Any], request_num: int = 1) -> list[OmniResponse]:
+        """
+        Call the /v1/audio/speech endpoint using the same configuration-dict
+        style as send_omni_request, but via the OpenAI Python client's
+        audio.speech APIs.
+
+        Expected keys in request_config:
+          - model: model name/path (required)
+          - input: text to synthesize (required)
+          - response_format: audio format such as "wav" or "pcm" (optional)
+          - task_type, ref_text, ref_audio: TTS-specific extras (optional, passed via extra_body)
+          - timeout: request timeout in seconds (float, optional, default 120.0)
+          - stream: whether to use streaming API (bool, optional, default False)
+        """
+        timeout = float(request_config.get("timeout", 120.0))
 
-                    if stream:
-                        response = self._process_stream_omni_response(chat_completion)
-                    else:
-                        response = self._process_non_stream_omni_response(chat_completion)
+        model = request_config["model"]
+        text_input = request_config["input"]
+        stream = bool(request_config.get("stream", False))
+        voice = request_config.get("voice", None)
 
-                    assert_omni_response(response, request_config, run_level=self.run_level)
-                    responses.append(response)
+        # Standard OpenAI param: use omit when not provided to keep default behavior.
+        response_format = request_config.get("response_format", omit)
+
+        # Qwen3-TTS custom fields, forwarded via extra_body.
+        extra_body: dict[str, Any] = {}
+        for key in ("task_type", "ref_text", "ref_audio"):
+            if key in request_config:
+                extra_body[key] = request_config[key]
+
+        responses: list[OmniResponse] = []
+
+        if request_num == 1:
+            if stream:
+                # Use streaming response helper.
+                with self.client.audio.speech.with_streaming_response.create(
+                    model=model,
+                    input=text_input,
+                    response_format=response_format,
+                    extra_body=extra_body or None,
+                    timeout=timeout,
+                    voice=voice,
+                ) as resp:
+                    omni_resp = self._process_stream_audio_speech_response(resp)
+            else:
+                # Non-streaming response.
+                resp = self.client.audio.speech.create(
+                    model=model,
+                    input=text_input,
+                    response_format=response_format,
+                    extra_body=extra_body or None,
+                    timeout=timeout,
+                    voice=voice,
+                )
+                omni_resp = self._process_non_stream_audio_speech_response(resp)
+
+            assert_audio_speech_response(omni_resp, request_config, run_level=self.run_level)
+            responses.append(omni_resp)
+            return responses
+        else:
+            # request_num > 1: concurrent requests (use same params as single-request path)
+
+            if stream:
+
+                def _stream_task():
+                    with self.client.audio.speech.with_streaming_response.create(
+                        model=model,
+                        input=text_input,
+                        response_format=response_format,
+                        extra_body=extra_body or None,
+                        timeout=timeout,
+                        voice=voice,
+                    ) as resp:
+                        return self._process_stream_audio_speech_response(resp)
+
+                with concurrent.futures.ThreadPoolExecutor(max_workers=request_num) as executor:
+                    futures = [executor.submit(_stream_task) for _ in range(request_num)]
+                    for future in concurrent.futures.as_completed(futures):
+                        omni_resp = future.result()
+                        assert_audio_speech_response(omni_resp, request_config, run_level=self.run_level)
+                        responses.append(omni_resp)
+            else:
+                with concurrent.futures.ThreadPoolExecutor(max_workers=request_num) as executor:
+                    futures = []
+                    for _ in range(request_num):
+                        future = executor.submit(
+                            self.client.audio.speech.create,
+                            model=model,
+                            input=text_input,
+                            response_format=response_format,
+                            extra_body=extra_body or None,
+                            timeout=timeout,
+                            voice=voice,
+                        )
+                        futures.append(future)
+
+                    for future in concurrent.futures.as_completed(futures):
+                        resp = future.result()
+                        omni_resp = self._process_non_stream_audio_speech_response(resp)
+                        assert_audio_speech_response(omni_resp, request_config, run_level=self.run_level)
+                        responses.append(omni_resp)
 
         return responses
 
@@ -1726,6 +2120,43 @@ def __init__(
             **kwargs,
         )
 
+    def _estimate_prompt_len(
+        additional_information: dict[str, Any],
+        model_name: str,
+        _cache: dict[str, Any] = {},
+    ) -> int:
+        """Estimate prompt_token_ids placeholder length for the Talker stage.
+
+        The AR Talker replaces all input embeddings via ``preprocess``, so the
+        placeholder values are irrelevant but the **length** must match the
+        embeddings that ``preprocess`` will produce.
+        """
+        try:
+            from vllm_omni.model_executor.models.qwen3_tts.configuration_qwen3_tts import Qwen3TTSConfig
+            from vllm_omni.model_executor.models.qwen3_tts.qwen3_tts_talker import (
+                Qwen3TTSTalkerForConditionalGeneration,
+            )
+
+            if model_name not in _cache:
+                from transformers import AutoTokenizer
+
+                tok = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, padding_side="left")
+                cfg = Qwen3TTSConfig.from_pretrained(model_name, trust_remote_code=True)
+                _cache[model_name] = (tok, getattr(cfg, "talker_config", None))
+
+            tok, tcfg = _cache[model_name]
+            task_type = (additional_information.get("task_type") or ["CustomVoice"])[0]
+            return Qwen3TTSTalkerForConditionalGeneration.estimate_prompt_len_from_additional_information(
+                additional_information=additional_information,
+                task_type=task_type,
+                tokenize_prompt=lambda t: tok(t, padding=False)["input_ids"],
+                codec_language_id=getattr(tcfg, "codec_language_id", None),
+                spk_is_dialect=getattr(tcfg, "spk_is_dialect", None),
+            )
+        except Exception as exc:
+            logger.warning("Failed to estimate prompt length, using fallback 2048: %s", exc)
+            return 2048
+
     def get_default_sampling_params_list(self) -> list[OmniSamplingParams]:
         """
         Get a list of default sampling parameters for all stages.
@@ -1778,6 +2209,36 @@ def get_omni_inputs(
         if isinstance(prompts, str):
             prompts = [prompts]
 
+        # Qwen-TTS: follow examples/offline_inference/qwen3_tts/end2end.py style.
+        # Stage 0 expects token placeholders + additional_information (text/speaker/task_type/...),
+        # and Talker replaces embeddings in preprocess based on additional_information only.
+        is_tts_model = "Qwen3-TTS" in self.model_name or "qwen3_tts" in self.model_name.lower()
+        if is_tts_model and modalities == ["audio"]:
+            tts_kw = mm_processor_kwargs or {}
+            task_type = tts_kw.get("task_type", "CustomVoice")
+            speaker = tts_kw.get("speaker", "Vivian")
+            language = tts_kw.get("language", "Auto")
+            max_new_tokens = int(tts_kw.get("max_new_tokens", 2048))
+
+            omni_inputs: list[TextPrompt] = []
+            for prompt_text in prompts:
+                text_str = str(prompt_text).strip() or " "
+                additional_information: dict[str, Any] = {
+                    "task_type": [task_type],
+                    "text": [text_str],
+                    "language": [language],
+                    "speaker": [speaker],
+                    "max_new_tokens": [max_new_tokens],
+                }
+                # Use official helper to get correct placeholder length
+                plen = self._estimate_prompt_len(additional_information, self.model_name)
+                input_dict: TextPrompt = {
+                    "prompt_token_ids": [0] * plen,
+                    "additional_information": additional_information,
+                }
+                omni_inputs.append(input_dict)
+            return omni_inputs
+
         def _normalize_mm_input(mm_input, num_prompts):
             if mm_input is None:
                 return [None] * num_prompts
@@ -2004,6 +2465,62 @@ def send_request(self, request_config: dict[str, Any] | None = None) -> OmniResp
         assert_omni_response(response, request_config, run_level="L2")
         return response
 
+    def send_audio_speech_request(
+        self,
+        request_config: dict[str, Any],
+    ) -> OmniResponse:
+        """
+        Offline TTS: text -> audio via generate_multimodal, then validate with assert_audio_speech_response.
+
+        request_config must contain:
+          - 'input' or 'prompts': text to synthesize.
+        Optional keys:
+          - 'voice'       -> speaker (CustomVoice)
+          - 'task_type'   -> task_type in additional_information (default: "CustomVoice")
+          - 'language'    -> language in additional_information (default: "Auto")
+          - 'max_new_tokens' -> max_new_tokens in additional_information (default: 2048)
+          - 'response_format' -> desired audio format (used only for assertion)
+        """
+        input_text = request_config.get("input") or request_config.get("prompts")
+        if input_text is None:
+            raise ValueError("request_config must contain 'input' or 'prompts' for TTS")
+        if isinstance(input_text, list):
+            input_text = input_text[0] if input_text else ""
+
+        # Build TTS-specific kwargs passed through to get_omni_inputs for Qwen3-TTS,
+        # matching examples/offline_inference/qwen3_tts/end2end.py.
+        mm_processor_kwargs: dict[str, Any] = {}
+        if "voice" in request_config:
+            mm_processor_kwargs["speaker"] = request_config["voice"]
+        if "task_type" in request_config:
+            mm_processor_kwargs["task_type"] = request_config["task_type"]
+        if "language" in request_config:
+            mm_processor_kwargs["language"] = request_config["language"]
+        if "max_new_tokens" in request_config:
+            mm_processor_kwargs["max_new_tokens"] = request_config["max_new_tokens"]
+
+        outputs = self.runner.generate_multimodal(
+            prompts=input_text,
+            modalities=["audio"],
+            mm_processor_kwargs=mm_processor_kwargs or None,
+        )
+        audio_tensor = None
+        for stage_out in outputs:
+            if getattr(stage_out, "final_output_type", None) == "audio":
+                audio_tensor = stage_out.request_output[0].outputs[0].multimodal_output["audio"]
+                break
+        if audio_tensor is None:
+            result = OmniResponse(success=False, error_message="No audio output from pipeline")
+            assert result.success, result.error_message
+            return result
+
+        # Convert tensor to WAV bytes for downstream analysis (transcription, gender estimation).
+        raw_bytes = _audio_tensor_to_wav_bytes(audio_tensor, sample_rate=24000)
+        result = OmniResponse(success=True, audio_bytes=raw_bytes)
+        # Use advanced_model to enable similarity / voice checks when configured.
+        assert_audio_speech_response(result, request_config, run_level="advanced_model")
+        return result
+
 
 @pytest.fixture
 def omni_runner_handler(omni_runner):
diff --git a/tests/e2e/offline_inference/test_qwen3_tts.py b/tests/e2e/offline_inference/test_qwen3_tts.py
new file mode 100644
index 00000000000..e26657901ae
--- /dev/null
+++ b/tests/e2e/offline_inference/test_qwen3_tts.py
@@ -0,0 +1,70 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+E2E offline tests for Qwen3-TTS model with text input and audio output.
+
+Async_chunk disable, cuda_graph disabled (no_async_chunk stage config).
+Same structure as test_qwen3_omni (models, stage_configs, test_params, parametrize omni_runner).
+"""
+
+import os
+
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "0"
+
+from pathlib import Path
+
+import pytest
+
+from tests.conftest import modify_stage_config
+from tests.utils import hardware_test
+
+MODEL = "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"
+
+
+def get_cuda_graph_config():
+    path = modify_stage_config(
+        get_stage_config(),
+        updates={
+            "stage_args": {
+                0: {
+                    "engine_args.enforce_eager": "true",
+                },
+                1: {"engine_args.enforce_eager": "true"},
+            },
+        },
+    )
+    return path
+
+
+def get_stage_config(name: str = "qwen3_tts_no_async_chunk.yaml"):
+    """Get the no_async_chunk stage config path (async_chunk disable, cuda_graph disabled)."""
+    return str(Path(__file__).parent.parent.parent.parent / "vllm_omni" / "model_executor" / "stage_configs" / name)
+
+
+# Same structure as test_qwen3_omni: models, stage_configs, test_params
+tts_server_params = [
+    pytest.param(
+        (MODEL, get_cuda_graph_config()),
+        id="no_cuda_graph",
+    )
+]
+
+
+def get_prompt():
+    """Text prompt for text-to-audio (same role as get_question in test_qwen3_omni)."""
+    return "Hello, this is a test for text to audio."
+
+
+@pytest.mark.advanced_model
+@pytest.mark.omni
+@hardware_test(res={"cuda": "L4"}, num_cards=1)
+@pytest.mark.parametrize("omni_runner", tts_server_params, indirect=True)
+def test_text_to_audio_001(omni_runner, omni_runner_handler) -> None:
+    """
+    Text-to-audio E2E (offline): pure text input, audio output.
+    Async_chunk disable, cuda_graph disabled. Single request.
+    Uses send_audio_speech_request for TT-S-specific validation (same style as online /v1/audio/speech).
+    """
+    request_config = {"input": get_prompt(), "voice": "vivian"}
+    omni_runner_handler.send_audio_speech_request(request_config)
diff --git a/tests/e2e/online_serving/test_qwen3_tts.py b/tests/e2e/online_serving/test_qwen3_tts.py
index fd0ef766283..fb60df725ba 100644
--- a/tests/e2e/online_serving/test_qwen3_tts.py
+++ b/tests/e2e/online_serving/test_qwen3_tts.py
@@ -14,272 +14,92 @@
 
 from pathlib import Path
 
-import httpx
 import pytest
 
-from tests.conftest import OmniServer
+from tests.conftest import OmniServerParams
 from tests.utils import hardware_test
 
-MODEL = "Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice"
+MODEL = "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"
 
 
 def get_stage_config(name: str = "qwen3_tts.yaml"):
-    """Get the stage config path for Qwen3-TTS."""
+    """Get the stage config path from vllm_omni model_executor stage_configs."""
     return str(Path(__file__).parent.parent.parent.parent / "vllm_omni" / "model_executor" / "stage_configs" / name)
 
 
-@pytest.fixture(scope="class")
-def omni_server():
-    """Start vLLM-Omni server with CustomVoice model."""
-    stage_config_path = get_stage_config()
-
-    print(f"Starting OmniServer with model: {MODEL}")
-
-    with OmniServer(
-        MODEL,
-        [
-            "--stage-configs-path",
-            stage_config_path,
-            "--stage-init-timeout",
-            "120",
-            "--trust-remote-code",
-            "--enforce-eager",
-            "--disable-log-stats",
-        ],
-    ) as server:
-        print("OmniServer started successfully")
-        yield server
-        print("OmniServer stopping...")
-
-    print("OmniServer stopped")
-
-
-def make_speech_request(
-    host: str,
-    port: int,
-    text: str,
-    voice: str = "vivian",
-    language: str = "English",
-    task_type: str | None = None,
-    instructions: str | None = None,
-    timeout: float = 120.0,
-) -> httpx.Response:
-    """Make a request to the /v1/audio/speech endpoint."""
-    url = f"http://{host}:{port}/v1/audio/speech"
-    payload = {
-        "input": text,
-        "voice": voice,
-        "language": language,
+def get_prompt(prompt_type="text"):
+    """Text prompt for text-to-audio tests (same as test_qwen3_omni - beijing test case)."""
+    prompts = {
+        "text": "Beijing, China's capital, blends ancient wonders like the Great Wall with modern marvels. This vibrant metropolis offers rich culture, delicious Peking duck, and endless exploration opportunities.",
+    }
+    return prompts.get(prompt_type, prompts["text"])
+
+
+def get_max_batch_size(size_type="few"):
+    """Batch size for concurrent requests (same as test_qwen3_omni)."""
+    batch_sizes = {"few": 5, "medium": 100, "large": 256}
+    return batch_sizes.get(size_type, 5)
+
+
+tts_server_params = [
+    pytest.param(
+        OmniServerParams(
+            model=MODEL,
+            stage_config_path=get_stage_config("qwen3_tts.yaml"),
+            server_args=["--trust-remote-code", "--disable-log-stats"],
+        ),
+        id="async_chunk",
+    )
+]
+
+
+@pytest.mark.core_model
+@pytest.mark.advanced_model
+@pytest.mark.omni
+@hardware_test(res={"cuda": "L4"}, num_cards=1)
+@pytest.mark.parametrize("omni_server", tts_server_params, indirect=True)
+def test_text_to_audio_001(omni_server, openai_client) -> None:
+    """
+    Test text input processing and audio output via OpenAI API.
+    Deploy Setting: default yaml
+    Input Modal: text
+    Output Modal: audio
+    Input Setting: stream=False
+    Datasets: few requests
+    """
+    request_config = {
+        "model": omni_server.model,
+        "input": get_prompt(),
+        "stream": False,
+        "response_format": "wav",
+        "task_type": "CustomVoice",
+        "voice": "vivian",
     }
-    if task_type:
-        payload["task_type"] = task_type
-    if instructions:
-        payload["instructions"] = instructions
-
-    with httpx.Client(timeout=timeout) as client:
-        return client.post(url, json=payload)
-
-
-def verify_wav_audio(content: bytes) -> bool:
-    """Verify that content is valid WAV audio data."""
-    # WAV files start with "RIFF" header
-    if len(content) < 44:  # Minimum WAV header size
-        return False
-    return content[:4] == b"RIFF" and content[8:12] == b"WAVE"
-
-
-# Minimum expected audio size for a short sentence (~1 second of 24kHz 16-bit mono WAV)
-MIN_AUDIO_BYTES = 10000
-
-
-class TestQwen3TTSCustomVoice:
-    """E2E tests for Qwen3-TTS CustomVoice model."""
-
-    @pytest.mark.core_model
-    @pytest.mark.omni
-    @hardware_test(res={"cuda": "L4"}, num_cards=4)
-    def test_speech_english_basic(self, omni_server) -> None:
-        """Test basic English TTS generation."""
-        response = make_speech_request(
-            host=omni_server.host,
-            port=omni_server.port,
-            text="Hello, how are you?",
-            voice="vivian",
-            language="English",
-        )
-
-        assert response.status_code == 200, f"Request failed: {response.text}"
-        assert response.headers.get("content-type") == "audio/wav"
-        assert verify_wav_audio(response.content), "Response is not valid WAV audio"
-        assert len(response.content) > MIN_AUDIO_BYTES, (
-            f"Audio content too small ({len(response.content)} bytes), expected at least {MIN_AUDIO_BYTES} bytes"
-        )
-
-    @pytest.mark.core_model
-    @pytest.mark.omni
-    @hardware_test(res={"cuda": "L4"}, num_cards=4)
-    def test_speech_chinese_basic(self, omni_server) -> None:
-        """Test basic Chinese TTS generation."""
-        response = make_speech_request(
-            host=omni_server.host,
-            port=omni_server.port,
-            text="你好，我是通义千问",
-            voice="vivian",
-            language="Chinese",
-        )
-
-        assert response.status_code == 200, f"Request failed: {response.text}"
-        assert response.headers.get("content-type") == "audio/wav"
-        assert verify_wav_audio(response.content), "Response is not valid WAV audio"
-        assert len(response.content) > MIN_AUDIO_BYTES, (
-            f"Audio content too small ({len(response.content)} bytes), expected at least {MIN_AUDIO_BYTES} bytes"
-        )
-
-    @pytest.mark.core_model
-    @pytest.mark.omni
-    @hardware_test(res={"cuda": "L4"}, num_cards=4)
-    def test_speech_different_voices(self, omni_server) -> None:
-        """Test TTS with different voice options."""
-        voices = ["vivian", "ryan"]
-        for voice in voices:
-            response = make_speech_request(
-                host=omni_server.host,
-                port=omni_server.port,
-                text="Testing voice selection.",
-                voice=voice,
-                language="English",
-            )
-
-            assert response.status_code == 200, f"Request failed for voice {voice}: {response.text}"
-            assert verify_wav_audio(response.content), f"Invalid WAV for voice {voice}"
-
-    @pytest.mark.core_model
-    @pytest.mark.omni
-    @hardware_test(res={"cuda": "L4"}, num_cards=4)
-    def test_speech_binary_response_not_utf8_error(self, omni_server) -> None:
-        """
-        Regression test: Verify binary audio is returned, not UTF-8 error.
-
-        This test ensures the multimodal_output property correctly retrieves
-        audio from completion outputs, preventing the "TTS model did not
-        produce audio output" error.
-        """
-        response = make_speech_request(
-            host=omni_server.host,
-            port=omni_server.port,
-            text="This should return binary audio, not a JSON error.",
-            voice="vivian",
-            language="English",
-        )
-
-        # Should NOT be a JSON error response
-        assert response.status_code == 200, f"Request failed: {response.text}"
-
-        # Verify it's binary audio, not JSON
-        try:
-            # If this succeeds and starts with {"error", it's a bug
-            text = response.content.decode("utf-8")
-            assert not text.startswith('{"error"'), f"Got error response instead of audio: {text}"
-        except UnicodeDecodeError:
-            # This is expected - binary audio can't be decoded as UTF-8
-            pass
-
-        assert verify_wav_audio(response.content), "Response is not valid WAV audio"
-
-
-class TestQwen3TTSAPIEndpoints:
-    """Test API endpoint functionality."""
-
-    @pytest.mark.core_model
-    @pytest.mark.omni
-    @hardware_test(res={"cuda": "L4"}, num_cards=4)
-    def test_list_voices_endpoint(self, omni_server) -> None:
-        """Test the /v1/audio/voices endpoint returns available voices."""
-        url = f"http://{omni_server.host}:{omni_server.port}/v1/audio/voices"
-
-        with httpx.Client(timeout=30.0) as client:
-            response = client.get(url)
-
-        assert response.status_code == 200
-        data = response.json()
-        assert "voices" in data
-        assert isinstance(data["voices"], list)
-        assert len(data["voices"]) > 0
-        # Check some expected voices are present
-        voices_lower = [v.lower() for v in data["voices"]]
-        assert "vivian" in voices_lower or "ryan" in voices_lower
-
-    @pytest.mark.core_model
-    @pytest.mark.omni
-    @hardware_test(res={"cuda": "L4"}, num_cards=4)
-    def test_models_endpoint(self, omni_server) -> None:
-        """Test the /v1/models endpoint returns loaded model."""
-        url = f"http://{omni_server.host}:{omni_server.port}/v1/models"
-
-        with httpx.Client(timeout=30.0) as client:
-            response = client.get(url)
-
-        assert response.status_code == 200
-        data = response.json()
-        assert "data" in data
-        assert len(data["data"]) > 0
-
-
-@pytest.fixture(scope="class")
-def omni_server_no_async_chunk():
-    """Start vLLM-Omni server with non-async-chunk config."""
-    stage_config_path = get_stage_config("qwen3_tts_no_async_chunk.yaml")
-
-    with OmniServer(
-        MODEL,
-        [
-            "--stage-configs-path",
-            stage_config_path,
-            "--stage-init-timeout",
-            "120",
-            "--trust-remote-code",
-            "--enforce-eager",
-            "--disable-log-stats",
-        ],
-    ) as server:
-        yield server
-
-
-class TestQwen3TTSNoAsyncChunk:
-    """E2E tests for Qwen3-TTS in non-async-chunk (full decode) mode."""
-
-    @pytest.mark.core_model
-    @pytest.mark.omni
-    @hardware_test(res={"cuda": "L4"}, num_cards=4)
-    def test_speech_english(self, omni_server_no_async_chunk) -> None:
-        """Test English TTS with non-async-chunk pipeline."""
-        response = make_speech_request(
-            host=omni_server_no_async_chunk.host,
-            port=omni_server_no_async_chunk.port,
-            text="Hello, how are you?",
-            voice="vivian",
-            language="English",
-        )
-
-        assert response.status_code == 200, f"Request failed: {response.text}"
-        assert response.headers.get("content-type") == "audio/wav"
-        assert verify_wav_audio(response.content), "Response is not valid WAV audio"
-        assert len(response.content) > MIN_AUDIO_BYTES
 
-    @pytest.mark.core_model
-    @pytest.mark.omni
-    @hardware_test(res={"cuda": "L4"}, num_cards=4)
-    def test_speech_chinese(self, omni_server_no_async_chunk) -> None:
-        """Test Chinese TTS with non-async-chunk pipeline."""
-        response = make_speech_request(
-            host=omni_server_no_async_chunk.host,
-            port=omni_server_no_async_chunk.port,
-            text="你好，我是通义千问",
-            voice="vivian",
-            language="Chinese",
-        )
+    openai_client.send_audio_speech_request(request_config, request_num=get_max_batch_size())
+
+
+@pytest.mark.core_model
+@pytest.mark.advanced_model
+@pytest.mark.omni
+@hardware_test(res={"cuda": "L4"}, num_cards=1)
+@pytest.mark.parametrize("omni_server", tts_server_params, indirect=True)
+def test_text_to_audio_002(omni_server, openai_client) -> None:
+    """
+    Test text input processing and audio output via OpenAI API.
+    Deploy Setting: default yaml
+    Input Modal: text
+    Output Modal: audio
+    Input Setting: stream=True
+    Datasets: single request
+    """
+    request_config = {
+        "model": omni_server.model,
+        "input": get_prompt(),
+        "stream": True,
+        "response_format": "wav",
+        "task_type": "CustomVoice",
+        "voice": "vivian",
+    }
 
-        assert response.status_code == 200, f"Request failed: {response.text}"
-        assert response.headers.get("content-type") == "audio/wav"
-        assert verify_wav_audio(response.content), "Response is not valid WAV audio"
-        assert len(response.content) > MIN_AUDIO_BYTES
+    openai_client.send_audio_speech_request(request_config)
diff --git a/tests/e2e/online_serving/test_qwen3_tts_expansion.py b/tests/e2e/online_serving/test_qwen3_tts_expansion.py
new file mode 100644
index 00000000000..0a3764c8a7d
--- /dev/null
+++ b/tests/e2e/online_serving/test_qwen3_tts_expansion.py
@@ -0,0 +1,137 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+E2E Online tests for Qwen3-TTS model with text input and audio output.
+
+These tests verify the /v1/audio/speech endpoint works correctly with
+actual model inference, not mocks.
+"""
+
+import os
+
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "0"
+
+from pathlib import Path
+
+import pytest
+
+from tests.conftest import OmniServerParams
+from tests.utils import hardware_test
+
+MODEL = "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"
+
+
+def get_stage_config(name: str = "qwen3_tts.yaml"):
+    """Get the stage config path from vllm_omni model_executor stage_configs."""
+    return str(Path(__file__).parent.parent.parent.parent / "vllm_omni" / "model_executor" / "stage_configs" / name)
+
+
+def get_prompt(prompt_type="english"):
+    """Text prompt for text-to-audio tests (same as test_qwen3_omni - beijing test case)."""
+    prompts = {
+        "english": "Beijing, China's capital, blends ancient wonders like the Great Wall with modern marvels. This vibrant metropolis offers rich culture, delicious Peking duck, and endless exploration opportunities.",
+        "chinese": "北京，中国的首都，是一座融合了紫禁城、长城等历史遗迹与现代建筑的国际化大都市，充满了独特的文化与活力",
+    }
+    return prompts.get(prompt_type, prompts["english"])
+
+
+def get_max_batch_size(size_type="few"):
+    """Batch size for concurrent requests (same as test_qwen3_omni)."""
+    batch_sizes = {"few": 5, "medium": 100, "large": 256}
+    return batch_sizes.get(size_type, 5)
+
+
+tts_server_params = [
+    pytest.param(
+        OmniServerParams(
+            model=MODEL,
+            stage_config_path=get_stage_config("qwen3_tts.yaml"),
+            server_args=["--trust-remote-code", "--disable-log-stats"],
+        ),
+        id="async_chunk",
+    ),
+    pytest.param(
+        OmniServerParams(
+            model=MODEL,
+            stage_config_path=get_stage_config("qwen3_tts_no_async_chunk.yaml"),
+            server_args=["--trust-remote-code", "--disable-log-stats"],
+        ),
+        id="no_async_chunk",
+    ),
+]
+
+
+@pytest.mark.advanced_model
+@pytest.mark.omni
+@hardware_test(res={"cuda": "L4"}, num_cards=1)
+@pytest.mark.parametrize("omni_server", tts_server_params, indirect=True)
+def test_language_001(omni_server, openai_client) -> None:
+    """
+    Test text input processing and audio output via OpenAI API.
+    Deploy Setting: default yaml
+    Input Modal: text
+    Output Modal: audio
+    Input Setting: stream=False, language=chinese
+    Datasets: few requests
+    """
+    request_config = {
+        "model": omni_server.model,
+        "input": get_prompt("chinese"),
+        "stream": False,
+        "response_format": "wav",
+        "task_type": "CustomVoice",
+        "voice": "vivian",
+    }
+
+    openai_client.send_audio_speech_request(request_config)
+
+
+@pytest.mark.advanced_model
+@pytest.mark.omni
+@hardware_test(res={"cuda": "L4"}, num_cards=1)
+@pytest.mark.parametrize("omni_server", tts_server_params, indirect=True)
+def test_voice_001(omni_server, openai_client) -> None:
+    """
+    Test text input processing and audio output via OpenAI API.
+    Deploy Setting: default yaml
+    Input Modal: text
+    Output Modal: audio
+    Input Setting: stream=False, voice=eric
+    Datasets: few requests
+    """
+    request_config = {
+        "model": omni_server.model,
+        "input": get_prompt(),
+        "stream": True,
+        "response_format": "wav",
+        "task_type": "CustomVoice",
+        "voice": "eric",
+    }
+
+    openai_client.send_audio_speech_request(request_config, request_num=get_max_batch_size())
+
+
+@pytest.mark.advanced_model
+@pytest.mark.omni
+@hardware_test(res={"cuda": "L4"}, num_cards=1)
+@pytest.mark.parametrize("omni_server", tts_server_params, indirect=True)
+def test_voice_002(omni_server, openai_client) -> None:
+    """
+    Test text input processing and audio output via OpenAI API.
+    Deploy Setting: default yaml
+    Input Modal: text
+    Output Modal: audio
+    Input Setting: stream=False, language=chinese
+    Datasets: few requests
+    """
+    request_config = {
+        "model": omni_server.model,
+        "input": get_prompt(),
+        "stream": False,
+        "response_format": "wav",
+        "task_type": "CustomVoice",
+        "voice": "Serena",
+    }
+
+    openai_client.send_audio_speech_request(request_config)

From e55129bea9479974ed587621df4ef45142e54d2d Mon Sep 17 00:00:00 2001
From: yenuo26 <410167048@qq.com>
Date: Mon, 16 Mar 2026 15:50:19 +0800
Subject: [PATCH 2/9] Refactor audio file handling to use UUIDs for temporary
 filenames

Co-authored-by: yenuo26 <410167048@example.com>
Co-authored-by: R2-Y <ruiruyang2@gmail.com>
Signed-off-by: yenuo26 <410167048@qq.com>
---
 tests/conftest.py | 28 ++++++++--------------------
 1 file changed, 8 insertions(+), 20 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index 18a2b8718cc..99c15354853 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -19,6 +19,7 @@
 import sys
 import threading
 import time
+import uuid
 from collections.abc import Generator
 from dataclasses import dataclass
 from io import BytesIO
@@ -827,9 +828,9 @@ def convert_audio_bytes_to_text(raw_bytes: bytes) -> str:
     """
     Write raw audio bytes to a temp WAV file suitable for Whisper/ffmpeg.
     Normalizes with soundfile to PCM_16 WAV when possible to avoid codec issues.
-    Caller must os.unlink the returned path when done.
     """
-    output_path = f"./test_{int(time.time())}.wav"
+    # Use high-entropy filename to avoid collisions under concurrency.
+    output_path = f"./test_{uuid.uuid4().hex}.wav"
     data, samplerate = sf.read(io.BytesIO(raw_bytes))
     sf.write(output_path, data, samplerate, format="WAV", subtype="PCM_16")
     text = convert_audio_file_to_text(output_path)
@@ -841,7 +842,7 @@ def merge_base64_and_convert_to_text(base64_list):
     Merge a list of base64 encoded audio data and convert to text.
     """
     merged_audio = _merge_base64_audio_to_segment(base64_list)
-    output_path = f"./test_{int(time.time())}.wav"
+    output_path = f"./test_{uuid.uuid4().hex}.wav"
     merged_audio.export(output_path, format="wav")
     print(f"audio data is saved: {output_path}")
     text = convert_audio_file_to_text(output_path)
@@ -1336,19 +1337,6 @@ def _get_merged_audio_bytes(audio_data: list[str] | None) -> bytes | None:
     return buf.getvalue()
 
 
-def _audio_tensor_to_wav_bytes(audio_tensor: Any, sample_rate: int = 24000) -> bytes:
-    """Convert offline pipeline audio tensor to WAV bytes (e.g. for Qwen3-TTS)."""
-    if hasattr(audio_tensor, "float"):
-        arr = audio_tensor.float().detach().cpu().numpy()
-    else:
-        arr = np.asarray(audio_tensor, dtype=np.float32)
-    if arr.ndim > 1:
-        arr = arr.flatten()
-    buf = io.BytesIO()
-    sf.write(buf, arr, sample_rate, format="WAV", subtype="FLOAT")
-    return buf.getvalue()
-
-
 def _estimate_voice_gender_from_audio(audio_bytes: bytes) -> str:
     """
     Estimate voice gender from WAV bytes by fundamental frequency (F0).
@@ -2121,6 +2109,7 @@ def __init__(
         )
 
     def _estimate_prompt_len(
+        self,
         additional_information: dict[str, Any],
         model_name: str,
         _cache: dict[str, Any] = {},
@@ -2462,7 +2451,7 @@ def send_request(self, request_config: dict[str, Any] | None = None) -> OmniResp
             prompts=prompts, videos=videos, images=images, audios=audios, modalities=modalities
         )
         response = self._process_output(outputs)
-        assert_omni_response(response, request_config, run_level="L2")
+        assert_omni_response(response, request_config, run_level="core_model")
         return response
 
     def send_audio_speech_request(
@@ -2515,10 +2504,9 @@ def send_audio_speech_request(
             return result
 
         # Convert tensor to WAV bytes for downstream analysis (transcription, gender estimation).
-        raw_bytes = _audio_tensor_to_wav_bytes(audio_tensor, sample_rate=24000)
-        result = OmniResponse(success=True, audio_bytes=raw_bytes)
+        result = OmniResponse(success=True)
         # Use advanced_model to enable similarity / voice checks when configured.
-        assert_audio_speech_response(result, request_config, run_level="advanced_model")
+        assert_audio_speech_response(result, request_config, run_level="core_model")
         return result
 
 

From 5092916a95d79be294281224b1ca81ef11b6bd76 Mon Sep 17 00:00:00 2001
From: yenuo26 <410167048@qq.com>
Date: Mon, 16 Mar 2026 16:19:29 +0800
Subject: [PATCH 3/9] Update dependencies and enhance CI testing for Qwen3-TTS

- Added `opencc>=1.2.0` to development dependencies in `pyproject.toml`.
- Introduced a test for Qwen3-TTS in the Buildkite pipeline.

Signed-off-by: yenuo26 <410167048@qq.com>
---
 .buildkite/test-merge.yml   | 22 ++++++++++++++++++++++
 .buildkite/test-nightly.yml |  3 ++-
 .buildkite/test-ready.yml   |  5 ++---
 pyproject.toml              |  3 ++-
 tests/conftest.py           | 33 ---------------------------------
 5 files changed, 28 insertions(+), 38 deletions(-)

diff --git a/.buildkite/test-merge.yml b/.buildkite/test-merge.yml
index 5479f8ac1e8..42ba0c1f1b4 100644
--- a/.buildkite/test-merge.yml
+++ b/.buildkite/test-merge.yml
@@ -200,6 +200,28 @@ steps:
           volumes:
             - "/fsx/hf_cache:/fsx/hf_cache"
 
+  - label: "Qwen3-TTS E2E Test"
+    depends_on: upload-merge-pipeline
+    commands:
+      - |
+        timeout 20m bash -c '
+          export VLLM_LOGGING_LEVEL=DEBUG
+          export VLLM_WORKER_MULTIPROC_METHOD=spawn
+          pytest -s -v tests/e2e/online_serving/test_qwen3_tts.py -m "advanced_model" --run-level "advanced_model" && pytest -s -v tests/e2e/offline/test_qwen3_tts.py
+        '
+    agents:
+      queue: "gpu_1_queue"
+    plugins:
+      - docker#v5.2.0:
+          image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+          always-pull: true
+          propagate-environment: true
+          shm-size: "8gb"
+          environment:
+            - "HF_HOME=/fsx/hf_cache"
+          volumes:
+            - "/fsx/hf_cache:/fsx/hf_cache"
+
   - label: "Omni Model Test with H100"
     timeout_in_minutes: 30
     depends_on: upload-merge-pipeline
diff --git a/.buildkite/test-nightly.yml b/.buildkite/test-nightly.yml
index 7c8c4a89eb6..8dcd11542d6 100644
--- a/.buildkite/test-nightly.yml
+++ b/.buildkite/test-nightly.yml
@@ -47,7 +47,8 @@ steps:
     if: build.env("NIGHTLY") == "1"
     commands:
       - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-      - pytest -s -v tests/examples/online_serving/test_qwen2_5_omni.py -m "advanced_model" --run-level "advanced_model"
+      - pytest -s -v examples/ -m "advanced_model and L4" --run-level "advanced_model"
+      - pytest -s -v e2e/online_serving/test_*_expansion.py -m "advanced_model and L4" --run-level "advanced_model"
     agents:
       queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU
     plugins:
diff --git a/.buildkite/test-ready.yml b/.buildkite/test-ready.yml
index 66598f3eaf6..cf479e1398c 100644
--- a/.buildkite/test-ready.yml
+++ b/.buildkite/test-ready.yml
@@ -206,13 +206,12 @@ steps:
     commands:
       - |
         timeout 20m bash -c '
-          huggingface-cli download Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice
           export VLLM_LOGGING_LEVEL=DEBUG
           export VLLM_WORKER_MULTIPROC_METHOD=spawn
-          pytest -s -v tests/e2e/online_serving/test_qwen3_tts.py -k "not NoAsyncChunk"
+          pytest -s -v tests/e2e/online_serving/test_qwen3_tts.py -m "core_model" --run-level "core_model"
         '
     agents:
-      queue: "gpu_4_queue"
+      queue: "gpu_1_queue"
     plugins:
       - docker#v5.2.0:
           image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
diff --git a/pyproject.toml b/pyproject.toml
index 5bcfab7d2d6..f42a84f0fa4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -51,7 +51,8 @@ dev = [
     "mooncake-transfer-engine==0.3.8.post1",
     "av", # for ComfyUI tests
     "openpyxl>=3.0.0", # for nightly CI
-    "pyttsx3>=2.99"
+    "pyttsx3>=2.99",
+    "opencc>=1.2.0"
 ]
 
 docs = [
diff --git a/tests/conftest.py b/tests/conftest.py
index 99c15354853..12d06193054 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1305,38 +1305,6 @@ class DiffusionResponse:
     error_message: str | None = None
 
 
-def _extract_expected_voice_gender(messages: list[dict[str, Any]] | None) -> str | None:
-    """Parse expected voice gender from system prompt text ('male'/'female') if present."""
-    if not messages:
-        return None
-    for msg in messages:
-        if msg.get("role") != "system":
-            continue
-        content = msg.get("content")
-        if isinstance(content, str):
-            text = content
-        elif isinstance(content, list):
-            text = " ".join(
-                item.get("text", "") for item in content if isinstance(item, dict) and item.get("type") == "text"
-            )
-        else:
-            text = ""
-        m = re.search(r"\buse a (male|female) voice\b", text, flags=re.IGNORECASE)
-        if m:
-            return m.group(1).lower()
-    return None
-
-
-def _get_merged_audio_bytes(audio_data: list[str] | None) -> bytes | None:
-    """Merge base64 audio chunks (stream or single) into one WAV bytes for analysis."""
-    if not audio_data:
-        return None
-    merged = _merge_base64_audio_to_segment(audio_data)
-    buf = io.BytesIO()
-    merged.export(buf, format="wav")
-    return buf.getvalue()
-
-
 def _estimate_voice_gender_from_audio(audio_bytes: bytes) -> str:
     """
     Estimate voice gender from WAV bytes by fundamental frequency (F0).
@@ -2505,7 +2473,6 @@ def send_audio_speech_request(
 
         # Convert tensor to WAV bytes for downstream analysis (transcription, gender estimation).
         result = OmniResponse(success=True)
-        # Use advanced_model to enable similarity / voice checks when configured.
         assert_audio_speech_response(result, request_config, run_level="core_model")
         return result
 

From 8b05df2daade5089c17e0b5389d186b831ea58eb Mon Sep 17 00:00:00 2001
From: yenuo26 <410167048@qq.com>
Date: Tue, 17 Mar 2026 18:55:59 +0800
Subject: [PATCH 4/9] Implement gender classification pipeline for audio input
 Signed-off-by: yenuo26 <410167048@qq.com>

Signed-off-by: yenuo26 <410167048@qq.com>
---
 tests/conftest.py | 139 +++++++++++++++++++++++-----------------------
 1 file changed, 70 insertions(+), 69 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index 12d06193054..88e28660909 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -35,6 +35,7 @@
 import yaml
 from openai import OpenAI, omit
 from PIL import Image
+from transformers import pipeline
 from vllm import TextPrompt
 from vllm.distributed.parallel_state import cleanup_dist_env_and_memory
 from vllm.logger import init_logger
@@ -50,6 +51,8 @@
 PromptImageInput = list[Any] | Any | None
 PromptVideoInput = list[Any] | Any | None
 
+_GENDER_PIPELINE = None
+
 
 class OmniServerParams(NamedTuple):
     model: str
@@ -84,7 +87,6 @@ def assert_video_valid(frames: Path | np.ndarray, *, width: int, height: int, nu
 
 def assert_audio_valid(path: Path, *, sample_rate: int, channels: int, duration_s: float) -> None:
     """Assert the WAV has the expected sample rate, channel count, and duration."""
-
     assert path.exists(), f"Audio not found: {path}"
     info = sf.info(str(path))
     assert info.samplerate == sample_rate, f"Expected sample_rate={sample_rate}, got {info.samplerate}"
@@ -732,6 +734,12 @@ def preprocess_text(text):
     text = re.sub(r"\s+", " ", text)
     cc = opencc.OpenCC("t2s")
     text = cc.convert(text)
+
+    # Special handling for spaces between Chinese characters:
+    # - Keep single spaces between English words/numbers
+    # - Remove spaces only when surrounded by Chinese characters on both sides to prevent incorrect word segmentation
+    text = re.sub(r"(?<=[\u4e00-\u9fff])\s+(?=[\u4e00-\u9fff])", "", text)
+
     return text.lower().strip()
 
 
@@ -1305,83 +1313,76 @@ class DiffusionResponse:
     error_message: str | None = None
 
 
+def _load_gender_pipeline():
+    """
+    Lazy-load a cached audio-classification pipeline for gender.
+
+    We prefer the pipeline wrapper because it encapsulates processor/model loading
+    and avoids direct AutoProcessor.from_pretrained call sites in this file.
+    """
+    global _GENDER_PIPELINE
+    if _GENDER_PIPELINE is not None:
+        return _GENDER_PIPELINE
+
+    model_name = "7wolf/wav2vec2-base-gender-classification"
+    try:
+        # device=-1 forces CPU for pipeline.
+        _GENDER_PIPELINE = pipeline(
+            task="audio-classification",
+            model=model_name,
+            device=-1,
+        )
+        return _GENDER_PIPELINE
+    except Exception as exc:  # pragma: no cover - best-effort fallback
+        print(f"Warning: failed to create gender pipeline '{model_name}': {exc}")
+        _GENDER_PIPELINE = None
+        return None
+
+
 def _estimate_voice_gender_from_audio(audio_bytes: bytes) -> str:
     """
-    Estimate voice gender from WAV bytes by fundamental frequency (F0).
-    Returns 'male', 'female', or 'unknown' when ambiguous.
-    Uses multiple windows + consistency check to reduce misclassification.
+    Estimate voice gender from audio using a small pre-trained classification model.
+
+    Uses a cached `audio-classification` pipeline to classify the clip.
+    Returns 'male' / 'female' when the model confidence is >= 0.8 and the label
+    maps to one of these; otherwise returns 'unknown'. If the model is unavailable
+    or inference fails, returns 'unknown' to keep tests stable.
     """
     data, sr = sf.read(io.BytesIO(audio_bytes), dtype="float32", always_2d=True)
     if data.size == 0:
         raise ValueError("Empty audio")
     mono = np.mean(data, axis=1)
 
-    def _estimate_f0(seg: np.ndarray) -> float | None:
-        seg = seg - float(np.mean(seg))
-        if float(np.max(np.abs(seg))) < 1e-3:
-            return None
-        min_lag = max(2, int(sr / 350))
-        max_lag = min(len(seg) // 2, int(sr / 60))
-        if max_lag <= min_lag + 2:
-            return None
-        corr = np.correlate(seg, seg, mode="same")
-        mid = len(corr) // 2
-        valid = corr[mid + min_lag : mid + max_lag + 1]
-        the = float(np.max(valid)) * 0.5
-        peaks: list[tuple[int, float]] = []
-        for i in range(1, len(valid) - 1):
-            v = float(valid[i])
-            if v > the and v >= float(valid[i - 1]) and v >= float(valid[i + 1]):
-                peaks.append((min_lag + i, v))
-        if not peaks:
-            lag = min_lag + int(np.argmax(valid))
-            return float(sr) / float(lag)
-        best_v = max(v for _, v in peaks)
-        close = [(lag, v) for lag, v in peaks if v >= best_v * 0.85]
-        lag = max(lag for lag, _ in close)
-        cand2 = [(lag2, v) for lag2, v in peaks if abs(lag2 - 2 * lag) <= max(2, int(0.03 * (2 * lag)))]
-        if cand2:
-            best2 = max(v for _, v in cand2)
-            if best2 >= best_v * 0.75:
-                lag = max(lag_val for lag_val, _ in cand2)
-        return float(sr) / float(lag)
-
-    win_len = min(int(sr * 0.6), len(mono))
-    if win_len < int(sr * 0.15):
-        raise ValueError("Cannot estimate F0: audio too short")
-    # Use more windows (6) over the duration for a more stable and representative F0 distribution.
-    num_windows = 6
-    step = max(1, (len(mono) - win_len) // max(1, num_windows - 1))
-    starts = [min(i * step, len(mono) - win_len) for i in range(num_windows)]
-    starts = [s for s in starts if s >= 0]
-    f0s = [f for f in (_estimate_f0(mono[s : s + win_len]) for s in starts) if f is not None]
-    if not f0s:
-        raise ValueError("Cannot estimate F0: no voiced segment found")
-    f0s.sort()
-    n = len(f0s)
-    median_f0 = float(f0s[n // 2])
-
-    # Conservative thresholds; only classify when F0 is clearly in one range and windows agree.
-    MALE_THRESH = 102.0  # below this → candidate male
-    FEMALE_THRESH = 198.0  # above this → candidate female
-    low_count = sum(1 for f in f0s if f < MALE_THRESH)
-    high_count = sum(1 for f in f0s if f > FEMALE_THRESH)
-    # Require majority of windows to agree with the median band to avoid harmonic/subharmonic flips.
-    if median_f0 < MALE_THRESH and low_count >= (n + 1) // 2:
-        # Check for possible female misread as male (e.g. 250 Hz → 125 Hz): many high F0s.
-        if high_count >= 2:
-            result = "unknown"
-        else:
-            result = "male"
-    elif median_f0 > FEMALE_THRESH and high_count >= (n + 1) // 2:
-        if low_count >= 2:
-            result = "unknown"
+    try:
+        clf = _load_gender_pipeline()
+        if clf is None:
+            print("gender model not available, returning 'unknown'")
+            return "unknown"
+
+        # transformers pipeline returns a list of {label, score} (highest score first).
+        outputs = clf(mono, sampling_rate=sr)
+        if not outputs:
+            return "unknown"
+
+        top = outputs[0]
+        label = str(top.get("label", "")).lower()
+        conf = float(top.get("score", 0.0))
+
+        if conf < 0.8:
+            gender = "unknown"
+        # Some models use non-English labels (e.g., Russian). Normalize to 'male'/'female'.
+        elif ("female" in label) or ("жен" in label):
+            gender = "female"
+        elif ("male" in label) or ("муж" in label):
+            gender = "male"
         else:
-            result = "female"
-    else:
-        result = "unknown"
-    print(f"estimated f0 median: {median_f0:.1f} Hz (min={f0s[0]:.1f}, max={f0s[-1]:.1f}); gender: {result}")
-    return result
+            gender = "unknown"
+
+        print(f"gender classifier: label={label}, conf={conf:.3f}, gender={gender}")
+        return gender
+    except Exception as exc:  # pragma: no cover - best-effort fallback
+        print(f"Warning: gender classification failed, returning 'unknown': {exc}")
+        return "unknown"
 
 
 def assert_omni_response(response: OmniResponse, request_config: dict[str, Any], run_level):

From 61f511cb99e26bba93b17fa62cc6df788b2848a1 Mon Sep 17 00:00:00 2001
From: yenuo26 <410167048@qq.com>
Date: Tue, 17 Mar 2026 19:33:36 +0800
Subject: [PATCH 5/9] add tts-base test case

Signed-off-by: yenuo26 <410167048@qq.com>
---
 tests/conftest.py                             |  15 +-
 .../offline_inference/test_qwen3_tts_base.py  |  83 +++++++
 ...3_tts.py => test_qwen3_tts_customvoice.py} |  12 +-
 .../e2e/online_serving/test_qwen3_tts_base.py | 215 +++++++-----------
 .../test_qwen3_tts_base_expansion.py          |  92 ++++++++
 ...3_tts.py => test_qwen3_tts_customvoice.py} |   0
 ...> test_qwen3_tts_customvoice_expansion.py} |   0
 7 files changed, 275 insertions(+), 142 deletions(-)
 create mode 100644 tests/e2e/offline_inference/test_qwen3_tts_base.py
 rename tests/e2e/offline_inference/{test_qwen3_tts.py => test_qwen3_tts_customvoice.py} (81%)
 create mode 100644 tests/e2e/online_serving/test_qwen3_tts_base_expansion.py
 rename tests/e2e/online_serving/{test_qwen3_tts.py => test_qwen3_tts_customvoice.py} (100%)
 rename tests/e2e/online_serving/{test_qwen3_tts_expansion.py => test_qwen3_tts_customvoice_expansion.py} (100%)

diff --git a/tests/conftest.py b/tests/conftest.py
index 88e28660909..c4d6bfbb16f 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1887,7 +1887,8 @@ def send_audio_speech_request(self, request_config: dict[str, Any], request_num:
 
         # Qwen3-TTS custom fields, forwarded via extra_body.
         extra_body: dict[str, Any] = {}
-        for key in ("task_type", "ref_text", "ref_audio"):
+        # Keep this list aligned with vllm_omni.entrypoints.openai.protocol.audio params.
+        for key in ("task_type", "ref_text", "ref_audio", "language", "max_new_tokens"):
             if key in request_config:
                 extra_body[key] = request_config[key]
 
@@ -2177,6 +2178,8 @@ def get_omni_inputs(
             speaker = tts_kw.get("speaker", "Vivian")
             language = tts_kw.get("language", "Auto")
             max_new_tokens = int(tts_kw.get("max_new_tokens", 2048))
+            ref_audio = tts_kw.get("ref_audio", None)
+            ref_text = tts_kw.get("ref_text", None)
 
             omni_inputs: list[TextPrompt] = []
             for prompt_text in prompts:
@@ -2188,6 +2191,10 @@ def get_omni_inputs(
                     "speaker": [speaker],
                     "max_new_tokens": [max_new_tokens],
                 }
+                if ref_audio is not None:
+                    additional_information["ref_audio"] = [ref_audio]
+                if ref_text is not None:
+                    additional_information["ref_text"] = [ref_text]
                 # Use official helper to get correct placeholder length
                 plen = self._estimate_prompt_len(additional_information, self.model_name)
                 input_dict: TextPrompt = {
@@ -2452,6 +2459,10 @@ def send_audio_speech_request(
             mm_processor_kwargs["speaker"] = request_config["voice"]
         if "task_type" in request_config:
             mm_processor_kwargs["task_type"] = request_config["task_type"]
+        if "ref_audio" in request_config:
+            mm_processor_kwargs["ref_audio"] = request_config["ref_audio"]
+        if "ref_text" in request_config:
+            mm_processor_kwargs["ref_text"] = request_config["ref_text"]
         if "language" in request_config:
             mm_processor_kwargs["language"] = request_config["language"]
         if "max_new_tokens" in request_config:
@@ -2472,7 +2483,7 @@ def send_audio_speech_request(
             assert result.success, result.error_message
             return result
 
-        # Convert tensor to WAV bytes for downstream analysis (transcription, gender estimation).
+        # Keep the offline path lightweight: downstream validation can be done elsewhere if needed.
         result = OmniResponse(success=True)
         assert_audio_speech_response(result, request_config, run_level="core_model")
         return result
diff --git a/tests/e2e/offline_inference/test_qwen3_tts_base.py b/tests/e2e/offline_inference/test_qwen3_tts_base.py
new file mode 100644
index 00000000000..b45f12bc698
--- /dev/null
+++ b/tests/e2e/offline_inference/test_qwen3_tts_base.py
@@ -0,0 +1,83 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+E2E offline tests for Qwen3-TTS Base model with text input and audio output.
+
+Async_chunk disable, cuda_graph disabled (no_async_chunk stage config).
+CUDA graph is disabled by setting engine_args.enforce_eager=true via modify_stage_config().
+Same structure as test_qwen3_omni (models, stage_configs, test_params, parametrize omni_runner).
+"""
+
+import os
+
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "0"
+
+from pathlib import Path
+
+import pytest
+
+from tests.conftest import modify_stage_config
+from tests.utils import hardware_test
+
+MODEL = "Qwen/Qwen3-TTS-12Hz-1.7B-Base"
+REF_AUDIO_URL = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-TTS-Repo/clone_2.wav"
+REF_TEXT = "Okay. Yeah. I resent you. I love you. I respect you. But you know what? You blew it! And thanks to you."
+
+
+def get_cuda_graph_config():
+    path = modify_stage_config(
+        get_stage_config(),
+        updates={
+            "stage_args": {
+                0: {
+                    "engine_args.enforce_eager": "true",
+                },
+                1: {"engine_args.enforce_eager": "true"},
+            },
+        },
+    )
+    return path
+
+
+def get_stage_config(name: str = "qwen3_tts_no_async_chunk.yaml"):
+    """Get the no_async_chunk stage config path (async_chunk disable, cuda_graph disabled)."""
+    return str(Path(__file__).parent.parent.parent.parent / "vllm_omni" / "model_executor" / "stage_configs" / name)
+
+
+# Same structure as test_qwen3_omni: models, stage_configs, test_params
+tts_server_params = [
+    pytest.param(
+        (MODEL, get_cuda_graph_config()),
+        id="no_cuda_graph",
+    )
+]
+
+
+def get_prompt():
+    """Text prompt for text-to-audio (same role as get_question in test_qwen3_omni)."""
+    return "Hello, this is a test for text to audio."
+
+
+@pytest.mark.advanced_model
+@pytest.mark.omni
+@hardware_test(res={"cuda": "L4"}, num_cards=1)
+@pytest.mark.parametrize("omni_runner", tts_server_params, indirect=True)
+def test_text_to_audio_001(omni_runner, omni_runner_handler) -> None:
+    """
+    Test text input processing and audio output via offline Omni runner.
+    Deploy Setting: qwen3_tts_no_async_chunk.yaml + enforce_eager=true
+    Input Modal: text
+    Output Modal: audio
+    Input Setting: stream=False
+    Extra Setting: task_type=Base, voice=clone, ref_audio/ref_text provided
+    Datasets: few requests
+    """
+    request_config = {
+        "input": get_prompt(),
+        "task_type": "Base",
+        "voice": "clone",
+        "ref_audio": REF_AUDIO_URL,
+        "ref_text": REF_TEXT,
+    }
+    omni_runner_handler.send_audio_speech_request(request_config)
diff --git a/tests/e2e/offline_inference/test_qwen3_tts.py b/tests/e2e/offline_inference/test_qwen3_tts_customvoice.py
similarity index 81%
rename from tests/e2e/offline_inference/test_qwen3_tts.py
rename to tests/e2e/offline_inference/test_qwen3_tts_customvoice.py
index e26657901ae..67d72df908c 100644
--- a/tests/e2e/offline_inference/test_qwen3_tts.py
+++ b/tests/e2e/offline_inference/test_qwen3_tts_customvoice.py
@@ -1,9 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
-E2E offline tests for Qwen3-TTS model with text input and audio output.
+E2E offline tests for Qwen3-TTS CustomVoice model with text input and audio output.
 
 Async_chunk disable, cuda_graph disabled (no_async_chunk stage config).
+CUDA graph is disabled by setting engine_args.enforce_eager=true via modify_stage_config().
 Same structure as test_qwen3_omni (models, stage_configs, test_params, parametrize omni_runner).
 """
 
@@ -62,9 +63,12 @@ def get_prompt():
 @pytest.mark.parametrize("omni_runner", tts_server_params, indirect=True)
 def test_text_to_audio_001(omni_runner, omni_runner_handler) -> None:
     """
-    Text-to-audio E2E (offline): pure text input, audio output.
-    Async_chunk disable, cuda_graph disabled. Single request.
-    Uses send_audio_speech_request for TT-S-specific validation (same style as online /v1/audio/speech).
+    Test text input processing and audio output via offline Omni runner.
+    Deploy Setting: qwen3_tts_no_async_chunk.yaml + enforce_eager=true
+    Input Modal: text
+    Output Modal: audio
+    Input Setting: stream=False
+    Datasets: few requests
     """
     request_config = {"input": get_prompt(), "voice": "vivian"}
     omni_runner_handler.send_audio_speech_request(request_config)
diff --git a/tests/e2e/online_serving/test_qwen3_tts_base.py b/tests/e2e/online_serving/test_qwen3_tts_base.py
index 9dd662f084d..79c20f36872 100644
--- a/tests/e2e/online_serving/test_qwen3_tts_base.py
+++ b/tests/e2e/online_serving/test_qwen3_tts_base.py
@@ -1,10 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
-E2E tests for Qwen3-TTS Base voice-clone model.
+E2E Online tests for Qwen3-TTS model with text input and audio output.
 
-Regression test for #1663: speech tokenizer loaded in bfloat16 produced
-all-silence PCM due to NaN overflow in SnakeBeta activation.
+These tests verify the /v1/audio/speech endpoint works correctly with
+actual model inference, not mocks.
 """
 
 import os
@@ -12,156 +12,99 @@
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
 os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "0"
 
-import struct
-import tempfile
 from pathlib import Path
 
-import httpx
 import pytest
 
-from tests.conftest import (
-    OmniServer,
-    convert_audio_file_to_text,
-    cosine_similarity_text,
-)
+from tests.conftest import OmniServerParams
 from tests.utils import hardware_test
 
 MODEL = "Qwen/Qwen3-TTS-12Hz-1.7B-Base"
 
-# Official Qwen3-TTS reference audio/text from examples/offline_inference/qwen3_tts/end2end.py
 REF_AUDIO_URL = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-TTS-Repo/clone_2.wav"
 REF_TEXT = "Okay. Yeah. I resent you. I love you. I respect you. But you know what? You blew it! And thanks to you."
-SYN_TEXT = "Good one. Okay, fine, I'm just gonna leave this sock monkey here. Goodbye."
 
 
-def get_stage_config():
-    return str(
-        Path(__file__).parent.parent.parent.parent / "vllm_omni" / "model_executor" / "stage_configs" / "qwen3_tts.yaml"
-    )
+def get_stage_config(name: str = "qwen3_tts.yaml"):
+    """Get the stage config path from vllm_omni model_executor stage_configs."""
+    return str(Path(__file__).parent.parent.parent.parent / "vllm_omni" / "model_executor" / "stage_configs" / name)
 
 
-@pytest.fixture(scope="module")
-def omni_server():
-    """Start vLLM-Omni server with Base voice-clone model."""
-    stage_config_path = get_stage_config()
-
-    with OmniServer(
-        MODEL,
-        [
-            "--stage-configs-path",
-            stage_config_path,
-            "--stage-init-timeout",
-            "120",
-            "--trust-remote-code",
-            "--enforce-eager",
-            "--disable-log-stats",
-        ],
-    ) as server:
-        yield server
-
-
-def make_base_speech_request(
-    host: str,
-    port: int,
-    text: str = SYN_TEXT,
-    ref_text: str = REF_TEXT,
-    ref_audio: str = REF_AUDIO_URL,
-    response_format: str = "wav",
-    timeout: float = 120.0,
-) -> httpx.Response:
-    url = f"http://{host}:{port}/v1/audio/speech"
-    payload = {
-        "model": MODEL,
-        "input": text,
-        "task_type": "Base",
-        "ref_text": ref_text,
-        "ref_audio": ref_audio,
-        "response_format": response_format,
+def get_prompt(prompt_type="text"):
+    """Text prompt for text-to-audio tests (same as test_qwen3_omni - beijing test case)."""
+    prompts = {
+        "text": "The weather is nice today, perfect for a walk in the park.",
     }
-    with httpx.Client(timeout=timeout) as client:
-        return client.post(url, json=payload)
+    return prompts.get(prompt_type, prompts["text"])
 
 
-def verify_wav_audio(content: bytes) -> bool:
-    if len(content) < 44:
-        return False
-    return content[:4] == b"RIFF" and content[8:12] == b"WAVE"
+def get_max_batch_size(size_type="few"):
+    """Batch size for concurrent requests (same as test_qwen3_omni)."""
+    batch_sizes = {"few": 5, "medium": 100, "large": 256}
+    return batch_sizes.get(size_type, 5)
 
 
-def assert_not_silence(pcm_bytes: bytes):
-    """Assert PCM16 samples are not all identical (e.g. all-silence)."""
-    samples = struct.unpack(f"<{len(pcm_bytes) // 2}h", pcm_bytes)
-    unique = set(samples)
-    assert len(unique) > 1, (
-        f"All-silence detected: {len(samples)} samples, unique values: {unique}. "
-        "See https://github.com/vllm-project/vllm-omni/issues/1663"
+tts_server_params = [
+    pytest.param(
+        OmniServerParams(
+            model=MODEL,
+            stage_config_path=get_stage_config("qwen3_tts.yaml"),
+            server_args=["--trust-remote-code", "--disable-log-stats"],
+        ),
+        id="async_chunk",
     )
-
-
-MIN_AUDIO_BYTES = 10000
-
-
-class TestQwen3TTSBaseVoiceClone:
-    """Regression tests for Base voice-clone (fix #1663)."""
-
-    @pytest.mark.core_model
-    @pytest.mark.omni
-    @hardware_test(res={"cuda": "L4"}, num_cards=4)
-    def test_base_voice_clone_not_silence(self, omni_server) -> None:
-        """PCM output must contain real audio, not all-silence."""
-        response = make_base_speech_request(
-            host=omni_server.host,
-            port=omni_server.port,
-            response_format="pcm",
-        )
-
-        assert response.status_code == 200, f"Request failed: {response.text}"
-        assert len(response.content) > MIN_AUDIO_BYTES, f"Audio too small: {len(response.content)} bytes"
-        assert_not_silence(response.content)
-
-    @pytest.mark.core_model
-    @pytest.mark.omni
-    @hardware_test(res={"cuda": "L4"}, num_cards=4)
-    def test_base_voice_clone_whisper_transcription(self, omni_server) -> None:
-        """Whisper must transcribe the output as intelligible speech."""
-        response = make_base_speech_request(
-            host=omni_server.host,
-            port=omni_server.port,
-            response_format="wav",
-        )
-
-        assert response.status_code == 200, f"Request failed: {response.text}"
-        assert verify_wav_audio(response.content), "Response is not valid WAV"
-        assert len(response.content) > MIN_AUDIO_BYTES
-
-        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
-            f.write(response.content)
-            wav_path = f.name
-
-        try:
-            transcript = convert_audio_file_to_text(wav_path)
-            print(f"Whisper transcript: {transcript}")
-            assert len(transcript.strip()) > 0, "Whisper returned empty transcript — audio is likely silence"
-            similarity = cosine_similarity_text(transcript.lower(), SYN_TEXT.lower())
-            print(f"Cosine similarity: {similarity:.3f}")
-            assert similarity > 0.9, (
-                f"Transcript doesn't match input: similarity={similarity:.2f}, transcript='{transcript}'"
-            )
-        finally:
-            os.unlink(wav_path)
-
-    @pytest.mark.core_model
-    @pytest.mark.omni
-    @hardware_test(res={"cuda": "L4"}, num_cards=4)
-    def test_base_voice_clone_wav_format(self, omni_server) -> None:
-        """WAV response must have valid headers and sufficient size."""
-        response = make_base_speech_request(
-            host=omni_server.host,
-            port=omni_server.port,
-            response_format="wav",
-        )
-
-        assert response.status_code == 200, f"Request failed: {response.text}"
-        assert response.headers.get("content-type") == "audio/wav"
-        assert verify_wav_audio(response.content), "Response is not valid WAV"
-        assert len(response.content) > MIN_AUDIO_BYTES
+]
+
+
+@pytest.mark.advanced_model
+@pytest.mark.core_model
+@pytest.mark.omni
+@hardware_test(res={"cuda": "L4"}, num_cards=1)
+@pytest.mark.parametrize("omni_server", tts_server_params, indirect=True)
+def test_text_to_audio_001(omni_server, openai_client) -> None:
+    """
+    Test text input processing and audio output via OpenAI API.
+    Deploy Setting: default yaml
+    Input Modal: text
+    Output Modal: audio
+    Input Setting: stream=False
+    Datasets: few requests
+    """
+
+    request_config = {
+        "model": omni_server.model,
+        "input": get_prompt(),
+        "stream": False,
+        "response_format": "wav",
+        "task_type": "Base",
+        "voice": "clone",
+        "ref_audio": REF_AUDIO_URL,
+        "ref_text": REF_TEXT,
+    }
+    openai_client.send_audio_speech_request(request_config, request_num=get_max_batch_size("few"))
+
+
+@pytest.mark.advanced_model
+@pytest.mark.omni
+@hardware_test(res={"cuda": "L4"}, num_cards=1)
+@pytest.mark.parametrize("omni_server", tts_server_params, indirect=True)
+def test_text_to_audio_002(omni_server, openai_client) -> None:
+    """
+    Test text input processing and audio output via OpenAI API.
+    Deploy Setting: default yaml
+    Input Modal: text
+    Output Modal: audio
+    Input Setting: stream=True
+    Datasets: single request
+    """
+    request_config = {
+        "model": omni_server.model,
+        "input": "The weather is nice today, perfect for a walk in the park.",
+        "stream": True,
+        "response_format": "wav",
+        "task_type": "Base",
+        "voice": "clone",
+        "ref_audio": REF_AUDIO_URL,
+        "ref_text": REF_TEXT,
+    }
+    openai_client.send_audio_speech_request(request_config)
diff --git a/tests/e2e/online_serving/test_qwen3_tts_base_expansion.py b/tests/e2e/online_serving/test_qwen3_tts_base_expansion.py
new file mode 100644
index 00000000000..73d16e89d31
--- /dev/null
+++ b/tests/e2e/online_serving/test_qwen3_tts_base_expansion.py
@@ -0,0 +1,92 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+E2E Online tests for Qwen3-TTS model with text input and audio output.
+
+These tests verify the /v1/audio/speech endpoint works correctly with
+actual model inference, not mocks.
+"""
+
+import os
+
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "0"
+
+from pathlib import Path
+
+import pytest
+
+from tests.conftest import OmniServerParams
+from tests.utils import hardware_test
+
+MODEL = "Qwen/Qwen3-TTS-12Hz-1.7B-Base"
+
+REF_AUDIO_URL = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-TTS-Repo/clone_2.wav"
+REF_TEXT = "Okay. Yeah. I resent you. I love you. I respect you. But you know what? You blew it! And thanks to you."
+
+
+def get_stage_config(name: str = "qwen3_tts.yaml"):
+    """Get the stage config path from vllm_omni model_executor stage_configs."""
+    return str(Path(__file__).parent.parent.parent.parent / "vllm_omni" / "model_executor" / "stage_configs" / name)
+
+
+def get_prompt(prompt_type="text"):
+    """Text prompt for text-to-audio tests (same as test_qwen3_omni - beijing test case)."""
+    prompts = {
+        "text": "The weather is nice today, perfect for a walk in the park.",
+    }
+    return prompts.get(prompt_type, prompts["text"])
+
+
+def get_max_batch_size(size_type="few"):
+    """Batch size for concurrent requests (same as test_qwen3_omni)."""
+    batch_sizes = {"few": 5, "medium": 100, "large": 256}
+    return batch_sizes.get(size_type, 5)
+
+
+tts_server_params = [
+    pytest.param(
+        OmniServerParams(
+            model=MODEL,
+            stage_config_path=get_stage_config("qwen3_tts.yaml"),
+            server_args=["--trust-remote-code", "--disable-log-stats"],
+        ),
+        id="async_chunk",
+    ),
+    pytest.param(
+        OmniServerParams(
+            model=MODEL,
+            stage_config_path=get_stage_config("qwen3_tts_no_async_chunk.yaml"),
+            server_args=["--trust-remote-code", "--disable-log-stats"],
+        ),
+        id="no_async_chunk",
+    ),
+]
+
+
+@pytest.mark.advanced_model
+@pytest.mark.core_model
+@pytest.mark.omni
+@hardware_test(res={"cuda": "L4"}, num_cards=1)
+@pytest.mark.parametrize("omni_server", tts_server_params, indirect=True)
+def test_voice_clone_streaming_001(omni_server, openai_client) -> None:
+    """
+    Test text input processing and audio output via OpenAI API.
+    Deploy Setting: default yaml
+    Input Modal: text
+    Output Modal: audio
+    Input Setting: stream=True
+    Datasets: few requests
+    """
+
+    request_config = {
+        "model": omni_server.model,
+        "input": get_prompt(),
+        "stream": True,
+        "response_format": "wav",
+        "task_type": "Base",
+        "voice": "clone",
+        "ref_audio": REF_AUDIO_URL,
+        "ref_text": REF_TEXT,
+    }
+    openai_client.send_audio_speech_request(request_config, request_num=get_max_batch_size("few"))
diff --git a/tests/e2e/online_serving/test_qwen3_tts.py b/tests/e2e/online_serving/test_qwen3_tts_customvoice.py
similarity index 100%
rename from tests/e2e/online_serving/test_qwen3_tts.py
rename to tests/e2e/online_serving/test_qwen3_tts_customvoice.py
diff --git a/tests/e2e/online_serving/test_qwen3_tts_expansion.py b/tests/e2e/online_serving/test_qwen3_tts_customvoice_expansion.py
similarity index 100%
rename from tests/e2e/online_serving/test_qwen3_tts_expansion.py
rename to tests/e2e/online_serving/test_qwen3_tts_customvoice_expansion.py

From a3896239838beb060fb071453890d78a8ff6a432 Mon Sep 17 00:00:00 2001
From: yenuo26 <410167048@qq.com>
Date: Wed, 18 Mar 2026 18:22:16 +0800
Subject: [PATCH 6/9] Refactor audio processing and model handling in tests

Signed-off-by: yenuo26 <410167048@qq.com>
---
 tests/conftest.py                             | 30 +++++++++++--------
 .../offline_inference/test_qwen3_tts_base.py  |  2 +-
 .../e2e/online_serving/test_qwen3_tts_base.py |  4 +--
 .../test_qwen3_tts_base_expansion.py          |  2 +-
 4 files changed, 21 insertions(+), 17 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index c4d6bfbb16f..3ee79a47516 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -17,6 +17,7 @@
 import socket
 import subprocess
 import sys
+import tempfile
 import threading
 import time
 import uuid
@@ -289,7 +290,6 @@ def generate_synthetic_audio(
     """
     Generate TTS speech with pyttsx3 and return base64 string.
     """
-    import tempfile
 
     import pyttsx3
     import soundfile as sf
@@ -497,8 +497,8 @@ def _enhance_speech(audio: np.ndarray) -> np.ndarray:
     # Return result
     base64_audio = base64.b64encode(audio_bytes).decode("utf-8")
     result["base64"] = base64_audio
-    if save_to_file and output_path:
-        result["file_path"] = output_path
+    # Always include file_path to avoid KeyError in callers.
+    result["file_path"] = output_path if save_to_file and output_path else None
 
     return result
 
@@ -801,8 +801,8 @@ def _merge_base64_audio_to_segment(base64_list: list[str]):
 def _whisper_transcribe_in_current_process(output_path: str) -> str:
     import whisper
 
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-    model = whisper.load_model("small", device=device)
+    # Keep Whisper on CPU to avoid consuming GPU memory in tests.
+    model = whisper.load_model("small", device="cpu")
     try:
         text = model.transcribe(
             output_path,
@@ -811,14 +811,8 @@ def _whisper_transcribe_in_current_process(output_path: str) -> str:
             condition_on_previous_text=False,
         )["text"]
     finally:
-        # Sync GPU so in-flight ops finish before we free the model; otherwise
-        # freed memory may not show up until those ops complete.
-        if torch is not None and torch.cuda.is_available():
-            torch.cuda.synchronize()
         del model
         gc.collect()
-        if torch is not None and torch.cuda.is_available():
-            torch.cuda.empty_cache()
 
     return text or ""
 
@@ -1344,7 +1338,7 @@ def _estimate_voice_gender_from_audio(audio_bytes: bytes) -> str:
     Estimate voice gender from audio using a small pre-trained classification model.
 
     Uses a cached `audio-classification` pipeline to classify the clip.
-    Returns 'male' / 'female' when the model confidence is >= 0.8 and the label
+    Returns 'male' / 'female' when the model confidence is >= 0.9 and the label
     maps to one of these; otherwise returns 'unknown'. If the model is unavailable
     or inference fails, returns 'unknown' to keep tests stable.
     """
@@ -1354,6 +1348,15 @@ def _estimate_voice_gender_from_audio(audio_bytes: bytes) -> str:
     mono = np.mean(data, axis=1)
 
     try:
+        target_sr = 16000
+        if int(sr) != target_sr and mono.size > 1:
+            src_len = int(mono.shape[0])
+            dst_len = max(1, int(round(src_len * float(target_sr) / float(sr))))
+            src_idx = np.arange(src_len, dtype=np.float32)
+            dst_idx = np.linspace(0, src_len - 1, dst_len, dtype=np.float32)
+            mono = np.interp(dst_idx, src_idx, mono.astype(np.float32, copy=False)).astype(np.float32)
+            sr = target_sr
+
         clf = _load_gender_pipeline()
         if clf is None:
             print("gender model not available, returning 'unknown'")
@@ -1368,7 +1371,7 @@ def _estimate_voice_gender_from_audio(audio_bytes: bytes) -> str:
         label = str(top.get("label", "")).lower()
         conf = float(top.get("score", 0.0))
 
-        if conf < 0.8:
+        if conf < 0.9:
             gender = "unknown"
         # Some models use non-English labels (e.g., Russian). Normalize to 'male'/'female'.
         elif ("female" in label) or ("жен" in label):
@@ -1476,6 +1479,7 @@ def assert_audio_speech_response(
                 # adjust this mapping to your actual voice names
                 "serena": "female",
                 "eric": "male",
+                "clone": "female",
             }
             expected_gender = voice_gender_map.get(voice)
             if expected_gender is not None:
diff --git a/tests/e2e/offline_inference/test_qwen3_tts_base.py b/tests/e2e/offline_inference/test_qwen3_tts_base.py
index b45f12bc698..be7bd50a36a 100644
--- a/tests/e2e/offline_inference/test_qwen3_tts_base.py
+++ b/tests/e2e/offline_inference/test_qwen3_tts_base.py
@@ -20,7 +20,7 @@
 from tests.conftest import modify_stage_config
 from tests.utils import hardware_test
 
-MODEL = "Qwen/Qwen3-TTS-12Hz-1.7B-Base"
+MODEL = "Qwen/Qwen3-TTS-12Hz-0.6B-Base"
 REF_AUDIO_URL = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-TTS-Repo/clone_2.wav"
 REF_TEXT = "Okay. Yeah. I resent you. I love you. I respect you. But you know what? You blew it! And thanks to you."
 
diff --git a/tests/e2e/online_serving/test_qwen3_tts_base.py b/tests/e2e/online_serving/test_qwen3_tts_base.py
index 79c20f36872..002f9d99724 100644
--- a/tests/e2e/online_serving/test_qwen3_tts_base.py
+++ b/tests/e2e/online_serving/test_qwen3_tts_base.py
@@ -19,7 +19,7 @@
 from tests.conftest import OmniServerParams
 from tests.utils import hardware_test
 
-MODEL = "Qwen/Qwen3-TTS-12Hz-1.7B-Base"
+MODEL = "Qwen/Qwen3-TTS-12Hz-0.6B-Base"
 
 REF_AUDIO_URL = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-TTS-Repo/clone_2.wav"
 REF_TEXT = "Okay. Yeah. I resent you. I love you. I respect you. But you know what? You blew it! And thanks to you."
@@ -99,7 +99,7 @@ def test_text_to_audio_002(omni_server, openai_client) -> None:
     """
     request_config = {
         "model": omni_server.model,
-        "input": "The weather is nice today, perfect for a walk in the park.",
+        "input": get_prompt(),
         "stream": True,
         "response_format": "wav",
         "task_type": "Base",
diff --git a/tests/e2e/online_serving/test_qwen3_tts_base_expansion.py b/tests/e2e/online_serving/test_qwen3_tts_base_expansion.py
index 73d16e89d31..3557ece9ea9 100644
--- a/tests/e2e/online_serving/test_qwen3_tts_base_expansion.py
+++ b/tests/e2e/online_serving/test_qwen3_tts_base_expansion.py
@@ -19,7 +19,7 @@
 from tests.conftest import OmniServerParams
 from tests.utils import hardware_test
 
-MODEL = "Qwen/Qwen3-TTS-12Hz-1.7B-Base"
+MODEL = "Qwen/Qwen3-TTS-12Hz-0.6B-Base"
 
 REF_AUDIO_URL = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-TTS-Repo/clone_2.wav"
 REF_TEXT = "Okay. Yeah. I resent you. I love you. I respect you. But you know what? You blew it! And thanks to you."

From 986cecc879196abca42425e3669d434253940a1b Mon Sep 17 00:00:00 2001
From: yenuo26 <410167048@qq.com>
Date: Fri, 20 Mar 2026 10:38:25 +0800
Subject: [PATCH 7/9] skip some test

Signed-off-by: yenuo26 <410167048@qq.com>
---
 tests/conftest.py                                         | 2 +-
 tests/e2e/offline_inference/test_qwen3_tts_base.py        | 1 +
 tests/e2e/online_serving/test_qwen3_tts_base_expansion.py | 1 +
 3 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index 3ee79a47516..583ef453eff 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1371,7 +1371,7 @@ def _estimate_voice_gender_from_audio(audio_bytes: bytes) -> str:
         label = str(top.get("label", "")).lower()
         conf = float(top.get("score", 0.0))
 
-        if conf < 0.9:
+        if conf < 0.6:
             gender = "unknown"
         # Some models use non-English labels (e.g., Russian). Normalize to 'male'/'female'.
         elif ("female" in label) or ("жен" in label):
diff --git a/tests/e2e/offline_inference/test_qwen3_tts_base.py b/tests/e2e/offline_inference/test_qwen3_tts_base.py
index be7bd50a36a..6ce1d345105 100644
--- a/tests/e2e/offline_inference/test_qwen3_tts_base.py
+++ b/tests/e2e/offline_inference/test_qwen3_tts_base.py
@@ -59,6 +59,7 @@ def get_prompt():
     return "Hello, this is a test for text to audio."
 
 
+@pytest.mark.skip(reason="Known issue(2030): qwen3_tts_no_async_chunk path temporarily disabled.")
 @pytest.mark.advanced_model
 @pytest.mark.omni
 @hardware_test(res={"cuda": "L4"}, num_cards=1)
diff --git a/tests/e2e/online_serving/test_qwen3_tts_base_expansion.py b/tests/e2e/online_serving/test_qwen3_tts_base_expansion.py
index 3557ece9ea9..c171205149a 100644
--- a/tests/e2e/online_serving/test_qwen3_tts_base_expansion.py
+++ b/tests/e2e/online_serving/test_qwen3_tts_base_expansion.py
@@ -60,6 +60,7 @@ def get_max_batch_size(size_type="few"):
             server_args=["--trust-remote-code", "--disable-log-stats"],
         ),
         id="no_async_chunk",
+        marks=pytest.mark.skip(reason="Known issue(2030): qwen3_tts_no_async_chunk path temporarily disabled."),
     ),
 ]
 

From e3139a136b707ac4fd2fc0bf032c3ae6cdfe2c6a Mon Sep 17 00:00:00 2001
From: yenuo26 <410167048@qq.com>
Date: Mon, 23 Mar 2026 19:59:32 +0800
Subject: [PATCH 8/9] Update Qwen3-TTS E2E tests to include CustomVoice and
 Base models

Signed-off-by: yenuo26 <410167048@qq.com>
---
 .buildkite/test-merge.yml   | 26 ++++++++++++++++++++++++--
 .buildkite/test-nightly.yml |  2 +-
 .buildkite/test-ready.yml   |  2 +-
 3 files changed, 26 insertions(+), 4 deletions(-)

diff --git a/.buildkite/test-merge.yml b/.buildkite/test-merge.yml
index 42ba0c1f1b4..2d296271bdf 100644
--- a/.buildkite/test-merge.yml
+++ b/.buildkite/test-merge.yml
@@ -200,14 +200,36 @@ steps:
           volumes:
             - "/fsx/hf_cache:/fsx/hf_cache"
 
-  - label: "Qwen3-TTS E2E Test"
+  - label: "Qwen3-TTS CustomVoice E2E Test"
     depends_on: upload-merge-pipeline
     commands:
       - |
         timeout 20m bash -c '
           export VLLM_LOGGING_LEVEL=DEBUG
           export VLLM_WORKER_MULTIPROC_METHOD=spawn
-          pytest -s -v tests/e2e/online_serving/test_qwen3_tts.py -m "advanced_model" --run-level "advanced_model" && pytest -s -v tests/e2e/offline/test_qwen3_tts.py
+          pytest -s -v tests/e2e/online_serving/test_qwen3_tts_customvoice.py -m "advanced_model" --run-level "advanced_model" && pytest -s -v tests/e2e/offline/test_qwen3_tts_customvoice.py
+        '
+    agents:
+      queue: "gpu_1_queue"
+    plugins:
+      - docker#v5.2.0:
+          image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+          always-pull: true
+          propagate-environment: true
+          shm-size: "8gb"
+          environment:
+            - "HF_HOME=/fsx/hf_cache"
+          volumes:
+            - "/fsx/hf_cache:/fsx/hf_cache"
+
+  - label: "Qwen3-TTS Base E2E Test"
+    depends_on: upload-merge-pipeline
+    commands:
+      - |
+        timeout 20m bash -c '
+          export VLLM_LOGGING_LEVEL=DEBUG
+          export VLLM_WORKER_MULTIPROC_METHOD=spawn
+          pytest -s -v tests/e2e/online_serving/test_qwen3_tts_base.py -m "advanced_model" --run-level "advanced_model" && pytest -s -v tests/e2e/offline/test_qwen3_tts_base.py
         '
     agents:
       queue: "gpu_1_queue"
diff --git a/.buildkite/test-nightly.yml b/.buildkite/test-nightly.yml
index 8dcd11542d6..41998756010 100644
--- a/.buildkite/test-nightly.yml
+++ b/.buildkite/test-nightly.yml
@@ -65,7 +65,7 @@ steps:
   - label: ":full_moon: Diffusion Model Test with H100"
     timeout_in_minutes: 60
     depends_on: upload-nightly-pipeline
-    # if: build.env("NIGHTLY") == "1"
+    if: build.env("NIGHTLY") == "1"
     commands:
       - export VLLM_WORKER_MULTIPROC_METHOD=spawn
       - pytest -s -v tests/e2e/online_serving/test_*_expansion.py -m "advanced_model and diffusion and H100" --run-level "advanced_model"
diff --git a/.buildkite/test-ready.yml b/.buildkite/test-ready.yml
index 661387cea37..bc343f18e35 100644
--- a/.buildkite/test-ready.yml
+++ b/.buildkite/test-ready.yml
@@ -208,7 +208,7 @@ steps:
         timeout 20m bash -c '
           export VLLM_LOGGING_LEVEL=DEBUG
           export VLLM_WORKER_MULTIPROC_METHOD=spawn
-          pytest -s -v tests/e2e/online_serving/test_qwen3_tts.py -m "core_model" --run-level "core_model"
+          pytest -s -v tests/e2e/online_serving/test_qwen3_tts_customvoice.py -m "core_model" --run-level "core_model"
         '
     agents:
       queue: "gpu_1_queue"

From 6fea777ea4f635aa5f12385b554744d5605ce964 Mon Sep 17 00:00:00 2001
From: wangyu <410167048@qq.com>
Date: Wed, 25 Mar 2026 22:50:52 +0800
Subject: [PATCH 9/9] add pcm test

Signed-off-by: wangyu <410167048@qq.com>
---
 pyproject.toml                                |  2 +-
 tests/conftest.py                             | 94 ++++++++++++++++---
 .../test_qwen3_tts_base_expansion.py          | 28 +++++-
 3 files changed, 108 insertions(+), 16 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 973370b4f33..f4b8ecf021a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -52,7 +52,7 @@ dev = [
     "av", # for ComfyUI tests
     "openpyxl>=3.0.0", # for nightly CI
     "pyttsx3>=2.99",
-    "opencc>=1.2.0"
+    "opencc>=1.2.0",
     "mistune>=3.2.0", # for example tests
 ]
 
diff --git a/tests/conftest.py b/tests/conftest.py
index d560d088619..96f6a4c9299 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -54,6 +54,9 @@
 
 _GENDER_PIPELINE = None
 
+# int16 mono PCM from /v1/audio/speech when response_format=pcm (Qwen3-TTS code2wav output rate).
+_PCM_SPEECH_SAMPLE_RATE_HZ = 24_000
+
 
 class OmniServerParams(NamedTuple):
     model: str
@@ -828,10 +831,9 @@ def convert_audio_file_to_text(output_path: str) -> str:
 
 def convert_audio_bytes_to_text(raw_bytes: bytes) -> str:
     """
-    Write raw audio bytes to a temp WAV file suitable for Whisper/ffmpeg.
+    Write container audio bytes (WAV, etc.) to a temp WAV file suitable for Whisper/ffmpeg.
     Normalizes with soundfile to PCM_16 WAV when possible to avoid codec issues.
     """
-    # Use high-entropy filename to avoid collisions under concurrency.
     output_path = f"./test_{uuid.uuid4().hex}.wav"
     data, samplerate = sf.read(io.BytesIO(raw_bytes))
     sf.write(output_path, data, samplerate, format="WAV", subtype="PCM_16")
@@ -1385,6 +1387,51 @@ def _estimate_voice_gender_from_audio(audio_bytes: bytes) -> str:
         return "unknown"
 
 
+# Threshold aligned with _compute_pcm_hnr_db docstring (clean clone vs distorted).
+_MIN_PCM_SPEECH_HNR_DB = 1.2
+
+
+def _compute_pcm_hnr_db(pcm_samples: np.ndarray, sr: int = _PCM_SPEECH_SAMPLE_RATE_HZ) -> float:
+    """Compute mean Harmonic-to-Noise Ratio (dB) for speech quality.
+
+    Clean cloned speech has HNR > 1.2 dB; distorted speech (e.g. lost
+    ref_code decoder context) drops below 1.0 dB.
+    """
+    frame_len = int(0.03 * sr)  # 30ms frames
+    hop = frame_len // 2
+    hnr_values: list[float] = []
+
+    for start in range(0, len(pcm_samples) - frame_len, hop):
+        frame = pcm_samples[start : start + frame_len].astype(np.float32, copy=False)
+        frame = frame - np.mean(frame)
+        if np.max(np.abs(frame)) < 0.01:
+            continue
+        ac = np.correlate(frame, frame, mode="full")[len(frame) - 1 :]
+        ac = ac / (ac[0] + 1e-10)
+        min_lag = int(sr / 400)
+        max_lag = min(int(sr / 80), len(ac))
+        if min_lag >= max_lag:
+            continue
+        peak = float(np.max(ac[min_lag:max_lag]))
+        if 0 < peak < 1:
+            hnr_values.append(10 * np.log10(peak / (1 - peak + 1e-10)))
+
+    return float(np.mean(hnr_values)) if hnr_values else 0.0
+
+
+def _assert_pcm_int16_speech_hnr(audio_bytes: bytes) -> None:
+    """Validate harmonic-to-noise ratio on raw int16 PCM from /v1/audio/speech."""
+    assert audio_bytes is not None and len(audio_bytes) >= 2, "missing PCM bytes"
+    assert len(audio_bytes) % 2 == 0, "PCM byte length must be aligned to int16"
+    pcm_samples = np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32) / 32768.0
+    hnr = _compute_pcm_hnr_db(pcm_samples)
+    print(f"PCM speech HNR: {hnr:.2f} dB (threshold: {_MIN_PCM_SPEECH_HNR_DB} dB)")
+    assert hnr >= _MIN_PCM_SPEECH_HNR_DB, (
+        f"Audio distortion detected: HNR={hnr:.2f} dB < {_MIN_PCM_SPEECH_HNR_DB} dB. "
+        "Voice clone decoder may be losing ref_code speaker context on later chunks."
+    )
+
+
 def assert_omni_response(response: OmniResponse, request_config: dict[str, Any], run_level):
     """
     Validate response results.
@@ -1439,13 +1486,21 @@ def assert_audio_speech_response(
     run_level: str,
 ) -> None:
     """
-    Validate /v1/audio/speech response: success, optional format check, and transcription similarity.
+    Validate /v1/audio/speech response: success, optional format check, transcription similarity
+    and gender (non-PCM only for advanced_model), and int16 PCM HNR when response_format is pcm.
     """
     assert response.success, "The request failed."
 
     req_fmt = request_config.get("response_format")
 
-    if req_fmt and response.audio_format is not None:
+    if req_fmt == "pcm" and response.audio_bytes:
+        _assert_pcm_int16_speech_hnr(response.audio_bytes)
+        if response.audio_format:
+            assert "pcm" in response.audio_format.lower(), (
+                f"Expected audio/pcm content-type, got {response.audio_format!r}"
+            )
+
+    elif req_fmt == "wav" and response.audio_format:
         assert req_fmt in response.audio_format, (
             f"The response audio format {response.audio_format} don't match the request audio format {req_fmt}"
         )
@@ -1454,8 +1509,8 @@ def assert_audio_speech_response(
     if e2e_latency is not None:
         print(f"the avg e2e latency is: {e2e_latency}")
 
-    if run_level == "advanced_model":
-        # Text–audio semantic similarity check
+    if run_level == "advanced_model" and req_fmt != "pcm":
+        # Text–audio semantic similarity check (skipped for raw PCM: no Whisper transcript).
         expected_text = request_config.get("input")
         if expected_text:
             transcript = (response.audio_content or "").strip()
@@ -1708,7 +1763,7 @@ def _process_diffusion_response(self, chat_completion) -> DiffusionResponse:
 
         return result
 
-    def _process_stream_audio_speech_response(self, response) -> OmniResponse:
+    def _process_stream_audio_speech_response(self, response, *, response_format: str | None = None) -> OmniResponse:
         """
         Process streaming /v1/audio/speech responses into an OmniResponse.
 
@@ -1751,7 +1806,10 @@ def _process_stream_audio_speech_response(self, response) -> OmniResponse:
                     raise TypeError(f"Unsupported audio speech streaming response type: {type(response)}")
 
             raw_bytes = bytes(data)
-            transcript = convert_audio_bytes_to_text(raw_bytes)
+            if response_format == "pcm":
+                transcript = None
+            else:
+                transcript = convert_audio_bytes_to_text(raw_bytes)
 
             # Populate OmniResponse.
             result.audio_bytes = raw_bytes
@@ -1768,7 +1826,9 @@ def _process_stream_audio_speech_response(self, response) -> OmniResponse:
 
         return result
 
-    def _process_non_stream_audio_speech_response(self, response) -> OmniResponse:
+    def _process_non_stream_audio_speech_response(
+        self, response, *, response_format: str | None = None
+    ) -> OmniResponse:
         """
         Process non-streaming /v1/audio/speech responses into an OmniResponse.
 
@@ -1787,7 +1847,10 @@ def _process_non_stream_audio_speech_response(self, response) -> OmniResponse:
             else:
                 raise TypeError(f"Unsupported audio speech response type: {type(response)}")
 
-            transcript = convert_audio_bytes_to_text(raw_bytes)
+            if response_format == "pcm":
+                transcript = None
+            else:
+                transcript = convert_audio_bytes_to_text(raw_bytes)
 
             result.audio_bytes = raw_bytes
             result.audio_content = transcript
@@ -1895,6 +1958,8 @@ def send_audio_speech_request(self, request_config: dict[str, Any], request_num:
 
         responses: list[OmniResponse] = []
 
+        speech_fmt: str | None = None if response_format is omit else str(response_format).lower()
+
         if request_num == 1:
             if stream:
                 # Use streaming response helper.
@@ -1906,7 +1971,7 @@ def send_audio_speech_request(self, request_config: dict[str, Any], request_num:
                     timeout=timeout,
                     voice=voice,
                 ) as resp:
-                    omni_resp = self._process_stream_audio_speech_response(resp)
+                    omni_resp = self._process_stream_audio_speech_response(resp, response_format=speech_fmt)
             else:
                 # Non-streaming response.
                 resp = self.client.audio.speech.create(
@@ -1917,7 +1982,7 @@ def send_audio_speech_request(self, request_config: dict[str, Any], request_num:
                     timeout=timeout,
                     voice=voice,
                 )
-                omni_resp = self._process_non_stream_audio_speech_response(resp)
+                omni_resp = self._process_non_stream_audio_speech_response(resp, response_format=speech_fmt)
 
             assert_audio_speech_response(omni_resp, request_config, run_level=self.run_level)
             responses.append(omni_resp)
@@ -1936,7 +2001,7 @@ def _stream_task():
                         timeout=timeout,
                         voice=voice,
                     ) as resp:
-                        return self._process_stream_audio_speech_response(resp)
+                        return self._process_stream_audio_speech_response(resp, response_format=speech_fmt)
 
                 with concurrent.futures.ThreadPoolExecutor(max_workers=request_num) as executor:
                     futures = [executor.submit(_stream_task) for _ in range(request_num)]
@@ -1961,7 +2026,7 @@ def _stream_task():
 
                     for future in concurrent.futures.as_completed(futures):
                         resp = future.result()
-                        omni_resp = self._process_non_stream_audio_speech_response(resp)
+                        omni_resp = self._process_non_stream_audio_speech_response(resp, response_format=speech_fmt)
                         assert_audio_speech_response(omni_resp, request_config, run_level=self.run_level)
                         responses.append(omni_resp)
 
@@ -2534,6 +2599,7 @@ def send_audio_speech_request(
         result = OmniResponse(success=True)
         assert_audio_speech_response(result, request_config, run_level="core_model")
         return result
+
     def start_profile(
         self,
         profile_prefix: str | None = None,
diff --git a/tests/e2e/online_serving/test_qwen3_tts_base_expansion.py b/tests/e2e/online_serving/test_qwen3_tts_base_expansion.py
index c171205149a..3c33485e4f4 100644
--- a/tests/e2e/online_serving/test_qwen3_tts_base_expansion.py
+++ b/tests/e2e/online_serving/test_qwen3_tts_base_expansion.py
@@ -60,7 +60,6 @@ def get_max_batch_size(size_type="few"):
             server_args=["--trust-remote-code", "--disable-log-stats"],
         ),
         id="no_async_chunk",
-        marks=pytest.mark.skip(reason="Known issue(2030): qwen3_tts_no_async_chunk path temporarily disabled."),
     ),
 ]
 
@@ -91,3 +90,30 @@ def test_voice_clone_streaming_001(omni_server, openai_client) -> None:
         "ref_text": REF_TEXT,
     }
     openai_client.send_audio_speech_request(request_config, request_num=get_max_batch_size("few"))
+
+
+@pytest.mark.advanced_model
+@pytest.mark.core_model
+@pytest.mark.omni
+@hardware_test(res={"cuda": "L4"}, num_cards=1)
+@pytest.mark.parametrize("omni_server", tts_server_params, indirect=True)
+def test_response_format_001(omni_server, openai_client) -> None:
+    """
+    Test text input processing and audio output via OpenAI API.
+    Deploy Setting: default yaml
+    Input Modal: text
+    Output Modal: audio
+    Input Setting: non-stream PCM
+    Datasets: few requests
+    """
+
+    request_config = {
+        "model": omni_server.model,
+        "input": get_prompt(),
+        "response_format": "pcm",
+        "task_type": "Base",
+        "voice": "clone",
+        "ref_audio": REF_AUDIO_URL,
+        "ref_text": REF_TEXT,
+    }
+    openai_client.send_audio_speech_request(request_config)