From db0ef930f0ca4c02776833760ab28bb9936e4377 Mon Sep 17 00:00:00 2001
From: JuanPZuluaga <juanz9312@gmail.com>
Date: Fri, 20 Mar 2026 09:17:14 +0000
Subject: [PATCH 01/11] support ref_text

Signed-off-by: JuanPZuluaga <juanz9312@gmail.com>
---
 vllm_omni/entrypoints/openai/api_server.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py
index c3c250fda7f..5abd8b79a7b 100644
--- a/vllm_omni/entrypoints/openai/api_server.py
+++ b/vllm_omni/entrypoints/openai/api_server.py
@@ -978,6 +978,7 @@ async def upload_voice(
     audio_sample: UploadFile = File(...),
     consent: str = Form(...),
     name: str = Form(...),
+    ref_text: str = Form(None),
 ):
     """Upload a new voice sample for voice cloning.
 
@@ -999,8 +1000,7 @@ async def upload_voice(
         return base(raw_request).create_error_response(message="The model does not support Speech API")
 
     try:
-        # Upload the voice
-        result = await handler.upload_voice(audio_sample, consent, name)
+        result = await handler.upload_voice(audio_sample, consent, name, ref_text=ref_text)
 
         return JSONResponse(content={"success": True, "voice": result})
 

From 0accaed50bbc9f7d0fea547813c98599806cf32d Mon Sep 17 00:00:00 2001
From: JuanPZuluaga <juanz9312@gmail.com>
Date: Fri, 20 Mar 2026 09:17:49 +0000
Subject: [PATCH 02/11] support ref_text in serving speech

Signed-off-by: JuanPZuluaga <juanz9312@gmail.com>
---
 .../entrypoints/openai/serving_speech.py      | 52 ++++++++++++++-----
 1 file changed, 38 insertions(+), 14 deletions(-)

diff --git a/vllm_omni/entrypoints/openai/serving_speech.py b/vllm_omni/entrypoints/openai/serving_speech.py
index 5ece2e7501d..7cf5d032efd 100644
--- a/vllm_omni/entrypoints/openai/serving_speech.py
+++ b/vllm_omni/entrypoints/openai/serving_speech.py
@@ -361,8 +361,9 @@ def _get_uploaded_audio_data(self, voice_name: str) -> str | None:
             logger.error(f"Could not read audio file for voice {voice_name}: {e}")
             return None
 
-    async def upload_voice(self, audio_file: UploadFile, consent: str, name: str) -> dict:
-        """Upload a new voice sample."""
+    async def upload_voice(
+        self, audio_file: UploadFile, consent: str, name: str, *, ref_text: str | None = None
+    ) -> dict:
         # Validate file size (max 10MB)
         MAX_FILE_SIZE = 10 * 1024 * 1024  # 10MB
         audio_file.file.seek(0, 2)  # Seek to end
@@ -453,9 +454,10 @@ async def upload_voice(self, audio_file: UploadFile, consent: str, name: str) ->
             "mime_type": mime_type,
             "original_filename": audio_file.filename,
             "file_size": file_size,
-            "cache_status": "pending",  # The initial cache state is pending.
-            "cache_file": None,  # The initial cache file is empty.
-            "cache_generated_at": None,  # The initial cache generation time is empty.
+            "ref_text": ref_text,
+            "cache_status": "pending",
+            "cache_file": None,
+            "cache_generated_at": None,
         }
 
         # Save metadata using metadata manager (concurrency safe)
@@ -668,7 +670,16 @@ async def _resolve_ref_audio(self, ref_audio_str: str) -> tuple[list[float], int
         wav_np = np.asarray(wav_np, dtype=np.float32)
         if wav_np.ndim > 1:
             wav_np = np.mean(wav_np, axis=-1)
-        return wav_np.tolist(), int(sr)
+        sr = int(sr)
+        duration = len(wav_np) / sr if sr > 0 else 0.0
+        if duration < 1.0:
+            raise ValueError(
+                f"Reference audio too short ({duration:.1f}s). "
+                "At least 1s of clear speech is required for speaker embedding."
+            )
+        if duration > 20.0:
+            raise ValueError(f"Reference audio too long ({duration:.1f}s). Maximum 20s supported — use a shorter clip.")
+        return wav_np.tolist(), sr
 
     async def _generate_audio_chunks(self, generator, request_id: str, response_format: str = "pcm"):
         """Generate audio chunks for streaming response.
@@ -792,15 +803,22 @@ def _build_tts_params(self, request: OpenAICreateSpeechRequest) -> dict[str, Any
         if request.voice is not None:
             params["speaker"] = [request.voice]
 
-            # If voice is an uploaded speaker and no ref_audio provided, auto-set it
+            # Uploaded voices use task_type="Base" (CustomVoice requires built-in spk_id).
+            # If ref_text was provided at upload time, use in-context cloning; otherwise x_vector only.
             if request.voice.lower() in self.uploaded_speakers and request.ref_audio is None:
                 audio_data = self._get_uploaded_audio_data(request.voice)
-                if audio_data:
-                    params["ref_audio"] = [audio_data]
-                    params["x_vector_only_mode"] = [True]
-                    logger.info(f"Auto-set ref_audio for uploaded voice: {request.voice}")
-                else:
+                if not audio_data:
                     raise ValueError(f"Audio file for uploaded voice '{request.voice}' is missing or corrupted")
+                speaker_info = self.uploaded_speakers[request.voice.lower()]
+                stored_ref_text = speaker_info.get("ref_text")
+                params["ref_audio"] = [audio_data]
+                params["task_type"] = ["Base"]
+                if stored_ref_text:
+                    params["ref_text"] = [stored_ref_text]
+                    params["x_vector_only_mode"] = [False]
+                else:
+                    params["x_vector_only_mode"] = [True]
+                logger.info("Auto-set ref_audio for uploaded voice: %s (icl=%s)", request.voice, bool(stored_ref_text))
 
         elif params["task_type"][0] == "CustomVoice":
             params["speaker"] = ["Vivian"]  # Default for CustomVoice
@@ -970,8 +988,14 @@ async def _prepare_speech_generation(
                 tts_params = {}
             else:
                 tts_params = self._build_tts_params(request)
-                if request.ref_audio is not None:
-                    wav_list, sr = await self._resolve_ref_audio(request.ref_audio)
+                # Resolve ref_audio (explicit or auto-set for uploaded voices)
+                # to [[wav_list, sr]] so the model doesn't re-decode base64.
+                ref_audio_source = request.ref_audio
+                if ref_audio_source is None and isinstance(tts_params.get("ref_audio"), list):
+                    # Uploaded voice: ref_audio was auto-set as [base64_data_url]
+                    ref_audio_source = tts_params["ref_audio"][0]
+                if ref_audio_source is not None and isinstance(ref_audio_source, str):
+                    wav_list, sr = await self._resolve_ref_audio(ref_audio_source)
                     tts_params["ref_audio"] = [[wav_list, sr]]
 
                 ph_len = self._estimate_prompt_len(tts_params)

From 0b89a269ba18705e1a2de8f63a8c6a7a0262314b Mon Sep 17 00:00:00 2001
From: JuanPZuluaga <juanz9312@gmail.com>
Date: Fri, 20 Mar 2026 09:18:23 +0000
Subject: [PATCH 03/11] code2wav: reword non-divisible input_ids warning, skip it for n == 1

Signed-off-by: JuanPZuluaga <juanz9312@gmail.com>
---
 .../model_executor/models/qwen3_tts/qwen3_tts_code2wav.py    | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_code2wav.py b/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_code2wav.py
index 6be039df105..a22ce8488b4 100644
--- a/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_code2wav.py
+++ b/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_code2wav.py
@@ -228,10 +228,9 @@ def forward(
             flat = req_ids
             n = flat.numel()
             if n == 0 or n % q != 0:
-                if n > 0:
+                if n > 1:
                     logger.warning(
-                        "Code2Wav input_ids length %d not divisible by num_quantizers %d, "
-                        "likely a warmup run; returning empty audio.",
+                        "Code2Wav input_ids length %d not divisible by num_quantizers %d; skipping malformed request.",
                         n,
                         q,
                     )

From e4508b17a3b767b68c2abdae7607b16374082d9d Mon Sep 17 00:00:00 2001
From: JuanPZuluaga <juanz9312@gmail.com>
Date: Fri, 20 Mar 2026 09:18:53 +0000
Subject: [PATCH 04/11] add to docs voice upload ref_text

Signed-off-by: JuanPZuluaga <juanz9312@gmail.com>
---
 docs/serving/speech_api.md                     | 18 ++++++++++++++----
 .../examples/online_serving/qwen3_tts.md       |  4 +++-
 2 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/docs/serving/speech_api.md b/docs/serving/speech_api.md
index 17787e682d0..67374f6ede4 100644
--- a/docs/serving/speech_api.md
+++ b/docs/serving/speech_api.md
@@ -160,6 +160,7 @@ Upload a new voice sample for voice cloning in Base task TTS requests.
 | `audio_sample` | file | Yes | Audio file (max 10MB, supported formats: wav, mp3, flac, ogg, aac, webm, mp4) |
 | `consent` | string | Yes | Consent recording ID |
 | `name` | string | Yes | Name for the new voice |
+| `ref_text` | string | No | Transcript of the audio. When provided, enables in-context voice cloning (higher quality). Without it, only the speaker embedding is extracted. |
 
 **Response Example:**
 
@@ -182,7 +183,8 @@ Upload a new voice sample for voice cloning in Base task TTS requests.
 curl -X POST http://localhost:8091/v1/audio/voices \
   -F "audio_sample=@/path/to/voice_sample.wav" \
   -F "consent=user_consent_id" \
-  -F "name=custom_voice_1"
+  -F "name=custom_voice_1" \
+  -F "ref_text=The exact transcript of the audio sample."
 ```
 
 ## Streaming Text Input (WebSocket)
@@ -317,7 +319,7 @@ curl -X POST http://localhost:8091/v1/audio/speech \
     }' --output cloned.wav
 ```
 
-upload voice
+upload voice (speaker embedding only)
 ```bash
 curl -X POST http://localhost:8091/v1/audio/voices \
   -F "audio_sample=@/path/to/voice_sample.wav" \
@@ -325,13 +327,21 @@ curl -X POST http://localhost:8091/v1/audio/voices \
   -F "name=custom_voice_1"
 ```
 
-use upload voice
+upload voice with transcript (in-context cloning, higher quality)
+```bash
+curl -X POST http://localhost:8091/v1/audio/voices \
+  -F "audio_sample=@/path/to/voice_sample.wav" \
+  -F "consent=user_consent_id" \
+  -F "name=custom_voice_2" \
+  -F "ref_text=The exact transcript of the audio sample."
+```
+
+use uploaded voice
 ```bash
 curl -X POST http://localhost:8091/v1/audio/speech \
     -H "Content-Type: application/json" \
     -d '{
         "input": "Hello, this is a cloned voice",
-        "task_type": "Base",
         "voice": "custom_voice_1"
     }' --output cloned.wav
 ```
diff --git a/docs/user_guide/examples/online_serving/qwen3_tts.md b/docs/user_guide/examples/online_serving/qwen3_tts.md
index 993505b876e..bee3283a04a 100644
--- a/docs/user_guide/examples/online_serving/qwen3_tts.md
+++ b/docs/user_guide/examples/online_serving/qwen3_tts.md
@@ -242,6 +242,7 @@ Upload a new voice sample for voice cloning in Base task TTS requests.
 - `audio_sample` (required): Audio file (max 10MB, supported formats: wav, mp3, flac, ogg, aac, webm, mp4)
 - `consent` (required): Consent recording ID
 - `name` (required): Name for the new voice
+- `ref_text` (optional): Transcript of the audio. Enables in-context voice cloning (higher quality).
 
 **Response Example:**
 ```json
@@ -262,7 +263,8 @@ Upload a new voice sample for voice cloning in Base task TTS requests.
 curl -X POST http://localhost:8000/v1/audio/voices \
   -F "audio_sample=@/path/to/voice_sample.wav" \
   -F "consent=user_consent_id" \
-  -F "name=custom_voice_1"
+  -F "name=custom_voice_1" \
+  -F "ref_text=The exact transcript of the audio sample."
 ```
 
 ### Endpoint

From 75638c3555733b5ec2e36bf69f84cc0bd95187ab Mon Sep 17 00:00:00 2001
From: JuanPZuluaga <juanz9312@gmail.com>
Date: Fri, 20 Mar 2026 09:44:13 +0000
Subject: [PATCH 05/11] update readme and voice upload test

Signed-off-by: JuanPZuluaga <juanz9312@gmail.com>
---
 examples/online_serving/qwen3_tts/README.md   |  4 +-
 .../openai_api/test_serving_speech.py         | 64 +++++++++++++------
 2 files changed, 46 insertions(+), 22 deletions(-)

diff --git a/examples/online_serving/qwen3_tts/README.md b/examples/online_serving/qwen3_tts/README.md
index 4709c9d4218..2feef51aacf 100644
--- a/examples/online_serving/qwen3_tts/README.md
+++ b/examples/online_serving/qwen3_tts/README.md
@@ -239,6 +239,7 @@ Upload a new voice sample for voice cloning in Base task TTS requests.
 - `audio_sample` (required): Audio file (max 10MB, supported formats: wav, mp3, flac, ogg, aac, webm, mp4)
 - `consent` (required): Consent recording ID
 - `name` (required): Name for the new voice
+- `ref_text` (optional): Transcript of the audio. Enables in-context voice cloning (higher quality).
 
 **Response Example:**
 ```json
@@ -259,7 +260,8 @@ Upload a new voice sample for voice cloning in Base task TTS requests.
 curl -X POST http://localhost:8000/v1/audio/voices \
   -F "audio_sample=@/path/to/voice_sample.wav" \
   -F "consent=user_consent_id" \
-  -F "name=custom_voice_1"
+  -F "name=custom_voice_1" \
+  -F "ref_text=The exact transcript of the audio sample."
 ```
 
 ### Endpoint
diff --git a/tests/entrypoints/openai_api/test_serving_speech.py b/tests/entrypoints/openai_api/test_serving_speech.py
index 67abd7617b7..e5435f3d7f7 100644
--- a/tests/entrypoints/openai_api/test_serving_speech.py
+++ b/tests/entrypoints/openai_api/test_serving_speech.py
@@ -337,30 +337,35 @@ def test_list_voices_endpoint(self, client):
         assert "voices" in response.json()
 
     def test_upload_voice_success(self, client, tmp_path):
-        """Test successful voice upload."""
-        # Create a mock audio file
-        audio_content = b"fake audio content" * 1000  # ~17KB
-        files = {
-            "audio_sample": ("test.wav", audio_content, "audio/wav"),
-        }
-        data = {
-            "consent": "user_consent_123",
-            "name": "test_voice",
-        }
+        """Test successful voice upload without ref_text."""
+        audio_content = b"fake audio content" * 1000
+        files = {"audio_sample": ("test.wav", audio_content, "audio/wav")}
+        data = {"consent": "user_consent_123", "name": "test_voice"}
 
         response = client.post("/v1/audio/voices", files=files, data=data)
         assert response.status_code == 200
         result = response.json()
         assert result["success"] is True
-        assert "voice" in result
         voice_info = result["voice"]
         assert voice_info["name"] == "test_voice"
         assert voice_info["consent"] == "user_consent_123"
-        assert "created_at" in voice_info
         assert voice_info["mime_type"] == "audio/wav"
         assert voice_info["file_size"] == len(audio_content)
         response = client.delete("/v1/audio/voices/test_voice")
 
+    def test_upload_voice_with_ref_text(self, client, tmp_path):
+        """Test voice upload with ref_text enables in-context cloning."""
+        audio_content = b"fake audio content" * 1000
+        files = {"audio_sample": ("test.wav", audio_content, "audio/wav")}
+        data = {"consent": "c1", "name": "test_voice_rt", "ref_text": "Hello world transcript"}
+
+        response = client.post("/v1/audio/voices", files=files, data=data)
+        assert response.status_code == 200
+        result = response.json()
+        assert result["success"] is True
+        assert result["voice"]["name"] == "test_voice_rt"
+        response = client.delete("/v1/audio/voices/test_voice_rt")
+
     def test_upload_voice_file_too_large(self, client):
         """Test voice upload with file exceeding size limit."""
         # Create a file larger than 10MB
@@ -634,31 +639,48 @@ def test_load_supported_speakers(self, mocker: MockerFixture):
         assert server.supported_speakers == {"ryan", "vivian", "aiden"}
 
     def test_build_tts_params_with_uploaded_voice(self, speech_server):
-        """Test _build_tts_params auto-sets ref_audio for uploaded voices."""
-        # Mock an uploaded speaker
+        """Test _build_tts_params auto-sets ref_audio for uploaded voices (x_vector only)."""
         speech_server.uploaded_speakers = {
             "custom_voice": {
                 "name": "custom_voice",
                 "file_path": "/tmp/voice_samples/custom_voice_consent_123.wav",
                 "mime_type": "audio/wav",
+                "ref_text": None,
             }
         }
         speech_server.supported_speakers = {"ryan", "vivian", "custom_voice"}
 
-        # Mock _get_uploaded_audio_data to return base64 data
         with patch.object(speech_server, "_get_uploaded_audio_data") as mock_get_audio:
             mock_get_audio.return_value = "data:audio/wav;base64,ZmFrZWF1ZGlv"
+            req = OpenAICreateSpeechRequest(input="Hello", voice="custom_voice")
+            params = speech_server._build_tts_params(req)
 
-            req = OpenAICreateSpeechRequest(input="Hello", voice="custom_voice", task_type="Base")
+            assert params["ref_audio"] == ["data:audio/wav;base64,ZmFrZWF1ZGlv"]
+            assert params["x_vector_only_mode"] == [True]
+            assert params["task_type"] == ["Base"]
+            assert "ref_text" not in params
+
+    def test_build_tts_params_with_uploaded_voice_ref_text(self, speech_server):
+        """Test _build_tts_params enables in-context cloning when ref_text is stored."""
+        speech_server.uploaded_speakers = {
+            "custom_voice": {
+                "name": "custom_voice",
+                "file_path": "/tmp/voice_samples/custom_voice_consent_123.wav",
+                "mime_type": "audio/wav",
+                "ref_text": "Hello world transcript",
+            }
+        }
+        speech_server.supported_speakers = {"ryan", "vivian", "custom_voice"}
 
+        with patch.object(speech_server, "_get_uploaded_audio_data") as mock_get_audio:
+            mock_get_audio.return_value = "data:audio/wav;base64,ZmFrZWF1ZGlv"
+            req = OpenAICreateSpeechRequest(input="Hello", voice="custom_voice")
             params = speech_server._build_tts_params(req)
 
-            # Verify ref_audio was auto-set
-            assert "ref_audio" in params
             assert params["ref_audio"] == ["data:audio/wav;base64,ZmFrZWF1ZGlv"]
-            assert "x_vector_only_mode" in params
-            assert params["x_vector_only_mode"] == [True]
-            mock_get_audio.assert_called_once_with("custom_voice")
+            assert params["x_vector_only_mode"] == [False]
+            assert params["task_type"] == ["Base"]
+            assert params["ref_text"] == ["Hello world transcript"]
 
     def test_build_tts_params_without_uploaded_voice(self, speech_server):
         """Test _build_tts_params does not auto-set ref_audio for non-uploaded voices."""

From b7140f3cd3c75e50f763d9bf426289a6a382a73f Mon Sep 17 00:00:00 2001
From: JuanPZuluaga <juanz9312@gmail.com>
Date: Fri, 20 Mar 2026 10:01:10 +0000
Subject: [PATCH 06/11] update test

Signed-off-by: JuanPZuluaga <juanz9312@gmail.com>
---
 tests/entrypoints/openai_api/test_serving_speech.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/tests/entrypoints/openai_api/test_serving_speech.py b/tests/entrypoints/openai_api/test_serving_speech.py
index e5435f3d7f7..39f315a8d15 100644
--- a/tests/entrypoints/openai_api/test_serving_speech.py
+++ b/tests/entrypoints/openai_api/test_serving_speech.py
@@ -233,9 +233,14 @@ async def list_voices():
     app.add_api_route("/v1/audio/voices", list_voices, methods=["GET"])
 
     # Add upload_voice endpoint
-    async def upload_voice(audio_sample: UploadFile = File(...), consent: str = Form(...), name: str = Form(...)):
+    async def upload_voice(
+        audio_sample: UploadFile = File(...),
+        consent: str = Form(...),
+        name: str = Form(...),
+        ref_text: str = Form(None),
+    ):
         try:
-            result = await speech_server.upload_voice(audio_sample, consent, name)
+            result = await speech_server.upload_voice(audio_sample, consent, name, ref_text=ref_text)
             return {"success": True, "voice": result}
         except ValueError as e:
             raise HTTPException(status_code=400, detail=str(e))

From eeb344909adc6a9d7099a3b507e858924bd076d5 Mon Sep 17 00:00:00 2001
From: JuanPZuluaga <juanz9312@gmail.com>
Date: Fri, 20 Mar 2026 12:24:51 +0000
Subject: [PATCH 07/11] add generate to AR stage0

Signed-off-by: JuanPZuluaga <juanz9312@gmail.com>
---
 vllm_omni/engine/async_omni_engine.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/vllm_omni/engine/async_omni_engine.py b/vllm_omni/engine/async_omni_engine.py
index 5562b84ff29..4651bfa7762 100644
--- a/vllm_omni/engine/async_omni_engine.py
+++ b/vllm_omni/engine/async_omni_engine.py
@@ -532,6 +532,8 @@ def _initialize_stages(self, stage_init_timeout: int) -> None:
             supported_tasks.add("generate")
         if any(metadata.get("final_output_type") == "audio" for metadata in stage_metadata):
             supported_tasks.add("speech")
+            # TTS stage-0 is an AR model, so we need to add generate
+            supported_tasks.add("generate")
         self.supported_tasks = tuple(supported_tasks) if supported_tasks else ("generate",)
 
         self.default_sampling_params_list = default_sampling_params_list

From 610809ddb1d7ac4f718517ac2baf7dec265d2cc2 Mon Sep 17 00:00:00 2001
From: JuanPZuluaga <juanz9312@gmail.com>
Date: Fri, 20 Mar 2026 13:26:39 +0000
Subject: [PATCH 08/11] revert 'generate', raise ref audio cap to 30s and strip ref_text

Signed-off-by: JuanPZuluaga <juanz9312@gmail.com>
---
 vllm_omni/engine/async_omni_engine.py          | 2 --
 vllm_omni/entrypoints/openai/serving_speech.py | 7 +++++--
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/vllm_omni/engine/async_omni_engine.py b/vllm_omni/engine/async_omni_engine.py
index 4651bfa7762..5562b84ff29 100644
--- a/vllm_omni/engine/async_omni_engine.py
+++ b/vllm_omni/engine/async_omni_engine.py
@@ -532,8 +532,6 @@ def _initialize_stages(self, stage_init_timeout: int) -> None:
             supported_tasks.add("generate")
         if any(metadata.get("final_output_type") == "audio" for metadata in stage_metadata):
             supported_tasks.add("speech")
-            # TTS stage-0 is an AR model, so we need to add generate
-            supported_tasks.add("generate")
         self.supported_tasks = tuple(supported_tasks) if supported_tasks else ("generate",)
 
         self.default_sampling_params_list = default_sampling_params_list
diff --git a/vllm_omni/entrypoints/openai/serving_speech.py b/vllm_omni/entrypoints/openai/serving_speech.py
index 7cf5d032efd..d6f3207fc56 100644
--- a/vllm_omni/entrypoints/openai/serving_speech.py
+++ b/vllm_omni/entrypoints/openai/serving_speech.py
@@ -364,6 +364,9 @@ def _get_uploaded_audio_data(self, voice_name: str) -> str | None:
     async def upload_voice(
         self, audio_file: UploadFile, consent: str, name: str, *, ref_text: str | None = None
     ) -> dict:
+        # Normalize ref_text: treat whitespace-only as absent
+        if ref_text is not None:
+            ref_text = ref_text.strip() or None
         # Validate file size (max 10MB)
         MAX_FILE_SIZE = 10 * 1024 * 1024  # 10MB
         audio_file.file.seek(0, 2)  # Seek to end
@@ -677,8 +680,8 @@ async def _resolve_ref_audio(self, ref_audio_str: str) -> tuple[list[float], int
                 f"Reference audio too short ({duration:.1f}s). "
                 "At least 1s of clear speech is required for speaker embedding."
             )
-        if duration > 20.0:
-            raise ValueError(f"Reference audio too long ({duration:.1f}s). Maximum 20s supported — use a shorter clip.")
+        if duration > 30.0:
+            raise ValueError(f"Reference audio too long ({duration:.1f}s). Maximum 30s supported — use a shorter clip.")
         return wav_np.tolist(), sr
 
     async def _generate_audio_chunks(self, generator, request_id: str, response_format: str = "pcm"):

From 9ef29d7408f634237963945a8c7645d7055cc5da Mon Sep 17 00:00:00 2001
From: JuanPZuluaga <juanz9312@gmail.com>
Date: Mon, 23 Mar 2026 06:33:28 +0000
Subject: [PATCH 09/11] add clone sample limit at voice upload, add ref_text in
 tests

Signed-off-by: JuanPZuluaga <juanz9312@gmail.com>
---
 .../openai_api/test_serving_speech.py         |  1 +
 .../entrypoints/openai/serving_speech.py      | 21 ++++++++++++++++++-
 .../models/qwen3_tts/qwen3_tts_code2wav.py    |  2 +-
 3 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/tests/entrypoints/openai_api/test_serving_speech.py b/tests/entrypoints/openai_api/test_serving_speech.py
index 39f315a8d15..aa43d7c0682 100644
--- a/tests/entrypoints/openai_api/test_serving_speech.py
+++ b/tests/entrypoints/openai_api/test_serving_speech.py
@@ -369,6 +369,7 @@ def test_upload_voice_with_ref_text(self, client, tmp_path):
         result = response.json()
         assert result["success"] is True
         assert result["voice"]["name"] == "test_voice_rt"
+        assert result["voice"].get("ref_text") == "Hello world transcript"
         response = client.delete("/v1/audio/voices/test_voice_rt")
 
     def test_upload_voice_file_too_large(self, client):
diff --git a/vllm_omni/entrypoints/openai/serving_speech.py b/vllm_omni/entrypoints/openai/serving_speech.py
index f124ae942f0..786038ca3fc 100644
--- a/vllm_omni/entrypoints/openai/serving_speech.py
+++ b/vllm_omni/entrypoints/openai/serving_speech.py
@@ -1,5 +1,6 @@
 import asyncio
 import base64
+import io
 import json
 import math
 import os
@@ -10,6 +11,7 @@
 from typing import Any
 
 import numpy as np
+import soundfile as sf
 from fastapi import Request, UploadFile
 from fastapi.responses import Response, StreamingResponse
 from transformers.utils.hub import cached_file
@@ -441,10 +443,27 @@ async def upload_voice(
         if not _validate_path_within_directory(file_path, self.uploaded_speakers_dir):
             raise ValueError("Invalid file path: potential path traversal attack detected")
 
+        # Read content and validate duration before saving
+        content = await audio_file.read()
+        try:
+            wav_np, sr = sf.read(io.BytesIO(content))
+            duration = len(wav_np) / sr if sr > 0 else 0.0
+            if duration < 1.0:
+                raise ValueError(
+                    f"Reference audio too short ({duration:.1f}s). At least 1s of clear speech is required."
+                )
+            if duration > 30.0:
+                raise ValueError(
+                    f"Reference audio too long ({duration:.1f}s). Maximum 30s supported — use a shorter clip."
+                )
+        except ValueError:
+            raise
+        except Exception as e:
+            logger.warning("Could not validate audio duration: %s", e)
+
         # Save audio file
         try:
             with open(file_path, "wb") as f:
-                content = await audio_file.read()
                 f.write(content)
         except Exception as e:
             raise ValueError(f"Failed to save audio file: {e}")
diff --git a/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_code2wav.py b/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_code2wav.py
index a22ce8488b4..2a7c3378cf1 100644
--- a/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_code2wav.py
+++ b/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_code2wav.py
@@ -228,7 +228,7 @@ def forward(
             flat = req_ids
             n = flat.numel()
             if n == 0 or n % q != 0:
-                if n > 1:
+                if n > 0:
                     logger.warning(
                         "Code2Wav input_ids length %d not divisible by num_quantizers %d; skipping malformed request.",
                         n,

From 55a27c1af5e2fb1ca3212e81e2983117839159c6 Mon Sep 17 00:00:00 2001
From: JuanPZuluaga <juanz9312@gmail.com>
Date: Mon, 23 Mar 2026 06:48:36 +0000
Subject: [PATCH 10/11] added min/max global and add check in voice_upload

Signed-off-by: JuanPZuluaga <juanz9312@gmail.com>
---
 .../entrypoints/openai/serving_speech.py      | 23 ++++++++++++-------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/vllm_omni/entrypoints/openai/serving_speech.py b/vllm_omni/entrypoints/openai/serving_speech.py
index 786038ca3fc..f35341bac09 100644
--- a/vllm_omni/entrypoints/openai/serving_speech.py
+++ b/vllm_omni/entrypoints/openai/serving_speech.py
@@ -49,6 +49,8 @@
     "Spanish",
     "Italian",
 }
+_REF_AUDIO_MIN_DURATION = 1.0  # seconds
+_REF_AUDIO_MAX_DURATION = 30.0  # seconds
 _TTS_MAX_INSTRUCTIONS_LENGTH = 500
 _TTS_MAX_NEW_TOKENS_MIN = 1
 _TTS_MAX_NEW_TOKENS_MAX = 4096
@@ -448,13 +450,15 @@ async def upload_voice(
         try:
             wav_np, sr = sf.read(io.BytesIO(content))
             duration = len(wav_np) / sr if sr > 0 else 0.0
-            if duration < 1.0:
+            if duration < _REF_AUDIO_MIN_DURATION:
                 raise ValueError(
-                    f"Reference audio too short ({duration:.1f}s). At least 1s of clear speech is required."
+                    f"Reference audio too short ({duration:.1f}s). "
+                    f"At least {_REF_AUDIO_MIN_DURATION:.0f}s of clear speech is required."
                 )
-            if duration > 30.0:
+            if duration > _REF_AUDIO_MAX_DURATION:
                 raise ValueError(
-                    f"Reference audio too long ({duration:.1f}s). Maximum 30s supported — use a shorter clip."
+                    f"Reference audio too long ({duration:.1f}s). "
+                    f"Maximum {_REF_AUDIO_MAX_DURATION:.0f}s supported — use a shorter clip."
                 )
         except ValueError:
             raise
@@ -695,13 +699,16 @@ async def _resolve_ref_audio(self, ref_audio_str: str) -> tuple[list[float], int
             wav_np = np.mean(wav_np, axis=-1)
         sr = int(sr)
         duration = len(wav_np) / sr if sr > 0 else 0.0
-        if duration < 1.0:
+        if duration < _REF_AUDIO_MIN_DURATION:
             raise ValueError(
                 f"Reference audio too short ({duration:.1f}s). "
-                "At least 1s of clear speech is required for speaker embedding."
+                f"At least {_REF_AUDIO_MIN_DURATION:.0f}s of clear speech is required."
+            )
+        if duration > _REF_AUDIO_MAX_DURATION:
+            raise ValueError(
+                f"Reference audio too long ({duration:.1f}s). "
+                f"Maximum {_REF_AUDIO_MAX_DURATION:.0f}s supported — use a shorter clip."
             )
-        if duration > 30.0:
-            raise ValueError(f"Reference audio too long ({duration:.1f}s). Maximum 30s supported — use a shorter clip.")
         return wav_np.tolist(), sr
 
     async def _generate_audio_chunks(self, generator, request_id: str, response_format: str = "pcm"):

From 9a4552a6c09be3e887602eddf2ed06f064417e23 Mon Sep 17 00:00:00 2001
From: JuanPZuluaga <juanz9312@gmail.com>
Date: Thu, 26 Mar 2026 14:02:36 +0000
Subject: [PATCH 11/11] fix ci: include ref_text in upload_voice response

Signed-off-by: JuanPZuluaga <juanz9312@gmail.com>
---
 vllm_omni/entrypoints/openai/serving_speech.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/vllm_omni/entrypoints/openai/serving_speech.py b/vllm_omni/entrypoints/openai/serving_speech.py
index 16d57e15f17..9d3a0fe7889 100644
--- a/vllm_omni/entrypoints/openai/serving_speech.py
+++ b/vllm_omni/entrypoints/openai/serving_speech.py
@@ -580,13 +580,16 @@ async def upload_voice(
         logger.info(f"Uploaded new voice '{name}' with consent ID '{consent}'")
 
         # Return voice information without exposing the server file path
-        return {
+        result = {
             "name": name,
             "consent": consent,
             "created_at": timestamp,
             "mime_type": mime_type,
             "file_size": file_size,
         }
+        if ref_text is not None:
+            result["ref_text"] = ref_text
+        return result
 
     async def upload_voice_embedding(self, embedding_json: str, consent: str, name: str) -> dict:
         """Upload a voice from a pre-computed speaker embedding.