vllm-project · linyueqian · Mar 26, 2026 · Mar 20, 2026 · Mar 20, 2026 · Mar 20, 2026
@@ -160,6 +160,7 @@ Upload a new voice sample for voice cloning in Base task TTS requests.
 | `audio_sample` | file | Yes | Audio file (max 10MB, supported formats: wav, mp3, flac, ogg, aac, webm, mp4) |
 | `consent` | string | Yes | Consent recording ID |
 | `name` | string | Yes | Name for the new voice |
+| `ref_text` | string | No | Transcript of the audio. When provided, enables in-context voice cloning (higher quality). Without it, only the speaker embedding is extracted. |
 
 **Response Example:**
 
@@ -182,7 +183,8 @@ Upload a new voice sample for voice cloning in Base task TTS requests.
 curl -X POST http://localhost:8091/v1/audio/voices \
   -F "audio_sample=@/path/to/voice_sample.wav" \
   -F "consent=user_consent_id" \
-  -F "name=custom_voice_1"
+  -F "name=custom_voice_1" \
+  -F "ref_text=The exact transcript of the audio sample."
 ```
 
 ## Streaming Text Input (WebSocket)
@@ -318,20 +320,30 @@ curl -X POST http://localhost:8091/v1/audio/speech \
 ```
 
 ### Upload Voice
+
+Upload voice (speaker embedding only):
 ```bash
 curl -X POST http://localhost:8091/v1/audio/voices \
   -F "audio_sample=@/path/to/voice_sample.wav" \
   -F "consent=user_consent_id" \
   -F "name=custom_voice_1"
 ```
 
+Upload voice with transcript (in-context cloning, higher quality):
+```bash
+curl -X POST http://localhost:8091/v1/audio/voices \
+  -F "audio_sample=@/path/to/voice_sample.wav" \
+  -F "consent=user_consent_id" \
+  -F "name=custom_voice_2" \
+  -F "ref_text=The exact transcript of the audio sample."
+```
+
 ### Use Uploaded Voice
 ```bash
 curl -X POST http://localhost:8091/v1/audio/speech \
     -H "Content-Type: application/json" \
     -d '{
         "input": "Hello, this is a cloned voice",
-        "task_type": "Base",
         "voice": "custom_voice_1"
     }' --output cloned.wav
 ```

@@ -242,6 +242,7 @@ Upload a new voice sample for voice cloning in Base task TTS requests.
 - `audio_sample` (required): Audio file (max 10MB, supported formats: wav, mp3, flac, ogg, aac, webm, mp4)
 - `consent` (required): Consent recording ID
 - `name` (required): Name for the new voice
+- `ref_text` (optional): Transcript of the audio. When provided, enables in-context voice cloning (higher quality). Without it, only the speaker embedding is extracted.
 
 **Response Example:**
 ```json
@@ -262,7 +263,8 @@ Upload a new voice sample for voice cloning in Base task TTS requests.
 curl -X POST http://localhost:8000/v1/audio/voices \
   -F "audio_sample=@/path/to/voice_sample.wav" \
   -F "consent=user_consent_id" \
-  -F "name=custom_voice_1"
+  -F "name=custom_voice_1" \
+  -F "ref_text=The exact transcript of the audio sample."
 ```
 
 ### Endpoint

@@ -232,6 +232,7 @@ Upload a new voice sample for voice cloning in Base task TTS requests.
 - `audio_sample` (required): Audio file (max 10MB, supported formats: wav, mp3, flac, ogg, aac, webm, mp4)
 - `consent` (required): Consent recording ID
 - `name` (required): Name for the new voice
+- `ref_text` (optional): Transcript of the audio. When provided, enables in-context voice cloning (higher quality). Without it, only the speaker embedding is extracted.
 
 **Response Example:**
 ```json
@@ -252,7 +253,8 @@ Upload a new voice sample for voice cloning in Base task TTS requests.
 curl -X POST http://localhost:8000/v1/audio/voices \
   -F "audio_sample=@/path/to/voice_sample.wav" \
   -F "consent=user_consent_id" \
-  -F "name=custom_voice_1"
+  -F "name=custom_voice_1" \
+  -F "ref_text=The exact transcript of the audio sample."
 ```
 
 ### Endpoint

@@ -250,14 +250,15 @@ async def upload_voice(
         speaker_embedding: str | None = Form(None),
         consent: str = Form(...),
         name: str = Form(...),
+        ref_text: str = Form(None),
     ):
         try:
             if speaker_embedding is not None and audio_sample is not None:
                 raise ValueError("'audio_sample' and 'speaker_embedding' are mutually exclusive")
             if speaker_embedding is not None:
                 result = await speech_server.upload_voice_embedding(speaker_embedding, consent, name)
             elif audio_sample is not None:
-                result = await speech_server.upload_voice(audio_sample, consent, name)
+                result = await speech_server.upload_voice(audio_sample, consent, name, ref_text=ref_text)
             else:
                 raise ValueError("Either 'audio_sample' or 'speaker_embedding' must be provided")
             return {"success": True, "voice": result}
@@ -361,30 +362,36 @@ def test_list_voices_endpoint(self, client):
         assert "voices" in response.json()
 
     def test_upload_voice_success(self, client, tmp_path):
-        """Test successful voice upload."""
-        # Create a mock audio file
-        audio_content = b"fake audio content" * 1000  # ~17KB
-        files = {
-            "audio_sample": ("test.wav", audio_content, "audio/wav"),
-        }
-        data = {
-            "consent": "user_consent_123",
-            "name": "test_voice",
-        }
+        """Test successful voice upload without ref_text."""
+        audio_content = b"fake audio content" * 1000
+        files = {"audio_sample": ("test.wav", audio_content, "audio/wav")}
+        data = {"consent": "user_consent_123", "name": "test_voice"}
 
         response = client.post("/v1/audio/voices", files=files, data=data)
         assert response.status_code == 200
         result = response.json()
         assert result["success"] is True
-        assert "voice" in result
         voice_info = result["voice"]
         assert voice_info["name"] == "test_voice"
         assert voice_info["consent"] == "user_consent_123"
-        assert "created_at" in voice_info
         assert voice_info["mime_type"] == "audio/wav"
         assert voice_info["file_size"] == len(audio_content)
         response = client.delete("/v1/audio/voices/test_voice")
 
+    def test_upload_voice_with_ref_text(self, client, tmp_path):
+        """Test voice upload with ref_text enables in-context cloning."""
+        audio_content = b"fake audio content" * 1000
+        files = {"audio_sample": ("test.wav", audio_content, "audio/wav")}
+        data = {"consent": "c1", "name": "test_voice_rt", "ref_text": "Hello world transcript"}
+
+        response = client.post("/v1/audio/voices", files=files, data=data)
+        assert response.status_code == 200
+        result = response.json()
+        assert result["success"] is True
+        assert result["voice"]["name"] == "test_voice_rt"
+        assert result["voice"].get("ref_text") == "Hello world transcript"
+        response = client.delete("/v1/audio/voices/test_voice_rt")
+
     def test_upload_voice_file_too_large(self, client):
         """Test voice upload with file exceeding size limit."""
         # Create a file larger than 10MB
@@ -831,31 +838,48 @@ def test_load_supported_speakers(self, mocker: MockerFixture):
         assert server.supported_speakers == {"ryan", "vivian", "aiden"}
 
     def test_build_tts_params_with_uploaded_voice(self, speech_server):
-        """Test _build_tts_params auto-sets ref_audio for uploaded voices."""
-        # Mock an uploaded speaker
+        """Test _build_tts_params auto-sets ref_audio for uploaded voices (x_vector only)."""
         speech_server.uploaded_speakers = {
             "custom_voice": {
                 "name": "custom_voice",
                 "file_path": "/tmp/voice_samples/custom_voice_consent_123.wav",
                 "mime_type": "audio/wav",
+                "ref_text": None,
             }
         }
         speech_server.supported_speakers = {"ryan", "vivian", "custom_voice"}
 
-        # Mock _get_uploaded_audio_data to return base64 data
         with patch.object(speech_server, "_get_uploaded_audio_data") as mock_get_audio:
             mock_get_audio.return_value = "data:audio/wav;base64,ZmFrZWF1ZGlv"
+            req = OpenAICreateSpeechRequest(input="Hello", voice="custom_voice")
+            params = speech_server._build_tts_params(req)
 
-            req = OpenAICreateSpeechRequest(input="Hello", voice="custom_voice", task_type="Base")
+            assert params["ref_audio"] == ["data:audio/wav;base64,ZmFrZWF1ZGlv"]
+            assert params["x_vector_only_mode"] == [True]
+            assert params["task_type"] == ["Base"]
+            assert "ref_text" not in params
+
+    def test_build_tts_params_with_uploaded_voice_ref_text(self, speech_server):
+        """Test _build_tts_params enables in-context cloning when ref_text is stored."""
+        speech_server.uploaded_speakers = {
+            "custom_voice": {
+                "name": "custom_voice",
+                "file_path": "/tmp/voice_samples/custom_voice_consent_123.wav",
+                "mime_type": "audio/wav",
+                "ref_text": "Hello world transcript",
+            }
+        }
+        speech_server.supported_speakers = {"ryan", "vivian", "custom_voice"}
 
+        with patch.object(speech_server, "_get_uploaded_audio_data") as mock_get_audio:
+            mock_get_audio.return_value = "data:audio/wav;base64,ZmFrZWF1ZGlv"
+            req = OpenAICreateSpeechRequest(input="Hello", voice="custom_voice")
             params = speech_server._build_tts_params(req)
 
-            # Verify ref_audio was auto-set
-            assert "ref_audio" in params
             assert params["ref_audio"] == ["data:audio/wav;base64,ZmFrZWF1ZGlv"]
-            assert "x_vector_only_mode" in params
-            assert params["x_vector_only_mode"] == [True]
-            mock_get_audio.assert_called_once_with("custom_voice")
+            assert params["x_vector_only_mode"] == [False]
+            assert params["task_type"] == ["Base"]
+            assert params["ref_text"] == ["Hello world transcript"]
 
     def test_build_tts_params_without_uploaded_voice(self, speech_server):
         """Test _build_tts_params does not auto-set ref_audio for non-uploaded voices."""

@@ -1046,6 +1046,7 @@ async def upload_voice(
     speaker_embedding: str | None = Form(None),
     consent: str = Form(...),
     name: str = Form(...),
+    ref_text: str = Form(None),
 ):
     """Upload a new voice for voice cloning.
 
@@ -1081,7 +1082,7 @@ async def upload_voice(
         if speaker_embedding is not None:
             result = await handler.upload_voice_embedding(speaker_embedding, consent, name)
         elif audio_sample is not None:
-            result = await handler.upload_voice(audio_sample, consent, name)
+            result = await handler.upload_voice(audio_sample, consent, name, ref_text=ref_text)
         else:
             return base(raw_request).create_error_response(
                 message="Either 'audio_sample' or 'speaker_embedding' must be provided"

@@ -1,5 +1,6 @@
 import asyncio
 import base64
+import io
 import json
 import math
 import os
@@ -11,6 +12,7 @@
 from typing import Any
 
 import numpy as np
+import soundfile as sf
 import torch
 from fastapi import Request, UploadFile
 from fastapi.responses import Response, StreamingResponse
@@ -54,6 +56,8 @@
     "Spanish",
     "Italian",
 }
+_REF_AUDIO_MIN_DURATION = 1.0  # seconds
+_REF_AUDIO_MAX_DURATION = 30.0  # seconds
 _TTS_MAX_INSTRUCTIONS_LENGTH = 500
 _TTS_MAX_NEW_TOKENS_MIN = 1
 _TTS_MAX_NEW_TOKENS_MAX = 4096
@@ -437,8 +441,12 @@ def _get_uploaded_audio_data(self, voice_name: str) -> str | None:
             logger.error(f"Could not read audio file for voice {voice_name}: {e}")
             return None
 
-    async def upload_voice(self, audio_file: UploadFile, consent: str, name: str) -> dict:
-        """Upload a new voice sample."""
+    async def upload_voice(
+        self, audio_file: UploadFile, consent: str, name: str, *, ref_text: str | None = None
+    ) -> dict:
+        # Normalize ref_text: treat whitespace-only as absent
+        if ref_text is not None:
+            ref_text = ref_text.strip() or None
         # Validate file size (max 10MB)
         MAX_FILE_SIZE = 10 * 1024 * 1024  # 10MB
         audio_file.file.seek(0, 2)  # Seek to end
@@ -512,10 +520,29 @@ async def upload_voice(self, audio_file: UploadFile, consent: str, name: str) ->
         if not _validate_path_within_directory(file_path, self.uploaded_speakers_dir):
             raise ValueError("Invalid file path: potential path traversal attack detected")
 
+        # Read content and validate duration before saving
+        content = await audio_file.read()
+        try:
+            wav_np, sr = sf.read(io.BytesIO(content))
+            duration = len(wav_np) / sr if sr > 0 else 0.0
+            if duration < _REF_AUDIO_MIN_DURATION:
+                raise ValueError(
+                    f"Reference audio too short ({duration:.1f}s). "
+                    f"At least {_REF_AUDIO_MIN_DURATION:.0f}s of clear speech is required."
+                )
+            if duration > _REF_AUDIO_MAX_DURATION:
+                raise ValueError(
+                    f"Reference audio too long ({duration:.1f}s). "
+                    f"Maximum {_REF_AUDIO_MAX_DURATION:.0f}s supported — use a shorter clip."
+                )
+        except ValueError:
+            raise
+        except Exception as e:
+            logger.warning("Could not validate audio duration: %s", e)
+
         # Save audio file
         try:
             with open(file_path, "wb") as f:
-                content = await audio_file.read()
                 f.write(content)
         except Exception as e:
             raise ValueError(f"Failed to save audio file: {e}")
@@ -529,6 +556,7 @@ async def upload_voice(self, audio_file: UploadFile, consent: str, name: str) ->
             "mime_type": mime_type,
             "original_filename": audio_file.filename,
             "file_size": file_size,
+            "ref_text": ref_text,
             "cache_status": "pending",  # The initial cache state is pending.
             "cache_file": None,  # The initial cache file is empty.
             "cache_generated_at": None,  # The initial cache generation time is empty.
@@ -552,13 +580,16 @@ async def upload_voice(self, audio_file: UploadFile, consent: str, name: str) ->
         logger.info(f"Uploaded new voice '{name}' with consent ID '{consent}'")
 
         # Return voice information without exposing the server file path
-        return {
+        result = {
             "name": name,
             "consent": consent,
             "created_at": timestamp,
             "mime_type": mime_type,
             "file_size": file_size,
         }
+        if ref_text is not None:
+            result["ref_text"] = ref_text
+        return result
 
     async def upload_voice_embedding(self, embedding_json: str, consent: str, name: str) -> dict:
         """Upload a voice from a pre-computed speaker embedding.
@@ -863,7 +894,19 @@ async def _resolve_ref_audio(self, ref_audio_str: str) -> tuple[list[float], int
         wav_np = np.asarray(wav_np, dtype=np.float32)
         if wav_np.ndim > 1:
             wav_np = np.mean(wav_np, axis=-1)
-        return wav_np.tolist(), int(sr)
+        sr = int(sr)
+        duration = len(wav_np) / sr if sr > 0 else 0.0
+        if duration < _REF_AUDIO_MIN_DURATION:
+            raise ValueError(
+                f"Reference audio too short ({duration:.1f}s). "
+                f"At least {_REF_AUDIO_MIN_DURATION:.0f}s of clear speech is required."
+            )
+        if duration > _REF_AUDIO_MAX_DURATION:
+            raise ValueError(
+                f"Reference audio too long ({duration:.1f}s). "
+                f"Maximum {_REF_AUDIO_MAX_DURATION:.0f}s supported — use a shorter clip."
+            )
+        return wav_np.tolist(), sr
 
     async def _generate_audio_chunks(self, generator, request_id: str, response_format: str = "pcm"):
         """Generate audio chunks for streaming response.
@@ -987,15 +1030,22 @@ def _build_tts_params(self, request: OpenAICreateSpeechRequest) -> dict[str, Any
         if request.voice is not None:
             params["speaker"] = [request.voice]
 
-            # If voice is an uploaded speaker and no ref_audio provided, auto-set it
+            # Uploaded voices use task_type="Base" (CustomVoice requires built-in spk_id).
+            # If ref_text was provided at upload time, use in-context cloning; otherwise x_vector only.
             if request.voice.lower() in self.uploaded_speakers and request.ref_audio is None:
                 audio_data = self._get_uploaded_audio_data(request.voice)
-                if audio_data:
-                    params["ref_audio"] = [audio_data]
-                    params["x_vector_only_mode"] = [True]
-                    logger.info(f"Auto-set ref_audio for uploaded voice: {request.voice}")
-                else:
+                if not audio_data:
                     raise ValueError(f"Audio file for uploaded voice '{request.voice}' is missing or corrupted")
+                speaker_info = self.uploaded_speakers[request.voice.lower()]
+                stored_ref_text = speaker_info.get("ref_text")
+                params["ref_audio"] = [audio_data]
+                params["task_type"] = ["Base"]
+                if stored_ref_text:
+                    params["ref_text"] = [stored_ref_text]
+                    params["x_vector_only_mode"] = [False]
+                else:
+                    params["x_vector_only_mode"] = [True]
+                logger.info("Auto-set ref_audio for uploaded voice: %s (icl=%s)", request.voice, bool(stored_ref_text))
 
         elif params["task_type"][0] == "CustomVoice":
             params["speaker"] = ["Vivian"]  # Default for CustomVoice
@@ -1162,8 +1212,14 @@ async def _prepare_speech_generation(
                 tts_params = {}
             else:
                 tts_params = self._build_tts_params(request)
-                if request.ref_audio is not None:
-                    wav_list, sr = await self._resolve_ref_audio(request.ref_audio)
+                # Resolve ref_audio (explicit or auto-set for uploaded voices)
+                # to [[wav_list, sr]] so the model doesn't re-decode base64.
+                ref_audio_source = request.ref_audio
+                if ref_audio_source is None and isinstance(tts_params.get("ref_audio"), list):
+                    # Uploaded voice: ref_audio was auto-set as [base64_data_url]
+                    ref_audio_source = tts_params["ref_audio"][0]
+                if ref_audio_source is not None and isinstance(ref_audio_source, str):
+                    wav_list, sr = await self._resolve_ref_audio(ref_audio_source)
                     tts_params["ref_audio"] = [[wav_list, sr]]
 
                 ph_len = self._estimate_prompt_len(tts_params)

@@ -236,8 +236,7 @@ def forward(
             if n == 0 or n % q != 0:
                 if n > 0:
                     logger.warning(
-                        "Code2Wav input_ids length %d not divisible by num_quantizers %d, "
-                        "likely a warmup run; returning empty audio.",
+                        "Code2Wav input_ids length %d not divisible by num_quantizers %d; skipping malformed request.",
                         n,
                         q,
                     )