diff --git a/docs/serving/speech_api.md b/docs/serving/speech_api.md index 6d033bc20ab..e6ab77edda8 100644 --- a/docs/serving/speech_api.md +++ b/docs/serving/speech_api.md @@ -160,6 +160,7 @@ Upload a new voice sample for voice cloning in Base task TTS requests. | `audio_sample` | file | Yes | Audio file (max 10MB, supported formats: wav, mp3, flac, ogg, aac, webm, mp4) | | `consent` | string | Yes | Consent recording ID | | `name` | string | Yes | Name for the new voice | +| `ref_text` | string | No | Transcript of the audio. When provided, enables in-context voice cloning (higher quality). Without it, only the speaker embedding is extracted. | **Response Example:** @@ -182,7 +183,8 @@ Upload a new voice sample for voice cloning in Base task TTS requests. curl -X POST http://localhost:8091/v1/audio/voices \ -F "audio_sample=@/path/to/voice_sample.wav" \ -F "consent=user_consent_id" \ - -F "name=custom_voice_1" + -F "name=custom_voice_1" \ + -F "ref_text=The exact transcript of the audio sample." ``` ## Streaming Text Input (WebSocket) @@ -318,6 +320,8 @@ curl -X POST http://localhost:8091/v1/audio/speech \ ``` ### Upload Voice + +Upload voice (speaker embedding only): ```bash curl -X POST http://localhost:8091/v1/audio/voices \ -F "audio_sample=@/path/to/voice_sample.wav" \ @@ -325,13 +329,21 @@ curl -X POST http://localhost:8091/v1/audio/voices \ -F "name=custom_voice_1" ``` +Upload voice with transcript (in-context cloning, higher quality): +```bash +curl -X POST http://localhost:8091/v1/audio/voices \ + -F "audio_sample=@/path/to/voice_sample.wav" \ + -F "consent=user_consent_id" \ + -F "name=custom_voice_2" \ + -F "ref_text=The exact transcript of the audio sample." +``` + ### Use Uploaded Voice ```bash curl -X POST http://localhost:8091/v1/audio/speech \ -H "Content-Type: application/json" \ -d '{ "input": "Hello, this is a cloned voice", - "task_type": "Base", "voice": "custom_voice_1" }' --output cloned.wav ``` diff --git a/docs/user_guide/examples/online_serving/qwen3_tts.md b/docs/user_guide/examples/online_serving/qwen3_tts.md index ee293131d93..03135085d43 100644 --- a/docs/user_guide/examples/online_serving/qwen3_tts.md +++ b/docs/user_guide/examples/online_serving/qwen3_tts.md @@ -242,6 +242,7 @@ Upload a new voice sample for voice cloning in Base task TTS requests. - `audio_sample` (required): Audio file (max 10MB, supported formats: wav, mp3, flac, ogg, aac, webm, mp4) - `consent` (required): Consent recording ID - `name` (required): Name for the new voice +- `ref_text` (optional): Transcript of the audio. Enables in-context voice cloning (higher quality). **Response Example:** ```json @@ -262,7 +263,8 @@ Upload a new voice sample for voice cloning in Base task TTS requests. curl -X POST http://localhost:8000/v1/audio/voices \ -F "audio_sample=@/path/to/voice_sample.wav" \ -F "consent=user_consent_id" \ - -F "name=custom_voice_1" + -F "name=custom_voice_1" \ + -F "ref_text=The exact transcript of the audio sample." ``` ### Endpoint diff --git a/examples/online_serving/qwen3_tts/README.md b/examples/online_serving/qwen3_tts/README.md index 1c86ffce816..1b51e00f122 100644 --- a/examples/online_serving/qwen3_tts/README.md +++ b/examples/online_serving/qwen3_tts/README.md @@ -232,6 +232,7 @@ Upload a new voice sample for voice cloning in Base task TTS requests. - `audio_sample` (required): Audio file (max 10MB, supported formats: wav, mp3, flac, ogg, aac, webm, mp4) - `consent` (required): Consent recording ID - `name` (required): Name for the new voice +- `ref_text` (optional): Transcript of the audio. Enables in-context voice cloning (higher quality). **Response Example:** ```json @@ -252,7 +253,8 @@ Upload a new voice sample for voice cloning in Base task TTS requests. curl -X POST http://localhost:8000/v1/audio/voices \ -F "audio_sample=@/path/to/voice_sample.wav" \ -F "consent=user_consent_id" \ - -F "name=custom_voice_1" + -F "name=custom_voice_1" \ + -F "ref_text=The exact transcript of the audio sample." ``` ### Endpoint diff --git a/tests/entrypoints/openai_api/test_serving_speech.py b/tests/entrypoints/openai_api/test_serving_speech.py index c84b501c26b..49d5e1a77dc 100644 --- a/tests/entrypoints/openai_api/test_serving_speech.py +++ b/tests/entrypoints/openai_api/test_serving_speech.py @@ -250,6 +250,7 @@ async def upload_voice( speaker_embedding: str | None = Form(None), consent: str = Form(...), name: str = Form(...), + ref_text: str = Form(None), ): try: if speaker_embedding is not None and audio_sample is not None: @@ -257,7 +258,7 @@ async def upload_voice( if speaker_embedding is not None: result = await speech_server.upload_voice_embedding(speaker_embedding, consent, name) elif audio_sample is not None: - result = await speech_server.upload_voice(audio_sample, consent, name) + result = await speech_server.upload_voice(audio_sample, consent, name, ref_text=ref_text) else: raise ValueError("Either 'audio_sample' or 'speaker_embedding' must be provided") return {"success": True, "voice": result} @@ -361,30 +362,36 @@ def test_list_voices_endpoint(self, client): assert "voices" in response.json() def test_upload_voice_success(self, client, tmp_path): - """Test successful voice upload.""" - # Create a mock audio file - audio_content = b"fake audio content" * 1000 # ~17KB - files = { - "audio_sample": ("test.wav", audio_content, "audio/wav"), - } - data = { - "consent": "user_consent_123", - "name": "test_voice", - } + """Test successful voice upload without ref_text.""" + audio_content = b"fake audio content" * 1000 + files = {"audio_sample": ("test.wav", audio_content, "audio/wav")} + data = {"consent": "user_consent_123", "name": "test_voice"} response = client.post("/v1/audio/voices", files=files, data=data) assert response.status_code == 200 result = response.json() assert result["success"] is True - assert "voice" in result voice_info = result["voice"] assert voice_info["name"] == "test_voice" assert voice_info["consent"] == "user_consent_123" - assert "created_at" in voice_info assert voice_info["mime_type"] == "audio/wav" assert voice_info["file_size"] == len(audio_content) response = client.delete("/v1/audio/voices/test_voice") + def test_upload_voice_with_ref_text(self, client, tmp_path): + """Test voice upload with ref_text enables in-context cloning.""" + audio_content = b"fake audio content" * 1000 + files = {"audio_sample": ("test.wav", audio_content, "audio/wav")} + data = {"consent": "c1", "name": "test_voice_rt", "ref_text": "Hello world transcript"} + + response = client.post("/v1/audio/voices", files=files, data=data) + assert response.status_code == 200 + result = response.json() + assert result["success"] is True + assert result["voice"]["name"] == "test_voice_rt" + assert result["voice"].get("ref_text") == "Hello world transcript" + response = client.delete("/v1/audio/voices/test_voice_rt") + def test_upload_voice_file_too_large(self, client): """Test voice upload with file exceeding size limit.""" # Create a file larger than 10MB @@ -831,31 +838,48 @@ def test_load_supported_speakers(self, mocker: MockerFixture): assert server.supported_speakers == {"ryan", "vivian", "aiden"} def test_build_tts_params_with_uploaded_voice(self, speech_server): - """Test _build_tts_params auto-sets ref_audio for uploaded voices.""" - # Mock an uploaded speaker + """Test _build_tts_params auto-sets ref_audio for uploaded voices (x_vector only).""" speech_server.uploaded_speakers = { "custom_voice": { "name": "custom_voice", "file_path": "/tmp/voice_samples/custom_voice_consent_123.wav", "mime_type": "audio/wav", + "ref_text": None, } } speech_server.supported_speakers = {"ryan", "vivian", "custom_voice"} - # Mock _get_uploaded_audio_data to return base64 data with patch.object(speech_server, "_get_uploaded_audio_data") as mock_get_audio: mock_get_audio.return_value = "data:audio/wav;base64,ZmFrZWF1ZGlv" + req = OpenAICreateSpeechRequest(input="Hello", voice="custom_voice") + params = speech_server._build_tts_params(req) - req = OpenAICreateSpeechRequest(input="Hello", voice="custom_voice", task_type="Base") + assert params["ref_audio"] == ["data:audio/wav;base64,ZmFrZWF1ZGlv"] + assert params["x_vector_only_mode"] == [True] + assert params["task_type"] == ["Base"] + assert "ref_text" not in params + + def test_build_tts_params_with_uploaded_voice_ref_text(self, speech_server): + """Test _build_tts_params enables in-context cloning when ref_text is stored.""" + speech_server.uploaded_speakers = { + "custom_voice": { + "name": "custom_voice", + "file_path": "/tmp/voice_samples/custom_voice_consent_123.wav", + "mime_type": "audio/wav", + "ref_text": "Hello world transcript", + } + } + speech_server.supported_speakers = {"ryan", "vivian", "custom_voice"} + with patch.object(speech_server, "_get_uploaded_audio_data") as mock_get_audio: + mock_get_audio.return_value = "data:audio/wav;base64,ZmFrZWF1ZGlv" + req = OpenAICreateSpeechRequest(input="Hello", voice="custom_voice") params = speech_server._build_tts_params(req) - # Verify ref_audio was auto-set - assert "ref_audio" in params assert params["ref_audio"] == ["data:audio/wav;base64,ZmFrZWF1ZGlv"] - assert "x_vector_only_mode" in params - assert params["x_vector_only_mode"] == [True] - mock_get_audio.assert_called_once_with("custom_voice") + assert params["x_vector_only_mode"] == [False] + assert params["task_type"] == ["Base"] + assert params["ref_text"] == ["Hello world transcript"] def test_build_tts_params_without_uploaded_voice(self, speech_server): """Test _build_tts_params does not auto-set ref_audio for non-uploaded voices.""" diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py index 717bc08ce40..c88a8144dd7 100644 --- a/vllm_omni/entrypoints/openai/api_server.py +++ b/vllm_omni/entrypoints/openai/api_server.py @@ -1046,6 +1046,7 @@ async def upload_voice( speaker_embedding: str | None = Form(None), consent: str = Form(...), name: str = Form(...), + ref_text: str = Form(None), ): """Upload a new voice for voice cloning. @@ -1081,7 +1082,7 @@ async def upload_voice( if speaker_embedding is not None: result = await handler.upload_voice_embedding(speaker_embedding, consent, name) elif audio_sample is not None: - result = await handler.upload_voice(audio_sample, consent, name) + result = await handler.upload_voice(audio_sample, consent, name, ref_text=ref_text) else: return base(raw_request).create_error_response( message="Either 'audio_sample' or 'speaker_embedding' must be provided" diff --git a/vllm_omni/entrypoints/openai/serving_speech.py b/vllm_omni/entrypoints/openai/serving_speech.py index 8d87bff00bd..9d3a0fe7889 100644 --- a/vllm_omni/entrypoints/openai/serving_speech.py +++ b/vllm_omni/entrypoints/openai/serving_speech.py @@ -1,5 +1,6 @@ import asyncio import base64 +import io import json import math import os @@ -11,6 +12,7 @@ from typing import Any import numpy as np +import soundfile as sf import torch from fastapi import Request, UploadFile from fastapi.responses import Response, StreamingResponse @@ -54,6 +56,8 @@ "Spanish", "Italian", } +_REF_AUDIO_MIN_DURATION = 1.0 # seconds +_REF_AUDIO_MAX_DURATION = 30.0 # seconds _TTS_MAX_INSTRUCTIONS_LENGTH = 500 _TTS_MAX_NEW_TOKENS_MIN = 1 _TTS_MAX_NEW_TOKENS_MAX = 4096 @@ -437,8 +441,12 @@ def _get_uploaded_audio_data(self, voice_name: str) -> str | None: logger.error(f"Could not read audio file for voice {voice_name}: {e}") return None - async def upload_voice(self, audio_file: UploadFile, consent: str, name: str) -> dict: - """Upload a new voice sample.""" + async def upload_voice( + self, audio_file: UploadFile, consent: str, name: str, *, ref_text: str | None = None + ) -> dict: + # Normalize ref_text: treat whitespace-only as absent + if ref_text is not None: + ref_text = ref_text.strip() or None # Validate file size (max 10MB) MAX_FILE_SIZE = 10 * 1024 * 1024 # 10MB audio_file.file.seek(0, 2) # Seek to end @@ -512,10 +520,29 @@ async def upload_voice(self, audio_file: UploadFile, consent: str, name: str) -> if not _validate_path_within_directory(file_path, self.uploaded_speakers_dir): raise ValueError("Invalid file path: potential path traversal attack detected") + # Read content and validate duration before saving + content = await audio_file.read() + try: + wav_np, sr = sf.read(io.BytesIO(content)) + duration = len(wav_np) / sr if sr > 0 else 0.0 + if duration < _REF_AUDIO_MIN_DURATION: + raise ValueError( + f"Reference audio too short ({duration:.1f}s). " + f"At least {_REF_AUDIO_MIN_DURATION:.0f}s of clear speech is required." + ) + if duration > _REF_AUDIO_MAX_DURATION: + raise ValueError( + f"Reference audio too long ({duration:.1f}s). " + f"Maximum {_REF_AUDIO_MAX_DURATION:.0f}s supported — use a shorter clip." + ) + except ValueError: + raise + except Exception as e: + logger.warning("Could not validate audio duration: %s", e) + # Save audio file try: with open(file_path, "wb") as f: - content = await audio_file.read() f.write(content) except Exception as e: raise ValueError(f"Failed to save audio file: {e}") @@ -529,6 +556,7 @@ async def upload_voice(self, audio_file: UploadFile, consent: str, name: str) -> "mime_type": mime_type, "original_filename": audio_file.filename, "file_size": file_size, + "ref_text": ref_text, "cache_status": "pending", # The initial cache state is pending. "cache_file": None, # The initial cache file is empty. "cache_generated_at": None, # The initial cache generation time is empty. @@ -552,13 +580,16 @@ async def upload_voice(self, audio_file: UploadFile, consent: str, name: str) -> logger.info(f"Uploaded new voice '{name}' with consent ID '{consent}'") # Return voice information without exposing the server file path - return { + result = { "name": name, "consent": consent, "created_at": timestamp, "mime_type": mime_type, "file_size": file_size, } + if ref_text is not None: + result["ref_text"] = ref_text + return result async def upload_voice_embedding(self, embedding_json: str, consent: str, name: str) -> dict: """Upload a voice from a pre-computed speaker embedding. @@ -863,7 +894,19 @@ async def _resolve_ref_audio(self, ref_audio_str: str) -> tuple[list[float], int wav_np = np.asarray(wav_np, dtype=np.float32) if wav_np.ndim > 1: wav_np = np.mean(wav_np, axis=-1) - return wav_np.tolist(), int(sr) + sr = int(sr) + duration = len(wav_np) / sr if sr > 0 else 0.0 + if duration < _REF_AUDIO_MIN_DURATION: + raise ValueError( + f"Reference audio too short ({duration:.1f}s). " + f"At least {_REF_AUDIO_MIN_DURATION:.0f}s of clear speech is required." + ) + if duration > _REF_AUDIO_MAX_DURATION: + raise ValueError( + f"Reference audio too long ({duration:.1f}s). " + f"Maximum {_REF_AUDIO_MAX_DURATION:.0f}s supported — use a shorter clip." + ) + return wav_np.tolist(), sr async def _generate_audio_chunks(self, generator, request_id: str, response_format: str = "pcm"): """Generate audio chunks for streaming response. @@ -987,15 +1030,22 @@ def _build_tts_params(self, request: OpenAICreateSpeechRequest) -> dict[str, Any if request.voice is not None: params["speaker"] = [request.voice] - # If voice is an uploaded speaker and no ref_audio provided, auto-set it + # Uploaded voices use task_type="Base" (CustomVoice requires built-in spk_id). + # If ref_text was provided at upload time, use in-context cloning; otherwise x_vector only. if request.voice.lower() in self.uploaded_speakers and request.ref_audio is None: audio_data = self._get_uploaded_audio_data(request.voice) - if audio_data: - params["ref_audio"] = [audio_data] - params["x_vector_only_mode"] = [True] - logger.info(f"Auto-set ref_audio for uploaded voice: {request.voice}") - else: + if not audio_data: raise ValueError(f"Audio file for uploaded voice '{request.voice}' is missing or corrupted") + speaker_info = self.uploaded_speakers[request.voice.lower()] + stored_ref_text = speaker_info.get("ref_text") + params["ref_audio"] = [audio_data] + params["task_type"] = ["Base"] + if stored_ref_text: + params["ref_text"] = [stored_ref_text] + params["x_vector_only_mode"] = [False] + else: + params["x_vector_only_mode"] = [True] + logger.info("Auto-set ref_audio for uploaded voice: %s (icl=%s)", request.voice, bool(stored_ref_text)) elif params["task_type"][0] == "CustomVoice": params["speaker"] = ["Vivian"] # Default for CustomVoice @@ -1162,8 +1212,14 @@ async def _prepare_speech_generation( tts_params = {} else: tts_params = self._build_tts_params(request) - if request.ref_audio is not None: - wav_list, sr = await self._resolve_ref_audio(request.ref_audio) + # Resolve ref_audio (explicit or auto-set for uploaded voices) + # to [[wav_list, sr]] so the model doesn't re-decode base64. + ref_audio_source = request.ref_audio + if ref_audio_source is None and isinstance(tts_params.get("ref_audio"), list): + # Uploaded voice: ref_audio was auto-set as [base64_data_url] + ref_audio_source = tts_params["ref_audio"][0] + if ref_audio_source is not None and isinstance(ref_audio_source, str): + wav_list, sr = await self._resolve_ref_audio(ref_audio_source) tts_params["ref_audio"] = [[wav_list, sr]] ph_len = self._estimate_prompt_len(tts_params) diff --git a/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_code2wav.py b/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_code2wav.py index c75f6e52e8f..f6ac91a994f 100644 --- a/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_code2wav.py +++ b/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_code2wav.py @@ -236,8 +236,7 @@ def forward( if n == 0 or n % q != 0: if n > 0: logger.warning( - "Code2Wav input_ids length %d not divisible by num_quantizers %d, " - "likely a warmup run; returning empty audio.", + "Code2Wav input_ids length %d not divisible by num_quantizers %d; skipping malformed request.", n, q, )