From db0ef930f0ca4c02776833760ab28bb9936e4377 Mon Sep 17 00:00:00 2001 From: JuanPZuluaga Date: Fri, 20 Mar 2026 09:17:14 +0000 Subject: [PATCH 01/11] support ref_text Signed-off-by: JuanPZuluaga --- vllm_omni/entrypoints/openai/api_server.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py index c3c250fda7f..5abd8b79a7b 100644 --- a/vllm_omni/entrypoints/openai/api_server.py +++ b/vllm_omni/entrypoints/openai/api_server.py @@ -978,6 +978,7 @@ async def upload_voice( audio_sample: UploadFile = File(...), consent: str = Form(...), name: str = Form(...), + ref_text: str = Form(None), ): """Upload a new voice sample for voice cloning. @@ -999,8 +1000,7 @@ async def upload_voice( return base(raw_request).create_error_response(message="The model does not support Speech API") try: - # Upload the voice - result = await handler.upload_voice(audio_sample, consent, name) + result = await handler.upload_voice(audio_sample, consent, name, ref_text=ref_text) return JSONResponse(content={"success": True, "voice": result}) From 0accaed50bbc9f7d0fea547813c98599806cf32d Mon Sep 17 00:00:00 2001 From: JuanPZuluaga Date: Fri, 20 Mar 2026 09:17:49 +0000 Subject: [PATCH 02/11] support ref_text in serving speech Signed-off-by: JuanPZuluaga --- .../entrypoints/openai/serving_speech.py | 52 ++++++++++++++----- 1 file changed, 38 insertions(+), 14 deletions(-) diff --git a/vllm_omni/entrypoints/openai/serving_speech.py b/vllm_omni/entrypoints/openai/serving_speech.py index 5ece2e7501d..7cf5d032efd 100644 --- a/vllm_omni/entrypoints/openai/serving_speech.py +++ b/vllm_omni/entrypoints/openai/serving_speech.py @@ -361,8 +361,9 @@ def _get_uploaded_audio_data(self, voice_name: str) -> str | None: logger.error(f"Could not read audio file for voice {voice_name}: {e}") return None - async def upload_voice(self, audio_file: UploadFile, consent: str, name: str) -> dict: - """Upload a new voice sample.""" + async def upload_voice( + self, audio_file: UploadFile, consent: str, name: str, *, ref_text: str | None = None + ) -> dict: # Validate file size (max 10MB) MAX_FILE_SIZE = 10 * 1024 * 1024 # 10MB audio_file.file.seek(0, 2) # Seek to end @@ -453,9 +454,10 @@ async def upload_voice(self, audio_file: UploadFile, consent: str, name: str) -> "mime_type": mime_type, "original_filename": audio_file.filename, "file_size": file_size, - "cache_status": "pending", # The initial cache state is pending. - "cache_file": None, # The initial cache file is empty. - "cache_generated_at": None, # The initial cache generation time is empty. + "ref_text": ref_text, + "cache_status": "pending", + "cache_file": None, + "cache_generated_at": None, } # Save metadata using metadata manager (concurrency safe) @@ -668,7 +670,16 @@ async def _resolve_ref_audio(self, ref_audio_str: str) -> tuple[list[float], int wav_np = np.asarray(wav_np, dtype=np.float32) if wav_np.ndim > 1: wav_np = np.mean(wav_np, axis=-1) - return wav_np.tolist(), int(sr) + sr = int(sr) + duration = len(wav_np) / sr if sr > 0 else 0.0 + if duration < 1.0: + raise ValueError( + f"Reference audio too short ({duration:.1f}s). " + "At least 1s of clear speech is required for speaker embedding." + ) + if duration > 20.0: + raise ValueError(f"Reference audio too long ({duration:.1f}s). Maximum 20s supported — use a shorter clip.") + return wav_np.tolist(), sr async def _generate_audio_chunks(self, generator, request_id: str, response_format: str = "pcm"): """Generate audio chunks for streaming response. @@ -792,15 +803,22 @@ def _build_tts_params(self, request: OpenAICreateSpeechRequest) -> dict[str, Any if request.voice is not None: params["speaker"] = [request.voice] - # If voice is an uploaded speaker and no ref_audio provided, auto-set it + # Uploaded voices use task_type="Base" (CustomVoice requires built-in spk_id). + # If ref_text was provided at upload time, use in-context cloning; otherwise x_vector only. if request.voice.lower() in self.uploaded_speakers and request.ref_audio is None: audio_data = self._get_uploaded_audio_data(request.voice) - if audio_data: - params["ref_audio"] = [audio_data] - params["x_vector_only_mode"] = [True] - logger.info(f"Auto-set ref_audio for uploaded voice: {request.voice}") - else: + if not audio_data: raise ValueError(f"Audio file for uploaded voice '{request.voice}' is missing or corrupted") + speaker_info = self.uploaded_speakers[request.voice.lower()] + stored_ref_text = speaker_info.get("ref_text") + params["ref_audio"] = [audio_data] + params["task_type"] = ["Base"] + if stored_ref_text: + params["ref_text"] = [stored_ref_text] + params["x_vector_only_mode"] = [False] + else: + params["x_vector_only_mode"] = [True] + logger.info("Auto-set ref_audio for uploaded voice: %s (icl=%s)", request.voice, bool(stored_ref_text)) elif params["task_type"][0] == "CustomVoice": params["speaker"] = ["Vivian"] # Default for CustomVoice @@ -970,8 +988,14 @@ async def _prepare_speech_generation( tts_params = {} else: tts_params = self._build_tts_params(request) - if request.ref_audio is not None: - wav_list, sr = await self._resolve_ref_audio(request.ref_audio) + # Resolve ref_audio (explicit or auto-set for uploaded voices) + # to [[wav_list, sr]] so the model doesn't re-decode base64. + ref_audio_source = request.ref_audio + if ref_audio_source is None and isinstance(tts_params.get("ref_audio"), list): + # Uploaded voice: ref_audio was auto-set as [base64_data_url] + ref_audio_source = tts_params["ref_audio"][0] + if ref_audio_source is not None and isinstance(ref_audio_source, str): + wav_list, sr = await self._resolve_ref_audio(ref_audio_source) tts_params["ref_audio"] = [[wav_list, sr]] ph_len = self._estimate_prompt_len(tts_params) From 0b89a269ba18705e1a2de8f63a8c6a7a0262314b Mon Sep 17 00:00:00 2001 From: JuanPZuluaga Date: Fri, 20 Mar 2026 09:18:23 +0000 Subject: [PATCH 03/11] move log to warning of code2wav Signed-off-by: JuanPZuluaga --- .../model_executor/models/qwen3_tts/qwen3_tts_code2wav.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_code2wav.py b/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_code2wav.py index 6be039df105..a22ce8488b4 100644 --- a/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_code2wav.py +++ b/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_code2wav.py @@ -228,10 +228,9 @@ def forward( flat = req_ids n = flat.numel() if n == 0 or n % q != 0: - if n > 0: + if n > 1: logger.warning( - "Code2Wav input_ids length %d not divisible by num_quantizers %d, " - "likely a warmup run; returning empty audio.", + "Code2Wav input_ids length %d not divisible by num_quantizers %d; skipping malformed request.", n, q, ) From e4508b17a3b767b68c2abdae7607b16374082d9d Mon Sep 17 00:00:00 2001 From: JuanPZuluaga Date: Fri, 20 Mar 2026 09:18:53 +0000 Subject: [PATCH 04/11] add to docs voice upload ref_text Signed-off-by: JuanPZuluaga --- docs/serving/speech_api.md | 18 ++++++++++++++---- .../examples/online_serving/qwen3_tts.md | 4 +++- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/docs/serving/speech_api.md b/docs/serving/speech_api.md index 17787e682d0..67374f6ede4 100644 --- a/docs/serving/speech_api.md +++ b/docs/serving/speech_api.md @@ -160,6 +160,7 @@ Upload a new voice sample for voice cloning in Base task TTS requests. | `audio_sample` | file | Yes | Audio file (max 10MB, supported formats: wav, mp3, flac, ogg, aac, webm, mp4) | | `consent` | string | Yes | Consent recording ID | | `name` | string | Yes | Name for the new voice | +| `ref_text` | string | No | Transcript of the audio. When provided, enables in-context voice cloning (higher quality). Without it, only the speaker embedding is extracted. | **Response Example:** @@ -182,7 +183,8 @@ Upload a new voice sample for voice cloning in Base task TTS requests. curl -X POST http://localhost:8091/v1/audio/voices \ -F "audio_sample=@/path/to/voice_sample.wav" \ -F "consent=user_consent_id" \ - -F "name=custom_voice_1" + -F "name=custom_voice_1" \ + -F "ref_text=The exact transcript of the audio sample." ``` ## Streaming Text Input (WebSocket) @@ -317,7 +319,7 @@ curl -X POST http://localhost:8091/v1/audio/speech \ }' --output cloned.wav ``` -upload voice +upload voice (speaker embedding only) ```bash curl -X POST http://localhost:8091/v1/audio/voices \ -F "audio_sample=@/path/to/voice_sample.wav" \ @@ -325,13 +327,21 @@ curl -X POST http://localhost:8091/v1/audio/voices \ -F "name=custom_voice_1" ``` -use upload voice +upload voice with transcript (in-context cloning, higher quality) +```bash +curl -X POST http://localhost:8091/v1/audio/voices \ + -F "audio_sample=@/path/to/voice_sample.wav" \ + -F "consent=user_consent_id" \ + -F "name=custom_voice_2" \ + -F "ref_text=The exact transcript of the audio sample." +``` + +use uploaded voice ```bash curl -X POST http://localhost:8091/v1/audio/speech \ -H "Content-Type: application/json" \ -d '{ "input": "Hello, this is a cloned voice", - "task_type": "Base", "voice": "custom_voice_1" }' --output cloned.wav ``` diff --git a/docs/user_guide/examples/online_serving/qwen3_tts.md b/docs/user_guide/examples/online_serving/qwen3_tts.md index 993505b876e..bee3283a04a 100644 --- a/docs/user_guide/examples/online_serving/qwen3_tts.md +++ b/docs/user_guide/examples/online_serving/qwen3_tts.md @@ -242,6 +242,7 @@ Upload a new voice sample for voice cloning in Base task TTS requests. - `audio_sample` (required): Audio file (max 10MB, supported formats: wav, mp3, flac, ogg, aac, webm, mp4) - `consent` (required): Consent recording ID - `name` (required): Name for the new voice +- `ref_text` (optional): Transcript of the audio. Enables in-context voice cloning (higher quality). **Response Example:** ```json @@ -262,7 +263,8 @@ Upload a new voice sample for voice cloning in Base task TTS requests. curl -X POST http://localhost:8000/v1/audio/voices \ -F "audio_sample=@/path/to/voice_sample.wav" \ -F "consent=user_consent_id" \ - -F "name=custom_voice_1" + -F "name=custom_voice_1" \ + -F "ref_text=The exact transcript of the audio sample." ``` ### Endpoint From 75638c3555733b5ec2e36bf69f84cc0bd95187ab Mon Sep 17 00:00:00 2001 From: JuanPZuluaga Date: Fri, 20 Mar 2026 09:44:13 +0000 Subject: [PATCH 05/11] update readme and voice upload test Signed-off-by: JuanPZuluaga --- examples/online_serving/qwen3_tts/README.md | 4 +- .../openai_api/test_serving_speech.py | 64 +++++++++++++------ 2 files changed, 46 insertions(+), 22 deletions(-) diff --git a/examples/online_serving/qwen3_tts/README.md b/examples/online_serving/qwen3_tts/README.md index 4709c9d4218..2feef51aacf 100644 --- a/examples/online_serving/qwen3_tts/README.md +++ b/examples/online_serving/qwen3_tts/README.md @@ -239,6 +239,7 @@ Upload a new voice sample for voice cloning in Base task TTS requests. - `audio_sample` (required): Audio file (max 10MB, supported formats: wav, mp3, flac, ogg, aac, webm, mp4) - `consent` (required): Consent recording ID - `name` (required): Name for the new voice +- `ref_text` (optional): Transcript of the audio. Enables in-context voice cloning (higher quality). **Response Example:** ```json @@ -259,7 +260,8 @@ Upload a new voice sample for voice cloning in Base task TTS requests. curl -X POST http://localhost:8000/v1/audio/voices \ -F "audio_sample=@/path/to/voice_sample.wav" \ -F "consent=user_consent_id" \ - -F "name=custom_voice_1" + -F "name=custom_voice_1" \ + -F "ref_text=The exact transcript of the audio sample." ``` ### Endpoint diff --git a/tests/entrypoints/openai_api/test_serving_speech.py b/tests/entrypoints/openai_api/test_serving_speech.py index 67abd7617b7..e5435f3d7f7 100644 --- a/tests/entrypoints/openai_api/test_serving_speech.py +++ b/tests/entrypoints/openai_api/test_serving_speech.py @@ -337,30 +337,35 @@ def test_list_voices_endpoint(self, client): assert "voices" in response.json() def test_upload_voice_success(self, client, tmp_path): - """Test successful voice upload.""" - # Create a mock audio file - audio_content = b"fake audio content" * 1000 # ~17KB - files = { - "audio_sample": ("test.wav", audio_content, "audio/wav"), - } - data = { - "consent": "user_consent_123", - "name": "test_voice", - } + """Test successful voice upload without ref_text.""" + audio_content = b"fake audio content" * 1000 + files = {"audio_sample": ("test.wav", audio_content, "audio/wav")} + data = {"consent": "user_consent_123", "name": "test_voice"} response = client.post("/v1/audio/voices", files=files, data=data) assert response.status_code == 200 result = response.json() assert result["success"] is True - assert "voice" in result voice_info = result["voice"] assert voice_info["name"] == "test_voice" assert voice_info["consent"] == "user_consent_123" - assert "created_at" in voice_info assert voice_info["mime_type"] == "audio/wav" assert voice_info["file_size"] == len(audio_content) response = client.delete("/v1/audio/voices/test_voice") + def test_upload_voice_with_ref_text(self, client, tmp_path): + """Test voice upload with ref_text enables in-context cloning.""" + audio_content = b"fake audio content" * 1000 + files = {"audio_sample": ("test.wav", audio_content, "audio/wav")} + data = {"consent": "c1", "name": "test_voice_rt", "ref_text": "Hello world transcript"} + + response = client.post("/v1/audio/voices", files=files, data=data) + assert response.status_code == 200 + result = response.json() + assert result["success"] is True + assert result["voice"]["name"] == "test_voice_rt" + response = client.delete("/v1/audio/voices/test_voice_rt") + def test_upload_voice_file_too_large(self, client): """Test voice upload with file exceeding size limit.""" # Create a file larger than 10MB @@ -634,31 +639,48 @@ def test_load_supported_speakers(self, mocker: MockerFixture): assert server.supported_speakers == {"ryan", "vivian", "aiden"} def test_build_tts_params_with_uploaded_voice(self, speech_server): - """Test _build_tts_params auto-sets ref_audio for uploaded voices.""" - # Mock an uploaded speaker + """Test _build_tts_params auto-sets ref_audio for uploaded voices (x_vector only).""" speech_server.uploaded_speakers = { "custom_voice": { "name": "custom_voice", "file_path": "/tmp/voice_samples/custom_voice_consent_123.wav", "mime_type": "audio/wav", + "ref_text": None, } } speech_server.supported_speakers = {"ryan", "vivian", "custom_voice"} - # Mock _get_uploaded_audio_data to return base64 data with patch.object(speech_server, "_get_uploaded_audio_data") as mock_get_audio: mock_get_audio.return_value = "data:audio/wav;base64,ZmFrZWF1ZGlv" + req = OpenAICreateSpeechRequest(input="Hello", voice="custom_voice") + params = speech_server._build_tts_params(req) - req = OpenAICreateSpeechRequest(input="Hello", voice="custom_voice", task_type="Base") + assert params["ref_audio"] == ["data:audio/wav;base64,ZmFrZWF1ZGlv"] + assert params["x_vector_only_mode"] == [True] + assert params["task_type"] == ["Base"] + assert "ref_text" not in params + + def test_build_tts_params_with_uploaded_voice_ref_text(self, speech_server): + """Test _build_tts_params enables in-context cloning when ref_text is stored.""" + speech_server.uploaded_speakers = { + "custom_voice": { + "name": "custom_voice", + "file_path": "/tmp/voice_samples/custom_voice_consent_123.wav", + "mime_type": "audio/wav", + "ref_text": "Hello world transcript", + } + } + speech_server.supported_speakers = {"ryan", "vivian", "custom_voice"} + with patch.object(speech_server, "_get_uploaded_audio_data") as mock_get_audio: + mock_get_audio.return_value = "data:audio/wav;base64,ZmFrZWF1ZGlv" + req = OpenAICreateSpeechRequest(input="Hello", voice="custom_voice") params = speech_server._build_tts_params(req) - # Verify ref_audio was auto-set - assert "ref_audio" in params assert params["ref_audio"] == ["data:audio/wav;base64,ZmFrZWF1ZGlv"] - assert "x_vector_only_mode" in params - assert params["x_vector_only_mode"] == [True] - mock_get_audio.assert_called_once_with("custom_voice") + assert params["x_vector_only_mode"] == [False] + assert params["task_type"] == ["Base"] + assert params["ref_text"] == ["Hello world transcript"] def test_build_tts_params_without_uploaded_voice(self, speech_server): """Test _build_tts_params does not auto-set ref_audio for non-uploaded voices.""" From b7140f3cd3c75e50f763d9bf426289a6a382a73f Mon Sep 17 00:00:00 2001 From: JuanPZuluaga Date: Fri, 20 Mar 2026 10:01:10 +0000 Subject: [PATCH 06/11] update test Signed-off-by: JuanPZuluaga --- tests/entrypoints/openai_api/test_serving_speech.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/entrypoints/openai_api/test_serving_speech.py b/tests/entrypoints/openai_api/test_serving_speech.py index e5435f3d7f7..39f315a8d15 100644 --- a/tests/entrypoints/openai_api/test_serving_speech.py +++ b/tests/entrypoints/openai_api/test_serving_speech.py @@ -233,9 +233,14 @@ async def list_voices(): app.add_api_route("/v1/audio/voices", list_voices, methods=["GET"]) # Add upload_voice endpoint - async def upload_voice(audio_sample: UploadFile = File(...), consent: str = Form(...), name: str = Form(...)): + async def upload_voice( + audio_sample: UploadFile = File(...), + consent: str = Form(...), + name: str = Form(...), + ref_text: str = Form(None), + ): try: - result = await speech_server.upload_voice(audio_sample, consent, name) + result = await speech_server.upload_voice(audio_sample, consent, name, ref_text=ref_text) return {"success": True, "voice": result} except ValueError as e: raise HTTPException(status_code=400, detail=str(e)) From eeb344909adc6a9d7099a3b507e858924bd076d5 Mon Sep 17 00:00:00 2001 From: JuanPZuluaga Date: Fri, 20 Mar 2026 12:24:51 +0000 Subject: [PATCH 07/11] add generate to AR stage0 Signed-off-by: JuanPZuluaga --- vllm_omni/engine/async_omni_engine.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm_omni/engine/async_omni_engine.py b/vllm_omni/engine/async_omni_engine.py index 5562b84ff29..4651bfa7762 100644 --- a/vllm_omni/engine/async_omni_engine.py +++ b/vllm_omni/engine/async_omni_engine.py @@ -532,6 +532,8 @@ def _initialize_stages(self, stage_init_timeout: int) -> None: supported_tasks.add("generate") if any(metadata.get("final_output_type") == "audio" for metadata in stage_metadata): supported_tasks.add("speech") + # TTS stage-0 is an AR model, so we need to add generate + supported_tasks.add("generate") self.supported_tasks = tuple(supported_tasks) if supported_tasks else ("generate",) self.default_sampling_params_list = default_sampling_params_list From 610809ddb1d7ac4f718517ac2baf7dec265d2cc2 Mon Sep 17 00:00:00 2001 From: JuanPZuluaga Date: Fri, 20 Mar 2026 13:26:39 +0000 Subject: [PATCH 08/11] revert 'generate', add cap to 30s and clean ref_text Signed-off-by: JuanPZuluaga --- vllm_omni/engine/async_omni_engine.py | 2 -- vllm_omni/entrypoints/openai/serving_speech.py | 7 +++++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/vllm_omni/engine/async_omni_engine.py b/vllm_omni/engine/async_omni_engine.py index 4651bfa7762..5562b84ff29 100644 --- a/vllm_omni/engine/async_omni_engine.py +++ b/vllm_omni/engine/async_omni_engine.py @@ -532,8 +532,6 @@ def _initialize_stages(self, stage_init_timeout: int) -> None: supported_tasks.add("generate") if any(metadata.get("final_output_type") == "audio" for metadata in stage_metadata): supported_tasks.add("speech") - # TTS stage-0 is an AR model, so we need to add generate - supported_tasks.add("generate") self.supported_tasks = tuple(supported_tasks) if supported_tasks else ("generate",) self.default_sampling_params_list = default_sampling_params_list diff --git a/vllm_omni/entrypoints/openai/serving_speech.py b/vllm_omni/entrypoints/openai/serving_speech.py index 7cf5d032efd..d6f3207fc56 100644 --- a/vllm_omni/entrypoints/openai/serving_speech.py +++ b/vllm_omni/entrypoints/openai/serving_speech.py @@ -364,6 +364,9 @@ def _get_uploaded_audio_data(self, voice_name: str) -> str | None: async def upload_voice( self, audio_file: UploadFile, consent: str, name: str, *, ref_text: str | None = None ) -> dict: + # Normalize ref_text: treat whitespace-only as absent + if ref_text is not None: + ref_text = ref_text.strip() or None # Validate file size (max 10MB) MAX_FILE_SIZE = 10 * 1024 * 1024 # 10MB audio_file.file.seek(0, 2) # Seek to end @@ -677,8 +680,8 @@ async def _resolve_ref_audio(self, ref_audio_str: str) -> tuple[list[float], int f"Reference audio too short ({duration:.1f}s). " "At least 1s of clear speech is required for speaker embedding." ) - if duration > 20.0: - raise ValueError(f"Reference audio too long ({duration:.1f}s). Maximum 20s supported — use a shorter clip.") + if duration > 30.0: + raise ValueError(f"Reference audio too long ({duration:.1f}s). Maximum 30s supported — use a shorter clip.") return wav_np.tolist(), sr async def _generate_audio_chunks(self, generator, request_id: str, response_format: str = "pcm"): From 9ef29d7408f634237963945a8c7645d7055cc5da Mon Sep 17 00:00:00 2001 From: JuanPZuluaga Date: Mon, 23 Mar 2026 06:33:28 +0000 Subject: [PATCH 09/11] add clone sample limit at voice upload, add ref_text in tests Signed-off-by: JuanPZuluaga --- .../openai_api/test_serving_speech.py | 1 + .../entrypoints/openai/serving_speech.py | 21 ++++++++++++++++++- .../models/qwen3_tts/qwen3_tts_code2wav.py | 2 +- 3 files changed, 22 insertions(+), 2 deletions(-) diff --git a/tests/entrypoints/openai_api/test_serving_speech.py b/tests/entrypoints/openai_api/test_serving_speech.py index 39f315a8d15..aa43d7c0682 100644 --- a/tests/entrypoints/openai_api/test_serving_speech.py +++ b/tests/entrypoints/openai_api/test_serving_speech.py @@ -369,6 +369,7 @@ def test_upload_voice_with_ref_text(self, client, tmp_path): result = response.json() assert result["success"] is True assert result["voice"]["name"] == "test_voice_rt" + assert result["voice"].get("ref_text") == "Hello world transcript" response = client.delete("/v1/audio/voices/test_voice_rt") def test_upload_voice_file_too_large(self, client): diff --git a/vllm_omni/entrypoints/openai/serving_speech.py b/vllm_omni/entrypoints/openai/serving_speech.py index f124ae942f0..786038ca3fc 100644 --- a/vllm_omni/entrypoints/openai/serving_speech.py +++ b/vllm_omni/entrypoints/openai/serving_speech.py @@ -1,5 +1,6 @@ import asyncio import base64 +import io import json import math import os @@ -10,6 +11,7 @@ from typing import Any import numpy as np +import soundfile as sf from fastapi import Request, UploadFile from fastapi.responses import Response, StreamingResponse from transformers.utils.hub import cached_file @@ -441,10 +443,27 @@ async def upload_voice( if not _validate_path_within_directory(file_path, self.uploaded_speakers_dir): raise ValueError("Invalid file path: potential path traversal attack detected") + # Read content and validate duration before saving + content = await audio_file.read() + try: + wav_np, sr = sf.read(io.BytesIO(content)) + duration = len(wav_np) / sr if sr > 0 else 0.0 + if duration < 1.0: + raise ValueError( + f"Reference audio too short ({duration:.1f}s). At least 1s of clear speech is required." + ) + if duration > 30.0: + raise ValueError( + f"Reference audio too long ({duration:.1f}s). Maximum 30s supported — use a shorter clip." + ) + except ValueError: + raise + except Exception as e: + logger.warning("Could not validate audio duration: %s", e) + # Save audio file try: with open(file_path, "wb") as f: - content = await audio_file.read() f.write(content) except Exception as e: raise ValueError(f"Failed to save audio file: {e}") diff --git a/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_code2wav.py b/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_code2wav.py index a22ce8488b4..2a7c3378cf1 100644 --- a/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_code2wav.py +++ b/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_code2wav.py @@ -228,7 +228,7 @@ def forward( flat = req_ids n = flat.numel() if n == 0 or n % q != 0: - if n > 1: + if n > 0: logger.warning( "Code2Wav input_ids length %d not divisible by num_quantizers %d; skipping malformed request.", n, From 55a27c1af5e2fb1ca3212e81e2983117839159c6 Mon Sep 17 00:00:00 2001 From: JuanPZuluaga Date: Mon, 23 Mar 2026 06:48:36 +0000 Subject: [PATCH 10/11] added min/max global and add check in voice_upload Signed-off-by: JuanPZuluaga --- .../entrypoints/openai/serving_speech.py | 23 ++++++++++++------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/vllm_omni/entrypoints/openai/serving_speech.py b/vllm_omni/entrypoints/openai/serving_speech.py index 786038ca3fc..f35341bac09 100644 --- a/vllm_omni/entrypoints/openai/serving_speech.py +++ b/vllm_omni/entrypoints/openai/serving_speech.py @@ -49,6 +49,8 @@ "Spanish", "Italian", } +_REF_AUDIO_MIN_DURATION = 1.0 # seconds +_REF_AUDIO_MAX_DURATION = 30.0 # seconds _TTS_MAX_INSTRUCTIONS_LENGTH = 500 _TTS_MAX_NEW_TOKENS_MIN = 1 _TTS_MAX_NEW_TOKENS_MAX = 4096 @@ -448,13 +450,15 @@ async def upload_voice( try: wav_np, sr = sf.read(io.BytesIO(content)) duration = len(wav_np) / sr if sr > 0 else 0.0 - if duration < 1.0: + if duration < _REF_AUDIO_MIN_DURATION: raise ValueError( - f"Reference audio too short ({duration:.1f}s). At least 1s of clear speech is required." + f"Reference audio too short ({duration:.1f}s). " + f"At least {_REF_AUDIO_MIN_DURATION:.0f}s of clear speech is required." ) - if duration > 30.0: + if duration > _REF_AUDIO_MAX_DURATION: raise ValueError( - f"Reference audio too long ({duration:.1f}s). Maximum 30s supported — use a shorter clip." + f"Reference audio too long ({duration:.1f}s). " + f"Maximum {_REF_AUDIO_MAX_DURATION:.0f}s supported — use a shorter clip." ) except ValueError: raise @@ -695,13 +699,16 @@ async def _resolve_ref_audio(self, ref_audio_str: str) -> tuple[list[float], int wav_np = np.mean(wav_np, axis=-1) sr = int(sr) duration = len(wav_np) / sr if sr > 0 else 0.0 - if duration < 1.0: + if duration < _REF_AUDIO_MIN_DURATION: raise ValueError( f"Reference audio too short ({duration:.1f}s). " - "At least 1s of clear speech is required for speaker embedding." + f"At least {_REF_AUDIO_MIN_DURATION:.0f}s of clear speech is required." + ) + if duration > _REF_AUDIO_MAX_DURATION: + raise ValueError( + f"Reference audio too long ({duration:.1f}s). " + f"Maximum {_REF_AUDIO_MAX_DURATION:.0f}s supported — use a shorter clip." ) - if duration > 30.0: - raise ValueError(f"Reference audio too long ({duration:.1f}s). Maximum 30s supported — use a shorter clip.") return wav_np.tolist(), sr async def _generate_audio_chunks(self, generator, request_id: str, response_format: str = "pcm"): From 9a4552a6c09be3e887602eddf2ed06f064417e23 Mon Sep 17 00:00:00 2001 From: JuanPZuluaga Date: Thu, 26 Mar 2026 14:02:36 +0000 Subject: [PATCH 11/11] fix ci Signed-off-by: JuanPZuluaga --- vllm_omni/entrypoints/openai/serving_speech.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vllm_omni/entrypoints/openai/serving_speech.py b/vllm_omni/entrypoints/openai/serving_speech.py index 16d57e15f17..9d3a0fe7889 100644 --- a/vllm_omni/entrypoints/openai/serving_speech.py +++ b/vllm_omni/entrypoints/openai/serving_speech.py @@ -580,13 +580,16 @@ async def upload_voice( logger.info(f"Uploaded new voice '{name}' with consent ID '{consent}'") # Return voice information without exposing the server file path - return { + result = { "name": name, "consent": consent, "created_at": timestamp, "mime_type": mime_type, "file_size": file_size, } + if ref_text is not None: + result["ref_text"] = ref_text + return result async def upload_voice_embedding(self, embedding_json: str, consent: str, name: str) -> dict: """Upload a voice from a pre-computed speaker embedding.