diff --git a/docs/serving/speech_api.md b/docs/serving/speech_api.md index e6ab77edda8..ecbe8d9ac98 100644 --- a/docs/serving/speech_api.md +++ b/docs/serving/speech_api.md @@ -118,6 +118,7 @@ Content-Type: application/json | `instructions` | string | "" | Voice style/emotion instructions | | `max_new_tokens` | integer | 2048 | Maximum tokens to generate | | `initial_codec_chunk_frames` | integer | null | Per-request initial chunk size override for TTFA tuning. When null, IC is computed dynamically based on server load. | +| `stream` | bool | false | Stream raw PCM chunks as they are decoded (requires `response_format="pcm"`) | **Supported languages:** Auto, Chinese, English, Japanese, Korean, German, French, Russian, Portuguese, Spanish, Italian @@ -143,9 +144,23 @@ Lists available voices for the loaded model. ```json { - "voices": ["aiden", "dylan", "eric", "ono_anna", "ryan", "serena", "sohee", "uncle_fu", "vivian"] + "voices": ["aiden", "dylan", "eric", "ono_anna", "ryan", "serena", "sohee", "uncle_fu", "vivian", "custom_voice_1"], + "uploaded_voices": [ + { + "name": "custom_voice_1", + "consent": "user_consent_id", + "created_at": 1738660000, + "file_size": 1024000, + "mime_type": "audio/wav", + "ref_text": "The exact transcript of the audio sample.", + "speaker_description": "warm narrator" + } + ] } ``` + +`uploaded_voices` is always present (empty list when no custom voices have been uploaded). Fields `ref_text` and `speaker_description` are omitted per-entry when not provided at upload time. + ``` POST /v1/audio/voices Content-Type: multipart/form-data @@ -161,6 +176,7 @@ Upload a new voice sample for voice cloning in Base task TTS requests. | `consent` | string | Yes | Consent recording ID | | `name` | string | Yes | Name for the new voice | | `ref_text` | string | No | Transcript of the audio. When provided, enables in-context voice cloning (higher quality). Without it, only the speaker embedding is extracted. | +| `speaker_description` | string | No | Free-form description of the voice (e.g. "warm narrator", "energetic presenter"). Stored as metadata and returned in `GET /v1/audio/voices`. | **Response Example:** @@ -172,11 +188,15 @@ Upload a new voice sample for voice cloning in Base task TTS requests. "consent": "user_consent_id", "created_at": 1738660000, "mime_type": "audio/wav", - "file_size": 1024000 + "file_size": 1024000, + "ref_text": "The exact transcript of the audio sample.", + "speaker_description": "warm narrator" } } ``` +Fields `ref_text` and `speaker_description` are omitted when not provided at upload time. + **Usage Example:** ```bash @@ -184,7 +204,8 @@ curl -X POST http://localhost:8091/v1/audio/voices \ -F "audio_sample=@/path/to/voice_sample.wav" \ -F "consent=user_consent_id" \ -F "name=custom_voice_1" \ - -F "ref_text=The exact transcript of the audio sample." + -F "ref_text=The exact transcript of the audio sample." \ + -F "speaker_description=warm narrator" ``` ## Streaming Text Input (WebSocket) diff --git a/docs/user_guide/examples/online_serving/qwen3_tts.md b/docs/user_guide/examples/online_serving/qwen3_tts.md index 401a5c2e945..156c4942cd9 100644 --- a/docs/user_guide/examples/online_serving/qwen3_tts.md +++ b/docs/user_guide/examples/online_serving/qwen3_tts.md @@ -159,7 +159,7 @@ curl -X POST http://localhost:8091/v1/audio/speech \ -H "Content-Type: application/json" \ -d '{ "input": "Hello, how are you?", - "speaker": "vivian", + "voice": "vivian", "language": "English" }' --output output.wav @@ -168,7 +168,7 @@ curl -X POST http://localhost:8091/v1/audio/speech \ -H "Content-Type: application/json" \ -d '{ "input": "I am so excited!", - "speaker": "vivian", + "voice": "vivian", "instructions": "Speak with great enthusiasm" }' --output excited.wav @@ -185,7 +185,7 @@ client = OpenAI(base_url="http://localhost:8091/v1", api_key="none") response = client.audio.speech.create( model="Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice", - speaker="vivian", + voice="vivian", input="Hello, how are you?", ) @@ -201,7 +201,7 @@ response = httpx.post( "http://localhost:8091/v1/audio/speech", json={ "input": "Hello, how are you?", - "speaker": "vivian", + "voice": "vivian", "language": "English", }, timeout=300.0, @@ -237,12 +237,16 @@ List all available voices/speakers from the loaded model, including both built-i "consent": "user_consent_id", "created_at": 1738660000, "file_size": 1024000, - "mime_type": "audio/wav" + "mime_type": "audio/wav", + "ref_text": "The exact transcript of the audio sample.", + "speaker_description": "warm narrator" } ] } ``` +Fields `ref_text` and `speaker_description` are omitted per-entry when not provided at upload time. + #### POST /v1/audio/voices Upload a new voice sample for voice cloning in Base task TTS requests. @@ -252,6 +256,7 @@ Upload a new voice sample for voice cloning in Base task TTS requests. - `consent` (required): Consent recording ID - `name` (required): Name for the new voice - `ref_text` (optional): Transcript of the audio. Enables in-context voice cloning (higher quality). +- `speaker_description` (optional): Free-form description of the voice (e.g. "warm narrator", "energetic presenter"). Stored as metadata. **Response Example:** ```json @@ -262,18 +267,23 @@ Upload a new voice sample for voice cloning in Base task TTS requests. "consent": "user_consent_id", "created_at": 1738660000, "mime_type": "audio/wav", - "file_size": 1024000 + "file_size": 1024000, + "ref_text": "The exact transcript of the audio sample.", + "speaker_description": "warm narrator" } } ``` +Fields `ref_text` and `speaker_description` are omitted when not provided at upload time. + **Usage Example:** ```bash -curl -X POST http://localhost:8000/v1/audio/voices \ +curl -X POST http://localhost:8091/v1/audio/voices \ -F "audio_sample=@/path/to/voice_sample.wav" \ -F "consent=user_consent_id" \ -F "name=custom_voice_1" \ - -F "ref_text=The exact transcript of the audio sample." + -F "ref_text=The exact transcript of the audio sample." \ + -F "speaker_description=warm narrator" ``` ### Endpoint @@ -290,7 +300,7 @@ This endpoint follows the [OpenAI Audio Speech API](https://platform.openai.com/ ```json { "input": "Text to synthesize", - "speaker": "vivian", + "voice": "vivian", "response_format": "wav", "task_type": "CustomVoice", "language": "Auto", @@ -310,7 +320,7 @@ Returns binary audio data with appropriate `Content-Type` header (e.g., `audio/w ### Voice and language (summary) -- **Speaker**: Use the `speaker` request field to select the speaker (e.g., `vivian`, `ryan`, `aiden`). List available speakers with `GET /v1/audio/voices`. +- **Speaker**: Use the `voice` request field to select the speaker (e.g., `vivian`, `ryan`, `aiden`). List available speakers with `GET /v1/audio/voices`. - **Language**: Use the `language` field for the codec language tag (`Auto`, `Chinese`, `English`, etc.). Default is `Auto` for automatic detection. - **CustomVoice**: Requires a valid `voice` from the model’s speaker set. **VoiceDesign**: Use `instructions` to describe the voice. **Base**: Use `ref_audio` and `ref_text` for voice cloning. @@ -322,7 +332,7 @@ Returns binary audio data with appropriate `Content-Type` header (e.g., `audio/w | ----------------- | ------ | -------------- | ----------------------------------------------------------- | | `input` | string | **required** | Text to synthesize | | `model` | string | server's model | Model to use (optional, should match server if specified) | -| `speaker` | string | "vivian" | Speaker name (e.g., vivian, ryan, aiden) | +| `voice` | string | "vivian" | Speaker name (e.g., vivian, ryan, aiden) | | `response_format` | string | "wav" | Audio format: wav, mp3, flac, pcm, aac, opus | | `speed` | float | 1.0 | Playback speed (0.25-4.0, not supported with `stream=true`) | @@ -357,7 +367,7 @@ curl -X POST http://localhost:8091/v1/audio/speech \ -H "Content-Type: application/json" \ -d '{ "input": "Hello, how are you?", - "speaker": "vivian", + "voice": "vivian", "language": "English", "stream": true, "response_format": "pcm" diff --git a/examples/online_serving/qwen3_tts/README.md b/examples/online_serving/qwen3_tts/README.md index 1b51e00f122..5504b5737a8 100644 --- a/examples/online_serving/qwen3_tts/README.md +++ b/examples/online_serving/qwen3_tts/README.md @@ -233,6 +233,7 @@ Upload a new voice sample for voice cloning in Base task TTS requests. - `consent` (required): Consent recording ID - `name` (required): Name for the new voice - `ref_text` (optional): Transcript of the audio. Enables in-context voice cloning (higher quality). +- `speaker_description` (optional): Free-form description of the voice (e.g. "warm narrator", "energetic presenter"). **Response Example:** ```json @@ -243,18 +244,23 @@ Upload a new voice sample for voice cloning in Base task TTS requests. "consent": "user_consent_id", "created_at": 1738660000, "mime_type": "audio/wav", - "file_size": 1024000 + "file_size": 1024000, + "ref_text": "The exact transcript of the audio sample.", + "speaker_description": "warm narrator" } } ``` +Fields `ref_text` and `speaker_description` are omitted when not provided at upload time. + **Usage Example:** ```bash curl -X POST http://localhost:8000/v1/audio/voices \ -F "audio_sample=@/path/to/voice_sample.wav" \ -F "consent=user_consent_id" \ -F "name=custom_voice_1" \ - -F "ref_text=The exact transcript of the audio sample." + -F "ref_text=The exact transcript of the audio sample." \ + -F "speaker_description=warm narrator" ``` ### Endpoint diff --git a/tests/entrypoints/openai_api/test_serving_speech.py b/tests/entrypoints/openai_api/test_serving_speech.py index 969df5bce0d..17203cb577f 100644 --- a/tests/entrypoints/openai_api/test_serving_speech.py +++ b/tests/entrypoints/openai_api/test_serving_speech.py @@ -233,17 +233,20 @@ async def list_voices(): uploaded_voices = [] if hasattr(speech_server, "uploaded_speakers"): for voice_name, info in speech_server.uploaded_speakers.items(): - uploaded_voices.append( - { - "name": info.get("name", voice_name), - "consent": info.get("consent", ""), - "created_at": info.get("created_at", 0), - "file_size": info.get("file_size", 0), - "mime_type": info.get("mime_type", ""), - "embedding_source": info.get("embedding_source", "audio"), - "embedding_dim": info.get("embedding_dim"), - } - ) + voice_entry = { + "name": info.get("name", voice_name), + "consent": info.get("consent", ""), + "created_at": info.get("created_at", 0), + "file_size": info.get("file_size", 0), + "mime_type": info.get("mime_type", ""), + "embedding_source": info.get("embedding_source", "audio"), + "embedding_dim": info.get("embedding_dim"), + } + if info.get("ref_text"): + voice_entry["ref_text"] = info["ref_text"] + if info.get("speaker_description"): + voice_entry["speaker_description"] = info["speaker_description"] + uploaded_voices.append(voice_entry) return {"voices": speakers, "uploaded_voices": uploaded_voices} app.add_api_route("/v1/audio/voices", list_voices, methods=["GET"]) @@ -255,7 +258,8 @@ async def upload_voice( speaker_embedding: str | None = Form(None), consent: str = Form(...), name: str = Form(...), - ref_text: str = Form(None), + ref_text: str | None = Form(None), + speaker_description: str | None = Form(None), ): try: if speaker_embedding is not None and audio_sample is not None: @@ -263,7 +267,13 @@ async def upload_voice( if speaker_embedding is not None: result = await speech_server.upload_voice_embedding(speaker_embedding, consent, name) elif audio_sample is not None: - result = await speech_server.upload_voice(audio_sample, consent, name, ref_text=ref_text) + result = await speech_server.upload_voice( + audio_sample, + consent, + name, + ref_text=ref_text, + speaker_description=speaker_description, + ) else: raise ValueError("Either 'audio_sample' or 'speaker_embedding' must be provided") return {"success": True, "voice": result} @@ -397,6 +407,44 @@ def test_upload_voice_with_ref_text(self, client, tmp_path): assert result["voice"].get("ref_text") == "Hello world transcript" response = client.delete("/v1/audio/voices/test_voice_rt") + def test_upload_voice_with_speaker_description(self, client, tmp_path): + """Test voice upload with speaker_description stores and returns the description.""" + # Pre-cleanup in case a previous test run left this voice behind + client.delete("/v1/audio/voices/test_voice_vd") + + audio_content = b"fake audio content" * 1000 + files = {"audio_sample": ("test.wav", audio_content, "audio/wav")} + data = {"consent": "c1", "name": "test_voice_vd", "speaker_description": " warm, energetic narrator "} + + response = client.post("/v1/audio/voices", files=files, data=data) + try: + assert response.status_code == 200 + result = response.json() + assert result["success"] is True + assert result["voice"]["name"] == "test_voice_vd" + assert result["voice"].get("speaker_description") == "warm, energetic narrator" + finally: + client.delete("/v1/audio/voices/test_voice_vd") + + def test_upload_voice_speaker_description_in_listing(self, client): + """Test that speaker_description survives the upload → list round-trip.""" + client.delete("/v1/audio/voices/test_voice_sd_list") + + audio_content = b"fake audio content" * 1000 + files = {"audio_sample": ("test.wav", audio_content, "audio/wav")} + data = {"consent": "c1", "name": "test_voice_sd_list", "speaker_description": "calm female narrator"} + + response = client.post("/v1/audio/voices", files=files, data=data) + try: + assert response.status_code == 200 + + listing = client.get("/v1/audio/voices").json() + uploaded = {v["name"]: v for v in listing["uploaded_voices"]} + assert "test_voice_sd_list" in uploaded + assert uploaded["test_voice_sd_list"]["speaker_description"] == "calm female narrator" + finally: + client.delete("/v1/audio/voices/test_voice_sd_list") + def test_upload_voice_file_too_large(self, client): """Test voice upload with file exceeding size limit.""" # Create a file larger than 10MB @@ -850,6 +898,7 @@ def test_build_tts_params_with_uploaded_voice(self, speech_server): "file_path": "/tmp/voice_samples/custom_voice_consent_123.wav", "mime_type": "audio/wav", "ref_text": None, + "created_at": 1711234567.89, } } speech_server.supported_speakers = {"ryan", "vivian", "custom_voice"} @@ -862,6 +911,7 @@ def test_build_tts_params_with_uploaded_voice(self, speech_server): assert params["ref_audio"] == ["data:audio/wav;base64,ZmFrZWF1ZGlv"] assert params["x_vector_only_mode"] == [True] assert params["task_type"] == ["Base"] + assert params["voice_created_at"] == [1711234567.89] assert "ref_text" not in params def test_build_tts_params_with_uploaded_voice_ref_text(self, speech_server): @@ -872,6 +922,7 @@ def test_build_tts_params_with_uploaded_voice_ref_text(self, speech_server): "file_path": "/tmp/voice_samples/custom_voice_consent_123.wav", "mime_type": "audio/wav", "ref_text": "Hello world transcript", + "created_at": 1711234567.89, } } speech_server.supported_speakers = {"ryan", "vivian", "custom_voice"} @@ -885,6 +936,7 @@ def test_build_tts_params_with_uploaded_voice_ref_text(self, speech_server): assert params["x_vector_only_mode"] == [False] assert params["task_type"] == ["Base"] assert params["ref_text"] == ["Hello world transcript"] + assert params["voice_created_at"] == [1711234567.89] def test_build_tts_params_without_uploaded_voice(self, speech_server): """Test _build_tts_params does not auto-set ref_audio for non-uploaded voices.""" diff --git a/tests/test_voice_cache.py b/tests/test_voice_cache.py new file mode 100644 index 00000000000..69327aae571 --- /dev/null +++ b/tests/test_voice_cache.py @@ -0,0 +1,129 @@ +import threading + +import pytest + +from vllm_omni.utils.voice_cache import VoiceEmbeddingCache + +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + + +@pytest.fixture +def cache(): + return VoiceEmbeddingCache(max_entries=4) + + +class TestVoiceEmbeddingCache: + def test_miss_returns_none(self, cache: VoiceEmbeddingCache): + assert cache.get("nonexistent") is None + assert cache.stats()["misses"] == 1 + + def test_put_and_hit(self, cache: VoiceEmbeddingCache): + cache.put("abc", {"val": 42}) + result = cache.get("abc") + assert result is not None + assert result["val"] == 42 + assert cache.stats()["hits"] == 1 + + def test_lru_eviction(self, cache: VoiceEmbeddingCache): + for i in range(5): + cache.put(f"key{i}", {"i": i}) + # key0 should have been evicted (oldest, max_entries=4) + assert cache.get("key0") is None + # key1..key4 should still be present + for i in range(1, 5): + assert cache.get(f"key{i}") is not None + assert cache.stats()["entries"] == 4 + + def test_lru_access_promotes(self, cache: VoiceEmbeddingCache): + cache.put("a", {"v": 1}) + cache.put("b", {"v": 2}) + cache.put("c", {"v": 3}) + cache.put("d", {"v": 4}) + # Access "a" to promote it to MRU + cache.get("a") + # Insert "e" -- should evict "b" (now the oldest), not "a" + cache.put("e", {"v": 5}) + assert cache.get("a") is not None + assert cache.get("b") is None + + def test_put_overwrites(self, cache: VoiceEmbeddingCache): + cache.put("k", {"old": True}) + cache.put("k", {"new": True}) + result = cache.get("k") + assert result is not None + assert "new" in result + assert "old" not in result + assert cache.stats()["entries"] == 1 + + def test_make_cache_key_includes_mode(self): + k1 = VoiceEmbeddingCache.make_cache_key("alice", xvec_only=True) + k2 = VoiceEmbeddingCache.make_cache_key("alice", xvec_only=False) + assert k1 != k2 + assert "xvec" in k1 + assert "icl" in k2 + + def test_make_cache_key_deterministic(self): + k1 = VoiceEmbeddingCache.make_cache_key("bob", xvec_only=True) + k2 = VoiceEmbeddingCache.make_cache_key("bob", xvec_only=True) + assert k1 == k2 + + def test_make_cache_key_created_at_isolation(self): + """Different created_at timestamps must produce different keys (stale-cache protection).""" + k1 = VoiceEmbeddingCache.make_cache_key("alice", xvec_only=False, created_at=1000.0) + k2 = VoiceEmbeddingCache.make_cache_key("alice", xvec_only=False, created_at=2000.0) + assert k1 != k2 + + def test_stale_cache_protection(self, cache: VoiceEmbeddingCache): + """Re-upload (new created_at) must produce a cache miss, not a stale hit.""" + key_old = VoiceEmbeddingCache.make_cache_key("alice", xvec_only=False, created_at=1000.0) + key_new = VoiceEmbeddingCache.make_cache_key("alice", xvec_only=False, created_at=2000.0) + cache.put(key_old, {"ref_spk_embedding": "old_emb"}) + # Re-upload produces a new created_at → different key → cold miss + assert cache.get(key_new) is None + # Old key still in cache (not yet evicted) + assert cache.get(key_old) is not None + + def test_cache_mode_isolation(self, cache: VoiceEmbeddingCache): + """xvec entry must NOT be served for an icl request (same voice).""" + key_xvec = VoiceEmbeddingCache.make_cache_key("alice", xvec_only=True) + key_icl = VoiceEmbeddingCache.make_cache_key("alice", xvec_only=False) + cache.put(key_xvec, {"ref_code": None, "ref_spk_embedding": "emb"}) + # icl request should miss — different key + assert cache.get(key_icl) is None + # xvec request should hit + assert cache.get(key_xvec) is not None + + def test_stats_counters(self, cache: VoiceEmbeddingCache): + cache.put("x", {"v": 1}) + cache.get("x") # hit + cache.get("x") # hit + cache.get("y") # miss + s = cache.stats() + assert s["hits"] == 2 + assert s["misses"] == 1 + assert s["entries"] == 1 + assert s["max_entries"] == 4 + + def test_thread_safety(self): + cache = VoiceEmbeddingCache(max_entries=32) + errors = [] + + def worker(thread_id: int): + try: + for i in range(50): + key = f"t{thread_id}_k{i}" + cache.put(key, {"tid": thread_id, "i": i}) + cache.get(key) + cache.get(f"t{(thread_id + 1) % 10}_k{i}") + except Exception as e: + errors.append(e) + + threads = [threading.Thread(target=worker, args=(t,)) for t in range(10)] + for t in threads: + t.start() + for t in threads: + t.join() + + assert not errors, f"Thread safety errors: {errors}" + s = cache.stats() + assert s["entries"] <= 32 diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py index 0ffe33abde2..80f4f8d4aee 100644 --- a/vllm_omni/entrypoints/openai/api_server.py +++ b/vllm_omni/entrypoints/openai/api_server.py @@ -1024,17 +1024,20 @@ async def list_voices(raw_request: Request): uploaded_speakers = [] if hasattr(handler, "uploaded_speakers"): for voice_name, info in handler.uploaded_speakers.items(): - uploaded_speakers.append( - { - "name": info.get("name", voice_name), - "consent": info.get("consent", ""), - "created_at": info.get("created_at", 0), - "file_size": info.get("file_size", 0), - "mime_type": info.get("mime_type", ""), - "embedding_source": info.get("embedding_source", "audio"), - "embedding_dim": info.get("embedding_dim"), - } - ) + voice_entry = { + "name": info.get("name", voice_name), + "consent": info.get("consent", ""), + "created_at": info.get("created_at", 0), + "file_size": info.get("file_size", 0), + "mime_type": info.get("mime_type", ""), + "embedding_source": info.get("embedding_source", "audio"), + "embedding_dim": info.get("embedding_dim"), + } + if info.get("ref_text"): + voice_entry["ref_text"] = info["ref_text"] + if info.get("speaker_description"): + voice_entry["speaker_description"] = info["speaker_description"] + uploaded_speakers.append(voice_entry) return JSONResponse(content={"voices": speakers, "uploaded_voices": uploaded_speakers}) @@ -1053,7 +1056,8 @@ async def upload_voice( speaker_embedding: str | None = Form(None), consent: str = Form(...), name: str = Form(...), - ref_text: str = Form(None), + ref_text: str | None = Form(None), + speaker_description: str | None = Form(None), ): """Upload a new voice for voice cloning. @@ -1072,6 +1076,11 @@ async def upload_voice( speaker_embedding: JSON-encoded float list. Mutually exclusive with audio_sample. consent: Consent recording ID name: Name for the new voice + ref_text: Optional transcript of the audio for ICL (in-context + learning) mode. When provided, voice clone requests using this + voice will produce higher quality results. + speaker_description: Optional free-form description of the voice + (e.g. "warm speaker", "energetic narrator"). raw_request: Raw FastAPI request Returns: @@ -1089,7 +1098,13 @@ async def upload_voice( if speaker_embedding is not None: result = await handler.upload_voice_embedding(speaker_embedding, consent, name) elif audio_sample is not None: - result = await handler.upload_voice(audio_sample, consent, name, ref_text=ref_text) + result = await handler.upload_voice( + audio_sample, + consent, + name, + ref_text=ref_text, + speaker_description=speaker_description, + ) else: return base(raw_request).create_error_response( message="Either 'audio_sample' or 'speaker_embedding' must be provided" diff --git a/vllm_omni/entrypoints/openai/metadata_manager.py b/vllm_omni/entrypoints/openai/metadata_manager.py deleted file mode 100644 index 4077aa23bcd..00000000000 --- a/vllm_omni/entrypoints/openai/metadata_manager.py +++ /dev/null @@ -1,243 +0,0 @@ -""" -Metadata manager for voice samples and cache information. - -Provides a unified interface for managing metadata.json with -concurrency safety and data consistency across multiple processes. -""" - -import fcntl -import json -import logging -import os -import threading -import time -from collections.abc import Callable -from pathlib import Path -from typing import Any - -logger = logging.getLogger(__name__) - - -class MetadataManager: - """ - Manages metadata for uploaded speakers and cache information. - - Features: - 1. Single source of truth for metadata - 2. Concurrency safety with threading locks - 3. Atomic read-modify-write operations - 4. Merge updates to preserve fields from different components - """ - - def __init__(self, metadata_file: Path): - """ - Initialize the metadata manager. - - Args: - metadata_file: Path to metadata.json file - """ - self.metadata_file = metadata_file - self._lock = threading.Lock() # For intra-process concurrency - self._metadata = self._load_from_disk() - - # Create lock file for cross-process synchronization - self.lock_file = metadata_file.with_suffix(".lock") - self.lock_file.parent.mkdir(parents=True, exist_ok=True) - - def _load_from_disk(self) -> dict[str, Any]: - """Load metadata from disk.""" - if not self.metadata_file.exists(): - return {"uploaded_speakers": {}} - - try: - with open(self.metadata_file) as f: - return json.load(f) - except Exception as e: - logger.error(f"Failed to load metadata from {self.metadata_file}: {e}") - return {"uploaded_speakers": {}} - - def _save_to_disk(self, metadata: dict[str, Any]) -> bool: - """Save metadata to disk.""" - try: - self.metadata_file.parent.mkdir(parents=True, exist_ok=True) - tmp = self.metadata_file.with_suffix(".tmp") - with open(tmp, "w") as f: - json.dump(metadata, f, indent=2) - tmp.replace(self.metadata_file) - return True - except Exception as e: - logger.error(f"Failed to save metadata to {self.metadata_file}: {e}") - return False - - # ================================ - # Core fix: single flock overwrites RMW - # ================================ - def _update_with_file_lock( - self, update_fn: Callable[[dict[str, Any]], dict[str, Any] | None] - ) -> dict[str, Any] | None: - lock_fd = os.open(self.lock_file, os.O_CREAT | os.O_RDWR) - try: - fcntl.flock(lock_fd, fcntl.LOCK_EX) - - metadata = self._load_from_disk() - result = update_fn(metadata) - if result is None: - return None - - if not self._save_to_disk(metadata): - return None - - self._metadata = metadata - return result - finally: - fcntl.flock(lock_fd, fcntl.LOCK_UN) - os.close(lock_fd) - - def get_uploaded_speakers(self) -> dict[str, dict[str, Any]]: - """Get all uploaded speakers.""" - # Read directly from disk to ensure getting the latest data - metadata = self._load_from_disk() - return metadata.get("uploaded_speakers", {}).copy() - - def get_speaker(self, speaker_key: str) -> dict[str, Any] | None: - """Get specific speaker information.""" - # Read directly from disk to ensure getting the latest data - metadata = self._load_from_disk() - speakers = metadata.get("uploaded_speakers", {}) - return speakers.get(speaker_key, {}).copy() if speaker_key in speakers else None - - def update_speaker(self, speaker_key: str, updates: dict[str, Any]) -> bool: - """ - Update speaker information with merge semantics. - - Uses file locking for cross-process atomic operations. - """ - with self._lock: - - def _update(metadata: dict[str, Any]): - speakers = metadata.setdefault("uploaded_speakers", {}) - entry = speakers.get(speaker_key, {}) - entry.update(updates) - speakers[speaker_key] = entry - return True - - return self._update_with_file_lock(_update) is not None - - def create_speaker(self, speaker_key: str, speaker_data: dict[str, Any]) -> bool: - """ - Create a new speaker entry. - - Uses file locking for cross-process atomic operations. - """ - with self._lock: - - def _create(metadata: dict[str, Any]): - speakers = metadata.setdefault("uploaded_speakers", {}) - if speaker_key in speakers: - logger.warning(f"Speaker {speaker_key} already exists") - return None - speakers[speaker_key] = speaker_data - return True - - return self._update_with_file_lock(_create) is not None - - def update_cache_info(self, speaker_key: str, cache_file_path: Path, status: str = "ready") -> bool: - """ - Update cache information for a speaker. - """ - updates = { - "cache_status": status, - "cache_file": str(cache_file_path), - "cache_generated_at": time.time(), - } - return self.update_speaker(speaker_key, updates) - - def delete_speaker(self, speaker_key: str) -> dict[str, Any] | None: - """ - Delete a speaker from metadata and clean up associated files. - - Uses file locking for cross-process atomic operations. - - Args: - speaker_key: Speaker name (lowercase) - base_dir: Base directory for file validation (optional) - - Returns: - dict: Deleted speaker information if successful, None if speaker doesn't exist or error - """ - with self._lock: - - def _delete(metadata: dict[str, Any]): - speakers = metadata.get("uploaded_speakers", {}) - if speaker_key not in speakers: - logger.warning(f"Speaker {speaker_key} not found in metadata") - return None - - speaker_info = speakers.pop(speaker_key) - - # Clean up associated files - deleted_files = self._cleanup_speaker_files(speaker_info) - if deleted_files: - logger.info(f"Deleted {len(deleted_files)} files for speaker {speaker_key}: {deleted_files}") - - return speaker_info - - return self._update_with_file_lock(_delete) - - def _cleanup_speaker_files(self, speaker_info: dict[str, Any]) -> list[str]: - """ - Clean up files associated with a speaker. - - Args: - speaker_info: Speaker information dictionary - base_dir: Base directory for file validation (optional) - - Returns: - list: List of successfully deleted file paths - """ - deleted_files = [] - - # Helper function to safely delete a file - def safe_delete(file_path_str: str, description: str) -> bool: - if not file_path_str: - return False - - try: - file_path = Path(file_path_str) - - # Check if file exists - if not file_path.exists(): - logger.debug(f"{description} not found: {file_path}") - return False - - # Delete the file - file_path.unlink() - logger.info(f"Deleted {description}: {file_path}") - deleted_files.append(str(file_path)) - return True - - except Exception as e: - logger.error(f"Failed to delete {description} {file_path_str}: {e}") - return False - - # Delete audio file - audio_file = speaker_info.get("file_path") - if audio_file: - safe_delete(audio_file, "audio file") - - # Delete cache file - cache_file = speaker_info.get("cache_file") - if cache_file: - safe_delete(cache_file, "cache file") - - return deleted_files - - def reload_from_disk(self) -> bool: - """Force reload metadata from disk (useful for external changes).""" - with self._lock: - try: - self._metadata = self._load_from_disk() - return True - except Exception as e: - logger.error(f"Failed to reload metadata from disk: {e}") - return False diff --git a/vllm_omni/entrypoints/openai/serving_speech.py b/vllm_omni/entrypoints/openai/serving_speech.py index b483181fd5f..90c99fdabf6 100644 --- a/vllm_omni/entrypoints/openai/serving_speech.py +++ b/vllm_omni/entrypoints/openai/serving_speech.py @@ -24,7 +24,6 @@ from vllm.utils import random_uuid from vllm_omni.entrypoints.openai.audio_utils_mixin import AudioMixin -from vllm_omni.entrypoints.openai.metadata_manager import MetadataManager from vllm_omni.entrypoints.openai.protocol.audio import ( AudioResponse, BatchSpeechRequest, @@ -147,14 +146,10 @@ def _validate_path_within_directory(file_path: Path, directory: Path) -> bool: class OmniOpenAIServingSpeech(OpenAIServing, AudioMixin): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - # Initialize uploaded speakers storage + # Initialize uploaded speakers storage (ephemeral — cleared on restart) speech_voice_samples_dir = os.environ.get("SPEECH_VOICE_SAMPLES", "/tmp/voice_samples") self.uploaded_speakers_dir = Path(speech_voice_samples_dir) self.uploaded_speakers_dir.mkdir(parents=True, exist_ok=True) - self.metadata_file = self.uploaded_speakers_dir / "metadata.json" - - # Initialize metadata manager - self.metadata_manager = MetadataManager(self.metadata_file) # Find and cache the TTS stage (if any) during initialization self._tts_stage = self._find_tts_stage() @@ -171,17 +166,16 @@ def __init__(self, *args, **kwargs): # Cache TTS configuration values (computed once, reused per request) self._max_instructions_length = self._compute_max_instructions_length() - # Load supported speakers + # Load supported speakers (built-in only; uploaded voices start empty) self.supported_speakers = self._load_supported_speakers() - # Load uploaded speakers - self.uploaded_speakers = self.metadata_manager.get_uploaded_speakers() - - # Merge supported speakers with uploaded speakers - self.supported_speakers.update(self.uploaded_speakers.keys()) + self.uploaded_speakers: dict[str, dict] = {} + logger.warning( + "Uploaded voices are ephemeral and will be lost on server restart. " + "Re-upload voices after each restart if needed." + ) self._tts_tokenizer = None logger.info(f"Loaded {len(self.supported_speakers)} supported speakers: {sorted(self.supported_speakers)}") - logger.info(f"Loaded {len(self.uploaded_speakers)} uploaded speakers") # Batch configuration self._batch_max_items: int = getattr(self.engine_client, "tts_batch_max_items", 32) @@ -417,11 +411,20 @@ def _get_uploaded_audio_data(self, voice_name: str) -> str | None: return None async def upload_voice( - self, audio_file: UploadFile, consent: str, name: str, *, ref_text: str | None = None + self, + audio_file: UploadFile, + consent: str, + name: str, + *, + ref_text: str | None = None, + speaker_description: str | None = None, ) -> dict: - # Normalize ref_text: treat whitespace-only as absent + """Upload a new voice sample.""" + # Normalize optional strings: treat whitespace-only as absent if ref_text is not None: ref_text = ref_text.strip() or None + if speaker_description is not None: + speaker_description = speaker_description.strip() or None # Validate file size (max 10MB) MAX_FILE_SIZE = 10 * 1024 * 1024 # 10MB audio_file.file.seek(0, 2) # Seek to end @@ -473,7 +476,9 @@ async def upload_voice( # Check if voice already exists if voice_name_lower in self.uploaded_speakers: - raise ValueError(f"Voice '{name}' already exists") + raise ValueError( + f"Voice '{name}' already exists. To re-register this voice, delete it first and then upload it again." + ) # Sanitize name and consent to prevent path traversal sanitized_name = _sanitize_filename(name) @@ -523,7 +528,7 @@ async def upload_voice( raise ValueError(f"Failed to save audio file: {e}") # Create speaker data - speaker_data = { + speaker_data: dict[str, Any] = { "name": name, "consent": consent, "file_path": str(file_path), @@ -532,23 +537,13 @@ async def upload_voice( "original_filename": audio_file.filename, "file_size": file_size, "ref_text": ref_text, - "cache_status": "pending", # The initial cache state is pending. - "cache_file": None, # The initial cache file is empty. - "cache_generated_at": None, # The initial cache generation time is empty. "embedding_source": "audio", } - # Save metadata using metadata manager (concurrency safe) - success = self.metadata_manager.create_speaker(voice_name_lower, speaker_data) - if not success: - # Clean up the saved file if metadata creation failed - try: - file_path.unlink() - except Exception: - pass - raise ValueError(f"Failed to create metadata for voice '{name}' (possibly already exists)") + # Store voice description if provided. + if speaker_description: + speaker_data["speaker_description"] = speaker_description - # Update in-memory cache self.uploaded_speakers[voice_name_lower] = speaker_data self.supported_speakers.add(voice_name_lower) @@ -562,8 +557,10 @@ async def upload_voice( "mime_type": mime_type, "file_size": file_size, } - if ref_text is not None: - result["ref_text"] = ref_text + if speaker_data.get("ref_text"): + result["ref_text"] = speaker_data["ref_text"] + if speaker_data.get("speaker_description"): + result["speaker_description"] = speaker_data["speaker_description"] return result async def upload_voice_embedding(self, embedding_json: str, consent: str, name: str) -> dict: @@ -633,21 +630,10 @@ async def upload_voice_embedding(self, embedding_json: str, consent: str, name: "mime_type": "application/x-safetensors", "original_filename": filename, "file_size": file_path.stat().st_size, - "cache_status": "ready", - "cache_file": str(file_path), - "cache_generated_at": timestamp, "embedding_source": "direct", "embedding_dim": emb_dim, } - success = self.metadata_manager.create_speaker(voice_name_lower, speaker_data) - if not success: - try: - file_path.unlink() - except Exception: - pass - raise ValueError(f"Failed to create metadata for voice '{name}' (possibly already exists)") - self.uploaded_speakers[voice_name_lower] = speaker_data self.supported_speakers.add(voice_name_lower) @@ -673,25 +659,22 @@ async def delete_voice(self, name: str) -> bool: """ voice_name_lower = name.lower() - # Check if voice exists in memory cache if voice_name_lower not in self.uploaded_speakers: - logger.warning(f"Voice '{name}' not found in memory cache") + logger.warning(f"Voice '{name}' not found") return False - # Delete from metadata manager with file cleanup - # Pass base_dir for path validation - deleted_info = self.metadata_manager.delete_speaker(voice_name_lower) - if not deleted_info: - logger.error(f"Failed to delete voice '{name}' from metadata") - return False + speaker_info = self.uploaded_speakers.pop(voice_name_lower) + self.supported_speakers.discard(voice_name_lower) - # Update in-memory cache - if voice_name_lower in self.uploaded_speakers: - del self.uploaded_speakers[voice_name_lower] - if voice_name_lower in self.supported_speakers: - self.supported_speakers.remove(voice_name_lower) + # Clean up audio file on disk + file_path = speaker_info.get("file_path") + if file_path: + try: + Path(file_path).unlink(missing_ok=True) + except Exception as e: + logger.warning(f"Failed to delete audio file for '{name}': {e}") - logger.info(f"Deleted voice '{name}' and associated files") + logger.info(f"Deleted voice '{name}'") return True def _is_tts_model(self) -> bool: @@ -1037,6 +1020,7 @@ def _build_tts_params(self, request: OpenAICreateSpeechRequest) -> dict[str, Any stored_ref_text = speaker_info.get("ref_text") params["ref_audio"] = [audio_data] params["task_type"] = ["Base"] + params["voice_created_at"] = [speaker_info.get("created_at", 0)] if stored_ref_text: params["ref_text"] = [stored_ref_text] params["x_vector_only_mode"] = [False] diff --git a/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_talker.py b/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_talker.py index 08c0f9a1e67..bc6222bbe2c 100644 --- a/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_talker.py +++ b/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_talker.py @@ -27,6 +27,7 @@ from vllm.sequence import IntermediateTensors from vllm_omni.model_executor.models.output_templates import OmniOutput +from vllm_omni.utils.voice_cache import VoiceEmbeddingCache from .configuration_qwen3_tts import Qwen3TTSConfig, Qwen3TTSSpeakerEncoderConfig, Qwen3TTSTalkerConfig from .qwen3_tts_code_predictor_vllm import Qwen3TTSTalkerCodePredictorForConditionalGenerationVLLM @@ -406,6 +407,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self._tokenizer = None self._speech_tokenizer: Qwen3TTSTokenizer | None = None + # In-memory LRU cache for voice extraction artifacts (Base voice clone). + self._voice_cache = VoiceEmbeddingCache() + # -------------------- vLLM required hooks -------------------- def embed_input_ids(self, input_ids: torch.Tensor, **_: Any) -> torch.Tensor: @@ -1326,6 +1330,25 @@ def _normalize_voice_clone_prompt(raw: object) -> dict[str, object] | None: xvec_only = bool((info_dict.get("x_vector_only_mode") or [False])[0]) in_context_mode = not xvec_only voice_clone_prompt = _normalize_voice_clone_prompt(info_dict.get("voice_clone_prompt")) + + # Voice cache: only for uploaded voices (created_at > 0) + _voice_cache_key = None + if voice_clone_prompt is None: + _speaker_list = info_dict.get("speaker") + if isinstance(_speaker_list, list) and _speaker_list: + _voice_name = str(_speaker_list[0]).lower() + _voice_created_at = float((info_dict.get("voice_created_at") or [0])[0]) + if _voice_created_at > 0: + _voice_cache_key = self._voice_cache.make_cache_key(_voice_name, xvec_only, _voice_created_at) + _cached = self._voice_cache.get(_voice_cache_key) if _voice_cache_key is not None else None + if _cached is not None: + voice_clone_prompt = { + "ref_code": _cached.get("ref_code"), + "ref_spk_embedding": _cached.get("ref_spk_embedding"), + "icl_mode": _cached.get("icl_mode"), + } + _voice_cache_key = None # hit -> don't store again + # Official implementation may pass `voice_clone_prompt.icl_mode`. if voice_clone_prompt is not None and "icl_mode" in voice_clone_prompt: icl_flag = _as_singleton(voice_clone_prompt.get("icl_mode")) @@ -1375,6 +1398,19 @@ def _normalize_voice_clone_prompt(raw: object) -> dict[str, object] | None: wav_np, sr = self._normalize_ref_audio(ref_audio_list[0]) speaker_embed = self._extract_speaker_embedding(wav_np, sr).view(1, 1, -1) + # Cache miss: store extraction result + if _voice_cache_key is not None and speaker_embed is not None: + self._voice_cache.put( + _voice_cache_key, + { + "ref_code": ref_code_prompt.detach().cpu() + if isinstance(ref_code_prompt, torch.Tensor) + else None, + "ref_spk_embedding": speaker_embed.detach().cpu().reshape(-1), + "icl_mode": in_context_mode, + }, + ) + codec_input = torch.cat([codec_input_0, speaker_embed, codec_input_1], dim=1) # Role header (<|im_start|>assistant\n) -> projected text embeds. diff --git a/vllm_omni/model_executor/models/qwen3_tts/voice_cache_manager.py b/vllm_omni/model_executor/models/qwen3_tts/voice_cache_manager.py deleted file mode 100644 index 1e26a161da4..00000000000 --- a/vllm_omni/model_executor/models/qwen3_tts/voice_cache_manager.py +++ /dev/null @@ -1,271 +0,0 @@ -# Copyright 2026 The Alibaba Qwen team. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -from dataclasses import dataclass -from pathlib import Path -from typing import Any - -import torch -from safetensors import safe_open -from safetensors.torch import save_file -from vllm.logger import init_logger - -from vllm_omni.entrypoints.openai.metadata_manager import MetadataManager - -logger = init_logger(__name__) - - -@dataclass -class VoiceClonePromptItem: - """ - Container for one sample's voice-clone prompt information that can be fed to the model. - - Fields are aligned with `Qwen3TTSForConditionalGeneration.generate(..., voice_clone_prompt=...)`. - """ - - ref_code: torch.Tensor | None # (T, Q) or (T,) depending on tokenizer 25Hz/12Hz - ref_spk_embedding: torch.Tensor # (D,) - x_vector_only_mode: bool - icl_mode: bool - ref_text: str | None = None - - -class VoiceCacheManager: - """ - Voice cache manager, responsible for managing custom voice cache functionality. - - Main features: - 1. Load uploaded speaker information from metadata.json - 2. Manage voice clone prompt cache - 3. Update cache status to metadata.json - - Security properties: - - No pickle / torch.load - - Safetensors-only - - Cache path confined to voice samples directory - """ - - def __init__(self, speech_voice_samples_dir: str | None = None, metadata_manager: MetadataManager | None = None): - """ - Initialize the voice cache manager. - - Args: - speech_voice_samples_dir: Speech voice samples directory path, - if None, get from environment variable - metadata_manager: Optional MetadataManager instance for shared metadata access. - If not provided, will create its own (less efficient). - """ - self.speech_voice_samples_dir = speech_voice_samples_dir or os.environ.get( - "SPEECH_VOICE_SAMPLES", "/tmp/voice_samples" - ) - - # Initialize metadata manager - if metadata_manager is not None: - self.metadata_manager = metadata_manager - else: - metadata_file = Path(self.speech_voice_samples_dir) / "metadata.json" - self.metadata_manager = MetadataManager(metadata_file) - - # ------------------------------------------------------------------ - # Metadata helpers - # ------------------------------------------------------------------ - - def load_uploaded_speakers_from_metadata(self) -> dict[str, Any] | None: - """Load uploaded speakers from metadata manager.""" - try: - return self.metadata_manager.get_uploaded_speakers() - except Exception as e: - logger.warning(f"Failed to load uploaded speakers from metadata: {e}") - return None - - def update_metadata_cache_info(self, speaker: str, cache_file_path: Path, status: str = "ready") -> bool: - """ - Update cache information using metadata manager. - - Args: - speaker: Speaker name - cache_file_path: Cache file path - status: Cache status, default is "ready" - - Returns: - bool: Whether the update was successful - """ - try: - speaker_key = speaker.lower() - return self.metadata_manager.update_cache_info( - speaker_key=speaker_key, cache_file_path=cache_file_path, status=status - ) - except Exception as e: - logger.error(f"Failed to update metadata cache info: {e}") - return False - - # ------------------------------------------------------------------ - # Cache save (SAFE) - # ------------------------------------------------------------------ - - def save_voice_cache( - self, - speaker: str, - audio_file_path: Path, - prompt_items: list[VoiceClonePromptItem], - ) -> bool: - """ - Save voice cache using safetensors (no pickle, no RCE). - """ - try: - cache_file_path = audio_file_path.with_suffix(".safetensors") - - tensors: dict[str, torch.Tensor] = {} - metadata: dict[str, str] = {} - - tensors["__len__"] = torch.tensor(len(prompt_items), dtype=torch.int64) - - for i, item in enumerate(prompt_items): - prefix = f"item_{i}_" - - tensors[prefix + "ref_spk_embedding"] = item.ref_spk_embedding.detach().cpu() - - has_ref_code = item.ref_code is not None - tensors[prefix + "has_ref_code"] = torch.tensor(int(has_ref_code), dtype=torch.int8) - - if has_ref_code: - tensors[prefix + "ref_code"] = item.ref_code.detach().cpu() - - tensors[prefix + "x_vector_only_mode"] = torch.tensor(int(item.x_vector_only_mode), dtype=torch.int8) - tensors[prefix + "icl_mode"] = torch.tensor(int(item.icl_mode), dtype=torch.int8) - - if item.ref_text is not None: - metadata[prefix + "ref_text"] = item.ref_text - - save_file(tensors, str(cache_file_path), metadata=metadata) - - return self.update_metadata_cache_info( - speaker=speaker, - cache_file_path=cache_file_path, - status="ready", - ) - - except Exception as e: - logger.error(f"Failed to save safetensors cache for speaker {speaker}: {e}") - self.update_metadata_cache_info(speaker, Path(""), "failed") - return False - - # ------------------------------------------------------------------ - # Cache load (SAFE) - # ------------------------------------------------------------------ - - def load_cached_voice_prompt( - self, - speaker: str, - device: str | None = None, - ) -> list[VoiceClonePromptItem] | None: - """ - Load cached VoiceClonePromptItem list from safetensors. - """ - try: - uploaded_speakers = self.load_uploaded_speakers_from_metadata() - if not uploaded_speakers: - return None - - speaker_key = speaker.lower() - if speaker_key not in uploaded_speakers: - return None - - speaker_info = uploaded_speakers[speaker_key] - if speaker_info.get("cache_status") != "ready": - return None - - cache_file_path = Path(speaker_info.get("cache_file", "")).resolve() - - base_dir = Path(self.speech_voice_samples_dir).resolve() - - # ---- Path confinement (critical security check) - if not str(cache_file_path).startswith(str(base_dir)): - logger.error(f"Illegal cache path outside base dir: {cache_file_path}") - return None - - if not cache_file_path.exists(): - return None - - if cache_file_path.suffix != ".safetensors": - logger.error(f"Legacy or unsafe cache format rejected: {cache_file_path}") - return None - - with safe_open(cache_file_path, framework="pt", device="cpu") as f: - meta = f.metadata() - - num_items = int(f.get_tensor("__len__").item()) - result: list[VoiceClonePromptItem] = [] - - for i in range(num_items): - prefix = f"item_{i}_" - - has_ref_code = bool(f.get_tensor(prefix + "has_ref_code").item()) - - ref_code = f.get_tensor(prefix + "ref_code").to(device) if has_ref_code else None - - ref_spk_embedding = f.get_tensor(prefix + "ref_spk_embedding").to(device) - - x_vector_only_mode = bool(f.get_tensor(prefix + "x_vector_only_mode").item()) - icl_mode = bool(f.get_tensor(prefix + "icl_mode").item()) - - ref_text = meta.get(prefix + "ref_text") - - result.append( - VoiceClonePromptItem( - ref_code=ref_code, - ref_spk_embedding=ref_spk_embedding, - x_vector_only_mode=x_vector_only_mode, - icl_mode=icl_mode, - ref_text=ref_text, - ) - ) - - logger.info(f"Safetensors cache loaded for speaker: {speaker}") - return result - - except Exception as e: - logger.warning(f"Failed to load safetensors cache for speaker {speaker}: {e}") - return None - - # ------------------------------------------------------------------ - # Audio path helper - # ------------------------------------------------------------------ - - def get_speaker_audio_path(self, speaker: str) -> Path | None: - """ - Get speaker's audio file path. - - Args: - speaker: Speaker name - - Returns: - Optional[Path]: Audio file path, returns None if speaker doesn't exist - """ - uploaded_speakers = self.load_uploaded_speakers_from_metadata() - if not uploaded_speakers: - return None - - speaker_key = speaker.lower() - if speaker_key not in uploaded_speakers: - return None - - audio_file_path = Path(uploaded_speakers[speaker_key]["file_path"]) - if audio_file_path.exists(): - return audio_file_path - - logger.warning(f"Audio file not found for speaker {speaker}: {audio_file_path}") - return None diff --git a/vllm_omni/utils/voice_cache.py b/vllm_omni/utils/voice_cache.py new file mode 100644 index 00000000000..2d78a5bfdb9 --- /dev/null +++ b/vllm_omni/utils/voice_cache.py @@ -0,0 +1,89 @@ +"""In-memory LRU cache for voice extraction artifacts. + +Keyed by voice name + extraction mode (e.g. ``"alice:icl"``). +Only named voices are cached; inline ``ref_audio`` without a voice +name is not cached. + +Usage:: + + key = VoiceEmbeddingCache.make_cache_key("alice", xvec_only=False) + cached = cache.get(key) + if cached is None: + # ... extract ... + cache.put(key, {"artifact": result}) +""" + +import os +import threading +from collections import OrderedDict +from typing import Any + +from vllm.logger import init_logger + +logger = init_logger(__name__) + +_DEFAULT_MAX_ENTRIES = 128 + + +class VoiceEmbeddingCache: + """LRU cache for voice extraction outputs. + + Each entry stores a ``dict[str, Any]`` whose contents are model-specific. + Thread-safe via a lightweight ``threading.Lock``. + """ + + def __init__(self, max_entries: int | None = None): + if max_entries is None: + max_entries = int(os.environ.get("VOICE_CACHE_MAX_ENTRIES", _DEFAULT_MAX_ENTRIES)) + self._cache: OrderedDict[str, dict[str, Any]] = OrderedDict() + self._max_entries = max_entries + self._lock = threading.Lock() + self._hits = 0 + self._misses = 0 + logger.info("Voice embedding cache initialized (max_entries=%d)", max_entries) + + @staticmethod + def make_cache_key(voice_name: str, xvec_only: bool, created_at: float = 0.0) -> str: + """Build a cache key from a voice name, upload timestamp, and extraction mode. + + Args: + voice_name: The speaker/voice name (case-insensitive, lowered + by the caller). + xvec_only: True for speaker-embedding-only mode, False for + ICL mode (speaker embedding + ref_code). + created_at: Upload timestamp from metadata. Prevents stale cache + hits after a voice is deleted and re-uploaded with the same + name but different audio. + """ + mode = "xvec" if xvec_only else "icl" + return f"{voice_name}:{created_at:.6f}:{mode}" + + def get(self, key: str) -> dict[str, Any] | None: + """Return cached artifacts or ``None`` on miss. Promotes to MRU on hit.""" + with self._lock: + if key in self._cache: + self._cache.move_to_end(key) + self._hits += 1 + logger.debug("Voice cache HIT (key=%s, hits=%d)", key, self._hits) + return self._cache[key] + self._misses += 1 + return None + + def put(self, key: str, artifacts: dict[str, Any]) -> None: + """Store *artifacts* under *key*, evicting the LRU entry if full.""" + with self._lock: + self._cache[key] = artifacts + self._cache.move_to_end(key) + while len(self._cache) > self._max_entries: + evicted_key, _ = self._cache.popitem(last=False) + logger.debug("Voice cache EVICT (key=%s)", evicted_key) + + def stats(self) -> dict[str, int]: + """Return cache statistics.""" + with self._lock: + return { + "entries": len(self._cache), + "max_entries": self._max_entries, + "hits": self._hits, + "misses": self._misses, + }