diff --git a/docs/serving/speech_api.md b/docs/serving/speech_api.md
index e6ab77edda8..ecbe8d9ac98 100644
--- a/docs/serving/speech_api.md
+++ b/docs/serving/speech_api.md
@@ -118,6 +118,7 @@ Content-Type: application/json
 | `instructions` | string | "" | Voice style/emotion instructions |
 | `max_new_tokens` | integer | 2048 | Maximum tokens to generate |
 | `initial_codec_chunk_frames` | integer | null | Per-request initial chunk size override for TTFA tuning. When null, IC is computed dynamically based on server load. |
+| `stream` | bool | false | Stream raw PCM chunks as they are decoded (requires `response_format="pcm"`) |
 
 **Supported languages:** Auto, Chinese, English, Japanese, Korean, German, French, Russian, Portuguese, Spanish, Italian
 
@@ -143,9 +144,23 @@ Lists available voices for the loaded model.
 
 ```json
 {
-    "voices": ["aiden", "dylan", "eric", "ono_anna", "ryan", "serena", "sohee", "uncle_fu", "vivian"]
+    "voices": ["aiden", "dylan", "eric", "ono_anna", "ryan", "serena", "sohee", "uncle_fu", "vivian", "custom_voice_1"],
+    "uploaded_voices": [
+        {
+            "name": "custom_voice_1",
+            "consent": "user_consent_id",
+            "created_at": 1738660000,
+            "file_size": 1024000,
+            "mime_type": "audio/wav",
+            "ref_text": "The exact transcript of the audio sample.",
+            "speaker_description": "warm narrator"
+        }
+    ]
 }
 ```
+
+`uploaded_voices` is always present (empty list when no custom voices have been uploaded). Fields `ref_text` and `speaker_description` are omitted per-entry when not provided at upload time.
+
 ```
 POST /v1/audio/voices
 Content-Type: multipart/form-data
@@ -161,6 +176,7 @@ Upload a new voice sample for voice cloning in Base task TTS requests.
 | `consent` | string | Yes | Consent recording ID |
 | `name` | string | Yes | Name for the new voice |
 | `ref_text` | string | No | Transcript of the audio. When provided, enables in-context voice cloning (higher quality). Without it, only the speaker embedding is extracted. |
+| `speaker_description` | string | No | Free-form description of the voice (e.g. "warm narrator", "energetic presenter"). Stored as metadata and returned in `GET /v1/audio/voices`. |
 
 **Response Example:**
 
@@ -172,11 +188,15 @@ Upload a new voice sample for voice cloning in Base task TTS requests.
     "consent": "user_consent_id",
     "created_at": 1738660000,
     "mime_type": "audio/wav",
-    "file_size": 1024000
+    "file_size": 1024000,
+    "ref_text": "The exact transcript of the audio sample.",
+    "speaker_description": "warm narrator"
   }
 }
 ```
 
+Fields `ref_text` and `speaker_description` are omitted when not provided at upload time.
+
 **Usage Example:**
 
 ```bash
@@ -184,7 +204,8 @@ curl -X POST http://localhost:8091/v1/audio/voices \
   -F "audio_sample=@/path/to/voice_sample.wav" \
   -F "consent=user_consent_id" \
   -F "name=custom_voice_1" \
-  -F "ref_text=The exact transcript of the audio sample."
+  -F "ref_text=The exact transcript of the audio sample." \
+  -F "speaker_description=warm narrator"
 ```
 
 ## Streaming Text Input (WebSocket)
diff --git a/docs/user_guide/examples/online_serving/qwen3_tts.md b/docs/user_guide/examples/online_serving/qwen3_tts.md
index 401a5c2e945..156c4942cd9 100644
--- a/docs/user_guide/examples/online_serving/qwen3_tts.md
+++ b/docs/user_guide/examples/online_serving/qwen3_tts.md
@@ -159,7 +159,7 @@ curl -X POST http://localhost:8091/v1/audio/speech \
     -H "Content-Type: application/json" \
     -d '{
         "input": "Hello, how are you?",
-        "speaker": "vivian",
+        "voice": "vivian",
         "language": "English"
     }' --output output.wav
 
@@ -168,7 +168,7 @@ curl -X POST http://localhost:8091/v1/audio/speech \
     -H "Content-Type: application/json" \
     -d '{
         "input": "I am so excited!",
-        "speaker": "vivian",
+        "voice": "vivian",
         "instructions": "Speak with great enthusiasm"
     }' --output excited.wav
 
@@ -185,7 +185,7 @@ client = OpenAI(base_url="http://localhost:8091/v1", api_key="none")
 
 response = client.audio.speech.create(
     model="Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice",
-    speaker="vivian",
+    voice="vivian",
     input="Hello, how are you?",
 )
 
@@ -201,7 +201,7 @@ response = httpx.post(
     "http://localhost:8091/v1/audio/speech",
     json={
         "input": "Hello, how are you?",
-        "speaker": "vivian",
+        "voice": "vivian",
         "language": "English",
     },
     timeout=300.0,
@@ -237,12 +237,16 @@ List all available voices/speakers from the loaded model, including both built-i
       "consent": "user_consent_id",
       "created_at": 1738660000,
       "file_size": 1024000,
-      "mime_type": "audio/wav"
+      "mime_type": "audio/wav",
+      "ref_text": "The exact transcript of the audio sample.",
+      "speaker_description": "warm narrator"
     }
   ]
 }
 ```
 
+Fields `ref_text` and `speaker_description` are omitted per-entry when not provided at upload time.
+
 #### POST /v1/audio/voices
 
 Upload a new voice sample for voice cloning in Base task TTS requests.
@@ -252,6 +256,7 @@ Upload a new voice sample for voice cloning in Base task TTS requests.
 - `consent` (required): Consent recording ID
 - `name` (required): Name for the new voice
 - `ref_text` (optional): Transcript of the audio. Enables in-context voice cloning (higher quality).
+- `speaker_description` (optional): Free-form description of the voice (e.g. "warm narrator", "energetic presenter"). Stored as metadata.
 
 **Response Example:**
 ```json
@@ -262,18 +267,23 @@ Upload a new voice sample for voice cloning in Base task TTS requests.
     "consent": "user_consent_id",
     "created_at": 1738660000,
     "mime_type": "audio/wav",
-    "file_size": 1024000
+    "file_size": 1024000,
+    "ref_text": "The exact transcript of the audio sample.",
+    "speaker_description": "warm narrator"
   }
 }
 ```
 
+Fields `ref_text` and `speaker_description` are omitted when not provided at upload time.
+
 **Usage Example:**
 ```bash
-curl -X POST http://localhost:8000/v1/audio/voices \
+curl -X POST http://localhost:8091/v1/audio/voices \
   -F "audio_sample=@/path/to/voice_sample.wav" \
   -F "consent=user_consent_id" \
   -F "name=custom_voice_1" \
-  -F "ref_text=The exact transcript of the audio sample."
+  -F "ref_text=The exact transcript of the audio sample." \
+  -F "speaker_description=warm narrator"
 ```
 
 ### Endpoint
@@ -290,7 +300,7 @@ This endpoint follows the [OpenAI Audio Speech API](https://platform.openai.com/
 ```json
 {
     "input": "Text to synthesize",
-    "speaker": "vivian",
+    "voice": "vivian",
     "response_format": "wav",
     "task_type": "CustomVoice",
     "language": "Auto",
@@ -310,7 +320,7 @@ Returns binary audio data with appropriate `Content-Type` header (e.g., `audio/w
 
 ### Voice and language (summary)
 
-- **Speaker**: Use the `speaker` request field to select the speaker (e.g., `vivian`, `ryan`, `aiden`). List available speakers with `GET /v1/audio/voices`.
+- **Speaker**: Use the `voice` request field to select the speaker (e.g., `vivian`, `ryan`, `aiden`). List available speakers with `GET /v1/audio/voices`.
 - **Language**: Use the `language` field for the codec language tag (`Auto`, `Chinese`, `English`, etc.). Default is `Auto` for automatic detection.
 - **CustomVoice**: Requires a valid `voice` from the model’s speaker set. **VoiceDesign**: Use `instructions` to describe the voice. **Base**: Use `ref_audio` and `ref_text` for voice cloning.
 
@@ -322,7 +332,7 @@ Returns binary audio data with appropriate `Content-Type` header (e.g., `audio/w
 | ----------------- | ------ | -------------- | ----------------------------------------------------------- |
 | `input`           | string | **required**   | Text to synthesize                                          |
 | `model`           | string | server's model | Model to use (optional, should match server if specified)   |
-| `speaker`         | string | "vivian"       | Speaker name (e.g., vivian, ryan, aiden)                    |
+| `voice`           | string | "vivian"       | Speaker name (e.g., vivian, ryan, aiden)                    |
 | `response_format` | string | "wav"          | Audio format: wav, mp3, flac, pcm, aac, opus                |
 | `speed`           | float  | 1.0            | Playback speed (0.25-4.0, not supported with `stream=true`) |
 
@@ -357,7 +367,7 @@ curl -X POST http://localhost:8091/v1/audio/speech \
     -H "Content-Type: application/json" \
     -d '{
         "input": "Hello, how are you?",
-        "speaker": "vivian",
+        "voice": "vivian",
         "language": "English",
         "stream": true,
         "response_format": "pcm"
diff --git a/examples/online_serving/qwen3_tts/README.md b/examples/online_serving/qwen3_tts/README.md
index 1b51e00f122..5504b5737a8 100644
--- a/examples/online_serving/qwen3_tts/README.md
+++ b/examples/online_serving/qwen3_tts/README.md
@@ -233,6 +233,7 @@ Upload a new voice sample for voice cloning in Base task TTS requests.
 - `consent` (required): Consent recording ID
 - `name` (required): Name for the new voice
 - `ref_text` (optional): Transcript of the audio. Enables in-context voice cloning (higher quality).
+- `speaker_description` (optional): Free-form description of the voice (e.g. "warm narrator", "energetic presenter").
 
 **Response Example:**
 ```json
@@ -243,18 +244,23 @@ Upload a new voice sample for voice cloning in Base task TTS requests.
     "consent": "user_consent_id",
     "created_at": 1738660000,
     "mime_type": "audio/wav",
-    "file_size": 1024000
+    "file_size": 1024000,
+    "ref_text": "The exact transcript of the audio sample.",
+    "speaker_description": "warm narrator"
   }
 }
 ```
 
+Fields `ref_text` and `speaker_description` are omitted when not provided at upload time.
+
 **Usage Example:**
 ```bash
 curl -X POST http://localhost:8000/v1/audio/voices \
   -F "audio_sample=@/path/to/voice_sample.wav" \
   -F "consent=user_consent_id" \
   -F "name=custom_voice_1" \
-  -F "ref_text=The exact transcript of the audio sample."
+  -F "ref_text=The exact transcript of the audio sample." \
+  -F "speaker_description=warm narrator"
 ```
 
 ### Endpoint
diff --git a/tests/entrypoints/openai_api/test_serving_speech.py b/tests/entrypoints/openai_api/test_serving_speech.py
index 969df5bce0d..17203cb577f 100644
--- a/tests/entrypoints/openai_api/test_serving_speech.py
+++ b/tests/entrypoints/openai_api/test_serving_speech.py
@@ -233,17 +233,20 @@ async def list_voices():
         uploaded_voices = []
         if hasattr(speech_server, "uploaded_speakers"):
             for voice_name, info in speech_server.uploaded_speakers.items():
-                uploaded_voices.append(
-                    {
-                        "name": info.get("name", voice_name),
-                        "consent": info.get("consent", ""),
-                        "created_at": info.get("created_at", 0),
-                        "file_size": info.get("file_size", 0),
-                        "mime_type": info.get("mime_type", ""),
-                        "embedding_source": info.get("embedding_source", "audio"),
-                        "embedding_dim": info.get("embedding_dim"),
-                    }
-                )
+                voice_entry = {
+                    "name": info.get("name", voice_name),
+                    "consent": info.get("consent", ""),
+                    "created_at": info.get("created_at", 0),
+                    "file_size": info.get("file_size", 0),
+                    "mime_type": info.get("mime_type", ""),
+                    "embedding_source": info.get("embedding_source", "audio"),
+                    "embedding_dim": info.get("embedding_dim"),
+                }
+                if info.get("ref_text"):
+                    voice_entry["ref_text"] = info["ref_text"]
+                if info.get("speaker_description"):
+                    voice_entry["speaker_description"] = info["speaker_description"]
+                uploaded_voices.append(voice_entry)
         return {"voices": speakers, "uploaded_voices": uploaded_voices}
 
     app.add_api_route("/v1/audio/voices", list_voices, methods=["GET"])
@@ -255,7 +258,8 @@ async def upload_voice(
         speaker_embedding: str | None = Form(None),
         consent: str = Form(...),
         name: str = Form(...),
-        ref_text: str = Form(None),
+        ref_text: str | None = Form(None),
+        speaker_description: str | None = Form(None),
     ):
         try:
             if speaker_embedding is not None and audio_sample is not None:
@@ -263,7 +267,13 @@ async def upload_voice(
             if speaker_embedding is not None:
                 result = await speech_server.upload_voice_embedding(speaker_embedding, consent, name)
             elif audio_sample is not None:
-                result = await speech_server.upload_voice(audio_sample, consent, name, ref_text=ref_text)
+                result = await speech_server.upload_voice(
+                    audio_sample,
+                    consent,
+                    name,
+                    ref_text=ref_text,
+                    speaker_description=speaker_description,
+                )
             else:
                 raise ValueError("Either 'audio_sample' or 'speaker_embedding' must be provided")
             return {"success": True, "voice": result}
@@ -397,6 +407,44 @@ def test_upload_voice_with_ref_text(self, client, tmp_path):
         assert result["voice"].get("ref_text") == "Hello world transcript"
         response = client.delete("/v1/audio/voices/test_voice_rt")
 
+    def test_upload_voice_with_speaker_description(self, client, tmp_path):
+        """Test voice upload with speaker_description stores and returns the description."""
+        # Pre-cleanup in case a previous test run left this voice behind
+        client.delete("/v1/audio/voices/test_voice_vd")
+
+        audio_content = b"fake audio content" * 1000
+        files = {"audio_sample": ("test.wav", audio_content, "audio/wav")}
+        data = {"consent": "c1", "name": "test_voice_vd", "speaker_description": "  warm, energetic narrator  "}
+
+        response = client.post("/v1/audio/voices", files=files, data=data)
+        try:
+            assert response.status_code == 200
+            result = response.json()
+            assert result["success"] is True
+            assert result["voice"]["name"] == "test_voice_vd"
+            assert result["voice"].get("speaker_description") == "warm, energetic narrator"
+        finally:
+            client.delete("/v1/audio/voices/test_voice_vd")
+
+    def test_upload_voice_speaker_description_in_listing(self, client):
+        """Test that speaker_description survives the upload → list round-trip."""
+        client.delete("/v1/audio/voices/test_voice_sd_list")
+
+        audio_content = b"fake audio content" * 1000
+        files = {"audio_sample": ("test.wav", audio_content, "audio/wav")}
+        data = {"consent": "c1", "name": "test_voice_sd_list", "speaker_description": "calm female narrator"}
+
+        response = client.post("/v1/audio/voices", files=files, data=data)
+        try:
+            assert response.status_code == 200
+
+            listing = client.get("/v1/audio/voices").json()
+            uploaded = {v["name"]: v for v in listing["uploaded_voices"]}
+            assert "test_voice_sd_list" in uploaded
+            assert uploaded["test_voice_sd_list"]["speaker_description"] == "calm female narrator"
+        finally:
+            client.delete("/v1/audio/voices/test_voice_sd_list")
+
     def test_upload_voice_file_too_large(self, client):
         """Test voice upload with file exceeding size limit."""
         # Create a file larger than 10MB
@@ -850,6 +898,7 @@ def test_build_tts_params_with_uploaded_voice(self, speech_server):
                 "file_path": "/tmp/voice_samples/custom_voice_consent_123.wav",
                 "mime_type": "audio/wav",
                 "ref_text": None,
+                "created_at": 1711234567.89,
             }
         }
         speech_server.supported_speakers = {"ryan", "vivian", "custom_voice"}
@@ -862,6 +911,7 @@ def test_build_tts_params_with_uploaded_voice(self, speech_server):
             assert params["ref_audio"] == ["data:audio/wav;base64,ZmFrZWF1ZGlv"]
             assert params["x_vector_only_mode"] == [True]
             assert params["task_type"] == ["Base"]
+            assert params["voice_created_at"] == [1711234567.89]
             assert "ref_text" not in params
 
     def test_build_tts_params_with_uploaded_voice_ref_text(self, speech_server):
@@ -872,6 +922,7 @@ def test_build_tts_params_with_uploaded_voice_ref_text(self, speech_server):
                 "file_path": "/tmp/voice_samples/custom_voice_consent_123.wav",
                 "mime_type": "audio/wav",
                 "ref_text": "Hello world transcript",
+                "created_at": 1711234567.89,
             }
         }
         speech_server.supported_speakers = {"ryan", "vivian", "custom_voice"}
@@ -885,6 +936,7 @@ def test_build_tts_params_with_uploaded_voice_ref_text(self, speech_server):
             assert params["x_vector_only_mode"] == [False]
             assert params["task_type"] == ["Base"]
             assert params["ref_text"] == ["Hello world transcript"]
+            assert params["voice_created_at"] == [1711234567.89]
 
     def test_build_tts_params_without_uploaded_voice(self, speech_server):
         """Test _build_tts_params does not auto-set ref_audio for non-uploaded voices."""
diff --git a/tests/test_voice_cache.py b/tests/test_voice_cache.py
new file mode 100644
index 00000000000..69327aae571
--- /dev/null
+++ b/tests/test_voice_cache.py
@@ -0,0 +1,129 @@
+import threading
+
+import pytest
+
+from vllm_omni.utils.voice_cache import VoiceEmbeddingCache
+
+pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
+
+
+@pytest.fixture
+def cache():
+    return VoiceEmbeddingCache(max_entries=4)
+
+
+class TestVoiceEmbeddingCache:
+    def test_miss_returns_none(self, cache: VoiceEmbeddingCache):
+        assert cache.get("nonexistent") is None
+        assert cache.stats()["misses"] == 1
+
+    def test_put_and_hit(self, cache: VoiceEmbeddingCache):
+        cache.put("abc", {"val": 42})
+        result = cache.get("abc")
+        assert result is not None
+        assert result["val"] == 42
+        assert cache.stats()["hits"] == 1
+
+    def test_lru_eviction(self, cache: VoiceEmbeddingCache):
+        for i in range(5):
+            cache.put(f"key{i}", {"i": i})
+        # key0 should have been evicted (oldest, max_entries=4)
+        assert cache.get("key0") is None
+        # key1..key4 should still be present
+        for i in range(1, 5):
+            assert cache.get(f"key{i}") is not None
+        assert cache.stats()["entries"] == 4
+
+    def test_lru_access_promotes(self, cache: VoiceEmbeddingCache):
+        cache.put("a", {"v": 1})
+        cache.put("b", {"v": 2})
+        cache.put("c", {"v": 3})
+        cache.put("d", {"v": 4})
+        # Access "a" to promote it to MRU
+        cache.get("a")
+        # Insert "e" -- should evict "b" (now the oldest), not "a"
+        cache.put("e", {"v": 5})
+        assert cache.get("a") is not None
+        assert cache.get("b") is None
+
+    def test_put_overwrites(self, cache: VoiceEmbeddingCache):
+        cache.put("k", {"old": True})
+        cache.put("k", {"new": True})
+        result = cache.get("k")
+        assert result is not None
+        assert "new" in result
+        assert "old" not in result
+        assert cache.stats()["entries"] == 1
+
+    def test_make_cache_key_includes_mode(self):
+        k1 = VoiceEmbeddingCache.make_cache_key("alice", xvec_only=True)
+        k2 = VoiceEmbeddingCache.make_cache_key("alice", xvec_only=False)
+        assert k1 != k2
+        assert "xvec" in k1
+        assert "icl" in k2
+
+    def test_make_cache_key_deterministic(self):
+        k1 = VoiceEmbeddingCache.make_cache_key("bob", xvec_only=True)
+        k2 = VoiceEmbeddingCache.make_cache_key("bob", xvec_only=True)
+        assert k1 == k2
+
+    def test_make_cache_key_created_at_isolation(self):
+        """Different created_at timestamps must produce different keys (stale-cache protection)."""
+        k1 = VoiceEmbeddingCache.make_cache_key("alice", xvec_only=False, created_at=1000.0)
+        k2 = VoiceEmbeddingCache.make_cache_key("alice", xvec_only=False, created_at=2000.0)
+        assert k1 != k2
+
+    def test_stale_cache_protection(self, cache: VoiceEmbeddingCache):
+        """Re-upload (new created_at) must produce a cache miss, not a stale hit."""
+        key_old = VoiceEmbeddingCache.make_cache_key("alice", xvec_only=False, created_at=1000.0)
+        key_new = VoiceEmbeddingCache.make_cache_key("alice", xvec_only=False, created_at=2000.0)
+        cache.put(key_old, {"ref_spk_embedding": "old_emb"})
+        # Re-upload produces a new created_at → different key → cold miss
+        assert cache.get(key_new) is None
+        # Old key still in cache (not yet evicted)
+        assert cache.get(key_old) is not None
+
+    def test_cache_mode_isolation(self, cache: VoiceEmbeddingCache):
+        """xvec entry must NOT be served for an icl request (same voice)."""
+        key_xvec = VoiceEmbeddingCache.make_cache_key("alice", xvec_only=True)
+        key_icl = VoiceEmbeddingCache.make_cache_key("alice", xvec_only=False)
+        cache.put(key_xvec, {"ref_code": None, "ref_spk_embedding": "emb"})
+        # icl request should miss — different key
+        assert cache.get(key_icl) is None
+        # xvec request should hit
+        assert cache.get(key_xvec) is not None
+
+    def test_stats_counters(self, cache: VoiceEmbeddingCache):
+        cache.put("x", {"v": 1})
+        cache.get("x")  # hit
+        cache.get("x")  # hit
+        cache.get("y")  # miss
+        s = cache.stats()
+        assert s["hits"] == 2
+        assert s["misses"] == 1
+        assert s["entries"] == 1
+        assert s["max_entries"] == 4
+
+    def test_thread_safety(self):
+        cache = VoiceEmbeddingCache(max_entries=32)
+        errors = []
+
+        def worker(thread_id: int):
+            try:
+                for i in range(50):
+                    key = f"t{thread_id}_k{i}"
+                    cache.put(key, {"tid": thread_id, "i": i})
+                    cache.get(key)
+                    cache.get(f"t{(thread_id + 1) % 10}_k{i}")
+            except Exception as e:
+                errors.append(e)
+
+        threads = [threading.Thread(target=worker, args=(t,)) for t in range(10)]
+        for t in threads:
+            t.start()
+        for t in threads:
+            t.join()
+
+        assert not errors, f"Thread safety errors: {errors}"
+        s = cache.stats()
+        assert s["entries"] <= 32
diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py
index 0ffe33abde2..80f4f8d4aee 100644
--- a/vllm_omni/entrypoints/openai/api_server.py
+++ b/vllm_omni/entrypoints/openai/api_server.py
@@ -1024,17 +1024,20 @@ async def list_voices(raw_request: Request):
     uploaded_speakers = []
     if hasattr(handler, "uploaded_speakers"):
         for voice_name, info in handler.uploaded_speakers.items():
-            uploaded_speakers.append(
-                {
-                    "name": info.get("name", voice_name),
-                    "consent": info.get("consent", ""),
-                    "created_at": info.get("created_at", 0),
-                    "file_size": info.get("file_size", 0),
-                    "mime_type": info.get("mime_type", ""),
-                    "embedding_source": info.get("embedding_source", "audio"),
-                    "embedding_dim": info.get("embedding_dim"),
-                }
-            )
+            voice_entry = {
+                "name": info.get("name", voice_name),
+                "consent": info.get("consent", ""),
+                "created_at": info.get("created_at", 0),
+                "file_size": info.get("file_size", 0),
+                "mime_type": info.get("mime_type", ""),
+                "embedding_source": info.get("embedding_source", "audio"),
+                "embedding_dim": info.get("embedding_dim"),
+            }
+            if info.get("ref_text"):
+                voice_entry["ref_text"] = info["ref_text"]
+            if info.get("speaker_description"):
+                voice_entry["speaker_description"] = info["speaker_description"]
+            uploaded_speakers.append(voice_entry)
 
     return JSONResponse(content={"voices": speakers, "uploaded_voices": uploaded_speakers})
 
@@ -1053,7 +1056,8 @@ async def upload_voice(
     speaker_embedding: str | None = Form(None),
     consent: str = Form(...),
     name: str = Form(...),
-    ref_text: str = Form(None),
+    ref_text: str | None = Form(None),
+    speaker_description: str | None = Form(None),
 ):
     """Upload a new voice for voice cloning.
 
@@ -1072,6 +1076,11 @@ async def upload_voice(
         speaker_embedding: JSON-encoded float list. Mutually exclusive with audio_sample.
         consent: Consent recording ID
         name: Name for the new voice
+        ref_text: Optional transcript of the audio for ICL (in-context
+            learning) mode. When provided, voice clone requests using this
+            voice will produce higher quality results.
+        speaker_description: Optional free-form description of the voice
+            (e.g. "warm speaker", "energetic narrator").
         raw_request: Raw FastAPI request
 
     Returns:
@@ -1089,7 +1098,13 @@ async def upload_voice(
         if speaker_embedding is not None:
             result = await handler.upload_voice_embedding(speaker_embedding, consent, name)
         elif audio_sample is not None:
-            result = await handler.upload_voice(audio_sample, consent, name, ref_text=ref_text)
+            result = await handler.upload_voice(
+                audio_sample,
+                consent,
+                name,
+                ref_text=ref_text,
+                speaker_description=speaker_description,
+            )
         else:
             return base(raw_request).create_error_response(
                 message="Either 'audio_sample' or 'speaker_embedding' must be provided"
diff --git a/vllm_omni/entrypoints/openai/metadata_manager.py b/vllm_omni/entrypoints/openai/metadata_manager.py
deleted file mode 100644
index 4077aa23bcd..00000000000
--- a/vllm_omni/entrypoints/openai/metadata_manager.py
+++ /dev/null
@@ -1,243 +0,0 @@
-"""
-Metadata manager for voice samples and cache information.
-
-Provides a unified interface for managing metadata.json with
-concurrency safety and data consistency across multiple processes.
-"""
-
-import fcntl
-import json
-import logging
-import os
-import threading
-import time
-from collections.abc import Callable
-from pathlib import Path
-from typing import Any
-
-logger = logging.getLogger(__name__)
-
-
-class MetadataManager:
-    """
-    Manages metadata for uploaded speakers and cache information.
-
-    Features:
-    1. Single source of truth for metadata
-    2. Concurrency safety with threading locks
-    3. Atomic read-modify-write operations
-    4. Merge updates to preserve fields from different components
-    """
-
-    def __init__(self, metadata_file: Path):
-        """
-        Initialize the metadata manager.
-
-        Args:
-            metadata_file: Path to metadata.json file
-        """
-        self.metadata_file = metadata_file
-        self._lock = threading.Lock()  # For intra-process concurrency
-        self._metadata = self._load_from_disk()
-
-        # Create lock file for cross-process synchronization
-        self.lock_file = metadata_file.with_suffix(".lock")
-        self.lock_file.parent.mkdir(parents=True, exist_ok=True)
-
-    def _load_from_disk(self) -> dict[str, Any]:
-        """Load metadata from disk."""
-        if not self.metadata_file.exists():
-            return {"uploaded_speakers": {}}
-
-        try:
-            with open(self.metadata_file) as f:
-                return json.load(f)
-        except Exception as e:
-            logger.error(f"Failed to load metadata from {self.metadata_file}: {e}")
-            return {"uploaded_speakers": {}}
-
-    def _save_to_disk(self, metadata: dict[str, Any]) -> bool:
-        """Save metadata to disk."""
-        try:
-            self.metadata_file.parent.mkdir(parents=True, exist_ok=True)
-            tmp = self.metadata_file.with_suffix(".tmp")
-            with open(tmp, "w") as f:
-                json.dump(metadata, f, indent=2)
-            tmp.replace(self.metadata_file)
-            return True
-        except Exception as e:
-            logger.error(f"Failed to save metadata to {self.metadata_file}: {e}")
-            return False
-
-    # ================================
-    # Core fix: single flock overwrites RMW
-    # ================================
-    def _update_with_file_lock(
-        self, update_fn: Callable[[dict[str, Any]], dict[str, Any] | None]
-    ) -> dict[str, Any] | None:
-        lock_fd = os.open(self.lock_file, os.O_CREAT | os.O_RDWR)
-        try:
-            fcntl.flock(lock_fd, fcntl.LOCK_EX)
-
-            metadata = self._load_from_disk()
-            result = update_fn(metadata)
-            if result is None:
-                return None
-
-            if not self._save_to_disk(metadata):
-                return None
-
-            self._metadata = metadata
-            return result
-        finally:
-            fcntl.flock(lock_fd, fcntl.LOCK_UN)
-            os.close(lock_fd)
-
-    def get_uploaded_speakers(self) -> dict[str, dict[str, Any]]:
-        """Get all uploaded speakers."""
-        # Read directly from disk to ensure getting the latest data
-        metadata = self._load_from_disk()
-        return metadata.get("uploaded_speakers", {}).copy()
-
-    def get_speaker(self, speaker_key: str) -> dict[str, Any] | None:
-        """Get specific speaker information."""
-        # Read directly from disk to ensure getting the latest data
-        metadata = self._load_from_disk()
-        speakers = metadata.get("uploaded_speakers", {})
-        return speakers.get(speaker_key, {}).copy() if speaker_key in speakers else None
-
-    def update_speaker(self, speaker_key: str, updates: dict[str, Any]) -> bool:
-        """
-        Update speaker information with merge semantics.
-
-        Uses file locking for cross-process atomic operations.
-        """
-        with self._lock:
-
-            def _update(metadata: dict[str, Any]):
-                speakers = metadata.setdefault("uploaded_speakers", {})
-                entry = speakers.get(speaker_key, {})
-                entry.update(updates)
-                speakers[speaker_key] = entry
-                return True
-
-            return self._update_with_file_lock(_update) is not None
-
-    def create_speaker(self, speaker_key: str, speaker_data: dict[str, Any]) -> bool:
-        """
-        Create a new speaker entry.
-
-        Uses file locking for cross-process atomic operations.
-        """
-        with self._lock:
-
-            def _create(metadata: dict[str, Any]):
-                speakers = metadata.setdefault("uploaded_speakers", {})
-                if speaker_key in speakers:
-                    logger.warning(f"Speaker {speaker_key} already exists")
-                    return None
-                speakers[speaker_key] = speaker_data
-                return True
-
-            return self._update_with_file_lock(_create) is not None
-
-    def update_cache_info(self, speaker_key: str, cache_file_path: Path, status: str = "ready") -> bool:
-        """
-        Update cache information for a speaker.
-        """
-        updates = {
-            "cache_status": status,
-            "cache_file": str(cache_file_path),
-            "cache_generated_at": time.time(),
-        }
-        return self.update_speaker(speaker_key, updates)
-
-    def delete_speaker(self, speaker_key: str) -> dict[str, Any] | None:
-        """
-        Delete a speaker from metadata and clean up associated files.
-
-        Uses file locking for cross-process atomic operations.
-
-        Args:
-            speaker_key: Speaker name (lowercase)
-            base_dir: Base directory for file validation (optional)
-
-        Returns:
-            dict: Deleted speaker information if successful, None if speaker doesn't exist or error
-        """
-        with self._lock:
-
-            def _delete(metadata: dict[str, Any]):
-                speakers = metadata.get("uploaded_speakers", {})
-                if speaker_key not in speakers:
-                    logger.warning(f"Speaker {speaker_key} not found in metadata")
-                    return None
-
-                speaker_info = speakers.pop(speaker_key)
-
-                # Clean up associated files
-                deleted_files = self._cleanup_speaker_files(speaker_info)
-                if deleted_files:
-                    logger.info(f"Deleted {len(deleted_files)} files for speaker {speaker_key}: {deleted_files}")
-
-                return speaker_info
-
-            return self._update_with_file_lock(_delete)
-
-    def _cleanup_speaker_files(self, speaker_info: dict[str, Any]) -> list[str]:
-        """
-        Clean up files associated with a speaker.
-
-        Args:
-            speaker_info: Speaker information dictionary
-            base_dir: Base directory for file validation (optional)
-
-        Returns:
-            list: List of successfully deleted file paths
-        """
-        deleted_files = []
-
-        # Helper function to safely delete a file
-        def safe_delete(file_path_str: str, description: str) -> bool:
-            if not file_path_str:
-                return False
-
-            try:
-                file_path = Path(file_path_str)
-
-                # Check if file exists
-                if not file_path.exists():
-                    logger.debug(f"{description} not found: {file_path}")
-                    return False
-
-                # Delete the file
-                file_path.unlink()
-                logger.info(f"Deleted {description}: {file_path}")
-                deleted_files.append(str(file_path))
-                return True
-
-            except Exception as e:
-                logger.error(f"Failed to delete {description} {file_path_str}: {e}")
-                return False
-
-        # Delete audio file
-        audio_file = speaker_info.get("file_path")
-        if audio_file:
-            safe_delete(audio_file, "audio file")
-
-        # Delete cache file
-        cache_file = speaker_info.get("cache_file")
-        if cache_file:
-            safe_delete(cache_file, "cache file")
-
-        return deleted_files
-
-    def reload_from_disk(self) -> bool:
-        """Force reload metadata from disk (useful for external changes)."""
-        with self._lock:
-            try:
-                self._metadata = self._load_from_disk()
-                return True
-            except Exception as e:
-                logger.error(f"Failed to reload metadata from disk: {e}")
-                return False
diff --git a/vllm_omni/entrypoints/openai/serving_speech.py b/vllm_omni/entrypoints/openai/serving_speech.py
index b483181fd5f..90c99fdabf6 100644
--- a/vllm_omni/entrypoints/openai/serving_speech.py
+++ b/vllm_omni/entrypoints/openai/serving_speech.py
@@ -24,7 +24,6 @@
 from vllm.utils import random_uuid
 
 from vllm_omni.entrypoints.openai.audio_utils_mixin import AudioMixin
-from vllm_omni.entrypoints.openai.metadata_manager import MetadataManager
 from vllm_omni.entrypoints.openai.protocol.audio import (
     AudioResponse,
     BatchSpeechRequest,
@@ -147,14 +146,10 @@ def _validate_path_within_directory(file_path: Path, directory: Path) -> bool:
 class OmniOpenAIServingSpeech(OpenAIServing, AudioMixin):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        # Initialize uploaded speakers storage
+        # Initialize uploaded speakers storage (ephemeral — cleared on restart)
         speech_voice_samples_dir = os.environ.get("SPEECH_VOICE_SAMPLES", "/tmp/voice_samples")
         self.uploaded_speakers_dir = Path(speech_voice_samples_dir)
         self.uploaded_speakers_dir.mkdir(parents=True, exist_ok=True)
-        self.metadata_file = self.uploaded_speakers_dir / "metadata.json"
-
-        # Initialize metadata manager
-        self.metadata_manager = MetadataManager(self.metadata_file)
 
         # Find and cache the TTS stage (if any) during initialization
         self._tts_stage = self._find_tts_stage()
@@ -171,17 +166,16 @@ def __init__(self, *args, **kwargs):
         # Cache TTS configuration values (computed once, reused per request)
         self._max_instructions_length = self._compute_max_instructions_length()
 
-        # Load supported speakers
+        # Load supported speakers (built-in only; uploaded voices start empty)
         self.supported_speakers = self._load_supported_speakers()
-        # Load uploaded speakers
-        self.uploaded_speakers = self.metadata_manager.get_uploaded_speakers()
-
-        # Merge supported speakers with uploaded speakers
-        self.supported_speakers.update(self.uploaded_speakers.keys())
+        self.uploaded_speakers: dict[str, dict] = {}
+        logger.warning(
+            "Uploaded voices are ephemeral and will be lost on server restart. "
+            "Re-upload voices after each restart if needed."
+        )
         self._tts_tokenizer = None
 
         logger.info(f"Loaded {len(self.supported_speakers)} supported speakers: {sorted(self.supported_speakers)}")
-        logger.info(f"Loaded {len(self.uploaded_speakers)} uploaded speakers")
 
         # Batch configuration
         self._batch_max_items: int = getattr(self.engine_client, "tts_batch_max_items", 32)
@@ -417,11 +411,20 @@ def _get_uploaded_audio_data(self, voice_name: str) -> str | None:
             return None
 
     async def upload_voice(
-        self, audio_file: UploadFile, consent: str, name: str, *, ref_text: str | None = None
+        self,
+        audio_file: UploadFile,
+        consent: str,
+        name: str,
+        *,
+        ref_text: str | None = None,
+        speaker_description: str | None = None,
     ) -> dict:
-        # Normalize ref_text: treat whitespace-only as absent
+        """Upload a new voice sample."""
+        # Normalize optional strings: treat whitespace-only as absent
         if ref_text is not None:
             ref_text = ref_text.strip() or None
+        if speaker_description is not None:
+            speaker_description = speaker_description.strip() or None
         # Validate file size (max 10MB)
         MAX_FILE_SIZE = 10 * 1024 * 1024  # 10MB
         audio_file.file.seek(0, 2)  # Seek to end
@@ -473,7 +476,9 @@ async def upload_voice(
 
         # Check if voice already exists
         if voice_name_lower in self.uploaded_speakers:
-            raise ValueError(f"Voice '{name}' already exists")
+            raise ValueError(
+                f"Voice '{name}' already exists. To re-register this voice, delete it first and then upload it again."
+            )
 
         # Sanitize name and consent to prevent path traversal
         sanitized_name = _sanitize_filename(name)
@@ -523,7 +528,7 @@ async def upload_voice(
             raise ValueError(f"Failed to save audio file: {e}")
 
         # Create speaker data
-        speaker_data = {
+        speaker_data: dict[str, Any] = {
             "name": name,
             "consent": consent,
             "file_path": str(file_path),
@@ -532,23 +537,13 @@ async def upload_voice(
             "original_filename": audio_file.filename,
             "file_size": file_size,
             "ref_text": ref_text,
-            "cache_status": "pending",  # The initial cache state is pending.
-            "cache_file": None,  # The initial cache file is empty.
-            "cache_generated_at": None,  # The initial cache generation time is empty.
             "embedding_source": "audio",
         }
 
-        # Save metadata using metadata manager (concurrency safe)
-        success = self.metadata_manager.create_speaker(voice_name_lower, speaker_data)
-        if not success:
-            # Clean up the saved file if metadata creation failed
-            try:
-                file_path.unlink()
-            except Exception:
-                pass
-            raise ValueError(f"Failed to create metadata for voice '{name}' (possibly already exists)")
+        # Store voice description if provided.
+        if speaker_description:
+            speaker_data["speaker_description"] = speaker_description
 
-        # Update in-memory cache
         self.uploaded_speakers[voice_name_lower] = speaker_data
         self.supported_speakers.add(voice_name_lower)
 
@@ -562,8 +557,10 @@ async def upload_voice(
             "mime_type": mime_type,
             "file_size": file_size,
         }
-        if ref_text is not None:
-            result["ref_text"] = ref_text
+        if speaker_data.get("ref_text"):
+            result["ref_text"] = speaker_data["ref_text"]
+        if speaker_data.get("speaker_description"):
+            result["speaker_description"] = speaker_data["speaker_description"]
         return result
 
     async def upload_voice_embedding(self, embedding_json: str, consent: str, name: str) -> dict:
@@ -633,21 +630,10 @@ async def upload_voice_embedding(self, embedding_json: str, consent: str, name:
             "mime_type": "application/x-safetensors",
             "original_filename": filename,
             "file_size": file_path.stat().st_size,
-            "cache_status": "ready",
-            "cache_file": str(file_path),
-            "cache_generated_at": timestamp,
             "embedding_source": "direct",
             "embedding_dim": emb_dim,
         }
 
-        success = self.metadata_manager.create_speaker(voice_name_lower, speaker_data)
-        if not success:
-            try:
-                file_path.unlink()
-            except Exception:
-                pass
-            raise ValueError(f"Failed to create metadata for voice '{name}' (possibly already exists)")
-
         self.uploaded_speakers[voice_name_lower] = speaker_data
         self.supported_speakers.add(voice_name_lower)
 
@@ -673,25 +659,22 @@ async def delete_voice(self, name: str) -> bool:
         """
         voice_name_lower = name.lower()
 
-        # Check if voice exists in memory cache
         if voice_name_lower not in self.uploaded_speakers:
-            logger.warning(f"Voice '{name}' not found in memory cache")
+            logger.warning(f"Voice '{name}' not found")
             return False
 
-        # Delete from metadata manager with file cleanup
-        # Pass base_dir for path validation
-        deleted_info = self.metadata_manager.delete_speaker(voice_name_lower)
-        if not deleted_info:
-            logger.error(f"Failed to delete voice '{name}' from metadata")
-            return False
+        speaker_info = self.uploaded_speakers.pop(voice_name_lower)
+        self.supported_speakers.discard(voice_name_lower)
 
-        # Update in-memory cache
-        if voice_name_lower in self.uploaded_speakers:
-            del self.uploaded_speakers[voice_name_lower]
-        if voice_name_lower in self.supported_speakers:
-            self.supported_speakers.remove(voice_name_lower)
+        # Clean up audio file on disk
+        file_path = speaker_info.get("file_path")
+        if file_path:
+            try:
+                Path(file_path).unlink(missing_ok=True)
+            except Exception as e:
+                logger.warning(f"Failed to delete audio file for '{name}': {e}")
 
-        logger.info(f"Deleted voice '{name}' and associated files")
+        logger.info(f"Deleted voice '{name}'")
         return True
 
     def _is_tts_model(self) -> bool:
@@ -1037,6 +1020,7 @@ def _build_tts_params(self, request: OpenAICreateSpeechRequest) -> dict[str, Any
                 stored_ref_text = speaker_info.get("ref_text")
                 params["ref_audio"] = [audio_data]
                 params["task_type"] = ["Base"]
+                params["voice_created_at"] = [speaker_info.get("created_at", 0)]
                 if stored_ref_text:
                     params["ref_text"] = [stored_ref_text]
                     params["x_vector_only_mode"] = [False]
diff --git a/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_talker.py b/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_talker.py
index 08c0f9a1e67..bc6222bbe2c 100644
--- a/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_talker.py
+++ b/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_talker.py
@@ -27,6 +27,7 @@
 from vllm.sequence import IntermediateTensors
 
 from vllm_omni.model_executor.models.output_templates import OmniOutput
+from vllm_omni.utils.voice_cache import VoiceEmbeddingCache
 
 from .configuration_qwen3_tts import Qwen3TTSConfig, Qwen3TTSSpeakerEncoderConfig, Qwen3TTSTalkerConfig
 from .qwen3_tts_code_predictor_vllm import Qwen3TTSTalkerCodePredictorForConditionalGenerationVLLM
@@ -406,6 +407,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self._tokenizer = None
         self._speech_tokenizer: Qwen3TTSTokenizer | None = None
 
+        # In-memory LRU cache for voice extraction artifacts (Base voice clone).
+        self._voice_cache = VoiceEmbeddingCache()
+
     # -------------------- vLLM required hooks --------------------
 
     def embed_input_ids(self, input_ids: torch.Tensor, **_: Any) -> torch.Tensor:
@@ -1326,6 +1330,25 @@ def _normalize_voice_clone_prompt(raw: object) -> dict[str, object] | None:
             xvec_only = bool((info_dict.get("x_vector_only_mode") or [False])[0])
             in_context_mode = not xvec_only
             voice_clone_prompt = _normalize_voice_clone_prompt(info_dict.get("voice_clone_prompt"))
+
+            # Voice cache: only for uploaded voices (created_at > 0)
+            _voice_cache_key = None
+            if voice_clone_prompt is None:
+                _speaker_list = info_dict.get("speaker")
+                if isinstance(_speaker_list, list) and _speaker_list:
+                    _voice_name = str(_speaker_list[0]).lower()
+                    _voice_created_at = float((info_dict.get("voice_created_at") or [0])[0])
+                    if _voice_created_at > 0:
+                        _voice_cache_key = self._voice_cache.make_cache_key(_voice_name, xvec_only, _voice_created_at)
+                    _cached = self._voice_cache.get(_voice_cache_key) if _voice_cache_key is not None else None
+                    if _cached is not None:
+                        voice_clone_prompt = {
+                            "ref_code": _cached.get("ref_code"),
+                            "ref_spk_embedding": _cached.get("ref_spk_embedding"),
+                            "icl_mode": _cached.get("icl_mode"),
+                        }
+                        _voice_cache_key = None  # hit -> don't store again
+
             # Official implementation may pass `voice_clone_prompt.icl_mode`.
             if voice_clone_prompt is not None and "icl_mode" in voice_clone_prompt:
                 icl_flag = _as_singleton(voice_clone_prompt.get("icl_mode"))
@@ -1375,6 +1398,19 @@ def _normalize_voice_clone_prompt(raw: object) -> dict[str, object] | None:
                 wav_np, sr = self._normalize_ref_audio(ref_audio_list[0])
                 speaker_embed = self._extract_speaker_embedding(wav_np, sr).view(1, 1, -1)
 
+            # Cache miss: store extraction result
+            if _voice_cache_key is not None and speaker_embed is not None:
+                self._voice_cache.put(
+                    _voice_cache_key,
+                    {
+                        "ref_code": ref_code_prompt.detach().cpu()
+                        if isinstance(ref_code_prompt, torch.Tensor)
+                        else None,
+                        "ref_spk_embedding": speaker_embed.detach().cpu().reshape(-1),
+                        "icl_mode": in_context_mode,
+                    },
+                )
+
             codec_input = torch.cat([codec_input_0, speaker_embed, codec_input_1], dim=1)
 
             # Role header (<|im_start|>assistant\n) -> projected text embeds.
diff --git a/vllm_omni/model_executor/models/qwen3_tts/voice_cache_manager.py b/vllm_omni/model_executor/models/qwen3_tts/voice_cache_manager.py
deleted file mode 100644
index 1e26a161da4..00000000000
--- a/vllm_omni/model_executor/models/qwen3_tts/voice_cache_manager.py
+++ /dev/null
@@ -1,271 +0,0 @@
-# Copyright 2026 The Alibaba Qwen team.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-from dataclasses import dataclass
-from pathlib import Path
-from typing import Any
-
-import torch
-from safetensors import safe_open
-from safetensors.torch import save_file
-from vllm.logger import init_logger
-
-from vllm_omni.entrypoints.openai.metadata_manager import MetadataManager
-
-logger = init_logger(__name__)
-
-
-@dataclass
-class VoiceClonePromptItem:
-    """
-    Container for one sample's voice-clone prompt information that can be fed to the model.
-
-    Fields are aligned with `Qwen3TTSForConditionalGeneration.generate(..., voice_clone_prompt=...)`.
-    """
-
-    ref_code: torch.Tensor | None  # (T, Q) or (T,) depending on tokenizer 25Hz/12Hz
-    ref_spk_embedding: torch.Tensor  # (D,)
-    x_vector_only_mode: bool
-    icl_mode: bool
-    ref_text: str | None = None
-
-
-class VoiceCacheManager:
-    """
-    Voice cache manager, responsible for managing custom voice cache functionality.
-
-    Main features:
-    1. Load uploaded speaker information from metadata.json
-    2. Manage voice clone prompt cache
-    3. Update cache status to metadata.json
-
-    Security properties:
-    - No pickle / torch.load
-    - Safetensors-only
-    - Cache path confined to voice samples directory
-    """
-
-    def __init__(self, speech_voice_samples_dir: str | None = None, metadata_manager: MetadataManager | None = None):
-        """
-        Initialize the voice cache manager.
-
-        Args:
-            speech_voice_samples_dir: Speech voice samples directory path,
-                if None, get from environment variable
-            metadata_manager: Optional MetadataManager instance for shared metadata access.
-                If not provided, will create its own (less efficient).
-        """
-        self.speech_voice_samples_dir = speech_voice_samples_dir or os.environ.get(
-            "SPEECH_VOICE_SAMPLES", "/tmp/voice_samples"
-        )
-
-        # Initialize metadata manager
-        if metadata_manager is not None:
-            self.metadata_manager = metadata_manager
-        else:
-            metadata_file = Path(self.speech_voice_samples_dir) / "metadata.json"
-            self.metadata_manager = MetadataManager(metadata_file)
-
-    # ------------------------------------------------------------------
-    # Metadata helpers
-    # ------------------------------------------------------------------
-
-    def load_uploaded_speakers_from_metadata(self) -> dict[str, Any] | None:
-        """Load uploaded speakers from metadata manager."""
-        try:
-            return self.metadata_manager.get_uploaded_speakers()
-        except Exception as e:
-            logger.warning(f"Failed to load uploaded speakers from metadata: {e}")
-            return None
-
-    def update_metadata_cache_info(self, speaker: str, cache_file_path: Path, status: str = "ready") -> bool:
-        """
-        Update cache information using metadata manager.
-
-        Args:
-            speaker: Speaker name
-            cache_file_path: Cache file path
-            status: Cache status, default is "ready"
-
-        Returns:
-            bool: Whether the update was successful
-        """
-        try:
-            speaker_key = speaker.lower()
-            return self.metadata_manager.update_cache_info(
-                speaker_key=speaker_key, cache_file_path=cache_file_path, status=status
-            )
-        except Exception as e:
-            logger.error(f"Failed to update metadata cache info: {e}")
-            return False
-
-    # ------------------------------------------------------------------
-    # Cache save (SAFE)
-    # ------------------------------------------------------------------
-
-    def save_voice_cache(
-        self,
-        speaker: str,
-        audio_file_path: Path,
-        prompt_items: list[VoiceClonePromptItem],
-    ) -> bool:
-        """
-        Save voice cache using safetensors (no pickle, no RCE).
-        """
-        try:
-            cache_file_path = audio_file_path.with_suffix(".safetensors")
-
-            tensors: dict[str, torch.Tensor] = {}
-            metadata: dict[str, str] = {}
-
-            tensors["__len__"] = torch.tensor(len(prompt_items), dtype=torch.int64)
-
-            for i, item in enumerate(prompt_items):
-                prefix = f"item_{i}_"
-
-                tensors[prefix + "ref_spk_embedding"] = item.ref_spk_embedding.detach().cpu()
-
-                has_ref_code = item.ref_code is not None
-                tensors[prefix + "has_ref_code"] = torch.tensor(int(has_ref_code), dtype=torch.int8)
-
-                if has_ref_code:
-                    tensors[prefix + "ref_code"] = item.ref_code.detach().cpu()
-
-                tensors[prefix + "x_vector_only_mode"] = torch.tensor(int(item.x_vector_only_mode), dtype=torch.int8)
-                tensors[prefix + "icl_mode"] = torch.tensor(int(item.icl_mode), dtype=torch.int8)
-
-                if item.ref_text is not None:
-                    metadata[prefix + "ref_text"] = item.ref_text
-
-            save_file(tensors, str(cache_file_path), metadata=metadata)
-
-            return self.update_metadata_cache_info(
-                speaker=speaker,
-                cache_file_path=cache_file_path,
-                status="ready",
-            )
-
-        except Exception as e:
-            logger.error(f"Failed to save safetensors cache for speaker {speaker}: {e}")
-            self.update_metadata_cache_info(speaker, Path(""), "failed")
-            return False
-
-    # ------------------------------------------------------------------
-    # Cache load (SAFE)
-    # ------------------------------------------------------------------
-
-    def load_cached_voice_prompt(
-        self,
-        speaker: str,
-        device: str | None = None,
-    ) -> list[VoiceClonePromptItem] | None:
-        """
-        Load cached VoiceClonePromptItem list from safetensors.
-        """
-        try:
-            uploaded_speakers = self.load_uploaded_speakers_from_metadata()
-            if not uploaded_speakers:
-                return None
-
-            speaker_key = speaker.lower()
-            if speaker_key not in uploaded_speakers:
-                return None
-
-            speaker_info = uploaded_speakers[speaker_key]
-            if speaker_info.get("cache_status") != "ready":
-                return None
-
-            cache_file_path = Path(speaker_info.get("cache_file", "")).resolve()
-
-            base_dir = Path(self.speech_voice_samples_dir).resolve()
-
-            # ---- Path confinement (critical security check)
-            if not str(cache_file_path).startswith(str(base_dir)):
-                logger.error(f"Illegal cache path outside base dir: {cache_file_path}")
-                return None
-
-            if not cache_file_path.exists():
-                return None
-
-            if cache_file_path.suffix != ".safetensors":
-                logger.error(f"Legacy or unsafe cache format rejected: {cache_file_path}")
-                return None
-
-            with safe_open(cache_file_path, framework="pt", device="cpu") as f:
-                meta = f.metadata()
-
-                num_items = int(f.get_tensor("__len__").item())
-                result: list[VoiceClonePromptItem] = []
-
-                for i in range(num_items):
-                    prefix = f"item_{i}_"
-
-                    has_ref_code = bool(f.get_tensor(prefix + "has_ref_code").item())
-
-                    ref_code = f.get_tensor(prefix + "ref_code").to(device) if has_ref_code else None
-
-                    ref_spk_embedding = f.get_tensor(prefix + "ref_spk_embedding").to(device)
-
-                    x_vector_only_mode = bool(f.get_tensor(prefix + "x_vector_only_mode").item())
-                    icl_mode = bool(f.get_tensor(prefix + "icl_mode").item())
-
-                    ref_text = meta.get(prefix + "ref_text")
-
-                    result.append(
-                        VoiceClonePromptItem(
-                            ref_code=ref_code,
-                            ref_spk_embedding=ref_spk_embedding,
-                            x_vector_only_mode=x_vector_only_mode,
-                            icl_mode=icl_mode,
-                            ref_text=ref_text,
-                        )
-                    )
-
-            logger.info(f"Safetensors cache loaded for speaker: {speaker}")
-            return result
-
-        except Exception as e:
-            logger.warning(f"Failed to load safetensors cache for speaker {speaker}: {e}")
-            return None
-
-    # ------------------------------------------------------------------
-    # Audio path helper
-    # ------------------------------------------------------------------
-
-    def get_speaker_audio_path(self, speaker: str) -> Path | None:
-        """
-        Get speaker's audio file path.
-
-        Args:
-            speaker: Speaker name
-
-        Returns:
-            Optional[Path]: Audio file path, returns None if speaker doesn't exist
-        """
-        uploaded_speakers = self.load_uploaded_speakers_from_metadata()
-        if not uploaded_speakers:
-            return None
-
-        speaker_key = speaker.lower()
-        if speaker_key not in uploaded_speakers:
-            return None
-
-        audio_file_path = Path(uploaded_speakers[speaker_key]["file_path"])
-        if audio_file_path.exists():
-            return audio_file_path
-
-        logger.warning(f"Audio file not found for speaker {speaker}: {audio_file_path}")
-        return None
diff --git a/vllm_omni/utils/voice_cache.py b/vllm_omni/utils/voice_cache.py
new file mode 100644
index 00000000000..2d78a5bfdb9
--- /dev/null
+++ b/vllm_omni/utils/voice_cache.py
@@ -0,0 +1,89 @@
+"""In-memory LRU cache for voice extraction artifacts.
+
+Keyed by voice name + extraction mode (e.g. ``"alice:icl"``).
+Only named voices are cached; inline ``ref_audio`` without a voice
+name is not cached.
+
+Usage::
+
+    key = VoiceEmbeddingCache.make_cache_key("alice", xvec_only=False)
+    cached = cache.get(key)
+    if cached is None:
+        # ... extract ...
+        cache.put(key, {"artifact": result})
+"""
+
+import os
+import threading
+from collections import OrderedDict
+from typing import Any
+
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+_DEFAULT_MAX_ENTRIES = 128
+
+
+class VoiceEmbeddingCache:
+    """LRU cache for voice extraction outputs.
+
+    Each entry stores a ``dict[str, Any]`` whose contents are model-specific.
+    Thread-safe via a lightweight ``threading.Lock``.
+    """
+
+    def __init__(self, max_entries: int | None = None):
+        if max_entries is None:
+            max_entries = int(os.environ.get("VOICE_CACHE_MAX_ENTRIES", _DEFAULT_MAX_ENTRIES))
+        self._cache: OrderedDict[str, dict[str, Any]] = OrderedDict()
+        self._max_entries = max_entries
+        self._lock = threading.Lock()
+        self._hits = 0
+        self._misses = 0
+        logger.info("Voice embedding cache initialized (max_entries=%d)", max_entries)
+
+    @staticmethod
+    def make_cache_key(voice_name: str, xvec_only: bool, created_at: float = 0.0) -> str:
+        """Build a cache key from a voice name, upload timestamp, and extraction mode.
+
+        Args:
+            voice_name: The speaker/voice name (case-insensitive, lowered
+                by the caller).
+            xvec_only: True for speaker-embedding-only mode, False for
+                ICL mode (speaker embedding + ref_code).
+            created_at: Upload timestamp from metadata. Prevents stale cache
+                hits after a voice is deleted and re-uploaded with the same
+                name but different audio.
+        """
+        mode = "xvec" if xvec_only else "icl"
+        return f"{voice_name}:{created_at:.6f}:{mode}"
+
+    def get(self, key: str) -> dict[str, Any] | None:
+        """Return cached artifacts or ``None`` on miss.  Promotes to MRU on hit."""
+        with self._lock:
+            if key in self._cache:
+                self._cache.move_to_end(key)
+                self._hits += 1
+                logger.debug("Voice cache HIT (key=%s, hits=%d)", key, self._hits)
+                return self._cache[key]
+            self._misses += 1
+            return None
+
+    def put(self, key: str, artifacts: dict[str, Any]) -> None:
+        """Store *artifacts* under *key*, evicting the LRU entry if full."""
+        with self._lock:
+            self._cache[key] = artifacts
+            self._cache.move_to_end(key)
+            while len(self._cache) > self._max_entries:
+                evicted_key, _ = self._cache.popitem(last=False)
+                logger.debug("Voice cache EVICT (key=%s)", evicted_key)
+
+    def stats(self) -> dict[str, int]:
+        """Return cache statistics."""
+        with self._lock:
+            return {
+                "entries": len(self._cache),
+                "max_entries": self._max_entries,
+                "hits": self._hits,
+                "misses": self._misses,
+            }