vllm-project · linyueqian · Apr 9, 2026 · Apr 8, 2026 · Apr 9, 2026
@@ -18,7 +18,6 @@
 import logging
 import math
 import os
-import tempfile
 import time
 
 import numpy as np
@@ -88,17 +87,10 @@ def build_prompt(
         semantic_len,
     )
 
-    # The model-side structured clone prefill consumes a temporary .npy file and
-    # removes it after loading. Abnormal termination can still leave the file
-    # behind, which is acceptable for this offline example.
-    with tempfile.NamedTemporaryFile(prefix="fish_ref_", suffix=".npy", delete=False) as f:
-        np.save(f, np.asarray(ref_audio_wav, dtype=np.float32))
-        ref_audio_npy_path = f.name
-
     additional_information = {
         "text": normalized_text,
         "ref_text": normalized_ref_text,
-        "ref_audio_path": ref_audio_npy_path,
+        "ref_audio_wav": torch.from_numpy(np.asarray(ref_audio_wav, dtype=np.float32)),
         "ref_audio_sr": int(ref_audio_sr),
         "fish_structured_voice_clone": True,
     }

@@ -1861,8 +1861,8 @@ def test_build_fish_clone_prompt_normalizes_text_fields(self, fish_speech_server
         assert info["text"] == "<|speaker:1|>你好，欢迎回来。"
         assert info["ref_text"] == "<|speaker:0|>参考音频的原始文本。"
         assert info["fish_structured_voice_clone"] is True
-        assert os.path.exists(info["ref_audio_path"])
-        os.remove(info["ref_audio_path"])
+        assert isinstance(info["ref_audio_wav"], torch.Tensor)
+        assert info["ref_audio_wav"].dtype == torch.float32
         fish_speech_server._estimate_fish_prompt_len.assert_called_once_with(
             "<|speaker:1|>你好，欢迎回来。",
             "<|speaker:0|>参考音频的原始文本。",

@@ -80,8 +80,6 @@ def test_structured_voice_clone_prefill_adds_full_codebooks_with_decode_scale(mo
     model.codebook_embeddings = codebook_embed
     model._get_tokenizer = lambda: _FakeTokenizer({"<|audio_start|>": 10, "<|audio_end|>": 11})
 
-    monkeypatch.setattr(slow_ar_module.np, "load", lambda path: [0.0])
-    monkeypatch.setattr(slow_ar_module.os, "remove", lambda path: None)
     monkeypatch.setattr(
         slow_ar_module,
         "encode_reference_audio_codes",
@@ -97,7 +95,7 @@ def test_structured_voice_clone_prefill_adds_full_codebooks_with_decode_scale(mo
         {
             "ref_text": "ref",
             "text": "target",
-            "ref_audio_path": "unused.npy",
+            "ref_audio_wav": torch.tensor([0.0]),
             "ref_audio_sr": 16000,
         }
     )

@@ -6,7 +6,6 @@
 import os
 import re
 import struct
-import tempfile
 import time
 from concurrent.futures import ThreadPoolExecutor
 from pathlib import Path
@@ -1301,17 +1300,13 @@ def _build_fish_speech_prompt(
         wav_samples, sr = ref_audio_data
         normalized_text, normalized_ref_text = normalize_fish_voice_clone_texts(request.input, request.ref_text)
         ph_len = self._estimate_fish_prompt_len(normalized_text, normalized_ref_text, ref_audio_data)
-        with tempfile.NamedTemporaryFile(prefix="fish_ref_", suffix=".npy", delete=False) as f:
-            np.save(f, np.asarray(wav_samples, dtype=np.float32))
-            ref_audio_path = f.name
 
-        # Structured clone metadata is consumed directly by
-        # FishSpeechSlowARForConditionalGeneration.preprocess(), so keep these
-        # values as scalars instead of the list-wrapped prompt-dict convention.
+        # Structured clone: values are passed bare (not list-wrapped) because
+        # the model-side preprocess() consumes per-request fields directly.
         additional_information = {
             "text": normalized_text,
             "ref_text": normalized_ref_text,
-            "ref_audio_path": ref_audio_path,
+            "ref_audio_wav": torch.from_numpy(np.asarray(wav_samples, dtype=np.float32)),
             "ref_audio_sr": int(sr),
             "fish_structured_voice_clone": True,
         }

@@ -14,7 +14,6 @@
 
 import dataclasses
 import math
-import os
 from collections.abc import Iterable
 from typing import Any
 
@@ -518,17 +517,19 @@ def _build_structured_voice_clone_prefill_embeds(self, info_dict: dict[str, Any]
         tokenizer = self._get_tokenizer()
         ref_text = info_dict.get("ref_text")
         text = info_dict.get("text")
-        ref_audio_path = info_dict.get("ref_audio_path")
         ref_audio_sr = info_dict.get("ref_audio_sr")
         if not isinstance(ref_text, str) or not isinstance(text, str):
             raise ValueError("Fish Speech structured voice clone requires string text and ref_text")
-        if not isinstance(ref_audio_path, str) or not ref_audio_path:
-            raise ValueError("Fish Speech structured voice clone requires ref_audio_path")
         if not isinstance(ref_audio_sr, int):
             raise ValueError("Fish Speech structured voice clone requires integer ref_audio_sr")
 
-        ref_audio_wav = np.load(ref_audio_path)
-        os.remove(ref_audio_path)
+        ref_audio_wav_raw = info_dict.get("ref_audio_wav")
+        if ref_audio_wav_raw is None:
+            raise ValueError("Fish Speech structured voice clone requires ref_audio_wav")
+        if isinstance(ref_audio_wav_raw, torch.Tensor):
+            ref_audio_wav = ref_audio_wav_raw.cpu().numpy()
+        else:
+            ref_audio_wav = np.asarray(ref_audio_wav_raw, dtype=np.float32)
 
         ref_codes_fq = encode_reference_audio_codes(
             self.model_path,