diff --git a/examples/offline_inference/fish_speech/end2end.py b/examples/offline_inference/fish_speech/end2end.py index 31c24d3d5d6..60830d06b7f 100644 --- a/examples/offline_inference/fish_speech/end2end.py +++ b/examples/offline_inference/fish_speech/end2end.py @@ -18,7 +18,6 @@ import logging import math import os -import tempfile import time import numpy as np @@ -88,17 +87,10 @@ def build_prompt( semantic_len, ) - # The model-side structured clone prefill consumes a temporary .npy file and - # removes it after loading. Abnormal termination can still leave the file - # behind, which is acceptable for this offline example. - with tempfile.NamedTemporaryFile(prefix="fish_ref_", suffix=".npy", delete=False) as f: - np.save(f, np.asarray(ref_audio_wav, dtype=np.float32)) - ref_audio_npy_path = f.name - additional_information = { "text": normalized_text, "ref_text": normalized_ref_text, - "ref_audio_path": ref_audio_npy_path, + "ref_audio_wav": torch.from_numpy(np.asarray(ref_audio_wav, dtype=np.float32)), "ref_audio_sr": int(ref_audio_sr), "fish_structured_voice_clone": True, } diff --git a/tests/entrypoints/openai_api/test_serving_speech.py b/tests/entrypoints/openai_api/test_serving_speech.py index 334264602ea..57aeef8f9de 100644 --- a/tests/entrypoints/openai_api/test_serving_speech.py +++ b/tests/entrypoints/openai_api/test_serving_speech.py @@ -1861,8 +1861,8 @@ def test_build_fish_clone_prompt_normalizes_text_fields(self, fish_speech_server assert info["text"] == "<|speaker:1|>你好,欢迎回来。" assert info["ref_text"] == "<|speaker:0|>参考音频的原始文本。" assert info["fish_structured_voice_clone"] is True - assert os.path.exists(info["ref_audio_path"]) - os.remove(info["ref_audio_path"]) + assert isinstance(info["ref_audio_wav"], torch.Tensor) + assert info["ref_audio_wav"].dtype == torch.float32 fish_speech_server._estimate_fish_prompt_len.assert_called_once_with( "<|speaker:1|>你好,欢迎回来。", "<|speaker:0|>参考音频的原始文本。", diff --git a/tests/model_executor/models/test_fish_speech_regressions.py b/tests/model_executor/models/test_fish_speech_regressions.py index 1f8c3cf71e8..04d1b20dff6 100644 --- a/tests/model_executor/models/test_fish_speech_regressions.py +++ b/tests/model_executor/models/test_fish_speech_regressions.py @@ -80,8 +80,6 @@ def test_structured_voice_clone_prefill_adds_full_codebooks_with_decode_scale(mo model.codebook_embeddings = codebook_embed model._get_tokenizer = lambda: _FakeTokenizer({"<|audio_start|>": 10, "<|audio_end|>": 11}) - monkeypatch.setattr(slow_ar_module.np, "load", lambda path: [0.0]) - monkeypatch.setattr(slow_ar_module.os, "remove", lambda path: None) monkeypatch.setattr( slow_ar_module, "encode_reference_audio_codes", @@ -97,7 +95,7 @@ def test_structured_voice_clone_prefill_adds_full_codebooks_with_decode_scale(mo { "ref_text": "ref", "text": "target", - "ref_audio_path": "unused.npy", + "ref_audio_wav": torch.tensor([0.0]), "ref_audio_sr": 16000, } ) diff --git a/vllm_omni/entrypoints/openai/serving_speech.py b/vllm_omni/entrypoints/openai/serving_speech.py index 5903c0cd60a..494c977d779 100644 --- a/vllm_omni/entrypoints/openai/serving_speech.py +++ b/vllm_omni/entrypoints/openai/serving_speech.py @@ -6,7 +6,6 @@ import os import re import struct -import tempfile import time from concurrent.futures import ThreadPoolExecutor from pathlib import Path @@ -1301,17 +1300,13 @@ def _build_fish_speech_prompt( wav_samples, sr = ref_audio_data normalized_text, normalized_ref_text = normalize_fish_voice_clone_texts(request.input, request.ref_text) ph_len = self._estimate_fish_prompt_len(normalized_text, normalized_ref_text, ref_audio_data) - with tempfile.NamedTemporaryFile(prefix="fish_ref_", suffix=".npy", delete=False) as f: - np.save(f, np.asarray(wav_samples, dtype=np.float32)) - ref_audio_path = f.name - # Structured clone metadata is consumed directly by - # FishSpeechSlowARForConditionalGeneration.preprocess(), so keep these - # values as scalars instead of the list-wrapped prompt-dict convention. + # Structured clone: scalars (not list-wrapped) because model-side + # preprocess() consumes per-request fields directly. additional_information = { "text": normalized_text, "ref_text": normalized_ref_text, - "ref_audio_path": ref_audio_path, + "ref_audio_wav": torch.from_numpy(np.asarray(wav_samples, dtype=np.float32)), "ref_audio_sr": int(sr), "fish_structured_voice_clone": True, } diff --git a/vllm_omni/model_executor/models/fish_speech/fish_speech_slow_ar.py b/vllm_omni/model_executor/models/fish_speech/fish_speech_slow_ar.py index 4ad2a1fa63b..9333400593a 100644 --- a/vllm_omni/model_executor/models/fish_speech/fish_speech_slow_ar.py +++ b/vllm_omni/model_executor/models/fish_speech/fish_speech_slow_ar.py @@ -14,7 +14,6 @@ import dataclasses import math -import os from collections.abc import Iterable from typing import Any @@ -518,17 +517,19 @@ def _build_structured_voice_clone_prefill_embeds(self, info_dict: dict[str, Any] tokenizer = self._get_tokenizer() ref_text = info_dict.get("ref_text") text = info_dict.get("text") - ref_audio_path = info_dict.get("ref_audio_path") ref_audio_sr = info_dict.get("ref_audio_sr") if not isinstance(ref_text, str) or not isinstance(text, str): raise ValueError("Fish Speech structured voice clone requires string text and ref_text") - if not isinstance(ref_audio_path, str) or not ref_audio_path: - raise ValueError("Fish Speech structured voice clone requires ref_audio_path") if not isinstance(ref_audio_sr, int): raise ValueError("Fish Speech structured voice clone requires integer ref_audio_sr") - ref_audio_wav = np.load(ref_audio_path) - os.remove(ref_audio_path) + ref_audio_wav_raw = info_dict.get("ref_audio_wav") + if ref_audio_wav_raw is None: + raise ValueError("Fish Speech structured voice clone requires ref_audio_wav") + if isinstance(ref_audio_wav_raw, torch.Tensor): + ref_audio_wav = ref_audio_wav_raw.cpu().numpy() + else: + ref_audio_wav = np.asarray(ref_audio_wav_raw, dtype=np.float32) ref_codes_fq = encode_reference_audio_codes( self.model_path,