Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 1 addition & 9 deletions examples/offline_inference/fish_speech/end2end.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
import logging
import math
import os
import tempfile
import time

import numpy as np
Expand Down Expand Up @@ -88,17 +87,10 @@ def build_prompt(
semantic_len,
)

# The model-side structured clone prefill consumes a temporary .npy file and
# removes it after loading. Abnormal termination can still leave the file
# behind, which is acceptable for this offline example.
with tempfile.NamedTemporaryFile(prefix="fish_ref_", suffix=".npy", delete=False) as f:
np.save(f, np.asarray(ref_audio_wav, dtype=np.float32))
ref_audio_npy_path = f.name

additional_information = {
"text": normalized_text,
"ref_text": normalized_ref_text,
"ref_audio_path": ref_audio_npy_path,
"ref_audio_wav": torch.from_numpy(np.asarray(ref_audio_wav, dtype=np.float32)),
"ref_audio_sr": int(ref_audio_sr),
"fish_structured_voice_clone": True,
}
Expand Down
4 changes: 2 additions & 2 deletions tests/entrypoints/openai_api/test_serving_speech.py
Original file line number Diff line number Diff line change
Expand Up @@ -1861,8 +1861,8 @@ def test_build_fish_clone_prompt_normalizes_text_fields(self, fish_speech_server
assert info["text"] == "<|speaker:1|>你好,欢迎回来。"
assert info["ref_text"] == "<|speaker:0|>参考音频的原始文本。"
assert info["fish_structured_voice_clone"] is True
assert os.path.exists(info["ref_audio_path"])
os.remove(info["ref_audio_path"])
assert isinstance(info["ref_audio_wav"], torch.Tensor)
assert info["ref_audio_wav"].dtype == torch.float32
fish_speech_server._estimate_fish_prompt_len.assert_called_once_with(
"<|speaker:1|>你好,欢迎回来。",
"<|speaker:0|>参考音频的原始文本。",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -80,8 +80,6 @@ def test_structured_voice_clone_prefill_adds_full_codebooks_with_decode_scale(mo
model.codebook_embeddings = codebook_embed
model._get_tokenizer = lambda: _FakeTokenizer({"<|audio_start|>": 10, "<|audio_end|>": 11})

monkeypatch.setattr(slow_ar_module.np, "load", lambda path: [0.0])
monkeypatch.setattr(slow_ar_module.os, "remove", lambda path: None)
monkeypatch.setattr(
slow_ar_module,
"encode_reference_audio_codes",
Expand All @@ -97,7 +95,7 @@ def test_structured_voice_clone_prefill_adds_full_codebooks_with_decode_scale(mo
{
"ref_text": "ref",
"text": "target",
"ref_audio_path": "unused.npy",
"ref_audio_wav": torch.tensor([0.0]),
"ref_audio_sr": 16000,
}
)
Expand Down
11 changes: 3 additions & 8 deletions vllm_omni/entrypoints/openai/serving_speech.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
import os
import re
import struct
import tempfile
import time
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
Expand Down Expand Up @@ -1301,17 +1300,13 @@ def _build_fish_speech_prompt(
wav_samples, sr = ref_audio_data
normalized_text, normalized_ref_text = normalize_fish_voice_clone_texts(request.input, request.ref_text)
ph_len = self._estimate_fish_prompt_len(normalized_text, normalized_ref_text, ref_audio_data)
with tempfile.NamedTemporaryFile(prefix="fish_ref_", suffix=".npy", delete=False) as f:
np.save(f, np.asarray(wav_samples, dtype=np.float32))
ref_audio_path = f.name

# Structured clone metadata is consumed directly by
# FishSpeechSlowARForConditionalGeneration.preprocess(), so keep these
# values as scalars instead of the list-wrapped prompt-dict convention.
# Structured clone: scalars (not list-wrapped) because model-side
# preprocess() consumes per-request fields directly.
additional_information = {
"text": normalized_text,
"ref_text": normalized_ref_text,
"ref_audio_path": ref_audio_path,
"ref_audio_wav": torch.from_numpy(np.asarray(wav_samples, dtype=np.float32)),
"ref_audio_sr": int(sr),
"fish_structured_voice_clone": True,
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@

import dataclasses
import math
import os
from collections.abc import Iterable
from typing import Any

Expand Down Expand Up @@ -518,17 +517,19 @@ def _build_structured_voice_clone_prefill_embeds(self, info_dict: dict[str, Any]
tokenizer = self._get_tokenizer()
ref_text = info_dict.get("ref_text")
text = info_dict.get("text")
ref_audio_path = info_dict.get("ref_audio_path")
ref_audio_sr = info_dict.get("ref_audio_sr")
if not isinstance(ref_text, str) or not isinstance(text, str):
raise ValueError("Fish Speech structured voice clone requires string text and ref_text")
if not isinstance(ref_audio_path, str) or not ref_audio_path:
raise ValueError("Fish Speech structured voice clone requires ref_audio_path")
if not isinstance(ref_audio_sr, int):
raise ValueError("Fish Speech structured voice clone requires integer ref_audio_sr")

ref_audio_wav = np.load(ref_audio_path)
os.remove(ref_audio_path)
ref_audio_wav_raw = info_dict.get("ref_audio_wav")
if ref_audio_wav_raw is None:
raise ValueError("Fish Speech structured voice clone requires ref_audio_wav")
if isinstance(ref_audio_wav_raw, torch.Tensor):
ref_audio_wav = ref_audio_wav_raw.cpu().numpy()
else:
ref_audio_wav = np.asarray(ref_audio_wav_raw, dtype=np.float32)

ref_codes_fq = encode_reference_audio_codes(
self.model_path,
Expand Down
Loading