Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
using SLERP and sends the result to the /v1/audio/speech API.

Requirements:
pip install torch resampy soundfile numpy httpx
pip install torch soundfile numpy httpx

Examples:
# Extract and save an embedding
Expand Down Expand Up @@ -143,11 +143,12 @@ def _load_speaker_encoder_weights(encoder: torch.nn.Module, model_path: str) ->

def compute_mel_spectrogram(audio: np.ndarray, sr: int = 24000) -> torch.Tensor:
"""Compute 128-bin mel spectrogram matching Qwen3-TTS's extraction pipeline."""
from vllm.multimodal.audio import resample_audio_resampy
from vllm.multimodal.audio import AudioResampler

# Resample to 24kHz if needed
if sr != 24000:
audio = resample_audio_resampy(audio.astype(np.float32), orig_sr=sr, target_sr=24000)
resampler = AudioResampler(target_sr=24000)
audio = resampler.resample(audio.astype(np.float32), orig_sr=sr)

y = torch.from_numpy(audio).unsqueeze(0).float()

Expand Down
1 change: 0 additions & 1 deletion requirements/common.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
# Common dependencies for all platforms
av>=14.0.0
omegaconf>=2.3.0
resampy>=0.4.3
diffusers>=0.36.0
accelerate==1.12.0
soundfile>=0.13.1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
from vllm.model_executor.models.qwen3 import Qwen3Model
from vllm.model_executor.models.utils import AutoWeightsLoader, PPMissingLayer, WeightsMapper, maybe_prefix
from vllm.multimodal.audio import AudioResampler
from vllm.sequence import IntermediateTensors

from vllm_omni.model_executor.models.output_templates import OmniOutput
Expand Down Expand Up @@ -1094,9 +1095,8 @@ def _extract_speaker_embedding(self, wav: np.ndarray, sr: int) -> torch.Tensor:
# Resample to 24kHz for speaker encoder.
target_sr = int(getattr(self.config.speaker_encoder_config, "sample_rate", 24000))
if sr != target_sr:
from vllm.multimodal.audio import resample_audio_resampy

wav = resample_audio_resampy(wav.astype(np.float32), orig_sr=int(sr), target_sr=target_sr)
resampler = AudioResampler(target_sr=target_sr)
wav = resampler.resample(wav.astype(np.float32), orig_sr=int(sr))
sr = target_sr

# Follow official implementation: mel_spectrogram expects 24kHz.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
import torch
from torch.nn.utils.rnn import pad_sequence
from transformers import AutoConfig, AutoFeatureExtractor, AutoModel
from vllm.multimodal.audio import resample_audio_resampy
from vllm.multimodal.audio import AudioResampler
from vllm.multimodal.media.audio import load_audio as _load_audio_file

from .tokenizer_12hz.configuration_qwen3_tts_tokenizer_v2 import Qwen3TTSTokenizerV2Config
Expand Down Expand Up @@ -161,7 +161,8 @@ def load_audio(
audio = np.mean(audio, axis=-1)

if sr != target_sr:
audio = resample_audio_resampy(audio, orig_sr=sr, target_sr=target_sr)
resampler = AudioResampler(target_sr=target_sr)
audio = resampler.resample(audio, orig_sr=sr)

return audio.astype(np.float32)

Expand Down Expand Up @@ -209,7 +210,8 @@ def _normalize_audio_inputs(
if a.ndim > 1:
a = np.mean(a, axis=-1)
if int(sr) != target_sr:
a = resample_audio_resampy(a.astype(np.float32), orig_sr=int(sr), target_sr=target_sr)
resampler = AudioResampler(target_sr=target_sr)
a = resampler.resample(a.astype(np.float32), orig_sr=int(sr))
out.append(a.astype(np.float32))
return out

Expand Down
Loading