diff --git a/examples/online_serving/qwen3_tts/speaker_embedding_interpolation.py b/examples/online_serving/qwen3_tts/speaker_embedding_interpolation.py index 38a2bdea929..7790fa51276 100644 --- a/examples/online_serving/qwen3_tts/speaker_embedding_interpolation.py +++ b/examples/online_serving/qwen3_tts/speaker_embedding_interpolation.py @@ -5,7 +5,7 @@ using SLERP and sends the result to the /v1/audio/speech API. Requirements: - pip install torch resampy soundfile numpy httpx + pip install torch soundfile numpy httpx Examples: # Extract and save an embedding @@ -143,11 +143,12 @@ def _load_speaker_encoder_weights(encoder: torch.nn.Module, model_path: str) -> def compute_mel_spectrogram(audio: np.ndarray, sr: int = 24000) -> torch.Tensor: """Compute 128-bin mel spectrogram matching Qwen3-TTS's extraction pipeline.""" - from vllm.multimodal.audio import resample_audio_resampy + from vllm.multimodal.audio import AudioResampler # Resample to 24kHz if needed if sr != 24000: - audio = resample_audio_resampy(audio.astype(np.float32), orig_sr=sr, target_sr=24000) + resampler = AudioResampler(target_sr=24000) + audio = resampler.resample(audio.astype(np.float32), orig_sr=sr) y = torch.from_numpy(audio).unsqueeze(0).float() diff --git a/requirements/common.txt b/requirements/common.txt index 1f44d343c62..63e16d580ff 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -1,7 +1,6 @@ # Common dependencies for all platforms av>=14.0.0 omegaconf>=2.3.0 -resampy>=0.4.3 diffusers>=0.36.0 accelerate==1.12.0 soundfile>=0.13.1 diff --git a/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_talker.py b/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_talker.py index 6b7b688f15a..d9cbcf7d4ef 100644 --- a/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_talker.py +++ b/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_talker.py @@ -23,6 +23,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.models.qwen3 import Qwen3Model from vllm.model_executor.models.utils import AutoWeightsLoader, PPMissingLayer, WeightsMapper, maybe_prefix +from vllm.multimodal.audio import AudioResampler from vllm.sequence import IntermediateTensors from vllm_omni.model_executor.models.output_templates import OmniOutput @@ -1094,9 +1095,8 @@ def _extract_speaker_embedding(self, wav: np.ndarray, sr: int) -> torch.Tensor: # Resample to 24kHz for speaker encoder. target_sr = int(getattr(self.config.speaker_encoder_config, "sample_rate", 24000)) if sr != target_sr: - from vllm.multimodal.audio import resample_audio_resampy - - wav = resample_audio_resampy(wav.astype(np.float32), orig_sr=int(sr), target_sr=target_sr) + resampler = AudioResampler(target_sr=target_sr) + wav = resampler.resample(wav.astype(np.float32), orig_sr=int(sr)) sr = target_sr # Follow official implementation: mel_spectrogram expects 24kHz. diff --git a/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_tokenizer.py b/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_tokenizer.py index 3db5cfd1b82..14bfbc5eedf 100644 --- a/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_tokenizer.py +++ b/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_tokenizer.py @@ -22,7 +22,7 @@ import torch from torch.nn.utils.rnn import pad_sequence from transformers import AutoConfig, AutoFeatureExtractor, AutoModel -from vllm.multimodal.audio import resample_audio_resampy +from vllm.multimodal.audio import AudioResampler from vllm.multimodal.media.audio import load_audio as _load_audio_file from .tokenizer_12hz.configuration_qwen3_tts_tokenizer_v2 import Qwen3TTSTokenizerV2Config @@ -161,7 +161,8 @@ def load_audio( audio = np.mean(audio, axis=-1) if sr != target_sr: - audio = resample_audio_resampy(audio, orig_sr=sr, target_sr=target_sr) + resampler = AudioResampler(target_sr=target_sr) + audio = resampler.resample(audio, orig_sr=sr) return audio.astype(np.float32) @@ -209,7 +210,8 @@ def _normalize_audio_inputs( if a.ndim > 1: a = np.mean(a, axis=-1) if int(sr) != target_sr: - a = resample_audio_resampy(a.astype(np.float32), orig_sr=int(sr), target_sr=target_sr) + resampler = AudioResampler(target_sr=target_sr) + a = resampler.resample(a.astype(np.float32), orig_sr=int(sr)) out.append(a.astype(np.float32)) return out