diff --git a/docker/Dockerfile.ci b/docker/Dockerfile.ci index 24ce39bafd7..2a98de1b812 100644 --- a/docker/Dockerfile.ci +++ b/docker/Dockerfile.ci @@ -7,7 +7,7 @@ COPY . . # Install system dependencies RUN apt-get update && \ - apt-get install -y espeak-ng ffmpeg git sox libsox-fmt-all jq && \ + apt-get install -y espeak-ng git sox libsox-fmt-all jq && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* diff --git a/docker/Dockerfile.cuda b/docker/Dockerfile.cuda index 754d491d861..6ed5b7d2773 100644 --- a/docker/Dockerfile.cuda +++ b/docker/Dockerfile.cuda @@ -7,7 +7,7 @@ WORKDIR ${COMMON_WORKDIR} # Step 1: Setup - Install system dependencies RUN apt-get update && \ - apt-get install -y ffmpeg git sox libsox-fmt-all jq && \ + apt-get install -y git sox libsox-fmt-all jq && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index bfbb060bcb5..8b22bee38b4 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -19,7 +19,7 @@ WORKDIR ${COMMON_WORKDIR} # Step 1: Setup - Install system dependencies RUN apt-get update && \ - apt-get install -y espeak-ng ffmpeg git sox libsox-fmt-all jq && \ + apt-get install -y espeak-ng git sox libsox-fmt-all jq && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* diff --git a/docker/Dockerfile.xpu b/docker/Dockerfile.xpu index 17f1aebf0d0..25d5d0c800e 100644 --- a/docker/Dockerfile.xpu +++ b/docker/Dockerfile.xpu @@ -15,9 +15,7 @@ RUN apt clean && apt-get update -y && \ apt-get install -y --no-install-recommends --fix-missing \ curl \ espeak-ng \ - ffmpeg \ git \ - libsndfile1 \ libsm6 \ libxext6 \ libgl1 \ diff --git a/docs/usage/faq.md b/docs/usage/faq.md index c080eae4023..0539e158b01 100644 --- a/docs/usage/faq.md +++ b/docs/usage/faq.md @@ -4,14 +4,6 @@ A: Now, we support natively disaggregated deployment for different model stages within a model. There is a restriction that one chip can only have one AutoRegressive model stage. 
This is because the unified KV cache management of vLLM. Stages of other types can coexist within a chip. The restriction will be resolved in later version. -> Q: When trying to run examples, I encounter error about backend of librosa or soundfile. How to solve it? - -A: If you encounter error about backend of librosa, try to install ffmpeg with command below. -``` -sudo apt update -sudo apt install ffmpeg -``` - > Q: I see GPU OOM or "free memory is less than desired GPU memory utilization" errors. How can I fix it? A: Refer to [GPU memory calculation and configuration](../configuration/gpu_memory_utilization.md) for guidance on tuning `gpu_memory_utilization` and related settings. diff --git a/docs/user_guide/examples/offline_inference/bagel.md b/docs/user_guide/examples/offline_inference/bagel.md index 5f458750b44..e6266868722 100644 --- a/docs/user_guide/examples/offline_inference/bagel.md +++ b/docs/user_guide/examples/offline_inference/bagel.md @@ -250,13 +250,6 @@ For more details on the Mooncake connector and multi-node setup, see the [Moonca ## FAQ -- If you encounter an error about the backend of librosa, try to install ffmpeg with the command below. - -```bash -sudo apt update -sudo apt install ffmpeg -``` - - If you don’t know how much VRAM is needed for the model or encounter the OOM error, you can try to decrease the max_model_len. | Stage | VRAM | diff --git a/docs/user_guide/examples/offline_inference/cosyvoice3.md b/docs/user_guide/examples/offline_inference/cosyvoice3.md index d912f1c62eb..ebb7c02efc2 100644 --- a/docs/user_guide/examples/offline_inference/cosyvoice3.md +++ b/docs/user_guide/examples/offline_inference/cosyvoice3.md @@ -10,7 +10,7 @@ Install dependencies: uv pip install -e . 
``` -> **Note:** This includes required libraries such as `librosa`, `soundfile`, +> **Note:** This includes required libraries such as `soundfile`, > `onnxruntime`, `x-transformers`, and `einops` via > `requirements/common.txt` and platform-specific requirements files. diff --git a/docs/user_guide/examples/offline_inference/mimo_audio.md b/docs/user_guide/examples/offline_inference/mimo_audio.md index 1a3be15d69a..4e80526971e 100644 --- a/docs/user_guide/examples/offline_inference/mimo_audio.md +++ b/docs/user_guide/examples/offline_inference/mimo_audio.md @@ -189,29 +189,6 @@ Note: This task uses hardcoded message lists in the script. ## Troubleshooting -### Audio dependencies (soundfile, librosa) - -This example depends on **soundfile** (read/write WAV) and **librosa** (load audio including MP3). Install the project requirements first: - -```bash -pip install -r requirements/common.txt -# or at least: pip install soundfile>=0.13.1 librosa>=0.11.0 -``` - -- **`soundfile` / libsndfile not found** - `soundfile` uses the C library **libsndfile**. On Linux, install the system package before pip: - - Debian/Ubuntu: `sudo apt-get install libsndfile1` - - For development builds: `sudo apt-get install libsndfile1-dev` - - Then: `pip install soundfile` - -- **`librosa` fails to load MP3 or reports "No backend available"** - Loading MP3 (e.g. in `spoken_dialogue_sft_multiturn` with `.mp3` files) uses **ffmpeg** as the backend. Install ffmpeg: - - Debian/Ubuntu: `sudo apt-get install ffmpeg` - - macOS: `brew install ffmpeg` - -- **`ImportError: No module named 'soundfile'` or `ModuleNotFoundError: ... librosa`** - Ensure you are in the same Python environment where vLLM Omni and the example dependencies are installed, and that `requirements/common.txt` (or the packages above) are installed. 
- ### Tokenizer path - **`MIMO_AUDIO_TOKENIZER_PATH` not set or model fails to find tokenizer** diff --git a/docs/user_guide/examples/offline_inference/qwen2_5_omni.md b/docs/user_guide/examples/offline_inference/qwen2_5_omni.md index 07a56cf9a06..c54976b540d 100644 --- a/docs/user_guide/examples/offline_inference/qwen2_5_omni.md +++ b/docs/user_guide/examples/offline_inference/qwen2_5_omni.md @@ -64,14 +64,6 @@ If media file paths are not provided, the script will use default assets. Suppor - `use_audio_in_video`: Extract audio from video - `text`: Text-only query -### FAQ - -If you encounter error about backend of librosa, try to install ffmpeg with command below. -``` -sudo apt update -sudo apt install ffmpeg -``` - ## Example materials ??? abstract "end2end.py" diff --git a/docs/user_guide/examples/offline_inference/qwen3_omni.md b/docs/user_guide/examples/offline_inference/qwen3_omni.md index 6577092bbfe..2d856f7380a 100644 --- a/docs/user_guide/examples/offline_inference/qwen3_omni.md +++ b/docs/user_guide/examples/offline_inference/qwen3_omni.md @@ -112,14 +112,6 @@ python end2end_async_chunk.py \ > async_chunk example when you need the stage-level concurrency semantics > described in PR #962 / #1151. -### FAQ - -If you encounter error about backend of librosa, try to install ffmpeg with command below. -``` -sudo apt update -sudo apt install ffmpeg -``` - ## Example materials ??? abstract "end2end.py" diff --git a/docs/user_guide/examples/online_serving/bagel.md b/docs/user_guide/examples/online_serving/bagel.md index 4a6094c0894..9de31926aa1 100644 --- a/docs/user_guide/examples/online_serving/bagel.md +++ b/docs/user_guide/examples/online_serving/bagel.md @@ -357,13 +357,6 @@ curl http://localhost:8091/v1/chat/completions \ ## FAQ -- If you encounter an error about the backend of librosa, try to install ffmpeg with the command below. 
- -```bash -sudo apt update -sudo apt install ffmpeg -``` - - If you don’t know how much VRAM is needed for the model or encounter the OOM error, you can try to decrease the max_model_len. | Stage | VRAM | diff --git a/docs/user_guide/examples/online_serving/qwen2_5_omni.md b/docs/user_guide/examples/online_serving/qwen2_5_omni.md index 43576469242..b3a2c9f2ac9 100644 --- a/docs/user_guide/examples/online_serving/qwen2_5_omni.md +++ b/docs/user_guide/examples/online_serving/qwen2_5_omni.md @@ -218,14 +218,6 @@ The gradio script supports the following arguments: - `--port`: Port for Gradio server (default: 7861) - `--share`: Share the Gradio demo publicly (creates a public link) -### FAQ - -If you encounter error about backend of librosa, try to install ffmpeg with command below. -``` -sudo apt update -sudo apt install ffmpeg -``` - ## Example materials ??? abstract "gradio_demo.py" diff --git a/docs/user_guide/examples/online_serving/qwen3_omni.md b/docs/user_guide/examples/online_serving/qwen3_omni.md index 69de24852f6..6f6d9ae4a9d 100644 --- a/docs/user_guide/examples/online_serving/qwen3_omni.md +++ b/docs/user_guide/examples/online_serving/qwen3_omni.md @@ -64,15 +64,6 @@ python openai_chat_completion_client_for_multimodal_generation.py \ bash run_curl_multimodal_generation.sh use_image ``` - -### FAQ - -If you encounter error about backend of librosa, try to install ffmpeg with command below. -``` -sudo apt update -sudo apt install ffmpeg -``` - ## Modality control You can control output modalities to specify which types of output the model should generate. This is useful when you only need text output and want to skip audio generation stages for better performance. 
diff --git a/docs/user_guide/examples/online_serving/qwen3_tts.md b/docs/user_guide/examples/online_serving/qwen3_tts.md index 156c4942cd9..4e632d4c288 100644 --- a/docs/user_guide/examples/online_serving/qwen3_tts.md +++ b/docs/user_guide/examples/online_serving/qwen3_tts.md @@ -211,14 +211,6 @@ with open("output.wav", "wb") as f: f.write(response.content) ``` -### FAQ - -If you encounter error about backend of librosa, try to install ffmpeg with command below. -``` -sudo apt update -sudo apt install ffmpeg -``` - ## API Reference ### Voices Endpoint diff --git a/examples/offline_inference/bagel/README.md b/examples/offline_inference/bagel/README.md index 226c009f792..48517b1cda0 100644 --- a/examples/offline_inference/bagel/README.md +++ b/examples/offline_inference/bagel/README.md @@ -247,13 +247,6 @@ For more details on the Mooncake connector and multi-node setup, see the [Moonca ## FAQ -- If you encounter an error about the backend of librosa, try to install ffmpeg with the command below. - -```bash -sudo apt update -sudo apt install ffmpeg -``` - - If you don’t know how much VRAM is needed for the model or encounter the OOM error, you can try to decrease the max_model_len. | Stage | VRAM | diff --git a/examples/offline_inference/cosyvoice3/README.md b/examples/offline_inference/cosyvoice3/README.md index 895d3f660f0..e16134e6ef2 100644 --- a/examples/offline_inference/cosyvoice3/README.md +++ b/examples/offline_inference/cosyvoice3/README.md @@ -7,7 +7,7 @@ Install dependencies: uv pip install -e . ``` -> **Note:** This includes required libraries such as `librosa`, `soundfile`, +> **Note:** This includes required libraries such as `soundfile`, > `onnxruntime`, `x-transformers`, and `einops` via > `requirements/common.txt` and platform-specific requirements files. 
diff --git a/examples/offline_inference/cosyvoice3/verify_e2e_cosyvoice.py b/examples/offline_inference/cosyvoice3/verify_e2e_cosyvoice.py index 68ab72b3870..6311bbc901a 100644 --- a/examples/offline_inference/cosyvoice3/verify_e2e_cosyvoice.py +++ b/examples/offline_inference/cosyvoice3/verify_e2e_cosyvoice.py @@ -2,13 +2,12 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import os -from pathlib import Path -import librosa import numpy as np import soundfile as sf from vllm import SamplingParams from vllm.assets.audio import AudioAsset +from vllm.multimodal.media.audio import load_audio from vllm_omni.entrypoints.omni import Omni from vllm_omni.model_executor.models.cosyvoice3.config import CosyVoice3Config @@ -16,22 +15,6 @@ from vllm_omni.model_executor.models.cosyvoice3.utils import extract_text_token -def _ensure_mel_filters_asset() -> None: - repo_root = Path(__file__).resolve().parents[3] - filters_path = repo_root / "vllm_omni" / "model_executor" / "models" / "cosyvoice3" / "assets" / "mel_filters.npz" - if filters_path.exists(): - return - - source_url = "https://raw.githubusercontent.com/openai/whisper/main/whisper/assets/mel_filters.npz" - raise FileNotFoundError( - "Missing CosyVoice3 mel filter asset:\n" - f" {filters_path}\n" - "Download it with:\n" - f" mkdir -p {filters_path.parent} && " - f"curl -L {source_url} -o {filters_path}" - ) - - def run_e2e(): parser = argparse.ArgumentParser() # ""FunAudioLLM/Fun-CosyVoice3-0.5B-2512 @@ -56,7 +39,6 @@ def run_e2e(): help="Path to tokenizer directory (e.g., /CosyVoice-BlankEN).", ) args = parser.parse_args() - _ensure_mel_filters_asset() # Ensure tokenizer directory exists if not os.path.exists(args.tokenizer): raise FileNotFoundError(f"{args.tokenizer} does not exist!") @@ -85,7 +67,7 @@ def run_e2e(): if not os.path.exists(args.audio_path): raise FileNotFoundError(f"Audio file not found: {args.audio_path}") # Load at native sample rate - audio_signal, sr = 
librosa.load(args.audio_path, sr=None) + audio_signal, sr = load_audio(args.audio_path, sr=None) # Validate sample rate before processing (similar to original CosyVoice) min_sr = 16000 diff --git a/examples/offline_inference/mimo_audio/README.md b/examples/offline_inference/mimo_audio/README.md index 747e734cc24..596afabeef9 100644 --- a/examples/offline_inference/mimo_audio/README.md +++ b/examples/offline_inference/mimo_audio/README.md @@ -190,29 +190,6 @@ Note: This task uses hardcoded message lists in the script. ## Troubleshooting -### Audio dependencies (soundfile, librosa) - -This example depends on **soundfile** (read/write WAV) and **librosa** (load audio including MP3). Install the project requirements first: - -```bash -pip install -r requirements/common.txt -# or at least: pip install soundfile>=0.13.1 librosa>=0.11.0 -``` - -- **`soundfile` / libsndfile not found** - `soundfile` uses the C library **libsndfile**. On Linux, install the system package before pip: - - Debian/Ubuntu: `sudo apt-get install libsndfile1` - - For development builds: `sudo apt-get install libsndfile1-dev` - - Then: `pip install soundfile` - -- **`librosa` fails to load MP3 or reports "No backend available"** - Loading MP3 (e.g. in `spoken_dialogue_sft_multiturn` with `.mp3` files) uses **ffmpeg** as the backend. Install ffmpeg: - - Debian/Ubuntu: `sudo apt-get install ffmpeg` - - macOS: `brew install ffmpeg` - -- **`ImportError: No module named 'soundfile'` or `ModuleNotFoundError: ... librosa`** - Ensure you are in the same Python environment where vLLM Omni and the example dependencies are installed, and that `requirements/common.txt` (or the packages above) are installed. 
- ### Tokenizer path - **`MIMO_AUDIO_TOKENIZER_PATH` not set or model fails to find tokenizer** diff --git a/examples/offline_inference/mimo_audio/message_convert.py b/examples/offline_inference/mimo_audio/message_convert.py index ebcc59c6b43..416f21ccfaf 100644 --- a/examples/offline_inference/mimo_audio/message_convert.py +++ b/examples/offline_inference/mimo_audio/message_convert.py @@ -5,12 +5,12 @@ import re from collections.abc import Callable -import librosa import numpy as np import torch import torchaudio from process_speechdata import InputSegment, StreamingInputSegment from torchaudio.transforms import MelSpectrogram +from vllm.multimodal.media.audio import load_audio speech_zeroemb_idx = 151667 empty_token = "<|empty|>" @@ -685,7 +685,7 @@ def get_audio_data(audio_url): # File path audio_file = audio_url - audio_signal, sr = librosa.load(audio_file, sr=24000) + audio_signal, sr = load_audio(audio_file, sr=24000) audio_data = (audio_signal.astype(np.float32), sr) return audio_data diff --git a/examples/offline_inference/omnivoice/end2end.py b/examples/offline_inference/omnivoice/end2end.py index b41379b011a..9371c95142b 100644 --- a/examples/offline_inference/omnivoice/end2end.py +++ b/examples/offline_inference/omnivoice/end2end.py @@ -103,9 +103,9 @@ def run_e2e(): if not os.path.exists(args.ref_audio): raise FileNotFoundError(f"Reference audio not found: {args.ref_audio}") - import librosa + from vllm.multimodal.media.audio import load_audio - audio_signal, sr = librosa.load(args.ref_audio, sr=None) + audio_signal, sr = load_audio(args.ref_audio, sr=None) multi_modal_data["audio"] = (audio_signal.astype(np.float32), sr) mm_processor_kwargs["ref_text"] = args.ref_text or "" mm_processor_kwargs["sample_rate"] = sr diff --git a/examples/offline_inference/qwen2_5_omni/README.md b/examples/offline_inference/qwen2_5_omni/README.md index 20740a0da02..e2eae8a96b5 100644 --- a/examples/offline_inference/qwen2_5_omni/README.md +++ 
b/examples/offline_inference/qwen2_5_omni/README.md @@ -60,11 +60,3 @@ If media file paths are not provided, the script will use default assets. Suppor - `mixed_modalities`: Audio + image + video - `use_audio_in_video`: Extract audio from video - `text`: Text-only query - -### FAQ - -If you encounter error about backend of librosa, try to install ffmpeg with command below. -``` -sudo apt update -sudo apt install ffmpeg -``` diff --git a/examples/offline_inference/qwen2_5_omni/end2end.py b/examples/offline_inference/qwen2_5_omni/end2end.py index 7bba5998308..d8f1898ec91 100644 --- a/examples/offline_inference/qwen2_5_omni/end2end.py +++ b/examples/offline_inference/qwen2_5_omni/end2end.py @@ -9,7 +9,6 @@ import time from typing import NamedTuple -import librosa import numpy as np import soundfile as sf from PIL import Image @@ -17,6 +16,7 @@ from vllm.assets.image import ImageAsset from vllm.assets.video import VideoAsset, video_to_ndarrays from vllm.multimodal.image import convert_image_mode +from vllm.multimodal.media.audio import load_audio from vllm.sampling_params import SamplingParams from vllm.utils.argparse_utils import FlexibleArgumentParser @@ -96,7 +96,7 @@ def get_mixed_modalities_query( if audio_path: if not os.path.exists(audio_path): raise FileNotFoundError(f"Audio file not found: {audio_path}") - audio_signal, sr = librosa.load(audio_path, sr=sampling_rate) + audio_signal, sr = load_audio(audio_path, sr=sampling_rate) audio_data = (audio_signal.astype(np.float32), sr) else: audio_data = AudioAsset("mary_had_lamb").audio_and_sample_rate @@ -130,7 +130,7 @@ def get_use_audio_in_video_query( raise FileNotFoundError(f"Video file not found: {video_path}") video_frames = video_to_ndarrays(video_path, num_frames=num_frames) # Extract audio from video file - audio_signal, sr = librosa.load(video_path, sr=sampling_rate) + audio_signal, sr = load_audio(video_path, sr=sampling_rate) audio = (audio_signal.astype(np.float32), sr) else: asset = 
VideoAsset(name="baby_reading", num_frames=num_frames) @@ -165,7 +165,7 @@ def get_multi_audios_query(audio_path: str | None = None, sampling_rate: int = 1 if audio_path: if not os.path.exists(audio_path): raise FileNotFoundError(f"Audio file not found: {audio_path}") - audio_signal, sr = librosa.load(audio_path, sr=sampling_rate) + audio_signal, sr = load_audio(audio_path, sr=sampling_rate) audio_data = (audio_signal.astype(np.float32), sr) # Use the provided audio as the first audio, default as second audio_list = [ @@ -261,7 +261,7 @@ def get_audio_query(question: str = None, audio_path: str | None = None, samplin if audio_path: if not os.path.exists(audio_path): raise FileNotFoundError(f"Audio file not found: {audio_path}") - audio_signal, sr = librosa.load(audio_path, sr=sampling_rate) + audio_signal, sr = load_audio(audio_path, sr=sampling_rate) audio_data = (audio_signal.astype(np.float32), sr) else: audio_data = AudioAsset("mary_had_lamb").audio_and_sample_rate diff --git a/examples/offline_inference/qwen3_omni/README.md b/examples/offline_inference/qwen3_omni/README.md index b3e8592532e..d69ad6abfc9 100644 --- a/examples/offline_inference/qwen3_omni/README.md +++ b/examples/offline_inference/qwen3_omni/README.md @@ -108,11 +108,3 @@ python end2end_async_chunk.py \ > recommended entry point for non-async-chunk workflows. Only use the > async_chunk example when you need the stage-level concurrency semantics > described in PR #962 / #1151. - -### FAQ - -If you encounter error about backend of librosa, try to install ffmpeg with command below. 
-``` -sudo apt update -sudo apt install ffmpeg -``` diff --git a/examples/offline_inference/qwen3_omni/end2end.py b/examples/offline_inference/qwen3_omni/end2end.py index 155eca4ed9f..056f820ff07 100644 --- a/examples/offline_inference/qwen3_omni/end2end.py +++ b/examples/offline_inference/qwen3_omni/end2end.py @@ -9,7 +9,6 @@ import time from typing import NamedTuple -import librosa import numpy as np import soundfile as sf import vllm @@ -19,6 +18,7 @@ from vllm.assets.image import ImageAsset from vllm.assets.video import VideoAsset, video_to_ndarrays from vllm.multimodal.image import convert_image_mode +from vllm.multimodal.media.audio import load_audio from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm_omni.entrypoints.omni import Omni @@ -129,7 +129,7 @@ def get_audio_query(question: str = None, audio_path: str | None = None, samplin if audio_path: if not os.path.exists(audio_path): raise FileNotFoundError(f"Audio file not found: {audio_path}") - audio_signal, sr = librosa.load(audio_path, sr=sampling_rate) + audio_signal, sr = load_audio(audio_path, sr=sampling_rate) audio_data = (audio_signal.astype(np.float32), sr) else: audio_data = AudioAsset("mary_had_lamb").audio_and_sample_rate @@ -183,7 +183,7 @@ def get_mixed_modalities_query( if audio_path: if not os.path.exists(audio_path): raise FileNotFoundError(f"Audio file not found: {audio_path}") - audio_signal, sr = librosa.load(audio_path, sr=sampling_rate) + audio_signal, sr = load_audio(audio_path, sr=sampling_rate) audio_data = (audio_signal.astype(np.float32), sr) else: audio_data = AudioAsset("mary_had_lamb").audio_and_sample_rate diff --git a/examples/offline_inference/qwen3_omni/end2end_async_chunk.py b/examples/offline_inference/qwen3_omni/end2end_async_chunk.py index 8adbae9eb66..07442631302 100644 --- a/examples/offline_inference/qwen3_omni/end2end_async_chunk.py +++ b/examples/offline_inference/qwen3_omni/end2end_async_chunk.py @@ -32,13 +32,13 @@ 
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" -import librosa from PIL import Image from vllm import SamplingParams from vllm.assets.audio import AudioAsset from vllm.assets.image import ImageAsset from vllm.assets.video import VideoAsset, video_to_ndarrays from vllm.multimodal.image import convert_image_mode +from vllm.multimodal.media.audio import load_audio from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm_omni.entrypoints.async_omni import AsyncOmni @@ -89,7 +89,7 @@ def get_audio_query( if audio_path: if not os.path.exists(audio_path): raise FileNotFoundError(f"Audio file not found: {audio_path}") - audio_signal, sr = librosa.load(audio_path, sr=sampling_rate) + audio_signal, sr = load_audio(audio_path, sr=sampling_rate) audio_data = (audio_signal.astype(np.float32), sr) else: audio_data = AudioAsset("mary_had_lamb").audio_and_sample_rate diff --git a/examples/offline_inference/x_to_video_audio/x_to_video_audio.py b/examples/offline_inference/x_to_video_audio/x_to_video_audio.py index e0424add69b..fb77b214835 100644 --- a/examples/offline_inference/x_to_video_audio/x_to_video_audio.py +++ b/examples/offline_inference/x_to_video_audio/x_to_video_audio.py @@ -5,8 +5,8 @@ import re import time -import librosa from PIL import Image +from vllm.multimodal.media.audio import load_audio from vllm_omni.diffusion.data import DiffusionParallelConfig from vllm_omni.entrypoints.omni import Omni @@ -69,7 +69,7 @@ def load_image_and_audio(image_paths, audio_paths): image.append(img) for path in audio_paths: - audio_array, sr = librosa.load(path, sr=16000) + audio_array, sr = load_audio(path, sr=16000) audio_array = audio_array[int(sr * 1) : int(sr * 3)] audio.append(audio_array) return image, audio diff --git a/examples/online_serving/bagel/README.md b/examples/online_serving/bagel/README.md index 9b74acae10e..0939bc5f387 100644 --- a/examples/online_serving/bagel/README.md +++ b/examples/online_serving/bagel/README.md @@ -354,13 +354,6 @@ curl 
http://localhost:8091/v1/chat/completions \ ## FAQ -- If you encounter an error about the backend of librosa, try to install ffmpeg with the command below. - -```bash -sudo apt update -sudo apt install ffmpeg -``` - - If you don’t know how much VRAM is needed for the model or encounter the OOM error, you can try to decrease the max_model_len. | Stage | VRAM | diff --git a/examples/online_serving/qwen2_5_omni/README.md b/examples/online_serving/qwen2_5_omni/README.md index 91aab3b6518..c528732064a 100644 --- a/examples/online_serving/qwen2_5_omni/README.md +++ b/examples/online_serving/qwen2_5_omni/README.md @@ -208,11 +208,3 @@ The gradio script supports the following arguments: - `--ip`: Host/IP for Gradio server (default: 127.0.0.1) - `--port`: Port for Gradio server (default: 7861) - `--share`: Share the Gradio demo publicly (creates a public link) - -### FAQ - -If you encounter error about backend of librosa, try to install ffmpeg with command below. -``` -sudo apt update -sudo apt install ffmpeg -``` diff --git a/examples/online_serving/qwen3_omni/README.md b/examples/online_serving/qwen3_omni/README.md index c3171e43667..ff026422479 100644 --- a/examples/online_serving/qwen3_omni/README.md +++ b/examples/online_serving/qwen3_omni/README.md @@ -43,11 +43,9 @@ python examples/online_serving/openai_chat_completion_client_for_multimodal_gene **Dependencies:** ```bash -pip install websockets librosa numpy +pip install websockets soundfile numpy ``` -(ffmpeg may be required by `librosa` for some formats; see the FAQ below.) - **From this directory** (`examples/online_serving/qwen3_omni`): ```bash @@ -105,12 +103,6 @@ bash run_curl_multimodal_generation.sh use_image ### FAQ -If you encounter error about backend of librosa, try to install ffmpeg with command below. -``` -sudo apt update -sudo apt install ffmpeg -``` - ## Modality control You can control output modalities to specify which types of output the model should generate. 
This is useful when you only need text output and want to skip audio generation stages for better performance. diff --git a/examples/online_serving/qwen3_omni/openai_realtime_client.py b/examples/online_serving/qwen3_omni/openai_realtime_client.py index 4fa043c481d..660e4ac336a 100644 --- a/examples/online_serving/qwen3_omni/openai_realtime_client.py +++ b/examples/online_serving/qwen3_omni/openai_realtime_client.py @@ -10,7 +10,7 @@ Requirements: - vllm with audio support - websockets -- librosa +- soundfile - numpy The script: @@ -25,10 +25,10 @@ import base64 import json -import librosa import numpy as np import websockets from vllm.assets.audio import AudioAsset +from vllm.multimodal.media.audio import load_audio def audio_to_pcm16_base64(audio_path: str) -> str: @@ -36,7 +36,7 @@ def audio_to_pcm16_base64(audio_path: str) -> str: Load an audio file and convert it to base64-encoded PCM16 @ 16kHz. """ # Load audio and resample to 16kHz mono - audio, _ = librosa.load(audio_path, sr=16000, mono=True) + audio, _ = load_audio(audio_path, sr=16000, mono=True) # Convert to PCM16 pcm16 = (audio * 32767).astype(np.int16) # Encode as base64 diff --git a/examples/online_serving/qwen3_tts/README.md b/examples/online_serving/qwen3_tts/README.md index 5504b5737a8..e53fa7392bc 100644 --- a/examples/online_serving/qwen3_tts/README.md +++ b/examples/online_serving/qwen3_tts/README.md @@ -192,14 +192,6 @@ with open("output.wav", "wb") as f: f.write(response.content) ``` -### FAQ - -If you encounter error about backend of librosa, try to install ffmpeg with command below. 
-``` -sudo apt update -sudo apt install ffmpeg -``` - ## API Reference ### Voices Endpoint diff --git a/examples/online_serving/qwen3_tts/speaker_embedding_interpolation.py b/examples/online_serving/qwen3_tts/speaker_embedding_interpolation.py index e6786f8869f..38a2bdea929 100644 --- a/examples/online_serving/qwen3_tts/speaker_embedding_interpolation.py +++ b/examples/online_serving/qwen3_tts/speaker_embedding_interpolation.py @@ -5,7 +5,7 @@ using SLERP and sends the result to the /v1/audio/speech API. Requirements: - pip install torch librosa soundfile numpy httpx + pip install torch resampy soundfile numpy httpx Examples: # Extract and save an embedding @@ -143,17 +143,17 @@ def _load_speaker_encoder_weights(encoder: torch.nn.Module, model_path: str) -> def compute_mel_spectrogram(audio: np.ndarray, sr: int = 24000) -> torch.Tensor: """Compute 128-bin mel spectrogram matching Qwen3-TTS's extraction pipeline.""" - import librosa + from vllm.multimodal.audio import resample_audio_resampy # Resample to 24kHz if needed if sr != 24000: - audio = librosa.resample(audio.astype(np.float32), orig_sr=sr, target_sr=24000) + audio = resample_audio_resampy(audio.astype(np.float32), orig_sr=sr, target_sr=24000) y = torch.from_numpy(audio).unsqueeze(0).float() - from librosa.filters import mel as librosa_mel_fn + from vllm_omni.utils.audio import mel_filter_bank - mel_basis = torch.from_numpy(librosa_mel_fn(sr=24000, n_fft=1024, n_mels=128, fmin=0, fmax=12000)).float() + mel_basis = mel_filter_bank(sr=24000, n_fft=1024, n_mels=128, fmin=0, fmax=12000) n_fft = 1024 hop_size = 256 @@ -180,9 +180,9 @@ def compute_mel_spectrogram(audio: np.ndarray, sr: int = 24000) -> torch.Tensor: @torch.inference_mode() def extract_embedding(encoder: torch.nn.Module, audio_path: str, device: str = "cpu") -> np.ndarray: """Extract a 1024-dim speaker embedding from an audio file.""" - import librosa + from vllm.multimodal.media.audio import load_audio - audio, sr = librosa.load(audio_path, 
sr=None, mono=True) + audio, sr = load_audio(audio_path, sr=None, mono=True) mel = compute_mel_spectrogram(audio, sr).to(device) embedding = encoder(mel.to(next(encoder.parameters()).dtype))[0] return embedding.float().cpu().numpy() diff --git a/requirements/common.txt b/requirements/common.txt index 89eaac32bcc..1fff584448d 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -1,7 +1,6 @@ # Common dependencies for all platforms av>=14.0.0 omegaconf>=2.3.0 -librosa>=0.11.0 resampy>=0.4.3 diffusers>=0.36.0 accelerate==1.12.0 diff --git a/tests/entrypoints/openai_api/test_serving_speech.py b/tests/entrypoints/openai_api/test_serving_speech.py index 57aeef8f9de..554164a59c2 100644 --- a/tests/entrypoints/openai_api/test_serving_speech.py +++ b/tests/entrypoints/openai_api/test_serving_speech.py @@ -63,14 +63,11 @@ def test_stereo_to_mono_conversion(self, audio_mixin, mocker: MockerFixture): adjusted_tensor = mock_speed.call_args[0][0] assert len(adjusted_tensor) == 24000 - def test_speed_adjustment(self, audio_mixin, mocker: MockerFixture): - mock_time_stretch = mocker.patch("librosa.effects.time_stretch") - mock_time_stretch.return_value = np.zeros(12000) + def test_speed_adjustment(self, audio_mixin): audio_tensor = np.random.rand(24000).astype(np.float32) adjusted_audio, _ = audio_mixin._apply_speed_adjustment(audio_tensor, speed=2.0, sample_rate=24000) - mock_time_stretch.assert_called_with(y=audio_tensor, rate=2.0) assert adjusted_audio.shape == (12000,) def test_unsupported_format_fallback(self, audio_mixin, caplog, mocker: MockerFixture): @@ -117,30 +114,22 @@ def test_stereo_audio_preservation(self, audio_mixin, mocker: MockerFixture): assert np.array_equal(output_tensor, stereo_tensor) def test_speed_adjustment_bypass(self, audio_mixin, mocker: MockerFixture): - """Test that speed=1.0 bypasses the expensive librosa time stretching.""" + """Test that speed=1.0 bypasses the expensive torchaudio time stretching.""" audio_tensor = 
np.random.rand(24000).astype(np.float32) - mock_time_stretch = mocker.patch("librosa.effects.time_stretch") - # speed=1.0 should return immediately without calling librosa + mock_time_stretch = mocker.patch("torchaudio.transforms.TimeStretch") + # speed=1.0 should return immediately without calling torchaudio result, _ = audio_mixin._apply_speed_adjustment(audio_tensor, speed=1.0, sample_rate=24000) mock_time_stretch.assert_not_called() assert np.array_equal(result, audio_tensor) - def test_speed_adjustment_stereo_handling(self, audio_mixin, mocker: MockerFixture): - """Test that speed adjustment is attempted on stereo inputs.""" - mock_time_stretch = mocker.patch("librosa.effects.time_stretch") + def test_speed_adjustment_stereo_handling(self, audio_mixin): + """Test that speed adjustment handles stereo (channels-last) input.""" stereo_tensor = np.random.rand(24000, 2).astype(np.float32) - # Mock return value representing a sped-up version (half length) - mock_time_stretch.return_value = np.zeros((12000, 2), dtype=np.float32) result, _ = audio_mixin._apply_speed_adjustment(stereo_tensor, speed=2.0, sample_rate=24000) - mock_time_stretch.assert_called_once() - # Ensure the stereo tensor was passed to librosa - call_args = mock_time_stretch.call_args - assert np.array_equal(call_args.kwargs["y"], stereo_tensor) - assert call_args.kwargs["rate"] == 2.0 assert result.shape == (12000, 2) diff --git a/tests/utils/test_audio.py b/tests/utils/test_audio.py new file mode 100644 index 00000000000..cfbd2501b25 --- /dev/null +++ b/tests/utils/test_audio.py @@ -0,0 +1,61 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +"""Unit tests for vllm_omni.utils.audio.mel_filter_bank.""" + +import pytest +import torch + +from vllm_omni.utils.audio import mel_filter_bank + +# Parameter combinations used across the codebase. 
+_PARAM_SETS = [ + # Qwen3-TTS talker / speaker encoder (sr=24000) + dict(sr=24000, n_fft=1024, n_mels=128, fmin=0, fmax=12000), + # CosyVoice3 whisper encoder, Qwen3-TTS 25Hz tokenizer (sr=16000, 80 mels) + dict(sr=16000, n_fft=400, n_mels=80), + # CosyVoice3 whisper encoder (sr=16000, 128 mels) + dict(sr=16000, n_fft=400, n_mels=128), +] + +_parametrize_params = pytest.mark.parametrize( + "params", _PARAM_SETS, ids=lambda p: f"{p['sr']}_{p['n_fft']}_{p['n_mels']}" +) + + +class TestMelFilterBank: + @_parametrize_params + def test_output_shape(self, params): + fb = mel_filter_bank(**params) + n_freqs = params["n_fft"] // 2 + 1 + assert fb.shape == (params["n_mels"], n_freqs) + + @_parametrize_params + def test_non_negative(self, params): + fb = mel_filter_bank(**params) + assert (fb >= 0).all() + + def test_dtype_is_float(self): + fb = mel_filter_bank(sr=16000, n_fft=400, n_mels=80) + assert fb.dtype == torch.float32 + + def test_fmax_defaults_to_nyquist(self): + """When fmax is omitted it should equal sr / 2.""" + fb_default = mel_filter_bank(sr=16000, n_fft=400, n_mels=80) + fb_explicit = mel_filter_bank(sr=16000, n_fft=400, n_mels=80, fmax=8000.0) + torch.testing.assert_close(fb_default, fb_explicit) + + def test_each_mel_band_has_nonzero_energy(self): + """Every mel band should have at least one nonzero frequency bin.""" + fb = mel_filter_bank(sr=24000, n_fft=1024, n_mels=128, fmin=0, fmax=12000) + for i in range(fb.shape[0]): + assert fb[i].sum() > 0, f"mel band {i} is all zeros" + + def test_higher_fmax_extends_coverage(self): + """A higher fmax should produce nonzero weights at higher frequency bins.""" + fb_low = mel_filter_bank(sr=24000, n_fft=1024, n_mels=128, fmin=0, fmax=6000) + fb_high = mel_filter_bank(sr=24000, n_fft=1024, n_mels=128, fmin=0, fmax=12000) + # The highest nonzero column should be larger for fb_high. 
+ last_nonzero_low = (fb_low.sum(dim=0) > 0).nonzero()[-1].item() + last_nonzero_high = (fb_high.sum(dim=0) > 0).nonzero()[-1].item() + assert last_nonzero_high > last_nonzero_low diff --git a/vllm_omni/assets/video.py b/vllm_omni/assets/video.py index 98b1f7e4e29..6a5f3204a91 100644 --- a/vllm_omni/assets/video.py +++ b/vllm_omni/assets/video.py @@ -1,6 +1,6 @@ -import librosa import numpy as np from vllm.assets.video import VideoAsset +from vllm.multimodal.media.audio import load_audio def extract_video_audio(path: str = None, sampling_rate: int = 16000) -> np.ndarray: @@ -12,5 +12,5 @@ def extract_video_audio(path: str = None, sampling_rate: int = 16000) -> np.ndar """ if not path: path = VideoAsset(name="baby_reading").video_path - audio_signal, sr = librosa.load(path, sr=sampling_rate) + audio_signal, sr = load_audio(path, sr=sampling_rate) return audio_signal diff --git a/vllm_omni/entrypoints/chat_utils.py b/vllm_omni/entrypoints/chat_utils.py index 8970e589844..4c3d311ec50 100644 --- a/vllm_omni/entrypoints/chat_utils.py +++ b/vllm_omni/entrypoints/chat_utils.py @@ -2,7 +2,7 @@ async def extract_audio_from_video_async(video_url: str) -> tuple[np.ndarray, int | float]: - """Extract audio from a video URL using librosa. + """Extract audio from a video URL using vllm's load_audio. Returns a (audio_array, sample_rate) tuple compatible with audio format. All blocking I/O operations are run in a thread pool. 
@@ -26,9 +26,9 @@ def _write_temp_file_sync(data: bytes, suffix: str) -> str: return temp_file.name def _load_audio_sync(file_path: str) -> tuple[np.ndarray, int | float]: - import librosa + from vllm.multimodal.media.audio import load_audio - return librosa.load(file_path, sr=16000) + return load_audio(file_path, sr=16000) def _cleanup_file_sync(file_path: str) -> None: try: diff --git a/vllm_omni/entrypoints/openai/audio_utils_mixin.py b/vllm_omni/entrypoints/openai/audio_utils_mixin.py index 13df32ebe00..b626f7eeb20 100644 --- a/vllm_omni/entrypoints/openai/audio_utils_mixin.py +++ b/vllm_omni/entrypoints/openai/audio_utils_mixin.py @@ -1,6 +1,8 @@ from io import BytesIO import numpy as np +import torch +import torchaudio from vllm.logger import init_logger from vllm_omni.entrypoints.openai.protocol.audio import AudioResponse, CreateAudio @@ -10,11 +12,6 @@ except ImportError: soundfile = None -try: - import librosa -except ImportError: - librosa = None - logger = init_logger(__name__) @@ -74,20 +71,53 @@ def create_audio(self, audio_obj: CreateAudio) -> AudioResponse: return AudioResponse(audio_data=audio_data, media_type=media_type) def _apply_speed_adjustment(self, audio_tensor: np.ndarray, speed: float, sample_rate: int): - """Apply speed adjustment to the audio tensor while preserving pitch.""" + """Apply speed adjustment to the audio tensor while preserving pitch. + + Uses torchaudio's phase vocoder (Spectrogram → TimeStretch → + InverseSpectrogram) to stretch/compress audio in time without + changing pitch. + """ if speed == 1.0: return audio_tensor, sample_rate - if librosa is None: - raise ImportError("librosa is required for speed adjustment. Please install it with: pip install librosa") - try: - # librosa.effects.time_stretch requires a float audio tensor. 
if not np.issubdtype(audio_tensor.dtype, np.floating): audio_tensor = audio_tensor.astype(np.float32) - stretched_audio = librosa.effects.time_stretch(y=audio_tensor, rate=speed) - return stretched_audio, sample_rate + # Stereo numpy arrays use channels-last (T, C); + # torch expects channels-first (C, T). + channels_last = audio_tensor.ndim == 2 + if channels_last: + waveform = torch.from_numpy(audio_tensor.T) + else: + waveform = torch.from_numpy(audio_tensor).unsqueeze(0) + + # Match librosa.stft defaults: n_fft=2048, hop_length=n_fft//4 + n_fft = 2048 + hop_length = n_fft // 4 + to_spec = torchaudio.transforms.Spectrogram( + n_fft=n_fft, + hop_length=hop_length, + power=None, + ) + stretch = torchaudio.transforms.TimeStretch( + n_freq=n_fft // 2 + 1, + hop_length=hop_length, + ) + to_wave = torchaudio.transforms.InverseSpectrogram( + n_fft=n_fft, + hop_length=hop_length, + ) + + spec = to_spec(waveform) + stretched = stretch(spec, speed) + expected_length = int(audio_tensor.shape[0] / speed) + result = to_wave(stretched, length=expected_length) + + result = result.squeeze(0).numpy() + if channels_last: + result = result.T + return result, sample_rate except Exception as e: logger.error(f"An error occurred during speed adjustment: {e}") raise ValueError("Failed to apply speed adjustment.") from e diff --git a/vllm_omni/model_executor/models/cosyvoice3/assets/mel_filters.npz b/vllm_omni/model_executor/models/cosyvoice3/assets/mel_filters.npz deleted file mode 100644 index 28ea26909db..00000000000 Binary files a/vllm_omni/model_executor/models/cosyvoice3/assets/mel_filters.npz and /dev/null differ diff --git a/vllm_omni/model_executor/models/cosyvoice3/utils.py b/vllm_omni/model_executor/models/cosyvoice3/utils.py index 52c52655e8d..0bf0cccb163 100644 --- a/vllm_omni/model_executor/models/cosyvoice3/utils.py +++ b/vllm_omni/model_executor/models/cosyvoice3/utils.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright 
contributors to the vLLM project import logging -import os from functools import cache, lru_cache import numpy as np @@ -9,7 +8,8 @@ import torch.nn.functional as F import torchaudio import torchaudio.compliance.kaldi as kaldi -from librosa.filters import mel as librosa_mel_fn + +from vllm_omni.utils.audio import mel_filter_bank logger = logging.getLogger(__name__) @@ -34,8 +34,13 @@ def _get_mel_basis( fmax: float | None, device_str: str, ) -> torch.Tensor: - mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) - return torch.from_numpy(mel).float().to(torch.device(device_str)) + return mel_filter_bank( + sr=sampling_rate, + n_fft=n_fft, + n_mels=num_mels, + fmin=fmin, + fmax=fmax, + ).to(torch.device(device_str)) @lru_cache @@ -122,42 +127,8 @@ def exact_div(x, y): @cache def mel_filters(device, n_mels: int) -> torch.Tensor: - """ - load the mel filterbank matrix for projecting STFT into a Mel spectrogram. - Allows decoupling librosa dependency; saved using: - - np.savez_compressed( - "mel_filters.npz", - mel_80=librosa.filters.mel(sr=16000, n_fft=400, n_mels=80), - mel_128=librosa.filters.mel(sr=16000, n_fft=400, n_mels=128), - ) - """ - assert n_mels in {80, 128}, f"Unsupported n_mels: {n_mels}" - - filters_path = os.path.join(os.path.dirname(__file__), "assets", "mel_filters.npz") - if not os.path.exists(filters_path): - source_url = "https://raw.githubusercontent.com/openai/whisper/main/whisper/assets/mel_filters.npz" - os.makedirs(os.path.dirname(filters_path), exist_ok=True) - try: - import urllib.request - - with urllib.request.urlopen(source_url, timeout=30) as resp: - with open(filters_path, "wb") as f_out: - f_out.write(resp.read()) - logger.info("Downloaded mel_filters.npz from %s", source_url) - except Exception as e: - raise FileNotFoundError( - "Missing CosyVoice3 mel filter asset:\n" - f" {filters_path}\n" - "Auto-download failed. 
Download it manually from:\n" - f" {source_url}\n" - "Example:\n" - f" mkdir -p {os.path.dirname(filters_path)} && " - f"curl -L {source_url} -o {filters_path}" - ) from e - - with np.load(filters_path, allow_pickle=False) as f: - return torch.from_numpy(f[f"mel_{n_mels}"]).to(device) + """Compute mel filterbank matrix for projecting STFT into a Mel spectrogram.""" + return mel_filter_bank(sr=16000, n_fft=400, n_mels=n_mels).to(device) def log_mel_spectrogram( diff --git a/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_talker.py b/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_talker.py index 9f8aff6aff2..f89012ec45d 100644 --- a/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_talker.py +++ b/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_talker.py @@ -13,7 +13,6 @@ import torch import torch.nn as nn import torch.nn.functional as F -from librosa.filters import mel as librosa_mel_fn from transformers import AutoTokenizer from transformers.activations import ACT2FN from transformers.utils.hub import cached_file @@ -27,6 +26,7 @@ from vllm.sequence import IntermediateTensors from vllm_omni.model_executor.models.output_templates import OmniOutput +from vllm_omni.utils.audio import mel_filter_bank from vllm_omni.utils.voice_cache import VoiceEmbeddingCache from .configuration_qwen3_tts import Qwen3TTSConfig, Qwen3TTSSpeakerEncoderConfig, Qwen3TTSTalkerConfig @@ -258,14 +258,19 @@ def mel_spectrogram( fmax: int | None = None, center: bool = False, ) -> torch.Tensor: - """Calculate mel spectrogram of an input signal using librosa mel filterbank and torch STFT.""" + """Calculate mel spectrogram of an input signal using torchaudio mel filterbank and torch STFT.""" if torch.min(y) < -1.0: logger.warning("Min value of input waveform signal is %s", torch.min(y)) if torch.max(y) > 1.0: logger.warning("Max value of input waveform signal is %s", torch.max(y)) device = y.device - mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, 
fmax=fmax) - mel_basis = torch.from_numpy(mel).float().to(device) + mel_basis = mel_filter_bank( + sr=sampling_rate, + n_fft=n_fft, + n_mels=num_mels, + fmin=fmin, + fmax=fmax, + ).to(device) hann_window = torch.hann_window(win_size).to(device) padding = (n_fft - hop_size) // 2 y = torch.nn.functional.pad(y.unsqueeze(1), (padding, padding), mode="reflect").squeeze(1) @@ -871,7 +876,7 @@ def _load_audio_to_np(self, x: str) -> tuple[np.ndarray, int]: Uses upstream vLLM's MediaConnector for http(s) URLs and ``file:`` URIs, with unrestricted local access (offline inference is trusted). """ - import librosa + from vllm.multimodal.media.audio import load_audio if self._is_url(x): from vllm.multimodal.media import MediaConnector @@ -883,7 +888,7 @@ def _load_audio_to_np(self, x: str) -> tuple[np.ndarray, int]: with io.BytesIO(wav_bytes) as f: audio, sr = sf.read(f, dtype="float32", always_2d=False) else: - audio, sr = librosa.load(x, sr=None, mono=True) + audio, sr = load_audio(x, sr=None, mono=True) if isinstance(audio, np.ndarray) and audio.ndim > 1: audio = np.mean(audio, axis=-1) @@ -1089,9 +1094,9 @@ def _extract_speaker_embedding(self, wav: np.ndarray, sr: int) -> torch.Tensor: # Resample to 24kHz for speaker encoder. target_sr = int(getattr(self.config.speaker_encoder_config, "sample_rate", 24000)) if sr != target_sr: - import librosa + from vllm.multimodal.audio import resample_audio_resampy - wav = librosa.resample(y=wav.astype(np.float32), orig_sr=int(sr), target_sr=target_sr) + wav = resample_audio_resampy(wav.astype(np.float32), orig_sr=int(sr), target_sr=target_sr) sr = target_sr # Follow official implementation: mel_spectrogram expects 24kHz. 
diff --git a/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_tokenizer.py b/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_tokenizer.py index 503e6bbc83b..3db5cfd1b82 100644 --- a/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_tokenizer.py +++ b/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_tokenizer.py @@ -17,12 +17,13 @@ import urllib.request from urllib.parse import urlparse -import librosa import numpy as np import soundfile as sf import torch from torch.nn.utils.rnn import pad_sequence from transformers import AutoConfig, AutoFeatureExtractor, AutoModel +from vllm.multimodal.audio import resample_audio_resampy +from vllm.multimodal.media.audio import load_audio as _load_audio_file from .tokenizer_12hz.configuration_qwen3_tts_tokenizer_v2 import Qwen3TTSTokenizerV2Config from .tokenizer_12hz.modeling_qwen3_tts_tokenizer_v2 import ( @@ -154,13 +155,13 @@ def load_audio( with io.BytesIO(wav_bytes) as f: audio, sr = sf.read(f, dtype="float32", always_2d=False) else: - audio, sr = librosa.load(x, sr=None, mono=True) + audio, sr = _load_audio_file(x, sr=None, mono=True) if audio.ndim > 1: audio = np.mean(audio, axis=-1) if sr != target_sr: - audio = librosa.resample(y=audio, orig_sr=sr, target_sr=target_sr) + audio = resample_audio_resampy(audio, orig_sr=sr, target_sr=target_sr) return audio.astype(np.float32) @@ -208,7 +209,7 @@ def _normalize_audio_inputs( if a.ndim > 1: a = np.mean(a, axis=-1) if int(sr) != target_sr: - a = librosa.resample(y=a.astype(np.float32), orig_sr=int(sr), target_sr=target_sr) + a = resample_audio_resampy(a.astype(np.float32), orig_sr=int(sr), target_sr=target_sr) out.append(a.astype(np.float32)) return out diff --git a/vllm_omni/model_executor/models/qwen3_tts/tokenizer_25hz/vq/assets/mel_filters.npz b/vllm_omni/model_executor/models/qwen3_tts/tokenizer_25hz/vq/assets/mel_filters.npz deleted file mode 100644 index 28ea26909db..00000000000 Binary files 
a/vllm_omni/model_executor/models/qwen3_tts/tokenizer_25hz/vq/assets/mel_filters.npz and /dev/null differ diff --git a/vllm_omni/model_executor/models/qwen3_tts/tokenizer_25hz/vq/speech_vq.py b/vllm_omni/model_executor/models/qwen3_tts/tokenizer_25hz/vq/speech_vq.py index de2c69702c5..9bb2f78c5c0 100644 --- a/vllm_omni/model_executor/models/qwen3_tts/tokenizer_25hz/vq/speech_vq.py +++ b/vllm_omni/model_executor/models/qwen3_tts/tokenizer_25hz/vq/speech_vq.py @@ -22,9 +22,10 @@ import torch.nn as nn import torch.nn.functional as F import torchaudio.compliance.kaldi as kaldi -from librosa.filters import mel as librosa_mel_fn from torch import Tensor +from vllm_omni.utils.audio import mel_filter_bank + from .core_vq import DistributedGroupResidualVectorQuantization from .whisper_encoder import Conv1d, ConvTranspose1d, WhisperEncoder @@ -103,14 +104,14 @@ def extract(self, audio, **kwargs): y = audio if len(list(self.mel_basis.keys())) == 0: - mel = librosa_mel_fn( + mel = mel_filter_bank( sr=self.sampling_rate, n_fft=self.filter_length, n_mels=self.n_mel_channels, fmin=self.mel_fmin, fmax=self.mel_fmax, ) - self.mel_basis[str(self.mel_fmax) + "_" + str(y.device)] = torch.from_numpy(mel).float().to(y.device) + self.mel_basis[str(self.mel_fmax) + "_" + str(y.device)] = mel.to(y.device) self.hann_window[str(y.device)] = torch.hann_window(self.win_length).to(y.device) y = torch.nn.functional.pad( diff --git a/vllm_omni/model_executor/models/qwen3_tts/tokenizer_25hz/vq/whisper_encoder.py b/vllm_omni/model_executor/models/qwen3_tts/tokenizer_25hz/vq/whisper_encoder.py index e3bd6e1c3a3..8464f53c9df 100644 --- a/vllm_omni/model_executor/models/qwen3_tts/tokenizer_25hz/vq/whisper_encoder.py +++ b/vllm_omni/model_executor/models/qwen3_tts/tokenizer_25hz/vq/whisper_encoder.py @@ -14,7 +14,6 @@ # limitations under the License. 
import math import operator -import os from functools import cache from itertools import accumulate @@ -24,6 +23,7 @@ from torch import Tensor, nn from vllm_omni.diffusion.attention.backends.utils.fa import HAS_FLASH_ATTN, flash_attn_varlen_func +from vllm_omni.utils.audio import mel_filter_bank N_FFT = 400 HOP_LENGTH = 160 @@ -31,21 +31,8 @@ @cache def mel_filters(device, n_mels: int) -> torch.Tensor: - """ - load the mel filterbank matrix for projecting STFT into a Mel spectrogram. - Allows decoupling librosa dependency; saved using: - - np.savez_compressed( - "mel_filters.npz", - mel_80=librosa.filters.mel(sr=16000, n_fft=400, n_mels=80), - mel_128=librosa.filters.mel(sr=16000, n_fft=400, n_mels=128), - ) - """ - assert n_mels in {80, 128}, f"Unsupported n_mels: {n_mels}" - - filters_path = os.path.join(os.path.dirname(__file__), "assets", "mel_filters.npz") - with np.load(filters_path, allow_pickle=False) as f: - return torch.from_numpy(f[f"mel_{n_mels}"]).to(device) + """Compute mel filterbank matrix for projecting STFT into a Mel spectrogram.""" + return mel_filter_bank(sr=16000, n_fft=N_FFT, n_mels=n_mels).to(device) def log_mel_spectrogram( diff --git a/vllm_omni/utils/audio.py b/vllm_omni/utils/audio.py new file mode 100644 index 00000000000..490737bd530 --- /dev/null +++ b/vllm_omni/utils/audio.py @@ -0,0 +1,45 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +"""Audio utility functions shared across models and entrypoints.""" + +import torch +from torchaudio.functional import melscale_fbanks + + +def mel_filter_bank( + sr: int, + n_fft: int, + n_mels: int, + fmin: float = 0.0, + fmax: float | None = None, +) -> torch.Tensor: + """Compute a mel filterbank matrix. + + Drop-in replacement for ``librosa.filters.mel`` using + ``torchaudio.functional.melscale_fbanks``. + + Args: + sr: Sample rate of the audio. + n_fft: FFT window size. + n_mels: Number of mel bands. 
+ fmin: Minimum frequency (Hz). + fmax: Maximum frequency (Hz). Defaults to ``sr / 2``. + + Returns: + Tensor of shape ``(n_mels, n_fft // 2 + 1)``. + """ + if fmax is None: + fmax = float(sr) / 2.0 + # Use mel_scale='slaney' and norm='slaney' to match librosa's + # default behaviour (Slaney 1998 frequency mapping with area + # normalization). + return melscale_fbanks( + n_freqs=n_fft // 2 + 1, + f_min=float(fmin), + f_max=float(fmax), + n_mels=n_mels, + sample_rate=sr, + mel_scale="slaney", + norm="slaney", + ).T