diff --git a/docs/contributing/model/transcription.md b/docs/contributing/model/transcription.md
index a23de100da39..db868686e2a7 100644
--- a/docs/contributing/model/transcription.md
+++ b/docs/contributing/model/transcription.md
@@ -193,7 +193,7 @@ Provide a fast duration→token estimate to improve streaming usage statistics:
 
 The API server takes care of basic audio I/O and optional chunking before building prompts:
 
-- Resampling: Input audio is resampled to `SpeechToTextConfig.sample_rate` using `librosa`.
+- Resampling: Input audio is resampled to `SpeechToTextConfig.sample_rate` using `AudioResampler`.
 - Chunking: If `SpeechToTextConfig.allow_audio_chunking` is True and the duration exceeds `max_audio_clip_s`, the server splits the audio into overlapping chunks and generates a prompt per chunk. Overlap is controlled by `overlap_chunk_second`.
 - Energy-aware splitting: When `min_energy_split_window_size` is set, the server finds low-energy regions to minimize cutting within words.
 
@@ -206,8 +206,8 @@ Relevant server logic:
     async def _preprocess_speech_to_text(...):
         language = self.model_cls.validate_language(request.language)
         ...
-        y, sr = librosa.load(bytes_, sr=self.asr_config.sample_rate)
-        duration = librosa.get_duration(y=y, sr=sr)
+        y, sr = load_audio(bytes_, sr=self.asr_config.sample_rate)
+        duration = get_audio_duration(y=y, sr=sr)
         do_split_audio = (self.asr_config.allow_audio_chunking
                           and duration > self.asr_config.max_audio_clip_s)
         chunks = [y] if not do_split_audio else self._split_audio(y, int(sr))
diff --git a/docs/features/multimodal_inputs.md b/docs/features/multimodal_inputs.md
index ee82c34fa0eb..d9b49f7cb7f9 100644
--- a/docs/features/multimodal_inputs.md
+++ b/docs/features/multimodal_inputs.md
@@ -300,12 +300,12 @@ Full example: [examples/offline_inference/audio_language.py](../../examples/offline_inference/audio_language.py)
 
 Speech-to-text models like Whisper have a maximum audio length they can process (typically 30 seconds). For longer audio files, vLLM provides a utility to intelligently split audio into chunks at quiet points to minimize cutting through speech.
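To make the energy-aware behavior concrete, here is a minimal sketch of how a low-energy split point can be chosen. It is illustrative only: the helper name and signature are hypothetical, not vLLM's internal `_split_audio`; the documented `split_audio` usage example follows below.

```python
import numpy as np


def find_quiet_split_point(audio: np.ndarray, target: int, window: int) -> int:
    """Hypothetical helper: index near `target` with the lowest RMS energy."""
    lo = max(0, target - window)
    hi = min(len(audio) - window, target + window)
    if hi <= lo:
        # Clip too short to search; fall back to the nominal boundary
        return target
    # RMS energy of each candidate window; the quietest one wins
    energies = [np.sqrt(np.mean(audio[i : i + window] ** 2)) for i in range(lo, hi)]
    return lo + int(np.argmin(energies))
```

Splitting at such points keeps chunk boundaries out of words, which is why the overlap and energy-window settings described above matter for transcription quality.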
 ```python
-import librosa
 from vllm import LLM, SamplingParams
 from vllm.multimodal.audio import split_audio
+from vllm.multimodal.media.audio import load_audio
 
 # Load long audio file
-audio, sr = librosa.load("long_audio.wav", sr=16000)
+audio, sr = load_audio("long_audio.wav", sr=16000)
 
 # Split into chunks at low-energy (quiet) regions
 chunks = split_audio(
@@ -832,7 +832,7 @@ Then, you can use the OpenAI client as follows:
         base_url=openai_api_base,
     )
 
-    # Any format supported by librosa is supported
+    # Any format supported by soundfile/PyAV is supported
     audio_url = AudioAsset("winning_call").url
     audio_base64 = encode_base64_content_from_url(audio_url)
diff --git a/examples/online_serving/openai_chat_completion_client_for_multimodal.py b/examples/online_serving/openai_chat_completion_client_for_multimodal.py
index c4407923ed2d..ba3adf55c90e 100644
--- a/examples/online_serving/openai_chat_completion_client_for_multimodal.py
+++ b/examples/online_serving/openai_chat_completion_client_for_multimodal.py
@@ -267,7 +267,7 @@ def run_audio(model: str, max_completion_tokens: int) -> None:
             {
                 "type": "input_audio",
                 "input_audio": {
-                    # Any format supported by librosa is supported
+                    # Any format supported by soundfile/PyAV is supported
                     "data": audio_base64,
                     "format": "wav",
                 },
@@ -292,7 +292,7 @@ def run_audio(model: str, max_completion_tokens: int) -> None:
             {
                 "type": "audio_url",
                 "audio_url": {
-                    # Any format supported by librosa is supported
+                    # Any format supported by soundfile/PyAV is supported
                     "url": audio_url
                 },
             },
@@ -316,7 +316,7 @@ def run_audio(model: str, max_completion_tokens: int) -> None:
             {
                 "type": "audio_url",
                 "audio_url": {
-                    # Any format supported by librosa is supported
+                    # Any format supported by soundfile/PyAV is supported
                     "url": f"data:audio/ogg;base64,{audio_base64}"
                 },
             },
diff --git a/examples/online_serving/openai_realtime_client.py b/examples/online_serving/openai_realtime_client.py
index 2bd3c7e60d55..fda3d7cb4564 100644
--- a/examples/online_serving/openai_realtime_client.py
+++ b/examples/online_serving/openai_realtime_client.py
@@ -12,7 +12,6 @@ Requirements:
 - vllm with audio support
 - websockets
-- librosa
 - numpy
 
 The script:
@@ -26,12 +25,12 @@
 import asyncio
 import json
 
-import librosa
 import numpy as np
 import pybase64 as base64
 import websockets
 
 from vllm.assets.audio import AudioAsset
+from vllm.multimodal.media.audio import load_audio
 
 
@@ -39,7 +38,7 @@ def audio_to_pcm16_base64(audio_path: str) -> str:
     """
     Load an audio file and convert it to base64-encoded PCM16 @ 16kHz.
""" # Load audio and resample to 16kHz mono - audio, _ = librosa.load(audio_path, sr=16000, mono=True) + audio, _ = load_audio(audio_path, sr=16000, mono=True) # Convert to PCM16 pcm16 = (audio * 32767).astype(np.int16) # Encode as base64 diff --git a/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py b/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py index 194c52eae35e..f847b7b0d88d 100644 --- a/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py +++ b/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py @@ -13,7 +13,6 @@ import time from statistics import mean, median -import librosa import pytest import soundfile import torch @@ -21,6 +20,7 @@ from evaluate import load from transformers.models.whisper.english_normalizer import EnglishTextNormalizer +from vllm.multimodal.audio import get_audio_duration from vllm.tokenizers import get_tokenizer from ....models.registry import HF_EXAMPLE_MODELS @@ -84,7 +84,7 @@ async def process_dataset(model, client, data, concurrent_request): trust_remote_code=model_info.trust_remote_code, ) - # Warmup call as the first `librosa.load` server-side is quite slow. + # Warmup call as the first `load_audio` server-side is quite slow. audio, sr = data[0]["audio"]["array"], data[0]["audio"]["sampling_rate"] _ = await bound_transcribe(sem, client, tokenizer, (audio, sr), "") @@ -118,7 +118,7 @@ def print_performance_metrics(results, total_time): def add_duration(sample): y, sr = sample["audio"]["array"], sample["audio"]["sampling_rate"] - sample["duration_ms"] = librosa.get_duration(y=y, sr=sr) * 1000 + sample["duration_ms"] = get_audio_duration(y=y, sr=sr) * 1000 return sample diff --git a/tests/entrypoints/openai/realtime/test_realtime_validation.py b/tests/entrypoints/openai/realtime/test_realtime_validation.py index bb6b02f5c99e..e317090fa543 100644 --- a/tests/entrypoints/openai/realtime/test_realtime_validation.py +++ b/tests/entrypoints/openai/realtime/test_realtime_validation.py @@ -5,7 +5,6 @@ import json import warnings -import librosa import numpy as np import pybase64 as base64 import pytest @@ -14,6 +13,7 @@ from tests.entrypoints.openai.conftest import add_attention_backend from tests.utils import ROCM_ENV_OVERRIDES, ROCM_EXTRA_ARGS, RemoteOpenAIServer from vllm.assets.audio import AudioAsset +from vllm.multimodal.media.audio import load_audio # Increase engine iteration timeout for ROCm where first-use JIT compilation # can exceed the default 60s, causing a silent deadlock in feed_tokens. 
@@ -56,7 +56,7 @@ async def send_event(ws, event: dict) -> None:
 def mary_had_lamb_audio_chunks() -> list[str]:
     """Audio split into ~0.1 second chunks for streaming."""
     path = AudioAsset("mary_had_lamb").get_local_path()
-    audio, _ = librosa.load(str(path), sr=16000, mono=True)
+    audio, _ = load_audio(str(path), sr=16000, mono=True)
 
     # Split into ~0.1 second chunks (1600 samples at 16kHz)
     chunk_size = 1600
diff --git a/tests/entrypoints/openai/speech_to_text/test_transcription_validation_whisper.py b/tests/entrypoints/openai/speech_to_text/test_transcription_validation_whisper.py
index 8dba1b59742b..511179f7fcb1 100644
--- a/tests/entrypoints/openai/speech_to_text/test_transcription_validation_whisper.py
+++ b/tests/entrypoints/openai/speech_to_text/test_transcription_validation_whisper.py
@@ -6,7 +6,6 @@
 import io
 import json
 
-import librosa
 import numpy as np
 import openai
 import pytest
@@ -14,6 +13,7 @@
 import soundfile as sf
 
 from tests.utils import RemoteOpenAIServer
+from vllm.multimodal.media.audio import load_audio
 from vllm.platforms import current_platform
 
 MODEL_NAME = "openai/whisper-large-v3-turbo"
@@ -134,7 +134,7 @@ async def test_bad_requests(mary_had_lamb, whisper_client):
 
 @pytest.mark.asyncio
 async def test_long_audio_request(mary_had_lamb, whisper_client):
     mary_had_lamb.seek(0)
-    audio, sr = librosa.load(mary_had_lamb)
+    audio, sr = load_audio(mary_had_lamb)
     # Add small silence after each audio for repeatability in the split process
     audio = np.pad(audio, (0, 1600))
     repeated_audio = np.tile(audio, 10)
diff --git a/tests/entrypoints/openai/speech_to_text/test_translation_validation.py b/tests/entrypoints/openai/speech_to_text/test_translation_validation.py
index 16b9614d957e..a8b17bf34324 100644
--- a/tests/entrypoints/openai/speech_to_text/test_translation_validation.py
+++ b/tests/entrypoints/openai/speech_to_text/test_translation_validation.py
@@ -7,7 +7,6 @@
 import json
 
 import httpx
-import librosa
 import numpy as np
 import openai
 import pytest
@@ -17,6 +16,7 @@
 from tests.entrypoints.openai.conftest import add_attention_backend
 from tests.utils import RemoteOpenAIServer
 from vllm.logger import init_logger
+from vllm.multimodal.media.audio import load_audio
 
 logger = init_logger(__name__)
 
@@ -264,7 +264,7 @@ async def test_long_audio_request(foscolo, client_and_model):
     if model_name == "google/gemma-3n-E2B-it":
         pytest.skip("Gemma3n does not support long audio requests")
     foscolo.seek(0)
-    audio, sr = librosa.load(foscolo)
+    audio, sr = load_audio(foscolo)
     repeated_audio = np.tile(audio, 2)
     # Repeated audio to buffer
     buffer = io.BytesIO()
diff --git a/tests/models/multimodal/generation/test_phi4mm.py b/tests/models/multimodal/generation/test_phi4mm.py
index 7f1a12f04474..1a4fb35a28aa 100644
--- a/tests/models/multimodal/generation/test_phi4mm.py
+++ b/tests/models/multimodal/generation/test_phi4mm.py
@@ -4,7 +4,6 @@
 import os
 from collections.abc import Sequence
 
-import librosa
 import pytest
 import regex as re
 from huggingface_hub import snapshot_download
@@ -14,6 +13,7 @@
 from vllm.logprobs import SampleLogprobs
 from vllm.lora.request import LoRARequest
 from vllm.multimodal.image import convert_image_mode, rescale_image_size
+from vllm.multimodal.media.audio import load_audio
 
 from ....conftest import (
     IMAGE_ASSETS,
@@ -290,7 +290,7 @@ def test_vision_speech_models(
     num_logprobs: int,
 ) -> None:
     # use the example speech question so that the model outputs are reasonable
-    audio = librosa.load(speech_question, sr=None)
+    audio = load_audio(speech_question, sr=None)
     image = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB")
 
     inputs_vision_speech = [
diff --git a/tests/models/multimodal/generation/test_whisper.py b/tests/models/multimodal/generation/test_whisper.py
index babf7e7a4978..186e7e054ce1 100644
--- a/tests/models/multimodal/generation/test_whisper.py
+++ b/tests/models/multimodal/generation/test_whisper.py
@@ -4,11 +4,11 @@
 from collections.abc import Sequence
 from typing import Any
 
-import librosa
 import pytest
 from transformers import AutoModelForSpeechSeq2Seq
 
 from vllm.assets.audio import AudioAsset
+from vllm.multimodal.audio import AudioResampler
 from vllm.platforms import current_platform
 
 from ....conftest import HfRunner, PromptAudioInput, VllmRunner
@@ -93,13 +93,12 @@ def run_test(
 def resampled_assets() -> list[tuple[Any, int]]:
     audio_assets = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")]
     sampled_assets = []
+    resampler = AudioResampler(target_sr=WHISPER_SAMPLE_RATE)
     for asset in audio_assets:
         audio, orig_sr = asset.audio_and_sample_rate
         # Resample to Whisper's expected sample rate (16kHz)
         if orig_sr != WHISPER_SAMPLE_RATE:
-            audio = librosa.resample(
-                audio, orig_sr=orig_sr, target_sr=WHISPER_SAMPLE_RATE
-            )
+            audio = resampler.resample(audio, orig_sr=orig_sr)
         sampled_assets.append(
             (audio, WHISPER_SAMPLE_RATE),
         )
diff --git a/tests/multimodal/media/test_audio.py b/tests/multimodal/media/test_audio.py
index 4361066ab885..3729e71f24e7 100644
--- a/tests/multimodal/media/test_audio.py
+++ b/tests/multimodal/media/test_audio.py
@@ -3,12 +3,12 @@
 from pathlib import Path
 from unittest.mock import patch
 
-import librosa
 import numpy as np
 import pybase64 as base64
 import pytest
 
 from vllm.multimodal.media import AudioMediaIO
+from vllm.multimodal.media.audio import load_audio
 
 from ...conftest import AudioTestAssets
 
@@ -73,6 +73,6 @@ def test_audio_media_io_from_video(video_assets):
     video_path = video_assets[0].video_path
     with open(video_path, "rb") as f:
         audio, sr = audio_io.load_bytes(f.read())
-    audio_ref, sr_ref = librosa.load(video_path, sr=None)
+    audio_ref, sr_ref = load_audio(video_path, sr=None)
     assert sr == sr_ref
     np.testing.assert_allclose(audio_ref, audio, atol=1e-4)
diff --git a/vllm/multimodal/media/audio.py b/vllm/multimodal/media/audio.py
index 26d58d13a731..37b8662a76b6 100644
--- a/vllm/multimodal/media/audio.py
+++ b/vllm/multimodal/media/audio.py
@@ -29,9 +29,9 @@
     soundfile = PlaceholderModule("soundfile")  # type: ignore[assignment]
 
 
-# Public libsndfile error codes exposed via `soundfile.LibsndfileError.code`, soundfile
-# being librosa's main backend. Used to validate if an audio loading error is due to a
-# server error vs a client error (invalid audio file).
+# Public libsndfile error codes exposed via `soundfile.LibsndfileError.code`,
+# soundfile being the main audio loading backend. Used to determine whether an
+# audio loading error is a server error or a client error (invalid audio file).
 # 0 = sf_error(NULL) race condition: when multiple threads fail sf_open_virtual
 # concurrently, one thread may clear the global error before another reads it,
 # producing code=0 ("Garbled error message from libsndfile" in soundfile).
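To show how these codes might be consumed, the sketch below maps a decode failure to a client or server error via `soundfile.LibsndfileError.code`. The helper and the exact mapping are hypothetical, not the server's actual handling; the numeric constants are the public codes from libsndfile's `sndfile.h`.

```python
import io

import soundfile

# Public libsndfile error codes (sndfile.h)
SF_ERR_UNRECOGNISED_FORMAT = 1
SF_ERR_MALFORMED_FILE = 3
SF_ERR_UNSUPPORTED_ENCODING = 4

# Codes that point to a bad input file rather than a server-side fault
CLIENT_ERROR_CODES = {
    SF_ERR_UNRECOGNISED_FORMAT,
    SF_ERR_MALFORMED_FILE,
    SF_ERR_UNSUPPORTED_ENCODING,
}


def classify_audio_load_error(data: bytes) -> str:
    """Hypothetical helper: decide client vs server error for a decode failure."""
    try:
        soundfile.read(io.BytesIO(data))
    except soundfile.LibsndfileError as e:
        return "client_error" if e.code in CLIENT_ERROR_CODES else "server_error"
    return "ok"
```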
diff --git a/vllm/transformers_utils/processors/cohere_asr.py b/vllm/transformers_utils/processors/cohere_asr.py
index f742074a4e3d..e1257de4e735 100644
--- a/vllm/transformers_utils/processors/cohere_asr.py
+++ b/vllm/transformers_utils/processors/cohere_asr.py
@@ -4,11 +4,11 @@
 import math
 import random
 
-import librosa
 import numpy as np
 import torch
 import torch.nn.functional as F
 from torch import nn
+from torchaudio.functional import melscale_fbanks
 from transformers import AutoFeatureExtractor, AutoProcessor, BatchFeature
 from transformers.feature_extraction_sequence_utils import (
     SequenceFeatureExtractor,
@@ -129,17 +129,15 @@ def __init__(
         self.pad_min_duration = 0.0
         self.pad_direction = "both"
 
-        filterbanks = torch.tensor(
-            librosa.filters.mel(
-                sr=sample_rate,
-                n_fft=self.n_fft,
-                n_mels=nfilt,
-                fmin=lowfreq,
-                fmax=highfreq,
-                norm=mel_norm,
-            ),
-            dtype=torch.float,
-        ).unsqueeze(0)
+        filterbanks = melscale_fbanks(
+            n_freqs=self.n_fft // 2 + 1,
+            f_min=lowfreq,
+            f_max=highfreq,
+            n_mels=nfilt,
+            sample_rate=sample_rate,
+            norm=mel_norm,
+            mel_scale="slaney",
+        ).T.unsqueeze(0)
         self.register_buffer("fb", filterbanks)
 
         # Calculate maximum sequence length
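As a sanity check on the filterbank swap above, the snippet below compares `torchaudio.functional.melscale_fbanks` against `librosa.filters.mel`, assuming `mel_norm` is `"slaney"` (librosa appears here for verification only; it remains removed from the runtime path, and the parameter values are arbitrary examples).

```python
import numpy as np
from torchaudio.functional import melscale_fbanks

import librosa  # verification only; no longer a runtime dependency

sr, n_fft, n_mels, fmin, fmax = 16000, 512, 80, 0.0, 8000.0

# librosa returns (n_mels, n_fft // 2 + 1)
ref = librosa.filters.mel(
    sr=sr, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax, norm="slaney"
)

# torchaudio returns (n_freqs, n_mels); transpose to librosa's layout,
# mirroring the `.T` in the patched `__init__`
new = melscale_fbanks(
    n_freqs=n_fft // 2 + 1,
    f_min=fmin,
    f_max=fmax,
    n_mels=n_mels,
    sample_rate=sr,
    norm="slaney",
    mel_scale="slaney",
).T

np.testing.assert_allclose(ref, new.numpy(), atol=1e-5)
```

`mel_scale="slaney"` is what makes the two agree: librosa defaults to the Slaney mel scale, while torchaudio defaults to HTK.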