Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions requirements/test.in
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ vocos # required for minicpmo_26 test
peft>=0.15.0 # required for phi-4-mm test
pqdm
ray[cgraph,default]>=2.48.0 # Ray Compiled Graph, required by pipeline parallelism tests
resampy # required for audio tests
sentence-transformers>=5.2.0 # required for embedding tests
soundfile # required for audio tests
jiwer # required for audio tests
Expand Down
4 changes: 4 additions & 0 deletions requirements/test.txt
Original file line number Diff line number Diff line change
Expand Up @@ -544,6 +544,7 @@ numba==0.61.2
# via
# -r requirements/test.in
# librosa
# resampy
numpy==2.2.6
# via
# -r requirements/test.in
Expand Down Expand Up @@ -584,6 +585,7 @@ numpy==2.2.6
# pyogrio
# pywavelets
# rasterio
# resampy
# rioxarray
# rouge-score
# runai-model-streamer
Expand Down Expand Up @@ -995,6 +997,8 @@ requests==2.32.3
# tiktoken
# transformers
# wandb
resampy==0.4.3
# via -r requirements/test.in
responses==0.25.3
# via genai-perf
rfc3339-validator==0.1.4
Expand Down
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -987,11 +987,11 @@ def _read_requirements(filename: str) -> list[str]:
"instanttensor": ["instanttensor >= 0.1.5"],
"runai": ["runai-model-streamer[s3,gcs,azure] >= 0.15.7"],
"audio": [
"librosa",
"av",
"resampy",
"scipy",
"soundfile",
"mistral_common[audio]",
"av",
], # Required for audio processing
"video": [], # Kept for backwards compatibility
"flashinfer": [], # Kept for backwards compatibility
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -152,5 +152,5 @@ async def test_basic_audio_foscolo(foscolo, rocm_aiter_fa_attention, model_name)
model_name,
foscolo,
language="it",
expected_text="ove il mio corpo fanciulletto giacque",
expected_text="ove il mio corpo fanciulletto",
)
2 changes: 1 addition & 1 deletion tests/entrypoints/openai/test_run_batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -275,7 +275,7 @@
]
)

MINIMAL_WAV_BASE64 = "UklGRiQAAABXQVZFZm10IBAAAAABAAEAQB8AAEAfAAABAAgAZGF0YQAAAAA="
MINIMAL_WAV_BASE64 = "UklGRigAAABXQVZFZm10IBAAAAABAAEAgD4AAAB9AAACABAAZGF0YQQAAAAAAP9/"
INPUT_TRANSCRIPTION_BATCH = (
json.dumps(
{
Expand Down
5 changes: 1 addition & 4 deletions tests/models/multimodal/generation/vlm_utils/builders.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,10 +323,7 @@ def build_audio_inputs_from_test_info(
test_info.audio_idx_to_prompt,
test_info.prompt_formatter,
)
resampler = AudioResampler(
target_sr=16000,
method="librosa",
)
resampler = AudioResampler(target_sr=16000)
audios = [asset.audio_and_sample_rate for asset in audio_assets]
resampled_audios = [
(
Expand Down
38 changes: 16 additions & 22 deletions tests/multimodal/media/test_audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@

from vllm.multimodal.media import AudioMediaIO

from ...conftest import AudioTestAssets

pytestmark = pytest.mark.cpu_test

ASSETS_DIR = Path(__file__).parent.parent / "assets"
Expand All @@ -22,40 +24,32 @@ def dummy_audio():


@pytest.fixture
def dummy_audio_bytes():
return b"FAKEAUDIOBYTES"
def dummy_audio_bytes(audio_assets: AudioTestAssets):
with open(audio_assets[0].get_local_path(), "rb") as f:
return f.read()


def test_audio_media_io_load_bytes(dummy_audio_bytes):
audio_io = AudioMediaIO()
with patch("librosa.load") as mock_load:
mock_load.return_value = (np.array([0.1, 0.2]), 16000)
out = audio_io.load_bytes(dummy_audio_bytes)
mock_load.assert_called_once()
assert isinstance(out[0], np.ndarray)
assert out[1] == 16000
out = audio_io.load_bytes(dummy_audio_bytes)
assert isinstance(out[0], np.ndarray)
assert out[1] == 16000


def test_audio_media_io_load_base64(dummy_audio_bytes):
audio_io = AudioMediaIO()
encoded = base64.b64encode(dummy_audio_bytes).decode("utf-8")
with patch.object(AudioMediaIO, "load_bytes") as mock_load_bytes:
mock_load_bytes.return_value = (np.array([0.1, 0.2]), 16000)
out = audio_io.load_base64("audio/wav", encoded)
mock_load_bytes.assert_called_once()
assert isinstance(out[0], np.ndarray)
assert out[1] == 16000
out = audio_io.load_base64("audio/wav", encoded)
assert isinstance(out[0], np.ndarray)
assert out[1] == 16000


def test_audio_media_io_load_file():
def test_audio_media_io_load_file(audio_assets: AudioTestAssets):
audio_io = AudioMediaIO()
path = Path("/fake/path.wav")
with patch("librosa.load") as mock_load:
mock_load.return_value = (np.array([0.1, 0.2]), 16000)
out = audio_io.load_file(path)
mock_load.assert_called_once_with(path, sr=None)
assert isinstance(out[0], np.ndarray)
assert out[1] == 16000
path = audio_assets[0].get_local_path()
out = audio_io.load_file(path)
assert isinstance(out[0], np.ndarray)
assert out[1] == 16000


def test_audio_media_io_encode_base64(dummy_audio):
Expand Down
38 changes: 19 additions & 19 deletions tests/multimodal/test_audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
AudioSpec,
ChannelReduction,
normalize_audio,
resample_audio_librosa,
resample_audio_pyav,
resample_audio_scipy,
split_audio,
)
Expand All @@ -25,14 +25,14 @@ def dummy_audio():
return np.array([0.0, 0.1, 0.2, 0.3, 0.4], dtype=float)


def test_resample_audio_librosa(dummy_audio):
with patch("vllm.multimodal.audio.librosa.resample") as mock_resample:
mock_resample.return_value = dummy_audio * 2
out = resample_audio_librosa(dummy_audio, orig_sr=44100, target_sr=22050)
mock_resample.assert_called_once_with(
dummy_audio, orig_sr=44100, target_sr=22050
)
assert np.all(out == dummy_audio * 2)
def test_resample_audio_pyav(dummy_audio):
out_down = resample_audio_pyav(dummy_audio, orig_sr=4, target_sr=2)
out_up = resample_audio_pyav(dummy_audio, orig_sr=2, target_sr=4)
out_same = resample_audio_pyav(dummy_audio, orig_sr=4, target_sr=4)

assert len(out_down) == 3
assert len(out_up) == 10
assert np.all(out_same == dummy_audio)
Comment thread
Isotr0py marked this conversation as resolved.


def test_resample_audio_scipy(dummy_audio):
Expand All @@ -56,9 +56,9 @@ def test_resample_audio_scipy_non_integer_ratio(dummy_audio):
assert np.isfinite(out).all()


def test_audio_resampler_librosa_calls_resample(dummy_audio):
resampler = AudioResampler(target_sr=22050, method="librosa")
with patch("vllm.multimodal.audio.resample_audio_librosa") as mock_resample:
def test_audio_resampler_pyav_calls_resample(dummy_audio):
resampler = AudioResampler(target_sr=22050, method="pyav")
with patch("vllm.multimodal.audio.resample_audio_pyav") as mock_resample:
mock_resample.return_value = dummy_audio
out = resampler.resample(dummy_audio, orig_sr=44100)
mock_resample.assert_called_once_with(
Expand Down Expand Up @@ -423,13 +423,13 @@ def test_soundfile_format_normalized_to_mono_e2e(self):
# Verify channel averaging: mean of [0.5, -0.5] = 0.0
np.testing.assert_array_almost_equal(audio_output, np.zeros(16000), decimal=5)

def test_librosa_mono_passthrough_e2e(self):
"""Full pipeline: librosa mono format → preserved as mono."""
def test_pyav_mono_passthrough_e2e(self):
"""Full pipeline: pyav mono format → preserved as mono."""
from vllm.multimodal.parse import MultiModalDataParser

# Simulate librosa output: already mono (time,) format
mono_librosa = np.random.randn(16000).astype(np.float32)
assert mono_librosa.shape == (16000,)
# Simulate pyav output: already mono (time,) format
mono_pyav = np.random.randn(16000).astype(np.float32)
assert mono_pyav.shape == (16000,)

# Create parser with mono normalization
parser = MultiModalDataParser(
Expand All @@ -438,15 +438,15 @@ def test_librosa_mono_passthrough_e2e(self):
)

# Process audio through the parser
result = parser._parse_audio_data((mono_librosa, 16000))
result = parser._parse_audio_data((mono_pyav, 16000))
audio_output = result.get(0)

# Verify output is still mono 1D
assert audio_output.ndim == 1
assert audio_output.shape == (16000,)

# Verify audio content is preserved
np.testing.assert_array_almost_equal(audio_output, mono_librosa)
np.testing.assert_array_almost_equal(audio_output, mono_pyav)

def test_multichannel_5_1_surround_to_mono_e2e(self):
"""Full pipeline: 5.1 surround (6 channels) → mono output."""
Expand Down
9 changes: 2 additions & 7 deletions vllm/assets/audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,10 @@

import numpy.typing as npt

from vllm.utils.import_utils import PlaceholderModule
from vllm.multimodal.media.audio import load_audio

from .base import VLLM_S3_BUCKET_URL, get_vllm_public_assets

try:
import librosa
except ImportError:
librosa = PlaceholderModule("librosa") # type: ignore[assignment]

ASSET_DIR = "multimodal_asset"

AudioAssetName = Literal["winning_call", "mary_had_lamb"]
Expand All @@ -33,7 +28,7 @@ def filename(self) -> str:
@property
def audio_and_sample_rate(self) -> tuple[npt.NDArray, float]:
audio_path = get_vllm_public_assets(filename=self.filename, s3_prefix=ASSET_DIR)
return librosa.load(audio_path, sr=None)
return load_audio(audio_path, sr=None)

def get_local_path(self) -> Path:
return get_vllm_public_assets(filename=self.filename, s3_prefix=ASSET_DIR)
Expand Down
9 changes: 2 additions & 7 deletions vllm/assets/video.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,10 @@
from huggingface_hub import hf_hub_download
from PIL import Image

from vllm.utils.import_utils import PlaceholderModule
from vllm.multimodal.media.audio import load_audio_pyav

from .base import get_cache_dir

try:
import librosa
except ImportError:
librosa = PlaceholderModule("librosa") # type: ignore[assignment]


@lru_cache
def download_video_asset(filename: str) -> str:
Expand Down Expand Up @@ -146,4 +141,4 @@ def get_audio(self, sampling_rate: float | None = None) -> npt.NDArray:

See also: examples/offline_inference/qwen2_5_omni/only_thinker.py
"""
return librosa.load(self.video_path, sr=sampling_rate)[0]
return load_audio_pyav(self.video_path, sr=sampling_rate)[0]
7 changes: 2 additions & 5 deletions vllm/benchmarks/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
from vllm.lora.request import LoRARequest
from vllm.lora.utils import get_adapter_absolute_path
from vllm.multimodal import MultiModalDataDict
from vllm.multimodal.audio import get_audio_duration
from vllm.multimodal.image import convert_image_mode
from vllm.tokenizers import TokenizerLike
from vllm.utils.argparse_utils import FlexibleArgumentParser
Expand All @@ -54,10 +55,6 @@
except ImportError:
pd = PlaceholderModule("pandas")

try:
import librosa
except ImportError:
librosa = PlaceholderModule("librosa")

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -3253,7 +3250,7 @@ def sample(
break
audio = item["audio"]
y, sr = audio["array"], audio["sampling_rate"]
duration_s = librosa.get_duration(y=y, sr=sr)
duration_s = get_audio_duration(y=y, sr=sr)
if duration_s < asr_min_audio_len_sec or duration_s > asr_max_audio_len_sec:
skipped += 1
continue
Expand Down
54 changes: 8 additions & 46 deletions vllm/entrypoints/openai/speech_to_text/speech_to_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,32 +42,13 @@
from vllm.logger import init_logger
from vllm.logprobs import FlatLogprobs, Logprob
from vllm.model_executor.models import SupportsTranscription
from vllm.multimodal.audio import split_audio
from vllm.multimodal.media.audio import extract_audio_from_video_bytes
from vllm.multimodal.audio import get_audio_duration, split_audio
from vllm.multimodal.media.audio import load_audio
from vllm.outputs import RequestOutput
from vllm.renderers.inputs import DictPrompt, EncoderDecoderDictPrompt
from vllm.renderers.inputs.preprocess import parse_enc_dec_prompt, parse_model_prompt
from vllm.sampling_params import BeamSearchParams, SamplingParams
from vllm.tokenizers import get_tokenizer
from vllm.utils.import_utils import PlaceholderModule

try:
import librosa
except ImportError:
librosa = PlaceholderModule("librosa") # type: ignore[assignment]

try:
import soundfile as sf
except ImportError:
sf = PlaceholderModule("soundfile") # type: ignore[assignment]

# Public libsndfile error codes exposed via `soundfile.LibsndfileError.code`, soundfile
# being librosa's main backend. Used to validate if an audio loading error is due to a
# server error vs a client error (invalid audio file).
# 1 = unrecognised format (file is not a supported audio container)
# 3 = malformed file (corrupt or structurally invalid audio)
# 4 = unsupported encoding (codec not supported by this libsndfile build)
_BAD_SF_CODES = {1, 3, 4}

SpeechToTextResponse: TypeAlias = TranscriptionResponse | TranslationResponse
SpeechToTextResponseVerbose: TypeAlias = (
Expand Down Expand Up @@ -214,32 +195,13 @@ async def _preprocess_speech_to_text(
# pre-requisite for chunking, as it assumes Whisper SR.
try:
with io.BytesIO(audio_data) as buf:
y, sr = librosa.load(buf, sr=self.asr_config.sample_rate) # type: ignore[return-value]
except sf.LibsndfileError as exc:
# Only fall back for known format-detection failures.
# Re-raise anything else (e.g. corrupt but recognised format).
if exc.code not in _BAD_SF_CODES:
raise
logger.debug(
"librosa/soundfile could not decode audio from BytesIO "
"(code=%s: %s); falling back to pyav in-process decode",
exc.code,
exc,
)
try:
native_y, native_sr = extract_audio_from_video_bytes(audio_data)
sr = self.asr_config.sample_rate
y = librosa.resample(native_y, orig_sr=native_sr, target_sr=sr)
except Exception as pyav_exc:
logger.debug(
"pyAV fallback also failed: %s",
pyav_exc,
)
raise ValueError("Invalid or unsupported audio file.") from pyav_exc
y, sr = load_audio(buf, sr=self.asr_config.sample_rate)
except Exception as exc:
raise ValueError("Invalid or unsupported audio file.") from exc

duration = librosa.get_duration(y=y, sr=sr)
do_split_audio = (
self.asr_config.allow_audio_chunking
duration = get_audio_duration(y=y, sr=sr)
do_split_audio = self.asr_config.allow_audio_chunking and (
self.asr_config.max_audio_clip_s is not None
and duration > self.asr_config.max_audio_clip_s
)

Expand Down
5 changes: 3 additions & 2 deletions vllm/model_executor/models/nano_nemotron_vl.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import warnings
from collections.abc import Iterable, Mapping, Sequence
from functools import cached_property
from io import BytesIO
from typing import Annotated, Literal, TypeAlias

import torch
Expand Down Expand Up @@ -53,7 +54,7 @@
MultiModalKwargsItems,
VideoItem,
)
from vllm.multimodal.media.audio import extract_audio_from_video_bytes
from vllm.multimodal.media.audio import load_audio_pyav
from vllm.multimodal.parse import (
AudioProcessorItems,
ImageEmbeddingItems,
Expand Down Expand Up @@ -553,7 +554,7 @@ def _extract_audio_from_videos(
"video must be loaded with keep_video_bytes=True (e.g. via "
"the chat API with a model that sets use_audio_in_video)."
)
audio_items.append(extract_audio_from_video_bytes(video_bytes))
audio_items.append(load_audio_pyav(BytesIO(video_bytes)))

# Create a new VideoProcessorItems with metadata that does not contain
# the large video bytes, to avoid modifying the input `mm_items`.
Expand Down
Loading
Loading