diff --git a/setup.py b/setup.py index 83b8b008ab45..5218b6eff4fc 100644 --- a/setup.py +++ b/setup.py @@ -976,6 +976,7 @@ def _read_requirements(filename: str) -> list[str]: "soundfile", "mistral_common[audio]", "av", + "torchcodec", ], # Required for audio processing "video": [], # Kept for backwards compatibility "flashinfer": [], # Kept for backwards compatibility diff --git a/vllm/entrypoints/openai/speech_to_text/speech_to_text.py b/vllm/entrypoints/openai/speech_to_text/speech_to_text.py index ac621270d660..31902bfa7f0f 100644 --- a/vllm/entrypoints/openai/speech_to_text/speech_to_text.py +++ b/vllm/entrypoints/openai/speech_to_text/speech_to_text.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio -import io import math import time import zlib @@ -11,7 +10,6 @@ import numpy as np from fastapi import Request -from soundfile import LibsndfileError from transformers import PreTrainedTokenizerBase import vllm.envs as envs @@ -37,6 +35,7 @@ TranslationSegment, TranslationStreamResponse, ) +from vllm.entrypoints.openai.speech_to_text.utils import load_audio_bytes from vllm.entrypoints.utils import get_max_tokens from vllm.exceptions import VLLMValidationError from vllm.inputs import EncoderDecoderInputs, ProcessorInputs @@ -56,14 +55,6 @@ except ImportError: librosa = PlaceholderModule("librosa") # type: ignore[assignment] -# Public libsndfile error codes exposed via `soundfile.LibsndfileError.code`, soundfile -# being librosa's main backend. Used to validate if an audio loading error is due to a -# server error vs a client error (invalid audio file). 
-# 1 = unrecognised format (file is not a supported audio container) -# 3 = malformed file (corrupt or structurally invalid audio) -# 4 = unsupported encoding (codec not supported by this libsndfile build) -_BAD_SF_CODES = {1, 3, 4} - SpeechToTextResponse: TypeAlias = TranscriptionResponse | TranslationResponse SpeechToTextResponseVerbose: TypeAlias = ( TranscriptionResponseVerbose | TranslationResponseVerbose @@ -202,16 +193,12 @@ async def _preprocess_speech_to_text( value=len(audio_data) / 1024**2, ) - with io.BytesIO(audio_data) as bytes_: - try: - # NOTE resample to model SR here for efficiency. This is also a - # pre-requisite for chunking, as it assumes Whisper SR. - y, sr = librosa.load(bytes_, sr=self.asr_config.sample_rate) - except LibsndfileError as exc: - # Distinguish client errors (invalid audio) from server errors - if exc.code in _BAD_SF_CODES: - raise ValueError("Invalid or unsupported audio file.") from exc - raise + # Decode audio bytes. For container formats (MP4, M4A, WebM) that + # soundfile cannot detect from a BytesIO stream, load_audio_bytes + # transparently falls back to an in-process torchaudio decode. + # NOTE resample to model SR here for efficiency. This is also a + # pre-requisite for chunking, as it assumes Whisper SR. 
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Audio decoding utilities for the speech-to-text endpoints."""

import io

import numpy as np

from vllm.logger import init_logger
from vllm.utils.import_utils import PlaceholderModule

try:
    import librosa
except ImportError:
    librosa = PlaceholderModule("librosa")  # type: ignore[assignment]

try:
    import soundfile as sf
except ImportError:
    sf = PlaceholderModule("soundfile")  # type: ignore[assignment]

# Guarded like librosa/soundfile above: torchaudio is only needed on the
# fallback path, so importing this module should not require it.
try:
    import torchaudio
except ImportError:
    torchaudio = PlaceholderModule("torchaudio")  # type: ignore[assignment]

logger = init_logger(__name__)

# Public libsndfile error codes exposed via ``soundfile.LibsndfileError.code``.
# soundfile is librosa's primary backend. These codes indicate that the audio
# data itself is problematic (unrecognised container, corrupt file, or
# unsupported encoding) rather than a transient server error.
#   1 = unrecognised format, 3 = malformed file, 4 = unsupported encoding
_BAD_SF_CODES = {1, 3, 4}


def _decode_audio_bytes_torchaudio(
    audio_data: bytes,
    sr: int,
) -> tuple[np.ndarray, int]:
    """Decode audio bytes to mono PCM via torchaudio, in-process.

    The bytes are handed to ``torchaudio.load`` through a ``BytesIO``
    buffer, so no subprocess is spawned. The decoded waveform is
    down-mixed to mono and resampled to *sr* Hz, matching the return
    convention of ``librosa.load``.

    Args:
        audio_data: Raw, still-encoded audio file contents.
        sr: Target sample rate in Hz.

    Returns:
        A ``(samples, sr)`` tuple where ``samples`` is a 1-D numpy array.

    Raises:
        RuntimeError: If decoding yields zero samples.
        Exception: Whatever ``torchaudio.load`` / ``resample`` raise on
            undecodable input; the caller is expected to translate these.
    """
    buf = io.BytesIO(audio_data)
    waveform, orig_sr = torchaudio.load(buf)

    # Down-mix to mono (average across channels).
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # Resample to the target sample rate when necessary.
    if orig_sr != sr:
        waveform = torchaudio.functional.resample(
            waveform, orig_freq=orig_sr, new_freq=sr
        )

    # Squeeze channel dim -> 1-D numpy array (same layout as librosa.load).
    y = waveform.squeeze(0).numpy()
    if y.size == 0:
        raise RuntimeError(
            "torchaudio produced no audio samples (file may be empty or corrupt)"
        )
    return y, sr


def load_audio_bytes(
    audio_data: bytes,
    sr: int | float,
) -> tuple[np.ndarray, int]:
    """Load audio from raw bytes, with an in-process torchaudio fallback.

    First tries ``librosa.load(BytesIO(...))`` which works for formats
    that *soundfile* can auto-detect (WAV, FLAC, MP3, OGG, ...). If that
    fails with a ``LibsndfileError`` whose code is in ``_BAD_SF_CODES``
    (unrecognised format, malformed file, or unsupported encoding --
    typically container formats like MP4/M4A/WebM), the bytes are decoded
    in-process via ``torchaudio`` instead.

    Args:
        audio_data: Raw, still-encoded audio file contents.
        sr: Target sample rate in Hz; truncated to ``int`` before use.

    Returns:
        A ``(samples, sr)`` tuple as produced by ``librosa.load`` or the
        torchaudio fallback.

    Raises:
        ValueError: If neither decoder can read the data (client error).
        soundfile.LibsndfileError: For libsndfile failures that are not
            attributable to the audio data itself (re-raised unchanged).
        ImportError: If the fallback decoder backend is not installed.
            This is a deployment problem, so it is deliberately *not*
            reported as an invalid upload.
    """
    sr = int(sr)

    # Fast path: librosa + soundfile (works for most formats).
    try:
        with io.BytesIO(audio_data) as buf:
            return librosa.load(buf, sr=sr)  # type: ignore[return-value]
    except sf.LibsndfileError as exc:
        # Only fall back when libsndfile blames the data itself
        # (codes in _BAD_SF_CODES). Anything else is re-raised untouched
        # so that genuine server-side failures are not masked.
        if exc.code not in _BAD_SF_CODES:
            raise
        logger.debug(
            "librosa/soundfile could not decode audio from BytesIO "
            "(code=%s: %s); falling back to torchaudio in-process decode",
            exc.code,
            exc,
        )

        # Fallback: torchaudio in-process decode (no subprocess overhead).
        # Running it inside this handler keeps the soundfile error in the
        # exception chain (as __context__) for whoever reads the traceback.
        try:
            return _decode_audio_bytes_torchaudio(audio_data, sr)
        except ImportError:
            # Missing decoder backend: a server misconfiguration, not a
            # bad audio file. Let it surface as-is.
            raise
        except Exception as ta_exc:
            logger.debug(
                "torchaudio fallback also failed: %s",
                ta_exc,
            )
            raise ValueError("Invalid or unsupported audio file.") from ta_exc