diff --git a/examples/offline_inference/ming_flash_omni/end2end.py b/examples/offline_inference/ming_flash_omni/end2end.py
index 49cdbcc018..8f87301316 100644
--- a/examples/offline_inference/ming_flash_omni/end2end.py
+++ b/examples/offline_inference/ming_flash_omni/end2end.py
@@ -6,7 +6,6 @@
 import time
 from typing import NamedTuple
 
-import librosa
 import numpy as np
 import vllm
 from PIL import Image
@@ -16,6 +15,7 @@
 from vllm.assets.image import ImageAsset
 from vllm.assets.video import VideoAsset, video_to_ndarrays
 from vllm.multimodal.image import convert_image_mode
+from vllm.multimodal.media.audio import load_audio
 from vllm.utils.argparse_utils import FlexibleArgumentParser
 
 import vllm_omni
@@ -91,7 +91,7 @@ def get_audio_query(
     if audio_path:
         if not os.path.exists(audio_path):
             raise FileNotFoundError(f"Audio file not found: {audio_path}")
-        audio_signal, sr = librosa.load(audio_path, sr=sampling_rate)
+        audio_signal, sr = load_audio(audio_path, sr=sampling_rate)
         audio_data = (audio_signal.astype(np.float32), sr)
     else:
         audio_data = AudioAsset("mary_had_lamb").audio_and_sample_rate
@@ -172,7 +172,7 @@ def get_mixed_modalities_query(
     if audio_path:
         if not os.path.exists(audio_path):
             raise FileNotFoundError(f"Audio file not found: {audio_path}")
-        sig, sr = librosa.load(audio_path, sr=sampling_rate)
+        sig, sr = load_audio(audio_path, sr=sampling_rate)
         audio_data = (sig.astype(np.float32), sr)
     else:
         audio_data = AudioAsset("mary_had_lamb").audio_and_sample_rate
diff --git a/pyproject.toml b/pyproject.toml
index 012bcd47c4..b6f4092fd1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -127,12 +127,13 @@ exclude = [
 
 [tool.ruff.lint]
 select = [
-    "E",  # pycodestyle errors
-    "W",  # pycodestyle warnings
-    "F",  # pyflakes
-    "I",  # isort (handled separately, but included for compatibility)
-    "N",  # pep8-naming
-    "UP", # pyupgrade
+    "E",      # pycodestyle errors
+    "W",      # pycodestyle warnings
+    "F",      # pyflakes
+    "I",      # isort (handled separately, but included for compatibility)
+    "N",      # pep8-naming
+    "UP",     # pyupgrade
+    "TID251", # flake8-tidy-imports.banned-api
 ]
 ignore = [
     "E203",  # whitespace before ':' (conflicts with black)
@@ -147,6 +148,9 @@
 "examples/**" = ["E501"]  # Allow long lines in examples
 "tests/**" = ["E501"]  # Allow long lines in tests
 
+[tool.ruff.lint.flake8-tidy-imports.banned-api]
+"librosa".msg = "The librosa module is banned, use vllm.multimodal helpers instead"
+
 [tool.mypy]
 python_version = "3.12, 3.13"
 warn_return_any = true
diff --git a/tests/model_executor/models/voxcpm2/test_talker_state_eviction.py b/tests/model_executor/models/voxcpm2/test_talker_state_eviction.py
index 5d8a35636b..929e8a36ad 100644
--- a/tests/model_executor/models/voxcpm2/test_talker_state_eviction.py
+++ b/tests/model_executor/models/voxcpm2/test_talker_state_eviction.py
@@ -7,7 +7,6 @@
 import pytest
 
 torch = pytest.importorskip("torch")
-pytest.importorskip("librosa")
 
 from vllm_omni.model_executor.models.voxcpm2.voxcpm2_talker import (  # noqa: E402
     VoxCPM2TalkerForConditionalGeneration,
diff --git a/vllm_omni/model_executor/models/voxcpm2/voxcpm2_talker.py b/vllm_omni/model_executor/models/voxcpm2/voxcpm2_talker.py
index 3724528898..0a9246251b 100644
--- a/vllm_omni/model_executor/models/voxcpm2/voxcpm2_talker.py
+++ b/vllm_omni/model_executor/models/voxcpm2/voxcpm2_talker.py
@@ -19,7 +19,6 @@
 from collections.abc import Iterable
 from typing import Any
 
-import librosa
 import torch
 import torch.nn as nn
 from vllm.config import VllmConfig
@@ -30,6 +29,7 @@
     WeightsMapper,
     maybe_prefix,
 )
+from vllm.multimodal.audio import AudioResampler
 from vllm.sequence import IntermediateTensors
 
 from vllm_omni.model_executor.models.output_templates import OmniOutput
@@ -145,7 +145,8 @@ def _encode_raw_audio(
         encode_sr = tts._encode_sample_rate
         if sr != encode_sr:
             audio_np = audio.squeeze(0).numpy()
-            audio_np = librosa.resample(audio_np, orig_sr=sr, target_sr=encode_sr)
+            resampler = AudioResampler(target_sr=encode_sr)
+            audio_np = resampler.resample(audio_np, orig_sr=sr)
             audio = torch.from_numpy(audio_np).unsqueeze(0)
 
         patch_len = tts.patch_size * tts.chunk_size