Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions examples/offline_inference/ming_flash_omni/end2end.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
import time
from typing import NamedTuple

import librosa
import numpy as np
import vllm
from PIL import Image
Expand All @@ -16,6 +15,7 @@
from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset, video_to_ndarrays
from vllm.multimodal.image import convert_image_mode
from vllm.multimodal.media.audio import load_audio
from vllm.utils.argparse_utils import FlexibleArgumentParser

import vllm_omni
Expand Down Expand Up @@ -91,7 +91,7 @@ def get_audio_query(
if audio_path:
if not os.path.exists(audio_path):
raise FileNotFoundError(f"Audio file not found: {audio_path}")
audio_signal, sr = librosa.load(audio_path, sr=sampling_rate)
audio_signal, sr = load_audio(audio_path, sr=sampling_rate)
audio_data = (audio_signal.astype(np.float32), sr)
else:
audio_data = AudioAsset("mary_had_lamb").audio_and_sample_rate
Expand Down Expand Up @@ -172,7 +172,7 @@ def get_mixed_modalities_query(
if audio_path:
if not os.path.exists(audio_path):
raise FileNotFoundError(f"Audio file not found: {audio_path}")
sig, sr = librosa.load(audio_path, sr=sampling_rate)
sig, sr = load_audio(audio_path, sr=sampling_rate)
audio_data = (sig.astype(np.float32), sr)
else:
audio_data = AudioAsset("mary_had_lamb").audio_and_sample_rate
Expand Down
16 changes: 10 additions & 6 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -127,12 +127,13 @@ exclude = [

[tool.ruff.lint]
select = [
"E", # pycodestyle errors
"W", # pycodestyle warnings
"F", # pyflakes
"I", # isort (handled separately, but included for compatibility)
"N", # pep8-naming
"UP", # pyupgrade
"E", # pycodestyle errors
"W", # pycodestyle warnings
"F", # pyflakes
"I", # isort (handled separately, but included for compatibility)
"N", # pep8-naming
"UP", # pyupgrade
"TID251", # flake8-tidy-imports.banned-api
]
ignore = [
"E203", # whitespace before ':' (conflicts with black)
Expand All @@ -147,6 +148,9 @@ ignore = [
"examples/**" = ["E501"] # Allow long lines in examples
"tests/**" = ["E501"] # Allow long lines in tests

[tool.ruff.lint.flake8-tidy-imports.banned-api]
"librosa".msg = "The librosa module is banned, use vllm.multimodal helpers instead"

[tool.mypy]
python_version = "3.12"
warn_return_any = true
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
import pytest

torch = pytest.importorskip("torch")
pytest.importorskip("librosa")

from vllm_omni.model_executor.models.voxcpm2.voxcpm2_talker import ( # noqa: E402
VoxCPM2TalkerForConditionalGeneration,
Expand Down
5 changes: 3 additions & 2 deletions vllm_omni/model_executor/models/voxcpm2/voxcpm2_talker.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
from collections.abc import Iterable
from typing import Any

import librosa
import torch
import torch.nn as nn
from vllm.config import VllmConfig
Expand All @@ -30,6 +29,7 @@
WeightsMapper,
maybe_prefix,
)
from vllm.multimodal.audio import AudioResampler
from vllm.sequence import IntermediateTensors

from vllm_omni.model_executor.models.output_templates import OmniOutput
Expand Down Expand Up @@ -145,7 +145,8 @@ def _encode_raw_audio(
encode_sr = tts._encode_sample_rate
if sr != encode_sr:
audio_np = audio.squeeze(0).numpy()
audio_np = librosa.resample(audio_np, orig_sr=sr, target_sr=encode_sr)
resampler = AudioResampler(target_sr=encode_sr)
audio_np = resampler.resample(audio_np, orig_sr=sr)
audio = torch.from_numpy(audio_np).unsqueeze(0)

patch_len = tts.patch_size * tts.chunk_size
Expand Down
Loading