diff --git a/examples/offline_inference/ming_flash_omni/end2end.py b/examples/offline_inference/ming_flash_omni/end2end.py
index 49cdbcc018..8f87301316 100644
--- a/examples/offline_inference/ming_flash_omni/end2end.py
+++ b/examples/offline_inference/ming_flash_omni/end2end.py
@@ -6,7 +6,6 @@
 import time
 from typing import NamedTuple
 
-import librosa
 import numpy as np
 import vllm
 from PIL import Image
@@ -16,6 +15,7 @@
 from vllm.assets.image import ImageAsset
 from vllm.assets.video import VideoAsset, video_to_ndarrays
 from vllm.multimodal.image import convert_image_mode
+from vllm.multimodal.media.audio import load_audio
 from vllm.utils.argparse_utils import FlexibleArgumentParser
 
 import vllm_omni
@@ -91,7 +91,7 @@ def get_audio_query(
     if audio_path:
         if not os.path.exists(audio_path):
             raise FileNotFoundError(f"Audio file not found: {audio_path}")
-        audio_signal, sr = librosa.load(audio_path, sr=sampling_rate)
+        audio_signal, sr = load_audio(audio_path, sr=sampling_rate)
         audio_data = (audio_signal.astype(np.float32), sr)
     else:
         audio_data = AudioAsset("mary_had_lamb").audio_and_sample_rate
@@ -172,7 +172,7 @@ def get_mixed_modalities_query(
     if audio_path:
         if not os.path.exists(audio_path):
             raise FileNotFoundError(f"Audio file not found: {audio_path}")
-        sig, sr = librosa.load(audio_path, sr=sampling_rate)
+        sig, sr = load_audio(audio_path, sr=sampling_rate)
         audio_data = (sig.astype(np.float32), sr)
     else:
         audio_data = AudioAsset("mary_had_lamb").audio_and_sample_rate
diff --git a/pyproject.toml b/pyproject.toml
index 012bcd47c4..b6f4092fd1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -127,12 +127,13 @@ exclude = [
 
 [tool.ruff.lint]
 select = [
-    "E",  # pycodestyle errors
-    "W",  # pycodestyle warnings
-    "F",  # pyflakes
-    "I",  # isort (handled separately, but included for compatibility)
-    "N",  # pep8-naming
-    "UP", # pyupgrade
+    "E",      # pycodestyle errors
+    "W",      # pycodestyle warnings
+    "F",      # pyflakes
+    "I",      # isort (handled separately, but included for compatibility)
+    "N",      # pep8-naming
+    "UP",     # pyupgrade
+    "TID251", # flake8-tidy-imports.banned-api
 ]
 ignore = [
     "E203",  # whitespace before ':' (conflicts with black)
@@ -147,6 +148,9 @@
 "examples/**" = ["E501"]  # Allow long lines in examples
 "tests/**" = ["E501"]  # Allow long lines in tests
 
+[tool.ruff.lint.flake8-tidy-imports.banned-api]
+"librosa".msg = "The librosa module is banned, use vllm.multimodal helpers instead"
+
 [tool.mypy]
 python_version = "3.12, 3.13"
 warn_return_any = true
diff --git a/tests/model_executor/models/voxcpm2/test_talker_state_eviction.py b/tests/model_executor/models/voxcpm2/test_talker_state_eviction.py
index 5d8a35636b..929e8a36ad 100644
--- a/tests/model_executor/models/voxcpm2/test_talker_state_eviction.py
+++ b/tests/model_executor/models/voxcpm2/test_talker_state_eviction.py
@@ -7,7 +7,6 @@
 import pytest
 
 torch = pytest.importorskip("torch")
-pytest.importorskip("librosa")
 
 from vllm_omni.model_executor.models.voxcpm2.voxcpm2_talker import (  # noqa: E402
     VoxCPM2TalkerForConditionalGeneration,
diff --git a/vllm_omni/model_executor/models/voxcpm2/voxcpm2_talker.py b/vllm_omni/model_executor/models/voxcpm2/voxcpm2_talker.py
index 3724528898..0a9246251b 100644
--- a/vllm_omni/model_executor/models/voxcpm2/voxcpm2_talker.py
+++ b/vllm_omni/model_executor/models/voxcpm2/voxcpm2_talker.py
@@ -19,7 +19,6 @@
 from collections.abc import Iterable
 from typing import Any
 
-import librosa
 import torch
 import torch.nn as nn
 from vllm.config import VllmConfig
@@ -30,6 +29,7 @@
     WeightsMapper,
     maybe_prefix,
 )
+from vllm.multimodal.audio import AudioResampler
 from vllm.sequence import IntermediateTensors
 
 from vllm_omni.model_executor.models.output_templates import OmniOutput
@@ -145,7 +145,8 @@ def _encode_raw_audio(
         encode_sr = tts._encode_sample_rate
         if sr != encode_sr:
             audio_np = audio.squeeze(0).numpy()
-            audio_np = librosa.resample(audio_np, orig_sr=sr, target_sr=encode_sr)
+            resampler = AudioResampler(target_sr=encode_sr)
+            audio_np = resampler.resample(audio_np, orig_sr=sr)
             audio = torch.from_numpy(audio_np).unsqueeze(0)
 
         patch_len = tts.patch_size * tts.chunk_size