Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions requirements/test.in
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ pytest-cov

# testing utils
albumentations # required for Nemotron Parse in test_common.py
av # required for audio_in_video tests
backoff # required for phi4mm test
blobfile # required for kimi-vl test
einops # required for MPT, qwen-vl
Expand Down
2 changes: 2 additions & 0 deletions requirements/test.txt
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,8 @@ attrs==24.2.0
# referencing
audioread==3.0.1
# via librosa
av==16.1.0
# via -r requirements/test.in
backoff==2.2.1
# via
# -r requirements/test.in
Expand Down
80 changes: 80 additions & 0 deletions tests/entrypoints/openai/test_audio_in_video.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import base64
import json

import openai
import pytest
import pytest_asyncio

from ...conftest import VideoTestAssets
from ...utils import RemoteOpenAIServer

MODEL_NAME = "Qwen/Qwen2.5-Omni-3B"


@pytest.fixture
def server():
    """Launch an OpenAI-compatible vLLM server for the Omni model.

    Limits the prompt to one audio and one video item, which is all the
    audio-in-video test needs.
    """
    media_limits = json.dumps({"audio": 1, "video": 1})
    server_args = [
        "--max-model-len",
        "8192",
        "--enforce-eager",
        "--limit-mm-per-prompt",
        media_limits,
    ]
    with RemoteOpenAIServer(MODEL_NAME, server_args) as remote_server:
        yield remote_server


@pytest_asyncio.fixture
async def client(server):
    """Yield an async OpenAI client bound to the test server fixture."""
    async with server.get_async_client() as async_client:
        yield async_client


@pytest.mark.core_model
@pytest.mark.asyncio
async def test_online_audio_in_video(
    client: openai.AsyncOpenAI, video_assets: VideoTestAssets
):
    """Test video input with `audio_in_video=True`"""

    # The canned video URLs lack an audio track, so read the local asset
    # (which has one) and inline it as a base64 data URL instead.
    video_path = video_assets[0].video_path
    with open(video_path, "rb") as video_file:
        encoded_video = base64.b64encode(video_file.read()).decode("utf-8")

    user_content = [
        {"type": "text", "text": "What's in this video?"},
        {
            "type": "video_url",
            "video_url": {"url": f"data:video/mp4;base64,{encoded_video}"},
        },
    ]
    messages = [{"role": "user", "content": user_content}]

    # Send the identical request twice so the second turn exercises the
    # multimodal processor cache as well.
    for _turn in range(2):
        response = await client.chat.completions.create(
            model=MODEL_NAME,
            messages=messages,
            max_tokens=16,
            extra_body={
                "mm_processor_kwargs": {
                    "use_audio_in_video": True,
                }
            },
        )

        assert len(response.choices) == 1
        first_choice = response.choices[0]
        # max_tokens=16 should always be hit before a natural stop.
        assert first_choice.finish_reason == "length"
11 changes: 11 additions & 0 deletions tests/multimodal/media/test_audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from pathlib import Path
from unittest.mock import patch

import librosa
import numpy as np
import pytest

Expand Down Expand Up @@ -71,3 +72,13 @@ def write_to_buffer(buffer, *_args, **_kwargs):
decoded = base64.b64decode(out)
assert decoded == b"dummy_wav_data"
mock_write.assert_called_once()


def test_audio_media_io_from_video(video_assets):
    """AudioMediaIO should decode a video's audio track to match librosa.

    Loads the raw video bytes through AudioMediaIO and compares the
    waveform and sample rate against librosa's reference decoding of the
    same file.
    """
    video_path = video_assets[0].video_path

    with open(video_path, "rb") as video_file:
        raw_bytes = video_file.read()
    decoded_audio, sample_rate = AudioMediaIO().load_bytes(raw_bytes)

    reference_audio, reference_sr = librosa.load(video_path, sr=None)
    assert sample_rate == reference_sr
    np.testing.assert_allclose(reference_audio, decoded_audio, atol=1e-4)
7 changes: 6 additions & 1 deletion vllm/entrypoints/serve/render/serving.py
Original file line number Diff line number Diff line change
Expand Up @@ -366,6 +366,7 @@ async def _preprocess_chat(
(ResponsesRequest not supported here); TODO comment dropped accordingly.
"""
renderer = self.renderer
mm_config = self.model_config.multimodal_config

default_template_kwargs = merge_kwargs(
default_template_kwargs,
Expand All @@ -378,7 +379,11 @@ async def _preprocess_chat(
tok_params = request.build_tok_params(self.model_config)
chat_params = request.build_chat_params(
default_template, default_template_content_format
).with_defaults(default_template_kwargs)
).with_defaults(
default_template_kwargs,
default_media_io_kwargs=(mm_config.media_io_kwargs if mm_config else None),
default_mm_processor_kwargs=getattr(request, "mm_processor_kwargs", None),
)

(conversation,), (engine_prompt,) = await renderer.render_chat_async(
[messages],
Expand Down
Loading