diff --git a/requirements/test.in b/requirements/test.in index 5e6e3256a725..5be449b62b5f 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -10,6 +10,7 @@ pytest-cov # testing utils albumentations # required for Nemotron Parse in test_common.py +av # required for audio_in_video tests backoff # required for phi4mm test blobfile # required for kimi-vl test einops # required for MPT, qwen-vl diff --git a/requirements/test.txt b/requirements/test.txt index ac5fb9c2edff..0e7e5252a1e8 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -62,6 +62,8 @@ attrs==24.2.0 # referencing audioread==3.0.1 # via librosa +av==16.1.0 # via -r requirements/test.in backoff==2.2.1 # via # -r requirements/test.in diff --git a/tests/entrypoints/openai/test_audio_in_video.py b/tests/entrypoints/openai/test_audio_in_video.py new file mode 100644 index 000000000000..cf715b83aa19 --- /dev/null +++ b/tests/entrypoints/openai/test_audio_in_video.py @@ -0,0 +1,80 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import base64 +import json + +import openai +import pytest +import pytest_asyncio + +from ...conftest import VideoTestAssets +from ...utils import RemoteOpenAIServer + +MODEL_NAME = "Qwen/Qwen2.5-Omni-3B" + + +@pytest.fixture +def server(): + args = [ + "--max-model-len", + "8192", + "--enforce-eager", + "--limit-mm-per-prompt", + json.dumps({"audio": 1, "video": 1}), + ] + + with RemoteOpenAIServer( + MODEL_NAME, + args, + ) as remote_server: + yield remote_server + + +@pytest_asyncio.fixture +async def client(server): + async with server.get_async_client() as async_client: + yield async_client + + +@pytest.mark.core_model +@pytest.mark.asyncio +async def test_online_audio_in_video( + client: openai.AsyncOpenAI, video_assets: VideoTestAssets +): + """Test video input with `audio_in_video=True`""" + + # we don't use video_urls above because they lack an audio stream.
+ video_path = video_assets[0].video_path + with open(video_path, "rb") as f: + video_base64 = base64.b64encode(f.read()).decode("utf-8") + + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": "What's in this video?"}, + { + "type": "video_url", + "video_url": {"url": f"data:video/mp4;base64,{video_base64}"}, + }, + ], + } + ] + + # multi-turn to test mm processor cache as well + for _ in range(2): + chat_completion = await client.chat.completions.create( + model=MODEL_NAME, + messages=messages, + max_tokens=16, + extra_body={ + "mm_processor_kwargs": { + "use_audio_in_video": True, + } + }, + ) + + assert len(chat_completion.choices) == 1 + choice = chat_completion.choices[0] + assert choice.finish_reason == "length" diff --git a/tests/multimodal/media/test_audio.py b/tests/multimodal/media/test_audio.py index a6eb313f1bcc..d7fe891dd6d8 100644 --- a/tests/multimodal/media/test_audio.py +++ b/tests/multimodal/media/test_audio.py @@ -4,6 +4,7 @@ from pathlib import Path from unittest.mock import patch +import librosa import numpy as np import pytest @@ -71,3 +72,13 @@ def write_to_buffer(buffer, *_args, **_kwargs): decoded = base64.b64decode(out) assert decoded == b"dummy_wav_data" mock_write.assert_called_once() + + +def test_audio_media_io_from_video(video_assets): + audio_io = AudioMediaIO() + video_path = video_assets[0].video_path + with open(video_path, "rb") as f: + audio, sr = audio_io.load_bytes(f.read()) + audio_ref, sr_ref = librosa.load(video_path, sr=None) + assert sr == sr_ref + np.testing.assert_allclose(audio_ref, audio, atol=1e-4) diff --git a/vllm/entrypoints/serve/render/serving.py b/vllm/entrypoints/serve/render/serving.py index 0ff737824596..92d66c2535be 100644 --- a/vllm/entrypoints/serve/render/serving.py +++ b/vllm/entrypoints/serve/render/serving.py @@ -366,6 +366,7 @@ async def _preprocess_chat( (ResponsesRequest not supported here); TODO comment dropped accordingly. 
""" renderer = self.renderer + mm_config = self.model_config.multimodal_config default_template_kwargs = merge_kwargs( default_template_kwargs, @@ -378,7 +379,11 @@ async def _preprocess_chat( tok_params = request.build_tok_params(self.model_config) chat_params = request.build_chat_params( default_template, default_template_content_format - ).with_defaults(default_template_kwargs) + ).with_defaults( + default_template_kwargs, + default_media_io_kwargs=(mm_config.media_io_kwargs if mm_config else None), + default_mm_processor_kwargs=getattr(request, "mm_processor_kwargs", None), + ) (conversation,), (engine_prompt,) = await renderer.render_chat_async( [messages],