diff --git a/requirements/test.in b/requirements/test.in index 5e6e3256a725..5be449b62b5f 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -10,6 +10,7 @@ pytest-cov # testing utils albumentations # required for Nemotron Parse in test_common.py +av # required for audio_in_video tests backoff # required for phi4mm test blobfile # required for kimi-vl test einops # required for MPT, qwen-vl diff --git a/requirements/test.txt b/requirements/test.txt index ac5fb9c2edff..0e7e5252a1e8 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -62,6 +62,8 @@ attrs==24.2.0 # referencing audioread==3.0.1 # via librosa +av==16.1.0 # via -r requirements/test.in backoff==2.2.1 # via # -r requirements/test.in diff --git a/tests/entrypoints/openai/test_audio_in_video.py b/tests/entrypoints/openai/test_audio_in_video.py new file mode 100644 index 000000000000..cf715b83aa19 --- /dev/null +++ b/tests/entrypoints/openai/test_audio_in_video.py @@ -0,0 +1,80 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import base64 +import json + +import openai +import pytest +import pytest_asyncio + +from ...conftest import VideoTestAssets +from ...utils import RemoteOpenAIServer + +MODEL_NAME = "Qwen/Qwen2.5-Omni-3B" + + +@pytest.fixture +def server(): + args = [ + "--max-model-len", + "8192", + "--enforce-eager", + "--limit-mm-per-prompt", + json.dumps({"audio": 1, "video": 1}), + ] + + with RemoteOpenAIServer( + MODEL_NAME, + args, + ) as remote_server: + yield remote_server + + +@pytest_asyncio.fixture +async def client(server): + async with server.get_async_client() as async_client: + yield async_client + + +@pytest.mark.core_model +@pytest.mark.asyncio +async def test_online_audio_in_video( + client: openai.AsyncOpenAI, video_assets: VideoTestAssets +): + """Test video input with `audio_in_video=True`""" + + # we don't use video_urls above because they lack an audio stream.
+ video_path = video_assets[0].video_path + with open(video_path, "rb") as f: + video_base64 = base64.b64encode(f.read()).decode("utf-8") + + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": "What's in this video?"}, + { + "type": "video_url", + "video_url": {"url": f"data:video/mp4;base64,{video_base64}"}, + }, + ], + } + ] + + # multi-turn to test mm processor cache as well + for _ in range(2): + chat_completion = await client.chat.completions.create( + model=MODEL_NAME, + messages=messages, + max_tokens=16, + extra_body={ + "mm_processor_kwargs": { + "use_audio_in_video": True, + } + }, + ) + + assert len(chat_completion.choices) == 1 + choice = chat_completion.choices[0] + assert choice.finish_reason == "length" diff --git a/tests/multimodal/media/test_audio.py b/tests/multimodal/media/test_audio.py index a6eb313f1bcc..d7fe891dd6d8 100644 --- a/tests/multimodal/media/test_audio.py +++ b/tests/multimodal/media/test_audio.py @@ -4,6 +4,7 @@ from pathlib import Path from unittest.mock import patch +import librosa import numpy as np import pytest @@ -71,3 +72,13 @@ def write_to_buffer(buffer, *_args, **_kwargs): decoded = base64.b64decode(out) assert decoded == b"dummy_wav_data" mock_write.assert_called_once() + + +def test_audio_media_io_from_video(video_assets): + audio_io = AudioMediaIO() + video_path = video_assets[0].video_path + with open(video_path, "rb") as f: + audio, sr = audio_io.load_bytes(f.read()) + audio_ref, sr_ref = librosa.load(video_path, sr=None) + assert sr == sr_ref + np.testing.assert_allclose(audio_ref, audio, atol=1e-4) diff --git a/vllm/entrypoints/serve/render/serving.py b/vllm/entrypoints/serve/render/serving.py index 0ff737824596..92d66c2535be 100644 --- a/vllm/entrypoints/serve/render/serving.py +++ b/vllm/entrypoints/serve/render/serving.py @@ -366,6 +366,7 @@ async def _preprocess_chat( (ResponsesRequest not supported here); TODO comment dropped accordingly. 
""" renderer = self.renderer + mm_config = self.model_config.multimodal_config default_template_kwargs = merge_kwargs( default_template_kwargs, @@ -378,7 +379,11 @@ async def _preprocess_chat( tok_params = request.build_tok_params(self.model_config) chat_params = request.build_chat_params( default_template, default_template_content_format - ).with_defaults(default_template_kwargs) + ).with_defaults( + default_template_kwargs, + default_media_io_kwargs=(mm_config.media_io_kwargs if mm_config else None), + default_mm_processor_kwargs=getattr(request, "mm_processor_kwargs", None), + ) (conversation,), (engine_prompt,) = await renderer.render_chat_async( [messages],