From 15bae255543002ad17bac5781f807b084c38f693 Mon Sep 17 00:00:00 2001 From: Tianyu Guo Date: Sat, 7 Mar 2026 16:40:14 +0800 Subject: [PATCH 01/14] Support online use_audio_in_video Signed-off-by: Tianyu Guo --- vllm/entrypoints/chat_utils.py | 100 ++++++++++++++++++++++++++++++++- vllm/renderers/base.py | 14 +++-- vllm/renderers/hf.py | 6 +- 3 files changed, 114 insertions(+), 6 deletions(-) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 6677350f41bf..f821c2ba1a2f 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -4,6 +4,7 @@ import asyncio import json import warnings +import numpy as np from abc import ABC, abstractmethod from collections import Counter, defaultdict from collections.abc import Awaitable, Callable, Iterable @@ -11,7 +12,7 @@ from functools import cached_property, lru_cache, partial from itertools import accumulate from pathlib import Path -from typing import TYPE_CHECKING, Any, Generic, Literal, TypeAlias, TypeVar, cast +from typing import TYPE_CHECKING, Any, Generic, Literal, TypeAlias, TypeVar, cast, Union, Optional from openai.types.chat import ( ChatCompletionAssistantMessageParam, @@ -898,10 +899,15 @@ def __init__(self, tracker: AsyncMultiModalItemTracker) -> None: allowed_local_media_path=tracker.allowed_local_media_path, allowed_media_domains=tracker.allowed_media_domains, ) + self._mm_processor_kwargs: Optional[dict[str, Any]] = None @property def model_config(self) -> ModelConfig: return self._tracker.model_config + + def set_mm_processor_kwargs(self, mm_processor_kwargs: Optional[dict[str, Any]]) -> None: + """Set mm_processor_kwargs for use in parsing.""" + self._mm_processor_kwargs = mm_processor_kwargs async def _image_with_uuid_async(self, image_url: str | None, uuid: str | None): image = ( @@ -1032,6 +1038,87 @@ def parse_video(self, video_url: str | None, uuid: str | None = None) -> None: placeholder = self._tracker.add("video", coro) self._add_placeholder("video", placeholder) + + # Extract audio from video if use_audio_in_video is True + if video_url and self._mm_processor_kwargs and self._mm_processor_kwargs.get("use_audio_in_video", False): + audio_coro = self._extract_audio_from_video_async(video_url, uuid) + audio_placeholder = self._tracker.add("audio", audio_coro) + self._add_placeholder("audio", audio_placeholder) + + async def _extract_audio_from_video_async(self, video_url: str, uuid: str | None = None): + """ + Extract audio from video URL using librosa. + Returns tuple of (audio_array, sample_rate) compatible with audio format. + + All blocking I/O operations are run in a thread pool to avoid blocking the event loop. 
+ """ + import asyncio + import os + import tempfile + from urllib.parse import urlparse + + # Parse URL to determine type + parsed_url = urlparse(video_url) + temp_video_file_path = None + + def _download_video_sync(url: str) -> bytes: + """Synchronous video download - runs in thread pool.""" + from urllib.request import urlopen + + return urlopen(url).read() + + def _write_temp_file_sync(data: bytes, suffix: str) -> str: + """Synchronous temp file write - runs in thread pool.""" + with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file: + temp_file.write(data) + return temp_file.name + + def _load_audio_sync(file_path: str) -> tuple[np.ndarray, Union[int, float]]: + """Synchronous audio loading with librosa - runs in thread pool.""" + import librosa + + return librosa.load(file_path, sr=16000) + + def _cleanup_file_sync(file_path: str) -> None: + """Synchronous file deletion - runs in thread pool.""" + try: + if os.path.exists(file_path): + os.unlink(file_path) + except OSError: + pass + + try: + if parsed_url.scheme in ("http", "https"): + # Download video from HTTP/HTTPS URL asynchronously + video_data = await asyncio.to_thread(_download_video_sync, video_url) + # Write temp file asynchronously + temp_video_file_path = await asyncio.to_thread(_write_temp_file_sync, video_data, ".mp4") + elif parsed_url.scheme == "file": + # Use file path directly (handle Windows paths) + from urllib.request import url2pathname + + temp_video_file_path = url2pathname(parsed_url.path) + elif parsed_url.scheme == "data": + # Handle data URL (base64 encoded video) + import base64 + + header, data = video_url.split(",", 1) + video_data = base64.b64decode(data) + # Write temp file asynchronously + temp_video_file_path = await asyncio.to_thread(_write_temp_file_sync, video_data, ".mp4") + else: + # Assume it's a local file path + temp_video_file_path = video_url + + # Extract audio using librosa asynchronously (CPU-intensive, runs in thread pool) + audio_array, sample_rate = await asyncio.to_thread(_load_audio_sync, temp_video_file_path) + + return (audio_array, sample_rate), uuid + finally: + # Clean up temporary file if we created one (asynchronously) + if temp_video_file_path and parsed_url.scheme in ("http", "https", "data"): + await asyncio.to_thread(_cleanup_file_sync, temp_video_file_path) + @dataclass @@ -1343,10 +1430,15 @@ def _parse_chat_message_content_parts( *, wrap_dicts: bool, interleave_strings: bool, + mm_processor_kwargs: Optional[dict[str, Any]] = None, ) -> list[ConversationMessage]: content = list[_ContentPart]() mm_parser = mm_tracker.create_parser() + + # Set mm_processor_kwargs if parser supports it + if hasattr(mm_parser, "set_mm_processor_kwargs"): + mm_parser.set_mm_processor_kwargs(mm_processor_kwargs) for part in parts: parse_res = _parse_chat_message_content_part( @@ -1464,6 +1556,7 @@ def _parse_chat_message_content( mm_tracker: BaseMultiModalItemTracker, content_format: ChatTemplateContentFormat, interleave_strings: bool, + mm_processor_kwargs: Optional[dict[str, Any]] = None, ) -> list[ConversationMessage]: role = message["role"] content = message.get("content") @@ -1479,6 +1572,7 @@ def _parse_chat_message_content( mm_tracker, wrap_dicts=(content_format == "openai"), interleave_strings=interleave_strings, + mm_processor_kwargs=mm_processor_kwargs, ) for result_msg in result: @@ -1540,6 +1634,7 @@ def parse_chat_messages( model_config: ModelConfig, content_format: ChatTemplateContentFormat, media_io_kwargs: dict[str, dict[str, Any]] | None = None, + 
mm_processor_kwargs: Optional[dict[str, Any]] = None, ) -> tuple[ list[ConversationMessage], MultiModalDataDict | None, @@ -1558,6 +1653,7 @@ def parse_chat_messages( and model_config.multimodal_config is not None and model_config.multimodal_config.interleave_mm_strings ), + mm_processor_kwargs=mm_processor_kwargs, ) conversation.extend(sub_messages) @@ -1574,6 +1670,7 @@ async def parse_chat_messages_async( model_config: ModelConfig, content_format: ChatTemplateContentFormat, media_io_kwargs: dict[str, dict[str, Any]] | None = None, + mm_processor_kwargs: Optional[dict[str, Any]] = None, ) -> tuple[ list[ConversationMessage], MultiModalDataDict | None, @@ -1594,6 +1691,7 @@ async def parse_chat_messages_async( and model_config.multimodal_config is not None and model_config.multimodal_config.interleave_mm_strings ), + mm_processor_kwargs=mm_processor_kwargs, ) conversation.extend(sub_messages) diff --git a/vllm/renderers/base.py b/vllm/renderers/base.py index b19753e48423..a2a6b10b0575 100644 --- a/vllm/renderers/base.py +++ b/vllm/renderers/base.py @@ -5,7 +5,7 @@ from abc import ABC, abstractmethod from collections.abc import Mapping, Sequence from functools import cached_property -from typing import TYPE_CHECKING, Any, Generic, overload +from typing import TYPE_CHECKING, Any, Generic, overload, Optional from typing_extensions import TypeVar @@ -263,6 +263,7 @@ def render_messages( self, messages: list["ChatCompletionMessageParam"], params: ChatParams, + mm_processor_kwargs: Optional[dict[str, Any]] = None, ) -> tuple[list["ConversationMessage"], DictPrompt]: raise NotImplementedError @@ -270,8 +271,9 @@ async def render_messages_async( self, messages: list["ChatCompletionMessageParam"], params: ChatParams, + mm_processor_kwargs: Optional[dict[str, Any]] = None, ) -> tuple[list["ConversationMessage"], DictPrompt]: - return self.render_messages(messages, params) + return self.render_messages(messages, params, mm_processor_kwargs) # Step 2: Tokenize prompts if necessary def _tokenize_prompt( @@ -711,9 +713,11 @@ def render_chat( if tok_params is None: tok_params = self.default_chat_tok_params + + mm_processor_kwargs = prompt_extras.get("mm_processor_kwargs", None) rendered = [ - self.render_messages(conversation, chat_params) + self.render_messages(conversation, chat_params, mm_processor_kwargs) for conversation in conversations ] @@ -745,9 +749,11 @@ async def render_chat_async( if tok_params is None: tok_params = self.default_chat_tok_params + + mm_processor_kwargs = prompt_extras.get("mm_processor_kwargs", None) rendered = [ - self.render_messages_async(conversation, chat_params) + self.render_messages_async(conversation, chat_params, mm_processor_kwargs) for conversation in conversations ] diff --git a/vllm/renderers/hf.py b/vllm/renderers/hf.py index f919677a0454..683a9be8ddae 100644 --- a/vllm/renderers/hf.py +++ b/vllm/renderers/hf.py @@ -5,7 +5,7 @@ from collections import defaultdict, deque from collections.abc import Set from functools import lru_cache -from typing import TYPE_CHECKING, Any, cast +from typing import TYPE_CHECKING, Any, cast, Optional import jinja2 import jinja2.ext @@ -621,6 +621,7 @@ def render_messages( self, messages: list[ChatCompletionMessageParam], params: ChatParams, + mm_processor_kwargs: Optional[dict[str, Any]] = None, ) -> tuple[list[ConversationMessage], DictPrompt]: model_config = self.model_config tokenizer = self.get_tokenizer() @@ -636,6 +637,7 @@ def render_messages( model_config=model_config, ), media_io_kwargs=params.media_io_kwargs, + 
mm_processor_kwargs=mm_processor_kwargs, ) prompt_raw = safe_apply_chat_template( @@ -676,6 +678,7 @@ async def render_messages_async( self, messages: list[ChatCompletionMessageParam], params: ChatParams, + mm_processor_kwargs: Optional[dict[str, Any]] = None, ) -> tuple[list[ConversationMessage], DictPrompt]: model_config = self.model_config tokenizer = self.get_tokenizer() @@ -691,6 +694,7 @@ async def render_messages_async( model_config=model_config, ), media_io_kwargs=params.media_io_kwargs, + mm_processor_kwargs=mm_processor_kwargs, ) prompt_raw = safe_apply_chat_template( From 5205c215f789f1611ce4760906a3d03d4ed8859e Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Sun, 8 Mar 2026 17:40:12 +0800 Subject: [PATCH 02/14] reuse audio io and disable audio_in_video cache Signed-off-by: Isotr0py Signed-off-by: Tianyu Guo --- vllm/entrypoints/chat_utils.py | 2 +- .../models/qwen2_5_omni_thinker.py | 12 ++++++++- vllm/multimodal/media/audio.py | 25 +++++++++++++++++++ 3 files changed, 37 insertions(+), 2 deletions(-) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index f821c2ba1a2f..f134eb7bcd20 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -1041,7 +1041,7 @@ def parse_video(self, video_url: str | None, uuid: str | None = None) -> None: # Extract audio from video if use_audio_in_video is True if video_url and self._mm_processor_kwargs and self._mm_processor_kwargs.get("use_audio_in_video", False): - audio_coro = self._extract_audio_from_video_async(video_url, uuid) + audio_coro = self._audio_with_uuid_async(video_url, uuid) audio_placeholder = self._tracker.add("audio", audio_coro) self._add_placeholder("audio", audio_placeholder) diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py index f53a0e9bc629..a8b6f481f734 100644 --- a/vllm/model_executor/models/qwen2_5_omni_thinker.py +++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py @@ -78,7 +78,7 @@ ModalityDataItems, MultiModalDataItems, ) -from vllm.multimodal.processing import BaseDummyInputsBuilder +from vllm.multimodal.processing import BaseDummyInputsBuilder, ProcessorInputs, TimingContext from vllm.multimodal.processing.processor import ( BaseMultiModalProcessor, MultiModalPromptUpdates, @@ -811,6 +811,16 @@ def get_replacement_qwen2_use_audio_in_video(item_idx: int): ), ] + def _cached_apply_hf_processor( + self, + inputs: ProcessorInputs, + timing_ctx: TimingContext, + ): + mm_processor_kwargs = inputs.hf_processor_mm_kwargs + if mm_processor_kwargs.get("use_audio_in_video", False): + return self._apply_hf_processor(inputs, timing_ctx) + return super()._cached_apply_hf_processor(inputs, timing_ctx) + def _apply_hf_processor_main( self, prompt: str | list[int], diff --git a/vllm/multimodal/media/audio.py b/vllm/multimodal/media/audio.py index 1c906c06c8d9..3e038b598a50 100644 --- a/vllm/multimodal/media/audio.py +++ b/vllm/multimodal/media/audio.py @@ -82,6 +82,29 @@ def extract_audio_from_video_bytes( return audio, float(native_sr) +def is_video(data: bytes) -> bool: + """Check if the fetched bytes are video""" + if len(data) < 12: + return False + + box_type = data[4:8] + major_brand = data[8:12] + + MP4_BRANDS = { + b"mp41", b"mp42", # MP4 + b"isom", # ISO Base Media + b"iso2", b"iso4", b"iso5", b"iso6", + b"M4V ", b"M4A ", # Apple + b"avc1", # H.264 + b"dash", # DASH + b"mmp4", b"MSNV", + } + + is_avi = (data[:4] == b"RIFF" and major_brand == b"RIFF") + is_mp4 = (box_type == b"ftyp" and major_brand in 
MP4_BRANDS) + return is_mp4 or is_avi + + class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]): """Configuration values can be user-provided either by --media-io-kwargs or by the runtime API field "media_io_kwargs". Ensure proper validation and @@ -100,6 +123,8 @@ def __init__(self, **kwargs) -> None: self.kwargs = kwargs def load_bytes(self, data: bytes) -> tuple[npt.NDArray, float]: + if is_video(data): + return extract_audio_from_video_bytes(data) return librosa.load(BytesIO(data), sr=None) def load_base64( From 5a66a9da293365b1b876c87a83cdb3492b2000ed Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Sun, 8 Mar 2026 17:44:30 +0800 Subject: [PATCH 03/14] clean and fix avi Signed-off-by: Isotr0py Signed-off-by: Tianyu Guo --- vllm/entrypoints/chat_utils.py | 112 ++++++++------------------------- vllm/multimodal/media/audio.py | 24 ++++--- 2 files changed, 40 insertions(+), 96 deletions(-) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index f134eb7bcd20..a479c76ca2cb 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -4,7 +4,6 @@ import asyncio import json import warnings -import numpy as np from abc import ABC, abstractmethod from collections import Counter, defaultdict from collections.abc import Awaitable, Callable, Iterable @@ -12,7 +11,15 @@ from functools import cached_property, lru_cache, partial from itertools import accumulate from pathlib import Path -from typing import TYPE_CHECKING, Any, Generic, Literal, TypeAlias, TypeVar, cast, Union, Optional +from typing import ( + TYPE_CHECKING, + Any, + Generic, + Literal, + TypeAlias, + TypeVar, + cast, +) from openai.types.chat import ( ChatCompletionAssistantMessageParam, @@ -899,13 +906,15 @@ def __init__(self, tracker: AsyncMultiModalItemTracker) -> None: allowed_local_media_path=tracker.allowed_local_media_path, allowed_media_domains=tracker.allowed_media_domains, ) - self._mm_processor_kwargs: Optional[dict[str, Any]] = None + self._mm_processor_kwargs: dict[str, Any] | None = None @property def model_config(self) -> ModelConfig: return self._tracker.model_config - - def set_mm_processor_kwargs(self, mm_processor_kwargs: Optional[dict[str, Any]]) -> None: + + def set_mm_processor_kwargs( + self, mm_processor_kwargs: dict[str, Any] | None + ) -> None: """Set mm_processor_kwargs for use in parsing.""" self._mm_processor_kwargs = mm_processor_kwargs @@ -1038,87 +1047,16 @@ def parse_video(self, video_url: str | None, uuid: str | None = None) -> None: placeholder = self._tracker.add("video", coro) self._add_placeholder("video", placeholder) - + # Extract audio from video if use_audio_in_video is True - if video_url and self._mm_processor_kwargs and self._mm_processor_kwargs.get("use_audio_in_video", False): + if ( + video_url + and self._mm_processor_kwargs + and self._mm_processor_kwargs.get("use_audio_in_video", False) + ): audio_coro = self._audio_with_uuid_async(video_url, uuid) audio_placeholder = self._tracker.add("audio", audio_coro) self._add_placeholder("audio", audio_placeholder) - - async def _extract_audio_from_video_async(self, video_url: str, uuid: str | None = None): - """ - Extract audio from video URL using librosa. - Returns tuple of (audio_array, sample_rate) compatible with audio format. - - All blocking I/O operations are run in a thread pool to avoid blocking the event loop. 
- """ - import asyncio - import os - import tempfile - from urllib.parse import urlparse - - # Parse URL to determine type - parsed_url = urlparse(video_url) - temp_video_file_path = None - - def _download_video_sync(url: str) -> bytes: - """Synchronous video download - runs in thread pool.""" - from urllib.request import urlopen - - return urlopen(url).read() - - def _write_temp_file_sync(data: bytes, suffix: str) -> str: - """Synchronous temp file write - runs in thread pool.""" - with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file: - temp_file.write(data) - return temp_file.name - - def _load_audio_sync(file_path: str) -> tuple[np.ndarray, Union[int, float]]: - """Synchronous audio loading with librosa - runs in thread pool.""" - import librosa - - return librosa.load(file_path, sr=16000) - - def _cleanup_file_sync(file_path: str) -> None: - """Synchronous file deletion - runs in thread pool.""" - try: - if os.path.exists(file_path): - os.unlink(file_path) - except OSError: - pass - - try: - if parsed_url.scheme in ("http", "https"): - # Download video from HTTP/HTTPS URL asynchronously - video_data = await asyncio.to_thread(_download_video_sync, video_url) - # Write temp file asynchronously - temp_video_file_path = await asyncio.to_thread(_write_temp_file_sync, video_data, ".mp4") - elif parsed_url.scheme == "file": - # Use file path directly (handle Windows paths) - from urllib.request import url2pathname - - temp_video_file_path = url2pathname(parsed_url.path) - elif parsed_url.scheme == "data": - # Handle data URL (base64 encoded video) - import base64 - - header, data = video_url.split(",", 1) - video_data = base64.b64decode(data) - # Write temp file asynchronously - temp_video_file_path = await asyncio.to_thread(_write_temp_file_sync, video_data, ".mp4") - else: - # Assume it's a local file path - temp_video_file_path = video_url - - # Extract audio using librosa asynchronously (CPU-intensive, runs in thread pool) - audio_array, sample_rate = await asyncio.to_thread(_load_audio_sync, temp_video_file_path) - - return (audio_array, sample_rate), uuid - finally: - # Clean up temporary file if we created one (asynchronously) - if temp_video_file_path and parsed_url.scheme in ("http", "https", "data"): - await asyncio.to_thread(_cleanup_file_sync, temp_video_file_path) - @dataclass @@ -1430,12 +1368,12 @@ def _parse_chat_message_content_parts( *, wrap_dicts: bool, interleave_strings: bool, - mm_processor_kwargs: Optional[dict[str, Any]] = None, + mm_processor_kwargs: dict[str, Any] | None = None, ) -> list[ConversationMessage]: content = list[_ContentPart]() mm_parser = mm_tracker.create_parser() - + # Set mm_processor_kwargs if parser supports it if hasattr(mm_parser, "set_mm_processor_kwargs"): mm_parser.set_mm_processor_kwargs(mm_processor_kwargs) @@ -1556,7 +1494,7 @@ def _parse_chat_message_content( mm_tracker: BaseMultiModalItemTracker, content_format: ChatTemplateContentFormat, interleave_strings: bool, - mm_processor_kwargs: Optional[dict[str, Any]] = None, + mm_processor_kwargs: dict[str, Any] | None = None, ) -> list[ConversationMessage]: role = message["role"] content = message.get("content") @@ -1634,7 +1572,7 @@ def parse_chat_messages( model_config: ModelConfig, content_format: ChatTemplateContentFormat, media_io_kwargs: dict[str, dict[str, Any]] | None = None, - mm_processor_kwargs: Optional[dict[str, Any]] = None, + mm_processor_kwargs: dict[str, Any] | None = None, ) -> tuple[ list[ConversationMessage], MultiModalDataDict | None, @@ -1670,7 
+1608,7 @@ async def parse_chat_messages_async( model_config: ModelConfig, content_format: ChatTemplateContentFormat, media_io_kwargs: dict[str, dict[str, Any]] | None = None, - mm_processor_kwargs: Optional[dict[str, Any]] = None, + mm_processor_kwargs: dict[str, Any] | None = None, ) -> tuple[ list[ConversationMessage], MultiModalDataDict | None, diff --git a/vllm/multimodal/media/audio.py b/vllm/multimodal/media/audio.py index 3e038b598a50..4f101bced1b1 100644 --- a/vllm/multimodal/media/audio.py +++ b/vllm/multimodal/media/audio.py @@ -91,17 +91,23 @@ def is_video(data: bytes) -> bool: major_brand = data[8:12] MP4_BRANDS = { - b"mp41", b"mp42", # MP4 - b"isom", # ISO Base Media - b"iso2", b"iso4", b"iso5", b"iso6", - b"M4V ", b"M4A ", # Apple - b"avc1", # H.264 - b"dash", # DASH - b"mmp4", b"MSNV", + b"mp41", + b"mp42", # MP4 + b"isom", # ISO Base Media + b"iso2", + b"iso4", + b"iso5", + b"iso6", + b"M4V ", + b"M4A ", # Apple + b"avc1", # H.264 + b"dash", # DASH + b"mmp4", + b"MSNV", } - is_avi = (data[:4] == b"RIFF" and major_brand == b"RIFF") - is_mp4 = (box_type == b"ftyp" and major_brand in MP4_BRANDS) + is_avi = data[:4] == b"RIFF" and major_brand == b"AVI " + is_mp4 = box_type == b"ftyp" and major_brand in MP4_BRANDS return is_mp4 or is_avi From 1ca1407b9cee748ae6a6829699043c1f9b5ec96b Mon Sep 17 00:00:00 2001 From: Tianyu Guo Date: Mon, 9 Mar 2026 02:20:45 +0000 Subject: [PATCH 04/14] Lint Signed-off-by: Tianyu Guo --- vllm/model_executor/models/qwen2_5_omni_thinker.py | 6 +++++- vllm/renderers/base.py | 10 +++++----- vllm/renderers/hf.py | 6 +++--- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py index a8b6f481f734..16d3f469338f 100644 --- a/vllm/model_executor/models/qwen2_5_omni_thinker.py +++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py @@ -78,7 +78,11 @@ ModalityDataItems, MultiModalDataItems, ) -from vllm.multimodal.processing import BaseDummyInputsBuilder, ProcessorInputs, TimingContext +from vllm.multimodal.processing import ( + BaseDummyInputsBuilder, + ProcessorInputs, + TimingContext, +) from vllm.multimodal.processing.processor import ( BaseMultiModalProcessor, MultiModalPromptUpdates, diff --git a/vllm/renderers/base.py b/vllm/renderers/base.py index a2a6b10b0575..bc205fc7a3bf 100644 --- a/vllm/renderers/base.py +++ b/vllm/renderers/base.py @@ -5,7 +5,7 @@ from abc import ABC, abstractmethod from collections.abc import Mapping, Sequence from functools import cached_property -from typing import TYPE_CHECKING, Any, Generic, overload, Optional +from typing import TYPE_CHECKING, Any, Generic, overload from typing_extensions import TypeVar @@ -263,7 +263,7 @@ def render_messages( self, messages: list["ChatCompletionMessageParam"], params: ChatParams, - mm_processor_kwargs: Optional[dict[str, Any]] = None, + mm_processor_kwargs: dict[str, Any] | None = None, ) -> tuple[list["ConversationMessage"], DictPrompt]: raise NotImplementedError @@ -271,7 +271,7 @@ async def render_messages_async( self, messages: list["ChatCompletionMessageParam"], params: ChatParams, - mm_processor_kwargs: Optional[dict[str, Any]] = None, + mm_processor_kwargs: dict[str, Any] | None = None, ) -> tuple[list["ConversationMessage"], DictPrompt]: return self.render_messages(messages, params, mm_processor_kwargs) @@ -713,7 +713,7 @@ def render_chat( if tok_params is None: tok_params = self.default_chat_tok_params - + mm_processor_kwargs = prompt_extras.get("mm_processor_kwargs", 
None) rendered = [ @@ -749,7 +749,7 @@ async def render_chat_async( if tok_params is None: tok_params = self.default_chat_tok_params - + mm_processor_kwargs = prompt_extras.get("mm_processor_kwargs", None) rendered = [ diff --git a/vllm/renderers/hf.py b/vllm/renderers/hf.py index 683a9be8ddae..ab5d075cb4de 100644 --- a/vllm/renderers/hf.py +++ b/vllm/renderers/hf.py @@ -5,7 +5,7 @@ from collections import defaultdict, deque from collections.abc import Set from functools import lru_cache -from typing import TYPE_CHECKING, Any, cast, Optional +from typing import TYPE_CHECKING, Any, cast import jinja2 import jinja2.ext @@ -621,7 +621,7 @@ def render_messages( self, messages: list[ChatCompletionMessageParam], params: ChatParams, - mm_processor_kwargs: Optional[dict[str, Any]] = None, + mm_processor_kwargs: dict[str, Any] | None = None, ) -> tuple[list[ConversationMessage], DictPrompt]: model_config = self.model_config tokenizer = self.get_tokenizer() @@ -678,7 +678,7 @@ async def render_messages_async( self, messages: list[ChatCompletionMessageParam], params: ChatParams, - mm_processor_kwargs: Optional[dict[str, Any]] = None, + mm_processor_kwargs: dict[str, Any] | None = None, ) -> tuple[list[ConversationMessage], DictPrompt]: model_config = self.model_config tokenizer = self.get_tokenizer() From 1b50736a529d1a77b14cf9711eac7262b3e53ded Mon Sep 17 00:00:00 2001 From: Tianyu Guo Date: Mon, 9 Mar 2026 11:46:09 +0800 Subject: [PATCH 05/14] Lint Signed-off-by: Tianyu Guo --- vllm/renderers/deepseek_v32.py | 4 ++++ vllm/renderers/grok2.py | 4 ++++ vllm/renderers/mistral.py | 4 ++++ vllm/renderers/terratorch.py | 4 ++++ 4 files changed, 16 insertions(+) diff --git a/vllm/renderers/deepseek_v32.py b/vllm/renderers/deepseek_v32.py index df510cf26a39..d64b931b8ff7 100644 --- a/vllm/renderers/deepseek_v32.py +++ b/vllm/renderers/deepseek_v32.py @@ -43,6 +43,7 @@ def render_messages( self, messages: list[ChatCompletionMessageParam], params: ChatParams, + mm_processor_kwargs: dict[str, Any] | None = None, ) -> tuple[list[ConversationMessage], DictPrompt]: tokenizer = self.get_tokenizer() conversation, mm_data, mm_uuids = parse_chat_messages( @@ -50,6 +51,7 @@ def render_messages( self.model_config, content_format="string", media_io_kwargs=params.media_io_kwargs, + mm_processor_kwargs=mm_processor_kwargs, ) prompt_raw = tokenizer.apply_chat_template( @@ -70,6 +72,7 @@ async def render_messages_async( self, messages: list[ChatCompletionMessageParam], params: ChatParams, + mm_processor_kwargs: dict[str, Any] | None = None, ) -> tuple[list[ConversationMessage], DictPrompt]: tokenizer = self.get_tokenizer() conversation, mm_data, mm_uuids = await parse_chat_messages_async( @@ -77,6 +80,7 @@ async def render_messages_async( self.model_config, content_format="string", media_io_kwargs=params.media_io_kwargs, + mm_processor_kwargs=mm_processor_kwargs, ) prompt_raw = tokenizer.apply_chat_template( diff --git a/vllm/renderers/grok2.py b/vllm/renderers/grok2.py index 1662079f9a5b..50e5d9b4e31b 100644 --- a/vllm/renderers/grok2.py +++ b/vllm/renderers/grok2.py @@ -43,6 +43,7 @@ def render_messages( self, messages: list[ChatCompletionMessageParam], params: ChatParams, + mm_processor_kwargs: dict[str, Any] | None = None, ) -> tuple[list[ConversationMessage], DictPrompt]: tokenizer = self.get_tokenizer() conversation, mm_data, mm_uuids = parse_chat_messages( @@ -50,6 +51,7 @@ def render_messages( self.model_config, content_format="string", media_io_kwargs=params.media_io_kwargs, + 
mm_processor_kwargs=mm_processor_kwargs, ) prompt_raw = tokenizer.apply_chat_template( @@ -70,6 +72,7 @@ async def render_messages_async( self, messages: list[ChatCompletionMessageParam], params: ChatParams, + mm_processor_kwargs: dict[str, Any] | None = None, ) -> tuple[list[ConversationMessage], DictPrompt]: tokenizer = self.get_tokenizer() conversation, mm_data, mm_uuids = await parse_chat_messages_async( @@ -77,6 +80,7 @@ async def render_messages_async( self.model_config, content_format="string", media_io_kwargs=params.media_io_kwargs, + mm_processor_kwargs=mm_processor_kwargs, ) prompt_raw = tokenizer.apply_chat_template( diff --git a/vllm/renderers/mistral.py b/vllm/renderers/mistral.py index 5191e324fe36..d56dc21fa288 100644 --- a/vllm/renderers/mistral.py +++ b/vllm/renderers/mistral.py @@ -84,6 +84,7 @@ def render_messages( self, messages: list[ChatCompletionMessageParam], params: ChatParams, + mm_processor_kwargs: dict[str, Any] | None = None, ) -> tuple[list[ConversationMessage], DictPrompt]: tokenizer = self.get_tokenizer() conversation, mm_data, mm_uuids = parse_chat_messages( @@ -91,6 +92,7 @@ def render_messages( self.model_config, content_format="string", media_io_kwargs=params.media_io_kwargs, + mm_processor_kwargs=mm_processor_kwargs, ) prompt_raw = safe_apply_chat_template( @@ -111,6 +113,7 @@ async def render_messages_async( self, messages: list[ChatCompletionMessageParam], params: ChatParams, + mm_processor_kwargs: dict[str, Any] | None = None, ) -> tuple[list[ConversationMessage], DictPrompt]: tokenizer = self.get_tokenizer() conversation, mm_data, mm_uuids = await parse_chat_messages_async( @@ -118,6 +121,7 @@ async def render_messages_async( self.model_config, content_format="string", media_io_kwargs=params.media_io_kwargs, + mm_processor_kwargs=mm_processor_kwargs, ) prompt_raw = await self._apply_chat_template_async( diff --git a/vllm/renderers/terratorch.py b/vllm/renderers/terratorch.py index 6eaaff825bfe..928b357bc6b5 100644 --- a/vllm/renderers/terratorch.py +++ b/vllm/renderers/terratorch.py @@ -36,6 +36,7 @@ def render_messages( self, messages: list[ChatCompletionMessageParam], params: ChatParams, + mm_processor_kwargs: dict[str, Any] | None = None, ) -> tuple[list[ConversationMessage], DictPrompt]: model_config = self.model_config @@ -44,6 +45,7 @@ def render_messages( model_config, content_format="string", media_io_kwargs=params.media_io_kwargs, + mm_processor_kwargs=mm_processor_kwargs, ) prompt = parse_dec_only_prompt([1]) # Dummy token IDs @@ -58,6 +60,7 @@ async def render_messages_async( self, messages: list[ChatCompletionMessageParam], params: ChatParams, + mm_processor_kwargs: dict[str, Any] | None = None, ) -> tuple[list[ConversationMessage], DictPrompt]: model_config = self.model_config @@ -66,6 +69,7 @@ async def render_messages_async( model_config, content_format="string", media_io_kwargs=params.media_io_kwargs, + mm_processor_kwargs=mm_processor_kwargs, ) prompt = parse_dec_only_prompt([1]) # Dummy token IDs From cdbd64230bdeb3635bc707b01d6e7317ebdc1576 Mon Sep 17 00:00:00 2001 From: Tianyu Guo Date: Mon, 9 Mar 2026 12:13:44 +0800 Subject: [PATCH 06/14] Move mm_processor_kwargs to ChatParams Signed-off-by: Tianyu Guo --- vllm/entrypoints/openai/engine/serving.py | 1 + vllm/renderers/base.py | 12 +++--------- vllm/renderers/deepseek_v32.py | 6 ++---- vllm/renderers/grok2.py | 6 ++---- vllm/renderers/hf.py | 6 ++---- vllm/renderers/mistral.py | 6 ++---- vllm/renderers/params.py | 10 +++++++++- vllm/renderers/terratorch.py | 6 ++---- 8 files 
changed, 23 insertions(+), 30 deletions(-) diff --git a/vllm/entrypoints/openai/engine/serving.py b/vllm/entrypoints/openai/engine/serving.py index 0c074116d3ce..0bf6868c851d 100644 --- a/vllm/entrypoints/openai/engine/serving.py +++ b/vllm/entrypoints/openai/engine/serving.py @@ -908,6 +908,7 @@ async def _preprocess_chat( ).with_defaults( default_template_kwargs, default_media_io_kwargs=(mm_config.media_io_kwargs if mm_config else None), + default_mm_processor_kwargs=getattr(request, "mm_processor_kwargs", None), ) (conversation,), (engine_prompt,) = await renderer.render_chat_async( diff --git a/vllm/renderers/base.py b/vllm/renderers/base.py index bc205fc7a3bf..b19753e48423 100644 --- a/vllm/renderers/base.py +++ b/vllm/renderers/base.py @@ -263,7 +263,6 @@ def render_messages( self, messages: list["ChatCompletionMessageParam"], params: ChatParams, - mm_processor_kwargs: dict[str, Any] | None = None, ) -> tuple[list["ConversationMessage"], DictPrompt]: raise NotImplementedError @@ -271,9 +270,8 @@ async def render_messages_async( self, messages: list["ChatCompletionMessageParam"], params: ChatParams, - mm_processor_kwargs: dict[str, Any] | None = None, ) -> tuple[list["ConversationMessage"], DictPrompt]: - return self.render_messages(messages, params, mm_processor_kwargs) + return self.render_messages(messages, params) # Step 2: Tokenize prompts if necessary def _tokenize_prompt( @@ -714,10 +712,8 @@ def render_chat( if tok_params is None: tok_params = self.default_chat_tok_params - mm_processor_kwargs = prompt_extras.get("mm_processor_kwargs", None) - rendered = [ - self.render_messages(conversation, chat_params, mm_processor_kwargs) + self.render_messages(conversation, chat_params) for conversation in conversations ] @@ -750,10 +746,8 @@ async def render_chat_async( if tok_params is None: tok_params = self.default_chat_tok_params - mm_processor_kwargs = prompt_extras.get("mm_processor_kwargs", None) - rendered = [ - self.render_messages_async(conversation, chat_params, mm_processor_kwargs) + self.render_messages_async(conversation, chat_params) for conversation in conversations ] diff --git a/vllm/renderers/deepseek_v32.py b/vllm/renderers/deepseek_v32.py index d64b931b8ff7..5146f5a4580b 100644 --- a/vllm/renderers/deepseek_v32.py +++ b/vllm/renderers/deepseek_v32.py @@ -43,7 +43,6 @@ def render_messages( self, messages: list[ChatCompletionMessageParam], params: ChatParams, - mm_processor_kwargs: dict[str, Any] | None = None, ) -> tuple[list[ConversationMessage], DictPrompt]: tokenizer = self.get_tokenizer() conversation, mm_data, mm_uuids = parse_chat_messages( @@ -51,7 +50,7 @@ def render_messages( self.model_config, content_format="string", media_io_kwargs=params.media_io_kwargs, - mm_processor_kwargs=mm_processor_kwargs, + mm_processor_kwargs=params.mm_processor_kwargs, ) prompt_raw = tokenizer.apply_chat_template( @@ -72,7 +71,6 @@ async def render_messages_async( self, messages: list[ChatCompletionMessageParam], params: ChatParams, - mm_processor_kwargs: dict[str, Any] | None = None, ) -> tuple[list[ConversationMessage], DictPrompt]: tokenizer = self.get_tokenizer() conversation, mm_data, mm_uuids = await parse_chat_messages_async( @@ -80,7 +78,7 @@ async def render_messages_async( self.model_config, content_format="string", media_io_kwargs=params.media_io_kwargs, - mm_processor_kwargs=mm_processor_kwargs, + mm_processor_kwargs=params.mm_processor_kwargs, ) prompt_raw = tokenizer.apply_chat_template( diff --git a/vllm/renderers/grok2.py b/vllm/renderers/grok2.py index 
50e5d9b4e31b..cdb500ca1e23 100644 --- a/vllm/renderers/grok2.py +++ b/vllm/renderers/grok2.py @@ -43,7 +43,6 @@ def render_messages( self, messages: list[ChatCompletionMessageParam], params: ChatParams, - mm_processor_kwargs: dict[str, Any] | None = None, ) -> tuple[list[ConversationMessage], DictPrompt]: tokenizer = self.get_tokenizer() conversation, mm_data, mm_uuids = parse_chat_messages( @@ -51,7 +50,7 @@ def render_messages( self.model_config, content_format="string", media_io_kwargs=params.media_io_kwargs, - mm_processor_kwargs=mm_processor_kwargs, + mm_processor_kwargs=params.mm_processor_kwargs, ) prompt_raw = tokenizer.apply_chat_template( @@ -72,7 +71,6 @@ async def render_messages_async( self, messages: list[ChatCompletionMessageParam], params: ChatParams, - mm_processor_kwargs: dict[str, Any] | None = None, ) -> tuple[list[ConversationMessage], DictPrompt]: tokenizer = self.get_tokenizer() conversation, mm_data, mm_uuids = await parse_chat_messages_async( @@ -80,7 +78,7 @@ async def render_messages_async( self.model_config, content_format="string", media_io_kwargs=params.media_io_kwargs, - mm_processor_kwargs=mm_processor_kwargs, + mm_processor_kwargs=params.mm_processor_kwargs, ) prompt_raw = tokenizer.apply_chat_template( diff --git a/vllm/renderers/hf.py b/vllm/renderers/hf.py index ab5d075cb4de..c862f70aa0e4 100644 --- a/vllm/renderers/hf.py +++ b/vllm/renderers/hf.py @@ -621,7 +621,6 @@ def render_messages( self, messages: list[ChatCompletionMessageParam], params: ChatParams, - mm_processor_kwargs: dict[str, Any] | None = None, ) -> tuple[list[ConversationMessage], DictPrompt]: model_config = self.model_config tokenizer = self.get_tokenizer() @@ -637,7 +636,7 @@ def render_messages( model_config=model_config, ), media_io_kwargs=params.media_io_kwargs, - mm_processor_kwargs=mm_processor_kwargs, + mm_processor_kwargs=params.mm_processor_kwargs, ) prompt_raw = safe_apply_chat_template( @@ -678,7 +677,6 @@ async def render_messages_async( self, messages: list[ChatCompletionMessageParam], params: ChatParams, - mm_processor_kwargs: dict[str, Any] | None = None, ) -> tuple[list[ConversationMessage], DictPrompt]: model_config = self.model_config tokenizer = self.get_tokenizer() @@ -694,7 +692,7 @@ async def render_messages_async( model_config=model_config, ), media_io_kwargs=params.media_io_kwargs, - mm_processor_kwargs=mm_processor_kwargs, + mm_processor_kwargs=params.mm_processor_kwargs, ) prompt_raw = safe_apply_chat_template( diff --git a/vllm/renderers/mistral.py b/vllm/renderers/mistral.py index d56dc21fa288..8f08a1b04133 100644 --- a/vllm/renderers/mistral.py +++ b/vllm/renderers/mistral.py @@ -84,7 +84,6 @@ def render_messages( self, messages: list[ChatCompletionMessageParam], params: ChatParams, - mm_processor_kwargs: dict[str, Any] | None = None, ) -> tuple[list[ConversationMessage], DictPrompt]: tokenizer = self.get_tokenizer() conversation, mm_data, mm_uuids = parse_chat_messages( @@ -92,7 +91,7 @@ def render_messages( self.model_config, content_format="string", media_io_kwargs=params.media_io_kwargs, - mm_processor_kwargs=mm_processor_kwargs, + mm_processor_kwargs=params.mm_processor_kwargs, ) prompt_raw = safe_apply_chat_template( @@ -113,7 +112,6 @@ async def render_messages_async( self, messages: list[ChatCompletionMessageParam], params: ChatParams, - mm_processor_kwargs: dict[str, Any] | None = None, ) -> tuple[list[ConversationMessage], DictPrompt]: tokenizer = self.get_tokenizer() conversation, mm_data, mm_uuids = await parse_chat_messages_async( @@ -121,7 
+119,7 @@ async def render_messages_async( self.model_config, content_format="string", media_io_kwargs=params.media_io_kwargs, - mm_processor_kwargs=mm_processor_kwargs, + mm_processor_kwargs=params.mm_processor_kwargs, ) prompt_raw = await self._apply_chat_template_async( diff --git a/vllm/renderers/params.py b/vllm/renderers/params.py index e5a04301402e..46a2d21994ae 100644 --- a/vllm/renderers/params.py +++ b/vllm/renderers/params.py @@ -55,13 +55,17 @@ class ChatParams: media_io_kwargs: dict[str, dict[str, Any]] | None = None """Per-modality kwargs for media I/O (loading/decoding images, videos, etc.).""" + + mm_processor_kwargs: dict[str, Any] | None = None + """The kwargs to pass to the multi-modal processor.""" def with_defaults( self, default_chat_template_kwargs: dict[str, Any] | None = None, default_media_io_kwargs: dict[str, dict[str, Any]] | None = None, + default_mm_processor_kwargs: dict[str, Any] | None = None, ): - if not default_chat_template_kwargs and not default_media_io_kwargs: + if not default_chat_template_kwargs and not default_media_io_kwargs and not default_mm_processor_kwargs: return self return ChatParams( @@ -75,6 +79,10 @@ def with_defaults( default_media_io_kwargs, self.media_io_kwargs, ), + mm_processor_kwargs=merge_kwargs( + default_mm_processor_kwargs, + self.mm_processor_kwargs, + ), ) def get_apply_chat_template_kwargs(self) -> dict[str, Any]: diff --git a/vllm/renderers/terratorch.py b/vllm/renderers/terratorch.py index 928b357bc6b5..ff10c5423973 100644 --- a/vllm/renderers/terratorch.py +++ b/vllm/renderers/terratorch.py @@ -36,7 +36,6 @@ def render_messages( self, messages: list[ChatCompletionMessageParam], params: ChatParams, - mm_processor_kwargs: dict[str, Any] | None = None, ) -> tuple[list[ConversationMessage], DictPrompt]: model_config = self.model_config @@ -45,7 +44,7 @@ def render_messages( model_config, content_format="string", media_io_kwargs=params.media_io_kwargs, - mm_processor_kwargs=mm_processor_kwargs, + mm_processor_kwargs=params.mm_processor_kwargs, ) prompt = parse_dec_only_prompt([1]) # Dummy token IDs @@ -60,7 +59,6 @@ async def render_messages_async( self, messages: list[ChatCompletionMessageParam], params: ChatParams, - mm_processor_kwargs: dict[str, Any] | None = None, ) -> tuple[list[ConversationMessage], DictPrompt]: model_config = self.model_config @@ -69,7 +67,7 @@ async def render_messages_async( model_config, content_format="string", media_io_kwargs=params.media_io_kwargs, - mm_processor_kwargs=mm_processor_kwargs, + mm_processor_kwargs=params.mm_processor_kwargs, ) prompt = parse_dec_only_prompt([1]) # Dummy token IDs From ab0eb4a534a8ee47f5ce38b54f62a22383a63bda Mon Sep 17 00:00:00 2001 From: Tianyu Guo Date: Mon, 9 Mar 2026 04:17:13 +0000 Subject: [PATCH 07/14] Lint Signed-off-by: Tianyu Guo --- vllm/renderers/params.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/vllm/renderers/params.py b/vllm/renderers/params.py index 46a2d21994ae..3ff92fd831a8 100644 --- a/vllm/renderers/params.py +++ b/vllm/renderers/params.py @@ -55,7 +55,7 @@ class ChatParams: media_io_kwargs: dict[str, dict[str, Any]] | None = None """Per-modality kwargs for media I/O (loading/decoding images, videos, etc.).""" - + mm_processor_kwargs: dict[str, Any] | None = None """The kwargs to pass to the multi-modal processor.""" @@ -65,7 +65,11 @@ def with_defaults( default_media_io_kwargs: dict[str, dict[str, Any]] | None = None, default_mm_processor_kwargs: dict[str, Any] | None = None, ): - if not 
default_chat_template_kwargs and not default_media_io_kwargs and not default_mm_processor_kwargs: + if ( + not default_chat_template_kwargs + and not default_media_io_kwargs + and not default_mm_processor_kwargs + ): return self return ChatParams( From 1f6f7000da04f81bebd8f7379682030b1345d7e1 Mon Sep 17 00:00:00 2001 From: Tianyu Guo Date: Mon, 9 Mar 2026 16:11:53 +0800 Subject: [PATCH 08/14] Pass mm_processor_kwargs at method create_parser Signed-off-by: Tianyu Guo --- vllm/entrypoints/chat_utils.py | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index a479c76ca2cb..8810e1741eeb 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -572,7 +572,7 @@ def add(self, modality: ModalityStr, item: _T) -> str | None: return self.model_cls.get_placeholder_str(modality, num_items) @abstractmethod - def create_parser(self) -> "BaseMultiModalContentParser": + def create_parser(self, mm_processor_kwargs: dict[str, Any] | None = None) -> "BaseMultiModalContentParser": raise NotImplementedError @@ -698,8 +698,8 @@ def resolve_items( dict(self._items_by_modality), self.mm_processor, self._modality_order ) - def create_parser(self) -> "BaseMultiModalContentParser": - return MultiModalContentParser(self) + def create_parser(self, mm_processor_kwargs: dict[str, Any] | None = None) -> "BaseMultiModalContentParser": + return MultiModalContentParser(self, mm_processor_kwargs=mm_processor_kwargs) class AsyncMultiModalItemTracker( @@ -720,8 +720,8 @@ async def resolve_items( resolved_items_by_modality, self.mm_processor, self._modality_order ) - def create_parser(self) -> "BaseMultiModalContentParser": - return AsyncMultiModalContentParser(self) + def create_parser(self, mm_processor_kwargs: dict[str, Any] | None = None) -> "BaseMultiModalContentParser": + return AsyncMultiModalContentParser(self, mm_processor_kwargs=mm_processor_kwargs) class BaseMultiModalContentParser(ABC): @@ -786,7 +786,7 @@ def parse_video(self, video_url: str | None, uuid: str | None = None) -> None: class MultiModalContentParser(BaseMultiModalContentParser): - def __init__(self, tracker: MultiModalItemTracker) -> None: + def __init__(self, tracker: MultiModalItemTracker, mm_processor_kwargs: dict[str, Any] | None = None) -> None: super().__init__() self._tracker = tracker @@ -797,6 +797,8 @@ def __init__(self, tracker: MultiModalItemTracker) -> None: allowed_local_media_path=tracker.allowed_local_media_path, allowed_media_domains=tracker.allowed_media_domains, ) + + self._mm_processor_kwargs = mm_processor_kwargs @property def model_config(self) -> ModelConfig: @@ -896,7 +898,7 @@ def parse_video(self, video_url: str | None, uuid: str | None = None) -> None: class AsyncMultiModalContentParser(BaseMultiModalContentParser): - def __init__(self, tracker: AsyncMultiModalItemTracker) -> None: + def __init__(self, tracker: AsyncMultiModalItemTracker, mm_processor_kwargs: dict[str, Any] | None = None,) -> None: super().__init__() self._tracker = tracker @@ -906,18 +908,12 @@ def __init__(self, tracker: AsyncMultiModalItemTracker) -> None: allowed_local_media_path=tracker.allowed_local_media_path, allowed_media_domains=tracker.allowed_media_domains, ) - self._mm_processor_kwargs: dict[str, Any] | None = None + self._mm_processor_kwargs: dict[str, Any] | None = mm_processor_kwargs @property def model_config(self) -> ModelConfig: return self._tracker.model_config - def set_mm_processor_kwargs( - self, 
mm_processor_kwargs: dict[str, Any] | None - ) -> None: - """Set mm_processor_kwargs for use in parsing.""" - self._mm_processor_kwargs = mm_processor_kwargs - async def _image_with_uuid_async(self, image_url: str | None, uuid: str | None): image = ( await self._connector.fetch_image_async(image_url) if image_url else None @@ -1372,11 +1368,7 @@ def _parse_chat_message_content_parts( ) -> list[ConversationMessage]: content = list[_ContentPart]() - mm_parser = mm_tracker.create_parser() - - # Set mm_processor_kwargs if parser supports it - if hasattr(mm_parser, "set_mm_processor_kwargs"): - mm_parser.set_mm_processor_kwargs(mm_processor_kwargs) + mm_parser = mm_tracker.create_parser(mm_processor_kwargs=mm_processor_kwargs) for part in parts: parse_res = _parse_chat_message_content_part( From 33f0041bf259e3958f4575c58154ca99e45f037d Mon Sep 17 00:00:00 2001 From: Tianyu Guo Date: Mon, 9 Mar 2026 16:16:44 +0800 Subject: [PATCH 09/14] Cancel format Signed-off-by: Tianyu Guo --- vllm/entrypoints/chat_utils.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 8810e1741eeb..ee71abc52ab0 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -11,15 +11,7 @@ from functools import cached_property, lru_cache, partial from itertools import accumulate from pathlib import Path -from typing import ( - TYPE_CHECKING, - Any, - Generic, - Literal, - TypeAlias, - TypeVar, - cast, -) +from typing import TYPE_CHECKING, Any, Generic, Literal, TypeAlias, TypeVar, cast from openai.types.chat import ( ChatCompletionAssistantMessageParam, From 104095259a9c8c0b851bf79b37db6b219658a066 Mon Sep 17 00:00:00 2001 From: Tianyu Guo Date: Mon, 9 Mar 2026 16:30:45 +0800 Subject: [PATCH 10/14] Recursively merge kwargs Signed-off-by: Tianyu Guo --- vllm/renderers/params.py | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/vllm/renderers/params.py b/vllm/renderers/params.py index 3ff92fd831a8..023ac2729c5a 100644 --- a/vllm/renderers/params.py +++ b/vllm/renderers/params.py @@ -39,6 +39,37 @@ def merge_kwargs( return defaults | {k: v for k, v in overrides.items() if v not in unset_values} +def recursively_merge_kwargs( + defaults: dict[str, Any] | None, + overrides: dict[str, Any] | None, + /, + *, + unset_values: tuple[object, ...] 
= (None, "auto"), +) -> dict[str, Any]: + if defaults is None: + defaults = {} + if overrides is None: + overrides = {} + + merged = dict(defaults) + + for k, v in overrides.items(): + if v in unset_values: + continue + + if ( + k in merged + and isinstance(merged[k], dict) + and isinstance(v, dict) + ): + merged[k] = recursively_merge_kwargs( + merged[k], v, unset_values=unset_values + ) + else: + merged[k] = v + + return merged + @dataclass(frozen=True) class ChatParams: @@ -83,7 +114,7 @@ def with_defaults( default_media_io_kwargs, self.media_io_kwargs, ), - mm_processor_kwargs=merge_kwargs( + mm_processor_kwargs=recursively_merge_kwargs( default_mm_processor_kwargs, self.mm_processor_kwargs, ), From 2629325b44c549a8788db8971e132d6eefa40084 Mon Sep 17 00:00:00 2001 From: Tianyu Guo Date: Mon, 9 Mar 2026 08:38:01 +0000 Subject: [PATCH 11/14] Lint Signed-off-by: Tianyu Guo --- vllm/entrypoints/chat_utils.py | 30 +++++++++++++++++++++++------- vllm/renderers/params.py | 7 ++----- 2 files changed, 25 insertions(+), 12 deletions(-) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index ee71abc52ab0..81d72bc530df 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -564,7 +564,9 @@ def add(self, modality: ModalityStr, item: _T) -> str | None: return self.model_cls.get_placeholder_str(modality, num_items) @abstractmethod - def create_parser(self, mm_processor_kwargs: dict[str, Any] | None = None) -> "BaseMultiModalContentParser": + def create_parser( + self, mm_processor_kwargs: dict[str, Any] | None = None + ) -> "BaseMultiModalContentParser": raise NotImplementedError @@ -690,7 +692,9 @@ def resolve_items( dict(self._items_by_modality), self.mm_processor, self._modality_order ) - def create_parser(self, mm_processor_kwargs: dict[str, Any] | None = None) -> "BaseMultiModalContentParser": + def create_parser( + self, mm_processor_kwargs: dict[str, Any] | None = None + ) -> "BaseMultiModalContentParser": return MultiModalContentParser(self, mm_processor_kwargs=mm_processor_kwargs) @@ -712,8 +716,12 @@ async def resolve_items( resolved_items_by_modality, self.mm_processor, self._modality_order ) - def create_parser(self, mm_processor_kwargs: dict[str, Any] | None = None) -> "BaseMultiModalContentParser": - return AsyncMultiModalContentParser(self, mm_processor_kwargs=mm_processor_kwargs) + def create_parser( + self, mm_processor_kwargs: dict[str, Any] | None = None + ) -> "BaseMultiModalContentParser": + return AsyncMultiModalContentParser( + self, mm_processor_kwargs=mm_processor_kwargs + ) class BaseMultiModalContentParser(ABC): @@ -778,7 +786,11 @@ def parse_video(self, video_url: str | None, uuid: str | None = None) -> None: class MultiModalContentParser(BaseMultiModalContentParser): - def __init__(self, tracker: MultiModalItemTracker, mm_processor_kwargs: dict[str, Any] | None = None) -> None: + def __init__( + self, + tracker: MultiModalItemTracker, + mm_processor_kwargs: dict[str, Any] | None = None, + ) -> None: super().__init__() self._tracker = tracker @@ -789,7 +801,7 @@ def __init__(self, tracker: MultiModalItemTracker, mm_processor_kwargs: dict[str allowed_local_media_path=tracker.allowed_local_media_path, allowed_media_domains=tracker.allowed_media_domains, ) - + self._mm_processor_kwargs = mm_processor_kwargs @property @@ -890,7 +902,11 @@ def parse_video(self, video_url: str | None, uuid: str | None = None) -> None: class AsyncMultiModalContentParser(BaseMultiModalContentParser): - def __init__(self, tracker: 
AsyncMultiModalItemTracker, mm_processor_kwargs: dict[str, Any] | None = None,) -> None: + def __init__( + self, + tracker: AsyncMultiModalItemTracker, + mm_processor_kwargs: dict[str, Any] | None = None, + ) -> None: super().__init__() self._tracker = tracker diff --git a/vllm/renderers/params.py b/vllm/renderers/params.py index 023ac2729c5a..54da0f3b519d 100644 --- a/vllm/renderers/params.py +++ b/vllm/renderers/params.py @@ -39,6 +39,7 @@ def merge_kwargs( return defaults | {k: v for k, v in overrides.items() if v not in unset_values} + def recursively_merge_kwargs( defaults: dict[str, Any] | None, overrides: dict[str, Any] | None, @@ -57,11 +58,7 @@ def recursively_merge_kwargs( if v in unset_values: continue - if ( - k in merged - and isinstance(merged[k], dict) - and isinstance(v, dict) - ): + if k in merged and isinstance(merged[k], dict) and isinstance(v, dict): merged[k] = recursively_merge_kwargs( merged[k], v, unset_values=unset_values ) From 2ea6a02796acdd8509d08c356f09547569c9a1c8 Mon Sep 17 00:00:00 2001 From: Tianyu Guo Date: Mon, 9 Mar 2026 17:36:16 +0800 Subject: [PATCH 12/14] Extract audio in MMcontentParser Signed-off-by: Tianyu Guo --- vllm/entrypoints/chat_utils.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 81d72bc530df..77d3804c6a41 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -899,6 +899,16 @@ def parse_video(self, video_url: str | None, uuid: str | None = None) -> None: placeholder = self._tracker.add("video", (video, uuid)) self._add_placeholder("video", placeholder) + + # Extract audio from video if use_audio_in_video is True + if ( + video_url + and self._mm_processor_kwargs + and self._mm_processor_kwargs.get("use_audio_in_video", False) + ): + audio_coro = self._audio_with_uuid_async(video_url, uuid) + audio_placeholder = self._tracker.add("audio", audio_coro) + self._add_placeholder("audio", audio_placeholder) class AsyncMultiModalContentParser(BaseMultiModalContentParser): From 175bd776d1970ea58569444c82c0a104f35bd43c Mon Sep 17 00:00:00 2001 From: Tianyu Guo Date: Mon, 9 Mar 2026 09:58:06 +0000 Subject: [PATCH 13/14] Lint Signed-off-by: Tianyu Guo --- vllm/entrypoints/chat_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 77d3804c6a41..9994339e00e8 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -899,7 +899,7 @@ def parse_video(self, video_url: str | None, uuid: str | None = None) -> None: placeholder = self._tracker.add("video", (video, uuid)) self._add_placeholder("video", placeholder) - + # Extract audio from video if use_audio_in_video is True if ( video_url From 83c15425b1a40eafb7e0a4b3bed198d01442aef5 Mon Sep 17 00:00:00 2001 From: Tianyu Guo Date: Mon, 9 Mar 2026 18:29:33 +0800 Subject: [PATCH 14/14] Fix Signed-off-by: Tianyu Guo --- vllm/entrypoints/chat_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 9994339e00e8..5ffb60719901 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -906,8 +906,8 @@ def parse_video(self, video_url: str | None, uuid: str | None = None) -> None: and self._mm_processor_kwargs and self._mm_processor_kwargs.get("use_audio_in_video", False) ): - audio_coro = self._audio_with_uuid_async(video_url, uuid) - audio_placeholder = 
-            audio_placeholder = self._tracker.add("audio", audio_coro)
+            audio = self._connector.fetch_audio(video_url) if video_url else None
+            audio_placeholder = self._tracker.add("audio", (audio, uuid))
             self._add_placeholder("audio", audio_placeholder)
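
---

Usage note (illustrative sketch, not part of the patches above): with this series applied, a client can opt into audio-in-video handling per request by sending `mm_processor_kwargs` through the OpenAI-compatible API; the serving layer forwards it via `ChatParams.mm_processor_kwargs` into `parse_chat_messages_async`, and the content parser then registers an extra audio item decoded from the same video bytes. Everything below is assumed for illustration: the server address, the `Qwen/Qwen2.5-Omni-7B` model ID, and the video URL are placeholders, not values taken from the patches.

    # Minimal sketch, assuming a vLLM OpenAI-compatible server is already
    # running a Qwen2.5-Omni checkpoint at localhost:8000 (all names below
    # are illustrative placeholders).
    from openai import OpenAI

    client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

    response = client.chat.completions.create(
        model="Qwen/Qwen2.5-Omni-7B",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "video_url",
                        "video_url": {"url": "https://example.com/clip.mp4"},
                    },
                    {
                        "type": "text",
                        "text": "Summarize what is shown and said in the video.",
                    },
                ],
            }
        ],
        # Forwarded to the multi-modal processor; with use_audio_in_video=True
        # the parser also adds an "audio" item extracted from the video input.
        extra_body={"mm_processor_kwargs": {"use_audio_in_video": True}},
    )
    print(response.choices[0].message.content)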