Skip to content
62 changes: 54 additions & 8 deletions vllm/entrypoints/chat_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -564,7 +564,9 @@ def add(self, modality: ModalityStr, item: _T) -> str | None:
return self.model_cls.get_placeholder_str(modality, num_items)

@abstractmethod
def create_parser(self) -> "BaseMultiModalContentParser":
def create_parser(
    self, mm_processor_kwargs: dict[str, Any] | None = None
) -> "BaseMultiModalContentParser":
    """Build the content parser bound to this tracker.

    Concrete trackers return the sync or async parser flavor.
    ``mm_processor_kwargs`` are per-request multi-modal processor kwargs
    (e.g. ``use_audio_in_video``) forwarded to the parser.
    """
    raise NotImplementedError


Expand Down Expand Up @@ -690,8 +692,10 @@ def resolve_items(
dict(self._items_by_modality), self.mm_processor, self._modality_order
)

def create_parser(self) -> "BaseMultiModalContentParser":
return MultiModalContentParser(self)
def create_parser(
    self, mm_processor_kwargs: dict[str, Any] | None = None
) -> "BaseMultiModalContentParser":
    """Create a synchronous content parser bound to this tracker.

    ``mm_processor_kwargs`` are per-request multi-modal processor kwargs
    (e.g. ``use_audio_in_video``) forwarded to the parser.
    """
    return MultiModalContentParser(self, mm_processor_kwargs=mm_processor_kwargs)


class AsyncMultiModalItemTracker(
Expand All @@ -712,8 +716,12 @@ async def resolve_items(
resolved_items_by_modality, self.mm_processor, self._modality_order
)

def create_parser(self) -> "BaseMultiModalContentParser":
return AsyncMultiModalContentParser(self)
def create_parser(
    self, mm_processor_kwargs: dict[str, Any] | None = None
) -> "BaseMultiModalContentParser":
    """Create an asynchronous content parser bound to this tracker.

    ``mm_processor_kwargs`` are per-request multi-modal processor kwargs
    (e.g. ``use_audio_in_video``) forwarded to the parser.
    """
    return AsyncMultiModalContentParser(
        self, mm_processor_kwargs=mm_processor_kwargs
    )


class BaseMultiModalContentParser(ABC):
Expand Down Expand Up @@ -778,7 +786,11 @@ def parse_video(self, video_url: str | None, uuid: str | None = None) -> None:


class MultiModalContentParser(BaseMultiModalContentParser):
def __init__(self, tracker: MultiModalItemTracker) -> None:
def __init__(
self,
tracker: MultiModalItemTracker,
mm_processor_kwargs: dict[str, Any] | None = None,
) -> None:
super().__init__()

self._tracker = tracker
Expand All @@ -790,6 +802,8 @@ def __init__(self, tracker: MultiModalItemTracker) -> None:
allowed_media_domains=tracker.allowed_media_domains,
)

self._mm_processor_kwargs = mm_processor_kwargs

@property
def model_config(self) -> ModelConfig:
    # Convenience pass-through to the owning tracker's model config.
    return self._tracker.model_config
Expand Down Expand Up @@ -886,9 +900,23 @@ def parse_video(self, video_url: str | None, uuid: str | None = None) -> None:
placeholder = self._tracker.add("video", (video, uuid))
self._add_placeholder("video", placeholder)

# Extract audio from video if use_audio_in_video is True
if (
video_url
and self._mm_processor_kwargs
and self._mm_processor_kwargs.get("use_audio_in_video", False)
):
audio = self._connector.fetch_audio(video_url) if video_url else None
audio_placeholder = self._tracker.add("audio", (audio, uuid))
self._add_placeholder("audio", audio_placeholder)


class AsyncMultiModalContentParser(BaseMultiModalContentParser):
def __init__(self, tracker: AsyncMultiModalItemTracker) -> None:
def __init__(
self,
tracker: AsyncMultiModalItemTracker,
mm_processor_kwargs: dict[str, Any] | None = None,
) -> None:
super().__init__()

self._tracker = tracker
Expand All @@ -898,6 +926,7 @@ def __init__(self, tracker: AsyncMultiModalItemTracker) -> None:
allowed_local_media_path=tracker.allowed_local_media_path,
allowed_media_domains=tracker.allowed_media_domains,
)
self._mm_processor_kwargs: dict[str, Any] | None = mm_processor_kwargs

@property
def model_config(self) -> ModelConfig:
Expand Down Expand Up @@ -1033,6 +1062,16 @@ def parse_video(self, video_url: str | None, uuid: str | None = None) -> None:
placeholder = self._tracker.add("video", coro)
self._add_placeholder("video", placeholder)

# Extract audio from video if use_audio_in_video is True
if (
video_url
and self._mm_processor_kwargs
and self._mm_processor_kwargs.get("use_audio_in_video", False)
):
audio_coro = self._audio_with_uuid_async(video_url, uuid)
audio_placeholder = self._tracker.add("audio", audio_coro)
self._add_placeholder("audio", audio_placeholder)


@dataclass
class ChatTemplateConfig:
Expand Down Expand Up @@ -1343,10 +1382,11 @@ def _parse_chat_message_content_parts(
*,
wrap_dicts: bool,
interleave_strings: bool,
mm_processor_kwargs: dict[str, Any] | None = None,
) -> list[ConversationMessage]:
content = list[_ContentPart]()

mm_parser = mm_tracker.create_parser()
mm_parser = mm_tracker.create_parser(mm_processor_kwargs=mm_processor_kwargs)

for part in parts:
parse_res = _parse_chat_message_content_part(
Expand Down Expand Up @@ -1464,6 +1504,7 @@ def _parse_chat_message_content(
mm_tracker: BaseMultiModalItemTracker,
content_format: ChatTemplateContentFormat,
interleave_strings: bool,
mm_processor_kwargs: dict[str, Any] | None = None,
) -> list[ConversationMessage]:
role = message["role"]
content = message.get("content")
Expand All @@ -1479,6 +1520,7 @@ def _parse_chat_message_content(
mm_tracker,
wrap_dicts=(content_format == "openai"),
interleave_strings=interleave_strings,
mm_processor_kwargs=mm_processor_kwargs,
)

for result_msg in result:
Expand Down Expand Up @@ -1540,6 +1582,7 @@ def parse_chat_messages(
model_config: ModelConfig,
content_format: ChatTemplateContentFormat,
media_io_kwargs: dict[str, dict[str, Any]] | None = None,
mm_processor_kwargs: dict[str, Any] | None = None,
) -> tuple[
list[ConversationMessage],
MultiModalDataDict | None,
Expand All @@ -1558,6 +1601,7 @@ def parse_chat_messages(
and model_config.multimodal_config is not None
and model_config.multimodal_config.interleave_mm_strings
),
mm_processor_kwargs=mm_processor_kwargs,
)

conversation.extend(sub_messages)
Expand All @@ -1574,6 +1618,7 @@ async def parse_chat_messages_async(
model_config: ModelConfig,
content_format: ChatTemplateContentFormat,
media_io_kwargs: dict[str, dict[str, Any]] | None = None,
mm_processor_kwargs: dict[str, Any] | None = None,
) -> tuple[
list[ConversationMessage],
MultiModalDataDict | None,
Expand All @@ -1594,6 +1639,7 @@ async def parse_chat_messages_async(
and model_config.multimodal_config is not None
and model_config.multimodal_config.interleave_mm_strings
),
mm_processor_kwargs=mm_processor_kwargs,
)

conversation.extend(sub_messages)
Expand Down
1 change: 1 addition & 0 deletions vllm/entrypoints/openai/engine/serving.py
Original file line number Diff line number Diff line change
Expand Up @@ -908,6 +908,7 @@ async def _preprocess_chat(
).with_defaults(
default_template_kwargs,
default_media_io_kwargs=(mm_config.media_io_kwargs if mm_config else None),
default_mm_processor_kwargs=getattr(request, "mm_processor_kwargs", None),
)

(conversation,), (engine_prompt,) = await renderer.render_chat_async(
Expand Down
16 changes: 15 additions & 1 deletion vllm/model_executor/models/qwen2_5_omni_thinker.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,11 @@
ModalityDataItems,
MultiModalDataItems,
)
from vllm.multimodal.processing import BaseDummyInputsBuilder
from vllm.multimodal.processing import (
BaseDummyInputsBuilder,
ProcessorInputs,
TimingContext,
)
from vllm.multimodal.processing.processor import (
BaseMultiModalProcessor,
MultiModalPromptUpdates,
Expand Down Expand Up @@ -811,6 +815,16 @@ def get_replacement_qwen2_use_audio_in_video(item_idx: int):
),
]

def _cached_apply_hf_processor(
    self,
    inputs: ProcessorInputs,
    timing_ctx: TimingContext,
):
    """Apply the HF processor, bypassing the cache when audio is to be
    extracted from video (``use_audio_in_video`` in the processor kwargs);
    otherwise defer to the standard cached path.
    """
    use_audio_in_video = inputs.hf_processor_mm_kwargs.get(
        "use_audio_in_video", False
    )
    if not use_audio_in_video:
        return super()._cached_apply_hf_processor(inputs, timing_ctx)
    # Uncached path for the audio-in-video case.
    return self._apply_hf_processor(inputs, timing_ctx)

def _apply_hf_processor_main(
self,
prompt: str | list[int],
Expand Down
31 changes: 31 additions & 0 deletions vllm/multimodal/media/audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,35 @@ def extract_audio_from_video_bytes(
return audio, float(native_sr)


def is_video(data: bytes) -> bool:
    """Heuristically check whether ``data`` looks like a video container.

    Recognizes ISO-BMFF/MP4 files (an ``ftyp`` box whose major brand is in
    a known-brand list) and RIFF/AVI files. Anything shorter than the
    12 header bytes needed for inspection is reported as not-video.
    """
    if len(data) < 12:
        return False

    known_brands = (
        b"mp41",
        b"mp42",  # MP4
        b"isom",  # ISO Base Media
        b"iso2",
        b"iso4",
        b"iso5",
        b"iso6",
        b"M4V ",
        b"M4A ",  # Apple
        b"avc1",  # H.264
        b"dash",  # DASH
        b"mmp4",
        b"MSNV",
    )

    brand = data[8:12]
    if data[4:8] == b"ftyp" and brand in known_brands:
        return True
    # RIFF container carrying an AVI stream.
    return data[:4] == b"RIFF" and brand == b"AVI "


class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]):
"""Configuration values can be user-provided either by --media-io-kwargs or
by the runtime API field "media_io_kwargs". Ensure proper validation and
Expand All @@ -100,6 +129,8 @@ def __init__(self, **kwargs) -> None:
self.kwargs = kwargs

def load_bytes(self, data: bytes) -> tuple[npt.NDArray, float]:
if is_video(data):
return extract_audio_from_video_bytes(data)
return librosa.load(BytesIO(data), sr=None)

def load_base64(
Expand Down
2 changes: 2 additions & 0 deletions vllm/renderers/deepseek_v32.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ def render_messages(
self.model_config,
content_format="string",
media_io_kwargs=params.media_io_kwargs,
mm_processor_kwargs=params.mm_processor_kwargs,
)

prompt_raw = tokenizer.apply_chat_template(
Expand Down Expand Up @@ -77,6 +78,7 @@ async def render_messages_async(
self.model_config,
content_format="string",
media_io_kwargs=params.media_io_kwargs,
mm_processor_kwargs=params.mm_processor_kwargs,
)

prompt_raw = tokenizer.apply_chat_template(
Expand Down
2 changes: 2 additions & 0 deletions vllm/renderers/grok2.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ def render_messages(
self.model_config,
content_format="string",
media_io_kwargs=params.media_io_kwargs,
mm_processor_kwargs=params.mm_processor_kwargs,
)

prompt_raw = tokenizer.apply_chat_template(
Expand Down Expand Up @@ -77,6 +78,7 @@ async def render_messages_async(
self.model_config,
content_format="string",
media_io_kwargs=params.media_io_kwargs,
mm_processor_kwargs=params.mm_processor_kwargs,
)

prompt_raw = tokenizer.apply_chat_template(
Expand Down
2 changes: 2 additions & 0 deletions vllm/renderers/hf.py
Original file line number Diff line number Diff line change
Expand Up @@ -636,6 +636,7 @@ def render_messages(
model_config=model_config,
),
media_io_kwargs=params.media_io_kwargs,
mm_processor_kwargs=params.mm_processor_kwargs,
)

prompt_raw = safe_apply_chat_template(
Expand Down Expand Up @@ -691,6 +692,7 @@ async def render_messages_async(
model_config=model_config,
),
media_io_kwargs=params.media_io_kwargs,
mm_processor_kwargs=params.mm_processor_kwargs,
)

prompt_raw = safe_apply_chat_template(
Expand Down
2 changes: 2 additions & 0 deletions vllm/renderers/mistral.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ def render_messages(
self.model_config,
content_format="string",
media_io_kwargs=params.media_io_kwargs,
mm_processor_kwargs=params.mm_processor_kwargs,
)

prompt_raw = safe_apply_chat_template(
Expand Down Expand Up @@ -118,6 +119,7 @@ async def render_messages_async(
self.model_config,
content_format="string",
media_io_kwargs=params.media_io_kwargs,
mm_processor_kwargs=params.mm_processor_kwargs,
)

prompt_raw = await self._apply_chat_template_async(
Expand Down
42 changes: 41 additions & 1 deletion vllm/renderers/params.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,34 @@ def merge_kwargs(
return defaults | {k: v for k, v in overrides.items() if v not in unset_values}


def recursively_merge_kwargs(
defaults: dict[str, Any] | None,
overrides: dict[str, Any] | None,
/,
*,
unset_values: tuple[object, ...] = (None, "auto"),
) -> dict[str, Any]:
if defaults is None:
defaults = {}
if overrides is None:
overrides = {}

merged = dict(defaults)

for k, v in overrides.items():
if v in unset_values:
continue

if k in merged and isinstance(merged[k], dict) and isinstance(v, dict):
merged[k] = recursively_merge_kwargs(
merged[k], v, unset_values=unset_values
)
else:
merged[k] = v

return merged


@dataclass(frozen=True)
class ChatParams:
"""Configuration to control how to parse chat messages."""
Expand All @@ -56,12 +84,20 @@ class ChatParams:
media_io_kwargs: dict[str, dict[str, Any]] | None = None
"""Per-modality kwargs for media I/O (loading/decoding images, videos, etc.)."""

mm_processor_kwargs: dict[str, Any] | None = None
"""The kwargs to pass to the multi-modal processor."""

def with_defaults(
self,
default_chat_template_kwargs: dict[str, Any] | None = None,
default_media_io_kwargs: dict[str, dict[str, Any]] | None = None,
default_mm_processor_kwargs: dict[str, Any] | None = None,
):
if not default_chat_template_kwargs and not default_media_io_kwargs:
if (
not default_chat_template_kwargs
and not default_media_io_kwargs
and not default_mm_processor_kwargs
):
return self

return ChatParams(
Expand All @@ -75,6 +111,10 @@ def with_defaults(
default_media_io_kwargs,
self.media_io_kwargs,
),
mm_processor_kwargs=recursively_merge_kwargs(
default_mm_processor_kwargs,
self.mm_processor_kwargs,
),
)

def get_apply_chat_template_kwargs(self) -> dict[str, Any]:
Expand Down
Loading