From 118aae6c3d9d634d93a31b2f078743ef2ed4a15a Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Fri, 22 Aug 2025 02:17:58 +0000 Subject: [PATCH 01/35] add Signed-off-by: Roger Wang --- vllm/entrypoints/chat_utils.py | 256 +++++++++++++++++++++++++-------- vllm/entrypoints/llm.py | 5 +- vllm/inputs/data.py | 17 ++- vllm/inputs/preprocess.py | 60 ++++++-- vllm/multimodal/processing.py | 57 ++++++-- 5 files changed, 315 insertions(+), 80 deletions(-) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 87772a499f42..4ca3e2cebe09 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -410,7 +410,11 @@ def resolve_hf_chat_template( processor.chat_template is not None: return processor.chat_template except Exception: - logger.debug("Failed to load AutoProcessor chat template for %s", tokenizer.name_or_path, exc_info=True) # noqa: E501 + logger.debug( + "Failed to load AutoProcessor chat template for %s", + tokenizer.name_or_path, + exc_info=True, + ) # 3rd priority: AutoTokenizer chat template try: @@ -531,6 +535,7 @@ def __init__(self, model_config: ModelConfig, tokenizer: AnyTokenizer): self._tokenizer = tokenizer self._items_by_modality = defaultdict[str, list[_T]](list) + self._ids_by_modality = defaultdict[str, list[str]](list) @property def model_config(self) -> ModelConfig: @@ -554,7 +559,12 @@ def mm_registry(self): def mm_processor(self): return self.mm_registry.create_processor(self.model_config) - def add(self, modality: ModalityStr, item: _T) -> Optional[str]: + def add( + self, + modality: ModalityStr, + item: _T, + item_id: Optional[str] = None, + ) -> Optional[str]: """ Add a multi-modal item to the current prompt and returns the placeholder string to use, if any. @@ -565,6 +575,8 @@ def add(self, modality: ModalityStr, item: _T) -> Optional[str]: self.mm_processor.validate_num_items(input_modality, num_items) self._items_by_modality[modality].append(item) + if item_id is not None: + self._ids_by_modality[modality].append(item_id) return self.model_cls.get_placeholder_str(modality, num_items) @@ -572,6 +584,38 @@ def add(self, modality: ModalityStr, item: _T) -> Optional[str]: def create_parser(self) -> "BaseMultiModalContentParser": raise NotImplementedError + def all_mm_ids(self) -> Optional[dict[str, list[str]]]: + if not self._ids_by_modality: + return None + result: dict[str, list[str]] = {} + for modality, ids in self._ids_by_modality.items(): + if len(ids) == 0: + continue + items_len = len(self._items_by_modality.get(modality, [])) + if len(ids) == items_len: + result[modality] = ids + return result or None + + def validate_all_ids_complete(self) -> None: + """ + If any UUIDs were provided by the user for any multimodal item, + enforce that UUIDs are provided for all items of all modalities. 
+ """ + any_ids = any(len(ids) > 0 for ids in self._ids_by_modality.values()) + if not any_ids: + return + + # For each modality with items, ensure IDs exist and match count + for modality, items in self._items_by_modality.items(): + if len(items) == 0: + continue + ids = self._ids_by_modality.get(modality, []) + if len(ids) != len(items): + raise ValueError( + "When specifying 'uuid' on any multimodal content, you " + "must specify 'uuid' for every multimodal item in all " + "modalities present in the request.") + class MultiModalItemTracker(BaseMultiModalItemTracker[object]): @@ -658,28 +702,55 @@ def mm_placeholder_storage(self) -> dict[str, list]: return dict(self._placeholder_storage) @abstractmethod - def parse_image(self, image_url: str) -> None: + def parse_image( + self, + image_url: str, + *, + item_id: Optional[str] = None, + ) -> None: raise NotImplementedError @abstractmethod def parse_image_embeds(self, - image_embeds: Union[str, dict[str, str]]) -> None: + image_embeds: Union[str, dict[str, str]], + *, + item_id: Optional[str] = None) -> None: raise NotImplementedError @abstractmethod - def parse_image_pil(self, image_pil: Image.Image) -> None: + def parse_image_pil( + self, + image_pil: Image.Image, + *, + item_id: Optional[str] = None, + ) -> None: raise NotImplementedError @abstractmethod - def parse_audio(self, audio_url: str) -> None: + def parse_audio( + self, + audio_url: str, + *, + item_id: Optional[str] = None, + ) -> None: raise NotImplementedError @abstractmethod - def parse_input_audio(self, input_audio: InputAudio) -> None: + def parse_input_audio( + self, + input_audio: InputAudio, + *, + item_id: Optional[str] = None, + ) -> None: raise NotImplementedError @abstractmethod - def parse_video(self, video_url: str) -> None: + def parse_video( + self, + video_url: str, + *, + item_id: Optional[str] = None, + ) -> None: raise NotImplementedError @@ -695,48 +766,71 @@ def __init__(self, tracker: MultiModalItemTracker) -> None: allowed_local_media_path=tracker.allowed_local_media_path, ) - def parse_image(self, image_url: str) -> None: + def parse_image( + self, + image_url: str, + *, + item_id: Optional[str] = None, + ) -> None: image = self._connector.fetch_image(image_url) - - placeholder = self._tracker.add("image", image) + placeholder = self._tracker.add("image", image, item_id) self._add_placeholder("image", placeholder) def parse_image_embeds(self, - image_embeds: Union[str, dict[str, str]]) -> None: + image_embeds: Union[str, dict[str, str]], + *, + item_id: Optional[str] = None) -> None: if isinstance(image_embeds, dict): embeds = { k: self._connector.fetch_image_embedding(v) for k, v in image_embeds.items() } - placeholder = self._tracker.add("image_embeds", embeds) + placeholder = self._tracker.add("image_embeds", embeds, item_id) if isinstance(image_embeds, str): embedding = self._connector.fetch_image_embedding(image_embeds) - placeholder = self._tracker.add("image_embeds", embedding) + placeholder = self._tracker.add("image_embeds", embedding, item_id) self._add_placeholder("image", placeholder) - def parse_image_pil(self, image_pil: Image.Image) -> None: - placeholder = self._tracker.add("image", image_pil) + def parse_image_pil( + self, + image_pil: Image.Image, + *, + item_id: Optional[str] = None, + ) -> None: + placeholder = self._tracker.add("image", image_pil, item_id) self._add_placeholder("image", placeholder) - def parse_audio(self, audio_url: str) -> None: + def parse_audio( + self, + audio_url: str, + *, + item_id: Optional[str] = None, + ) 
-> None: audio = self._connector.fetch_audio(audio_url) - - placeholder = self._tracker.add("audio", audio) + placeholder = self._tracker.add("audio", audio, item_id) self._add_placeholder("audio", placeholder) - def parse_input_audio(self, input_audio: InputAudio) -> None: + def parse_input_audio( + self, + input_audio: InputAudio, + *, + item_id: Optional[str] = None, + ) -> None: audio_data = input_audio.get("data", "") audio_format = input_audio.get("format", "") audio_url = f"data:audio/{audio_format};base64,{audio_data}" - - return self.parse_audio(audio_url) - - def parse_video(self, video_url: str) -> None: + return self.parse_audio(audio_url, item_id=item_id) + + def parse_video( + self, + video_url: str, + *, + item_id: Optional[str] = None, + ) -> None: video = self._connector.fetch_video(video_url=video_url) - - placeholder = self._tracker.add("video", video) + placeholder = self._tracker.add("video", video, item_id) self._add_placeholder("video", placeholder) @@ -751,14 +845,20 @@ def __init__(self, tracker: AsyncMultiModalItemTracker) -> None: allowed_local_media_path=tracker.allowed_local_media_path ) - def parse_image(self, image_url: str) -> None: + def parse_image( + self, + image_url: str, + *, + item_id: Optional[str] = None, + ) -> None: image_coro = self._connector.fetch_image_async(image_url) - - placeholder = self._tracker.add("image", image_coro) + placeholder = self._tracker.add("image", image_coro, item_id) self._add_placeholder("image", placeholder) def parse_image_embeds(self, - image_embeds: Union[str, dict[str, str]]) -> None: + image_embeds: Union[str, dict[str, str]], + *, + item_id: Optional[str] = None) -> None: future: asyncio.Future[Union[str, dict[str, str]]] = asyncio.Future() if isinstance(image_embeds, dict): @@ -773,33 +873,49 @@ def parse_image_embeds(self, fetch_image_embedding(image_embeds) future.set_result(embedding) - placeholder = self._tracker.add("image_embeds", future) + placeholder = self._tracker.add("image_embeds", future, item_id) self._add_placeholder("image", placeholder) - def parse_image_pil(self, image_pil: Image.Image) -> None: + def parse_image_pil( + self, + image_pil: Image.Image, + *, + item_id: Optional[str] = None, + ) -> None: future: asyncio.Future[Image.Image] = asyncio.Future() future.set_result(image_pil) - - placeholder = self._tracker.add("image", future) + placeholder = self._tracker.add("image", future, item_id) self._add_placeholder("image", placeholder) - def parse_audio(self, audio_url: str) -> None: + def parse_audio( + self, + audio_url: str, + *, + item_id: Optional[str] = None, + ) -> None: audio_coro = self._connector.fetch_audio_async(audio_url) - - placeholder = self._tracker.add("audio", audio_coro) + placeholder = self._tracker.add("audio", audio_coro, item_id) self._add_placeholder("audio", placeholder) - def parse_input_audio(self, input_audio: InputAudio) -> None: + def parse_input_audio( + self, + input_audio: InputAudio, + *, + item_id: Optional[str] = None, + ) -> None: audio_data = input_audio.get("data", "") audio_format = input_audio.get("format", "") audio_url = f"data:audio/{audio_format};base64,{audio_data}" - - return self.parse_audio(audio_url) - - def parse_video(self, video_url: str) -> None: + return self.parse_audio(audio_url, item_id=item_id) + + def parse_video( + self, + video_url: str, + *, + item_id: Optional[str] = None, + ) -> None: video = self._connector.fetch_video_async(video_url=video_url) - - placeholder = self._tracker.add("video", video) + placeholder = 
self._tracker.add("video", video, item_id) self._add_placeholder("video", placeholder) @@ -961,18 +1077,34 @@ def _get_full_multimodal_text_prompt(placeholder_storage: dict[str, list], "input_image": lambda part: _ResponsesInputImageParser(part).get("image_url", None), "image_url": - lambda part: _ImageParser(part).get("image_url", {}).get("url", None), + lambda part: _ImageParser({ + k: v + for k, v in cast(dict, part).items() + if k != "uuid" + }).get("image_url", {}).get("url", None), "image_embeds": - lambda part: _ImageEmbedsParser(part).get("image_embeds", None), + lambda part: _ImageEmbedsParser({ + k: v + for k, v in cast(dict, part).items() + if k != "uuid" + }).get("image_embeds", None), "image_pil": lambda part: _PILImageParser(part).get("image_pil", None), "audio_url": - lambda part: _AudioParser(part).get("audio_url", {}).get("url", None), + lambda part: _AudioParser({ + k: v + for k, v in cast(dict, part).items() + if k != "uuid" + }).get("audio_url", {}).get("url", None), "input_audio": lambda part: _InputAudioParser(part).get("input_audio", None), "refusal": lambda part: _RefusalParser(part).get("refusal", None), "video_url": - lambda part: _VideoParser(part).get("video_url", {}).get("url", None), + lambda part: _VideoParser({ + k: v + for k, v in cast(dict, part).items() + if k != "uuid" + }).get("video_url", {}).get("url", None), } @@ -1110,29 +1242,35 @@ def _parse_chat_message_content_part( return str_content modality = None + # Best-effort extraction of a user-provided UUID from the part dict + part_uuid: Optional[str] = None + if isinstance(part, dict): + uuid_val = part.get("uuid") + if isinstance(uuid_val, str) and uuid_val: + part_uuid = uuid_val if part_type == "image_pil": image_content = cast(Image.Image, content) - mm_parser.parse_image_pil(image_content) + mm_parser.parse_image_pil(image_content, item_id=part_uuid) modality = "image" elif part_type in ("image_url", "input_image"): str_content = cast(str, content) - mm_parser.parse_image(str_content) + mm_parser.parse_image(str_content, item_id=part_uuid) modality = "image" elif part_type == "image_embeds": content = cast(Union[str, dict[str, str]], content) - mm_parser.parse_image_embeds(content) + mm_parser.parse_image_embeds(content, item_id=part_uuid) modality = "image" elif part_type == "audio_url": str_content = cast(str, content) - mm_parser.parse_audio(str_content) + mm_parser.parse_audio(str_content, item_id=part_uuid) modality = "audio" elif part_type == "input_audio": dict_content = cast(InputAudio, content) - mm_parser.parse_input_audio(dict_content) + mm_parser.parse_input_audio(dict_content, item_id=part_uuid) modality = "audio" elif part_type == "video_url": str_content = cast(str, content) - mm_parser.parse_video(str_content) + mm_parser.parse_video(str_content, item_id=part_uuid) modality = "video" else: raise NotImplementedError(f"Unknown part type: {part_type}") @@ -1211,7 +1349,11 @@ def parse_chat_messages( model_config: ModelConfig, tokenizer: AnyTokenizer, content_format: _ChatTemplateContentFormat, -) -> tuple[list[ConversationMessage], Optional[MultiModalDataDict]]: +) -> tuple[ + list[ConversationMessage], + Optional[MultiModalDataDict], + Optional[dict[str, list[str]]], +]: conversation: list[ConversationMessage] = [] mm_tracker = MultiModalItemTracker(model_config, tokenizer) @@ -1230,8 +1372,10 @@ def parse_chat_messages( conversation.extend(sub_messages) _postprocess_messages(conversation) + # Validate completeness: if any UUIDs were supplied, require all + 
mm_tracker.validate_all_ids_complete() - return conversation, mm_tracker.all_mm_data() + return conversation, mm_tracker.all_mm_data(), mm_tracker.all_mm_ids() def parse_chat_messages_futures( diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index b002f234c043..a053e339b8d4 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -891,7 +891,7 @@ def chat( # NOTE: _parse_chat_message_content_parts() currently doesn't # handle mm_processor_kwargs, since there is no implementation in # the chat message parsing for it. - conversation, mm_data = parse_chat_messages( + conversation, mm_data, mm_ids = parse_chat_messages( msgs, model_config, tokenizer, @@ -920,6 +920,9 @@ def chat( if mm_data is not None: prompt["multi_modal_data"] = mm_data + if mm_ids is not None: + prompt[ + "multi_modal_ids"] = mm_ids # type: ignore[typeddict-item] if mm_processor_kwargs is not None: prompt["mm_processor_kwargs"] = mm_processor_kwargs diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py index 23cb5e5022f1..7185d3255808 100644 --- a/vllm/inputs/data.py +++ b/vllm/inputs/data.py @@ -7,7 +7,8 @@ from typing_extensions import NotRequired, TypedDict, TypeIs, TypeVar if TYPE_CHECKING: - from vllm.multimodal.inputs import MultiModalDataDict, MultiModalInputs + from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalHashDict, + MultiModalInputs) class TextPrompt(TypedDict): @@ -30,6 +31,13 @@ class TextPrompt(TypedDict): to pass the mm_processor_kwargs to each of them. """ + multi_modal_ids: NotRequired["MultiModalHashDict"] + """ + Optional user-specified UUIDs for multimodal items, mapped by modality. + When provided, these IDs override the default hashes generated by the + multimodal processor for caching and deduplication. + """ + cache_salt: NotRequired[str] """ Optional cache salt to be used for prefix caching. @@ -59,6 +67,13 @@ class TokensPrompt(TypedDict): to pass the mm_processor_kwargs to each of them. """ + multi_modal_ids: NotRequired["MultiModalHashDict"] + """ + Optional user-specified UUIDs for multimodal items, mapped by modality. + When provided, these IDs override the default hashes generated by the + multimodal processor for caching and deduplication. + """ + cache_salt: NotRequired[str] """ Optional cache salt to be used for prefix caching. diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 3f521012e82a..e24ff4a951d9 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -252,6 +252,8 @@ def _process_multimodal( prompt: Union[str, list[int]], mm_data: MultiModalDataDict, mm_processor_kwargs: Optional[Mapping[str, object]], + *, + mm_ids_override: Optional[dict[str, list[str]]] = None, tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, ) -> MultiModalInputs: @@ -267,16 +269,34 @@ def _process_multimodal( if mm_processor_kwargs is None: mm_processor_kwargs = {} - return mm_processor.apply(prompt, - mm_data, - hf_processor_mm_kwargs=mm_processor_kwargs, - tokenization_kwargs=tokenization_kwargs) + # If any override IDs are provided, enforce that all modalities/items + # present in mm_data are covered. 
+ if mm_ids_override and any( + len(v) > 0 for v in mm_ids_override.values()): + for modality, items in mm_data.items(): + expected = len(items) if isinstance(items, list) else 1 + ids = mm_ids_override.get(modality) + if ids is None or len(ids) != expected: + raise ValueError( + "When providing 'multi_modal_ids', you must provide " + "IDs for every item of all modalities present in the " + "request.") + + return mm_processor.apply( + prompt, + mm_data, + hf_processor_mm_kwargs=mm_processor_kwargs, + tokenization_kwargs=tokenization_kwargs, + mm_ids_override=mm_ids_override, + ) async def _process_multimodal_async( self, prompt: Union[str, list[int]], mm_data: MultiModalDataDict, mm_processor_kwargs: Optional[Mapping[str, object]], + *, + mm_ids_override: Optional[dict[str, list[str]]] = None, tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, ) -> MultiModalInputs: @@ -291,10 +311,26 @@ async def _process_multimodal_async( if mm_processor_kwargs is None: mm_processor_kwargs = {} - return mm_processor.apply(prompt, - mm_data, - hf_processor_mm_kwargs=mm_processor_kwargs, - tokenization_kwargs=tokenization_kwargs) + # If any override IDs are provided, enforce that all modalities/items + # present in mm_data are covered. + if mm_ids_override and any( + len(v) > 0 for v in mm_ids_override.values()): + for modality, items in mm_data.items(): + expected = len(items) if isinstance(items, list) else 1 + ids = mm_ids_override.get(modality) + if ids is None or len(ids) != expected: + raise ValueError( + "When providing 'multi_modal_ids', you must provide " + "IDs for every item of all modalities present in the " + "request.") + + return mm_processor.apply( + prompt, + mm_data, + hf_processor_mm_kwargs=mm_processor_kwargs, + tokenization_kwargs=tokenization_kwargs, + mm_ids_override=mm_ids_override, + ) def _process_embeds( self, @@ -341,6 +377,8 @@ def _process_tokens( prompt_token_ids, multi_modal_data, parsed_content.get("mm_processor_kwargs"), + mm_ids_override=cast(Optional[dict[str, list[str]]], + parsed_content.get("multi_modal_id")), tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, ) @@ -370,6 +408,8 @@ async def _process_tokens_async( prompt_token_ids, multi_modal_data, parsed_content.get("mm_processor_kwargs"), + mm_ids_override=cast(Optional[dict[str, list[str]]], + parsed_content.get("multi_modal_ids")), tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, ) @@ -398,6 +438,8 @@ def _process_text( prompt_text, multi_modal_data, parsed_content.get("mm_processor_kwargs"), + mm_ids_override=cast(Optional[dict[str, list[str]]], + parsed_content.get("multi_modal_ids")), tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, ) @@ -431,6 +473,8 @@ async def _process_text_async( prompt_text, multi_modal_data, parsed_content.get("mm_processor_kwargs"), + mm_ids_override=cast(Optional[dict[str, list[str]]], + parsed_content.get("multi_modal_ids")), tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, ) diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 55fd1479d2de..1e48f0f46890 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -1355,20 +1355,34 @@ def _hash_mm_items( mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Mapping[str, object], + *, + override_ids: Optional[dict[str, list[str]]] = None, ) -> MultiModalHashes: """Create MM hashes to be returned (only used in V1).""" 
model_id = self.info.model_id - return { - modality: [ - MultiModalHasher.hash_kwargs(model_id=model_id, - **{modality: item}, - **hf_processor_mm_kwargs, - **tokenization_kwargs) - for item in items - ] - for modality, items in mm_items.items() - } + hashes: MultiModalHashes = {} + override_ids = override_ids or {} + + for modality, items in mm_items.items(): + if modality in override_ids and override_ids[modality]: + ids = override_ids[modality] + if len(ids) != len(items): + raise ValueError( + f"multi_modal_ids for modality '{modality}' must " + f"have same length as data: got {len(ids)} ids vs " + f"{len(items)} items.") + hashes[modality] = list(ids) + else: + hashes[modality] = [ + MultiModalHasher.hash_kwargs(model_id=model_id, + **{modality: item}, + **hf_processor_mm_kwargs, + **tokenization_kwargs) + for item in items + ] + + return hashes def _merge_mm_kwargs( self, @@ -1399,6 +1413,8 @@ def _apply_hf_processor( mm_data_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Mapping[str, object], + *, + override_ids: Optional[dict[str, list[str]]] = None, ) -> tuple[list[int], MultiModalProcessingInfo, bool]: ( prompt_ids, @@ -1418,8 +1434,10 @@ def _apply_hf_processor( hf_processor_mm_kwargs), ) - mm_hashes = self._hash_mm_items(mm_data_items, hf_processor_mm_kwargs, - tokenization_kwargs) + mm_hashes = self._hash_mm_items(mm_data_items, + hf_processor_mm_kwargs, + tokenization_kwargs, + override_ids=override_ids) unbound_prompt_updates = self._get_prompt_updates( mm_data_items, @@ -1443,6 +1461,8 @@ def _cached_apply_hf_processor( mm_data_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Mapping[str, object], + *, + override_ids: Optional[dict[str, list[str]]] = None, ) -> tuple[list[int], MultiModalProcessingInfo, bool]: """ Apply the HF processor on the full prompt text, @@ -1457,10 +1477,13 @@ def _cached_apply_hf_processor( mm_data_items=mm_data_items, hf_processor_mm_kwargs=hf_processor_mm_kwargs, tokenization_kwargs=tokenization_kwargs, + override_ids=override_ids, ) - mm_hashes = self._hash_mm_items(mm_data_items, hf_processor_mm_kwargs, - tokenization_kwargs) + mm_hashes = self._hash_mm_items(mm_data_items, + hf_processor_mm_kwargs, + tokenization_kwargs, + override_ids=override_ids) ( mm_cache_items_or_hashes, mm_missing_data_items, @@ -1689,6 +1712,8 @@ def apply( mm_data: MultiModalDataDict, hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Optional[Mapping[str, object]] = None, + *, + mm_ids_override: Optional[dict[str, list[str]]] = None, ) -> MultiModalInputs: """ Process multi-modal inputs to be used in vLLM. @@ -1717,6 +1742,7 @@ def apply( mm_items, hf_processor_mm_kwargs, tokenization_kwargs=tokenization_kwargs, + override_ids=mm_ids_override, ) # NOTE: tokenization_kwargs are not required to init processor @@ -1801,6 +1827,8 @@ def apply( mm_data: MultiModalDataDict, hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Optional[Mapping[str, object]] = None, + *, + mm_ids_override: Optional[dict[str, list[str]]] = None, ) -> MultiModalEncDecInputs: """ Process multi-modal inputs to be used in vLLM. 
@@ -1815,6 +1843,7 @@ def apply( mm_data, hf_processor_mm_kwargs, tokenization_kwargs, + mm_ids_override=mm_ids_override, ) return self._get_enc_dec_inputs( From 3b92bcf79f8bf79ac78a17e521d70a86734fbc4d Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Fri, 22 Aug 2025 02:29:20 +0000 Subject: [PATCH 02/35] typo Signed-off-by: Roger Wang --- vllm/inputs/preprocess.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index e24ff4a951d9..488bb660444e 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -378,7 +378,7 @@ def _process_tokens( multi_modal_data, parsed_content.get("mm_processor_kwargs"), mm_ids_override=cast(Optional[dict[str, list[str]]], - parsed_content.get("multi_modal_id")), + parsed_content.get("multi_modal_ids")), tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, ) From 53dd7c7d9d79ef49a9b517675c525b3a140351ec Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Fri, 22 Aug 2025 02:53:38 +0000 Subject: [PATCH 03/35] fix import Signed-off-by: Roger Wang --- vllm/inputs/data.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py index 7185d3255808..de8b03f2e7da 100644 --- a/vllm/inputs/data.py +++ b/vllm/inputs/data.py @@ -7,8 +7,8 @@ from typing_extensions import NotRequired, TypedDict, TypeIs, TypeVar if TYPE_CHECKING: - from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalHashDict, - MultiModalInputs) + from vllm.multimodal.hasher import MultiModalHashDict + from vllm.multimodal.inputs import MultiModalDataDict, MultiModalInputs class TextPrompt(TypedDict): From cea5c09146e5408fb7f1f441eb77bc995f753449 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Sun, 24 Aug 2025 12:01:58 +0000 Subject: [PATCH 04/35] revert Signed-off-by: Roger Wang --- vllm/entrypoints/chat_utils.py | 152 ++++++++------------------------- vllm/entrypoints/llm.py | 6 +- 2 files changed, 37 insertions(+), 121 deletions(-) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 097fd1f90ca4..8cf43d9ca38b 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -535,7 +535,6 @@ def __init__(self, model_config: ModelConfig, tokenizer: AnyTokenizer): self._tokenizer = tokenizer self._items_by_modality = defaultdict[str, list[_T]](list) - self._ids_by_modality = defaultdict[str, list[str]](list) @property def model_config(self) -> ModelConfig: @@ -563,7 +562,6 @@ def add( self, modality: ModalityStr, item: _T, - item_id: Optional[str] = None, ) -> Optional[str]: """ Add a multi-modal item to the current prompt and returns the @@ -575,8 +573,6 @@ def add( self.mm_processor.validate_num_items(input_modality, num_items) self._items_by_modality[modality].append(item) - if item_id is not None: - self._ids_by_modality[modality].append(item_id) return self.model_cls.get_placeholder_str(modality, num_items) @@ -585,36 +581,12 @@ def create_parser(self) -> "BaseMultiModalContentParser": raise NotImplementedError def all_mm_ids(self) -> Optional[dict[str, list[str]]]: - if not self._ids_by_modality: - return None - result: dict[str, list[str]] = {} - for modality, ids in self._ids_by_modality.items(): - if len(ids) == 0: - continue - items_len = len(self._items_by_modality.get(modality, [])) - if len(ids) == items_len: - result[modality] = ids - return result or None + # UUID tracking removed; keep API for compatibility but return None + return None def validate_all_ids_complete(self) -> None: 
- """ - If any UUIDs were provided by the user for any multimodal item, - enforce that UUIDs are provided for all items of all modalities. - """ - any_ids = any(len(ids) > 0 for ids in self._ids_by_modality.values()) - if not any_ids: - return - - # For each modality with items, ensure IDs exist and match count - for modality, items in self._items_by_modality.items(): - if len(items) == 0: - continue - ids = self._ids_by_modality.get(modality, []) - if len(ids) != len(items): - raise ValueError( - "When specifying 'uuid' on any multimodal content, you " - "must specify 'uuid' for every multimodal item in all " - "modalities present in the request.") + # UUID validation removed; no-op + return None class MultiModalItemTracker(BaseMultiModalItemTracker[object]): @@ -705,24 +677,19 @@ def mm_placeholder_storage(self) -> dict[str, list]: def parse_image( self, image_url: str, - *, - item_id: Optional[str] = None, ) -> None: raise NotImplementedError @abstractmethod def parse_image_embeds(self, image_embeds: Union[str, dict[str, str]], - *, - item_id: Optional[str] = None) -> None: + ) -> None: raise NotImplementedError @abstractmethod def parse_image_pil( self, image_pil: Image.Image, - *, - item_id: Optional[str] = None, ) -> None: raise NotImplementedError @@ -730,8 +697,6 @@ def parse_image_pil( def parse_audio( self, audio_url: str, - *, - item_id: Optional[str] = None, ) -> None: raise NotImplementedError @@ -739,8 +704,6 @@ def parse_audio( def parse_input_audio( self, input_audio: InputAudio, - *, - item_id: Optional[str] = None, ) -> None: raise NotImplementedError @@ -748,8 +711,6 @@ def parse_input_audio( def parse_video( self, video_url: str, - *, - item_id: Optional[str] = None, ) -> None: raise NotImplementedError @@ -769,68 +730,57 @@ def __init__(self, tracker: MultiModalItemTracker) -> None: def parse_image( self, image_url: str, - *, - item_id: Optional[str] = None, ) -> None: image = self._connector.fetch_image(image_url) - placeholder = self._tracker.add("image", image, item_id) + placeholder = self._tracker.add("image", image) self._add_placeholder("image", placeholder) def parse_image_embeds(self, image_embeds: Union[str, dict[str, str]], - *, - item_id: Optional[str] = None) -> None: + ) -> None: if isinstance(image_embeds, dict): embeds = { k: self._connector.fetch_image_embedding(v) for k, v in image_embeds.items() } - placeholder = self._tracker.add("image_embeds", embeds, item_id) + placeholder = self._tracker.add("image_embeds", embeds) if isinstance(image_embeds, str): embedding = self._connector.fetch_image_embedding(image_embeds) - placeholder = self._tracker.add("image_embeds", embedding, item_id) + placeholder = self._tracker.add("image_embeds", embedding) self._add_placeholder("image", placeholder) def parse_image_pil( self, image_pil: Image.Image, - *, - item_id: Optional[str] = None, ) -> None: - placeholder = self._tracker.add("image", image_pil, item_id) + placeholder = self._tracker.add("image", image_pil) self._add_placeholder("image", placeholder) def parse_audio( self, audio_url: str, - *, - item_id: Optional[str] = None, ) -> None: audio = self._connector.fetch_audio(audio_url) - placeholder = self._tracker.add("audio", audio, item_id) + placeholder = self._tracker.add("audio", audio) self._add_placeholder("audio", placeholder) def parse_input_audio( self, input_audio: InputAudio, - *, - item_id: Optional[str] = None, ) -> None: audio_data = input_audio.get("data", "") audio_format = input_audio.get("format", "") audio_url = 
f"data:audio/{audio_format};base64,{audio_data}" - return self.parse_audio(audio_url, item_id=item_id) + return self.parse_audio(audio_url) def parse_video( self, video_url: str, - *, - item_id: Optional[str] = None, ) -> None: video = self._connector.fetch_video(video_url=video_url) - placeholder = self._tracker.add("video", video, item_id) + placeholder = self._tracker.add("video", video) self._add_placeholder("video", placeholder) @@ -848,17 +798,14 @@ def __init__(self, tracker: AsyncMultiModalItemTracker) -> None: def parse_image( self, image_url: str, - *, - item_id: Optional[str] = None, ) -> None: image_coro = self._connector.fetch_image_async(image_url) - placeholder = self._tracker.add("image", image_coro, item_id) + placeholder = self._tracker.add("image", image_coro) self._add_placeholder("image", placeholder) def parse_image_embeds(self, image_embeds: Union[str, dict[str, str]], - *, - item_id: Optional[str] = None) -> None: + ) -> None: future: asyncio.Future[Union[str, dict[str, str]]] = asyncio.Future() if isinstance(image_embeds, dict): @@ -873,49 +820,41 @@ def parse_image_embeds(self, fetch_image_embedding(image_embeds) future.set_result(embedding) - placeholder = self._tracker.add("image_embeds", future, item_id) + placeholder = self._tracker.add("image_embeds", future) self._add_placeholder("image", placeholder) def parse_image_pil( self, image_pil: Image.Image, - *, - item_id: Optional[str] = None, ) -> None: future: asyncio.Future[Image.Image] = asyncio.Future() future.set_result(image_pil) - placeholder = self._tracker.add("image", future, item_id) + placeholder = self._tracker.add("image", future) self._add_placeholder("image", placeholder) def parse_audio( self, audio_url: str, - *, - item_id: Optional[str] = None, ) -> None: audio_coro = self._connector.fetch_audio_async(audio_url) - placeholder = self._tracker.add("audio", audio_coro, item_id) + placeholder = self._tracker.add("audio", audio_coro) self._add_placeholder("audio", placeholder) def parse_input_audio( self, input_audio: InputAudio, - *, - item_id: Optional[str] = None, ) -> None: audio_data = input_audio.get("data", "") audio_format = input_audio.get("format", "") audio_url = f"data:audio/{audio_format};base64,{audio_data}" - return self.parse_audio(audio_url, item_id=item_id) + return self.parse_audio(audio_url) def parse_video( self, video_url: str, - *, - item_id: Optional[str] = None, ) -> None: video = self._connector.fetch_video_async(video_url=video_url) - placeholder = self._tracker.add("video", video, item_id) + placeholder = self._tracker.add("video", video) self._add_placeholder("video", placeholder) @@ -1077,34 +1016,22 @@ def _get_full_multimodal_text_prompt(placeholder_storage: dict[str, list], "input_image": lambda part: _ResponsesInputImageParser(part).get("image_url", None), "image_url": - lambda part: _ImageParser({ - k: v - for k, v in cast(dict, part).items() - if k != "uuid" - }).get("image_url", {}).get("url", None), + lambda part: _ImageParser(cast(dict, part)).get("image_url", {}).get( + "url", None), "image_embeds": - lambda part: _ImageEmbedsParser({ - k: v - for k, v in cast(dict, part).items() - if k != "uuid" - }).get("image_embeds", None), + lambda part: _ImageEmbedsParser(cast(dict, part)).get("image_embeds", + None), "image_pil": lambda part: _PILImageParser(part).get("image_pil", None), "audio_url": - lambda part: _AudioParser({ - k: v - for k, v in cast(dict, part).items() - if k != "uuid" - }).get("audio_url", {}).get("url", None), + lambda part: 
_AudioParser(cast(dict, part)).get("audio_url", {}).get( + "url", None), "input_audio": lambda part: _InputAudioParser(part).get("input_audio", None), "refusal": lambda part: _RefusalParser(part).get("refusal", None), "video_url": - lambda part: _VideoParser({ - k: v - for k, v in cast(dict, part).items() - if k != "uuid" - }).get("video_url", {}).get("url", None), + lambda part: _VideoParser(cast(dict, part)).get("video_url", {}).get( + "url", None), } @@ -1242,35 +1169,29 @@ def _parse_chat_message_content_part( return str_content modality = None - # Best-effort extraction of a user-provided UUID from the part dict - part_uuid: Optional[str] = None - if isinstance(part, dict): - uuid_val = part.get("uuid") - if isinstance(uuid_val, str) and uuid_val: - part_uuid = uuid_val if part_type == "image_pil": image_content = cast(Image.Image, content) - mm_parser.parse_image_pil(image_content, item_id=part_uuid) + mm_parser.parse_image_pil(image_content) modality = "image" elif part_type in ("image_url", "input_image"): str_content = cast(str, content) - mm_parser.parse_image(str_content, item_id=part_uuid) + mm_parser.parse_image(str_content) modality = "image" elif part_type == "image_embeds": content = cast(Union[str, dict[str, str]], content) - mm_parser.parse_image_embeds(content, item_id=part_uuid) + mm_parser.parse_image_embeds(content) modality = "image" elif part_type == "audio_url": str_content = cast(str, content) - mm_parser.parse_audio(str_content, item_id=part_uuid) + mm_parser.parse_audio(str_content) modality = "audio" elif part_type == "input_audio": dict_content = cast(InputAudio, content) - mm_parser.parse_input_audio(dict_content, item_id=part_uuid) + mm_parser.parse_input_audio(dict_content) modality = "audio" elif part_type == "video_url": str_content = cast(str, content) - mm_parser.parse_video(str_content, item_id=part_uuid) + mm_parser.parse_video(str_content) modality = "video" else: raise NotImplementedError(f"Unknown part type: {part_type}") @@ -1352,7 +1273,6 @@ def parse_chat_messages( ) -> tuple[ list[ConversationMessage], Optional[MultiModalDataDict], - Optional[dict[str, list[str]]], ]: conversation: list[ConversationMessage] = [] mm_tracker = MultiModalItemTracker(model_config, tokenizer) @@ -1372,10 +1292,8 @@ def parse_chat_messages( conversation.extend(sub_messages) _postprocess_messages(conversation) - # Validate completeness: if any UUIDs were supplied, require all - mm_tracker.validate_all_ids_complete() - return conversation, mm_tracker.all_mm_data(), mm_tracker.all_mm_ids() + return conversation, mm_tracker.all_mm_data() def parse_chat_messages_futures( diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 0e45ca6afa5a..8b241da7e620 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -780,7 +780,7 @@ def chat( # NOTE: _parse_chat_message_content_parts() currently doesn't # handle mm_processor_kwargs, since there is no implementation in # the chat message parsing for it. 
- conversation, mm_data, mm_ids = parse_chat_messages( + conversation, mm_data = parse_chat_messages( msgs, model_config, tokenizer, @@ -809,9 +809,7 @@ def chat( if mm_data is not None: prompt["multi_modal_data"] = mm_data - if mm_ids is not None: - prompt[ - "multi_modal_ids"] = mm_ids # type: ignore[typeddict-item] + # UUID-based multi_modal_ids support removed if mm_processor_kwargs is not None: prompt["mm_processor_kwargs"] = mm_processor_kwargs From 611827f09dde6f5545e0eafc9f2d8590c6ae3324 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Sun, 24 Aug 2025 12:04:26 +0000 Subject: [PATCH 05/35] revert Signed-off-by: Roger Wang --- vllm/entrypoints/chat_utils.py | 130 +++++++++------------------------ vllm/entrypoints/llm.py | 1 - 2 files changed, 34 insertions(+), 97 deletions(-) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 8cf43d9ca38b..7b11a50642de 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -410,11 +410,7 @@ def resolve_hf_chat_template( processor.chat_template is not None: return processor.chat_template except Exception: - logger.debug( - "Failed to load AutoProcessor chat template for %s", - tokenizer.name_or_path, - exc_info=True, - ) + logger.debug("Failed to load AutoProcessor chat template for %s", tokenizer.name_or_path, exc_info=True) # noqa: E501 # 3rd priority: AutoTokenizer chat template try: @@ -558,11 +554,7 @@ def mm_registry(self): def mm_processor(self): return self.mm_registry.create_processor(self.model_config) - def add( - self, - modality: ModalityStr, - item: _T, - ) -> Optional[str]: + def add(self, modality: ModalityStr, item: _T) -> Optional[str]: """ Add a multi-modal item to the current prompt and returns the placeholder string to use, if any. 
@@ -580,14 +572,6 @@ def add( def create_parser(self) -> "BaseMultiModalContentParser": raise NotImplementedError - def all_mm_ids(self) -> Optional[dict[str, list[str]]]: - # UUID tracking removed; keep API for compatibility but return None - return None - - def validate_all_ids_complete(self) -> None: - # UUID validation removed; no-op - return None - class MultiModalItemTracker(BaseMultiModalItemTracker[object]): @@ -674,44 +658,28 @@ def mm_placeholder_storage(self) -> dict[str, list]: return dict(self._placeholder_storage) @abstractmethod - def parse_image( - self, - image_url: str, - ) -> None: + def parse_image(self, image_url: str) -> None: raise NotImplementedError @abstractmethod def parse_image_embeds(self, - image_embeds: Union[str, dict[str, str]], - ) -> None: + image_embeds: Union[str, dict[str, str]]) -> None: raise NotImplementedError @abstractmethod - def parse_image_pil( - self, - image_pil: Image.Image, - ) -> None: + def parse_image_pil(self, image_pil: Image.Image) -> None: raise NotImplementedError @abstractmethod - def parse_audio( - self, - audio_url: str, - ) -> None: + def parse_audio(self, audio_url: str) -> None: raise NotImplementedError @abstractmethod - def parse_input_audio( - self, - input_audio: InputAudio, - ) -> None: + def parse_input_audio(self, input_audio: InputAudio) -> None: raise NotImplementedError @abstractmethod - def parse_video( - self, - video_url: str, - ) -> None: + def parse_video(self, video_url: str) -> None: raise NotImplementedError @@ -727,17 +695,14 @@ def __init__(self, tracker: MultiModalItemTracker) -> None: allowed_local_media_path=tracker.allowed_local_media_path, ) - def parse_image( - self, - image_url: str, - ) -> None: + def parse_image(self, image_url: str) -> None: image = self._connector.fetch_image(image_url) + placeholder = self._tracker.add("image", image) self._add_placeholder("image", placeholder) def parse_image_embeds(self, - image_embeds: Union[str, dict[str, str]], - ) -> None: + image_embeds: Union[str, dict[str, str]]) -> None: if isinstance(image_embeds, dict): embeds = { k: self._connector.fetch_image_embedding(v) @@ -751,35 +716,26 @@ def parse_image_embeds(self, self._add_placeholder("image", placeholder) - def parse_image_pil( - self, - image_pil: Image.Image, - ) -> None: + def parse_image_pil(self, image_pil: Image.Image) -> None: placeholder = self._tracker.add("image", image_pil) self._add_placeholder("image", placeholder) - def parse_audio( - self, - audio_url: str, - ) -> None: + def parse_audio(self, audio_url: str) -> None: audio = self._connector.fetch_audio(audio_url) + placeholder = self._tracker.add("audio", audio) self._add_placeholder("audio", placeholder) - def parse_input_audio( - self, - input_audio: InputAudio, - ) -> None: + def parse_input_audio(self, input_audio: InputAudio) -> None: audio_data = input_audio.get("data", "") audio_format = input_audio.get("format", "") audio_url = f"data:audio/{audio_format};base64,{audio_data}" + return self.parse_audio(audio_url) - def parse_video( - self, - video_url: str, - ) -> None: + def parse_video(self, video_url: str) -> None: video = self._connector.fetch_video(video_url=video_url) + placeholder = self._tracker.add("video", video) self._add_placeholder("video", placeholder) @@ -795,17 +751,14 @@ def __init__(self, tracker: AsyncMultiModalItemTracker) -> None: allowed_local_media_path=tracker.allowed_local_media_path ) - def parse_image( - self, - image_url: str, - ) -> None: + def parse_image(self, image_url: str) -> None: image_coro = 
self._connector.fetch_image_async(image_url) + placeholder = self._tracker.add("image", image_coro) self._add_placeholder("image", placeholder) def parse_image_embeds(self, - image_embeds: Union[str, dict[str, str]], - ) -> None: + image_embeds: Union[str, dict[str, str]]) -> None: future: asyncio.Future[Union[str, dict[str, str]]] = asyncio.Future() if isinstance(image_embeds, dict): @@ -823,37 +776,29 @@ def parse_image_embeds(self, placeholder = self._tracker.add("image_embeds", future) self._add_placeholder("image", placeholder) - def parse_image_pil( - self, - image_pil: Image.Image, - ) -> None: + def parse_image_pil(self, image_pil: Image.Image) -> None: future: asyncio.Future[Image.Image] = asyncio.Future() future.set_result(image_pil) + placeholder = self._tracker.add("image", future) self._add_placeholder("image", placeholder) - def parse_audio( - self, - audio_url: str, - ) -> None: + def parse_audio(self, audio_url: str) -> None: audio_coro = self._connector.fetch_audio_async(audio_url) + placeholder = self._tracker.add("audio", audio_coro) self._add_placeholder("audio", placeholder) - def parse_input_audio( - self, - input_audio: InputAudio, - ) -> None: + def parse_input_audio(self, input_audio: InputAudio) -> None: audio_data = input_audio.get("data", "") audio_format = input_audio.get("format", "") audio_url = f"data:audio/{audio_format};base64,{audio_data}" + return self.parse_audio(audio_url) - def parse_video( - self, - video_url: str, - ) -> None: + def parse_video(self, video_url: str) -> None: video = self._connector.fetch_video_async(video_url=video_url) + placeholder = self._tracker.add("video", video) self._add_placeholder("video", placeholder) @@ -1016,22 +961,18 @@ def _get_full_multimodal_text_prompt(placeholder_storage: dict[str, list], "input_image": lambda part: _ResponsesInputImageParser(part).get("image_url", None), "image_url": - lambda part: _ImageParser(cast(dict, part)).get("image_url", {}).get( - "url", None), + lambda part: _ImageParser(part).get("image_url", {}).get("url", None), "image_embeds": - lambda part: _ImageEmbedsParser(cast(dict, part)).get("image_embeds", - None), + lambda part: _ImageEmbedsParser(part).get("image_embeds", None), "image_pil": lambda part: _PILImageParser(part).get("image_pil", None), "audio_url": - lambda part: _AudioParser(cast(dict, part)).get("audio_url", {}).get( - "url", None), + lambda part: _AudioParser(part).get("audio_url", {}).get("url", None), "input_audio": lambda part: _InputAudioParser(part).get("input_audio", None), "refusal": lambda part: _RefusalParser(part).get("refusal", None), "video_url": - lambda part: _VideoParser(cast(dict, part)).get("video_url", {}).get( - "url", None), + lambda part: _VideoParser(part).get("video_url", {}).get("url", None), } @@ -1270,10 +1211,7 @@ def parse_chat_messages( model_config: ModelConfig, tokenizer: AnyTokenizer, content_format: _ChatTemplateContentFormat, -) -> tuple[ - list[ConversationMessage], - Optional[MultiModalDataDict], -]: +) -> tuple[list[ConversationMessage], Optional[MultiModalDataDict]]: conversation: list[ConversationMessage] = [] mm_tracker = MultiModalItemTracker(model_config, tokenizer) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 8b241da7e620..728ed8328d36 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -809,7 +809,6 @@ def chat( if mm_data is not None: prompt["multi_modal_data"] = mm_data - # UUID-based multi_modal_ids support removed if mm_processor_kwargs is not None: prompt["mm_processor_kwargs"] = 
mm_processor_kwargs From 1f31339365e6f0d1a972fe3004782f1548034bed Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Sun, 24 Aug 2025 13:56:42 +0000 Subject: [PATCH 06/35] allow missing entry Signed-off-by: Roger Wang --- vllm/inputs/data.py | 14 +++++++----- vllm/inputs/preprocess.py | 42 +++++------------------------------ vllm/multimodal/hasher.py | 4 ++-- vllm/multimodal/processing.py | 34 +++++++++++++++++----------- vllm/v1/engine/processor.py | 40 +++++++++++++++++++++++++++++++++ 5 files changed, 77 insertions(+), 57 deletions(-) diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py index de8b03f2e7da..a0fe65d3c809 100644 --- a/vllm/inputs/data.py +++ b/vllm/inputs/data.py @@ -33,9 +33,10 @@ class TextPrompt(TypedDict): multi_modal_ids: NotRequired["MultiModalHashDict"] """ - Optional user-specified UUIDs for multimodal items, mapped by modality. - When provided, these IDs override the default hashes generated by the - multimodal processor for caching and deduplication. + Optional user-specified IDs for multimodal items, mapped by modality. + Lists must match the number of items per modality and may contain `None`. + For `None` entries, the hasher will compute IDs automatically; non-None + entries override the default hashes for caching. """ cache_salt: NotRequired[str] @@ -69,9 +70,10 @@ class TokensPrompt(TypedDict): multi_modal_ids: NotRequired["MultiModalHashDict"] """ - Optional user-specified UUIDs for multimodal items, mapped by modality. - When provided, these IDs override the default hashes generated by the - multimodal processor for caching and deduplication. + Optional user-specified IDs for multimodal items, mapped by modality. + Lists must match the number of items per modality and may contain `None`. + For `None` entries, the hasher will compute IDs automatically; non-None + entries override the default hashes for caching. """ cache_salt: NotRequired[str] diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 488bb660444e..4a78aaa55d53 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -253,7 +253,7 @@ def _process_multimodal( mm_data: MultiModalDataDict, mm_processor_kwargs: Optional[Mapping[str, object]], *, - mm_ids_override: Optional[dict[str, list[str]]] = None, + mm_ids_override: Optional[dict[str, list[Optional[str]]]] = None, tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, ) -> MultiModalInputs: @@ -269,19 +269,6 @@ def _process_multimodal( if mm_processor_kwargs is None: mm_processor_kwargs = {} - # If any override IDs are provided, enforce that all modalities/items - # present in mm_data are covered. 
- if mm_ids_override and any( - len(v) > 0 for v in mm_ids_override.values()): - for modality, items in mm_data.items(): - expected = len(items) if isinstance(items, list) else 1 - ids = mm_ids_override.get(modality) - if ids is None or len(ids) != expected: - raise ValueError( - "When providing 'multi_modal_ids', you must provide " - "IDs for every item of all modalities present in the " - "request.") - return mm_processor.apply( prompt, mm_data, @@ -296,7 +283,7 @@ async def _process_multimodal_async( mm_data: MultiModalDataDict, mm_processor_kwargs: Optional[Mapping[str, object]], *, - mm_ids_override: Optional[dict[str, list[str]]] = None, + mm_ids_override: Optional[dict[str, list[Optional[str]]]] = None, tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, ) -> MultiModalInputs: @@ -311,19 +298,6 @@ async def _process_multimodal_async( if mm_processor_kwargs is None: mm_processor_kwargs = {} - # If any override IDs are provided, enforce that all modalities/items - # present in mm_data are covered. - if mm_ids_override and any( - len(v) > 0 for v in mm_ids_override.values()): - for modality, items in mm_data.items(): - expected = len(items) if isinstance(items, list) else 1 - ids = mm_ids_override.get(modality) - if ids is None or len(ids) != expected: - raise ValueError( - "When providing 'multi_modal_ids', you must provide " - "IDs for every item of all modalities present in the " - "request.") - return mm_processor.apply( prompt, mm_data, @@ -377,8 +351,7 @@ def _process_tokens( prompt_token_ids, multi_modal_data, parsed_content.get("mm_processor_kwargs"), - mm_ids_override=cast(Optional[dict[str, list[str]]], - parsed_content.get("multi_modal_ids")), + mm_ids_override=parsed_content.get("multi_modal_ids"), tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, ) @@ -408,8 +381,7 @@ async def _process_tokens_async( prompt_token_ids, multi_modal_data, parsed_content.get("mm_processor_kwargs"), - mm_ids_override=cast(Optional[dict[str, list[str]]], - parsed_content.get("multi_modal_ids")), + mm_ids_override=parsed_content.get("multi_modal_ids"), tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, ) @@ -438,8 +410,7 @@ def _process_text( prompt_text, multi_modal_data, parsed_content.get("mm_processor_kwargs"), - mm_ids_override=cast(Optional[dict[str, list[str]]], - parsed_content.get("multi_modal_ids")), + mm_ids_override=parsed_content.get("multi_modal_ids"), tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, ) @@ -473,8 +444,7 @@ async def _process_text_async( prompt_text, multi_modal_data, parsed_content.get("mm_processor_kwargs"), - mm_ids_override=cast(Optional[dict[str, list[str]]], - parsed_content.get("multi_modal_ids")), + mm_ids_override=parsed_content.get("multi_modal_ids"), tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, ) diff --git a/vllm/multimodal/hasher.py b/vllm/multimodal/hasher.py index 210a4ec76287..607cde302607 100644 --- a/vllm/multimodal/hasher.py +++ b/vllm/multimodal/hasher.py @@ -4,7 +4,7 @@ import pickle import uuid from collections.abc import Iterable, Mapping -from typing import Union +from typing import Optional, Union import numpy as np import torch @@ -16,7 +16,7 @@ logger = init_logger(__name__) -MultiModalHashDict = Mapping[str, list[str]] +MultiModalHashDict = Mapping[str, list[Optional[str]]] """ A dictionary containing hashes for items in each modality. 
""" diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 1e48f0f46890..ff10c5f501c7 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -1356,7 +1356,7 @@ def _hash_mm_items( hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Mapping[str, object], *, - override_ids: Optional[dict[str, list[str]]] = None, + override_ids: Optional[dict[str, list[Optional[str]]]] = None, ) -> MultiModalHashes: """Create MM hashes to be returned (only used in V1).""" model_id = self.info.model_id @@ -1365,14 +1365,22 @@ def _hash_mm_items( override_ids = override_ids or {} for modality, items in mm_items.items(): - if modality in override_ids and override_ids[modality]: - ids = override_ids[modality] - if len(ids) != len(items): - raise ValueError( - f"multi_modal_ids for modality '{modality}' must " - f"have same length as data: got {len(ids)} ids vs " - f"{len(items)} items.") - hashes[modality] = list(ids) + if modality in override_ids: + mm_ids = override_ids[modality] + # For None entries, compute a hash; otherwise, use provided ID. + computed: list[str] = [] + for i, item in enumerate(items): + mm_id = mm_ids[i] + if mm_id is None: + computed.append( + MultiModalHasher.hash_kwargs( + model_id=model_id, + **{modality: item}, + **hf_processor_mm_kwargs, + **tokenization_kwargs)) + else: + computed.append(mm_id) + hashes[modality] = computed else: hashes[modality] = [ MultiModalHasher.hash_kwargs(model_id=model_id, @@ -1414,7 +1422,7 @@ def _apply_hf_processor( hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Mapping[str, object], *, - override_ids: Optional[dict[str, list[str]]] = None, + override_ids: Optional[dict[str, list[Optional[str]]]] = None, ) -> tuple[list[int], MultiModalProcessingInfo, bool]: ( prompt_ids, @@ -1462,7 +1470,7 @@ def _cached_apply_hf_processor( hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Mapping[str, object], *, - override_ids: Optional[dict[str, list[str]]] = None, + override_ids: Optional[dict[str, list[Optional[str]]]] = None, ) -> tuple[list[int], MultiModalProcessingInfo, bool]: """ Apply the HF processor on the full prompt text, @@ -1713,7 +1721,7 @@ def apply( hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Optional[Mapping[str, object]] = None, *, - mm_ids_override: Optional[dict[str, list[str]]] = None, + mm_ids_override: Optional[dict[str, list[Optional[str]]]] = None, ) -> MultiModalInputs: """ Process multi-modal inputs to be used in vLLM. @@ -1828,7 +1836,7 @@ def apply( hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Optional[Mapping[str, object]] = None, *, - mm_ids_override: Optional[dict[str, list[str]]] = None, + mm_ids_override: Optional[dict[str, list[Optional[str]]]] = None, ) -> MultiModalEncDecInputs: """ Process multi-modal inputs to be used in vLLM. diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 69f8e531e01b..3598393e72dd 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -147,6 +147,42 @@ def _validate_params( self._validate_sampling_params(params, lora_request) self._validate_supported_sampling_params(params) + def _validate_multi_modal_ids(self, prompt: PromptType) -> None: + """ + Validate that user-provided multi_modal_ids align with multi_modal_data + in the incoming request prompt(s). Only checks lengths; `None` entries + are allowed and will be auto-hashed downstream. 
+ """ + + def _validate_single(single_prompt: Union[dict, str]) -> None: + if not isinstance(single_prompt, dict): + return + mm_data = single_prompt.get("multi_modal_data") + mm_ids = single_prompt.get("multi_modal_ids") + if not mm_data or not mm_ids: + return + + for modality, items in mm_data.items(): + if modality in mm_ids: + expected = len(items) if isinstance(items, list) else 1 + if len(mm_ids[modality]) != expected: + raise ValueError( + f"multi_modal_ids for modality '{modality}' must " + "have same length as data: got " + f"{len(mm_ids[modality])} ids vs {expected} items." + ) + + # Handle explicit encoder/decoder prompts or singleton prompt + if isinstance(prompt, dict) and "encoder_prompt" in prompt: + enc = prompt.get("encoder_prompt") + dec = prompt.get("decoder_prompt") + if enc is not None: + _validate_single(enc) + if dec is not None: + _validate_single(dec) + else: + _validate_single(prompt) # type: ignore[arg-type] + def _validate_lora(self, lora_request: Optional[LoRARequest]) -> None: if lora_request is not None and not self.lora_config: raise ValueError(f"Got lora_request {lora_request} but LoRA is " @@ -248,6 +284,9 @@ def process_inputs( if arrival_time is None: arrival_time = time.time() + # Validate multimodal ids alignment with mm_data at request layer + self._validate_multi_modal_ids(prompt) + # Process inputs, which includes: # 1. Tokenize text prompt, with LoRA request if one exists. # 2. For multimodal models with a merged preprocessor, preprocess @@ -263,6 +302,7 @@ def process_inputs( params=params, processed_inputs=processed_inputs, ) + eos_token_id = self.input_preprocessor.get_eos_token_id(lora_request) self._validate_model_inputs(processed_inputs, lora_request) From 17566062be51e2637653cbeb400719eab8c552d1 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Sun, 24 Aug 2025 14:03:16 +0000 Subject: [PATCH 07/35] update Signed-off-by: Roger Wang --- vllm/multimodal/hasher.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/multimodal/hasher.py b/vllm/multimodal/hasher.py index 607cde302607..9aab8b3c680d 100644 --- a/vllm/multimodal/hasher.py +++ b/vllm/multimodal/hasher.py @@ -3,7 +3,7 @@ import pickle import uuid -from collections.abc import Iterable, Mapping +from collections.abc import Iterable from typing import Optional, Union import numpy as np @@ -16,7 +16,7 @@ logger = init_logger(__name__) -MultiModalHashDict = Mapping[str, list[Optional[str]]] +MultiModalHashDict = dict[str, list[Optional[str]]] """ A dictionary containing hashes for items in each modality. """ From a82a865bc54be7491e832a6b5a1030b4a9dc41b2 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Sun, 24 Aug 2025 14:25:28 +0000 Subject: [PATCH 08/35] update typing Signed-off-by: Roger Wang --- vllm/multimodal/processing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index ff10c5f501c7..b32dfee11945 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -983,7 +983,7 @@ def get_mm_max_tokens_per_item( _I = TypeVar("_I", bound=BaseProcessingInfo) -MultiModalHashes = dict[str, list[str]] +MultiModalHashes = dict[str, list[Optional[str]]] """ A collection of hashes with a similar structure as [`MultiModalKwargsItems`][vllm.multimodal.inputs.MultiModalKwargsItems]. 
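
Note on the state of the series at this point: the override plumbing is
complete end to end. A prompt may carry "multi_modal_ids" (renamed to
"multi_modal_uuids" in the next patch), entries set to None fall back to
content hashing inside _hash_mm_items, and the v1 engine processor checks
that each per-modality list matches the length of its multi_modal_data
entry. A minimal caller-side sketch of the intended usage as of this patch;
the model name, image files, and placeholder tokens are illustrative and
not part of the diff:

    from PIL import Image

    from vllm import LLM

    llm = LLM(model="Qwen/Qwen2-VL-2B-Instruct")  # illustrative model

    images = [Image.open("a.png"), Image.open("b.png")]  # illustrative inputs

    outputs = llm.generate({
        "prompt": "USER: <image><image>\nDescribe both images. ASSISTANT:",
        "multi_modal_data": {"image": images},
        # One entry per item: the explicit ID overrides the computed hash,
        # while the None entry is hashed from content by MultiModalHasher.
        "multi_modal_ids": {"image": ["user-img-001", None]},
    })
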
From c6a1e6a94b89e9cf396555bc378b948fdc599a36 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Sun, 24 Aug 2025 23:56:08 -0700 Subject: [PATCH 09/35] rename Signed-off-by: Roger Wang --- vllm/inputs/data.py | 8 ++++---- vllm/inputs/preprocess.py | 8 ++++---- vllm/v1/engine/processor.py | 31 ++++++++++++++++++------------- 3 files changed, 26 insertions(+), 21 deletions(-) diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py index a0fe65d3c809..49e52ea47c0f 100644 --- a/vllm/inputs/data.py +++ b/vllm/inputs/data.py @@ -31,9 +31,9 @@ class TextPrompt(TypedDict): to pass the mm_processor_kwargs to each of them. """ - multi_modal_ids: NotRequired["MultiModalHashDict"] + multi_modal_uuids: NotRequired["MultiModalHashDict"] """ - Optional user-specified IDs for multimodal items, mapped by modality. + Optional user-specified UUIDs for multimodal items, mapped by modality. Lists must match the number of items per modality and may contain `None`. For `None` entries, the hasher will compute IDs automatically; non-None entries override the default hashes for caching. @@ -68,9 +68,9 @@ class TokensPrompt(TypedDict): to pass the mm_processor_kwargs to each of them. """ - multi_modal_ids: NotRequired["MultiModalHashDict"] + multi_modal_uuids: NotRequired["MultiModalHashDict"] """ - Optional user-specified IDs for multimodal items, mapped by modality. + Optional user-specified UUIDs for multimodal items, mapped by modality. Lists must match the number of items per modality and may contain `None`. For `None` entries, the hasher will compute IDs automatically; non-None entries override the default hashes for caching. diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 4a78aaa55d53..708df51a1ef2 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -351,7 +351,7 @@ def _process_tokens( prompt_token_ids, multi_modal_data, parsed_content.get("mm_processor_kwargs"), - mm_ids_override=parsed_content.get("multi_modal_ids"), + mm_ids_override=parsed_content.get("multi_modal_uuids"), tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, ) @@ -381,7 +381,7 @@ async def _process_tokens_async( prompt_token_ids, multi_modal_data, parsed_content.get("mm_processor_kwargs"), - mm_ids_override=parsed_content.get("multi_modal_ids"), + mm_ids_override=parsed_content.get("multi_modal_uuids"), tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, ) @@ -410,7 +410,7 @@ def _process_text( prompt_text, multi_modal_data, parsed_content.get("mm_processor_kwargs"), - mm_ids_override=parsed_content.get("multi_modal_ids"), + mm_ids_override=parsed_content.get("multi_modal_uuids"), tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, ) @@ -444,7 +444,7 @@ async def _process_text_async( prompt_text, multi_modal_data, parsed_content.get("mm_processor_kwargs"), - mm_ids_override=parsed_content.get("multi_modal_ids"), + mm_ids_override=parsed_content.get("multi_modal_uuids"), tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, ) diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 3598393e72dd..eaf2c2731c2f 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -147,30 +147,35 @@ def _validate_params( self._validate_sampling_params(params, lora_request) self._validate_supported_sampling_params(params) - def _validate_multi_modal_ids(self, prompt: PromptType) -> None: + def _validate_multi_modal_uuids(self, prompt: PromptType) -> None: """ - Validate that user-provided multi_modal_ids align 
with multi_modal_data - in the incoming request prompt(s). Only checks lengths; `None` entries - are allowed and will be auto-hashed downstream. + Validate that user-provided multi_modal_uuids align with + multi_modal_data in the incoming request prompt(s). + Only checks lengths; `None` entries are allowed and will be + auto-hashed downstream. """ def _validate_single(single_prompt: Union[dict, str]) -> None: if not isinstance(single_prompt, dict): return mm_data = single_prompt.get("multi_modal_data") - mm_ids = single_prompt.get("multi_modal_ids") - if not mm_data or not mm_ids: + mm_uuids = single_prompt.get("multi_modal_uuids") + if not mm_data or not mm_uuids: return for modality, items in mm_data.items(): - if modality in mm_ids: + if modality in mm_uuids: expected = len(items) if isinstance(items, list) else 1 - if len(mm_ids[modality]) != expected: + if len(mm_uuids[modality]) != expected: raise ValueError( - f"multi_modal_ids for modality '{modality}' must " - "have same length as data: got " - f"{len(mm_ids[modality])} ids vs {expected} items." - ) + f"multi_modal_uuids for modality '{modality}' " + "must have same length as data: got " + f"{len(mm_uuids[modality])} uuids vs " + f"{expected} items.") + else: + raise ValueError( + f"multi_modal_uuids for modality '{modality}' must " + "be provided if multi_modal_data is provided.") # Handle explicit encoder/decoder prompts or singleton prompt if isinstance(prompt, dict) and "encoder_prompt" in prompt: @@ -285,7 +290,7 @@ def process_inputs( arrival_time = time.time() # Validate multimodal ids alignment with mm_data at request layer - self._validate_multi_modal_ids(prompt) + self._validate_multi_modal_uuids(prompt) # Process inputs, which includes: # 1. Tokenize text prompt, with LoRA request if one exists. From 0af3999b77d54b1a58aaaa22cf1cae7f9ed5c705 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Sun, 24 Aug 2025 23:57:50 -0700 Subject: [PATCH 10/35] comment Signed-off-by: Roger Wang --- vllm/inputs/data.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py index 49e52ea47c0f..3ed6bed294aa 100644 --- a/vllm/inputs/data.py +++ b/vllm/inputs/data.py @@ -36,7 +36,8 @@ class TextPrompt(TypedDict): Optional user-specified UUIDs for multimodal items, mapped by modality. Lists must match the number of items per modality and may contain `None`. For `None` entries, the hasher will compute IDs automatically; non-None - entries override the default hashes for caching. + entries override the default hashes for caching, and MUST be unique per + multimodal item. 
""" cache_salt: NotRequired[str] From 6defa1d331a3712736243ba1b6498fbe4b34e8b3 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Mon, 25 Aug 2025 00:08:58 -0700 Subject: [PATCH 11/35] comments Signed-off-by: Roger Wang --- vllm/inputs/preprocess.py | 16 ++++++------- vllm/multimodal/processing.py | 45 ++++++++++++++++++++++------------- 2 files changed, 36 insertions(+), 25 deletions(-) diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 708df51a1ef2..4e031204f05e 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -253,7 +253,7 @@ def _process_multimodal( mm_data: MultiModalDataDict, mm_processor_kwargs: Optional[Mapping[str, object]], *, - mm_ids_override: Optional[dict[str, list[Optional[str]]]] = None, + mm_uuids: Optional[dict[str, list[Optional[str]]]] = None, tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, ) -> MultiModalInputs: @@ -274,7 +274,7 @@ def _process_multimodal( mm_data, hf_processor_mm_kwargs=mm_processor_kwargs, tokenization_kwargs=tokenization_kwargs, - mm_ids_override=mm_ids_override, + mm_uuids=mm_uuids, ) async def _process_multimodal_async( @@ -283,7 +283,7 @@ async def _process_multimodal_async( mm_data: MultiModalDataDict, mm_processor_kwargs: Optional[Mapping[str, object]], *, - mm_ids_override: Optional[dict[str, list[Optional[str]]]] = None, + mm_uuids: Optional[dict[str, list[Optional[str]]]] = None, tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, ) -> MultiModalInputs: @@ -303,7 +303,7 @@ async def _process_multimodal_async( mm_data, hf_processor_mm_kwargs=mm_processor_kwargs, tokenization_kwargs=tokenization_kwargs, - mm_ids_override=mm_ids_override, + mm_uuids=mm_uuids, ) def _process_embeds( @@ -351,7 +351,7 @@ def _process_tokens( prompt_token_ids, multi_modal_data, parsed_content.get("mm_processor_kwargs"), - mm_ids_override=parsed_content.get("multi_modal_uuids"), + mm_uuids=parsed_content.get("multi_modal_uuids"), tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, ) @@ -381,7 +381,7 @@ async def _process_tokens_async( prompt_token_ids, multi_modal_data, parsed_content.get("mm_processor_kwargs"), - mm_ids_override=parsed_content.get("multi_modal_uuids"), + mm_uuids=parsed_content.get("multi_modal_uuids"), tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, ) @@ -410,7 +410,7 @@ def _process_text( prompt_text, multi_modal_data, parsed_content.get("mm_processor_kwargs"), - mm_ids_override=parsed_content.get("multi_modal_uuids"), + mm_uuids=parsed_content.get("multi_modal_uuids"), tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, ) @@ -444,7 +444,7 @@ async def _process_text_async( prompt_text, multi_modal_data, parsed_content.get("mm_processor_kwargs"), - mm_ids_override=parsed_content.get("multi_modal_uuids"), + mm_uuids=parsed_content.get("multi_modal_uuids"), tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, ) diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index b32dfee11945..257d6175393e 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -1356,22 +1356,33 @@ def _hash_mm_items( hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Mapping[str, object], *, - override_ids: Optional[dict[str, list[Optional[str]]]] = None, + mm_uuids: Optional[dict[str, list[Optional[str]]]] = None, ) -> MultiModalHashes: - """Create MM hashes to be returned (only used in V1).""" + """Create MM 
hashes to be returned (only used in V1). + + If `mm_uuids` is provided, it will override the default hashes for + caching. + """ model_id = self.info.model_id hashes: MultiModalHashes = {} - override_ids = override_ids or {} + mm_uuids = mm_uuids or {} for modality, items in mm_items.items(): - if modality in override_ids: - mm_ids = override_ids[modality] + if modality in mm_uuids: + mm_uuid_list = mm_uuids[modality] # For None entries, compute a hash; otherwise, use provided ID. computed: list[str] = [] for i, item in enumerate(items): - mm_id = mm_ids[i] - if mm_id is None: + mm_uuid = mm_uuid_list[i] + + # NOTE: Even if `mm_uuid` is provided, we still compute a + # hash if `hf_processor_mm_kwargs` or `tokenization_kwargs` + # are provided. This is because the processed multimodal + # inputs can be different depending on the processor kwargs. + if mm_uuid is None or \ + hf_processor_mm_kwargs or \ + tokenization_kwargs: computed.append( MultiModalHasher.hash_kwargs( model_id=model_id, @@ -1379,7 +1390,7 @@ def _hash_mm_items( **hf_processor_mm_kwargs, **tokenization_kwargs)) else: - computed.append(mm_id) + computed.append(mm_uuid) hashes[modality] = computed else: hashes[modality] = [ @@ -1422,7 +1433,7 @@ def _apply_hf_processor( hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Mapping[str, object], *, - override_ids: Optional[dict[str, list[Optional[str]]]] = None, + mm_uuids: Optional[dict[str, list[Optional[str]]]] = None, ) -> tuple[list[int], MultiModalProcessingInfo, bool]: ( prompt_ids, @@ -1445,7 +1456,7 @@ def _apply_hf_processor( mm_hashes = self._hash_mm_items(mm_data_items, hf_processor_mm_kwargs, tokenization_kwargs, - override_ids=override_ids) + mm_uuids=mm_uuids) unbound_prompt_updates = self._get_prompt_updates( mm_data_items, @@ -1470,7 +1481,7 @@ def _cached_apply_hf_processor( hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Mapping[str, object], *, - override_ids: Optional[dict[str, list[Optional[str]]]] = None, + mm_uuids: Optional[dict[str, list[Optional[str]]]] = None, ) -> tuple[list[int], MultiModalProcessingInfo, bool]: """ Apply the HF processor on the full prompt text, @@ -1485,13 +1496,13 @@ def _cached_apply_hf_processor( mm_data_items=mm_data_items, hf_processor_mm_kwargs=hf_processor_mm_kwargs, tokenization_kwargs=tokenization_kwargs, - override_ids=override_ids, + mm_uuids=mm_uuids, ) mm_hashes = self._hash_mm_items(mm_data_items, hf_processor_mm_kwargs, tokenization_kwargs, - override_ids=override_ids) + mm_uuids=mm_uuids) ( mm_cache_items_or_hashes, mm_missing_data_items, @@ -1721,7 +1732,7 @@ def apply( hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Optional[Mapping[str, object]] = None, *, - mm_ids_override: Optional[dict[str, list[Optional[str]]]] = None, + mm_uuids: Optional[dict[str, list[Optional[str]]]] = None, ) -> MultiModalInputs: """ Process multi-modal inputs to be used in vLLM. @@ -1750,7 +1761,7 @@ def apply( mm_items, hf_processor_mm_kwargs, tokenization_kwargs=tokenization_kwargs, - override_ids=mm_ids_override, + mm_uuids=mm_uuids, ) # NOTE: tokenization_kwargs are not required to init processor @@ -1836,7 +1847,7 @@ def apply( hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Optional[Mapping[str, object]] = None, *, - mm_ids_override: Optional[dict[str, list[Optional[str]]]] = None, + mm_uuids: Optional[dict[str, list[Optional[str]]]] = None, ) -> MultiModalEncDecInputs: """ Process multi-modal inputs to be used in vLLM. 
@@ -1851,7 +1862,7 @@ def apply( mm_data, hf_processor_mm_kwargs, tokenization_kwargs, - mm_ids_override=mm_ids_override, + mm_uuids=mm_uuids, ) return self._get_enc_dec_inputs( From cb406f3ccead12e1f35e4eccf639997edde8f5b2 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Mon, 25 Aug 2025 00:30:20 -0700 Subject: [PATCH 12/35] tweak Signed-off-by: Roger Wang --- vllm/multimodal/processing.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 257d6175393e..d97379a2f3a3 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -1383,6 +1383,10 @@ def _hash_mm_items( if mm_uuid is None or \ hf_processor_mm_kwargs or \ tokenization_kwargs: + + # NOTE: use uuid to hash if available for better + # performance. + item = mm_uuid if mm_uuid is not None else item computed.append( MultiModalHasher.hash_kwargs( model_id=model_id, From 977811bfaf46da1ad1d5a4342c8bbab3ec20cfaa Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Mon, 25 Aug 2025 02:43:45 -0700 Subject: [PATCH 13/35] typing Signed-off-by: Roger Wang --- vllm/inputs/data.py | 8 ++++---- vllm/inputs/preprocess.py | 6 +++--- vllm/multimodal/hasher.py | 4 ++-- vllm/multimodal/inputs.py | 8 ++++++++ vllm/multimodal/processing.py | 12 ++++++------ 5 files changed, 23 insertions(+), 15 deletions(-) diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py index 3ed6bed294aa..e4e0c3581afb 100644 --- a/vllm/inputs/data.py +++ b/vllm/inputs/data.py @@ -7,8 +7,8 @@ from typing_extensions import NotRequired, TypedDict, TypeIs, TypeVar if TYPE_CHECKING: - from vllm.multimodal.hasher import MultiModalHashDict - from vllm.multimodal.inputs import MultiModalDataDict, MultiModalInputs + from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalInputs, + MultiModalUUIDDict) class TextPrompt(TypedDict): @@ -31,7 +31,7 @@ class TextPrompt(TypedDict): to pass the mm_processor_kwargs to each of them. """ - multi_modal_uuids: NotRequired["MultiModalHashDict"] + multi_modal_uuids: NotRequired["MultiModalUUIDDict"] """ Optional user-specified UUIDs for multimodal items, mapped by modality. Lists must match the number of items per modality and may contain `None`. @@ -69,7 +69,7 @@ class TokensPrompt(TypedDict): to pass the mm_processor_kwargs to each of them. """ - multi_modal_uuids: NotRequired["MultiModalHashDict"] + multi_modal_uuids: NotRequired["MultiModalUUIDDict"] """ Optional user-specified UUIDs for multimodal items, mapped by modality. Lists must match the number of items per modality and may contain `None`. 
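A usage sketch of the `multi_modal_uuids` field documented in the data.py hunk above (illustrative only: `img_a` and `img_b` are assumed PIL images and `llm` an offline `vllm.LLM` instance):

    # Pin a stable UUID for the first image; leave the second as None so
    # the hasher derives its ID from content as before.
    prompt = {
        "prompt": "USER: <image><image>\nDescribe both images.\nASSISTANT:",
        "multi_modal_data": {"image": [img_a, img_b]},
        "multi_modal_uuids": {"image": ["catalog-img-0001", None]},
    }
    outputs = llm.generate(prompt)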
diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 4e031204f05e..933e6a432b40 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -12,7 +12,7 @@ from vllm.lora.request import LoRARequest from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalEncDecInputs, - MultiModalInputs) + MultiModalInputs, MultiModalUUIDDict) from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.transformers_utils.tokenizer_group import TokenizerGroup @@ -253,7 +253,7 @@ def _process_multimodal( mm_data: MultiModalDataDict, mm_processor_kwargs: Optional[Mapping[str, object]], *, - mm_uuids: Optional[dict[str, list[Optional[str]]]] = None, + mm_uuids: Optional[MultiModalUUIDDict] = None, tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, ) -> MultiModalInputs: @@ -283,7 +283,7 @@ async def _process_multimodal_async( mm_data: MultiModalDataDict, mm_processor_kwargs: Optional[Mapping[str, object]], *, - mm_uuids: Optional[dict[str, list[Optional[str]]]] = None, + mm_uuids: Optional[MultiModalUUIDDict] = None, tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, ) -> MultiModalInputs: diff --git a/vllm/multimodal/hasher.py b/vllm/multimodal/hasher.py index 9aab8b3c680d..7a561a2ee74f 100644 --- a/vllm/multimodal/hasher.py +++ b/vllm/multimodal/hasher.py @@ -4,7 +4,7 @@ import pickle import uuid from collections.abc import Iterable -from typing import Optional, Union +from typing import Union import numpy as np import torch @@ -16,7 +16,7 @@ logger = init_logger(__name__) -MultiModalHashDict = dict[str, list[Optional[str]]] +MultiModalHashDict = dict[str, list[str]] """ A dictionary containing hashes for items in each modality. """ diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index 581f9a109cce..765a2c11bc22 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -115,6 +115,14 @@ class MultiModalDataBuiltins(TypedDict, total=False): [`MultiModalDataBuiltins`][vllm.multimodal.inputs.MultiModalDataBuiltins]. """ +MultiModalUUIDDict = dict[str, list[Optional[str]]] +""" +A user-input dictionary containing user-provided UUIDs for items in each +modality. +If a UUID for an item is not provided, its entry will be `None` and +MultiModalHasher will compute a hash for the item. +""" + @dataclass(frozen=True) class PlaceholderRange: diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index d97379a2f3a3..4b422fc4c4e1 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -25,7 +25,7 @@ from .inputs import (MultiModalDataDict, MultiModalEncDecInputs, MultiModalFieldConfig, MultiModalInputs, MultiModalKwargsItem, MultiModalKwargsItems, - PlaceholderRange) + MultiModalUUIDDict, PlaceholderRange) from .parse import (DictEmbeddingItems, EmbeddingItems, MultiModalDataItems, MultiModalDataParser) @@ -1356,7 +1356,7 @@ def _hash_mm_items( hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Mapping[str, object], *, - mm_uuids: Optional[dict[str, list[Optional[str]]]] = None, + mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> MultiModalHashes: """Create MM hashes to be returned (only used in V1). 
@@ -1437,7 +1437,7 @@ def _apply_hf_processor( hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Mapping[str, object], *, - mm_uuids: Optional[dict[str, list[Optional[str]]]] = None, + mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> tuple[list[int], MultiModalProcessingInfo, bool]: ( prompt_ids, @@ -1485,7 +1485,7 @@ def _cached_apply_hf_processor( hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Mapping[str, object], *, - mm_uuids: Optional[dict[str, list[Optional[str]]]] = None, + mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> tuple[list[int], MultiModalProcessingInfo, bool]: """ Apply the HF processor on the full prompt text, @@ -1736,7 +1736,7 @@ def apply( hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Optional[Mapping[str, object]] = None, *, - mm_uuids: Optional[dict[str, list[Optional[str]]]] = None, + mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> MultiModalInputs: """ Process multi-modal inputs to be used in vLLM. @@ -1851,7 +1851,7 @@ def apply( hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Optional[Mapping[str, object]] = None, *, - mm_uuids: Optional[dict[str, list[Optional[str]]]] = None, + mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> MultiModalEncDecInputs: """ Process multi-modal inputs to be used in vLLM. From f26890b65ad7d663063403791cd712a5fcb8c8c5 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Mon, 25 Aug 2025 02:59:51 -0700 Subject: [PATCH 14/35] clarify Signed-off-by: Roger Wang --- vllm/multimodal/inputs.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index 765a2c11bc22..df7229c76d8d 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -117,10 +117,12 @@ class MultiModalDataBuiltins(TypedDict, total=False): MultiModalUUIDDict = dict[str, list[Optional[str]]] """ -A user-input dictionary containing user-provided UUIDs for items in each -modality. +A dictionary containing user-provided UUIDs for items in each modality. If a UUID for an item is not provided, its entry will be `None` and MultiModalHasher will compute a hash for the item. + +The UUID will be used to identify the item for all caching purposes +(input processing caching, embedding caching, prefix caching, etc). """ From 06a69b02a030704c7413f059ffac64afd1126ef4 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Tue, 26 Aug 2025 07:21:11 +0000 Subject: [PATCH 15/35] relax on user-input data structure Signed-off-by: Roger Wang --- vllm/multimodal/inputs.py | 2 +- vllm/v1/engine/processor.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index df7229c76d8d..4a604c28edfb 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -115,7 +115,7 @@ class MultiModalDataBuiltins(TypedDict, total=False): [`MultiModalDataBuiltins`][vllm.multimodal.inputs.MultiModalDataBuiltins]. """ -MultiModalUUIDDict = dict[str, list[Optional[str]]] +MultiModalUUIDDict = dict[str, Union[list[Optional[str]], str]] """ A dictionary containing user-provided UUIDs for items in each modality. 
If a UUID for an item is not provided, its entry will be `None` and diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index eaf2c2731c2f..93029bdfad5b 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -166,6 +166,8 @@ def _validate_single(single_prompt: Union[dict, str]) -> None: for modality, items in mm_data.items(): if modality in mm_uuids: expected = len(items) if isinstance(items, list) else 1 + if isinstance(mm_uuids[modality], str): + mm_uuids[modality] = [mm_uuids[modality]] if len(mm_uuids[modality]) != expected: raise ValueError( f"multi_modal_uuids for modality '{modality}' " From d9c97fc149abdbcb9dc90e53ed8dd0c3a5cb2d70 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Tue, 26 Aug 2025 07:56:59 +0000 Subject: [PATCH 16/35] fix import Signed-off-by: Roger Wang --- vllm/multimodal/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py index 69eed2274144..7efd9dd1ae22 100644 --- a/vllm/multimodal/__init__.py +++ b/vllm/multimodal/__init__.py @@ -5,7 +5,7 @@ from .inputs import (BatchedTensorInputs, ModalityData, MultiModalDataBuiltins, MultiModalDataDict, MultiModalKwargs, MultiModalKwargsItems, MultiModalPlaceholderDict, - NestedTensors) + MultiModalUUIDDict, NestedTensors) from .registry import MultiModalRegistry MULTIMODAL_REGISTRY = MultiModalRegistry() @@ -29,6 +29,7 @@ "MultiModalKwargsItems", "MultiModalPlaceholderDict", "MultiModalPlaceholderMap", + "MultiModalUUIDDict", "NestedTensors", "MULTIMODAL_REGISTRY", "MultiModalRegistry", From 2663d0eeb5dd5d8a961371d443388c4bfa02df6e Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Tue, 26 Aug 2025 08:52:50 +0000 Subject: [PATCH 17/35] typing Signed-off-by: Roger Wang --- vllm/multimodal/processing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 037390dfa925..9ea8c5781928 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -960,7 +960,7 @@ def get_mm_max_tokens_per_item( _I = TypeVar("_I", bound=BaseProcessingInfo) -MultiModalHashes = dict[str, list[Optional[str]]] +MultiModalHashes = dict[str, list[str]] """ A collection of hashes with a similar structure as [`MultiModalKwargsItems`][vllm.multimodal.inputs.MultiModalKwargsItems]. From 86d7e469f0c750f33aa3613391f268ddddf9c3b1 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Tue, 26 Aug 2025 09:05:02 +0000 Subject: [PATCH 18/35] typing cleanup Signed-off-by: Roger Wang --- vllm/inputs/data.py | 6 +++--- vllm/inputs/preprocess.py | 6 +++--- vllm/multimodal/__init__.py | 7 +++---- vllm/multimodal/hasher.py | 5 ----- vllm/multimodal/inputs.py | 7 ++++--- vllm/multimodal/processing.py | 12 ++++++------ 6 files changed, 19 insertions(+), 24 deletions(-) diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py index e4e0c3581afb..12469d220da5 100644 --- a/vllm/inputs/data.py +++ b/vllm/inputs/data.py @@ -8,7 +8,7 @@ if TYPE_CHECKING: from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalInputs, - MultiModalUUIDDict) + MultiModalUUIDs) class TextPrompt(TypedDict): @@ -31,7 +31,7 @@ class TextPrompt(TypedDict): to pass the mm_processor_kwargs to each of them. """ - multi_modal_uuids: NotRequired["MultiModalUUIDDict"] + multi_modal_uuids: NotRequired["MultiModalUUIDs"] """ Optional user-specified UUIDs for multimodal items, mapped by modality. Lists must match the number of items per modality and may contain `None`. 
@@ -69,7 +69,7 @@ class TokensPrompt(TypedDict): to pass the mm_processor_kwargs to each of them. """ - multi_modal_uuids: NotRequired["MultiModalUUIDDict"] + multi_modal_uuids: NotRequired["MultiModalUUIDs"] """ Optional user-specified UUIDs for multimodal items, mapped by modality. Lists must match the number of items per modality and may contain `None`. diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 933e6a432b40..914dfc03286d 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -12,7 +12,7 @@ from vllm.lora.request import LoRARequest from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalEncDecInputs, - MultiModalInputs, MultiModalUUIDDict) + MultiModalInputs, MultiModalUUIDs) from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.transformers_utils.tokenizer_group import TokenizerGroup @@ -253,7 +253,7 @@ def _process_multimodal( mm_data: MultiModalDataDict, mm_processor_kwargs: Optional[Mapping[str, object]], *, - mm_uuids: Optional[MultiModalUUIDDict] = None, + mm_uuids: Optional[MultiModalUUIDs] = None, tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, ) -> MultiModalInputs: @@ -283,7 +283,7 @@ async def _process_multimodal_async( mm_data: MultiModalDataDict, mm_processor_kwargs: Optional[Mapping[str, object]], *, - mm_uuids: Optional[MultiModalUUIDDict] = None, + mm_uuids: Optional[MultiModalUUIDs] = None, tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, ) -> MultiModalInputs: diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py index 7efd9dd1ae22..6bf08b6d8e75 100644 --- a/vllm/multimodal/__init__.py +++ b/vllm/multimodal/__init__.py @@ -1,11 +1,11 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from .base import MultiModalPlaceholderMap -from .hasher import MultiModalHashDict, MultiModalHasher +from .hasher import MultiModalHasher from .inputs import (BatchedTensorInputs, ModalityData, MultiModalDataBuiltins, MultiModalDataDict, MultiModalKwargs, MultiModalKwargsItems, MultiModalPlaceholderDict, - MultiModalUUIDDict, NestedTensors) + MultiModalUUIDs, NestedTensors) from .registry import MultiModalRegistry MULTIMODAL_REGISTRY = MultiModalRegistry() @@ -23,13 +23,12 @@ "ModalityData", "MultiModalDataBuiltins", "MultiModalDataDict", - "MultiModalHashDict", "MultiModalHasher", "MultiModalKwargs", "MultiModalKwargsItems", "MultiModalPlaceholderDict", "MultiModalPlaceholderMap", - "MultiModalUUIDDict", + "MultiModalUUIDs", "NestedTensors", "MULTIMODAL_REGISTRY", "MultiModalRegistry", diff --git a/vllm/multimodal/hasher.py b/vllm/multimodal/hasher.py index 7a561a2ee74f..96a12b0a111b 100644 --- a/vllm/multimodal/hasher.py +++ b/vllm/multimodal/hasher.py @@ -16,11 +16,6 @@ logger = init_logger(__name__) -MultiModalHashDict = dict[str, list[str]] -""" -A dictionary containing hashes for items in each modality. 
-""" - class MultiModalHasher: diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index 4a604c28edfb..315e363dc706 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -22,7 +22,8 @@ from PIL.Image import Image from transformers.feature_extraction_utils import BatchFeature - from .hasher import MultiModalHashDict + from .processing import MultiModalHashes + else: torch = LazyLoader("torch", globals(), "torch") @@ -115,7 +116,7 @@ class MultiModalDataBuiltins(TypedDict, total=False): [`MultiModalDataBuiltins`][vllm.multimodal.inputs.MultiModalDataBuiltins]. """ -MultiModalUUIDDict = dict[str, Union[list[Optional[str]], str]] +MultiModalUUIDs = dict[str, Union[list[Optional[str]], str]] """ A dictionary containing user-provided UUIDs for items in each modality. If a UUID for an item is not provided, its entry will be `None` and @@ -911,7 +912,7 @@ class MultiModalInputs(TypedDict): mm_kwargs: MultiModalKwargsItems """Keyword arguments to be directly passed to the model after batching.""" - mm_hashes: "MultiModalHashDict" + mm_hashes: "MultiModalHashes" """The hashes of the multi-modal data.""" mm_placeholders: "MultiModalPlaceholderDict" diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 9ea8c5781928..4f355b03e952 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -25,7 +25,7 @@ from .inputs import (MultiModalDataDict, MultiModalEncDecInputs, MultiModalFieldConfig, MultiModalInputs, MultiModalKwargsItem, MultiModalKwargsItems, - MultiModalUUIDDict, PlaceholderRange) + MultiModalUUIDs, PlaceholderRange) from .parse import (DictEmbeddingItems, EmbeddingItems, MultiModalDataItems, MultiModalDataParser) @@ -1387,7 +1387,7 @@ def _hash_mm_items( hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Mapping[str, object], *, - mm_uuids: Optional[MultiModalUUIDDict] = None, + mm_uuids: Optional[MultiModalUUIDs] = None, ) -> MultiModalHashes: """Create MM hashes to be returned (only used in V1). @@ -1468,7 +1468,7 @@ def _apply_hf_processor( hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Mapping[str, object], *, - mm_uuids: Optional[MultiModalUUIDDict] = None, + mm_uuids: Optional[MultiModalUUIDs] = None, ) -> tuple[list[int], MultiModalProcessingInfo, bool]: ( prompt_ids, @@ -1514,7 +1514,7 @@ def _cached_apply_hf_processor( hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Mapping[str, object], *, - mm_uuids: Optional[MultiModalUUIDDict] = None, + mm_uuids: Optional[MultiModalUUIDs] = None, ) -> tuple[list[int], MultiModalProcessingInfo, bool]: """ Apply the HF processor on the full prompt text, @@ -1737,7 +1737,7 @@ def apply( hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Optional[Mapping[str, object]] = None, *, - mm_uuids: Optional[MultiModalUUIDDict] = None, + mm_uuids: Optional[MultiModalUUIDs] = None, ) -> MultiModalInputs: """ Process multi-modal inputs to be used in vLLM. @@ -1852,7 +1852,7 @@ def apply( hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Optional[Mapping[str, object]] = None, *, - mm_uuids: Optional[MultiModalUUIDDict] = None, + mm_uuids: Optional[MultiModalUUIDs] = None, ) -> MultiModalEncDecInputs: """ Process multi-modal inputs to be used in vLLM. 
From 175bfe476c1a8fd8650790e210898778fe697ea1 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Wed, 27 Aug 2025 21:23:46 +0000 Subject: [PATCH 19/35] update Signed-off-by: Roger Wang --- vllm/inputs/preprocess.py | 26 ++++++++++++++++---------- vllm/multimodal/processing.py | 35 ++++++++++++++++++++--------------- vllm/v1/engine/processor.py | 14 +++++++++----- 3 files changed, 45 insertions(+), 30 deletions(-) diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index b2729450470d..c364e9ad6ba6 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -258,7 +258,8 @@ def _process_multimodal( tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, *, - mm_hash_overrides: Optional[dict[str, list[str]]] = None, + mm_hash_overrides: Optional[Union[dict[str, list[str]], + MultiModalUUIDs]] = None, ) -> MultiModalInputs: """ Apply the model's multi-modal processor to a multi-modal prompt, @@ -385,7 +386,8 @@ async def _process_tokens_async( tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, *, - mm_hash_overrides: Optional[dict[str, list[str]]] = None, + mm_hash_overrides: Optional[Union[dict[str, list[str]], + MultiModalUUIDs]] = None, ) -> Union[TokenInputs, MultiModalInputs]: prompt_token_ids = parsed_content["prompt_token_ids"] token_type_ids = parsed_content.get("token_type_ids") @@ -418,7 +420,8 @@ def _process_text( tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, *, - mm_hash_overrides: Optional[dict[str, list[str]]] = None, + mm_hash_overrides: Optional[Union[dict[str, list[str]], + MultiModalUUIDs]] = None, ) -> Union[TokenInputs, MultiModalInputs]: prompt_text = parsed_content["prompt"] @@ -428,7 +431,6 @@ def _process_text( prompt_text, multi_modal_data, parsed_content.get("mm_processor_kwargs"), - mm_uuids=parsed_content.get("multi_modal_uuids"), tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, mm_hash_overrides=mm_hash_overrides, @@ -465,7 +467,6 @@ async def _process_text_async( prompt_text, multi_modal_data, parsed_content.get("mm_processor_kwargs"), - mm_uuids=parsed_content.get("multi_modal_uuids"), tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, mm_hash_overrides=mm_hash_overrides, @@ -492,7 +493,8 @@ def _prompt_to_llm_inputs( tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, *, - mm_hash_overrides: Optional[dict[str, list[str]]] = None, + mm_hash_overrides: Optional[Union[dict[str, list[str]], + MultiModalUUIDs]] = None, ) -> SingletonInputs: """ Extract the singleton inputs from a prompt. 
@@ -539,7 +541,8 @@ async def _prompt_to_llm_inputs_async( tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, *, - mm_hash_overrides: Optional[dict[str, list[str]]] = None, + mm_hash_overrides: Optional[Union[dict[str, list[str]], + MultiModalUUIDs]] = None, ) -> SingletonInputs: """ Async version of @@ -678,7 +681,8 @@ def _process_encoder_decoder_prompt( prompt: PromptType, tokenization_kwargs: Optional[dict[str, Any]] = None, *, - mm_hash_overrides: Optional[dict[str, list[str]]] = None, + mm_hash_overrides: Optional[Union[dict[str, list[str]], + MultiModalUUIDs]] = None, ) -> EncoderDecoderInputs: """ For encoder/decoder models only: @@ -873,7 +877,8 @@ def preprocess( tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, *, - mm_hash_overrides: Optional[dict[str, list[str]]] = None, + mm_hash_overrides: Optional[Union[dict[str, list[str]], + MultiModalUUIDs]] = None, ) -> ProcessorInputs: """Preprocess the input prompt.""" if self.model_config.is_encoder_decoder: @@ -903,7 +908,8 @@ async def preprocess_async( tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, *, - mm_hash_overrides: Optional[dict[str, list[str]]] = None, + mm_hash_overrides: Optional[Union[dict[str, list[str]], + MultiModalUUIDs]] = None, ) -> ProcessorInputs: """ Async version of diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index b3f00ba92bf0..7ac221b97ed0 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -24,7 +24,8 @@ from .inputs import (MultiModalDataDict, MultiModalEncDecInputs, MultiModalFieldConfig, MultiModalInputs, MultiModalKwargsItem, MultiModalKwargsItems, - MultiModalUUIDs, MultiModalKwargsOptionalItems, PlaceholderRange) + MultiModalKwargsOptionalItems, MultiModalUUIDs, + PlaceholderRange) from .parse import (DictEmbeddingItems, EmbeddingItems, MultiModalDataItems, MultiModalDataParser) @@ -1362,7 +1363,8 @@ def _hash_mm_items( hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Mapping[str, object], *, - mm_uuids: Optional[MultiModalUUIDs] = None, + mm_hash_overrides: Optional[Union[dict[str, list[str]], + MultiModalUUIDs]] = None, ) -> MultiModalHashes: """Create MM hashes to be returned (only used in V1). @@ -1373,27 +1375,27 @@ def _hash_mm_items( model_id = self.info.model_id hashes: MultiModalHashes = {} - mm_uuids = mm_uuids or {} + mm_hash_overrides = mm_hash_overrides or {} for modality, items in mm_items.items(): - if modality in mm_uuids: - mm_uuid_list = mm_uuids[modality] + if modality in mm_hash_overrides: + mm_hash_list = mm_hash_overrides[modality] # For None entries, compute a hash; otherwise, use provided ID. computed: list[str] = [] for i, item in enumerate(items): - mm_uuid = mm_uuid_list[i] + mm_hash = mm_hash_list[i] - # NOTE: Even if `mm_uuid` is provided, we still compute a + # NOTE: Even if a mm_hash is provided, we still compute a # hash if `hf_processor_mm_kwargs` or `tokenization_kwargs` # are provided. This is because the processed multimodal # inputs can be different depending on the processor kwargs. - if mm_uuid is None or \ + if mm_hash is None or \ hf_processor_mm_kwargs or \ tokenization_kwargs: - # NOTE: use uuid to hash if available for better - # performance. - item = mm_uuid if mm_uuid is not None else item + # NOTE: use provided hash string to hash with kwargs + # if available for better performance. 
+ item = mm_hash if mm_hash is not None else item computed.append( MultiModalHasher.hash_kwargs( model_id=model_id, @@ -1401,7 +1403,7 @@ def _hash_mm_items( **hf_processor_mm_kwargs, **tokenization_kwargs)) else: - computed.append(mm_uuid) + computed.append(mm_hash) hashes[modality] = computed else: hashes[modality] = [ @@ -1508,7 +1510,8 @@ def _apply_hf_processor( hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Mapping[str, object], *, - mm_hash_overrides: Optional[MultiModalHashes] = None, + mm_hash_overrides: Optional[Union[dict[str, list[str]], + MultiModalUUIDs]] = None, ) -> tuple[list[int], MultiModalProcessingInfo, bool]: ( prompt_ids, @@ -1555,7 +1558,8 @@ def _cached_apply_hf_processor( hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Mapping[str, object], *, - mm_hash_overrides: Optional[MultiModalHashes] = None, + mm_hash_overrides: Optional[Union[dict[str, list[str]], + MultiModalUUIDs]] = None, ) -> tuple[list[int], MultiModalProcessingInfo, bool]: """ Apply the HF processor on the full prompt text, @@ -1777,7 +1781,8 @@ def apply( hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Optional[Mapping[str, object]] = None, *, - mm_hash_overrides: Optional[dict[str, list[str]]] = None, + mm_hash_overrides: Optional[Union[dict[str, list[str]], + MultiModalUUIDs]] = None, ) -> MultiModalInputs: """ Process multi-modal inputs to be used in vLLM. diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 595e6bf89169..1e70e7ca3cf5 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -332,20 +332,24 @@ def process_inputs( if arrival_time is None: arrival_time = time.time() - # Validate multimodal ids alignment with mm_data at request layer - self._validate_multi_modal_uuids(prompt) + # Optionally generate multimodal hash overrides to avoid hashing + # multimodal data items by their content as their identifiers. - # Optionally generate multimodal hash overrides based on request id. # NOTE: when users explicitly turn off BOTH prefix caching and input # processing caching, no multimodal features or embeddings will be - # reused across requests, therefore hashing is no longer necessary. + # reused across requests, therefore identifying multimodal data items + # by their content is no longer necessary, and we create uuids with + # request id-modality-index as multimodal hash overrides. if (self.model_config.multimodal_config and self.model_config.multimodal_config.mm_processor_cache_gb == 0 and not self.cache_config.enable_prefix_caching): mm_hash_overrides = self._maybe_build_mm_hash_overrides( request_id, prompt) else: - mm_hash_overrides = None + # Otherwise, use user-provided uuids as multimodal hash overrides + # if provided. + self._validate_multi_modal_uuids(prompt) + mm_hash_overrides = prompt.get("multi_modal_uuids") # Process inputs, which includes: # 1. Tokenize text prompt, with LoRA request if one exists. 
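The body of `_maybe_build_mm_hash_overrides` is not shown in this patch; per the comment in `process_inputs`, when both the processor cache and prefix caching are off it fabricates identifiers from the request id, modality, and index rather than hashing content. A hypothetical sketch of that scheme (shape and names assumed, not taken from the PR):

    def build_request_scoped_overrides(
        request_id: str,
        mm_data: dict[str, list[object]],
    ) -> dict[str, list[str]]:
        """Cheap per-request IDs; safe only because nothing is reused
        across requests when both caches are disabled."""
        return {
            modality:
            [f"{request_id}-{modality}-{i}" for i in range(len(items))]
            for modality, items in mm_data.items()
        }

    # build_request_scoped_overrides("req-42", {"image": [img0, img1]})
    # -> {"image": ["req-42-image-0", "req-42-image-1"]}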
From fb8aa346f9b610d8008a29dc1573b98cf30166de Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Wed, 27 Aug 2025 21:28:08 +0000 Subject: [PATCH 20/35] fix Signed-off-by: Roger Wang --- vllm/inputs/preprocess.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index c364e9ad6ba6..3bc4dfa1a998 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -398,7 +398,6 @@ async def _process_tokens_async( prompt_token_ids, multi_modal_data, parsed_content.get("mm_processor_kwargs"), - mm_uuids=parsed_content.get("multi_modal_uuids"), tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, mm_hash_overrides=mm_hash_overrides, From c87488955918f7a4f05d09f37b4366cd1c2bff35 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Wed, 27 Aug 2025 21:29:18 +0000 Subject: [PATCH 21/35] typing Signed-off-by: Roger Wang --- vllm/multimodal/processing.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 7ac221b97ed0..3728ad91700f 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -1022,7 +1022,8 @@ def __call__( mm_data: MultiModalDataDict, hf_processor_mm_kwargs: Mapping[str, object], *, - mm_hash_overrides: Optional[MultiModalHashes] = None, + mm_hash_overrides: Optional[Union[dict[str, list[str]], + MultiModalUUIDs]] = None, ) -> MultiModalInputs: return self.apply(prompt, mm_data, From 4811dc60e17549ec1d454fffcb60bd1a600f01a7 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Thu, 28 Aug 2025 07:44:37 +0000 Subject: [PATCH 22/35] update Signed-off-by: Roger Wang --- vllm/multimodal/processing.py | 4 +++- vllm/v1/engine/processor.py | 12 ++++++------ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 3728ad91700f..edc239002f7b 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -1380,7 +1380,9 @@ def _hash_mm_items( for modality, items in mm_items.items(): if modality in mm_hash_overrides: - mm_hash_list = mm_hash_overrides[modality] + if isinstance(mm_hash_list := mm_hash_overrides[modality], + str): + mm_hash_list = [mm_hash_list] # For None entries, compute a hash; otherwise, use provided ID. 
computed: list[str] = [] for i, item in enumerate(items): diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 1e70e7ca3cf5..e37b83661617 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -168,15 +168,15 @@ def _validate_single(single_prompt: Union[dict, str]) -> None: for modality, items in mm_data.items(): if modality in mm_uuids: - expected = len(items) if isinstance(items, list) else 1 - if isinstance(mm_uuids[modality], str): - mm_uuids[modality] = [mm_uuids[modality]] - if len(mm_uuids[modality]) != expected: + data_len = len(items) if isinstance(items, list) else 1 + uuid_len = len(mm_uuids[modality]) if isinstance( + mm_uuids[modality], list) else 1 + if uuid_len != data_len: raise ValueError( f"multi_modal_uuids for modality '{modality}' " "must have same length as data: got " - f"{len(mm_uuids[modality])} uuids vs " - f"{expected} items.") + f"{uuid_len} uuids vs " + f"{data_len} items.") else: raise ValueError( f"multi_modal_uuids for modality '{modality}' must " From b6be45a23642507360c80c6f1fe13743c043e6de Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Thu, 28 Aug 2025 07:57:32 +0000 Subject: [PATCH 23/35] update Signed-off-by: Roger Wang --- vllm/multimodal/processing.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index edc239002f7b..061e4f10862c 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -1380,13 +1380,14 @@ def _hash_mm_items( for modality, items in mm_items.items(): if modality in mm_hash_overrides: - if isinstance(mm_hash_list := mm_hash_overrides[modality], - str): - mm_hash_list = [mm_hash_list] + mm_hashes = mm_hash_overrides[modality] + if isinstance(mm_hashes, str): + mm_hashes = [mm_hashes] + # For None entries, compute a hash; otherwise, use provided ID. computed: list[str] = [] for i, item in enumerate(items): - mm_hash = mm_hash_list[i] + mm_hash = mm_hashes[i] # NOTE: Even if a mm_hash is provided, we still compute a # hash if `hf_processor_mm_kwargs` or `tokenization_kwargs` From 779d2b23d8d954cd8a4ca578c8414a0fef12fa5a Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Thu, 28 Aug 2025 08:09:48 +0000 Subject: [PATCH 24/35] use typealias Signed-off-by: Roger Wang --- vllm/multimodal/inputs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index c38ac5d94c35..678e7fbc8350 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -116,7 +116,7 @@ class MultiModalDataBuiltins(TypedDict, total=False): [`MultiModalDataBuiltins`][vllm.multimodal.inputs.MultiModalDataBuiltins]. """ -MultiModalUUIDs = dict[str, Union[list[Optional[str]], str]] +MultiModalUUIDs: TypeAlias = dict[str, Union[list[Optional[str]], str]] """ A dictionary containing user-provided UUIDs for items in each modality. 
If a UUID for an item is not provided, its entry will be `None` and From e5ca7364c56e8a62c24c6fc4fa0c1169100fc01e Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Thu, 28 Aug 2025 08:10:25 +0000 Subject: [PATCH 25/35] mapping Signed-off-by: Roger Wang --- vllm/multimodal/inputs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index 678e7fbc8350..a24ae683f10a 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -116,7 +116,7 @@ class MultiModalDataBuiltins(TypedDict, total=False): [`MultiModalDataBuiltins`][vllm.multimodal.inputs.MultiModalDataBuiltins]. """ -MultiModalUUIDs: TypeAlias = dict[str, Union[list[Optional[str]], str]] +MultiModalUUIDs: TypeAlias = Mapping[str, Union[list[Optional[str]], str]] """ A dictionary containing user-provided UUIDs for items in each modality. If a UUID for an item is not provided, its entry will be `None` and From d3b227e78bf45f04356985a79af84ecdf7d52a5b Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Thu, 28 Aug 2025 18:03:47 +0000 Subject: [PATCH 26/35] fix Signed-off-by: Roger Wang --- vllm/inputs/preprocess.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 3bc4dfa1a998..8d3186a35d87 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -364,7 +364,6 @@ def _process_tokens( prompt_token_ids, multi_modal_data, parsed_content.get("mm_processor_kwargs"), - mm_uuids=parsed_content.get("multi_modal_uuids"), tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, mm_hash_overrides=mm_hash_overrides, From 0848d8624fdb680316288843d178920c5931ffbf Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Thu, 28 Aug 2025 18:23:48 +0000 Subject: [PATCH 27/35] typing Signed-off-by: Roger Wang --- vllm/inputs/preprocess.py | 15 ++++++++++----- vllm/multimodal/processing.py | 3 ++- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 8d3186a35d87..20cb47b67d71 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -292,7 +292,8 @@ async def _process_multimodal_async( tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, *, - mm_hash_overrides: Optional[dict[str, list[str]]] = None, + mm_hash_overrides: Optional[Union[dict[str, list[str]], + MultiModalUUIDs]] = None, ) -> MultiModalInputs: """ Async version of @@ -353,7 +354,8 @@ def _process_tokens( tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, *, - mm_hash_overrides: Optional[dict[str, list[str]]] = None, + mm_hash_overrides: Optional[Union[dict[str, list[str]], + MultiModalUUIDs]] = None, ) -> Union[TokenInputs, MultiModalInputs]: prompt_token_ids = parsed_content["prompt_token_ids"] token_type_ids = parsed_content.get("token_type_ids") @@ -455,7 +457,8 @@ async def _process_text_async( tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, *, - mm_hash_overrides: Optional[dict[str, list[str]]] = None, + mm_hash_overrides: Optional[Union[dict[str, list[str]], + MultiModalUUIDs]] = None, ) -> Union[TokenInputs, MultiModalInputs]: prompt_text = parsed_content["prompt"] @@ -755,7 +758,8 @@ async def _process_encoder_decoder_prompt_async( prompt: PromptType, tokenization_kwargs: Optional[dict[str, Any]] = None, *, - mm_hash_overrides: Optional[dict[str, list[str]]] = None, + mm_hash_overrides: Optional[Union[dict[str, 
list[str]], + MultiModalUUIDs]] = None, ) -> EncoderDecoderInputs: """ Async version of @@ -854,7 +858,8 @@ async def _process_decoder_only_prompt_async( tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, *, - mm_hash_overrides: Optional[dict[str, list[str]]] = None, + mm_hash_overrides: Optional[Union[dict[str, list[str]], + MultiModalUUIDs]] = None, ) -> DecoderOnlyInputs: """ Async version of diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 061e4f10862c..c5e06add9ff8 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -1901,7 +1901,8 @@ def apply( hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Optional[Mapping[str, object]] = None, *, - mm_hash_overrides: Optional[MultiModalHashes] = None, + mm_hash_overrides: Optional[Union[dict[str, list[str]], + MultiModalUUIDs]] = None, ) -> MultiModalEncDecInputs: """ Process multi-modal inputs to be used in vLLM. From ce7c9b07b393056d80062a8f10f1940c4eab22d2 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Thu, 28 Aug 2025 18:32:02 +0000 Subject: [PATCH 28/35] typing Signed-off-by: Roger Wang --- vllm/inputs/preprocess.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 20cb47b67d71..6a5d3ae8c940 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -826,7 +826,8 @@ def _process_decoder_only_prompt( tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, *, - mm_hash_overrides: Optional[dict[str, list[str]]] = None, + mm_hash_overrides: Optional[Union[dict[str, list[str]], + MultiModalUUIDs]] = None, ) -> DecoderOnlyInputs: """ For decoder-only models: From 2f0a02c7df4ef8c119e777fb7099a21a440536e0 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Thu, 28 Aug 2025 21:52:55 +0000 Subject: [PATCH 29/35] add tests Signed-off-by: Roger Wang --- .../test_processor_multi_modal_uuids.py | 297 ++++++++++++++++++ 1 file changed, 297 insertions(+) create mode 100644 tests/v1/engine/test_processor_multi_modal_uuids.py diff --git a/tests/v1/engine/test_processor_multi_modal_uuids.py b/tests/v1/engine/test_processor_multi_modal_uuids.py new file mode 100644 index 000000000000..4e77e34fefd8 --- /dev/null +++ b/tests/v1/engine/test_processor_multi_modal_uuids.py @@ -0,0 +1,297 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import pytest + +from vllm.assets.image import ImageAsset +from vllm.assets.video import VideoAsset +from vllm.config import CacheConfig, DeviceConfig, ModelConfig, VllmConfig +from vllm.platforms.interface import UnspecifiedPlatform +from vllm.sampling_params import SamplingParams +from vllm.v1.engine import processor as processor_mod +from vllm.v1.engine.processor import Processor + +cherry_pil_image = ImageAsset("cherry_blossom").pil_image +stop_pil_image = ImageAsset("stop_sign").pil_image +baby_reading_np_ndarrays = VideoAsset("baby_reading").np_ndarrays + + +# Mock processor for testing +def _mk_processor(monkeypatch, + *, + mm_cache_gb: float = 4.0, + enable_prefix_caching: bool = True) -> Processor: + """ + Create a Processor instance with minimal configuration suitable for unit + tests without accessing external resources. 
+ """ + monkeypatch.setattr(ModelConfig, + "try_get_generation_config", + lambda self: {}, + raising=True) + monkeypatch.setattr(ModelConfig, + "__post_init__", + lambda self: None, + raising=True) + monkeypatch.setattr(UnspecifiedPlatform, + "is_async_output_supported", + classmethod(lambda cls, enforce_eager: True), + raising=True) + monkeypatch.setattr( + ModelConfig, + "verify_async_output_proc", + lambda self, parallel_config, speculative_config, device_config: None, + raising=True) + monkeypatch.setattr(ModelConfig, + "verify_with_parallel_config", + lambda self, parallel_config: None, + raising=True) + monkeypatch.setattr(processor_mod, + "processor_cache_from_config", + lambda vllm_config, mm_registry: None, + raising=True) + + monkeypatch.setattr(VllmConfig, + "__post_init__", + lambda self: None, + raising=True) + + model_config = ModelConfig( + skip_tokenizer_init=True, + max_model_len=128, + mm_processor_cache_gb=mm_cache_gb, + generation_config="vllm", + tokenizer="dummy", + ) + + # Minimal multimodal_config to satisfy references in + # Processor.process_inputs. + class _MockMMConfig: + + def __init__(self, gb: float): + self.mm_processor_cache_gb = gb + + model_config.multimodal_config = _MockMMConfig( + mm_cache_gb) # type: ignore[attr-defined] + vllm_config = VllmConfig( + model_config=model_config, + cache_config=CacheConfig(enable_prefix_caching=enable_prefix_caching), + device_config=DeviceConfig(device="cpu"), + ) + + # Pass tokenizer=None; InputPreprocessor handles None when + # skip_tokenizer_init is True. + return Processor(vllm_config, tokenizer=None) # type: ignore[arg-type] + + +def test_multi_modal_uuids_length_mismatch_raises(monkeypatch): + processor = _mk_processor(monkeypatch) + + prompt = { + "prompt": "USER: \nDescribe\nASSISTANT:", + "multi_modal_data": { + "image": [cherry_pil_image, stop_pil_image] + }, + # Mismatch: 2 items but only 1 uuid provided + "multi_modal_uuids": { + "image": ["hash_cherry"] + }, + } + + with pytest.raises(ValueError, match="must have same length as data"): + processor.process_inputs( + request_id="req-1", + prompt=prompt, # type: ignore[arg-type] + params=SamplingParams(), + ) + + +def test_multi_modal_uuids_missing_modality_raises(monkeypatch): + processor = _mk_processor(monkeypatch) + + prompt = { + "prompt": "USER: