diff --git a/tests/renderers/test_process_multi_modal_uuids.py b/tests/renderers/test_process_multi_modal_uuids.py
index 8d9fea28b73a..91e4377d5612 100644
--- a/tests/renderers/test_process_multi_modal_uuids.py
+++ b/tests/renderers/test_process_multi_modal_uuids.py
@@ -6,6 +6,7 @@
 from vllm.assets.image import ImageAsset
 from vllm.assets.video import VideoAsset
 from vllm.config import CacheConfig, ModelConfig, VllmConfig
+from vllm.multimodal.parse import parse_mm_uuids
 from vllm.renderers.hf import HfRenderer
 from vllm.tokenizers.registry import tokenizer_args_from_config
@@ -45,10 +46,11 @@ def test_multi_modal_uuids_length_mismatch_raises():
     mm_uuids = {"image": ["hash_cherry"]}

     mm_processor = renderer.get_mm_processor()
-    mm_items = mm_processor.info.parse_mm_data(mm_data)
+    mm_data_items = mm_processor.info.parse_mm_data(mm_data)
+    mm_uuid_items = parse_mm_uuids(mm_uuids)

     with pytest.raises(ValueError, match="must have same length as"):
-        renderer._process_mm_uuids(mm_data, mm_items, mm_uuids, "req-1")
+        renderer._process_mm_uuids(mm_data, mm_data_items, mm_uuid_items, "req-1")


 def test_multi_modal_uuids_missing_modality_raises():
@@ -63,10 +65,11 @@
     mm_uuids = {"image": ["hash_cherry"]}

     mm_processor = renderer.get_mm_processor()
-    mm_items = mm_processor.info.parse_mm_data(mm_data)
+    mm_data_items = mm_processor.info.parse_mm_data(mm_data)
+    mm_uuid_items = parse_mm_uuids(mm_uuids)

     with pytest.raises(ValueError, match="is empty but .* is missing"):
-        renderer._process_mm_uuids(mm_data, mm_items, mm_uuids, "req-2")
+        renderer._process_mm_uuids(mm_data, mm_data_items, mm_uuid_items, "req-2")


 @pytest.mark.parametrize(
@@ -78,7 +81,7 @@
     ],
 )
 def test_multi_modal_uuids_accepts_none_and_passes_through(
-    monkeypatch, mm_cache_gb: float, enable_prefix_caching: bool
+    mm_cache_gb: float, enable_prefix_caching: bool
 ):
     renderer = _build_renderer(
         mm_cache_gb=mm_cache_gb,
@@ -94,9 +97,11 @@ def test_multi_modal_uuids_accepts_none_and_passes_through(
     mm_uuids = {"image": [None, "hash_stop"], "video": None}

     mm_processor = renderer.get_mm_processor()
-    mm_items = mm_processor.info.parse_mm_data(mm_data)
+    mm_data_items = mm_processor.info.parse_mm_data(mm_data)
+    mm_uuid_items = parse_mm_uuids(mm_uuids)
+
     processed_mm_uuids = renderer._process_mm_uuids(
-        mm_data, mm_items, mm_uuids, "req-3"
+        mm_data, mm_data_items, mm_uuid_items, "req-3"
     )

     assert processed_mm_uuids == mm_uuids
@@ -111,7 +116,7 @@
     ],
 )
 def test_multi_modal_uuids_accepts_empty(
-    monkeypatch, mm_cache_gb: float, enable_prefix_caching: bool
+    mm_cache_gb: float, enable_prefix_caching: bool
 ):
     renderer = _build_renderer(
         mm_cache_gb=mm_cache_gb,
@@ -124,15 +129,17 @@ def test_multi_modal_uuids_accepts_empty(
     mm_uuids = {"image": [], "video": None}  # type: ignore[var-annotated]

     mm_processor = renderer.get_mm_processor()
-    mm_items = mm_processor.info.parse_mm_data(mm_data)
+    mm_data_items = mm_processor.info.parse_mm_data(mm_data)
+    mm_uuid_items = parse_mm_uuids(mm_uuids)
+
     processed_mm_uuids = renderer._process_mm_uuids(
-        mm_data, mm_items, mm_uuids, "req-4"
+        mm_data, mm_data_items, mm_uuid_items, "req-4"
     )

     assert processed_mm_uuids == mm_uuids


-def test_multi_modal_uuids_ignored_when_caching_disabled(monkeypatch):
+def test_multi_modal_uuids_ignored_when_caching_disabled():
     # When both processor cache is 0 and prefix caching disabled, the
     # processor builds overrides from request id instead of using user UUIDs.
     renderer = _build_renderer(mm_cache_gb=0.0, enable_prefix_caching=False)
@@ -145,9 +152,11 @@ def test_multi_modal_uuids_ignored_when_caching_disabled(monkeypatch):
     mm_uuids = {"image": ["hash_cherry", "hash_stop"], "video": ["hash_video"]}

     mm_processor = renderer.get_mm_processor()
-    mm_items = mm_processor.info.parse_mm_data(mm_data)
+    mm_data_items = mm_processor.info.parse_mm_data(mm_data)
+    mm_uuid_items = parse_mm_uuids(mm_uuids)
+
     processed_mm_uuids = renderer._process_mm_uuids(
-        mm_data, mm_items, mm_uuids, request_id
+        mm_data, mm_data_items, mm_uuid_items, request_id
     )

     # Expect request-id-based overrides are passed through
diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py
index 29e877a05e8c..b67493932639 100644
--- a/vllm/inputs/preprocess.py
+++ b/vllm/inputs/preprocess.py
@@ -91,7 +91,7 @@ def _process_multimodal(
         self,
         prompt: str | list[int],
         mm_data: MultiModalDataDict,
-        mm_processor_kwargs: Mapping[str, object] | None,
+        mm_processor_kwargs: Mapping[str, object] | None = None,
         tokenization_kwargs: dict[str, Any] | None = None,
         *,
         mm_uuids: MultiModalUUIDDict | None = None,
@@ -103,9 +103,9 @@ def _process_multimodal(
         return self.renderer._process_multimodal(
             prompt,
             mm_data,
+            mm_uuids=mm_uuids,
             mm_processor_kwargs=mm_processor_kwargs,
             tokenization_kwargs=tokenization_kwargs,
-            mm_uuids=mm_uuids,
         )

     def _process_embeds(
@@ -144,7 +144,7 @@ def _process_tokens(
         inputs = self._process_multimodal(
             prompt_token_ids,
             multi_modal_data,
-            parsed_content.get("mm_processor_kwargs") or {},
+            parsed_content.get("mm_processor_kwargs"),
             tokenization_kwargs=tokenization_kwargs,
             mm_uuids=parsed_content.get("multi_modal_uuids"),
         )
diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py
index 37888086b683..556c68fc17f5 100644
--- a/vllm/model_executor/models/clip.py
+++ b/vllm/model_executor/models/clip.py
@@ -36,9 +36,13 @@
     MultiModalFieldConfig,
     MultiModalInputs,
     MultiModalKwargsItems,
-    MultiModalUUIDDict,
 )
-from vllm.multimodal.parse import ImageProcessorItems, ImageSize, MultiModalDataItems
+from vllm.multimodal.parse import (
+    ImageProcessorItems,
+    ImageSize,
+    MultiModalDataItems,
+    MultiModalUUIDItems,
+)
 from vllm.multimodal.processing import (
     BaseDummyInputsBuilder,
     BaseMultiModalProcessor,
@@ -203,10 +207,9 @@ def apply(
         self,
         prompt: str | list[int],
         mm_items: MultiModalDataItems,
-        hf_processor_mm_kwargs: Mapping[str, object],
+        mm_uuid_items: MultiModalUUIDItems | None = None,
+        hf_processor_mm_kwargs: Mapping[str, object] | None = None,
         tokenization_kwargs: Mapping[str, object] | None = None,
-        *,
-        mm_uuids: MultiModalUUIDDict | None = None,
     ) -> MultiModalInputs:
         if mm_items:
             if isinstance(prompt, str):
@@ -235,9 +238,9 @@ def apply(
         return super().apply(
             prompt=prompt,
             mm_items=mm_items,
+            mm_uuid_items=mm_uuid_items,
             hf_processor_mm_kwargs=hf_processor_mm_kwargs,
             tokenization_kwargs=tokenization_kwargs,
-            mm_uuids=mm_uuids,
         )

     def _hf_processor_applies_updates(
diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py
index 83ab54f604a1..e0de49fb6eae 100644
--- a/vllm/model_executor/models/deepseek_vl2.py
+++ b/vllm/model_executor/models/deepseek_vl2.py
@@ -24,13 +24,13 @@
     MultiModalDataDict,
     MultiModalFieldConfig,
     MultiModalKwargsItems,
-    MultiModalUUIDDict,
 )
 from vllm.multimodal.parse import (
     ImageEmbeddingItems,
     ImageProcessorItems,
     ImageSize,
     MultiModalDataItems,
+    MultiModalUUIDItems,
 )
 from vllm.multimodal.processing import BaseDummyInputsBuilder
 from vllm.multimodal.processing.processor import (
@@ -313,9 +313,9 @@ def _cached_apply_hf_processor(
         self,
         prompt: str | list[int],
         mm_data_items: MultiModalDataItems,
+        mm_uuid_items: MultiModalUUIDItems | None,
         hf_processor_mm_kwargs: Mapping[str, object],
         tokenization_kwargs: Mapping[str, object],
-        mm_uuids: MultiModalUUIDDict | None = None,
     ) -> tuple[list[int], MultiModalProcessingInfo, bool]:
         # The processor logic is different for len(images) <= 2 vs > 2
         # Since the processing cache assumes that the processor output is
@@ -325,17 +325,17 @@ def _cached_apply_hf_processor(
             return self._apply_hf_processor(
                 prompt=prompt,
                 mm_data_items=mm_data_items,
+                mm_uuid_items=mm_uuid_items,
                 hf_processor_mm_kwargs=hf_processor_mm_kwargs,
                 tokenization_kwargs=tokenization_kwargs,
-                mm_uuids=mm_uuids,
             )

         return super()._cached_apply_hf_processor(
             prompt=prompt,
             mm_data_items=mm_data_items,
+            mm_uuid_items=mm_uuid_items,
             hf_processor_mm_kwargs=hf_processor_mm_kwargs,
             tokenization_kwargs=tokenization_kwargs,
-            mm_uuids=mm_uuids,
         )
diff --git a/vllm/model_executor/models/h2ovl.py b/vllm/model_executor/models/h2ovl.py
index ea25f884fc17..a4b87631fe2f 100644
--- a/vllm/model_executor/models/h2ovl.py
+++ b/vllm/model_executor/models/h2ovl.py
@@ -16,11 +16,12 @@
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import MultiModalKwargsItems, MultiModalUUIDDict
+from vllm.multimodal.inputs import MultiModalKwargsItems
 from vllm.multimodal.parse import (
     ImageEmbeddingItems,
     ImageProcessorItems,
     MultiModalDataItems,
+    MultiModalUUIDItems,
 )
 from vllm.multimodal.processing.processor import (
     MultiModalProcessingInfo,
@@ -491,9 +492,9 @@ def _cached_apply_hf_processor(
         self,
         prompt: str | list[int],
         mm_data_items: MultiModalDataItems,
+        mm_uuid_items: MultiModalUUIDItems | None,
         hf_processor_mm_kwargs: Mapping[str, object],
         tokenization_kwargs: Mapping[str, object],
-        mm_uuids: MultiModalUUIDDict | None = None,
     ) -> tuple[list[int], MultiModalProcessingInfo, bool]:
         # The processor logic is different for len(images) <= 1 vs > 1
         # Since the processing cache assumes that the processor output is
@@ -503,17 +504,17 @@ def _cached_apply_hf_processor(
             return self._apply_hf_processor(
                 prompt=prompt,
                 mm_data_items=mm_data_items,
+                mm_uuid_items=mm_uuid_items,
                 hf_processor_mm_kwargs=hf_processor_mm_kwargs,
                 tokenization_kwargs=tokenization_kwargs,
-                mm_uuids=mm_uuids,
             )

         return super()._cached_apply_hf_processor(
             prompt=prompt,
             mm_data_items=mm_data_items,
+            mm_uuid_items=mm_uuid_items,
             hf_processor_mm_kwargs=hf_processor_mm_kwargs,
             tokenization_kwargs=tokenization_kwargs,
-            mm_uuids=mm_uuids,
         )
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index 07e8dac85475..c8ca1815d7b1 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -30,7 +30,6 @@
     MultiModalFieldConfig,
     MultiModalInputs,
     MultiModalKwargsItems,
-    MultiModalUUIDDict,
     mm_inputs,
 )
 from vllm.multimodal.parse import (
@@ -38,6 +37,7 @@
     ImageProcessorItems,
     ImageSize,
     MultiModalDataItems,
+    MultiModalUUIDItems,
 )
 from vllm.multimodal.processing import (
     BaseDummyInputsBuilder,
@@ -773,9 +773,9 @@ def apply(
         self,
         prompt: str | list[int],
         mm_items: MultiModalDataItems,
-        hf_processor_mm_kwargs: Mapping[str, object],
+        mm_uuid_items: MultiModalUUIDItems | None = None,
+        hf_processor_mm_kwargs: Mapping[str, object] | None = None,
         tokenization_kwargs: Mapping[str, object] | None = None,
-        mm_uuids: MultiModalUUIDDict | None = None,
     ) -> MultiModalInputs:
         hf_config = self.info.get_hf_config()
         image_token_id = hf_config.image_token_index
@@ -789,9 +789,9 @@ def apply(
         result = super().apply(
             prompt,
             mm_items,
-            hf_processor_mm_kwargs,
-            tokenization_kwargs,
-            mm_uuids=mm_uuids,
+            mm_uuid_items,
+            hf_processor_mm_kwargs=hf_processor_mm_kwargs,
+            tokenization_kwargs=tokenization_kwargs,
         )

         mm_item_counts = mm_items.get_all_counts()
diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py
index 0453f6852853..37beaffef624 100644
--- a/vllm/model_executor/models/paligemma.py
+++ b/vllm/model_executor/models/paligemma.py
@@ -16,12 +16,12 @@
     MultiModalFieldConfig,
     MultiModalInputs,
     MultiModalKwargsItems,
-    MultiModalUUIDDict,
 )
 from vllm.multimodal.parse import (
     ImageEmbeddingItems,
     ImageProcessorItems,
     MultiModalDataItems,
+    MultiModalUUIDItems,
 )
 from vllm.multimodal.processing import (
     BaseDummyInputsBuilder,
@@ -231,16 +231,16 @@ def apply(
         self,
         prompt: str | list[int],
         mm_items: MultiModalDataItems,
-        hf_processor_mm_kwargs: Mapping[str, object],
+        mm_uuid_items: MultiModalUUIDItems | None = None,
+        hf_processor_mm_kwargs: Mapping[str, object] | None = None,
         tokenization_kwargs: Mapping[str, object] | None = None,
-        mm_uuids: MultiModalUUIDDict | None = None,
     ) -> MultiModalInputs:
         mm_inputs = super().apply(
             prompt,
             mm_items,
-            hf_processor_mm_kwargs,
-            tokenization_kwargs,
-            mm_uuids=mm_uuids,
+            mm_uuid_items,
+            hf_processor_mm_kwargs=hf_processor_mm_kwargs,
+            tokenization_kwargs=tokenization_kwargs,
         )

         prompt_token_ids = mm_inputs["prompt_token_ids"]
diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py
index 407cf3ff5550..0cfa8b6a3a84 100644
--- a/vllm/model_executor/models/pixtral.py
+++ b/vllm/model_executor/models/pixtral.py
@@ -44,10 +44,14 @@
 from vllm.multimodal.inputs import (
     MultiModalDataDict,
     MultiModalFieldConfig,
-    MultiModalUUIDDict,
     NestedTensors,
 )
-from vllm.multimodal.parse import ImageProcessorItems, ImageSize, MultiModalDataItems
+from vllm.multimodal.parse import (
+    ImageProcessorItems,
+    ImageSize,
+    MultiModalDataItems,
+    MultiModalUUIDItems,
+)
 from vllm.multimodal.processing import BaseDummyInputsBuilder, ProcessorInputs
 from vllm.multimodal.processing.processor import (
     BaseMultiModalProcessor,
@@ -344,16 +348,16 @@ def _cached_apply_hf_processor(
         self,
         prompt: str | list[int],
         mm_data_items: MultiModalDataItems,
+        mm_uuid_items: MultiModalUUIDItems | None,
         hf_processor_mm_kwargs: Mapping[str, object],
         tokenization_kwargs: Mapping[str, object],
-        mm_uuids: MultiModalUUIDDict | None = None,
     ) -> tuple[list[int], MultiModalProcessingInfo, bool]:
         prompt_ids, mm_info, _ = super()._cached_apply_hf_processor(
             prompt=prompt,
             mm_data_items=mm_data_items,
+            mm_uuid_items=mm_uuid_items,
             hf_processor_mm_kwargs=hf_processor_mm_kwargs,
             tokenization_kwargs=tokenization_kwargs,
-            mm_uuids=mm_uuids,
         )

         # NOTE: The tokens are already inserted by the chat template
diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py
index a447d376b220..8e07a90e893e 100644
--- a/vllm/model_executor/models/siglip.py
+++ b/vllm/model_executor/models/siglip.py
@@ -42,9 +42,13 @@
     MultiModalFieldConfig,
     MultiModalInputs,
     MultiModalKwargsItems,
-    MultiModalUUIDDict,
 )
-from vllm.multimodal.parse import ImageProcessorItems, ImageSize, MultiModalDataItems
+from vllm.multimodal.parse import (
+    ImageProcessorItems,
+    ImageSize,
+    MultiModalDataItems,
+    MultiModalUUIDItems,
+)
 from vllm.multimodal.processing import (
     BaseDummyInputsBuilder,
     BaseMultiModalProcessor,
@@ -189,10 +193,9 @@ def apply(
         self,
         prompt: str | list[int],
         mm_items: MultiModalDataItems,
-        hf_processor_mm_kwargs: Mapping[str, object],
+        mm_uuid_items: MultiModalUUIDItems | None = None,
+        hf_processor_mm_kwargs: Mapping[str, object] | None = None,
         tokenization_kwargs: Mapping[str, object] | None = None,
-        *,
-        mm_uuids: MultiModalUUIDDict | None = None,
     ) -> MultiModalInputs:
         if mm_items:
             if isinstance(prompt, str):
@@ -221,9 +224,9 @@ def apply(
         return super().apply(
             prompt=prompt,
             mm_items=mm_items,
+            mm_uuid_items=mm_uuid_items,
             hf_processor_mm_kwargs=hf_processor_mm_kwargs,
             tokenization_kwargs=tokenization_kwargs,
-            mm_uuids=mm_uuids,
         )

     def _hf_processor_applies_updates(
diff --git a/vllm/model_executor/models/terratorch.py b/vllm/model_executor/models/terratorch.py
index 0dc778a097b8..1cf65abd649e 100644
--- a/vllm/model_executor/models/terratorch.py
+++ b/vllm/model_executor/models/terratorch.py
@@ -46,7 +46,6 @@
     MultiModalFieldConfig,
     MultiModalInputs,
     MultiModalKwargsItems,
-    MultiModalUUIDDict,
     PlaceholderRange,
     mm_inputs,
 )
@@ -55,6 +54,7 @@
     ModalityDataItems,
     MultiModalDataItems,
     MultiModalDataParser,
+    MultiModalUUIDItems,
 )
 from vllm.multimodal.processing import (
     BaseDummyInputsBuilder,
@@ -196,15 +196,19 @@ def apply(
         self,
         prompt: str | list[int],
         mm_items: MultiModalDataItems,
-        hf_processor_mm_kwargs: Mapping[str, object],
+        mm_uuid_items: MultiModalUUIDItems | None = None,
+        hf_processor_mm_kwargs: Mapping[str, object] | None = None,
         tokenization_kwargs: Mapping[str, object] | None = None,
-        mm_uuids: MultiModalUUIDDict | None = None,
     ) -> MultiModalInputs:
+        if hf_processor_mm_kwargs is None:
+            hf_processor_mm_kwargs = {}
         if tokenization_kwargs is None:
             tokenization_kwargs = {}

         mm_hashes = self._hash_mm_items(
-            mm_items, hf_processor_mm_kwargs, tokenization_kwargs, mm_uuids=mm_uuids
+            mm_items,
+            mm_uuid_items,
+            hf_processor_mm_kwargs=hf_processor_mm_kwargs,
         )

         _, passthrough_data = self._get_hf_mm_data(mm_items)
diff --git a/vllm/model_executor/models/transformers/multimodal.py b/vllm/model_executor/models/transformers/multimodal.py
index 6fb5827a873f..3b1eb7db8cca 100644
--- a/vllm/model_executor/models/transformers/multimodal.py
+++ b/vllm/model_executor/models/transformers/multimodal.py
@@ -31,11 +31,14 @@
     MultiModalFeatureSpec,
     MultiModalFieldConfig,
     MultiModalInputs,
-    MultiModalUUIDDict,
     PlaceholderRange,
     mm_inputs,
 )
-from vllm.multimodal.parse import ImageProcessorItems, MultiModalDataItems
+from vllm.multimodal.parse import (
+    ImageProcessorItems,
+    MultiModalDataItems,
+    MultiModalUUIDItems,
+)
 from vllm.multimodal.processing import (
     BaseDummyInputsBuilder,
     BaseMultiModalProcessor,
@@ -177,9 +180,9 @@ def apply(
         self,
         prompt: str | list[int],
         mm_items: MultiModalDataItems,
-        hf_processor_mm_kwargs: Mapping[str, object],
+        mm_uuid_items: MultiModalUUIDItems | None = None,
+        hf_processor_mm_kwargs: Mapping[str, object] | None = None,
         tokenization_kwargs: Mapping[str, object] | None = None,
-        mm_uuids: MultiModalUUIDDict | None = None,
     ) -> MultiModalInputs:
         """
         Process multi-modal inputs to be used in vLLM.
@@ -187,6 +190,8 @@ def apply(
         Apply HF Processor on prompt text and multi-modal data together,
         outputting token IDs and processed tensors.
         """
+        if hf_processor_mm_kwargs is None:
+            hf_processor_mm_kwargs = {}
         if tokenization_kwargs is None:
             tokenization_kwargs = {}
@@ -258,7 +263,9 @@ def apply(

         # Use overrides if provided; fallback to data-dependent hashing.
         mm_hashes = self._hash_mm_items(
-            mm_items, hf_processor_mm_kwargs, tokenization_kwargs, mm_uuids=mm_uuids
+            mm_items,
+            mm_uuid_items,
+            hf_processor_mm_kwargs=hf_processor_mm_kwargs,
         )

         return mm_inputs(
diff --git a/vllm/model_executor/models/voxtral.py b/vllm/model_executor/models/voxtral.py
index 6c1055b19dd5..a4dcc1b413c1 100644
--- a/vllm/model_executor/models/voxtral.py
+++ b/vllm/model_executor/models/voxtral.py
@@ -41,13 +41,13 @@
     MultiModalDataDict,
     MultiModalFieldConfig,
     MultiModalKwargsItems,
-    MultiModalUUIDDict,
     NestedTensors,
 )
 from vllm.multimodal.parse import (
     AudioProcessorItems,
     MultiModalDataItems,
     MultiModalDataParser,
+    MultiModalUUIDItems,
 )
 from vllm.multimodal.processing import BaseDummyInputsBuilder, ProcessorInputs
 from vllm.multimodal.processing.processor import (
@@ -363,16 +363,16 @@ def _cached_apply_hf_processor(
         self,
         prompt: str | list[int],
         mm_data_items: MultiModalDataItems,
+        mm_uuid_items: MultiModalUUIDItems | None,
         hf_processor_mm_kwargs: Mapping[str, object],
         tokenization_kwargs: Mapping[str, object],
-        mm_uuids: MultiModalUUIDDict | None = None,
     ) -> tuple[list[int], MultiModalProcessingInfo, bool]:
         prompt_ids, mm_info, _ = super()._cached_apply_hf_processor(
             prompt=prompt,
             mm_data_items=mm_data_items,
+            mm_uuid_items=mm_uuid_items,
             hf_processor_mm_kwargs=hf_processor_mm_kwargs,
             tokenization_kwargs=tokenization_kwargs,
-            mm_uuids=mm_uuids,
         )

         # NOTE: The tokens are already inserted by the chat template
diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py
index be9f7e652282..1e25142f3c2c 100644
--- a/vllm/multimodal/inputs.py
+++ b/vllm/multimodal/inputs.py
@@ -155,7 +155,7 @@ class MultiModalDataBuiltins(TypedDict, total=False):
 [`MultiModalDataBuiltins`][vllm.multimodal.inputs.MultiModalDataBuiltins].
 """

-MultiModalUUIDDict: TypeAlias = Mapping[str, list[str | None] | str]
+MultiModalUUIDDict: TypeAlias = Mapping[str, Sequence[str | None] | str]
 """
 A dictionary containing user-provided UUIDs for items in each modality.
 If a UUID for an item is not provided, its entry will be `None` and
diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py
index 0462ab5dea93..6a588dad0207 100644
--- a/vllm/multimodal/parse.py
+++ b/vllm/multimodal/parse.py
@@ -3,7 +3,7 @@

 from abc import ABC, abstractmethod
 from collections import UserDict
-from collections.abc import Callable, Iterator, Mapping, Sequence
+from collections.abc import Callable, Iterator, Mapping, Sequence, Set
 from typing import (
     TYPE_CHECKING,
     Any,
@@ -33,6 +33,7 @@
     MultiModalDataDict,
     MultiModalFieldConfig,
     MultiModalKwargsItems,
+    MultiModalUUIDDict,
     VideoItem,
 )
 from .media import MediaWithBytes
@@ -297,14 +298,15 @@ def get_passthrough_data(self) -> Mapping[str, object]:
         return self.data


-class AudioProcessorItems(ProcessorBatchItems[HfAudioItem]):
-    def __init__(self, data: Sequence[HfAudioItem] | None) -> None:
-        if data is None:
-            data = [None]
+class AudioProcessorItems(ProcessorBatchItems[HfAudioItem | None]):
+    def __init__(self, data: Sequence[HfAudioItem | None]) -> None:
         super().__init__(data, "audio")

     def get_audio_length(self, item_idx: int) -> int:
         audio = self.get(item_idx)
+        if audio is None:
+            raise ValueError(f"Cannot get length of cached audio at {item_idx}")
+
         return len(audio)
@@ -322,14 +324,14 @@ class ImageSize(NamedTuple):
     height: int


-class ImageProcessorItems(ProcessorBatchItems[HfImageItem]):
-    def __init__(self, data: Sequence[HfImageItem] | None) -> None:
-        if data is None:
-            data = [None]
+class ImageProcessorItems(ProcessorBatchItems[HfImageItem | None]):
+    def __init__(self, data: Sequence[HfImageItem | None]) -> None:
         super().__init__(data, "image")

     def get_image_size(self, item_idx: int) -> ImageSize:
         image = self.get(item_idx)
+        if image is None:
+            raise ValueError(f"Cannot get size of cached image at {item_idx}")

         if isinstance(image, PILImage.Image):
             return ImageSize(*image.size)
@@ -349,22 +351,31 @@ def __init__(
         super().__init__(data, "image", expected_hidden_size)


-class VideoProcessorItems(ProcessorBatchItems[HfVideoItem]):
+class VideoProcessorItems(ProcessorBatchItems[HfVideoItem | None]):
     def __init__(
         self,
-        data: Sequence[HfVideoItem] | None,
+        data: Sequence[HfVideoItem | None],
         metadata: dict[str, Any] | list[dict[str, Any] | None] | None = None,
     ) -> None:
-        if data is None:
-            data = [None]
         super().__init__(data, "video")
+
         self.metadata = metadata

     def get_num_frames(self, item_idx: int) -> int:
-        return len(self.get(item_idx))
+        video = self.get(item_idx)
+        if video is None:
+            raise ValueError(f"Cannot get length of cached video at {item_idx}")
+
+        return len(video)

     def get_frame_size(self, item_idx: int) -> ImageSize:
-        image = self.get(item_idx)[0]  # Assume that the video isn't empty
+        video = self.get(item_idx)
+        if video is None:
+            raise ValueError(f"Cannot get size of cached video at {item_idx}")
+        if len(video) == 0:
+            raise ValueError(f"Cannot get size of empty video at {item_idx}")
+
+        image = video[0]

         if isinstance(image, PILImage.Image):
             return ImageSize(*image.size)
@@ -400,6 +411,15 @@ class MultiModalDataItems(UserDict[str, ModalityDataItems[Any, Any]]):
     normalized such that each entry corresponds to a list.
     """

+    def select(self, modalities: Set[str]):
+        """
+        Construct a new `MultiModalDataItems` instance containing only the
+        selected modalities.
+        """
+        return MultiModalDataItems(
+            {modality: self[modality] for modality in modalities}
+        )
+
     def get_count(self, modality: str, *, strict: bool = True) -> int:
         """
         Get the number of data items belonging to a modality.
@@ -497,19 +517,11 @@ def is_embeddings(
     ) -> TypeGuard[torch.Tensor | list[torch.Tensor]]:
         if isinstance(data, torch.Tensor):
             return data.ndim == 3
-        if is_list_of(data, torch.Tensor):
+        if is_list_of(data, torch.Tensor) and len(data) > 0:
             return data[0].ndim == 2  # type: ignore[index]

         return False

-    def _is_empty(self, data: object) -> TypeGuard[None]:
-        if isinstance(data, list):
-            return len(data) == 0
-        if isinstance(data, (np.ndarray, torch.Tensor)):
-            return data.size == 0
-
-        return False
-
     def _get_audio_with_sr(
         self,
         audio: AudioItem,
@@ -545,12 +557,6 @@ def _parse_audio_data(
         data: ModalityData[AudioItem],
     ) -> ModalityDataItems[Any, Any] | None:
         if data is None:
-            return AudioProcessorItems(None)
-
-        # also check single audio item with sampling rate
-        if self._is_empty(data) or (
-            isinstance(data, tuple) and self._is_empty(data[0])
-        ):
             return None

         if self.is_embeddings(data):
@@ -558,9 +564,8 @@

         data_items: list[AudioItem]
         if (
-            is_list_of(data, float)
-            or isinstance(data, (np.ndarray, torch.Tensor))
-            and data.ndim == 1
+            (is_list_of(data, float) and len(data) > 0)
+            or (isinstance(data, (np.ndarray, torch.Tensor)) and data.ndim == 1)
             or isinstance(data, tuple)
         ):
             data_items = [data]
@@ -591,18 +596,13 @@ def _parse_image_data(
         data: ModalityData[ImageItem],
     ) -> ModalityDataItems[Any, Any] | None:
         if data is None:
-            return ImageProcessorItems(None)
-
-        if self._is_empty(data):
             return None

         if self.is_embeddings(data):
             return ImageEmbeddingItems(data, self.expected_hidden_size)

-        if (
-            isinstance(data, (PILImage.Image, MediaWithBytes))
-            or isinstance(data, (np.ndarray, torch.Tensor))
-            and data.ndim == 3
+        if isinstance(data, (PILImage.Image, MediaWithBytes)) or (
+            isinstance(data, (np.ndarray, torch.Tensor)) and data.ndim == 3
         ):
             data_items = [data]
         elif isinstance(data, (np.ndarray, torch.Tensor)):
@@ -617,19 +617,14 @@ def _parse_video_data(
         data: ModalityData[VideoItem],
     ) -> ModalityDataItems[Any, Any] | None:
         if data is None:
-            return VideoProcessorItems(None)
-
-        if self._is_empty(data):
             return None

         if self.is_embeddings(data):
             return VideoEmbeddingItems(data, self.expected_hidden_size)

         data_items: list[VideoItem]
-        if (
-            is_list_of(data, PILImage.Image)
-            or isinstance(data, (np.ndarray, torch.Tensor))
-            and data.ndim == 4
+        if (is_list_of(data, PILImage.Image) and len(data) > 0) or (
+            isinstance(data, (np.ndarray, torch.Tensor)) and data.ndim == 4
         ):
             data_items = [data]
         elif isinstance(data, (np.ndarray, torch.Tensor)):
@@ -664,12 +659,15 @@ def _parse_vision_chunk_data(
         data: ModalityData[Any],
     ) -> ModalityDataItems[Any, Any] | None:
         """Parse vision chunk data (unified image and video chunks)."""
-        if data is None or self._is_empty(data):
+        if data is None:
             return None
+
         if self.is_embeddings(data):
             raise ValueError("Do not support embedding data for vision_chunk right now")
+
         if isinstance(data, dict):
             data = [data]
+
         return VisionChunkProcessorItems(data)

     def _get_subparsers(self) -> Mapping[str, ModalityDataParser]:
@@ -693,3 +691,20 @@ def parse_mm_data(self, mm_data: MultiModalDataDict) -> MultiModalDataItems:
             mm_items[k] = parsed_data

         return mm_items
+
+
+MultiModalUUIDItems: TypeAlias = dict[str, Sequence[str | None]]
+"""
+As [`MultiModalUUIDDict`][vllm.multimodal.inputs.MultiModalUUIDDict], but
+normalized such that each entry corresponds to a list.
+"""
+
+
+def parse_mm_uuids(mm_uuids: MultiModalUUIDDict | None) -> MultiModalUUIDItems:
+    if mm_uuids is None:
+        return {}
+
+    return {
+        modality: [uuids] if isinstance(uuids, str) else uuids
+        for modality, uuids in mm_uuids.items()
+    }
+""" + + +def parse_mm_uuids(mm_uuids: MultiModalUUIDDict | None) -> MultiModalUUIDItems: + if mm_uuids is None: + return {} + + return { + modality: [uuids] if isinstance(uuids, str) else uuids + for modality, uuids in mm_uuids.items() + } diff --git a/vllm/multimodal/processing/processor.py b/vllm/multimodal/processing/processor.py index 713717881a16..d1b1df627383 100644 --- a/vllm/multimodal/processing/processor.py +++ b/vllm/multimodal/processing/processor.py @@ -32,7 +32,6 @@ MultiModalKwargsItem, MultiModalKwargsItems, MultiModalKwargsOptionalItems, - MultiModalUUIDDict, PlaceholderRange, mm_enc_dec_inputs, mm_inputs, @@ -41,6 +40,7 @@ DictEmbeddingItems, EmbeddingItems, MultiModalDataItems, + MultiModalUUIDItems, ) from .context import ( BaseProcessingInfo, @@ -1014,11 +1014,15 @@ def __call__( self, prompt: str, mm_items: MultiModalDataItems, - hf_processor_mm_kwargs: Mapping[str, object], - *, - mm_uuids: MultiModalUUIDDict | None = None, + mm_uuid_items: MultiModalUUIDItems | None = None, + hf_processor_mm_kwargs: Mapping[str, object] | None = None, ) -> MultiModalInputs: - return self.apply(prompt, mm_items, hf_processor_mm_kwargs, mm_uuids=mm_uuids) + return self.apply( + prompt, + mm_items, + mm_uuid_items, + hf_processor_mm_kwargs=hf_processor_mm_kwargs, + ) @abstractmethod def _get_mm_fields_config( @@ -1174,7 +1178,10 @@ def _apply_hf_processor_text_mm( In addition, return whether prompt updates have been applied. """ - processor_data, passthrough_data = self._get_hf_mm_data(mm_items) + valid_mm_items = mm_items.select( + {k for k, c in mm_items.get_all_counts().items() if c > 0} + ) + processor_data, passthrough_data = self._get_hf_mm_data(valid_mm_items) processed_data = self._call_hf_processor( prompt=prompt_text, @@ -1301,69 +1308,57 @@ def _apply_hf_processor_main( def _hash_mm_items( self, - mm_items: MultiModalDataItems, + mm_data_items: MultiModalDataItems, + mm_uuid_items: MultiModalUUIDItems | None, hf_processor_mm_kwargs: Mapping[str, object], - tokenization_kwargs: Mapping[str, object], - *, - mm_uuids: MultiModalUUIDDict | None = None, ) -> MultiModalHashes: - """Create MM hashes to be returned. - - - Note: When overrides are provided via callers of `apply`, - `_hash_mm_items` will be bypassed and the overrides will be used. - """ model_id = self.info.model_id - hashes: MultiModalHashes = {} - mm_uuids = mm_uuids or {} + if mm_uuid_items is None: + mm_uuid_items = {} - for modality, items in mm_items.items(): - if modality in mm_uuids: - mm_uuids_per_modality = mm_uuids[modality] - if isinstance(mm_uuids_per_modality, str): - mm_uuids_per_modality = [mm_uuids_per_modality] + mm_hashes: MultiModalHashes = {} + hasher = MultiModalHasher + + for modality, data_items in mm_data_items.items(): + if modality in mm_uuid_items: + uuid_items = mm_uuid_items[modality] # For None entries, compute a hash; otherwise, use provided ID. - computed: list[str] = [] - for i, item in enumerate(items.get_all_items_for_hash()): - item_uuid = mm_uuids_per_modality[i] - - # NOTE: Even if a item_uuid is provided, we still compute a - # hash if `hf_processor_mm_kwargs` or `tokenization_kwargs` - # are provided. This is because the processed multimodal - # inputs can be different depending on the processor kwargs. 
- if ( - item_uuid is None - or hf_processor_mm_kwargs - or tokenization_kwargs - ): + hashes: list[str] = [] + for i, item in enumerate(data_items.get_all_items_for_hash()): + uuid_item = uuid_items[i] + + # NOTE: Even if a uuid_item is provided, we still compute a hash + # if `hf_processor_mm_kwargs` is provided. + # This is because the processed multimodal inputs can be different + # depending on the processor kwargs. + if uuid_item is None or hf_processor_mm_kwargs: # NOTE: use provided hash string to hash with kwargs # if available for better performance. - item = item_uuid if item_uuid is not None else item - computed.append( - MultiModalHasher.hash_kwargs( + item = uuid_item if uuid_item is not None else item + hashes.append( + hasher.hash_kwargs( model_id=model_id, **{modality: item}, **hf_processor_mm_kwargs, - **tokenization_kwargs, ) ) else: - computed.append(item_uuid) - hashes[modality] = computed + hashes.append(uuid_item) + + mm_hashes[modality] = hashes else: - hashes[modality] = [ - MultiModalHasher.hash_kwargs( + mm_hashes[modality] = [ + hasher.hash_kwargs( model_id=model_id, **{modality: item}, **hf_processor_mm_kwargs, - **tokenization_kwargs, ) - for item in items + for item in data_items ] - return hashes + return mm_hashes def _get_cache_missing_items( self, @@ -1468,10 +1463,9 @@ def _apply_hf_processor( self, prompt: str | list[int], mm_data_items: MultiModalDataItems, + mm_uuid_items: MultiModalUUIDItems | None, hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Mapping[str, object], - *, - mm_uuids: MultiModalUUIDDict | None = None, ) -> tuple[list[int], MultiModalProcessingInfo, bool]: ( prompt_ids, @@ -1494,9 +1488,8 @@ def _apply_hf_processor( with timed_preprocessor_operation(self.info.ctx, "hashing"): mm_hashes = self._hash_mm_items( mm_data_items, - hf_processor_mm_kwargs, - tokenization_kwargs, - mm_uuids=mm_uuids, + mm_uuid_items, + hf_processor_mm_kwargs=hf_processor_mm_kwargs, ) mm_prompt_updates = self._get_mm_prompt_updates( @@ -1517,10 +1510,9 @@ def _cached_apply_hf_processor( self, prompt: str | list[int], mm_data_items: MultiModalDataItems, + mm_uuid_items: MultiModalUUIDItems | None, hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Mapping[str, object], - *, - mm_uuids: MultiModalUUIDDict | None = None, ) -> tuple[list[int], MultiModalProcessingInfo, bool]: """ Apply the HF processor on the full prompt text, @@ -1533,17 +1525,16 @@ def _cached_apply_hf_processor( return self._apply_hf_processor( prompt=prompt, mm_data_items=mm_data_items, + mm_uuid_items=mm_uuid_items, hf_processor_mm_kwargs=hf_processor_mm_kwargs, tokenization_kwargs=tokenization_kwargs, - mm_uuids=mm_uuids, ) with timed_preprocessor_operation(self.info.ctx, "hashing"): mm_hashes = self._hash_mm_items( mm_data_items, - hf_processor_mm_kwargs, - tokenization_kwargs, - mm_uuids=mm_uuids, + mm_uuid_items, + hf_processor_mm_kwargs=hf_processor_mm_kwargs, ) with timed_preprocessor_operation(self.info.ctx, "cache_lookup"): @@ -1753,10 +1744,9 @@ def apply( self, prompt: str | list[int], mm_items: MultiModalDataItems, - hf_processor_mm_kwargs: Mapping[str, object], + mm_uuid_items: MultiModalUUIDItems | None = None, + hf_processor_mm_kwargs: Mapping[str, object] | None = None, tokenization_kwargs: Mapping[str, object] | None = None, - *, - mm_uuids: MultiModalUUIDDict | None = None, ) -> MultiModalInputs: """ Process multi-modal inputs to be used in vLLM. 
@@ -1775,6 +1765,8 @@ def apply( if request_id is not None: self.info.ctx.create_timing_stats(request_id) + if hf_processor_mm_kwargs is None: + hf_processor_mm_kwargs = {} if tokenization_kwargs is None: tokenization_kwargs = {} @@ -1785,9 +1777,9 @@ def apply( ) = self._cached_apply_hf_processor( prompt, mm_items, + mm_uuid_items, hf_processor_mm_kwargs, tokenization_kwargs=tokenization_kwargs, - mm_uuids=mm_uuids, ) # NOTE: tokenization_kwargs are not required to init processor @@ -1861,10 +1853,9 @@ def apply( self, prompt: str | list[int], mm_items: MultiModalDataItems, - hf_processor_mm_kwargs: Mapping[str, object], + mm_uuid_items: MultiModalUUIDItems | None = None, + hf_processor_mm_kwargs: Mapping[str, object] | None = None, tokenization_kwargs: Mapping[str, object] | None = None, - *, - mm_uuids: MultiModalUUIDDict | None = None, ) -> MultiModalEncDecInputs: """ Process multi-modal inputs to be used in vLLM. @@ -1877,9 +1868,9 @@ def apply( encoder_inputs = super().apply( encoder_prompt, mm_items, - hf_processor_mm_kwargs, - tokenization_kwargs, - mm_uuids=mm_uuids, + mm_uuid_items, + hf_processor_mm_kwargs=hf_processor_mm_kwargs, + tokenization_kwargs=tokenization_kwargs, ) return self._get_enc_dec_inputs( diff --git a/vllm/renderers/base.py b/vllm/renderers/base.py index 0dccd307f179..790544294696 100644 --- a/vllm/renderers/base.py +++ b/vllm/renderers/base.py @@ -51,7 +51,7 @@ MultiModalInputs, MultiModalUUIDDict, ) - from vllm.multimodal.parse import MultiModalDataItems + from vllm.multimodal.parse import MultiModalDataItems, MultiModalUUIDItems from vllm.multimodal.processing import BaseMultiModalProcessor logger = init_logger(__name__) @@ -463,23 +463,25 @@ def _apply_prompt_extras( def _validate_mm_uuids( self, mm_data: "MultiModalDataDict", - mm_items: "MultiModalDataItems", - mm_uuids: "MultiModalUUIDDict | None", + mm_data_items: "MultiModalDataItems", + mm_uuid_items: "MultiModalUUIDItems", ) -> None: - if mm_uuids is None: - mm_uuids = {} - - # NOTE: Keys corresponding to `None` in `mm_data` don't appear in `mm_items` - modalities = mm_data.keys() | mm_uuids.keys() + # NOTE: Keys corresponding to `None` in `mm_data` don't appear in + # `mm_data_items` + modalities = mm_data.keys() | mm_uuid_items.keys() for modality in modalities: - data_items = mm_items.get(modality) or list[Any]() + data_items = mm_data_items.get(modality) + uuid_items = mm_uuid_items.get(modality) - uuid_items = mm_uuids.get(modality) or list[str | None]() - if isinstance(uuid_items, str): - uuid_items = [uuid_items] + if data_items is None: + if uuid_items is None: + raise ValueError( + f"multi_modal_data[{modality!r}] is empty but " + f"multi_modal_uuids[{modality!r}] is missing." + ) - if len(data_items) > 0: + elif uuid_items is not None: if len(uuid_items) > 0 and len(data_items) != len(uuid_items): raise ValueError( f"If given, multi_modal_uuids[{modality!r}] must have " @@ -488,24 +490,17 @@ def _validate_mm_uuids( ) for i, item in enumerate(data_items): - if item is None: - if not uuid_items: - raise ValueError( - f"multi_modal_data[{modality!r}][{i}] is empty but " - f"multi_modal_uuids[{modality!r}] is missing." - ) - - if uuid_items[i] is None: - raise ValueError( - f"multi_modal_data[{modality!r}][{i}] is empty but " - f"multi_modal_uuids[{modality!r}][{i}] is missing." - ) + if item is None and uuid_items[i] is None: + raise ValueError( + f"multi_modal_data[{modality!r}][{i}] is empty but " + f"multi_modal_uuids[{modality!r}][{i}] is missing." 
+ ) def _process_mm_uuids( self, mm_data: "MultiModalDataDict", - mm_items: "MultiModalDataItems", - mm_uuids: "MultiModalUUIDDict | None", + mm_data_items: "MultiModalDataItems", + mm_uuid_items: "MultiModalUUIDItems", mm_req_id: str, ): model_config = self.model_config @@ -520,40 +515,45 @@ def _process_mm_uuids( and model_config.multimodal_config.mm_processor_cache_gb == 0 and not self.config.cache_config.enable_prefix_caching ): - mm_uuids = { + mm_uuid_items = { modality: [f"{mm_req_id}-{modality}-{i}" for i in range(data_count)] - for modality, data_count in mm_items.get_all_counts().items() + for modality, data_count in mm_data_items.get_all_counts().items() } - self._validate_mm_uuids(mm_data, mm_items, mm_uuids) + self._validate_mm_uuids(mm_data, mm_data_items, mm_uuid_items) - return mm_uuids + return mm_uuid_items # TODO: Remove str and tokenization_kwargs after deprecating InputPreprocessor def _process_multimodal( self, prompt: list[int] | str, mm_data: "MultiModalDataDict", + mm_uuids: "MultiModalUUIDDict | None", mm_processor_kwargs: Mapping[str, object] | None, tokenization_kwargs: dict[str, Any] | None, - mm_uuids: "MultiModalUUIDDict | None", ) -> "MultiModalInputs": + from vllm.multimodal.parse import parse_mm_uuids from vllm.multimodal.processing.context import set_request_id mm_req_id = f"renderer-mm-{self._mm_req_counter.inc(1)}" mm_processor = self.get_mm_processor() - mm_items = mm_processor.info.parse_mm_data(mm_data) - mm_uuids = self._process_mm_uuids(mm_data, mm_items, mm_uuids, mm_req_id) + mm_data_items = mm_processor.info.parse_mm_data(mm_data) + mm_uuid_items = parse_mm_uuids(mm_uuids) + + mm_uuids = self._process_mm_uuids( + mm_data, mm_data_items, mm_uuid_items, mm_req_id + ) with set_request_id(mm_req_id), set_default_torch_num_threads(): mm_inputs = mm_processor.apply( prompt, - mm_items, - hf_processor_mm_kwargs=mm_processor_kwargs or {}, + mm_data_items, + mm_uuid_items, + hf_processor_mm_kwargs=mm_processor_kwargs, tokenization_kwargs=tokenization_kwargs, - mm_uuids=mm_uuids, ) self.update_mm_cache_stats()
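
Supplementary note (not part of the patch): a minimal, self-contained sketch of the UUID normalization this diff introduces. The type aliases and the `parse_mm_uuids` body are copied from the `vllm/multimodal/inputs.py` and `vllm/multimodal/parse.py` hunks above; the surrounding renderer and processor plumbing is elided.

```python
from collections.abc import Mapping, Sequence
from typing import TypeAlias

# Raw user-facing form: a bare string is shorthand for a single item.
MultiModalUUIDDict: TypeAlias = Mapping[str, Sequence[str | None] | str]
# Normalized form consumed by _validate_mm_uuids / _hash_mm_items.
MultiModalUUIDItems: TypeAlias = dict[str, Sequence[str | None]]


def parse_mm_uuids(mm_uuids: MultiModalUUIDDict | None) -> MultiModalUUIDItems:
    # None -> empty mapping; a bare string -> single-item list.
    if mm_uuids is None:
        return {}

    return {
        modality: [uuids] if isinstance(uuids, str) else uuids
        for modality, uuids in mm_uuids.items()
    }


# A bare string becomes a one-item list; None entries are preserved so the
# processor can still fall back to content hashing for those items.
assert parse_mm_uuids({"image": "img-1"}) == {"image": ["img-1"]}
assert parse_mm_uuids({"image": [None, "img-2"]}) == {"image": [None, "img-2"]}
assert parse_mm_uuids(None) == {}
```

Under the new convention, this normalized mapping is what `apply` and `_hash_mm_items` receive as the positional `mm_uuid_items` argument, replacing the previous `mm_uuids=` keyword.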