35 changes: 22 additions & 13 deletions tests/renderers/test_process_multi_modal_uuids.py
@@ -6,6 +6,7 @@
 from vllm.assets.image import ImageAsset
 from vllm.assets.video import VideoAsset
 from vllm.config import CacheConfig, ModelConfig, VllmConfig
+from vllm.multimodal.parse import parse_mm_uuids
 from vllm.renderers.hf import HfRenderer
 from vllm.tokenizers.registry import tokenizer_args_from_config
@@ -45,10 +46,11 @@ def test_multi_modal_uuids_length_mismatch_raises():
     mm_uuids = {"image": ["hash_cherry"]}

     mm_processor = renderer.get_mm_processor()
-    mm_items = mm_processor.info.parse_mm_data(mm_data)
+    mm_data_items = mm_processor.info.parse_mm_data(mm_data)
+    mm_uuid_items = parse_mm_uuids(mm_uuids)

     with pytest.raises(ValueError, match="must have same length as"):
-        renderer._process_mm_uuids(mm_data, mm_items, mm_uuids, "req-1")
+        renderer._process_mm_uuids(mm_data, mm_data_items, mm_uuid_items, "req-1")


 def test_multi_modal_uuids_missing_modality_raises():
@@ -63,10 +65,11 @@ def test_multi_modal_uuids_missing_modality_raises():
     mm_uuids = {"image": ["hash_cherry"]}

     mm_processor = renderer.get_mm_processor()
-    mm_items = mm_processor.info.parse_mm_data(mm_data)
+    mm_data_items = mm_processor.info.parse_mm_data(mm_data)
+    mm_uuid_items = parse_mm_uuids(mm_uuids)

     with pytest.raises(ValueError, match="is empty but .* is missing"):
-        renderer._process_mm_uuids(mm_data, mm_items, mm_uuids, "req-2")
+        renderer._process_mm_uuids(mm_data, mm_data_items, mm_uuid_items, "req-2")


 @pytest.mark.parametrize(
@@ -78,7 +81,7 @@ def test_multi_modal_uuids_missing_modality_raises():
     ],
 )
 def test_multi_modal_uuids_accepts_none_and_passes_through(
-    monkeypatch, mm_cache_gb: float, enable_prefix_caching: bool
+    mm_cache_gb: float, enable_prefix_caching: bool
 ):
     renderer = _build_renderer(
         mm_cache_gb=mm_cache_gb,
@@ -94,9 +97,11 @@ def test_multi_modal_uuids_accepts_none_and_passes_through(
     mm_uuids = {"image": [None, "hash_stop"], "video": None}

     mm_processor = renderer.get_mm_processor()
-    mm_items = mm_processor.info.parse_mm_data(mm_data)
+    mm_data_items = mm_processor.info.parse_mm_data(mm_data)
+    mm_uuid_items = parse_mm_uuids(mm_uuids)

     processed_mm_uuids = renderer._process_mm_uuids(
-        mm_data, mm_items, mm_uuids, "req-3"
+        mm_data, mm_data_items, mm_uuid_items, "req-3"
     )

     assert processed_mm_uuids == mm_uuids
@@ -111,7 +116,7 @@ def test_multi_modal_uuids_accepts_none_and_passes_through(
     ],
 )
 def test_multi_modal_uuids_accepts_empty(
-    monkeypatch, mm_cache_gb: float, enable_prefix_caching: bool
+    mm_cache_gb: float, enable_prefix_caching: bool
 ):
     renderer = _build_renderer(
         mm_cache_gb=mm_cache_gb,
@@ -124,15 +129,17 @@ def test_multi_modal_uuids_accepts_empty(
     mm_uuids = {"image": [], "video": None}  # type: ignore[var-annotated]

     mm_processor = renderer.get_mm_processor()
-    mm_items = mm_processor.info.parse_mm_data(mm_data)
+    mm_data_items = mm_processor.info.parse_mm_data(mm_data)
+    mm_uuid_items = parse_mm_uuids(mm_uuids)

     processed_mm_uuids = renderer._process_mm_uuids(
-        mm_data, mm_items, mm_uuids, "req-4"
+        mm_data, mm_data_items, mm_uuid_items, "req-4"
     )

     assert processed_mm_uuids == mm_uuids


-def test_multi_modal_uuids_ignored_when_caching_disabled(monkeypatch):
+def test_multi_modal_uuids_ignored_when_caching_disabled():
     # When both processor cache is 0 and prefix caching disabled, the
     # processor builds overrides from request id instead of using user UUIDs.
     renderer = _build_renderer(mm_cache_gb=0.0, enable_prefix_caching=False)
@@ -145,9 +152,11 @@ def test_multi_modal_uuids_ignored_when_caching_disabled(monkeypatch):
     mm_uuids = {"image": ["hash_cherry", "hash_stop"], "video": ["hash_video"]}

     mm_processor = renderer.get_mm_processor()
-    mm_items = mm_processor.info.parse_mm_data(mm_data)
+    mm_data_items = mm_processor.info.parse_mm_data(mm_data)
+    mm_uuid_items = parse_mm_uuids(mm_uuids)

     processed_mm_uuids = renderer._process_mm_uuids(
-        mm_data, mm_items, mm_uuids, request_id
+        mm_data, mm_data_items, mm_uuid_items, request_id
     )

     # Expect request-id-based overrides are passed through
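The tests now parse the data and UUID mappings separately before calling `_process_mm_uuids`. As a rough illustration of the UUID side, here is a minimal sketch of the normalization the tests lean on; `parse_mm_uuids_sketch` is a hypothetical stand-in for `vllm.multimodal.parse.parse_mm_uuids`, whose real implementation wraps the result into `MultiModalUUIDItems` and may perform extra validation:

```python
# Hypothetical stand-in for vllm.multimodal.parse.parse_mm_uuids; the real
# helper returns MultiModalUUIDItems and may validate against the parsed data.
def parse_mm_uuids_sketch(
    mm_uuids: dict[str, list[str | None] | None],
) -> dict[str, list[str | None] | None]:
    parsed: dict[str, list[str | None] | None] = {}
    for modality, uuids in mm_uuids.items():
        # None means "no user UUIDs for this modality"; a list may mix
        # explicit hashes with per-item None placeholders.
        parsed[modality] = None if uuids is None else list(uuids)
    return parsed

# Mirrors the pass-through test above: None entries survive normalization.
assert parse_mm_uuids_sketch({"image": [None, "hash_stop"], "video": None}) == {
    "image": [None, "hash_stop"],
    "video": None,
}
```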
6 changes: 3 additions & 3 deletions vllm/inputs/preprocess.py
@@ -91,7 +91,7 @@ def _process_multimodal(
         self,
         prompt: str | list[int],
         mm_data: MultiModalDataDict,
-        mm_processor_kwargs: Mapping[str, object] | None,
+        mm_processor_kwargs: Mapping[str, object] | None = None,
         tokenization_kwargs: dict[str, Any] | None = None,
         *,
         mm_uuids: MultiModalUUIDDict | None = None,
@@ -103,9 +103,9 @@ def _process_multimodal(
         return self.renderer._process_multimodal(
             prompt,
             mm_data,
+            mm_uuids=mm_uuids,
             mm_processor_kwargs=mm_processor_kwargs,
             tokenization_kwargs=tokenization_kwargs,
-            mm_uuids=mm_uuids,
         )

     def _process_embeds(
@@ -144,7 +144,7 @@ def _process_tokens(
         inputs = self._process_multimodal(
             prompt_token_ids,
             multi_modal_data,
-            parsed_content.get("mm_processor_kwargs") or {},
+            parsed_content.get("mm_processor_kwargs"),
             tokenization_kwargs=tokenization_kwargs,
             mm_uuids=parsed_content.get("multi_modal_uuids"),
         )
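Two defaulting fixes land together here: `mm_processor_kwargs` gains a `None` default in the signature, and `_process_tokens` stops coercing a missing value to `{}` at the call site. A minimal sketch of the idea, with `process` as a hypothetical callee rather than the vLLM function: forwarding `None` lets the callee own the fallback instead of every caller supplying `{}`.

```python
from collections.abc import Mapping

# Hypothetical callee: with a None default, the callee owns the fallback,
# so call sites no longer need the `... or {}` coercion removed above.
def process(mm_processor_kwargs: Mapping[str, object] | None = None) -> dict[str, object]:
    return dict(mm_processor_kwargs or {})

parsed_content: dict[str, Mapping[str, object]] = {}
# The possibly-missing value is forwarded untouched.
assert process(parsed_content.get("mm_processor_kwargs")) == {}
```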
15 changes: 9 additions & 6 deletions vllm/model_executor/models/clip.py
@@ -36,9 +36,13 @@
     MultiModalFieldConfig,
     MultiModalInputs,
     MultiModalKwargsItems,
-    MultiModalUUIDDict,
 )
-from vllm.multimodal.parse import ImageProcessorItems, ImageSize, MultiModalDataItems
+from vllm.multimodal.parse import (
+    ImageProcessorItems,
+    ImageSize,
+    MultiModalDataItems,
+    MultiModalUUIDItems,
+)
 from vllm.multimodal.processing import (
     BaseDummyInputsBuilder,
     BaseMultiModalProcessor,
@@ -203,10 +207,9 @@ def apply(
         self,
         prompt: str | list[int],
         mm_items: MultiModalDataItems,
-        hf_processor_mm_kwargs: Mapping[str, object],
+        mm_uuid_items: MultiModalUUIDItems | None = None,
+        hf_processor_mm_kwargs: Mapping[str, object] | None = None,
         tokenization_kwargs: Mapping[str, object] | None = None,
-        *,
-        mm_uuids: MultiModalUUIDDict | None = None,
     ) -> MultiModalInputs:
         if mm_items:
             if isinstance(prompt, str):
@@ -235,9 +238,9 @@ def apply(
         return super().apply(
             prompt=prompt,
             mm_items=mm_items,
+            mm_uuid_items=mm_uuid_items,
             hf_processor_mm_kwargs=hf_processor_mm_kwargs,
             tokenization_kwargs=tokenization_kwargs,
-            mm_uuids=mm_uuids,
         )

     def _hf_processor_applies_updates(
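The same signature change repeats across the model processors below: the keyword-only `mm_uuids: MultiModalUUIDDict` is replaced by `mm_uuid_items: MultiModalUUIDItems`, placed positionally right after `mm_items`, with the remaining mapping arguments made optional. A schematic before/after of the call shape (placeholder types; not the full vLLM signature):

```python
from collections.abc import Mapping

# Before (schematic): UUIDs were a keyword-only dict at the end.
def apply_old(prompt, mm_items, hf_processor_mm_kwargs: Mapping[str, object],
              tokenization_kwargs: Mapping[str, object] | None = None,
              *, mm_uuids=None):
    return prompt, mm_items, mm_uuids

# After (schematic): parsed UUID items ride positionally after the data
# items, and both kwargs mappings default to None.
def apply_new(prompt, mm_items, mm_uuid_items=None,
              hf_processor_mm_kwargs: Mapping[str, object] | None = None,
              tokenization_kwargs: Mapping[str, object] | None = None):
    return prompt, mm_items, mm_uuid_items

assert apply_old("p", [], {}, mm_uuids=["u"]) == apply_new("p", [], ["u"])
```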
8 changes: 4 additions & 4 deletions vllm/model_executor/models/deepseek_vl2.py
@@ -24,13 +24,13 @@
     MultiModalDataDict,
     MultiModalFieldConfig,
     MultiModalKwargsItems,
-    MultiModalUUIDDict,
 )
 from vllm.multimodal.parse import (
     ImageEmbeddingItems,
     ImageProcessorItems,
     ImageSize,
     MultiModalDataItems,
+    MultiModalUUIDItems,
 )
 from vllm.multimodal.processing import BaseDummyInputsBuilder
 from vllm.multimodal.processing.processor import (
@@ -313,9 +313,9 @@ def _cached_apply_hf_processor(
         self,
         prompt: str | list[int],
         mm_data_items: MultiModalDataItems,
+        mm_uuid_items: MultiModalUUIDItems | None,
         hf_processor_mm_kwargs: Mapping[str, object],
         tokenization_kwargs: Mapping[str, object],
-        mm_uuids: MultiModalUUIDDict | None = None,
     ) -> tuple[list[int], MultiModalProcessingInfo, bool]:
         # The processor logic is different for len(images) <= 2 vs > 2
         # Since the processing cache assumes that the processor output is
@@ -325,17 +325,17 @@ def _cached_apply_hf_processor(
             return self._apply_hf_processor(
                 prompt=prompt,
                 mm_data_items=mm_data_items,
+                mm_uuid_items=mm_uuid_items,
                 hf_processor_mm_kwargs=hf_processor_mm_kwargs,
                 tokenization_kwargs=tokenization_kwargs,
-                mm_uuids=mm_uuids,
             )

         return super()._cached_apply_hf_processor(
             prompt=prompt,
             mm_data_items=mm_data_items,
+            mm_uuid_items=mm_uuid_items,
             hf_processor_mm_kwargs=hf_processor_mm_kwargs,
             tokenization_kwargs=tokenization_kwargs,
-            mm_uuids=mm_uuids,
         )
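This override and the identical one in h2ovl.py below share a shape: when the HF processor's output depends on how many images a request carries (≤2 vs >2 here, ≤1 vs >1 for H2OVL), per-item cache entries would not compose correctly, so the cached path is skipped for the variable case. A toy sketch of that dispatch, with all names hypothetical stand-ins for vLLM's processor and cache machinery:

```python
# Toy sketch of the cache-bypass dispatch; everything here is a hypothetical
# stand-in for vLLM's processor and its per-item processing cache.
_cache: dict[tuple[str, int], str] = {}

def _apply(prompt: str, num_images: int) -> str:
    return f"{prompt}|imgs={num_images}"  # placeholder for real HF processing

def _cached_apply(prompt: str, num_images: int) -> str:
    if num_images > 2:
        # Output shape varies with image count, so cached per-item results
        # cannot be reused safely: process directly, bypassing the cache.
        return _apply(prompt, num_images)
    return _cache.setdefault((prompt, num_images), _apply(prompt, num_images))

assert _cached_apply("hello", 3) == "hello|imgs=3"  # bypassed the cache
assert _cached_apply("hello", 1) == "hello|imgs=1"  # cached path
assert ("hello", 1) in _cache and ("hello", 3) not in _cache
```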


9 changes: 5 additions & 4 deletions vllm/model_executor/models/h2ovl.py
@@ -16,11 +16,12 @@

 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import MultiModalKwargsItems, MultiModalUUIDDict
+from vllm.multimodal.inputs import MultiModalKwargsItems
 from vllm.multimodal.parse import (
     ImageEmbeddingItems,
     ImageProcessorItems,
     MultiModalDataItems,
+    MultiModalUUIDItems,
 )
 from vllm.multimodal.processing.processor import (
     MultiModalProcessingInfo,
@@ -491,9 +492,9 @@ def _cached_apply_hf_processor(
         self,
         prompt: str | list[int],
         mm_data_items: MultiModalDataItems,
+        mm_uuid_items: MultiModalUUIDItems | None,
         hf_processor_mm_kwargs: Mapping[str, object],
         tokenization_kwargs: Mapping[str, object],
-        mm_uuids: MultiModalUUIDDict | None = None,
     ) -> tuple[list[int], MultiModalProcessingInfo, bool]:
         # The processor logic is different for len(images) <= 1 vs > 1
         # Since the processing cache assumes that the processor output is
@@ -503,17 +504,17 @@ def _cached_apply_hf_processor(
             return self._apply_hf_processor(
                 prompt=prompt,
                 mm_data_items=mm_data_items,
+                mm_uuid_items=mm_uuid_items,
                 hf_processor_mm_kwargs=hf_processor_mm_kwargs,
                 tokenization_kwargs=tokenization_kwargs,
-                mm_uuids=mm_uuids,
             )

         return super()._cached_apply_hf_processor(
             prompt=prompt,
             mm_data_items=mm_data_items,
+            mm_uuid_items=mm_uuid_items,
             hf_processor_mm_kwargs=hf_processor_mm_kwargs,
             tokenization_kwargs=tokenization_kwargs,
-            mm_uuids=mm_uuids,
         )


12 changes: 6 additions & 6 deletions vllm/model_executor/models/llava.py
@@ -30,14 +30,14 @@
     MultiModalFieldConfig,
     MultiModalInputs,
     MultiModalKwargsItems,
-    MultiModalUUIDDict,
     mm_inputs,
 )
 from vllm.multimodal.parse import (
     ImageEmbeddingItems,
     ImageProcessorItems,
     ImageSize,
     MultiModalDataItems,
+    MultiModalUUIDItems,
 )
 from vllm.multimodal.processing import (
     BaseDummyInputsBuilder,
@@ -773,9 +773,9 @@ def apply(
         self,
         prompt: str | list[int],
         mm_items: MultiModalDataItems,
-        hf_processor_mm_kwargs: Mapping[str, object],
+        mm_uuid_items: MultiModalUUIDItems | None = None,
+        hf_processor_mm_kwargs: Mapping[str, object] | None = None,
         tokenization_kwargs: Mapping[str, object] | None = None,
-        mm_uuids: MultiModalUUIDDict | None = None,
     ) -> MultiModalInputs:
         hf_config = self.info.get_hf_config()
         image_token_id = hf_config.image_token_index
@@ -789,9 +789,9 @@ def apply(
         result = super().apply(
             prompt,
             mm_items,
-            hf_processor_mm_kwargs,
-            tokenization_kwargs,
-            mm_uuids=mm_uuids,
+            mm_uuid_items,
+            hf_processor_mm_kwargs=hf_processor_mm_kwargs,
+            tokenization_kwargs=tokenization_kwargs,
         )

         mm_item_counts = mm_items.get_all_counts()
12 changes: 6 additions & 6 deletions vllm/model_executor/models/paligemma.py
@@ -16,12 +16,12 @@
     MultiModalFieldConfig,
     MultiModalInputs,
     MultiModalKwargsItems,
-    MultiModalUUIDDict,
 )
 from vllm.multimodal.parse import (
     ImageEmbeddingItems,
     ImageProcessorItems,
     MultiModalDataItems,
+    MultiModalUUIDItems,
 )
 from vllm.multimodal.processing import (
     BaseDummyInputsBuilder,
@@ -231,16 +231,16 @@ def apply(
         self,
         prompt: str | list[int],
         mm_items: MultiModalDataItems,
-        hf_processor_mm_kwargs: Mapping[str, object],
+        mm_uuid_items: MultiModalUUIDItems | None = None,
+        hf_processor_mm_kwargs: Mapping[str, object] | None = None,
         tokenization_kwargs: Mapping[str, object] | None = None,
-        mm_uuids: MultiModalUUIDDict | None = None,
     ) -> MultiModalInputs:
         mm_inputs = super().apply(
             prompt,
             mm_items,
-            hf_processor_mm_kwargs,
-            tokenization_kwargs,
-            mm_uuids=mm_uuids,
+            mm_uuid_items,
+            hf_processor_mm_kwargs=hf_processor_mm_kwargs,
+            tokenization_kwargs=tokenization_kwargs,
         )
         prompt_token_ids = mm_inputs["prompt_token_ids"]

12 changes: 8 additions & 4 deletions vllm/model_executor/models/pixtral.py
@@ -44,10 +44,14 @@
 from vllm.multimodal.inputs import (
     MultiModalDataDict,
     MultiModalFieldConfig,
-    MultiModalUUIDDict,
     NestedTensors,
 )
-from vllm.multimodal.parse import ImageProcessorItems, ImageSize, MultiModalDataItems
+from vllm.multimodal.parse import (
+    ImageProcessorItems,
+    ImageSize,
+    MultiModalDataItems,
+    MultiModalUUIDItems,
+)
 from vllm.multimodal.processing import BaseDummyInputsBuilder, ProcessorInputs
 from vllm.multimodal.processing.processor import (
     BaseMultiModalProcessor,
@@ -344,16 +348,16 @@ def _cached_apply_hf_processor(
         self,
         prompt: str | list[int],
         mm_data_items: MultiModalDataItems,
+        mm_uuid_items: MultiModalUUIDItems | None,
         hf_processor_mm_kwargs: Mapping[str, object],
         tokenization_kwargs: Mapping[str, object],
-        mm_uuids: MultiModalUUIDDict | None = None,
     ) -> tuple[list[int], MultiModalProcessingInfo, bool]:
         prompt_ids, mm_info, _ = super()._cached_apply_hf_processor(
             prompt=prompt,
             mm_data_items=mm_data_items,
+            mm_uuid_items=mm_uuid_items,
             hf_processor_mm_kwargs=hf_processor_mm_kwargs,
             tokenization_kwargs=tokenization_kwargs,
-            mm_uuids=mm_uuids,
         )

         # NOTE: The tokens are already inserted by the chat template
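Pixtral only threads the new `mm_uuid_items` through to the parent and keeps its existing post-processing: per the NOTE, the chat template has already inserted the multimodal tokens, so the override works on the returned `prompt_ids` rather than re-applying prompt updates. A compressed, hypothetical sketch of that wrap-the-parent pattern; the `(prompt_ids, mm_info, is_update_applied)` reading of the triple is an assumption inferred from the annotated return type in the diff:

```python
from typing import Any, Callable

# Hypothetical sketch: wrap the parent's cached processing, then override the
# final flag because the tokens were already inserted by the chat template.
def cached_apply_wrapper(
    parent: Callable[..., tuple[list[int], Any, bool]],
    prompt: str,
    **kwargs: Any,
) -> tuple[list[int], Any, bool]:
    prompt_ids, mm_info, _ = parent(prompt, **kwargs)
    # Assumption: reporting updates as already applied skips re-insertion.
    return prompt_ids, mm_info, True

# Usage with a dummy parent standing in for super()._cached_apply_hf_processor:
dummy_parent = lambda prompt, **kw: ([1, 2, 3], {"info": None}, False)
assert cached_apply_wrapper(dummy_parent, "img prompt") == ([1, 2, 3], {"info": None}, True)
```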