Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
43 commits
Select commit Hold shift + click to select a range
118aae6
add
ywang96 Aug 22, 2025
3b92bcf
typo
ywang96 Aug 22, 2025
53dd7c7
fix import
ywang96 Aug 22, 2025
57e114d
Merge branch 'main' into allow-passing-mm-hash
ywang96 Aug 24, 2025
cea5c09
revert
ywang96 Aug 24, 2025
611827f
revert
ywang96 Aug 24, 2025
1f31339
allow missing entry
ywang96 Aug 24, 2025
1756606
update
ywang96 Aug 24, 2025
a82a865
update typing
ywang96 Aug 24, 2025
c6a1e6a
rename
Aug 25, 2025
0af3999
comment
Aug 25, 2025
6defa1d
comments
Aug 25, 2025
cb406f3
tweak
Aug 25, 2025
977811b
typing
Aug 25, 2025
f26890b
clarify
Aug 25, 2025
06a69b0
relax on user-input data structure
ywang96 Aug 26, 2025
d9c97fc
fix import
ywang96 Aug 26, 2025
0927f7f
Merge branch 'main' into allow-passing-mm-hash
ywang96 Aug 26, 2025
2663d0e
typing
ywang96 Aug 26, 2025
86d7e46
typing cleanup
ywang96 Aug 26, 2025
46c874e
Merge branch 'main' into allow-passing-mm-hash
ywang96 Aug 27, 2025
175bfe4
update
ywang96 Aug 27, 2025
fb8aa34
fix
ywang96 Aug 27, 2025
c874889
typing
ywang96 Aug 27, 2025
4811dc6
update
ywang96 Aug 28, 2025
b6be45a
update
ywang96 Aug 28, 2025
779d2b2
use typealias
ywang96 Aug 28, 2025
e5ca736
mapping
ywang96 Aug 28, 2025
d3b227e
fix
ywang96 Aug 28, 2025
0848d86
typing
ywang96 Aug 28, 2025
ce7c9b0
typing
ywang96 Aug 28, 2025
2f0a02c
add tests
ywang96 Aug 28, 2025
06d31af
add tests
ywang96 Aug 28, 2025
6ea9ee4
Merge branch 'main' into allow-passing-mm-hash
ywang96 Aug 28, 2025
c074041
rename
ywang96 Aug 29, 2025
2388fd6
Merge branch 'main' into allow-passing-mm-hash
ywang96 Aug 29, 2025
518230c
add doc
ywang96 Aug 29, 2025
6e638a9
make warning
ywang96 Aug 29, 2025
87e7e89
add
ywang96 Aug 29, 2025
778557e
Merge branch 'main' into allow-passing-mm-hash
ywang96 Aug 29, 2025
93fd175
Merge branch 'main' into allow-passing-mm-hash
ywang96 Aug 29, 2025
34a2afc
Merge branch 'main' into allow-passing-mm-hash
ywang96 Aug 30, 2025
e53694e
fix type check
ywang96 Aug 30, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions docs/features/multimodal_inputs.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,41 @@ To input multi-modal data, follow this schema in [vllm.inputs.PromptType][]:
- `prompt`: The prompt should follow the format that is documented on HuggingFace.
- `multi_modal_data`: This is a dictionary that follows the schema defined in [vllm.multimodal.inputs.MultiModalDataDict][].

### Stable UUIDs for Caching (multi_modal_uuids)

When using multi-modal inputs, vLLM normally hashes each media item by content to enable caching across requests. You can optionally pass `multi_modal_uuids` to provide your own stable IDs for each item so caching can reuse work across requests without rehashing the raw content.

??? code

```python
from vllm import LLM
from PIL import Image

# Qwen2.5-VL example with two images
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct")

prompt = "USER: <image><image>\nDescribe the differences.\nASSISTANT:"
img_a = Image.open("/path/to/a.jpg")
img_b = Image.open("/path/to/b.jpg")

outputs = llm.generate({
"prompt": prompt,
"multi_modal_data": {"image": [img_a, img_b]},
# Provide stable IDs for caching.
# Requirements (matched by this example):
# - Include every modality present in multi_modal_data.
# - For lists, provide the same number of entries.
# - Use None to fall back to content hashing for that item.
"multi_modal_uuids": {"image": ["sku-1234-a", None]},
})

for o in outputs:
print(o.outputs[0].text)
```

!!! warning
If both multimodal processor caching and prefix caching are disabled, user-provided `multi_modal_uuids` are ignored.

### Image Inputs

You can pass a single image to the `'image'` field of the multi-modal dictionary, as shown in the following examples:
Expand Down
229 changes: 229 additions & 0 deletions tests/v1/engine/test_processor_multi_modal_uuids.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,229 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import pytest

from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset
from vllm.config import CacheConfig, DeviceConfig, ModelConfig, VllmConfig
from vllm.platforms.interface import UnspecifiedPlatform
from vllm.sampling_params import SamplingParams
from vllm.v1.engine import processor as processor_mod
from vllm.v1.engine.processor import Processor

# Shared multimodal fixtures, loaded once at import time so every test can
# reuse the same decoded media without re-reading the asset files.
cherry_pil_image = ImageAsset("cherry_blossom").pil_image
stop_pil_image = ImageAsset("stop_sign").pil_image
baby_reading_np_ndarrays = VideoAsset("baby_reading").np_ndarrays


# Mock processor for testing
def _mk_processor(monkeypatch,
                  *,
                  mm_cache_gb: float = 4.0,
                  enable_prefix_caching: bool = True) -> Processor:
    """Build a Processor suitable for offline unit tests.

    Every touchpoint that would reach external resources (generation-config
    lookup, platform capability checks, config validation, the multimodal
    processor cache factory) is monkeypatched out, so construction needs no
    network, tokenizer files, or accelerator.
    """
    # Applied in order; each entry is (target, attribute, replacement).
    stubs = [
        (ModelConfig, "try_get_generation_config", lambda self: {}),
        (ModelConfig, "__post_init__", lambda self: None),
        (UnspecifiedPlatform, "is_async_output_supported",
         classmethod(lambda cls, enforce_eager: True)),
        (ModelConfig, "verify_async_output_proc",
         lambda self, parallel_config, speculative_config, device_config:
         None),
        (ModelConfig, "verify_with_parallel_config",
         lambda self, parallel_config: None),
        (processor_mod, "processor_cache_from_config",
         lambda vllm_config, mm_registry: None),
        (VllmConfig, "__post_init__", lambda self: None),
    ]
    for target, attr, replacement in stubs:
        monkeypatch.setattr(target, attr, replacement, raising=True)

    model_config = ModelConfig(
        skip_tokenizer_init=True,
        max_model_len=128,
        mm_processor_cache_gb=mm_cache_gb,
        generation_config="vllm",
        tokenizer="dummy",
    )

    # Processor.process_inputs only reads mm_processor_cache_gb off the
    # multimodal config, so a minimal stand-in object is enough.
    class _StubMultimodalConfig:

        def __init__(self, gb: float):
            self.mm_processor_cache_gb = gb

    model_config.multimodal_config = _StubMultimodalConfig(
        mm_cache_gb)  # type: ignore[attr-defined]
    vllm_config = VllmConfig(
        model_config=model_config,
        cache_config=CacheConfig(enable_prefix_caching=enable_prefix_caching),
        device_config=DeviceConfig(device="cpu"),
    )

    # tokenizer=None is acceptable here: InputPreprocessor tolerates a
    # missing tokenizer whenever skip_tokenizer_init is set.
    return Processor(vllm_config, tokenizer=None)  # type: ignore[arg-type]


def test_multi_modal_uuids_length_mismatch_raises(monkeypatch):
    """A uuid list shorter than its modality's data list must be rejected."""
    processor = _mk_processor(monkeypatch)

    # Two images in the data but only one uuid entry for the modality.
    bad_prompt = {
        "prompt": "USER: <image>\nDescribe\nASSISTANT:",
        "multi_modal_data": {
            "image": [cherry_pil_image, stop_pil_image]
        },
        "multi_modal_uuids": {
            "image": ["hash_cherry"]
        },
    }

    with pytest.raises(ValueError, match="must have same length as data"):
        processor.process_inputs(
            request_id="req-1",
            prompt=bad_prompt,  # type: ignore[arg-type]
            params=SamplingParams(),
        )


def test_multi_modal_uuids_missing_modality_raises(monkeypatch):
    """Every modality present in the data must also appear in the uuids."""
    processor = _mk_processor(monkeypatch)

    # Data carries both image and video, but uuids only cover image; the
    # absent video entry should trigger a validation error.
    bad_prompt = {
        "prompt": "USER: <image><video>\nDescribe\nASSISTANT:",
        "multi_modal_data": {
            "image": [cherry_pil_image],
            "video": [baby_reading_np_ndarrays]
        },
        "multi_modal_uuids": {
            "image": ["hash_cherry"]
        },
    }

    with pytest.raises(ValueError,
                       match="must be provided if multi_modal_data"):
        processor.process_inputs(
            request_id="req-2",
            prompt=bad_prompt,  # type: ignore[arg-type]
            params=SamplingParams(),
        )


@pytest.mark.parametrize(
    "mm_cache_gb, enable_prefix_caching",
    [
        (4.0, True),  # default behavior
        (4.0, False),  # prefix caching disabled
        (0.0, True),  # processor cache disabled
    ],
)
def test_multi_modal_uuids_accepts_none_and_passes_through(
        monkeypatch, mm_cache_gb: float, enable_prefix_caching: bool):
    """User uuids (including None entries) reach preprocess() untouched
    whenever at least one of the two caches is enabled."""
    processor = _mk_processor(monkeypatch,
                              mm_cache_gb=mm_cache_gb,
                              enable_prefix_caching=enable_prefix_caching)

    # Record the overrides that Processor forwards to preprocess().
    seen: dict[str, object] = {}

    def _recording_preprocess(prompt,
                              *,
                              tokenization_kwargs=None,
                              lora_request=None,
                              mm_hash_overrides=None):
        seen["mm_hash_overrides"] = mm_hash_overrides
        # Minimal processed inputs for decoder-only flow
        return {"type": "token", "prompt_token_ids": [1]}

    # Replace only the bound preprocess method on this one instance.
    monkeypatch.setattr(processor.input_preprocessor,
                        "preprocess",
                        _recording_preprocess,
                        raising=True)

    # Same two-image-plus-video scenario for every parametrization.
    mm_uuids = {"image": [None, "hash_stop"], "video": None}
    prompt = {
        "prompt": "USER: <image><image>\nTwo images\nASSISTANT:",
        "multi_modal_data": {
            "image": [cherry_pil_image, stop_pil_image],
            "video": baby_reading_np_ndarrays,
        },
        "multi_modal_uuids": mm_uuids,
    }

    processor.process_inputs(
        request_id="req-3",
        prompt=prompt,  # type: ignore[arg-type]
        params=SamplingParams(),
    )

    assert seen["mm_hash_overrides"] == mm_uuids


def test_multi_modal_uuids_ignored_when_caching_disabled(monkeypatch):
    """With processor cache at 0 GB and prefix caching off, user uuids are
    discarded and request-id-derived overrides are forwarded instead."""
    processor = _mk_processor(monkeypatch,
                              mm_cache_gb=0.0,
                              enable_prefix_caching=False)

    # Record the overrides that Processor forwards to preprocess().
    seen: dict[str, object] = {}

    def _recording_preprocess(prompt,
                              *,
                              tokenization_kwargs=None,
                              lora_request=None,
                              mm_hash_overrides=None):
        seen["mm_hash_overrides"] = mm_hash_overrides
        return {"type": "token", "prompt_token_ids": [1]}

    monkeypatch.setattr(processor.input_preprocessor,
                        "preprocess",
                        _recording_preprocess,
                        raising=True)

    request_id = "req-42"
    mm_uuids = {"image": ["hash_cherry", "hash_stop"], "video": "hash_video"}
    prompt = {
        "prompt": "USER: <image><image><video>\nDescribe\nASSISTANT:",
        "multi_modal_data": {
            "image": [cherry_pil_image, stop_pil_image],
            "video": baby_reading_np_ndarrays,
        },
        "multi_modal_uuids": mm_uuids,
    }

    processor.process_inputs(
        request_id=request_id,
        prompt=prompt,  # type: ignore[arg-type]
        params=SamplingParams(),
    )

    # The user-supplied uuids must be replaced with request-id-based ones.
    expected = {
        "image": [f"{request_id}-image-0", f"{request_id}-image-1"],
        "video": [f"{request_id}-video-0"],
    }
    assert seen["mm_hash_overrides"] == expected
2 changes: 1 addition & 1 deletion vllm/entrypoints/openai/serving_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.multimodal import ( # noqa: F401 - Required to resolve Pydantic error in RequestProcessingMixin
MultiModalDataDict)
MultiModalDataDict, MultiModalUUIDDict)
from vllm.outputs import PoolingRequestOutput, RequestOutput
from vllm.pooling_params import PoolingParams
from vllm.sampling_params import BeamSearchParams, SamplingParams
Expand Down
20 changes: 19 additions & 1 deletion vllm/inputs/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
from typing_extensions import NotRequired, TypedDict, TypeIs, TypeVar

if TYPE_CHECKING:
from vllm.multimodal.inputs import MultiModalDataDict, MultiModalInputs
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalInputs,
MultiModalUUIDDict)


class TextPrompt(TypedDict):
Expand All @@ -30,6 +31,15 @@ class TextPrompt(TypedDict):
to pass the mm_processor_kwargs to each of them.
"""

multi_modal_uuids: NotRequired["MultiModalUUIDDict"]
"""
Optional user-specified UUIDs for multimodal items, mapped by modality.
Lists must match the number of items per modality and may contain `None`.
For `None` entries, the hasher will compute IDs automatically; non-None
entries override the default hashes for caching, and MUST be unique per
multimodal item.
"""

cache_salt: NotRequired[str]
"""
Optional cache salt to be used for prefix caching.
Expand Down Expand Up @@ -59,6 +69,14 @@ class TokensPrompt(TypedDict):
to pass the mm_processor_kwargs to each of them.
"""

multi_modal_uuids: NotRequired["MultiModalUUIDDict"]
"""
Optional user-specified UUIDs for multimodal items, mapped by modality.
Lists must match the number of items per modality and may contain `None`.
For `None` entries, the hasher will compute IDs automatically; non-None
entries override the default hashes for caching, and MUST be unique per
multimodal item.
"""

cache_salt: NotRequired[str]
"""
Optional cache salt to be used for prefix caching.
Expand Down
Loading