Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 54 additions & 0 deletions tests/models/multimodal/processing/test_gemma4.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,60 @@
GEMMA4_MODEL_ID = "google/gemma-4-E2B-it"


@pytest.mark.parametrize(
    "image_width,image_height,max_soft_tokens",
    [
        # Production repro: a 3x900 image (extreme aspect ratio) made the
        # prompt-side estimator return 289 while the HF Gemma 4 image
        # processor's vision tower output capped at 280, producing the
        # "Attempted to assign 280 multimodal tokens to 289 placeholders"
        # mismatch that crashed EngineCore.
        (900, 3, 280),
        (3, 900, 280),
        # Same pathology should hold for the video-frame budget (70 tokens).
        (900, 3, 70),
        # And for any other supported budget.
        (4000, 2, 1120),
    ],
)
@pytest.mark.parametrize("model_id", [GEMMA4_MODEL_ID])
def test_compute_num_soft_tokens_does_not_exceed_max_soft_tokens(
    model_id: str,
    image_width: int,
    image_height: int,
    max_soft_tokens: int,
):
    """Regression for the Gemma 3/4 multimodal crash.

    `_compute_num_soft_tokens` must never return a value larger than
    `max_soft_tokens`. The HF Gemma 4 image processor clamps its vision
    tower output to that value; if the prompt-side estimator returns more,
    the prompt has more `image` placeholder tokens than the encoder will
    fill, and `_merge_multimodal_embeddings` raises `ValueError` deep in
    the model forward.
    """
    # Pan-and-scan is the path that exercises the extreme-aspect-ratio
    # resizing logic under test; limit to one image per prompt to keep
    # the processor setup minimal.
    ctx = build_model_context(
        model_id,
        mm_processor_kwargs={"do_pan_and_scan": True},
        limit_mm_per_prompt={"image": 1},
    )
    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)

    num_soft_tokens = processor.info._compute_num_soft_tokens(
        image_width=image_width,
        image_height=image_height,
        max_soft_tokens=max_soft_tokens,
    )

    assert num_soft_tokens <= max_soft_tokens, (
        f"_compute_num_soft_tokens returned {num_soft_tokens} for "
        f"image_width={image_width}, image_height={image_height}, "
        f"max_soft_tokens={max_soft_tokens} — exceeds the cap that the HF "
        f"image processor enforces on its vision tower output. This is "
        f"the placeholder/encoder count mismatch that crashes EngineCore."
    )


@pytest.mark.parametrize("model_id", [GEMMA4_MODEL_ID])
def test_limit_mm_per_prompt(
image_assets: ImageTestAssets,
Expand Down
9 changes: 8 additions & 1 deletion vllm/model_executor/models/gemma4_mm.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,7 +265,14 @@ def _compute_num_soft_tokens(
target_h = max(unit, int(math.floor(image_height * scale / unit)) * unit)
target_w = max(unit, int(math.floor(image_width * scale / unit)) * unit)
num_patches = (target_h // patch_size) * (target_w // patch_size)
return num_patches // (pooling_kernel_size**2)
# Clamp to ``max_soft_tokens``: extreme aspect ratios (e.g. 3x900)
# cause the floor() above to round one dim up to ``unit`` while the
# other scales freely, which over-shoots ``max_patches``. The HF
# Gemma 4 image processor caps its vision-tower output at
# ``max_soft_tokens``, so without this clamp the prompt-side
# placeholder count exceeds the encoder output and
# ``_merge_multimodal_embeddings`` crashes.
return min(num_patches // (pooling_kernel_size**2), max_soft_tokens)

def get_image_repl(
self,
Expand Down
Loading