diff --git a/tests/models/multimodal/processing/test_gemma4.py b/tests/models/multimodal/processing/test_gemma4.py
index 808fab6a030f..bd1f2bb86779 100644
--- a/tests/models/multimodal/processing/test_gemma4.py
+++ b/tests/models/multimodal/processing/test_gemma4.py
@@ -12,6 +12,60 @@
 GEMMA4_MODEL_ID = "google/gemma-4-E2B-it"
 
 
+@pytest.mark.parametrize(
+    "image_width,image_height,max_soft_tokens",
+    [
+        # Production repro: a 3x900 image (extreme aspect ratio) made the
+        # prompt-side estimator return 289 while the HF Gemma 4 image
+        # processor's vision tower output was capped at 280, producing the
+        # "Attempted to assign 280 multimodal tokens to 289 placeholders"
+        # mismatch that crashed EngineCore.
+        (900, 3, 280),
+        (3, 900, 280),
+        # The same pathology should hold for the video-frame budget (70).
+        (900, 3, 70),
+        # And for any other supported budget.
+        (4000, 2, 1120),
+    ],
+)
+@pytest.mark.parametrize("model_id", [GEMMA4_MODEL_ID])
+def test_compute_num_soft_tokens_does_not_exceed_max_soft_tokens(
+    model_id: str,
+    image_width: int,
+    image_height: int,
+    max_soft_tokens: int,
+):
+    """Regression test for the Gemma 3/4 multimodal crash.
+
+    `_compute_num_soft_tokens` must never return a value larger than
+    `max_soft_tokens`. The HF Gemma 4 image processor clamps its vision
+    tower output to that value; if the prompt-side estimator returns more,
+    the prompt has more `image` placeholder tokens than the encoder will
+    fill, and `_merge_multimodal_embeddings` raises `ValueError` deep in
+    the model forward.
+    """
+    ctx = build_model_context(
+        model_id,
+        mm_processor_kwargs={"do_pan_and_scan": True},
+        limit_mm_per_prompt={"image": 1},
+    )
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+
+    num_soft_tokens = processor.info._compute_num_soft_tokens(
+        image_width=image_width,
+        image_height=image_height,
+        max_soft_tokens=max_soft_tokens,
+    )
+
+    assert num_soft_tokens <= max_soft_tokens, (
+        f"_compute_num_soft_tokens returned {num_soft_tokens} for "
+        f"image_width={image_width}, image_height={image_height}, "
+        f"max_soft_tokens={max_soft_tokens} — exceeds the cap that the HF "
+        f"image processor enforces on its vision tower output. This is "
+        f"the placeholder/encoder count mismatch that crashes EngineCore."
+    )
+
+
 @pytest.mark.parametrize("model_id", [GEMMA4_MODEL_ID])
 def test_limit_mm_per_prompt(
     image_assets: ImageTestAssets,
diff --git a/vllm/model_executor/models/gemma4_mm.py b/vllm/model_executor/models/gemma4_mm.py
index cdc54609a652..9b2c54e27354 100644
--- a/vllm/model_executor/models/gemma4_mm.py
+++ b/vllm/model_executor/models/gemma4_mm.py
@@ -265,7 +265,14 @@ def _compute_num_soft_tokens(
         target_h = max(unit, int(math.floor(image_height * scale / unit)) * unit)
         target_w = max(unit, int(math.floor(image_width * scale / unit)) * unit)
         num_patches = (target_h // patch_size) * (target_w // patch_size)
-        return num_patches // (pooling_kernel_size**2)
+        # Clamp to ``max_soft_tokens``: for extreme aspect ratios (e.g. 3x900)
+        # the ``max(unit, ...)`` above pins one dim at ``unit`` while the
+        # other scales freely, which overshoots ``max_patches``. The HF
+        # Gemma 4 image processor caps its vision-tower output at
+        # ``max_soft_tokens``, so without this clamp the prompt-side
+        # placeholder count exceeds the encoder output and
+        # ``_merge_multimodal_embeddings`` crashes.
+        return min(num_patches // (pooling_kernel_size**2), max_soft_tokens)
 
     def get_image_repl(
         self,
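For reference, here is a minimal standalone sketch of the overshoot mechanism the clamp guards against. The constants (`patch_size`, `unit`, `pooling_kernel_size`), the scale rule, and the helper name `estimate_soft_tokens` are illustrative assumptions, not the real Gemma 4 configuration (which lives above the hunk shown in `gemma4_mm.py`), so the unclamped value below differs from the production 289; only the mechanism matches.

```python
# A sketch of why the estimator can overshoot and how min() restores the
# invariant. All constants and the scale formula are assumptions for
# illustration only.
import math


def estimate_soft_tokens(
    image_width: int,
    image_height: int,
    max_soft_tokens: int,
    *,
    patch_size: int = 14,          # assumed ViT patch size
    unit: int = 56,                # assumed snapping unit (4 * patch_size)
    pooling_kernel_size: int = 2,  # assumed 2x2 token pooling
    clamp: bool = True,
) -> int:
    # Assumed scaling rule: choose ``scale`` so the rescaled area fits the
    # patch budget implied by ``max_soft_tokens``.
    max_patches = max_soft_tokens * pooling_kernel_size**2
    budget_pixels = max_patches * patch_size**2
    scale = math.sqrt(budget_pixels / (image_width * image_height))

    # Snap each side down to a multiple of ``unit`` but never below ``unit``
    # itself. For extreme aspect ratios the short side gets pinned at
    # ``unit`` while the long side keeps its full scaled length, so the
    # patch count can exceed ``max_patches``.
    target_h = max(unit, int(math.floor(image_height * scale / unit)) * unit)
    target_w = max(unit, int(math.floor(image_width * scale / unit)) * unit)
    num_patches = (target_h // patch_size) * (target_w // patch_size)

    soft_tokens = num_patches // pooling_kernel_size**2
    return min(soft_tokens, max_soft_tokens) if clamp else soft_tokens


# With these assumed constants, a 900x3 image overshoots the 280-token cap
# unless clamped.
print(estimate_soft_tokens(900, 3, 280, clamp=False))  # 576, exceeds the cap
print(estimate_soft_tokens(900, 3, 280))               # 280, clamped
```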