Skip to content

Commit 66a15bf

Browse files
2ez4bz authored and lancelly committed
[fix] Switch placement of image placeholder for mistral 3.1 (NVIDIA#6435)
Signed-off-by: William Zhang <[email protected]>
Signed-off-by: Lanyu Liao <[email protected]>
1 parent ae968a0 commit 66a15bf

File tree

2 files changed

+9
-4
lines changed

2 files changed

+9
-4
lines changed

tensorrt_llm/inputs/utils.py

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -254,10 +254,12 @@ class MultimodalPlaceholderPlacement(enum.Enum):
254254
"mllama": MultimodalPlaceholderPlacement.BEFORE_TEXT,
255255
"hyperclovax_vlm": MultimodalPlaceholderPlacement.AFTER_TEXT,
256256
"gemma3": MultimodalPlaceholderPlacement.BEFORE_TEXT,
257-
# NOTE: for mistral3 multimodal models, it does not strictly have to be after the text.
257+
# NOTE: for mistral3 multimodal models, it does not strictly have to be before the text.
258258
# Ref: https://github.com/mistralai/mistral-common/blob/039465db2bdc0486df36365c9bdb428188482a18/
259259
# src/mistral_common/tokens/tokenizers/base.py#L326
260-
"mistral3": MultimodalPlaceholderPlacement.AFTER_TEXT,
260+
# However, accuracy tests show that the model generates higher quality output when the image
261+
# precedes the text (the relative difference can be as much as ~30% for both vLLM and TRT-LLM).
262+
"mistral3": MultimodalPlaceholderPlacement.BEFORE_TEXT,
261263
"phi4mm": MultimodalPlaceholderPlacement.BEFORE_TEXT,
262264
}
263265
assert len(PLACEHOLDER_PLACEMENT_MAP) == len(ALL_SUPPORTED_MULTIMODAL_MODELS)

tests/integration/defs/test_e2e.py

Lines changed: 5 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -2049,8 +2049,11 @@ def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path,
20492049
"mistral-small-3.1-24b-instruct": {
20502050
"image": [
20512051
["dramatic", "seascape", "ocean", "turbulent", "waves", "dark"],
2052-
["scenic", "rock", "landscape", "snow", "altitude"],
2053-
["highway", "traffic", "directions", "lanes", "Jurong"],
2052+
["scenic", "rock", "landscape", "monolith", "formation"],
2053+
[
2054+
"multi-lane", "highway", "moderate", "traffic", "flow",
2055+
"vehicles", "congestion"
2056+
],
20542057
],
20552058
"mixture_text_image":
20562059
[["invention", "person", "scientists", "Lick", "engineers"],

0 commit comments

Comments
 (0)