Skip to content

Commit dd07925

Browse files
committed
[fix] Switch placement of image placeholder for mistral 3.1
Signed-off-by: William Zhang <[email protected]>
1 parent 64ba483 commit dd07925

File tree

2 files changed

+8
-3
lines changed

2 files changed

+8
-3
lines changed

tensorrt_llm/inputs/utils.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -257,7 +257,9 @@ class MultimodalPlaceholderPlacement(enum.Enum):
257257
# NOTE: for mistral3 multimodal models, the image placeholder does not strictly have to be after the text.
258258
# Ref: https://github.com/mistralai/mistral-common/blob/039465db2bdc0486df36365c9bdb428188482a18/
259259
# src/mistral_common/tokens/tokenizers/base.py#L326
260-
"mistral3": MultimodalPlaceholderPlacement.AFTER_TEXT,
260+
# However, accuracy tests show that the model generates higher-quality output when the image
261+
# precedes the text (the relative difference can be as much as ~30% for both vLLM and TRT-LLM).
262+
"mistral3": MultimodalPlaceholderPlacement.BEFORE_TEXT,
261263
"phi4mm": MultimodalPlaceholderPlacement.BEFORE_TEXT,
262264
}
263265
assert len(PLACEHOLDER_PLACEMENT_MAP) == len(ALL_SUPPORTED_MULTIMODAL_MODELS)

tests/integration/defs/test_e2e.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2039,8 +2039,11 @@ def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path,
20392039
"mistral-small-3.1-24b-instruct": {
20402040
"image": [
20412041
["dramatic", "seascape", "ocean", "turbulent", "waves", "dark"],
2042-
["scenic", "rock", "landscape", "snow", "altitude"],
2043-
["highway", "traffic", "directions", "lanes", "Jurong"],
2042+
["scenic", "rock", "landscape", "monolith", "formation"],
2043+
[
2044+
"multi-lane", "highway", "moderate", "traffic", "flow",
2045+
"vehicles", "congestion"
2046+
],
20442047
],
20452048
},
20462049
"gemma-3-27b-it": {

0 commit comments

Comments
 (0)