[fix] Switch placement of image placeholder for mistral 3.1 (NVIDIA#6435)

2ez4bz · lancelly · commit 66a15bf68c6e · 2025-08-06T03:01:45.000Z
Signed-off-by: William Zhang <133824995+2ez4bz@users.noreply.github.com>
Signed-off-by: Lanyu Liao <lancelly@users.noreply.github.com>
diff --git a/tensorrt_llm/inputs/utils.py b/tensorrt_llm/inputs/utils.py
@@ -254,10 +254,12 @@ class MultimodalPlaceholderPlacement(enum.Enum):
     "mllama": MultimodalPlaceholderPlacement.BEFORE_TEXT,
     "hyperclovax_vlm": MultimodalPlaceholderPlacement.AFTER_TEXT,
     "gemma3": MultimodalPlaceholderPlacement.BEFORE_TEXT,
-    # NOTE: for mistral3 multimodal models, it does not strictly have to be after the text.
+    # NOTE: for mistral3 multimodal models, it does not strictly have to be before the text.
     # Ref: https://github.com/mistralai/mistral-common/blob/039465db2bdc0486df36365c9bdb428188482a18/
     #      src/mistral_common/tokens/tokenizers/base.py#L326
-    "mistral3": MultimodalPlaceholderPlacement.AFTER_TEXT,
+    # However, accuracy tests show that the model generates higher quality output when the image
+    # precedes the text (the relative difference can be as much as ~30% for both vLLM and TRT-LLM).
+    "mistral3": MultimodalPlaceholderPlacement.BEFORE_TEXT,
     "phi4mm": MultimodalPlaceholderPlacement.BEFORE_TEXT,
 }
 assert len(PLACEHOLDER_PLACEMENT_MAP) == len(ALL_SUPPORTED_MULTIMODAL_MODELS)
diff --git a/tests/integration/defs/test_e2e.py b/tests/integration/defs/test_e2e.py
@@ -2049,8 +2049,11 @@ def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path,
         "mistral-small-3.1-24b-instruct": {
             "image": [
                 ["dramatic", "seascape", "ocean", "turbulent", "waves", "dark"],
-                ["scenic", "rock", "landscape", "snow", "altitude"],
-                ["highway", "traffic", "directions", "lanes", "Jurong"],
+                ["scenic", "rock", "landscape", "monolith", "formation"],
+                [
+                    "multi-lane", "highway", "moderate", "traffic", "flow",
+                    "vehicles", "congestion"
+                ],
             ],
             "mixture_text_image":
             [["invention", "person", "scientists", "Lick", "engineers"],