[fix] Switch placement of image placeholder for mistral 3.1

2ez4bz · 2ez4bz · commit dd07925568ac · 2025-07-28T23:02:48.000-07:00
Signed-off-by: William Zhang <133824995+2ez4bz@users.noreply.github.com>
diff --git a/tensorrt_llm/inputs/utils.py b/tensorrt_llm/inputs/utils.py
@@ -257,7 +257,9 @@ class MultimodalPlaceholderPlacement(enum.Enum):
     # NOTE: for mistral3 multimodal models, it does not strictly have to be after the text.
     # Ref: https://github.com/mistralai/mistral-common/blob/039465db2bdc0486df36365c9bdb428188482a18/
     #      src/mistral_common/tokens/tokenizers/base.py#L326
-    "mistral3": MultimodalPlaceholderPlacement.AFTER_TEXT,
+    # However, accuracy tests show that the model generates higher quality output when the image
+    # precedes the text (the relative difference can be as much as ~30% for both vLLM and TRT-LLM).
+    "mistral3": MultimodalPlaceholderPlacement.BEFORE_TEXT,
     "phi4mm": MultimodalPlaceholderPlacement.BEFORE_TEXT,
 }
 assert len(PLACEHOLDER_PLACEMENT_MAP) == len(ALL_SUPPORTED_MULTIMODAL_MODELS)
diff --git a/tests/integration/defs/test_e2e.py b/tests/integration/defs/test_e2e.py
@@ -2039,8 +2039,11 @@ def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path,
         "mistral-small-3.1-24b-instruct": {
             "image": [
                 ["dramatic", "seascape", "ocean", "turbulent", "waves", "dark"],
-                ["scenic", "rock", "landscape", "snow", "altitude"],
-                ["highway", "traffic", "directions", "lanes", "Jurong"],
+                ["scenic", "rock", "landscape", "monolith", "formation"],
+                [
+                    "multi-lane", "highway", "moderate", "traffic", "flow",
+                    "vehicles", "congestion"
+                ],
             ],
         },
         "gemma-3-27b-it": {