[test] Add accuracy regression test for Mistral3.1 (#6322)

2ez4bz · web-flow · commit 60e4d3a9d487 · 2025-07-28T09:41:44.000-07:00
Signed-off-by: William Zhang <133824995+2ez4bz@users.noreply.github.com>
diff --git a/tests/integration/defs/accuracy/references/cnn_dailymail.yaml b/tests/integration/defs/accuracy/references/cnn_dailymail.yaml
@@ -188,6 +188,8 @@ mistralai/Mistral-7B-Instruct-v0.3:
     accuracy: 31.457
   - quant_algo: W4A8_AWQ
     accuracy: 31.201
+mistralai/Mistral-Small-3.1-24B-Instruct-2503:
+  - accuracy: 29.20
 mistralai/Mistral-Nemo-Base-2407:
   - quant_algo: FP8
     kv_cache_quant_algo: FP8
diff --git a/tests/integration/defs/accuracy/references/gsm8k.yaml b/tests/integration/defs/accuracy/references/gsm8k.yaml
@@ -122,5 +122,7 @@ mistralai/Ministral-8B-Instruct-2410:
   - quant_algo: FP8
     kv_cache_quant_algo: FP8
     accuracy: 78.35
+mistralai/Mistral-Small-3.1-24B-Instruct-2503:
+  - accuracy: 89.23
 microsoft/Phi-4-multimodal-instruct:
   - accuracy: 81.19
diff --git a/tests/integration/defs/accuracy/references/mmlu.yaml b/tests/integration/defs/accuracy/references/mmlu.yaml
@@ -95,6 +95,8 @@ mistralai/Mixtral-8x7B-Instruct-v0.1:
 mistralai/Mixtral-8x22B-v0.1:
   - quant_algo: FP8
     accuracy: 77.63
+mistralai/Mistral-Small-3.1-24B-Instruct-2503:
+  - accuracy: 81.7
 google/gemma-2-9b-it:
   - accuracy: 73.05
 google/gemma-3-27b-it:
diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -521,6 +521,20 @@ def test_auto_dtype(self):
             task.evaluate(llm)
 
 
+class TestMistralSmall24B(LlmapiAccuracyTestHarness):
+    MODEL_NAME = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
+    MODEL_PATH = f"{llm_models_root()}/Mistral-Small-3.1-24B-Instruct-2503"
+
+    def test_auto_dtype(self):
+        with LLM(self.MODEL_PATH) as llm:
+            task = CnnDailymail(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
+
 class TestMinistral8BInstruct(LlmapiAccuracyTestHarness):
     MODEL_NAME = "mistralai/Ministral-8B-Instruct-2410"
     MODEL_PATH = f"{llm_models_root()}/Ministral-8B-Instruct-2410"
diff --git a/tests/integration/test_lists/test-db/l0_h100.yml b/tests/integration/test_lists/test-db/l0_h100.yml
@@ -192,6 +192,7 @@ l0_h100:
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp=vanilla-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_no_kv_cache_reuse[quant_dtype=none-mtp_nextn=2-fp8kv=False-attention_dp=True-cuda_graph=True-overlap_scheduler=True]
   - accuracy/test_llm_api_pytorch.py::TestGemma3_27BInstruct::test_auto_dtype
+  - accuracy/test_llm_api_pytorch.py::TestMistralSmall24B::test_auto_dtype
   - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8_block_scales[latency]
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding[llguidance]
   - test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-True]