From 4a1cff2e93f99f8cff648f382598f7648d3363ae Mon Sep 17 00:00:00 2001
From: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com>
Date: Thu, 14 Aug 2025 03:00:44 +0000
Subject: [PATCH 1/4] Add Codestral 22B v0.1 and StarCoder2 7B PyTorch accuracy tests

Signed-off-by: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com>
---
 .../accuracy/references/cnn_dailymail.yaml    |  2 ++
 .../defs/accuracy/references/gsm8k.yaml       |  2 ++
 .../defs/accuracy/references/mmlu.yaml        |  2 ++
 .../defs/accuracy/test_llm_api_pytorch.py     | 24 +++++++++++++++++++
 4 files changed, 30 insertions(+)

diff --git a/tests/integration/defs/accuracy/references/cnn_dailymail.yaml b/tests/integration/defs/accuracy/references/cnn_dailymail.yaml
index 67781cd8d15..be16c07581b 100644
--- a/tests/integration/defs/accuracy/references/cnn_dailymail.yaml
+++ b/tests/integration/defs/accuracy/references/cnn_dailymail.yaml
@@ -45,6 +45,8 @@ microsoft/Phi-3.5-mini-instruct:
   - accuracy: 31.354
 microsoft/Phi-4-mini-instruct:
   - accuracy: 32.921
+codestral/codestral-22b-v0.1:
+  - accuracy: 28.90
 state-spaces/mamba-130m-hf:
   - accuracy: 19.470
 lmsys/vicuna-7b-v1.3:
diff --git a/tests/integration/defs/accuracy/references/gsm8k.yaml b/tests/integration/defs/accuracy/references/gsm8k.yaml
index 26de82cbc09..95986804ee5 100644
--- a/tests/integration/defs/accuracy/references/gsm8k.yaml
+++ b/tests/integration/defs/accuracy/references/gsm8k.yaml
@@ -159,6 +159,8 @@ microsoft/Phi-4-multimodal-instruct-long-rope:
   - accuracy: 75.85
 microsoft/Phi-4-mini-instruct:
   - accuracy: 82.30
+codestral/codestral-22b-v0.1:
+  - accuracy: 28.90
 GPT-OSS/BF16:
   - accuracy: 90.3
 GPT-OSS/MXFP4:
diff --git a/tests/integration/defs/accuracy/references/mmlu.yaml b/tests/integration/defs/accuracy/references/mmlu.yaml
index 7f2bb55e6f7..278cd5e540d 100644
--- a/tests/integration/defs/accuracy/references/mmlu.yaml
+++ b/tests/integration/defs/accuracy/references/mmlu.yaml
@@ -229,6 +229,8 @@ nvidia/Nemotron-H-56B-Base-8K:
     accuracy: 83.82
 microsoft/Phi-4-mini-instruct:
   - accuracy: 68.98
+codestral/codestral-22b-v0.1:
+  - accuracy: 28.90
 # Created a dummy accuracy to track tp_size=2 for phi4-mini model.
 # TODO: update once https://nvbugs/5393849 is fixed.
 microsoft/Phi-4-mini-instruct-tp2:
diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index 89483fd2620..ae699c6f421 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -2431,6 +2431,30 @@ def test_auto_dtype(self):
             task.evaluate(llm)
 
 
+class TestStarCoder2_7B(LlmapiAccuracyTestHarness):
+    MODEL_NAME = "bigcode/starcoder2-7b"
+    MODEL_PATH = f"{llm_models_root()}/starcoder2-7b"
+
+    def test_auto_dtype(self):
+        with LLM(self.MODEL_PATH) as llm:
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+
+
+class TestCodestral_22B_V01(LlmapiAccuracyTestHarness):
+    MODEL_NAME = "codestral/codestral-22b-v0.1"
+    MODEL_PATH = f"{llm_models_root()}/codestral-22b-v0.1"
+
+    def test_auto_dtype(self):
+        with LLM(self.MODEL_PATH) as llm:
+            task = CnnDailymail(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
+
 class TestKanana_Instruct(LlmapiAccuracyTestHarness):
     MODEL_NAME = "kanana-1.5-2.1b-instruct-2505"
     MODEL_PATH = f"{llm_models_root()}/kanana-1.5-2.1b-instruct-2505"

From f8a2db3e066aa038fce8c1e7fb9e27bcd28eae4e Mon Sep 17 00:00:00 2001
From: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com>
Date: Thu, 14 Aug 2025 12:39:18 +0000
Subject: [PATCH 2/4] Add Codestral 22B v0.1 tests to test_llm_api.py and fix references

Signed-off-by: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com>
---
 .../accuracy/references/cnn_dailymail.yaml    |  6 +++--
 .../defs/accuracy/references/gsm8k.yaml       |  4 +--
 .../defs/accuracy/references/mmlu.yaml        |  6 +++--
 .../integration/defs/accuracy/test_llm_api.py | 26 +++++++++++++++++++
 .../defs/accuracy/test_llm_api_pytorch.py     |  9 ++++---
 5 files changed, 42 insertions(+), 9 deletions(-)

diff --git a/tests/integration/defs/accuracy/references/cnn_dailymail.yaml b/tests/integration/defs/accuracy/references/cnn_dailymail.yaml
index be16c07581b..aff3d1c31e5 100644
--- a/tests/integration/defs/accuracy/references/cnn_dailymail.yaml
+++ b/tests/integration/defs/accuracy/references/cnn_dailymail.yaml
@@ -45,8 +45,10 @@ microsoft/Phi-3.5-mini-instruct:
   - accuracy: 31.354
 microsoft/Phi-4-mini-instruct:
   - accuracy: 32.921
-codestral/codestral-22b-v0.1:
-  - accuracy: 28.90
+mistralai/Codestral-22B-v0.1:
+  - accuracy: 30.316
+  - quant_algo: FP8
+    accuracy: 30.316
 state-spaces/mamba-130m-hf:
   - accuracy: 19.470
 lmsys/vicuna-7b-v1.3:
diff --git a/tests/integration/defs/accuracy/references/gsm8k.yaml b/tests/integration/defs/accuracy/references/gsm8k.yaml
index 95986804ee5..36ab9ae5997 100644
--- a/tests/integration/defs/accuracy/references/gsm8k.yaml
+++ b/tests/integration/defs/accuracy/references/gsm8k.yaml
@@ -159,8 +159,8 @@ microsoft/Phi-4-multimodal-instruct-long-rope:
   - accuracy: 75.85
 microsoft/Phi-4-mini-instruct:
   - accuracy: 82.30
-codestral/codestral-22b-v0.1:
-  - accuracy: 28.90
+mistralai/Codestral-22B-v0.1:
+  - accuracy: 67.10
 GPT-OSS/BF16:
   - accuracy: 90.3
 GPT-OSS/MXFP4:
diff --git a/tests/integration/defs/accuracy/references/mmlu.yaml b/tests/integration/defs/accuracy/references/mmlu.yaml
index 278cd5e540d..34619f07816 100644
--- a/tests/integration/defs/accuracy/references/mmlu.yaml
+++ b/tests/integration/defs/accuracy/references/mmlu.yaml
@@ -229,8 +229,10 @@ nvidia/Nemotron-H-56B-Base-8K:
     accuracy: 83.82
 microsoft/Phi-4-mini-instruct:
   - accuracy: 68.98
-codestral/codestral-22b-v0.1:
-  - accuracy: 28.90
+mistralai/Codestral-22B-v0.1:
+  - accuracy: 61.72
+  - quant_algo: FP8
+    accuracy: 61.72
 # Created a dummy accuracy to track tp_size=2 for phi4-mini model.
 # TODO: update once https://nvbugs/5393849 is fixed.
 microsoft/Phi-4-mini-instruct-tp2:
diff --git a/tests/integration/defs/accuracy/test_llm_api.py b/tests/integration/defs/accuracy/test_llm_api.py
index f34bcdb5be4..c44a9e52fab 100644
--- a/tests/integration/defs/accuracy/test_llm_api.py
+++ b/tests/integration/defs/accuracy/test_llm_api.py
@@ -433,3 +433,29 @@ def test_auto_dtype(self):
                 speculative_config=self.speculative_config) as llm:
             task = CnnDailymail(self.MODEL_NAME)
             task.evaluate(llm)
+
+
+class TestCodestral_22B_V01(LlmapiAccuracyTestHarness):
+    MODEL_NAME = "mistralai/Codestral-22B-v0.1"
+    MODEL_PATH = f"{llm_models_root()}/Codestral-22B-v0.1"
+    kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6)
+
+    @pytest.mark.skip_less_device_memory(80000)
+    def test_auto_dtype(self):
+        with LLM(self.MODEL_PATH, kv_cache_config=self.kv_cache_config) as llm:
+            task = CnnDailymail(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+
+    @skip_pre_ada
+    @pytest.mark.skip_less_device_memory(80000)
+    def test_fp8(self):
+        quant_config = QuantConfig(QuantAlgo.FP8)
+        with LLM(self.MODEL_PATH,
+                 quant_config=quant_config,
+                 kv_cache_config=self.kv_cache_config) as llm:
+            task = CnnDailymail(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index ae699c6f421..cf4de0438bc 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -2442,11 +2442,14 @@ def test_auto_dtype(self):
 
 
 class TestCodestral_22B_V01(LlmapiAccuracyTestHarness):
-    MODEL_NAME = "codestral/codestral-22b-v0.1"
-    MODEL_PATH = f"{llm_models_root()}/codestral-22b-v0.1"
+    MODEL_NAME = "mistralai/Codestral-22B-v0.1"
+    MODEL_PATH = f"{llm_models_root()}/Codestral-22B-v0.1"
 
+    @pytest.mark.timeout(2400)
+    @pytest.mark.skip_less_device_memory(80000)
     def test_auto_dtype(self):
-        with LLM(self.MODEL_PATH) as llm:
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6)
+        with LLM(self.MODEL_PATH, kv_cache_config=kv_cache_config) as llm:
             task = CnnDailymail(self.MODEL_NAME)
             task.evaluate(llm)
             task = MMLU(self.MODEL_NAME)

From 001b8e0a526ccbba3968c557bbac9150b1c615a8 Mon Sep 17 00:00:00 2001
From: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com>
Date: Fri, 15 Aug 2025 07:30:03 +0000
Subject: [PATCH 3/4] Move StarCoder2 7B tests to test_llm_api.py and add QA list entries

Signed-off-by: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com>
---
 .../accuracy/references/cnn_dailymail.yaml    |  4 +++
 .../defs/accuracy/references/mmlu.yaml        |  4 +++
 .../integration/defs/accuracy/test_llm_api.py | 26 +++++++++++++++++++
 .../defs/accuracy/test_llm_api_pytorch.py     | 10 -------
 .../test_lists/qa/llm_function_nim.txt        |  5 ++++
 5 files changed, 39 insertions(+), 10 deletions(-)

diff --git a/tests/integration/defs/accuracy/references/cnn_dailymail.yaml b/tests/integration/defs/accuracy/references/cnn_dailymail.yaml
index aff3d1c31e5..5d5adcbf9b4 100644
--- a/tests/integration/defs/accuracy/references/cnn_dailymail.yaml
+++ b/tests/integration/defs/accuracy/references/cnn_dailymail.yaml
@@ -45,6 +45,10 @@ microsoft/Phi-3.5-mini-instruct:
   - accuracy: 31.354
 microsoft/Phi-4-mini-instruct:
   - accuracy: 32.921
+bigcode/starcoder2-7b:
+  - accuracy: 26.611
+  - quant_algo: FP8
+    accuracy: 26.611
 mistralai/Codestral-22B-v0.1:
   - accuracy: 30.316
   - quant_algo: FP8
diff --git a/tests/integration/defs/accuracy/references/mmlu.yaml b/tests/integration/defs/accuracy/references/mmlu.yaml
index 34619f07816..c8bacfeaf46 100644
--- a/tests/integration/defs/accuracy/references/mmlu.yaml
+++ b/tests/integration/defs/accuracy/references/mmlu.yaml
@@ -229,6 +229,10 @@ nvidia/Nemotron-H-56B-Base-8K:
     accuracy: 83.82
 microsoft/Phi-4-mini-instruct:
   - accuracy: 68.98
+bigcode/starcoder2-7b:
+  - accuracy: 41.35
+  - quant_algo: FP8
+    accuracy: 41.35
 mistralai/Codestral-22B-v0.1:
   - accuracy: 61.72
   - quant_algo: FP8
diff --git a/tests/integration/defs/accuracy/test_llm_api.py b/tests/integration/defs/accuracy/test_llm_api.py
index c44a9e52fab..321591228bd 100644
--- a/tests/integration/defs/accuracy/test_llm_api.py
+++ b/tests/integration/defs/accuracy/test_llm_api.py
@@ -435,6 +435,32 @@ def test_auto_dtype(self):
             task.evaluate(llm)
 
 
+class TestStarCoder2_7B(LlmapiAccuracyTestHarness):
+    MODEL_NAME = "bigcode/starcoder2-7b"
+    MODEL_PATH = f"{llm_models_root()}/starcoder2-7b"
+    kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6)
+
+    @pytest.mark.skip_less_device_memory(70000)
+    def test_auto_dtype(self):
+        with LLM(self.MODEL_PATH, kv_cache_config=self.kv_cache_config) as llm:
+            task = CnnDailymail(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+
+    @skip_pre_ada
+    @pytest.mark.skip_less_device_memory(70000)
+    def test_fp8(self):
+        quant_config = QuantConfig(QuantAlgo.FP8)
+        with LLM(self.MODEL_PATH,
+                 quant_config=quant_config,
+                 kv_cache_config=self.kv_cache_config) as llm:
+            task = CnnDailymail(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+
+
 class TestCodestral_22B_V01(LlmapiAccuracyTestHarness):
     MODEL_NAME = "mistralai/Codestral-22B-v0.1"
     MODEL_PATH = f"{llm_models_root()}/Codestral-22B-v0.1"
diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index cf4de0438bc..9dcbf34bc87 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -2431,16 +2431,6 @@ def test_auto_dtype(self):
             task.evaluate(llm)
 
 
-class TestStarCoder2_7B(LlmapiAccuracyTestHarness):
-    MODEL_NAME = "bigcode/starcoder2-7b"
-    MODEL_PATH = f"{llm_models_root()}/starcoder2-7b"
-
-    def test_auto_dtype(self):
-        with LLM(self.MODEL_PATH) as llm:
-            task = MMLU(self.MODEL_NAME)
-            task.evaluate(llm)
-
-
 class TestCodestral_22B_V01(LlmapiAccuracyTestHarness):
     MODEL_NAME = "mistralai/Codestral-22B-v0.1"
     MODEL_PATH = f"{llm_models_root()}/Codestral-22B-v0.1"
diff --git a/tests/integration/test_lists/qa/llm_function_nim.txt b/tests/integration/test_lists/qa/llm_function_nim.txt
index 4fe1fd3ab00..90b6406806b 100644
--- a/tests/integration/test_lists/qa/llm_function_nim.txt
+++ b/tests/integration/test_lists/qa/llm_function_nim.txt
@@ -21,3 +21,8 @@ accuracy/test_llm_api_pytorch.py::TestNemotronH_56B_Base::test_auto_dtype[tp8-cu
 accuracy/test_llm_api_pytorch.py::TestNemotronUltra::test_auto_dtype[tp8ep4-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestNemotronUltra::test_fp8_prequantized[tp8ep4-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestNemotronUltra::test_fp8_prequantized[tp8-cuda_graph=True]
+accuracy/test_llm_api.py::TestStarCoder2_7B::test_auto_dtype
+accuracy/test_llm_api.py::TestStarCoder2_7B::test_fp8
+accuracy/test_llm_api.py::TestCodestral_22B_V01::test_auto_dtype
+accuracy/test_llm_api.py::TestCodestral_22B_V01::test_fp8
+accuracy/test_llm_api_pytorch.py::TestCodestral_22B_V01::test_auto_dtype

From 4ab7dd703506275b3721089a34c5f68fa27ceed2 Mon Sep 17 00:00:00 2001
From: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com>
Date: Fri, 15 Aug 2025 08:12:55 +0000
Subject: [PATCH 4/4] Remove unnecessary timeout marker from Codestral test

Signed-off-by: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com>
---
 tests/integration/defs/accuracy/test_llm_api_pytorch.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index 9dcbf34bc87..b3c1c50ca5d 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -2435,7 +2435,6 @@ class TestCodestral_22B_V01(LlmapiAccuracyTestHarness):
     MODEL_NAME = "mistralai/Codestral-22B-v0.1"
     MODEL_PATH = f"{llm_models_root()}/Codestral-22B-v0.1"
 
-    @pytest.mark.timeout(2400)
     @pytest.mark.skip_less_device_memory(80000)
     def test_auto_dtype(self):
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6)