From 4a1cff2e93f99f8cff648f382598f7648d3363ae Mon Sep 17 00:00:00 2001 From: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com> Date: Thu, 14 Aug 2025 03:00:44 +0000 Subject: [PATCH 1/4] add Codestral 22B v01 torch Signed-off-by: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com> --- .../accuracy/references/cnn_dailymail.yaml | 2 ++ .../defs/accuracy/references/gsm8k.yaml | 2 ++ .../defs/accuracy/references/mmlu.yaml | 2 ++ .../defs/accuracy/test_llm_api_pytorch.py | 24 +++++++++++++++++++ 4 files changed, 30 insertions(+) diff --git a/tests/integration/defs/accuracy/references/cnn_dailymail.yaml b/tests/integration/defs/accuracy/references/cnn_dailymail.yaml index 67781cd8d15..be16c07581b 100644 --- a/tests/integration/defs/accuracy/references/cnn_dailymail.yaml +++ b/tests/integration/defs/accuracy/references/cnn_dailymail.yaml @@ -45,6 +45,8 @@ microsoft/Phi-3.5-mini-instruct: - accuracy: 31.354 microsoft/Phi-4-mini-instruct: - accuracy: 32.921 +codestral/codestral-22b-v0.1: + - accuracy: 28.90 state-spaces/mamba-130m-hf: - accuracy: 19.470 lmsys/vicuna-7b-v1.3: diff --git a/tests/integration/defs/accuracy/references/gsm8k.yaml b/tests/integration/defs/accuracy/references/gsm8k.yaml index 26de82cbc09..95986804ee5 100644 --- a/tests/integration/defs/accuracy/references/gsm8k.yaml +++ b/tests/integration/defs/accuracy/references/gsm8k.yaml @@ -159,6 +159,8 @@ microsoft/Phi-4-multimodal-instruct-long-rope: - accuracy: 75.85 microsoft/Phi-4-mini-instruct: - accuracy: 82.30 +codestral/codestral-22b-v0.1: + - accuracy: 28.90 GPT-OSS/BF16: - accuracy: 90.3 GPT-OSS/MXFP4: diff --git a/tests/integration/defs/accuracy/references/mmlu.yaml b/tests/integration/defs/accuracy/references/mmlu.yaml index 7f2bb55e6f7..278cd5e540d 100644 --- a/tests/integration/defs/accuracy/references/mmlu.yaml +++ b/tests/integration/defs/accuracy/references/mmlu.yaml @@ -229,6 +229,8 @@ nvidia/Nemotron-H-56B-Base-8K: accuracy: 83.82 microsoft/Phi-4-mini-instruct: - accuracy: 68.98 +codestral/codestral-22b-v0.1: + - accuracy: 28.90 # Created a dummy accuracy to track tp_size=2 for phi4-mini model. # TODO: update once https://nvbugs/5393849 is fixed. microsoft/Phi-4-mini-instruct-tp2: diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index 89483fd2620..ae699c6f421 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -2431,6 +2431,30 @@ def test_auto_dtype(self): task.evaluate(llm) +class TestStarCoder2_7B(LlmapiAccuracyTestHarness): + MODEL_NAME = "bigcode/starcoder2-7b" + MODEL_PATH = f"{llm_models_root()}/starcoder2-7b" + + def test_auto_dtype(self): + with LLM(self.MODEL_PATH) as llm: + task = MMLU(self.MODEL_NAME) + task.evaluate(llm) + + +class TestCodestral_22B_V01(LlmapiAccuracyTestHarness): + MODEL_NAME = "codestral/codestral-22b-v0.1" + MODEL_PATH = f"{llm_models_root()}/codestral-22b-v0.1" + + def test_auto_dtype(self): + with LLM(self.MODEL_PATH) as llm: + task = CnnDailymail(self.MODEL_NAME) + task.evaluate(llm) + task = MMLU(self.MODEL_NAME) + task.evaluate(llm) + task = GSM8K(self.MODEL_NAME) + task.evaluate(llm) + + class TestKanana_Instruct(LlmapiAccuracyTestHarness): MODEL_NAME = "kanana-1.5-2.1b-instruct-2505" MODEL_PATH = f"{llm_models_root()}/kanana-1.5-2.1b-instruct-2505" From f8a2db3e066aa038fce8c1e7fb9e27bcd28eae4e Mon Sep 17 00:00:00 2001 From: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com> Date: Thu, 14 Aug 2025 12:39:18 +0000 Subject: [PATCH 2/4] add codestral 2b code Signed-off-by: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com> --- .../accuracy/references/cnn_dailymail.yaml | 6 +++-- .../defs/accuracy/references/gsm8k.yaml | 4 +-- .../defs/accuracy/references/mmlu.yaml | 6 +++-- .../integration/defs/accuracy/test_llm_api.py | 26 +++++++++++++++++++ .../defs/accuracy/test_llm_api_pytorch.py | 9 ++++--- 5 files changed, 42 insertions(+), 9 deletions(-) diff --git a/tests/integration/defs/accuracy/references/cnn_dailymail.yaml b/tests/integration/defs/accuracy/references/cnn_dailymail.yaml index be16c07581b..aff3d1c31e5 100644 --- a/tests/integration/defs/accuracy/references/cnn_dailymail.yaml +++ b/tests/integration/defs/accuracy/references/cnn_dailymail.yaml @@ -45,8 +45,10 @@ microsoft/Phi-3.5-mini-instruct: - accuracy: 31.354 microsoft/Phi-4-mini-instruct: - accuracy: 32.921 -codestral/codestral-22b-v0.1: - - accuracy: 28.90 +mistralai/Codestral-22B-v0.1: + - accuracy: 30.316 + - quant_algo: FP8 + accuracy: 30.316 state-spaces/mamba-130m-hf: - accuracy: 19.470 lmsys/vicuna-7b-v1.3: diff --git a/tests/integration/defs/accuracy/references/gsm8k.yaml b/tests/integration/defs/accuracy/references/gsm8k.yaml index 95986804ee5..36ab9ae5997 100644 --- a/tests/integration/defs/accuracy/references/gsm8k.yaml +++ b/tests/integration/defs/accuracy/references/gsm8k.yaml @@ -159,8 +159,8 @@ microsoft/Phi-4-multimodal-instruct-long-rope: - accuracy: 75.85 microsoft/Phi-4-mini-instruct: - accuracy: 82.30 -codestral/codestral-22b-v0.1: - - accuracy: 28.90 +mistralai/Codestral-22B-v0.1: + - accuracy: 67.10 GPT-OSS/BF16: - accuracy: 90.3 GPT-OSS/MXFP4: diff --git a/tests/integration/defs/accuracy/references/mmlu.yaml b/tests/integration/defs/accuracy/references/mmlu.yaml index 278cd5e540d..34619f07816 100644 --- a/tests/integration/defs/accuracy/references/mmlu.yaml +++ b/tests/integration/defs/accuracy/references/mmlu.yaml @@ -229,8 +229,10 @@ nvidia/Nemotron-H-56B-Base-8K: accuracy: 83.82 microsoft/Phi-4-mini-instruct: - accuracy: 68.98 -codestral/codestral-22b-v0.1: - - accuracy: 28.90 +mistralai/Codestral-22B-v0.1: + - accuracy: 61.72 + - quant_algo: FP8 + accuracy: 61.72 # Created a dummy accuracy to track tp_size=2 for phi4-mini model. # TODO: update once https://nvbugs/5393849 is fixed. microsoft/Phi-4-mini-instruct-tp2: diff --git a/tests/integration/defs/accuracy/test_llm_api.py b/tests/integration/defs/accuracy/test_llm_api.py index f34bcdb5be4..c44a9e52fab 100644 --- a/tests/integration/defs/accuracy/test_llm_api.py +++ b/tests/integration/defs/accuracy/test_llm_api.py @@ -433,3 +433,29 @@ def test_auto_dtype(self): speculative_config=self.speculative_config) as llm: task = CnnDailymail(self.MODEL_NAME) task.evaluate(llm) + + +class TestCodestral_22B_V01(LlmapiAccuracyTestHarness): + MODEL_NAME = "mistralai/Codestral-22B-v0.1" + MODEL_PATH = f"{llm_models_root()}/Codestral-22B-v0.1" + kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6) + + @pytest.mark.skip_less_device_memory(80000) + def test_auto_dtype(self): + with LLM(self.MODEL_PATH, kv_cache_config=self.kv_cache_config) as llm: + task = CnnDailymail(self.MODEL_NAME) + task.evaluate(llm) + task = MMLU(self.MODEL_NAME) + task.evaluate(llm) + + @skip_pre_ada + @pytest.mark.skip_less_device_memory(80000) + def test_fp8(self): + quant_config = QuantConfig(QuantAlgo.FP8) + with LLM(self.MODEL_PATH, + quant_config=quant_config, + kv_cache_config=self.kv_cache_config) as llm: + task = CnnDailymail(self.MODEL_NAME) + task.evaluate(llm) + task = MMLU(self.MODEL_NAME) + task.evaluate(llm) diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index ae699c6f421..cf4de0438bc 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -2442,11 +2442,14 @@ def test_auto_dtype(self): class TestCodestral_22B_V01(LlmapiAccuracyTestHarness): - MODEL_NAME = "codestral/codestral-22b-v0.1" - MODEL_PATH = f"{llm_models_root()}/codestral-22b-v0.1" + MODEL_NAME = "mistralai/Codestral-22B-v0.1" + MODEL_PATH = f"{llm_models_root()}/Codestral-22B-v0.1" + @pytest.mark.timeout(2400) + @pytest.mark.skip_less_device_memory(80000) def test_auto_dtype(self): - with LLM(self.MODEL_PATH) as llm: + kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6) + with LLM(self.MODEL_PATH, kv_cache_config=kv_cache_config) as llm: task = CnnDailymail(self.MODEL_NAME) task.evaluate(llm) task = MMLU(self.MODEL_NAME) From 001b8e0a526ccbba3968c557bbac9150b1c615a8 Mon Sep 17 00:00:00 2001 From: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com> Date: Fri, 15 Aug 2025 07:30:03 +0000 Subject: [PATCH 3/4] Add starcoder 2-7b Signed-off-by: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com> --- .../accuracy/references/cnn_dailymail.yaml | 4 +++ .../defs/accuracy/references/mmlu.yaml | 4 +++ .../integration/defs/accuracy/test_llm_api.py | 26 +++++++++++++++++++ .../defs/accuracy/test_llm_api_pytorch.py | 10 ------- .../test_lists/qa/llm_function_nim.txt | 5 ++++ 5 files changed, 39 insertions(+), 10 deletions(-) diff --git a/tests/integration/defs/accuracy/references/cnn_dailymail.yaml b/tests/integration/defs/accuracy/references/cnn_dailymail.yaml index aff3d1c31e5..5d5adcbf9b4 100644 --- a/tests/integration/defs/accuracy/references/cnn_dailymail.yaml +++ b/tests/integration/defs/accuracy/references/cnn_dailymail.yaml @@ -45,6 +45,10 @@ microsoft/Phi-3.5-mini-instruct: - accuracy: 31.354 microsoft/Phi-4-mini-instruct: - accuracy: 32.921 +bigcode/starcoder2-7b: + - accuracy: 26.611 + - quant_algo: FP8 + accuracy: 26.611 mistralai/Codestral-22B-v0.1: - accuracy: 30.316 - quant_algo: FP8 diff --git a/tests/integration/defs/accuracy/references/mmlu.yaml b/tests/integration/defs/accuracy/references/mmlu.yaml index 34619f07816..c8bacfeaf46 100644 --- a/tests/integration/defs/accuracy/references/mmlu.yaml +++ b/tests/integration/defs/accuracy/references/mmlu.yaml @@ -229,6 +229,10 @@ nvidia/Nemotron-H-56B-Base-8K: accuracy: 83.82 microsoft/Phi-4-mini-instruct: - accuracy: 68.98 +bigcode/starcoder2-7b: + - accuracy: 41.35 + - quant_algo: FP8 + accuracy: 41.35 mistralai/Codestral-22B-v0.1: - accuracy: 61.72 - quant_algo: FP8 diff --git a/tests/integration/defs/accuracy/test_llm_api.py b/tests/integration/defs/accuracy/test_llm_api.py index c44a9e52fab..321591228bd 100644 --- a/tests/integration/defs/accuracy/test_llm_api.py +++ b/tests/integration/defs/accuracy/test_llm_api.py @@ -435,6 +435,32 @@ def test_auto_dtype(self): task.evaluate(llm) +class TestStarCoder2_7B(LlmapiAccuracyTestHarness): + MODEL_NAME = "bigcode/starcoder2-7b" + MODEL_PATH = f"{llm_models_root()}/starcoder2-7b" + kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6) + + @pytest.mark.skip_less_device_memory(70000) + def test_auto_dtype(self): + with LLM(self.MODEL_PATH, kv_cache_config=self.kv_cache_config) as llm: + task = CnnDailymail(self.MODEL_NAME) + task.evaluate(llm) + task = MMLU(self.MODEL_NAME) + task.evaluate(llm) + + @skip_pre_ada + @pytest.mark.skip_less_device_memory(70000) + def test_fp8(self): + quant_config = QuantConfig(QuantAlgo.FP8) + with LLM(self.MODEL_PATH, + quant_config=quant_config, + kv_cache_config=self.kv_cache_config) as llm: + task = CnnDailymail(self.MODEL_NAME) + task.evaluate(llm) + task = MMLU(self.MODEL_NAME) + task.evaluate(llm) + + class TestCodestral_22B_V01(LlmapiAccuracyTestHarness): MODEL_NAME = "mistralai/Codestral-22B-v0.1" MODEL_PATH = f"{llm_models_root()}/Codestral-22B-v0.1" diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index cf4de0438bc..9dcbf34bc87 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -2431,16 +2431,6 @@ def test_auto_dtype(self): task.evaluate(llm) -class TestStarCoder2_7B(LlmapiAccuracyTestHarness): - MODEL_NAME = "bigcode/starcoder2-7b" - MODEL_PATH = f"{llm_models_root()}/starcoder2-7b" - - def test_auto_dtype(self): - with LLM(self.MODEL_PATH) as llm: - task = MMLU(self.MODEL_NAME) - task.evaluate(llm) - - class TestCodestral_22B_V01(LlmapiAccuracyTestHarness): MODEL_NAME = "mistralai/Codestral-22B-v0.1" MODEL_PATH = f"{llm_models_root()}/Codestral-22B-v0.1" diff --git a/tests/integration/test_lists/qa/llm_function_nim.txt b/tests/integration/test_lists/qa/llm_function_nim.txt index 4fe1fd3ab00..90b6406806b 100644 --- a/tests/integration/test_lists/qa/llm_function_nim.txt +++ b/tests/integration/test_lists/qa/llm_function_nim.txt @@ -21,3 +21,8 @@ accuracy/test_llm_api_pytorch.py::TestNemotronH_56B_Base::test_auto_dtype[tp8-cu accuracy/test_llm_api_pytorch.py::TestNemotronUltra::test_auto_dtype[tp8ep4-cuda_graph=True] accuracy/test_llm_api_pytorch.py::TestNemotronUltra::test_fp8_prequantized[tp8ep4-cuda_graph=True] accuracy/test_llm_api_pytorch.py::TestNemotronUltra::test_fp8_prequantized[tp8-cuda_graph=True] +accuracy/test_llm_api.py::TestStarCoder2_7B::test_auto_dtype +accuracy/test_llm_api.py::TestStarCoder2_7B::test_fp8 +accuracy/test_llm_api.py::TestCodestral_22B_V01::test_auto_dtype +accuracy/test_llm_api.py::TestCodestral_22B_V01::test_fp8 +accuracy/test_llm_api_pytorch.py::TestCodestral_22B_V01::test_auto_dtype From 4ab7dd703506275b3721089a34c5f68fa27ceed2 Mon Sep 17 00:00:00 2001 From: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com> Date: Fri, 15 Aug 2025 08:12:55 +0000 Subject: [PATCH 4/4] remove unnecessary timeout issues Signed-off-by: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com> --- tests/integration/defs/accuracy/test_llm_api_pytorch.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index 9dcbf34bc87..b3c1c50ca5d 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -2435,7 +2435,6 @@ class TestCodestral_22B_V01(LlmapiAccuracyTestHarness): MODEL_NAME = "mistralai/Codestral-22B-v0.1" MODEL_PATH = f"{llm_models_root()}/Codestral-22B-v0.1" - @pytest.mark.timeout(2400) @pytest.mark.skip_less_device_memory(80000) def test_auto_dtype(self): kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6)