Commit ba5bdbb

[None][chore] Disable add special tokens for Llama3.3 70B (#6482)
Signed-off-by: Chenfei Zhang <[email protected]>
1 parent 147ad69 commit ba5bdbb
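
The change itself: the Llama 3.3 70B accuracy tests now evaluate with a SamplingParams that sets add_special_tokens=False, because the evaluator prompts already carry the chat-template special tokens; letting the tokenizer add them again prepends a second BOS, which is consistent with the reference-accuracy bumps in the YAML updates below. A minimal sketch of the underlying tokenizer behavior, assuming a Hugging Face Llama-3 tokenizer is available locally; it is illustrative only and not part of this commit:

# The chat template already emits <|begin_of_text|>; encoding the rendered
# prompt with add_special_tokens=True prepends a second BOS token.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("meta-llama/Llama-3.3-70B-Instruct")
prompt = tok.apply_chat_template(
    [{"role": "user", "content": "2 + 2 = ?"}],
    tokenize=False,
    add_generation_prompt=True,
)  # the string already starts with "<|begin_of_text|>"

ids_dup = tok(prompt, add_special_tokens=True).input_ids
ids_ok = tok(prompt, add_special_tokens=False).input_ids
print(ids_dup[:2])  # expected [128000, 128000] on Llama-3: BOS duplicated
print(ids_ok[:1])   # expected [128000]: single BOS from the template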

File tree

3 files changed: +20 -8 lines

tests/integration/defs/accuracy/references/gsm8k.yaml

Lines changed: 5 additions & 4 deletions
@@ -11,19 +11,20 @@ meta-llama/Llama-3.3-70B-Instruct:
   - accuracy: 83.78
   - quant_algo: NVFP4
     kv_cache_quant_algo: FP8
-    accuracy: 75.61
+    accuracy: 88.70
   - quant_algo: FP8
-    accuracy: 83.30
+    kv_cache_quant_algo: FP8
+    accuracy: 84.08
 meta-llama/Llama-4-Maverick-17B-128E-Instruct:
   - accuracy: 92.20
 meta-llama/Llama-4-Scout-17B-16E-Instruct:
   - accuracy: 89.70
   - quant_algo: NVFP4
     kv_cache_quant_algo: FP8
-    accuracy: 79.62
+    accuracy: 88.61
   - quant_algo: FP8
     kv_cache_quant_algo: FP8
-    accuracy: 80.37
+    accuracy: 89.45
 deepseek-ai/DeepSeek-V3-Lite:
   - accuracy: 64.74
   - quant_algo: NVFP4
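
For context, each list entry under a model key pairs a quantization configuration with its expected score. A hypothetical sketch of how such an entry could be matched; the real lookup lives in the accuracy-test harness and is not shown here:

import yaml

def lookup_reference(path, model, quant_algo=None, kv_cache_quant_algo=None):
    # Return the reference accuracy for a (model, quantization) combination.
    with open(path) as f:
        refs = yaml.safe_load(f)
    for entry in refs[model]:
        if (entry.get("quant_algo") == quant_algo
                and entry.get("kv_cache_quant_algo") == kv_cache_quant_algo):
            return entry["accuracy"]
    raise KeyError(f"no reference for {model} with quant_algo={quant_algo}")

# e.g. the FP8 + FP8-KV entry updated above:
# lookup_reference("gsm8k.yaml", "meta-llama/Llama-3.3-70B-Instruct",
#                  "FP8", "FP8")  -> 84.08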

tests/integration/defs/accuracy/references/mmlu.yaml

Lines changed: 1 addition & 0 deletions
@@ -63,6 +63,7 @@ meta-llama/Llama-3.3-70B-Instruct:
     kv_cache_quant_algo: FP8
     accuracy: 79.31
   - quant_algo: FP8
+    kv_cache_quant_algo: FP8
     accuracy: 81.02
 meta-llama/Llama-4-Maverick-17B-128E-Instruct:
   - accuracy: 86.40

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 14 additions & 4 deletions
@@ -374,12 +374,17 @@ def test_fp8_tp4(self):
                  max_batch_size=32,
                  kv_cache_config=kv_cache_config) as llm:
             assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
+            sampling_params = SamplingParams(
+                temperature=0.0,
+                add_special_tokens=False,
+            )
             task = MMLU(self.MODEL_NAME)
-            task.evaluate(llm)
+            task.evaluate(llm, sampling_params=sampling_params)
             task = GSM8K(self.MODEL_NAME)
-            task.evaluate(llm)
+            task.evaluate(llm, sampling_params=sampling_params)
             task = GPQADiamond(self.MODEL_NAME)
             task.evaluate(llm,
+                          sampling_params=sampling_params,
                           extra_evaluator_kwargs=dict(apply_chat_template=True))

     @pytest.mark.skip_less_device(4)
@@ -388,12 +393,17 @@ def test_nvfp4_tp4(self):
         model_path = f"{llm_models_root()}/modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp4"
         with LLM(model_path, tensor_parallel_size=4) as llm:
             assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
+            sampling_params = SamplingParams(
+                temperature=0.0,
+                add_special_tokens=False,
+            )
             task = MMLU(self.MODEL_NAME)
-            task.evaluate(llm)
+            task.evaluate(llm, sampling_params=sampling_params)
             task = GSM8K(self.MODEL_NAME)
-            task.evaluate(llm)
+            task.evaluate(llm, sampling_params=sampling_params)
             task = GPQADiamond(self.MODEL_NAME)
             task.evaluate(llm,
+                          sampling_params=sampling_params,
                           extra_evaluator_kwargs=dict(apply_chat_template=True))

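The same pattern outside the test harness, as a standalone sketch against the TensorRT-LLM LLM API; the model path and prompt are placeholders, and temperature=0.0 keeps decoding greedy so accuracy runs are reproducible:

from tensorrt_llm import LLM, SamplingParams

# Greedy decoding; the evaluator renders chat-template prompts itself, so the
# tokenizer must not prepend another BOS.
sampling_params = SamplingParams(
    temperature=0.0,
    add_special_tokens=False,
)

with LLM(model="meta-llama/Llama-3.3-70B-Instruct",
         tensor_parallel_size=4) as llm:
    outputs = llm.generate(["<|begin_of_text|>...rendered prompt..."],
                           sampling_params)
    print(outputs[0].outputs[0].text)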