Commit ba5bdbb

[None][chore] Disable add special tokens for Llama3.3 70B (#6482)
Signed-off-by: Chenfei Zhang <[email protected]>
1 parent 147ad69 commit ba5bdbb
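
The change itself: the Llama 3.3 70B accuracy tests now evaluate with a SamplingParams that sets add_special_tokens=False, because the evaluator prompts already carry the chat-template special tokens; letting the tokenizer add them again prepends a second BOS, which is consistent with the reference-accuracy bumps in the YAML updates below. A minimal sketch of the underlying tokenizer behavior, assuming a Hugging Face Llama-3 tokenizer is available locally; it is illustrative only and not part of this commit:

# The chat template already emits <|begin_of_text|>; encoding the rendered
# prompt with add_special_tokens=True prepends a second BOS token.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("meta-llama/Llama-3.3-70B-Instruct")
prompt = tok.apply_chat_template(
    [{"role": "user", "content": "2 + 2 = ?"}],
    tokenize=False,
    add_generation_prompt=True,
)  # the string already starts with "<|begin_of_text|>"

ids_dup = tok(prompt, add_special_tokens=True).input_ids
ids_ok = tok(prompt, add_special_tokens=False).input_ids
print(ids_dup[:2])  # expected [128000, 128000] on Llama-3: BOS duplicated
print(ids_ok[:1])   # expected [128000]: single BOS from the template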

File tree

3 files changed: +20 -8 lines

tests/integration/defs/accuracy/references/gsm8k.yaml

Lines changed: 5 additions & 4 deletions
@@ -11,19 +11,20 @@ meta-llama/Llama-3.3-70B-Instruct:
   - accuracy: 83.78
   - quant_algo: NVFP4
     kv_cache_quant_algo: FP8
-    accuracy: 75.61
+    accuracy: 88.70
   - quant_algo: FP8
-    accuracy: 83.30
+    kv_cache_quant_algo: FP8
+    accuracy: 84.08
 meta-llama/Llama-4-Maverick-17B-128E-Instruct:
   - accuracy: 92.20
 meta-llama/Llama-4-Scout-17B-16E-Instruct:
   - accuracy: 89.70
   - quant_algo: NVFP4
     kv_cache_quant_algo: FP8
-    accuracy: 79.62
+    accuracy: 88.61
   - quant_algo: FP8
     kv_cache_quant_algo: FP8
-    accuracy: 80.37
+    accuracy: 89.45
 deepseek-ai/DeepSeek-V3-Lite:
   - accuracy: 64.74
   - quant_algo: NVFP4
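
For context, each list entry under a model key pairs a quantization configuration with its expected score. A hypothetical sketch of how such an entry could be matched; the real lookup lives in the accuracy-test harness and is not shown here:

import yaml

def lookup_reference(path, model, quant_algo=None, kv_cache_quant_algo=None):
    # Return the reference accuracy for a (model, quantization) combination.
    with open(path) as f:
        refs = yaml.safe_load(f)
    for entry in refs[model]:
        if (entry.get("quant_algo") == quant_algo
                and entry.get("kv_cache_quant_algo") == kv_cache_quant_algo):
            return entry["accuracy"]
    raise KeyError(f"no reference for {model} with quant_algo={quant_algo}")

# e.g. the FP8 + FP8-KV entry updated above:
# lookup_reference("gsm8k.yaml", "meta-llama/Llama-3.3-70B-Instruct",
#                  "FP8", "FP8")  -> 84.08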

tests/integration/defs/accuracy/references/mmlu.yaml

Lines changed: 1 addition & 0 deletions
@@ -63,6 +63,7 @@ meta-llama/Llama-3.3-70B-Instruct:
     kv_cache_quant_algo: FP8
     accuracy: 79.31
   - quant_algo: FP8
+    kv_cache_quant_algo: FP8
     accuracy: 81.02
 meta-llama/Llama-4-Maverick-17B-128E-Instruct:
   - accuracy: 86.40

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 14 additions & 4 deletions
@@ -374,12 +374,17 @@ def test_fp8_tp4(self):
                  max_batch_size=32,
                  kv_cache_config=kv_cache_config) as llm:
             assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
+            sampling_params = SamplingParams(
+                temperature=0.0,
+                add_special_tokens=False,
+            )
             task = MMLU(self.MODEL_NAME)
-            task.evaluate(llm)
+            task.evaluate(llm, sampling_params=sampling_params)
             task = GSM8K(self.MODEL_NAME)
-            task.evaluate(llm)
+            task.evaluate(llm, sampling_params=sampling_params)
             task = GPQADiamond(self.MODEL_NAME)
             task.evaluate(llm,
+                          sampling_params=sampling_params,
                           extra_evaluator_kwargs=dict(apply_chat_template=True))

     @pytest.mark.skip_less_device(4)
@@ -388,12 +393,17 @@ def test_nvfp4_tp4(self):
         model_path = f"{llm_models_root()}/modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp4"
         with LLM(model_path, tensor_parallel_size=4) as llm:
             assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
+            sampling_params = SamplingParams(
+                temperature=0.0,
+                add_special_tokens=False,
+            )
             task = MMLU(self.MODEL_NAME)
-            task.evaluate(llm)
+            task.evaluate(llm, sampling_params=sampling_params)
             task = GSM8K(self.MODEL_NAME)
-            task.evaluate(llm)
+            task.evaluate(llm, sampling_params=sampling_params)
             task = GPQADiamond(self.MODEL_NAME)
             task.evaluate(llm,
+                          sampling_params=sampling_params,
                           extra_evaluator_kwargs=dict(apply_chat_template=True))

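The same pattern outside the test harness, as a standalone sketch against the TensorRT-LLM LLM API; the model path and prompt are placeholders, and temperature=0.0 keeps decoding greedy so accuracy runs are reproducible:

from tensorrt_llm import LLM, SamplingParams

# Greedy decoding; the evaluator renders chat-template prompts itself, so the
# tokenizer must not prepend another BOS.
sampling_params = SamplingParams(
    temperature=0.0,
    add_special_tokens=False,
)

with LLM(model="meta-llama/Llama-3.3-70B-Instruct",
         tensor_parallel_size=4) as llm:
    outputs = llm.generate(["<|begin_of_text|>...rendered prompt..."],
                           sampling_params)
    print(outputs[0].outputs[0].text)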