diff --git a/python/sglang/test/accuracy_test_runner.py b/python/sglang/test/accuracy_test_runner.py index c35dba6497b4..abe8cc3d59c1 100644 --- a/python/sglang/test/accuracy_test_runner.py +++ b/python/sglang/test/accuracy_test_runner.py @@ -29,6 +29,7 @@ class AccuracyTestParams: top_p: Optional[float] = None top_k: Optional[int] = None repeat: Optional[int] = None + api: Optional[str] = None # "chat" or "completion"; defaults to "chat" in run_eval @dataclass @@ -86,6 +87,7 @@ def _run_simple_eval( top_p: Optional[float] = None, top_k: Optional[int] = None, repeat: Optional[int] = None, + api: Optional[str] = None, ) -> Tuple[bool, Optional[str], Optional[dict]]: """Run evaluation using simple_eval backend (run_eval.py). @@ -110,6 +112,9 @@ def _run_simple_eval( num_threads=num_threads or 1024, ) + if api is not None: + args.api = api + if max_tokens is not None: args.max_tokens = max_tokens @@ -482,6 +487,7 @@ def run_accuracy_test( top_p=params.top_p, top_k=params.top_k, repeat=params.repeat, + api=params.api, ) if not success: diff --git a/test/registered/backends/test_flashinfer_trtllm_gen_moe_backend.py b/test/registered/backends/test_flashinfer_trtllm_gen_moe_backend.py index b63447a60cd5..76b47ffd4dee 100644 --- a/test/registered/backends/test_flashinfer_trtllm_gen_moe_backend.py +++ b/test/registered/backends/test_flashinfer_trtllm_gen_moe_backend.py @@ -59,7 +59,7 @@ def test_gsm8k(self): ) metrics = run_eval(args) print(f"{metrics=}") - self.assertGreater(metrics["score"], 0.93) + self.assertGreater(metrics["score"], 0.89) class FlashinferTrtllmGenMoeBackendBF16Base: diff --git a/test/registered/perf/test_dpsk_r1_fp4_4gpu_perf.py b/test/registered/perf/test_dpsk_r1_fp4_4gpu_perf.py index b03c34337d26..23714a8ce1fc 100644 --- a/test/registered/perf/test_dpsk_r1_fp4_4gpu_perf.py +++ b/test/registered/perf/test_dpsk_r1_fp4_4gpu_perf.py @@ -62,7 +62,10 @@ def test_deepseek_r1_fp4_all_variants(self): models=variants, test_name="DeepSeek-R1-0528-NVFP4-v2 Unified", accuracy_params=AccuracyTestParams( - dataset="gsm8k", baseline_accuracy=0.935 + dataset="gsm8k", + baseline_accuracy=0.935, + num_examples=200, + api="completion", ), performance_params=PerformanceTestParams( profile_dir="performance_profiles_deepseek_r1_fp4",