Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions python/sglang/test/accuracy_test_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ class AccuracyTestParams:
top_p: Optional[float] = None
top_k: Optional[int] = None
repeat: Optional[int] = None
api: Optional[str] = None # "chat" or "completion"; defaults to "chat" in run_eval


@dataclass
Expand Down Expand Up @@ -86,6 +87,7 @@ def _run_simple_eval(
top_p: Optional[float] = None,
top_k: Optional[int] = None,
repeat: Optional[int] = None,
api: Optional[str] = None,
) -> Tuple[bool, Optional[str], Optional[dict]]:
"""Run evaluation using simple_eval backend (run_eval.py).

Expand All @@ -110,6 +112,9 @@ def _run_simple_eval(
num_threads=num_threads or 1024,
)

if api is not None:
args.api = api

if max_tokens is not None:
args.max_tokens = max_tokens

Expand Down Expand Up @@ -482,6 +487,7 @@ def run_accuracy_test(
top_p=params.top_p,
top_k=params.top_k,
repeat=params.repeat,
api=params.api,
)

if not success:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def test_gsm8k(self):
)
metrics = run_eval(args)
print(f"{metrics=}")
self.assertGreater(metrics["score"], 0.93)
self.assertGreater(metrics["score"], 0.89)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The PR description mentions lowering the FP8 MoE backend baseline. Note that `FlashinferTrtllmGenMoeBackendMXFP8Base` (line 157) also uses FP8-based quantization (`mxfp8`) and currently keeps a GSM8K baseline of 0.93. If that model was also affected by the evaluation unification and the data-leakage fix, its baseline should likely be lowered as well to avoid potential CI failures.



class FlashinferTrtllmGenMoeBackendBF16Base:
Expand Down
5 changes: 4 additions & 1 deletion test/registered/perf/test_dpsk_r1_fp4_4gpu_perf.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,10 @@ def test_deepseek_r1_fp4_all_variants(self):
models=variants,
test_name="DeepSeek-R1-0528-NVFP4-v2 Unified",
accuracy_params=AccuracyTestParams(
dataset="gsm8k", baseline_accuracy=0.935
dataset="gsm8k",
baseline_accuracy=0.935,
num_examples=200,
api="completion",
),
performance_params=PerformanceTestParams(
profile_dir="performance_profiles_deepseek_r1_fp4",
Expand Down
Loading