Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions python/sglang/test/accuracy_test_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
DEFAULT_URL_FOR_TEST,
ModelLaunchSettings,
dump_metric,
popen_launch_server,
write_github_step_summary,
)
Expand Down Expand Up @@ -421,6 +422,12 @@ def _run_nemo_skills_eval(
if score is None:
return False, "Could not parse accuracy from ns eval output", None

dump_metric(
f"{dataset}_score",
score,
labels={"model": model.model_path, "eval": dataset, "api": "nemo-skills"},
)

return True, None, {"score": score}

except subprocess.TimeoutExpired:
Expand Down
11 changes: 11 additions & 0 deletions python/sglang/test/kits/lm_eval_kit.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
import requests
import yaml

from sglang.test.test_utils import dump_metric


@contextmanager
def scoped_env_vars(new_env: dict[str, str] | None):
Expand Down Expand Up @@ -69,6 +71,15 @@ def test_lm_eval(self):
f"ground_truth={ground_truth:.3f} | "
f"measured={measured_value:.3f} | rtol={rtol}"
)
dump_metric(
f"{task['name']}_{metric['name']}",
measured_value,
labels={
"model": eval_config.get("model_name", ""),
"eval": "lm-eval",
"task": task["name"],
},
Comment on lines +77 to +81
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The labeling schema here is inconsistent with the other evaluation paths modified in this PR. In `mmmu_vlm_kit.py` and `accuracy_test_runner.py`, the `eval` label represents the benchmark/dataset name (e.g., `"mmmu"` or `"mmmu-pro"`) and the `api` label represents the framework/runner (e.g., `"lmms-eval"` or `"nemo-skills"`).

In this file, `eval` is set to `"lm-eval"` and the benchmark is stored in a separate `task` label. To maintain consistency across the metrics collected from different kits, consider using the task name for the `eval` label and adding an `api` label set to `"lm-eval"`.

Suggested change
- labels={
- "model": eval_config.get("model_name", ""),
- "eval": "lm-eval",
- "task": task["name"],
- },
+ labels={
+ "model": eval_config.get("model_name", ""),
+ "eval": task["name"],
+ "api": "lm-eval",
+ },

)
success = success and np.isclose(
ground_truth, measured_value, rtol=rtol
)
Expand Down
13 changes: 13 additions & 0 deletions python/sglang/test/kits/mmmu_vlm_kit.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
DEFAULT_URL_FOR_TEST,
CustomTestCase,
dump_metric,
popen_launch_server,
)

Expand Down Expand Up @@ -216,6 +217,12 @@ def test_mmmu(self: CustomTestCase):
mmmu_accuracy = result["results"]["mmmu_val"]["mmmu_acc,none"]
print(f"Model {self.model} achieved accuracy: {mmmu_accuracy:.4f}")

dump_metric(
"mmmu_score",
mmmu_accuracy,
labels={"model": self.model, "eval": "mmmu", "api": "lmms-eval"},
)

# Assert performance meets expected threshold
self.assertGreaterEqual(
mmmu_accuracy,
Expand Down Expand Up @@ -403,6 +410,12 @@ def _run_vlm_mmmu_test(
f"Model {model.model} achieved accuracy{test_name}: {mmmu_accuracy:.4f}"
)

dump_metric(
"mmmu_score",
mmmu_accuracy,
labels={"model": model.model, "eval": "mmmu", "api": "lmms-eval"},
)

# Capture server output if requested
if capture_output and process:
server_output = self._read_output_from_files()
Expand Down
Loading