diff --git a/docs/evaluation/scientific-knowledge.md b/docs/evaluation/scientific-knowledge.md index 618d6a0076..b9f21f84fc 100644 --- a/docs/evaluation/scientific-knowledge.md +++ b/docs/evaluation/scientific-knowledge.md @@ -100,7 +100,83 @@ We also tested a variant where the full generation output was provided to the ju The reported number for `simpleqa-gpt-oss-120b-notool` is 13.1% according to this [kaggle page](https://www.kaggle.com/benchmarks/deepmind/simpleqa-verified). +### FrontierScience-Olympiad +- Benchmark is defined in [`nemo_skills/dataset/frontierscience-olympiad/__init__.py`](https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/dataset/frontierscience-olympiad/__init__.py) +- Original benchmark source is [here](https://huggingface.co/datasets/openai/frontierscience). +- Contains 100 short-answer questions crafted by international science olympiad medalists across physics, chemistry, and biology. +- Available splits: `physics`, `chemistry`, `biology`, and `all` (all subjects combined, default). 
+ +#### Configuration: `gpt-oss-20b` with builtin tool (python) + +```python +from nemo_skills.pipeline.cli import wrap_arguments, eval + +eval( + ctx=wrap_arguments( + "++inference.temperature=1.0 ++inference.tokens_to_generate=65536 " + "++code_tags=gpt-oss ++server.code_execution.max_code_executions=100 " + "++inference.endpoint_type=text ++chat_template_kwargs.builtin_tools=[python] " + "++chat_template_kwargs.reasoning_effort=high ++code_execution=true" + ), + cluster="slurm", + expname="ghb-model_gpt_oss_20b", + model="openai/gpt-oss-20b", + server_type="vllm", + server_gpus=4, + server_args="--async-scheduling", + benchmarks="frontierscience-olympiad:20", + split="all", + num_chunks=1, + output_dir="/workspace/frontierscience-ghb-model_gpt_oss_20b", + with_sandbox=True, + wandb_project="frontier", + wandb_name="frontierscience-ghb-model_gpt_oss_20b", + judge_model="openai/gpt-oss-120b", + judge_server_type="vllm", + judge_server_gpus=8, + judge_server_args="--async-scheduling", +) +``` + + +#### Configuration: `gpt-oss-120b` without tool + +```python +from nemo_skills.pipeline.cli import wrap_arguments, eval + +eval( + ctx=wrap_arguments( + "++inference.temperature=1.0 ++inference.tokens_to_generate=65536 " + "++inference.extra_body.reasoning_effort=high" + ), + cluster="slurm", + expname="ghn-model_gpt_oss_120b", + model="openai/gpt-oss-120b", + server_type="vllm", + server_gpus=8, + server_args="--async-scheduling", + benchmarks="frontierscience-olympiad:20", + split="all", + num_chunks=1, + output_dir="/workspace/frontierscience-ghn-model_gpt_oss_120b", + wandb_project="frontier", + wandb_name="frontierscience-ghn-model_gpt_oss_120b", + judge_model="openai/gpt-oss-120b", + judge_server_type="vllm", + judge_server_gpus=8, + judge_server_args="--async-scheduling", +) +``` + +#### Result + +| Run Name | pass@1 | majority@8 | pass@8 | +|:------------------------------------------|---------:|-------------:|---------:| +| gpt-oss-20b (no tool) | 49.74 | 47.00 | 
71.98 |
+| gpt-oss-20b (with python tool) | 36.94 | 37.38 | 73.61 |
+| gpt-oss-120b (no tool) | 60.53 | 61.13 | 79.25 |
+| gpt-oss-120b (with python tool) | 54.05 | 53.00 | 80.07 |
 
 ### SuperGPQA
 
diff --git a/nemo_skills/dataset/frontierscience-olympiad/__init__.py b/nemo_skills/dataset/frontierscience-olympiad/__init__.py
new file mode 100644
index 0000000000..ec00e343be
--- /dev/null
+++ b/nemo_skills/dataset/frontierscience-olympiad/__init__.py
@@ -0,0 +1,27 @@
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# settings that define how evaluation should be done by default (all can be changed from cmdline)
+DATASET_GROUP = "math"
+METRICS_TYPE = "frontierscience-olympiad"
+GENERATION_ARGS = "++prompt_config=generic/default ++eval_type=math"
+EVAL_SPLIT = "all"
+
+# default judge: the model name, the server type and the API endpoint it is reached through
+JUDGE_PIPELINE_ARGS = {
+    "model": "o3-mini-2025-01-31",
+    "server_type": "openai",
+    "server_address": "https://api.openai.com/v1",
+}
+JUDGE_ARGS = "++prompt_config=judge/frontierscience-olympiad ++generation_key=judgement ++add_generation_stats=False"
diff --git a/nemo_skills/dataset/frontierscience-olympiad/prepare.py b/nemo_skills/dataset/frontierscience-olympiad/prepare.py
new file mode 100644
index 0000000000..33a8810c1c
--- /dev/null
+++ b/nemo_skills/dataset/frontierscience-olympiad/prepare.py
@@ -0,0 +1,103 @@
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import json
+import re
+from pathlib import Path
+
+import requests
+from tqdm import tqdm
+
+OLYMPIAD_URL = "https://huggingface.co/datasets/openai/frontierscience/resolve/main/olympiad/test.jsonl"
+
+# Subjects present in the dataset; each one is also the name of a split file
+SUBJECTS = ["chemistry", "biology", "physics"]
+
+
+def format_entry(entry, problem_index):
+    """Convert one raw FrontierScience Olympiad record into a nemo-skills entry.
+
+    Missing fields default to "" and the id is derived from ``problem_index``.
+    """
+    # Trim whitespace *before* removing surrounding backticks (handles `, ``, ```, etc.),
+    # otherwise a trailing space or newline hides the closing backticks from the `+$ anchor
+    answer = re.sub(r"^`+|`+$", "", entry.get("answer", "").strip()).strip()
+    return {
+        "id": f"olympiad-{problem_index}",
+        "question": entry.get("problem", ""),
+        "expected_answer": answer,
+        "subset_for_metrics": entry.get("subject", ""),
+        "task_group_id": entry.get("task_group_id", ""),
+    }
+
+
+def write_data_to_file(output_file, data, subject_filter=None):
+    """Write formatted entries to a JSONL file and return the number written.
+
+    When ``subject_filter`` is set, only entries whose lower-cased subject equals it are kept.
+    """
+    count = 0
+    with open(output_file, "wt", encoding="utf-8") as fout:
+        for idx, entry in enumerate(tqdm(data, desc=f"Writing {output_file.name}")):
+            if subject_filter and entry.get("subject", "").lower() != subject_filter:
+                continue
+            # idx counts every record, so an entry keeps the same id in every split file
+            json.dump(format_entry(entry, idx), fout)
+            fout.write("\n")
+            count += 1
+    return count
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--split",
+        default="all",
+        choices=["all"] + SUBJECTS,
+        help="Dataset split to process (all/chemistry/biology/physics).",
+    )
+    args = parser.parse_args()
+
+    # Load the FrontierScience olympiad dataset directly from HuggingFace
+    print(f"Downloading FrontierScience olympiad dataset from {OLYMPIAD_URL}...")
+
+    try:
+        response = requests.get(OLYMPIAD_URL, timeout=30)
+        # Fail on HTTP 4xx/5xx instead of parsing an error page as JSONL
+        response.raise_for_status()
+    except requests.RequestException as e:
+        raise RuntimeError(f"Error downloading dataset from {OLYMPIAD_URL}: {e}") from e
+
+    # JSON is UTF-8; set it explicitly so decoding does not depend on the Content-Type header
+    response.encoding = "utf-8"
+    # Parse JSONL data, skipping blank lines
+    olympiad_data = [json.loads(line) for line in response.text.split("\n") if line.strip()]
+
+    print(f"Loaded {len(olympiad_data)} olympiad problems")
+
+    data_dir = Path(__file__).absolute().parent
+    data_dir.mkdir(exist_ok=True)
+
+    applied_subjects = SUBJECTS if args.split == "all" else [args.split]
+    # Process all subjects separately
+    for subject in applied_subjects:
+        output_file = data_dir / f"{subject}.jsonl"
+        count = write_data_to_file(output_file, olympiad_data, subject_filter=subject)
+        print(f"Saved {count} {subject} entries to {output_file}")
+    if args.split == "all":
+        # Also create a combined all.jsonl with all problems
+        output_file = data_dir / "all.jsonl"
+        count = write_data_to_file(output_file, olympiad_data)
+        print(f"Saved {count} total entries to {output_file}")
diff --git a/nemo_skills/evaluation/metrics/map_metrics.py b/nemo_skills/evaluation/metrics/map_metrics.py
index 1cfb24dea2..f7dfafb85e 100644
--- a/nemo_skills/evaluation/metrics/map_metrics.py
+++ b/nemo_skills/evaluation/metrics/map_metrics.py
@@ -46,6 +46,9 @@ METRICS_MAP = {
     "math": MathMetrics,
     "hle": functools.partial(MathMetrics, compute_no_answer=False, answer_key="generation"),
+    "frontierscience-olympiad": functools.partial(
+        MathMetrics, compute_no_answer=False, question_key="question", answer_key="generation"
+    ),
     "simpleqa": SimpleQAMetrics,
     "lean4-proof": Lean4Metrics,
     "lean4-statement": Lean4Metrics,
diff --git a/nemo_skills/evaluation/metrics/math_metrics.py b/nemo_skills/evaluation/metrics/math_metrics.py
index 9c9f0cf136..8d5cf72c54 100644
--- a/nemo_skills/evaluation/metrics/math_metrics.py
+++ b/nemo_skills/evaluation/metrics/math_metrics.py
@@ -25,8 +25,11 @@ class MathMetrics(BaseMetrics):
     # TODO: how can we ensure that user-defined aggregations have all the same metrics as in base?
- def __init__(self, compute_no_answer: bool = True, answer_key: str = "predicted_answer"): + def __init__( + self, compute_no_answer: bool = True, question_key: str = "problem", answer_key: str = "predicted_answer" + ): super().__init__(compute_no_answer=compute_no_answer) + self.question_key = question_key self.answer_key = answer_key def _compute_reward_at_k(self, predictions: list[dict]): @@ -113,7 +116,7 @@ def update(self, predictions): "Question: %s\nPredicted answer: %s\nExpected answer: %s\nLLM reasoning: %s\n", correctness_dict["symbolic_correct"], correctness_dict["judge_correct"], - prediction["problem"], + prediction[self.question_key], prediction[self.answer_key], prediction["expected_answer"], prediction["judgement"], diff --git a/nemo_skills/prompt/config/judge/frontierscience-olympiad.yaml b/nemo_skills/prompt/config/judge/frontierscience-olympiad.yaml new file mode 100644 index 0000000000..f4c6bfa43d --- /dev/null +++ b/nemo_skills/prompt/config/judge/frontierscience-olympiad.yaml @@ -0,0 +1,23 @@ +# Source: https://cdn.openai.com/pdf/2fcd284c-b468-4c21-8ee0-7a783933efcc/frontierscience-paper.pdf Page 13 +user: |- + You are grading an attempted answer to a science olympiad problem. You will be given the + problem, attempted answer, and reference answer. Evaluate the solution against the provided + reference solution, ensuring it is complete and matches the reference solution. Pay close + attention to detail and grade it strictly, but fairly. + The reference answer is either a single number or expression in latex formatting, a chemical + formula, a compound name, or a phrase referring to a specific name, entity, or method. 
+ Mark the attempted answer as correct if it fully matches the reference answer or is otherwise + equivalent (e.g., an equivalent algebraic expression, a numerical number within 1 decimal + place rounding of the reference answer (e.g., 6.69 ≈ 6.7), an equivalent name for a + compound/formula, equivalent when accounting for units, etc.). Mark it as incorrect if it is + not equivalent to the reference answer. + *** + The problem: {question} + *** + The reference answer: {expected_answer} + *** + The attempted answer: {generation} + *** + First, think step-by-step about whether the attempted answer matches the reference answer. + If the attempted answer is correct, write "Judgement: YES" in the last line of your + response, with no other text or formatting. If it is incorrect, write "Judgement: NO".