Merged
76 changes: 76 additions & 0 deletions docs/evaluation/scientific-knowledge.md
@@ -100,7 +100,83 @@ We also tested a variant where the full generation output was provided to the judge

The reported number for `simpleqa-gpt-oss-120b-notool` is 13.1% according to this [kaggle page](https://www.kaggle.com/benchmarks/deepmind/simpleqa-verified).

### FrontierScience-Olympiad

- Benchmark is defined in [`nemo_skills/dataset/frontierscience-olympiad/__init__.py`](https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/dataset/frontierscience-olympiad/__init__.py)
- Original benchmark source is [here](https://huggingface.co/datasets/openai/frontierscience).
- Contains 100 short-answer questions crafted by international science olympiad medalists across physics, chemistry, and biology.
- Available splits: `physics`, `chemistry`, `biology`, and `all` (all subjects combined, default).

#### Configuration: `gpt-oss-20b` with builtin tool (python)

```python
from nemo_skills.pipeline.cli import wrap_arguments, eval

eval(
    ctx=wrap_arguments(
        "++inference.temperature=1.0 ++inference.tokens_to_generate=65536 "
        "++code_tags=gpt-oss ++server.code_execution.max_code_executions=100 "
        "++inference.endpoint_type=text ++chat_template_kwargs.builtin_tools=[python] "
        "++chat_template_kwargs.reasoning_effort=high ++code_execution=true"
    ),
    cluster="slurm",
    expname="ghb-model_gpt_oss_20b",
    model="openai/gpt-oss-20b",
    server_type="vllm",
    server_gpus=4,
    server_args="--async-scheduling",
    benchmarks="frontierscience-olympiad:20",
    split="all",
    num_chunks=1,
    output_dir="/workspace/frontierscience-ghb-model_gpt_oss_20b",
    with_sandbox=True,
    wandb_project="frontier",
    wandb_name="frontierscience-ghb-model_gpt_oss_20b",
    judge_model="openai/gpt-oss-120b",
    judge_server_type="vllm",
    judge_server_gpus=8,
    judge_server_args="--async-scheduling",
)
```


#### Configuration: `gpt-oss-120b` without tool

```python
from nemo_skills.pipeline.cli import wrap_arguments, eval

eval(
    ctx=wrap_arguments(
        "++inference.temperature=1.0 ++inference.tokens_to_generate=65536 "
        "++inference.extra_body.reasoning_effort=high"
    ),
    cluster="slurm",
    expname="ghn-model_gpt_oss_120b",
    model="openai/gpt-oss-120b",
    server_type="vllm",
    server_gpus=8,
    server_args="--async-scheduling",
    benchmarks="frontierscience-olympiad:20",
    split="all",
    num_chunks=1,
    output_dir="/workspace/frontierscience-ghn-model_gpt_oss_120b",
    wandb_project="frontier",
    wandb_name="frontierscience-ghn-model_gpt_oss_120b",
    judge_model="openai/gpt-oss-120b",
    judge_server_type="vllm",
    judge_server_gpus=8,
    judge_server_args="--async-scheduling",
)
```

#### Results

| Run Name | pass@1 | majority@8 | pass@8 |
|:------------------------------------------|---------:|-------------:|---------:|
| gpt-oss-20b (no tool) | 49.74 | 47.00 | 71.98 |
| gpt-oss-20b (with python tool) | 36.94 | 37.38 | 73.61 |
| gpt-oss-120b (no tool) | 60.53 | 61.13 | 79.25 |
| gpt-oss-120b (with python tool) | 54.05 | 53.00 | 80.07 |
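
As a sanity check on how the `majority@8` column is aggregated, here is a minimal sketch of majority voting over k sampled answers (illustrative only; `majority_at_k` is a hypothetical helper, not NeMo-Skills' actual implementation):

```python
from collections import Counter

def majority_at_k(answers):
    """Pick the most frequent answer among k samples (ties resolved by first occurrence)."""
    return Counter(answers).most_common(1)[0][0]

# Eight hypothetical sampled answers for one problem
samples = ["6.7", "6.7", "6.8", "6.7", "6.8", "6.7", "6.9", "6.7"]
print(majority_at_k(samples))  # → 6.7
```

`pass@8`, by contrast, credits a problem if any one of the eight samples is judged correct.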

### SuperGPQA

27 changes: 27 additions & 0 deletions nemo_skills/dataset/frontierscience-olympiad/__init__.py
@@ -0,0 +1,27 @@
# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# settings that define how evaluation should be done by default (all can be changed from cmdline)
DATASET_GROUP = "math"
METRICS_TYPE = "frontierscience-olympiad"
GENERATION_ARGS = "++prompt_config=generic/default ++eval_type=math"
EVAL_SPLIT = "all"


JUDGE_PIPELINE_ARGS = {
    "model": "o3-mini-2025-01-31",
    "server_type": "openai",
    "server_address": "https://api.openai.com/v1",
}
JUDGE_ARGS = "++prompt_config=judge/frontierscience-olympiad ++generation_key=judgement ++add_generation_stats=False"
103 changes: 103 additions & 0 deletions nemo_skills/dataset/frontierscience-olympiad/prepare.py
@@ -0,0 +1,103 @@
# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import json
import re
from pathlib import Path

import requests
from tqdm import tqdm

OLYMPIAD_URL = "https://huggingface.co/datasets/openai/frontierscience/resolve/main/olympiad/test.jsonl"

# Map of available subjects
SUBJECTS = ["chemistry", "biology", "physics"]


def format_entry(entry, problem_index):
    """Format entry for nemo-skills from FrontierScience Olympiad dataset."""
    answer = entry.get("answer", "")
    # Remove surrounding backticks (handles `, ``, ```, etc.)
    answer = re.sub(r"^`+|`+$", "", answer).strip()

    formatted = {
        "id": f"olympiad-{problem_index}",
        "question": entry.get("problem", ""),
        "expected_answer": answer,
        "subset_for_metrics": entry.get("subject", ""),
        "task_group_id": entry.get("task_group_id", ""),
    }

    return formatted
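
The regex in `format_entry` strips any run of surrounding backticks before the answer is stored; a standalone illustration (the `strip_backticks` name exists only in this sketch):

```python
import re

def strip_backticks(answer):
    # Same pattern as format_entry: remove any leading/trailing run of backticks
    return re.sub(r"^`+|`+$", "", answer).strip()

print(strip_backticks("``C6H12O6``"))  # → C6H12O6
```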


def write_data_to_file(output_file, data, subject_filter=None):
    """Write formatted data to JSONL file."""
    count = 0
    with open(output_file, "wt", encoding="utf-8") as fout:
        for entry in tqdm(data, desc=f"Writing {output_file.name}"):
            # Filter by subject if specified
            if subject_filter and entry.get("subject", "").lower() != subject_filter:
                continue
            # Number kept entries with the running count so IDs stay sequential
            # within each subject file
            formatted_entry = format_entry(entry, count)
Comment on lines +50 to +54

**Review comment (Contributor):** [P2] IDs in a filtered subject file should come from the running `count`, not the full-dataset `enumerate` index. Otherwise, physics problems at dataset positions 0, 5, 10 are written as `olympiad-0`, `olympiad-5`, `olympiad-10` instead of the contiguous `olympiad-0`, `olympiad-1`, `olympiad-2`. Pass `count` to `format_entry()` and increment it only for entries that are not skipped.
            json.dump(formatted_entry, fout)
            fout.write("\n")
            count += 1
    return count
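
The filter-and-count loop can be exercised in isolation; this self-contained sketch (with a simplified `write_filtered` stand-in that numbers kept entries with a running count) shows that IDs stay contiguous within a filtered subject:

```python
import io
import json

def write_filtered(fout, data, subject_filter=None):
    # Skip non-matching subjects; number kept entries with a running count
    count = 0
    for entry in data:
        if subject_filter and entry.get("subject", "").lower() != subject_filter:
            continue
        fout.write(json.dumps({"id": f"olympiad-{count}", "question": entry.get("problem", "")}) + "\n")
        count += 1
    return count

entries = [
    {"subject": "physics", "problem": "p0"},
    {"subject": "biology", "problem": "b0"},
    {"subject": "physics", "problem": "p1"},
]
buf = io.StringIO()
n = write_filtered(buf, entries, subject_filter="physics")
print(n)  # → 2
```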


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--split",
        default="all",
        choices=["all"] + SUBJECTS,
        help="Dataset split to process (all/chemistry/biology/physics).",
    )
    args = parser.parse_args()

    # Load the FrontierScience olympiad dataset directly from HuggingFace
    print(f"Downloading FrontierScience olympiad dataset from {OLYMPIAD_URL}...")

    try:
        response = requests.get(OLYMPIAD_URL, timeout=30)
        # Fail fast on HTTP errors (404/500) instead of parsing an error page as JSONL
        response.raise_for_status()
**Review comment (Contributor):** logic: the download needs `response.raise_for_status()` after the GET; without an HTTP status check, a 404/500 error page would be parsed as JSONL and fail with a cryptic error.
    except Exception as e:
        raise RuntimeError(f"Error downloading dataset from {OLYMPIAD_URL}: {e}")


    # Parse JSONL data
    olympiad_data = []
    for line in response.text.strip().split("\n"):
        if line:
            olympiad_data.append(json.loads(line))

    print(f"Loaded {len(olympiad_data)} olympiad problems")

    data_dir = Path(__file__).absolute().parent
    data_dir.mkdir(exist_ok=True)

    if args.split == "all":
        applied_subjects = SUBJECTS
    else:
        applied_subjects = [args.split]

    # Process all subjects separately
    for subject in applied_subjects:
        output_file = data_dir / f"{subject}.jsonl"
        count = write_data_to_file(output_file, olympiad_data, subject_filter=subject)
        print(f"Saved {count} {subject} entries to {output_file}")

    if args.split == "all":
        # Also create a combined all.jsonl with all problems
        output_file = data_dir / "all.jsonl"
        count = write_data_to_file(output_file, olympiad_data)
        print(f"Saved {count} total entries to {output_file}")
3 changes: 3 additions & 0 deletions nemo_skills/evaluation/metrics/map_metrics.py
@@ -46,6 +46,9 @@
METRICS_MAP = {
    "math": MathMetrics,
    "hle": functools.partial(MathMetrics, compute_no_answer=False, answer_key="generation"),
    "frontierscience-olympiad": functools.partial(
        MathMetrics, compute_no_answer=False, question_key="question", answer_key="generation"
    ),
    "simpleqa": SimpleQAMetrics,
    "lean4-proof": Lean4Metrics,
    "lean4-statement": Lean4Metrics,
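
The `functools.partial` entries above pre-bind constructor keyword arguments so the map stores ready-to-call factories; a minimal sketch with a stand-in class (`Metrics` and `FrontierMetrics` are illustrative names, not the real `MathMetrics`):

```python
import functools

class Metrics:
    """Stand-in for a metrics class configured via constructor kwargs."""
    def __init__(self, compute_no_answer=True, question_key="problem", answer_key="predicted_answer"):
        self.compute_no_answer = compute_no_answer
        self.question_key = question_key
        self.answer_key = answer_key

# Pre-bind the benchmark-specific settings; the map stores this factory uncalled
FrontierMetrics = functools.partial(
    Metrics, compute_no_answer=False, question_key="question", answer_key="generation"
)

m = FrontierMetrics()
print(m.question_key, m.answer_key)  # → question generation
```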
7 changes: 5 additions & 2 deletions nemo_skills/evaluation/metrics/math_metrics.py
@@ -25,8 +25,11 @@
class MathMetrics(BaseMetrics):
    # TODO: how can we ensure that user-defined aggregations have all the same metrics as in base?

    def __init__(
        self, compute_no_answer: bool = True, question_key: str = "problem", answer_key: str = "predicted_answer"
    ):
        super().__init__(compute_no_answer=compute_no_answer)
        self.question_key = question_key
        self.answer_key = answer_key

    def _compute_reward_at_k(self, predictions: list[dict]):
@@ -113,7 +116,7 @@ def update(self, predictions):
            "Question: %s\nPredicted answer: %s\nExpected answer: %s\nLLM reasoning: %s\n",
            correctness_dict["symbolic_correct"],
            correctness_dict["judge_correct"],
            prediction[self.question_key],
            prediction[self.answer_key],
            prediction["expected_answer"],
            prediction["judgement"],
23 changes: 23 additions & 0 deletions nemo_skills/prompt/config/judge/frontierscience-olympiad.yaml
@@ -0,0 +1,23 @@
# Source: https://cdn.openai.com/pdf/2fcd284c-b468-4c21-8ee0-7a783933efcc/frontierscience-paper.pdf Page 13
user: |-
  You are grading an attempted answer to a science olympiad problem. You will be given the
  problem, attempted answer, and reference answer. Evaluate the solution against the provided
  reference solution, ensuring it is complete and matches the reference solution. Pay close
  attention to detail and grade it strictly, but fairly.
  The reference answer is either a single number or expression in latex formatting, a chemical
  formula, a compound name, or a phrase referring to a specific name, entity, or method.
  Mark the attempted answer as correct if it fully matches the reference answer or is otherwise
  equivalent (e.g., an equivalent algebraic expression, a numerical number within 1 decimal
  place rounding of the reference answer (e.g., 6.69 ≈ 6.7), an equivalent name for a
  compound/formula, equivalent when accounting for units, etc.). Mark it as incorrect if it is
  not equivalent to the reference answer.
  ***
  The problem: {question}
  ***
  The reference answer: {expected_answer}
  ***
  The attempted answer: {generation}
  ***
  First, think step-by-step about whether the attempted answer matches the reference answer.
  If the attempted answer is correct, write "Judgement: YES" in the last line of your
  response, with no other text or formatting. If it is incorrect, write "Judgement: NO".
**Review comment (Contributor):** syntax: missing closing `</output>` tag at end of YAML multiline string — won't parse correctly. The `user: |-` block started on line 2 needs to be closed; check other judge configs like `hle.yaml` for reference.
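
Downstream, the judge's final verdict line can be recovered with a small parser; this is a hypothetical sketch (`parse_judgement` is not the repository's actual extraction code, which is configured via `++generation_key=judgement`):

```python
import re

def parse_judgement(judge_output):
    """Return True/False for a final 'Judgement: YES/NO' line, or None if absent."""
    match = re.search(r"Judgement:\s*(YES|NO)\s*$", judge_output.strip(), re.IGNORECASE)
    if match is None:
        return None
    return match.group(1).upper() == "YES"

print(parse_judgement("The values agree within rounding.\nJudgement: YES"))  # → True
```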