docs/evaluation/natural-math.md (19 additions, 1 deletion)
@@ -232,4 +232,22 @@ In either case you can always customize the judge prompt by setting a new `++pro
### beyond-aime

- Benchmark is defined in [`nemo_skills/dataset/beyond-aime/__init__.py`](https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/dataset/beyond-aime/__init__.py)
- Original benchmark source is [here](https://huggingface.co/datasets/ByteDance-Seed/BeyondAIME).

### imo-answerbench

- Benchmark is defined in [`nemo_skills/dataset/imo-answerbench/__init__.py`](https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/dataset/imo-answerbench/__init__.py)
- Original benchmark source is [here](https://github.com/google-deepmind/superhuman/tree/main/imobench).
- Part of IMO-Bench from [this paper](https://arxiv.org/abs/2511.01846). Tests mathematical answer equivalence using LLM-as-a-judge.

### imo-proofbench

- Benchmark is defined in [`nemo_skills/dataset/imo-proofbench/__init__.py`](https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/dataset/imo-proofbench/__init__.py)
- Original benchmark source is [here](https://github.com/google-deepmind/superhuman/tree/main/imobench).
- Part of IMO-Bench from [this paper](https://arxiv.org/abs/2511.01846). Evaluates full mathematical proofs using LLM-as-a-judge with a detailed rubric.

### imo-gradingbench

- Benchmark is defined in [`nemo_skills/dataset/imo-gradingbench/__init__.py`](https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/dataset/imo-gradingbench/__init__.py)
- Original benchmark source is [here](https://github.com/google-deepmind/superhuman/tree/main/imobench).
- Part of IMO-Bench from [this paper](https://arxiv.org/abs/2511.01846). Meta-benchmark for evaluating how well models can grade mathematical solutions.
nemo_skills/dataset/imo-answerbench/__init__.py (26 additions)
@@ -0,0 +1,26 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

DATASET_GROUP = "math"
METRICS_TYPE = "math"
GENERATION_ARGS = "++prompt_config=generic/math ++eval_type=math"
# Judge configuration: Use the AnswerAutoGrader prompt.
# Recommended model: Gemini 2.5 Pro (or similar strong reasoner)
JUDGE_ARGS = "++prompt_config=judge/imo_answerbench ++generation_key=judgement ++inference.reasoning_effort=dynamic"

JUDGE_PIPELINE_ARGS = {
"generation_type": "math_judge",
"model": "gemini-2.5-pro",
"server_type": "gemini",
}
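The `++key=value` strings above are Hydra-style append overrides that the pipeline passes through to generation and judging. As a rough illustration (a hypothetical helper, not part of nemo_skills), such a string can be split into a plain dict:

```python
GENERATION_ARGS = "++prompt_config=generic/math ++eval_type=math"

def parse_overrides(args: str) -> dict:
    # Split whitespace-separated "++key=value" tokens into a dict (illustrative only).
    out = {}
    for token in args.split():
        key, _, value = token.removeprefix("++").partition("=")
        out[key] = value
    return out

print(parse_overrides(GENERATION_ARGS))
# {'prompt_config': 'generic/math', 'eval_type': 'math'}
```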
nemo_skills/dataset/imo-answerbench/prepare.py (43 additions)
@@ -0,0 +1,43 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import csv
import io
import json
import urllib.request
from pathlib import Path

if __name__ == "__main__":
    data_dir = Path(__file__).absolute().parent
    data_dir.mkdir(exist_ok=True)

    base_url = "https://raw.githubusercontent.com/google-deepmind/superhuman/c1ee02e03d4cdb2ab21cd01ac927d895f5287fc8/imobench"
    source_url = f"{base_url}/answerbench.csv"
    output_file = data_dir / "test.jsonl"

    with urllib.request.urlopen(source_url, timeout=30) as response:
        content = response.read().decode("utf-8")
    reader = csv.DictReader(io.StringIO(content))

    with open(output_file, "w", encoding="utf-8") as out:
        for row in reader:
            entry = {
                "problem_id": row["Problem ID"],
                "problem": row["Problem"],
                "expected_answer": row["Short Answer"],
                "category": row["Category"],
                "subcategory": row["Subcategory"],
                "source": row["Source"],
            }
            out.write(json.dumps(entry) + "\n")
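The script maps CSV columns to JSONL fields one-to-one. A self-contained sketch of that mapping on a made-up two-column subset (sample values are illustrative, not from the dataset):

```python
import csv
import io
import json

# A tiny CSV mimicking answerbench.csv's header (values are made up).
sample = (
    "Problem ID,Problem,Short Answer,Category,Subcategory,Source\n"
    "P1,Compute 1+1.,2,Algebra,Basics,example\n"
)
reader = csv.DictReader(io.StringIO(sample))
rows = [
    {"problem_id": r["Problem ID"], "expected_answer": r["Short Answer"]}
    for r in reader
]
print(json.dumps(rows[0]))
# {"problem_id": "P1", "expected_answer": "2"}
```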
Comment on lines +29 to +43 (Contributor):
The urllib.request.urlopen() call and CSV parsing lack error handling. Network failures, HTTP errors, timeouts, or missing CSV columns will cause the script to crash with an unhelpful error message.

Consider adding try-except blocks to handle potential errors gracefully:

Suggested change

Before:

    with urllib.request.urlopen(source_url, timeout=30) as response:
        content = response.read().decode("utf-8")
    reader = csv.DictReader(io.StringIO(content))

    with open(output_file, "w", encoding="utf-8") as out:
        for row in reader:
            entry = {
                "problem_id": row["Problem ID"],
                "problem": row["Problem"],
                "expected_answer": row["Short Answer"],
                "category": row["Category"],
                "subcategory": row["Subcategory"],
                "source": row["Source"],
            }
            out.write(json.dumps(entry) + "\n")

After:

    try:
        with urllib.request.urlopen(source_url, timeout=30) as response:
            content = response.read().decode("utf-8")
        reader = csv.DictReader(io.StringIO(content))

        with open(output_file, "w", encoding="utf-8") as out:
            for row in reader:
                try:
                    entry = {
                        "problem_id": row["Problem ID"],
                        "problem": row["Problem"],
                        "expected_answer": row["Short Answer"],
                        "category": row["Category"],
                        "subcategory": row["Subcategory"],
                        "source": row["Source"],
                    }
                    out.write(json.dumps(entry) + "\n")
                except KeyError as e:
                    raise KeyError(f"Missing expected column in CSV: {e}")
    except urllib.error.URLError as e:
        raise RuntimeError(f"Failed to download dataset from {source_url}: {e}")

This matches the error handling pattern used in other dataset preparation scripts like nemo_skills/dataset/mmlu-prox/prepare.py.

nemo_skills/dataset/imo-gradingbench/__init__.py (18 additions)
@@ -0,0 +1,18 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

DATASET_GROUP = "judge"
Collaborator @smahdavi4, Nov 19, 2025:

We have an answer-judgement data type for binary judgement; we could potentially expand that, since imo-grading-bench has 4 different possible states (or maybe we could binarize imo-proof-bench instead).

Collaborator:

Yes, we should unify; let's create an issue for this, but we don't need to do it in this PR.

METRICS_TYPE = "gradingbench"
# This dataset is for evaluating grading ability - the model must grade proofs.
GENERATION_ARGS = "++prompt_config=judge/imo_gradingbench ++eval_type=math"
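The review thread above notes that gradingbench judgements have four possible states rather than a binary verdict. A hypothetical sketch of the binarization idea floated there (the state names are assumptions for illustration, not taken from the dataset):

```python
# Hypothetical four-state grades; only a fully correct grade counts as
# positive under this (assumed) binarization rule.
GRADE_STATES = ["correct", "minor_error", "major_error", "incorrect"]

def binarize(grade: str) -> bool:
    # Collapse a four-state grade into the binary answer-judgement type.
    if grade not in GRADE_STATES:
        raise ValueError(f"Unknown grade: {grade}")
    return grade == "correct"

print(binarize("correct"), binarize("minor_error"))
# True False
```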
nemo_skills/dataset/imo-gradingbench/prepare.py (48 additions)
@@ -0,0 +1,48 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import csv
import io
import json
import urllib.request
from pathlib import Path

if __name__ == "__main__":
    data_dir = Path(__file__).absolute().parent
    data_dir.mkdir(exist_ok=True)

    base_url = "https://raw.githubusercontent.com/google-deepmind/superhuman/c1ee02e03d4cdb2ab21cd01ac927d895f5287fc8/imobench"
    source_url = f"{base_url}/gradingbench.csv"
    output_file = data_dir / "test.jsonl"

    with urllib.request.urlopen(source_url, timeout=30) as response:
        content = response.read().decode("utf-8")
    reader = csv.DictReader(io.StringIO(content))

    with open(output_file, "w", encoding="utf-8") as out:
        for row in reader:
            entry = {
                "grading_id": row["Grading ID"],
                "problem_id": row["Problem ID"],
                "problem_statement": row["Problem"],
                "reference_solution": row["Solution"],
                "rubric": row["Grading guidelines"],
                "proof": row["Response"],
                "points": row["Points"],
                "reward": row["Reward"],
                "source": row["Problem Source"],
                # We set 'expected_answer' to the reward/points for evaluation
                "expected_answer": row["Reward"],
            }
            out.write(json.dumps(entry) + "\n")
Comment on lines +29 to +48 (Contributor):

The urllib.request.urlopen() call and CSV parsing lack error handling. Network failures, HTTP errors, timeouts, or missing CSV columns will cause the script to crash with an unhelpful error message.

Consider adding try-except blocks similar to the pattern used in nemo_skills/dataset/mmlu-prox/prepare.py which handles errors gracefully when downloading from external sources.
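A rough sketch of that pattern, factored into testable helpers (the function names and column subset are illustrative, not the actual mmlu-prox code):

```python
import csv
import io
import json
import urllib.error
import urllib.request

# Illustrative subset of gradingbench.csv's columns.
REQUIRED_COLUMNS = ["Grading ID", "Problem ID", "Problem"]

def download_csv(url: str, timeout: int = 30) -> str:
    # Wrap network failures in an error that names the failing URL.
    try:
        with urllib.request.urlopen(url, timeout=timeout) as response:
            return response.read().decode("utf-8")
    except urllib.error.URLError as e:
        raise RuntimeError(f"Failed to download dataset from {url}: {e}")

def rows_to_jsonl(content: str) -> list[str]:
    # Convert CSV text to JSONL lines, failing loudly on missing columns.
    reader = csv.DictReader(io.StringIO(content))
    lines = []
    for row in reader:
        try:
            entry = {col: row[col] for col in REQUIRED_COLUMNS}
        except KeyError as e:
            raise KeyError(f"Missing expected column in CSV: {e}")
        lines.append(json.dumps(entry))
    return lines
```

Keeping the download and the row conversion in separate functions also lets the column handling be unit-tested without any network access.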

nemo_skills/dataset/imo-proofbench/__init__.py (27 additions)
@@ -0,0 +1,27 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

DATASET_GROUP = "math"
# An LLM judge evaluates each proof; scores are aggregated with the standard math metrics.
METRICS_TYPE = "math"
GENERATION_ARGS = "++prompt_config=generic/math ++eval_type=math"
# Judge configuration: Use the ProofAutoGrader prompt.
# Recommended model: Gemini 2.5 Pro (or similar strong reasoner)
JUDGE_ARGS = "++prompt_config=judge/imo_proofbench ++generation_key=judgement ++inference.reasoning_effort=dynamic"

JUDGE_PIPELINE_ARGS = {
"generation_type": "math_judge",
"model": "gemini-2.5-pro",
"server_type": "gemini",
}
nemo_skills/dataset/imo-proofbench/prepare.py (45 additions)
@@ -0,0 +1,45 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import csv
import io
import json
import urllib.request
from pathlib import Path

if __name__ == "__main__":
    data_dir = Path(__file__).absolute().parent
    data_dir.mkdir(exist_ok=True)

    base_url = "https://raw.githubusercontent.com/google-deepmind/superhuman/c1ee02e03d4cdb2ab21cd01ac927d895f5287fc8/imobench"
    source_url = f"{base_url}/proofbench.csv"
    output_file = data_dir / "test.jsonl"

    with urllib.request.urlopen(source_url, timeout=30) as response:
        content = response.read().decode("utf-8")
    reader = csv.DictReader(io.StringIO(content))

    with open(output_file, "w", encoding="utf-8") as out:
        for row in reader:
            entry = {
                "problem_id": row["Problem ID"],
                "problem": row["Problem"],
                "reference_solution": row["Solution"],
                "rubric": row["Grading guidelines"],
                "category": row["Category"],
                "level": row["Level"],
                "expected_answer": row["Short Answer"],
                "source": row["Source"],
            }
            out.write(json.dumps(entry) + "\n")
Comment on lines +29 to +45 (Contributor):

The urllib.request.urlopen() call and CSV parsing lack error handling. Network failures, HTTP errors, timeouts, or missing CSV columns will cause the script to crash with an unhelpful error message.

Consider adding try-except blocks similar to the pattern used in nemo_skills/dataset/mmlu-prox/prepare.py which handles errors gracefully when downloading from external sources.
