diff --git a/docs/evaluation/natural-math.md b/docs/evaluation/natural-math.md
index c7a7cf5f58..d26093db91 100644
--- a/docs/evaluation/natural-math.md
+++ b/docs/evaluation/natural-math.md
@@ -232,4 +232,22 @@ In either case you can always customize the judge prompt by setting a new `++pro
### beyond-aime
- Benchmark is defined in [`nemo_skills/dataset/beyond-aime/__init__.py`](https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/dataset/beyond-aime/__init__.py)
-- Original benchmark source is [here](https://huggingface.co/datasets/ByteDance-Seed/BeyondAIME).
\ No newline at end of file
+- Original benchmark source is [here](https://huggingface.co/datasets/ByteDance-Seed/BeyondAIME).
+
+### imo-answerbench
+
+- Benchmark is defined in [`nemo_skills/dataset/imo-answerbench/__init__.py`](https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/dataset/imo-answerbench/__init__.py)
+- Original benchmark source is [here](https://github.com/google-deepmind/superhuman/tree/main/imobench).
+- Part of IMO-Bench from [this paper](https://arxiv.org/abs/2511.01846). Tests mathematical answer equivalence using LLM-as-a-judge.
+
+### imo-proofbench
+
+- Benchmark is defined in [`nemo_skills/dataset/imo-proofbench/__init__.py`](https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/dataset/imo-proofbench/__init__.py)
+- Original benchmark source is [here](https://github.com/google-deepmind/superhuman/tree/main/imobench).
+- Part of IMO-Bench from [this paper](https://arxiv.org/abs/2511.01846). Evaluates full mathematical proofs using LLM-as-a-judge with a detailed rubric.
+
+### imo-gradingbench
+
+- Benchmark is defined in [`nemo_skills/dataset/imo-gradingbench/__init__.py`](https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/dataset/imo-gradingbench/__init__.py)
+- Original benchmark source is [here](https://github.com/google-deepmind/superhuman/tree/main/imobench).
+- Part of IMO-Bench from [this paper](https://arxiv.org/abs/2511.01846). Meta-benchmark for evaluating how well models can grade mathematical solutions.
\ No newline at end of file
diff --git a/nemo_skills/dataset/imo-answerbench/__init__.py b/nemo_skills/dataset/imo-answerbench/__init__.py
new file mode 100644
index 0000000000..9f97fe78b9
--- /dev/null
+++ b/nemo_skills/dataset/imo-answerbench/__init__.py
@@ -0,0 +1,26 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+DATASET_GROUP = "math"
+METRICS_TYPE = "math"
+GENERATION_ARGS = "++prompt_config=generic/math ++eval_type=math"
+# Judge configuration: Use the AnswerAutoGrader prompt.
+# Recommended model: Gemini 2.5 Pro (or similar strong reasoner)
+JUDGE_ARGS = "++prompt_config=judge/imo_answerbench ++generation_key=judgement ++inference.reasoning_effort=dynamic"
+
+JUDGE_PIPELINE_ARGS = {
+ "generation_type": "math_judge",
+ "model": "gemini-2.5-pro",
+ "server_type": "gemini",
+}
diff --git a/nemo_skills/dataset/imo-answerbench/prepare.py b/nemo_skills/dataset/imo-answerbench/prepare.py
new file mode 100644
index 0000000000..61dfa82fd3
--- /dev/null
+++ b/nemo_skills/dataset/imo-answerbench/prepare.py
@@ -0,0 +1,43 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import csv
+import io
+import json
+import urllib.request
+from pathlib import Path
+
+if __name__ == "__main__":
+ data_dir = Path(__file__).absolute().parent
+ data_dir.mkdir(exist_ok=True)
+
+ base_url = "https://raw.githubusercontent.com/google-deepmind/superhuman/c1ee02e03d4cdb2ab21cd01ac927d895f5287fc8/imobench"
+ source_url = f"{base_url}/answerbench.csv"
+ output_file = data_dir / "test.jsonl"
+
+ with urllib.request.urlopen(source_url, timeout=30) as response:
+ content = response.read().decode("utf-8")
+ reader = csv.DictReader(io.StringIO(content))
+
+ with open(output_file, "w", encoding="utf-8") as out:
+ for row in reader:
+ entry = {
+ "problem_id": row["Problem ID"],
+ "problem": row["Problem"],
+ "expected_answer": row["Short Answer"],
+ "category": row["Category"],
+ "subcategory": row["Subcategory"],
+ "source": row["Source"],
+ }
+ out.write(json.dumps(entry) + "\n")
diff --git a/nemo_skills/dataset/imo-gradingbench/__init__.py b/nemo_skills/dataset/imo-gradingbench/__init__.py
new file mode 100644
index 0000000000..616974c319
--- /dev/null
+++ b/nemo_skills/dataset/imo-gradingbench/__init__.py
@@ -0,0 +1,18 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+DATASET_GROUP = "judge"
+METRICS_TYPE = "gradingbench"
+# This dataset is for evaluating grading ability - the model must grade proofs.
+GENERATION_ARGS = "++prompt_config=judge/imo_gradingbench ++eval_type=math"
diff --git a/nemo_skills/dataset/imo-gradingbench/prepare.py b/nemo_skills/dataset/imo-gradingbench/prepare.py
new file mode 100644
index 0000000000..20778c9aff
--- /dev/null
+++ b/nemo_skills/dataset/imo-gradingbench/prepare.py
@@ -0,0 +1,48 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import csv
+import io
+import json
+import urllib.request
+from pathlib import Path
+
+if __name__ == "__main__":
+ data_dir = Path(__file__).absolute().parent
+ data_dir.mkdir(exist_ok=True)
+
+ base_url = "https://raw.githubusercontent.com/google-deepmind/superhuman/c1ee02e03d4cdb2ab21cd01ac927d895f5287fc8/imobench"
+ source_url = f"{base_url}/gradingbench.csv"
+ output_file = data_dir / "test.jsonl"
+
+ with urllib.request.urlopen(source_url, timeout=30) as response:
+ content = response.read().decode("utf-8")
+ reader = csv.DictReader(io.StringIO(content))
+
+ with open(output_file, "w", encoding="utf-8") as out:
+ for row in reader:
+ entry = {
+ "grading_id": row["Grading ID"],
+ "problem_id": row["Problem ID"],
+ "problem_statement": row["Problem"],
+ "reference_solution": row["Solution"],
+ "rubric": row["Grading guidelines"],
+ "proof": row["Response"],
+ "points": row["Points"],
+ "reward": row["Reward"],
+ "source": row["Problem Source"],
+            # Mirror the 'Reward' grade into 'expected_answer', the key the evaluation reads
+ "expected_answer": row["Reward"],
+ }
+ out.write(json.dumps(entry) + "\n")
diff --git a/nemo_skills/dataset/imo-proofbench/__init__.py b/nemo_skills/dataset/imo-proofbench/__init__.py
new file mode 100644
index 0000000000..8714de74c1
--- /dev/null
+++ b/nemo_skills/dataset/imo-proofbench/__init__.py
@@ -0,0 +1,27 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+DATASET_GROUP = "math"
+# Using judge metrics as we need an LLM to evaluate the proof
+METRICS_TYPE = "math"
+GENERATION_ARGS = "++prompt_config=generic/math ++eval_type=math"
+# Judge configuration: Use the ProofAutoGrader prompt.
+# Recommended model: Gemini 2.5 Pro (or similar strong reasoner)
+JUDGE_ARGS = "++prompt_config=judge/imo_proofbench ++generation_key=judgement ++inference.reasoning_effort=dynamic"
+
+JUDGE_PIPELINE_ARGS = {
+ "generation_type": "math_judge",
+ "model": "gemini-2.5-pro",
+ "server_type": "gemini",
+}
diff --git a/nemo_skills/dataset/imo-proofbench/prepare.py b/nemo_skills/dataset/imo-proofbench/prepare.py
new file mode 100644
index 0000000000..564902c32e
--- /dev/null
+++ b/nemo_skills/dataset/imo-proofbench/prepare.py
@@ -0,0 +1,45 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import csv
+import io
+import json
+import urllib.request
+from pathlib import Path
+
+if __name__ == "__main__":
+ data_dir = Path(__file__).absolute().parent
+ data_dir.mkdir(exist_ok=True)
+
+ base_url = "https://raw.githubusercontent.com/google-deepmind/superhuman/c1ee02e03d4cdb2ab21cd01ac927d895f5287fc8/imobench"
+ source_url = f"{base_url}/proofbench.csv"
+ output_file = data_dir / "test.jsonl"
+
+ with urllib.request.urlopen(source_url, timeout=30) as response:
+ content = response.read().decode("utf-8")
+ reader = csv.DictReader(io.StringIO(content))
+
+ with open(output_file, "w", encoding="utf-8") as out:
+ for row in reader:
+ entry = {
+ "problem_id": row["Problem ID"],
+ "problem": row["Problem"],
+ "reference_solution": row["Solution"],
+ "rubric": row["Grading guidelines"],
+ "category": row["Category"],
+ "level": row["Level"],
+ "expected_answer": row["Short Answer"],
+ "source": row["Source"],
+ }
+ out.write(json.dumps(entry) + "\n")
diff --git a/nemo_skills/evaluation/metrics/gradingbench_metrics.py b/nemo_skills/evaluation/metrics/gradingbench_metrics.py
new file mode 100644
index 0000000000..7b2861f055
--- /dev/null
+++ b/nemo_skills/evaluation/metrics/gradingbench_metrics.py
@@ -0,0 +1,163 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import re
+
+from nemo_skills.evaluation.metrics.base import BaseMetrics, as_float, as_int, as_percentage
+from nemo_skills.utils import get_logger_name
+
+LOG = logging.getLogger(get_logger_name(__file__))
+
+
+class GradingBenchMetrics(BaseMetrics):
+ """Metrics for IMO GradingBench evaluation.
+
+ Computes:
+ - exact_accuracy: Percentage of exact grade matches
+ - binarized_accuracy: Percentage where both grades fall in same bucket
+ (correct/almost vs partial/incorrect)
+ - mae: Mean Absolute Error between predicted and expected numeric scores
+
+ Grade to score mapping:
+ - correct: 7
+ - almost: 6
+ - partial: 1
+ - incorrect: 0
+ """
+
+ GRADE_TO_SCORE = {
+ "correct": 7,
+ "almost": 6,
+ "partial": 1,
+ "incorrect": 0,
+ }
+ GRADE_TO_BINARY = {
+ "correct": "high",
+ "almost": "high",
+ "partial": "low",
+ "incorrect": "low",
+ }
+ VALID_GRADES = set(GRADE_TO_SCORE.keys())
+
+ def __init__(self):
+ super().__init__(compute_no_answer=False)
+ self.mae_errors = []
+
+ def _extract_grade(self, text: str) -> str | None:
+ """Extract grade from the last word of judge output.
+
+ Handles markdown formatting and punctuation.
+ Returns None if no valid grade (correct/almost/partial/incorrect) is found.
+ """
+ if not text or not isinstance(text, str):
+ LOG.debug("Cannot extract grade: text is empty or not a string")
+ return None
+
+ words = text.strip().split()
+ if not words:
+ LOG.debug("Cannot extract grade: text contains no words")
+ return None
+
+ last_word = words[-1]
+
+ # Strip markdown and punctuation, normalize to lowercase
+ last_word = re.sub(r"[*_`.,;:!?()[\]{}]+", "", last_word).lower()
+
+ if last_word not in self.VALID_GRADES:
+ LOG.debug(
+ "Cannot extract grade: '%s' not in valid grades %s. Text ends with: '...%s'",
+ last_word,
+ self.VALID_GRADES,
+ text[-100:] if len(text) > 100 else text,
+ )
+ return None
+
+ return last_word
+
+ def _get_grades(self, prediction: dict) -> tuple[str | None, str | None]:
+ """Extract predicted and expected grades from a prediction."""
+ judgement = prediction.get("judgement", "") or prediction.get("generation", "")
+ pred_grade = self._extract_grade(judgement)
+
+ expected_answer_raw = prediction.get("expected_answer", "")
+ expected_grade = None
+ if expected_answer_raw:
+ expected_grade = expected_answer_raw.lower().strip()
+ if expected_grade not in self.VALID_GRADES:
+ LOG.warning(
+ "Invalid expected_answer '%s' - must be one of %s. Entry will be excluded from evaluation.",
+ expected_answer_raw,
+ self.VALID_GRADES,
+ )
+ expected_grade = None
+
+ return pred_grade, expected_grade
+
+ def _get_score_dict(self, prediction: dict) -> dict[str, bool | int | float]:
+ """Return correctness scores for a prediction."""
+ pred_grade, expected_grade = self._get_grades(prediction)
+
+ exact_match = pred_grade is not None and pred_grade == expected_grade
+
+ pred_binary = self.GRADE_TO_BINARY.get(pred_grade)
+ expected_binary = self.GRADE_TO_BINARY.get(expected_grade)
+ binary_match = pred_binary is not None and expected_binary is not None and pred_binary == expected_binary
+
+ return {
+ "exact_accuracy": exact_match,
+ "binarized_accuracy": binary_match,
+ }
+
+ def update(self, predictions):
+ """Update metrics with predictions."""
+ super().update(predictions)
+
+ for pred in predictions:
+ pred_grade, expected_grade = self._get_grades(pred)
+ pred_score = self.GRADE_TO_SCORE.get(pred_grade)
+ expected_score = self.GRADE_TO_SCORE.get(expected_grade)
+
+ if pred_score is not None and expected_score is not None:
+ self.mae_errors.append(abs(pred_score - expected_score))
+
+ self._compute_pass_at_k(predictions=predictions)
+
+ def get_metrics(self):
+ """Return metrics including MAE."""
+ metrics_dict = super().get_metrics()
+
+ if self.mae_errors:
+ mae = sum(self.mae_errors) / len(self.mae_errors)
+ for agg_mode in metrics_dict:
+ metrics_dict[agg_mode]["mae"] = mae
+ metrics_dict[agg_mode]["mae_count"] = len(self.mae_errors)
+
+ return metrics_dict
+
+ def reset(self):
+ """Reset all tracked metrics."""
+ super().reset()
+ self.mae_errors = []
+
+ def metrics_to_print(self):
+ return {
+ "num_entries": as_int,
+ "exact_accuracy": as_percentage,
+ "binarized_accuracy": as_percentage,
+ "mae": as_float,
+ }
+
+ def evaluations_to_print(self):
+ return [f"pass@1[avg-of-{self.max_k}]", f"pass@{self.max_k}"]
diff --git a/nemo_skills/evaluation/metrics/map_metrics.py b/nemo_skills/evaluation/metrics/map_metrics.py
index 4cd3c2d4a7..41c2acab64 100644
--- a/nemo_skills/evaluation/metrics/map_metrics.py
+++ b/nemo_skills/evaluation/metrics/map_metrics.py
@@ -31,6 +31,7 @@
SciCodeMetrics,
SweBenchMetrics,
)
+from nemo_skills.evaluation.metrics.gradingbench_metrics import GradingBenchMetrics
from nemo_skills.evaluation.metrics.icpc_metrics import ICPCMetrics
from nemo_skills.evaluation.metrics.if_metrics import IFMetrics
from nemo_skills.evaluation.metrics.ioi_metrics import IOIMetrics
@@ -80,6 +81,7 @@
"mmau_pro_instruction_following": MMAUProMetrics,
"omniscience": OmniMetrics,
"compute-eval": ComputeEvalMetrics,
+ "gradingbench": GradingBenchMetrics,
}
diff --git a/nemo_skills/evaluation/metrics/utils.py b/nemo_skills/evaluation/metrics/utils.py
index 6b25098407..e9b7d241f7 100644
--- a/nemo_skills/evaluation/metrics/utils.py
+++ b/nemo_skills/evaluation/metrics/utils.py
@@ -35,8 +35,17 @@ def read_predictions(predictions, line_idx, file_handles):
def is_correct_judgement(judgement, return_none=False) -> Union[bool, None]:
- # Match both plain "Judgement:" and markdown bold "**Judgement**:" formats, this happens for gpt-4o which is AA Judge model.
+ """Parse judgement text to determine correctness.
+
+ Supports multiple formats:
+ - "Judgement: Yes/No" (standard format, also handles markdown bold)
+ - "\\boxed{Correct/Incorrect}" (IMO AnswerBench format)
+ - "N out of 7" (IMO ProofBench format: 6-7 = correct, 0-1 = incorrect)
+ """
if judgement:
+ # Format 1: "Judgement: Yes/No" (standard format)
+ # Match both plain "Judgement:" and markdown bold "**Judgement**:" formats,
+ # this happens for gpt-4o which is AA Judge model.
match = re.search(r"\*{0,2}Judgement\*{0,2}\s*:", judgement, re.IGNORECASE)
if match:
verdict = judgement[match.end() :].strip().lstrip("*").strip()
@@ -45,6 +54,20 @@ def is_correct_judgement(judgement, return_none=False) -> Union[bool, None]:
elif verdict.lower().startswith("no"):
return False
+ # Format 2: "\boxed{Correct/Incorrect}" (IMO AnswerBench format)
+ boxed_match = re.search(r"\\boxed\s*\{\s*(Correct|Incorrect)\s*\}", judgement, re.IGNORECASE)
+ if boxed_match:
+ verdict = boxed_match.group(1).lower()
+ return verdict == "correct"
+
+ # Format 3: "N out of 7" (IMO ProofBench format)
+ # 7 or 6 points = correct/almost correct (True)
+ # 1 or 0 points = partial/incorrect (False)
+ points_match = re.search(r"\s*(\d+)\s*out of 7\s*", judgement, re.IGNORECASE)
+ if points_match:
+ points = int(points_match.group(1))
+ return points >= 6 # 6-7 is correct/almost, 0-1 is incorrect/partial
+
if return_none:
return None
else:
diff --git a/nemo_skills/inference/model/gemini.py b/nemo_skills/inference/model/gemini.py
index 05d3b59245..44764cd833 100644
--- a/nemo_skills/inference/model/gemini.py
+++ b/nemo_skills/inference/model/gemini.py
@@ -27,7 +27,9 @@ def __init__(self, base_url: str | None = None, *args, **kwargs):
- gemini-2.5-flash: thinking budget 0-24576 (default: no thinking)
- gemini-2.5-flash-lite: thinking budget 0-24576 (default: no thinking)
"""
- super().__init__(base_url="", *args, **kwargs)
+ # Use empty string only if no base_url provided (uses litellm's default Gemini endpoint)
+ # Otherwise pass through the custom base_url
+ super().__init__(base_url=base_url if base_url else "", *args, **kwargs)
def _get_api_key(self, api_key: str | None, api_key_env_var: str | None, base_url: str) -> str | None:
api_key = super()._get_api_key(api_key, api_key_env_var, base_url)
@@ -100,7 +102,9 @@ def _build_chat_request_params(
params["thinking"] = {
"type": "enabled",
"budget_tokens": -1,
+ "include_thoughts": True,
}
+ params["allowed_openai_params"].append("thinking")
params["reasoning_effort"] = reasoning_effort
diff --git a/nemo_skills/pipeline/utils/cluster.py b/nemo_skills/pipeline/utils/cluster.py
index 7edd572e12..d525746f3e 100644
--- a/nemo_skills/pipeline/utils/cluster.py
+++ b/nemo_skills/pipeline/utils/cluster.py
@@ -168,7 +168,7 @@ def get_env_variables(cluster_config):
- `required_env_vars` - list of required environment variables
- `env_vars` - list of optional environment variables
- WANDB_API_KEY, NVIDIA_API_KEY, AZURE_OPENAI_API_KEY, OPENAI_API_KEY, and HF_TOKEN are always added if they exist.
+ WANDB_API_KEY, NVIDIA_API_KEY, AZURE_OPENAI_API_KEY, OPENAI_API_KEY, GEMINI_API_KEY, and HF_TOKEN are always added if they exist.
Args:
cluster_config: cluster config dictionary
@@ -211,6 +211,7 @@ def get_env_variables(cluster_config):
"NVIDIA_API_KEY",
"AZURE_OPENAI_API_KEY",
"OPENAI_API_KEY",
+ "GEMINI_API_KEY",
"HF_TOKEN",
"NGC_API_KEY",
}
diff --git a/nemo_skills/prompt/config/eval/matharena/aime.yaml b/nemo_skills/prompt/config/eval/matharena/aime.yaml
new file mode 100644
index 0000000000..82f1a84c77
--- /dev/null
+++ b/nemo_skills/prompt/config/eval/matharena/aime.yaml
@@ -0,0 +1,8 @@
+# MathArena AIME prompt - matches https://github.com/MathArena
+# Used for reproducing MathArena benchmark results
+
+user: |-
+ Please reason step by step, and put your final answer within \boxed{{}}.
+ The answer is an integer between 0 and 999 inclusive.
+
+ {problem}
diff --git a/nemo_skills/prompt/config/generic/matharena.yaml b/nemo_skills/prompt/config/generic/matharena.yaml
new file mode 100644
index 0000000000..4a0cd6a730
--- /dev/null
+++ b/nemo_skills/prompt/config/generic/matharena.yaml
@@ -0,0 +1,4 @@
+user: |-
+ Please reason step by step, and put your final answer within \boxed{{}}.
+
+ {problem}
diff --git a/nemo_skills/prompt/config/judge/imo_answerbench.yaml b/nemo_skills/prompt/config/judge/imo_answerbench.yaml
new file mode 100644
index 0000000000..30e39218c7
--- /dev/null
+++ b/nemo_skills/prompt/config/judge/imo_answerbench.yaml
@@ -0,0 +1,51 @@
+user: |-
+ # System Role: Deterministic Mathematical Autograder
+
+ You are a precise, automated grading system. Your sole function is to determine if the final answer provided in the Model Solution is mathematically equivalent to the Golden Answer. You must NOT grade the reasoning or steps, only the final result.
+
+ # 1. Grading Guidelines (Equivalence Rules)
+
+ Equivalence is mandatory for a correct grade. You must rigorously verify if the answers represent the exact same mathematical value or expression, even if the format differs.
+ * **Algebraic Equivalence:** e.g., `n(n+1)/2` is equivalent to `n^2/2 + n/2`. You must verify the algebra.
+ * **Numerical Equivalence:** e.g., `1/2` is equivalent to `0.5`; `sqrt(2)/2` is equivalent to `1/sqrt(2)`.
+ * **Set/List Equivalence:** Unless specified as an ordered tuple/vector, the order of elements does not matter (e.g., {1, 2} is equivalent to {2, 1}).
+ * **Partial Credit:** No partial credit is allowed. If the answer is incomplete or partially incorrect, it is incorrect.
+ * **No Answers:** If no clear, unambiguous final answer can be extracted, the solution must be graded as incorrect.
+
+ # 3. Output Protocol (Strict Compliance Required)
+
+ You must execute the task using a two-part structure. Failure to follow this structure will result in task failure.
+
+ **Part 1: Analysis (Chain-of-Thought)**
+  You MUST perform your analysis first, as Part 1 of your response. Make your thinking concise. This section details your reasoning process and must follow these steps sequentially:
+ 1. **Golden Answer:** State the Golden Answer.
+  2. **Extracted Model Answer:** State the final answer extracted from the Model Solution. If none found, state "No clear final answer found."
+ 3. **Equivalence Analysis:** Compare the two answers using the Grading Guidelines. Detail the steps taken to verify mathematical equivalence (e.g., simplification, algebraic manipulation). You must actively try to prove they are the same before concluding they are different.
+ 4. **Conclusion:** State the final determination ("Correct" or "Incorrect").
+
+ **Part 2: Final Grade**
+  Immediately following your analysis, output **ONLY** the final grade.
+ * If Correct: \boxed{{Correct}}
+ * If Incorrect: \boxed{{Incorrect}}
+
+  **CRITICAL CONSTRAINT: Do not add any text, explanations, or formatting outside the analysis or the final \boxed{{}} output.**
+
+ # Output example:
+
+
+ 1. **Golden Answer:** $(-\infty, -4) \cup (-4, \infty)$
+ 2. **Extracted Model Answer:** $\emptyset$ (the empty set)
+ 3. **Equivalence Analysis:**
+ The Golden Answer is a non-empty set of real numbers.
+ The Model Answer is the empty set.
+ These two sets are not equivalent. The empty set contains no elements, while the Golden Answer contains an infinite number of elements.
+ 4. **Conclusion:** Incorrect
+
+ \boxed{{Incorrect}}
+
+ # 4. Input Data
+ Here is the problem, model solution, and golden answer to grade:
+
+ Problem: {problem}
+ Model Solution: {predicted_answer}
+ Golden Answer: {expected_answer}
diff --git a/nemo_skills/prompt/config/judge/imo_gradingbench.yaml b/nemo_skills/prompt/config/judge/imo_gradingbench.yaml
new file mode 100644
index 0000000000..0d6826a43a
--- /dev/null
+++ b/nemo_skills/prompt/config/judge/imo_gradingbench.yaml
@@ -0,0 +1,18 @@
+# Grader prompt for IMO-GradingBench
+# Based on IMO-Bench paper Appendix Section 7.3
+# https://arxiv.org/abs/2511.01846
+
+user: |-
+ Carefully analyze the given problem statement and the proposed solution, and then write out your analysis regarding the correctness of the proposed solution.
+
+ After the analysis, you must provide a score based on the following criteria:
+
+ * **incorrect:** The solution is completely incorrect or irrelevant.
+ * **partial:** The solution is partially correct but has significant errors or omissions.
+ * **almost:** The solution is almost correct but contains minor errors or inaccuracies.
+ * **correct:** The solution is fully correct and complete.
+
+ The very last part of your response must be **only** one of the following words: incorrect, partial, almost, or correct.
+
+ Problem: {problem_statement}
+ Solution: {proof}
diff --git a/nemo_skills/prompt/config/judge/imo_proofbench.yaml b/nemo_skills/prompt/config/judge/imo_proofbench.yaml
new file mode 100644
index 0000000000..bb9c56e624
--- /dev/null
+++ b/nemo_skills/prompt/config/judge/imo_proofbench.yaml
@@ -0,0 +1,47 @@
+user: |-
+ You are an expert grader for the International Mathematics Olympiad (IMO). Your task is to evaluate a proposed solution strictly and rigorously. Keep in mind the standards at the IMO are extremely high: only arguments that are logically sound, complete, and precise should be rewarded.
+
+ ### General Scoring Rubric
+ Scores are assigned on a 0-7 scale. The general guidelines are:
+ * **7 Points (Correct):** The solution is complete, correct, and fully rigorous. If the submission contains incorrect attempts or lines of reasoning but ultimately presents a complete and correct solution, it should still be awarded full points; the presence of earlier, discarded work does not detract from the final correct proof.
+ * **6 Points (Almost Correct):** The solution is almost correct with a sound core argument, but contains minor errors in calculation or small gaps in logic. Missing proofs for major components, unjustified claims, or sketchy arguments are **not** eligible for 6 points.
+ * **1 Point (Partial Progress):** The solution demonstrates substantial progress explicitly mentioned in the grading guidelines. Initial observations, reformulating the problem without making substantive headway, or proving partial results not mentioned in the grading guidelines are generally **not** eligible for this score.
+ * **0 Points (Incorrect):** The solution doesn't make substantial progress that is a key step in the full solution or is fundamentally flawed. All partial progress without key results or lacking rigor also fall in this category.
+
+ ### Input Data and Interpretation
+ You are provided with the following:
+ 1. **Problem Statement:** The IMO problem.
+ 2. **Ground Truth Solution:** A reference solution. Assume this solution is correct. It demonstrates one valid approach.
+ 3. **Specific Grading Guidelines:** Criteria for awarding credit for this specific problem. These guidelines take precedence over the General Scoring Rubric, especially for partial credit.
+ 4. **Proposed Solution:** The student submission.
+
+ ### Evaluation Process
+ You must follow this structured process:
+  1. **Analyze References:** Meticulously read and understand the problem and Ground Truth Solution, and check the Specific Grading Guidelines. Identify the key steps for a complete solution and the criteria for partial credit.
+  2. **Step-by-Step Verification:** Verify the logical validity and rigor of every step. Identify all flaws, gaps, assumptions, and errors. **Make sure you fully understand every piece of logic behind each step of the proposed solution; be wary of solutions that 'pretend' to be correct.**
+ 3. **Assess Progress:** Determine the extent of non-trivial progress made.
+ 4. **Score Determination:** Compare the findings against the Specific Grading Guidelines and the General Rubric to determine the final score.
+
+ ### Output Requirements
+  You must provide your final score in the format "N out of 7". Ensure this score statement appears **only once**, as your answer will be parsed based on the first occurrence in your whole response.
+
+ **PROBLEM STATEMENT**
+ {problem}
+
+ **GROUND-TRUTH SOLUTION**
+ {reference_solution}
+
+ **SPECIFIC GRADING GUIDELINES**
+ {rubric}
+
+ **PROPOSED SOLUTION**
+ {predicted_answer}
+
+ Present your detailed thought process and formal justification based on the scoring rubric and grading guidelines, and finally present your final score in the format below.
+
+ [Select one of the following options]
+
+ * 7 out of 7
+ * 6 out of 7
+ * 1 out of 7
+ * 0 out of 7