NVIDIA-NeMo · terrykong · Jul 30, 2025 · Jul 28, 2025 · Jul 30, 2025
@@ -89,9 +89,12 @@ score=0.1000 (3.0/30)
 
 ## List of currently supported benchmarks
 
-- [AIME-2024](../../nemo_rl/data/eval_datasets/aime2024.py)
-- [GPQA and GPQA-diamond](../../nemo_rl/data/eval_datasets/gpqa.py)
-- [MATH and MATH-500](../../nemo_rl/data/eval_datasets/math.py)
-- [MMLU](../../nemo_rl/data/eval_datasets/mmlu.py): this also includes MMMLU (Multilingual MMLU), a total of 14 languages.
-- [MMLU-Pro](../../nemo_rl/data/eval_datasets/mmlu_pro.py)
+- [AIME-2024](../../nemo_rl/data/eval_datasets/aime2024.py): the corresponding `data.dataset_name` is `"aime2024"`.
+- [AIME-2025](../../nemo_rl/data/eval_datasets/aime2025.py): the corresponding `data.dataset_name` is `"aime2025"`.
+- [GPQA and GPQA-diamond](../../nemo_rl/data/eval_datasets/gpqa.py): the corresponding `data.dataset_name` values are `"gpqa"` and `"gpqa-diamond"`.
+- [MATH and MATH-500](../../nemo_rl/data/eval_datasets/math.py): the corresponding `data.dataset_name` values are `"math"` and `"math500"`.
+- [MMLU](../../nemo_rl/data/eval_datasets/mmlu.py): this also includes MMMLU (Multilingual MMLU), a total of 14 languages. When `data.dataset_name` is set to `"mmlu"`, the English version is used. To run evaluation on another language, set `data.dataset_name` to `mmlu_{language}`, where `language` is one of the following 14 values: `["AR-XY", "BN-BD", "DE-DE", "ES-LA", "FR-FR", "HI-IN", "ID-ID", "IT-IT", "JA-JP", "KO-KR", "PT-BR", "ZH-CN", "SW-KE", "YO-NG"]`.
+- [MMLU-Pro](../../nemo_rl/data/eval_datasets/mmlu_pro.py): the corresponding `data.dataset_name` is `"mmlu_pro"`.
+
+More details can be found in [load_eval_dataset](../../nemo_rl/data/eval_datasets/__init__.py).
 
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 from nemo_rl.data.eval_datasets.aime2024 import AIME2024Dataset
+from nemo_rl.data.eval_datasets.aime2025 import AIME2025Dataset
 from nemo_rl.data.eval_datasets.gpqa import GPQADataset
 from nemo_rl.data.eval_datasets.local_math_dataset import LocalMathDataset
 from nemo_rl.data.eval_datasets.math import MathDataset
@@ -41,6 +42,11 @@ def load_eval_dataset(data_config):
             prompt_file=data_config["prompt_file"],
             system_prompt_file=data_config["system_prompt_file"],
         )
+    elif dataset_name == "aime2025":
+        base_dataset = AIME2025Dataset(
+            prompt_file=data_config["prompt_file"],
+            system_prompt_file=data_config["system_prompt_file"],
+        )
     elif dataset_name == "gpqa":
         base_dataset = GPQADataset(
             variant="main",
@@ -88,6 +94,7 @@ def load_eval_dataset(data_config):
 
 __all__ = [
     "AIME2024Dataset",
+    "AIME2025Dataset",
     "GPQADataset",
     "LocalMathDataset",
     "MathDataset",

@@ -0,0 +1,46 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""AIME 2025 dataset."""
+
+from typing import Any, Optional
+
+from datasets import concatenate_datasets, load_dataset
+
+from nemo_rl.data import processors
+from nemo_rl.data.interfaces import TaskDataSpec
+
+
+class AIME2025Dataset:
+    """Evaluation dataset wrapper for AIME 2025.
+
+    Loads the "AIME2025-I" and "AIME2025-II" configurations of the
+    "opencompass/AIME2025" dataset (split "test"), concatenates them, and
+    re-keys every record to the fields "problem" and "expected_answer".
+
+    Attributes set by the constructor:
+        rekeyed_ds: the concatenated dataset after re-keying.
+        task_spec: a TaskDataSpec with task_name "aime2025" and the given
+            prompt files.
+        processor: processors.math_data_processor.
+    """
+
+    def __init__(
+        self,
+        prompt_file: Optional[str] = None,
+        system_prompt_file: Optional[str] = None,
+    ):
+        """Load both AIME 2025 parts and build the task specification.
+
+        Args:
+            prompt_file: forwarded unchanged to TaskDataSpec.
+            system_prompt_file: forwarded unchanged to TaskDataSpec.
+        """
+        # The two parts are separate configurations of the same dataset,
+        # so each is loaded on its own and then merged.
+        ds0 = load_dataset("opencompass/AIME2025", "AIME2025-I", split="test")
+        ds1 = load_dataset("opencompass/AIME2025", "AIME2025-II", split="test")
+        ds = concatenate_datasets([ds0, ds1])
+        # Remove every original column so only the keys returned by _rekey remain.
+        self.rekeyed_ds = ds.map(self._rekey, remove_columns=ds.column_names)
+        self.task_spec = TaskDataSpec(
+            task_name="aime2025",
+            prompt_file=prompt_file,
+            system_prompt_file=system_prompt_file,
+        )
+        self.processor = processors.math_data_processor
+
+    def _rekey(self, data: dict[str, Any]):
+        """Return a record with "question"/"answer" renamed.
+
+        Args:
+            data: a raw record; must contain the keys "question" and "answer".
+
+        Returns:
+            A dict with "problem" set to data["question"] and
+            "expected_answer" set to data["answer"].
+        """
+        return {
+            "problem": data["question"],
+            "expected_answer": data["answer"],
+        }
@@ -43,6 +43,7 @@ project-includes = [
     "nemo_rl/data/datasets.py",
     "nemo_rl/data/eval_datasets/__init__.py",
     "nemo_rl/data/eval_datasets/aime2024.py",
+    "nemo_rl/data/eval_datasets/aime2025.py",
     "nemo_rl/data/eval_datasets/gpqa.py",
     "nemo_rl/data/eval_datasets/local_math_dataset.py",
     "nemo_rl/data/eval_datasets/math.py",