diff --git a/docs/guides/eval.md b/docs/guides/eval.md index c82b4bff37..31779f3dda 100644 --- a/docs/guides/eval.md +++ b/docs/guides/eval.md @@ -89,9 +89,12 @@ score=0.1000 (3.0/30) ## List of currently supported benchmarks -- [AIME-2024](../../nemo_rl/data/eval_datasets/aime2024.py) -- [GPQA and GPQA-diamond](../../nemo_rl/data/eval_datasets/gpqa.py) -- [MATH and MATH-500](../../nemo_rl/data/eval_datasets/math.py) -- [MMLU](../../nemo_rl/data/eval_datasets/mmlu.py): this also includes MMMLU (Multilingual MMLU), a total of 14 languages. -- [MMLU-Pro](../../nemo_rl/data/eval_datasets/mmlu_pro.py) +- [AIME-2024](../../nemo_rl/data/eval_datasets/aime2024.py): the corresponding `data.dataset_name` is `"aime2024"`. +- [AIME-2025](../../nemo_rl/data/eval_datasets/aime2025.py): the corresponding `data.dataset_name` is `"aime2025"`. +- [GPQA and GPQA-diamond](../../nemo_rl/data/eval_datasets/gpqa.py): the corresponding `data.dataset_name` values are `"gpqa"` and `"gpqa-diamond"`. +- [MATH and MATH-500](../../nemo_rl/data/eval_datasets/math.py): the corresponding `data.dataset_name` values are `"math"` and `"math500"`. +- [MMLU](../../nemo_rl/data/eval_datasets/mmlu.py): this also includes MMMLU (Multilingual MMLU), a total of 14 languages. When `data.dataset_name` is set to `mmlu`, the English version is used. If one wants to run evaluation on another language, `data.dataset_name` should be set to `mmlu_{language}` where `language` is one of the following 14 values: `["AR-XY", "BN-BD", "DE-DE", "ES-LA", "FR-FR", "HI-IN", "ID-ID", "IT-IT", "JA-JP", "KO-KR", "PT-BR", "ZH-CN", "SW-KE", "YO-NG"]`. +- [MMLU-Pro](../../nemo_rl/data/eval_datasets/mmlu_pro.py): the corresponding `data.dataset_name` is `"mmlu_pro"`. + +More details can be found in [load_eval_dataset](../../nemo_rl/data/eval_datasets/__init__.py). 
diff --git a/nemo_rl/data/eval_datasets/__init__.py b/nemo_rl/data/eval_datasets/__init__.py index e99a7c6af2..c735bf2e18 100644 --- a/nemo_rl/data/eval_datasets/__init__.py +++ b/nemo_rl/data/eval_datasets/__init__.py @@ -13,6 +13,7 @@ # limitations under the License. from nemo_rl.data.eval_datasets.aime2024 import AIME2024Dataset +from nemo_rl.data.eval_datasets.aime2025 import AIME2025Dataset from nemo_rl.data.eval_datasets.gpqa import GPQADataset from nemo_rl.data.eval_datasets.local_math_dataset import LocalMathDataset from nemo_rl.data.eval_datasets.math import MathDataset @@ -41,6 +42,11 @@ def load_eval_dataset(data_config): prompt_file=data_config["prompt_file"], system_prompt_file=data_config["system_prompt_file"], ) + elif dataset_name == "aime2025": + base_dataset = AIME2025Dataset( + prompt_file=data_config["prompt_file"], + system_prompt_file=data_config["system_prompt_file"], + ) elif dataset_name == "gpqa": base_dataset = GPQADataset( variant="main", @@ -88,6 +94,7 @@ def load_eval_dataset(data_config): __all__ = [ "AIME2024Dataset", + "AIME2025Dataset", "GPQADataset", "LocalMathDataset", "MathDataset", diff --git a/nemo_rl/data/eval_datasets/aime2025.py b/nemo_rl/data/eval_datasets/aime2025.py new file mode 100644 index 0000000000..43db69cf1b --- /dev/null +++ b/nemo_rl/data/eval_datasets/aime2025.py @@ -0,0 +1,46 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
class AIME2025Dataset:
    """AIME 2025 evaluation dataset.

    The constructor loads the ``"test"`` split of the ``"AIME2025-I"`` and
    ``"AIME2025-II"`` configurations of ``"opencompass/AIME2025"``,
    concatenates them, and maps every record through :meth:`_rekey`.

    Attributes:
        rekeyed_ds: Result of mapping ``_rekey`` over the concatenated
            dataset with the source column names passed as
            ``remove_columns``.
        task_spec: ``TaskDataSpec`` built with task name ``"aime2025"`` and
            the prompt files given to the constructor.
        processor: ``processors.math_data_processor``.
    """

    def __init__(
        self,
        prompt_file: Optional[str] = None,
        system_prompt_file: Optional[str] = None,
    ):
        """Load both configurations and prepare the task specification.

        Args:
            prompt_file: Forwarded unchanged to ``TaskDataSpec``.
            system_prompt_file: Forwarded unchanged to ``TaskDataSpec``.
        """
        # Load the configurations in the same order as before: I, then II.
        subsets = [
            load_dataset("opencompass/AIME2025", config_name, split="test")
            for config_name in ("AIME2025-I", "AIME2025-II")
        ]
        merged = concatenate_datasets(subsets)

        # Re-key each record and drop the source columns.
        self.rekeyed_ds = merged.map(
            self._rekey,
            remove_columns=merged.column_names,
        )
        self.task_spec = TaskDataSpec(
            task_name="aime2025",
            prompt_file=prompt_file,
            system_prompt_file=system_prompt_file,
        )
        self.processor = processors.math_data_processor

    def _rekey(self, data: dict[str, Any]):
        """Return a new record holding only ``problem`` and ``expected_answer``.

        Args:
            data: A source record; its ``"question"`` and ``"answer"``
                entries are read.

        Returns:
            A dict mapping ``"problem"`` to the question and
            ``"expected_answer"`` to the answer.
        """
        question = data["question"]
        answer = data["answer"]
        return {"problem": question, "expected_answer": answer}