Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 8 additions & 5 deletions docs/guides/eval.md
Original file line number Diff line number Diff line change
Expand Up @@ -89,9 +89,12 @@ score=0.1000 (3.0/30)

## List of currently supported benchmarks

- [AIME-2024](../../nemo_rl/data/eval_datasets/aime2024.py)
- [GPQA and GPQA-diamond](../../nemo_rl/data/eval_datasets/gpqa.py)
- [MATH and MATH-500](../../nemo_rl/data/eval_datasets/math.py)
- [MMLU](../../nemo_rl/data/eval_datasets/mmlu.py): this also includes MMMLU (Multilingual MMLU), a total of 14 languages.
- [MMLU-Pro](../../nemo_rl/data/eval_datasets/mmlu_pro.py)
- [AIME-2024](../../nemo_rl/data/eval_datasets/aime2024.py): the corresponding `data.dataset_name` is `"aime2024"`.
- [AIME-2025](../../nemo_rl/data/eval_datasets/aime2025.py): the corresponding `data.dataset_name` is `"aime2025"`.
- [GPQA and GPQA-diamond](../../nemo_rl/data/eval_datasets/gpqa.py): the corresponding `data.dataset_name` values are `"gpqa"` and `"gpqa-diamond"`.
- [MATH and MATH-500](../../nemo_rl/data/eval_datasets/math.py): the corresponding `data.dataset_name` values are `"math"` and `"math500"`.
- [MMLU](../../nemo_rl/data/eval_datasets/mmlu.py): this also includes MMMLU (Multilingual MMLU), a total of 14 languages. When `data.dataset_name` is set to `"mmlu"`, the English version is used. To run evaluation on another language, set `data.dataset_name` to `mmlu_{language}`, where `language` is one of the following 14 values: `["AR-XY", "BN-BD", "DE-DE", "ES-LA", "FR-FR", "HI-IN", "ID-ID", "IT-IT", "JA-JP", "KO-KR", "PT-BR", "ZH-CN", "SW-KE", "YO-NG"]`.
- [MMLU-Pro](../../nemo_rl/data/eval_datasets/mmlu_pro.py): the corresponding `data.dataset_name` is `"mmlu_pro"`.

More details can be found in [load_eval_dataset](../../nemo_rl/data/eval_datasets/__init__.py).

7 changes: 7 additions & 0 deletions nemo_rl/data/eval_datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# limitations under the License.

from nemo_rl.data.eval_datasets.aime2024 import AIME2024Dataset
from nemo_rl.data.eval_datasets.aime2025 import AIME2025Dataset
from nemo_rl.data.eval_datasets.gpqa import GPQADataset
from nemo_rl.data.eval_datasets.local_math_dataset import LocalMathDataset
from nemo_rl.data.eval_datasets.math import MathDataset
Expand Down Expand Up @@ -41,6 +42,11 @@ def load_eval_dataset(data_config):
prompt_file=data_config["prompt_file"],
system_prompt_file=data_config["system_prompt_file"],
)
elif dataset_name == "aime2025":
base_dataset = AIME2025Dataset(
prompt_file=data_config["prompt_file"],
system_prompt_file=data_config["system_prompt_file"],
)
elif dataset_name == "gpqa":
base_dataset = GPQADataset(
variant="main",
Expand Down Expand Up @@ -88,6 +94,7 @@ def load_eval_dataset(data_config):

__all__ = [
"AIME2024Dataset",
"AIME2025Dataset",
"GPQADataset",
"LocalMathDataset",
"MathDataset",
Expand Down
46 changes: 46 additions & 0 deletions nemo_rl/data/eval_datasets/aime2025.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""AIME 2025 dataset."""

from typing import Any, Optional

from datasets import concatenate_datasets, load_dataset

from nemo_rl.data import processors
from nemo_rl.data.interfaces import TaskDataSpec


class AIME2025Dataset:
    """AIME 2025 evaluation data loaded from ``opencompass/AIME2025``.

    Attributes set by ``__init__``:
        rekeyed_ds: The ``AIME2025-I`` and ``AIME2025-II`` test splits,
            concatenated in that order, with every row reduced to the
            ``problem`` and ``expected_answer`` columns.
        task_spec: A ``TaskDataSpec`` named ``"aime2025"`` that carries the
            prompt files given to the constructor.
        processor: ``processors.math_data_processor``.
    """

    def __init__(
        self,
        prompt_file: Optional[str] = None,
        system_prompt_file: Optional[str] = None,
    ):
        """Load both AIME 2025 parts and prepare them for evaluation.

        Args:
            prompt_file: Forwarded unchanged to ``TaskDataSpec``.
            system_prompt_file: Forwarded unchanged to ``TaskDataSpec``.
        """
        # The dataset is published as two configs (part I and part II);
        # fetch the test split of each and stack them, part I first.
        parts = [
            load_dataset("opencompass/AIME2025", subset, split="test")
            for subset in ("AIME2025-I", "AIME2025-II")
        ]
        combined = concatenate_datasets(parts)

        # Drop every original column so only the rekeyed fields remain.
        self.rekeyed_ds = combined.map(
            self._rekey,
            remove_columns=combined.column_names,
        )
        self.task_spec = TaskDataSpec(
            task_name="aime2025",
            prompt_file=prompt_file,
            system_prompt_file=system_prompt_file,
        )
        self.processor = processors.math_data_processor

    def _rekey(self, data: dict[str, Any]):
        """Map one source row onto the ``problem`` / ``expected_answer`` keys.

        Args:
            data: A row providing ``question`` and ``answer`` entries.

        Returns:
            A dict with ``problem`` taken from ``data["question"]`` and
            ``expected_answer`` taken from ``data["answer"]``.
        """
        question = data["question"]
        answer = data["answer"]
        return {"problem": question, "expected_answer": answer}
1 change: 1 addition & 0 deletions pyrefly.toml
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ project-includes = [
"nemo_rl/data/datasets.py",
"nemo_rl/data/eval_datasets/__init__.py",
"nemo_rl/data/eval_datasets/aime2024.py",
"nemo_rl/data/eval_datasets/aime2025.py",
"nemo_rl/data/eval_datasets/gpqa.py",
"nemo_rl/data/eval_datasets/local_math_dataset.py",
"nemo_rl/data/eval_datasets/math.py",
Expand Down