NVIDIA-NeMo · Kipok · Feb 20, 2026 · Feb 19, 2026 · coderabbitai · Feb 19, 2026
diff --git a/nemo_skills/dataset/aime26/__init__.py b/nemo_skills/dataset/aime26/__init__.py
@@ -0,0 +1,18 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Default evaluation settings for this dataset; each of them can be overridden from the command line.
+DATASET_GROUP = "math"
+METRICS_TYPE = "math"
+GENERATION_ARGS = "++prompt_config=generic/math ++eval_type=math"
diff --git a/nemo_skills/dataset/aime26/prepare.py b/nemo_skills/dataset/aime26/prepare.py
@@ -0,0 +1,50 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import json
+from pathlib import Path
+
+from datasets import load_dataset
+from tqdm import tqdm
+
+
+def format_entry(entry):
+    """Build the output record (id, problem, expected_answer) for one dataset row; the answer is stringified."""
+    problem_id = f"aime26-{entry['problem_idx']}"
+    record = {"id": problem_id, "problem": entry["problem"]}
+    record["expected_answer"] = str(entry["answer"])
+    return record
+
+
+def write_data_to_file(output_file, data):
+    """Write every entry of data to output_file as one formatted JSON object per line (UTF-8 text)."""
+    with open(output_file, "wt", encoding="utf-8") as sink:
+        for row in tqdm(data, desc=f"Writing {output_file.name}"):
+            sink.write(json.dumps(format_entry(row), ensure_ascii=False) + "\n")
+
+
+def main(args):
+    """Load the MathArena/aime_2026 train split and write it to <args.split>.jsonl next to this script."""
+    problems = load_dataset("MathArena/aime_2026", split="train")
+    script_dir = Path(__file__).absolute().parent
+    script_dir.mkdir(exist_ok=True)
+    write_data_to_file(script_dir / f"{args.split}.jsonl", problems)
+
+
+if __name__ == "__main__":
+    # Command-line entry point: "test" is both the default and the only accepted value of --split.
+    cli = argparse.ArgumentParser()
+    cli.add_argument("--split", default="test", choices=("test",), help="Dataset split to process.")
+    main(cli.parse_args())