-
Notifications
You must be signed in to change notification settings - Fork 163
first version of 3 imo-bench datasets #1047
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
a9a7f78
4f7bd0e
1db1bda
663451c
2988a90
d28fe3c
75cea4d
dd6fbc5
d633726
5bc260e
64e1a49
b378959
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,26 @@ | ||
| # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. | ||
| # | ||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||
| # you may not use this file except in compliance with the License. | ||
| # You may obtain a copy of the License at | ||
| # | ||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, software | ||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
|
|
||
| DATASET_GROUP = "math" | ||
| METRICS_TYPE = "math" | ||
| GENERATION_ARGS = "++prompt_config=generic/math ++eval_type=math" | ||
| # Judge configuration: Use the AnswerAutoGrader prompt. | ||
| # Recommended model: Gemini 2.5 Pro (or similar strong reasoner) | ||
| JUDGE_ARGS = "++prompt_config=judge/imo_answerbench ++generation_key=judgement ++inference.reasoning_effort=dynamic" | ||
|
|
||
| JUDGE_PIPELINE_ARGS = { | ||
| "generation_type": "math_judge", | ||
| "model": "gemini-2.5-pro", | ||
| "server_type": "gemini", | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,43 @@ | ||
| # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. | ||
| # | ||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||
| # you may not use this file except in compliance with the License. | ||
| # You may obtain a copy of the License at | ||
| # | ||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, software | ||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
|
|
||
| import csv | ||
| import io | ||
| import json | ||
| import urllib.request | ||
| from pathlib import Path | ||
|
|
||
| if __name__ == "__main__": | ||
| data_dir = Path(__file__).absolute().parent | ||
| data_dir.mkdir(exist_ok=True) | ||
|
|
||
| base_url = "https://raw.githubusercontent.com/google-deepmind/superhuman/c1ee02e03d4cdb2ab21cd01ac927d895f5287fc8/imobench" | ||
| source_url = f"{base_url}/answerbench.csv" | ||
| output_file = data_dir / "test.jsonl" | ||
|
|
||
| with urllib.request.urlopen(source_url, timeout=30) as response: | ||
| content = response.read().decode("utf-8") | ||
| reader = csv.DictReader(io.StringIO(content)) | ||
|
|
||
| with open(output_file, "w", encoding="utf-8") as out: | ||
| for row in reader: | ||
| entry = { | ||
| "problem_id": row["Problem ID"], | ||
| "problem": row["Problem"], | ||
| "expected_answer": row["Short Answer"], | ||
| "category": row["Category"], | ||
| "subcategory": row["Subcategory"], | ||
| "source": row["Source"], | ||
| } | ||
| out.write(json.dumps(entry) + "\n") | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,18 @@ | ||
| # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. | ||
| # | ||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||
| # you may not use this file except in compliance with the License. | ||
| # You may obtain a copy of the License at | ||
| # | ||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, software | ||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
|
|
||
| DATASET_GROUP = "judge" | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We have an answer-judgement data type for binary judgement; we could potentially expand it, since imo-grading-bench has 4 different possible states (or maybe we could binarize the imo-proof-bench instead).
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, we should unify. Let's create an issue for this, but we don't need to do it in this PR. |
||
| METRICS_TYPE = "gradingbench" | ||
| # This dataset is for evaluating grading ability - the model must grade proofs. | ||
| GENERATION_ARGS = "++prompt_config=judge/imo_gradingbench ++eval_type=math" | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,48 @@ | ||
| # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. | ||
| # | ||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||
| # you may not use this file except in compliance with the License. | ||
| # You may obtain a copy of the License at | ||
| # | ||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, software | ||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
|
|
||
| import csv | ||
| import io | ||
| import json | ||
| import urllib.request | ||
| from pathlib import Path | ||
|
|
||
| if __name__ == "__main__": | ||
| data_dir = Path(__file__).absolute().parent | ||
| data_dir.mkdir(exist_ok=True) | ||
|
|
||
| base_url = "https://raw.githubusercontent.com/google-deepmind/superhuman/c1ee02e03d4cdb2ab21cd01ac927d895f5287fc8/imobench" | ||
| source_url = f"{base_url}/gradingbench.csv" | ||
| output_file = data_dir / "test.jsonl" | ||
|
|
||
| with urllib.request.urlopen(source_url, timeout=30) as response: | ||
| content = response.read().decode("utf-8") | ||
| reader = csv.DictReader(io.StringIO(content)) | ||
|
|
||
| with open(output_file, "w", encoding="utf-8") as out: | ||
| for row in reader: | ||
| entry = { | ||
| "grading_id": row["Grading ID"], | ||
| "problem_id": row["Problem ID"], | ||
| "problem_statement": row["Problem"], | ||
| "reference_solution": row["Solution"], | ||
| "rubric": row["Grading guidelines"], | ||
| "proof": row["Response"], | ||
| "points": row["Points"], | ||
| "reward": row["Reward"], | ||
| "source": row["Problem Source"], | ||
| # We set 'expected_answer' to the reward/points for evaluation | ||
| "expected_answer": row["Reward"], | ||
| } | ||
| out.write(json.dumps(entry) + "\n") | ||
|
Comment on lines
+29
to
+48
Contributor
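There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The `urllib.request.urlopen()` call and CSV parsing lack error handling. Consider adding try-except blocks similar to the pattern used in `nemo_skills/dataset/mmlu-prox/prepare.py`. |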
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,27 @@ | ||
| # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. | ||
| # | ||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||
| # you may not use this file except in compliance with the License. | ||
| # You may obtain a copy of the License at | ||
| # | ||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, software | ||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
|
|
||
| DATASET_GROUP = "math" | ||
| # Using judge metrics as we need an LLM to evaluate the proof | ||
| METRICS_TYPE = "math" | ||
| GENERATION_ARGS = "++prompt_config=generic/math ++eval_type=math" | ||
| # Judge configuration: Use the ProofAutoGrader prompt. | ||
| # Recommended model: Gemini 2.5 Pro (or similar strong reasoner) | ||
| JUDGE_ARGS = "++prompt_config=judge/imo_proofbench ++generation_key=judgement ++inference.reasoning_effort=dynamic" | ||
|
|
||
| JUDGE_PIPELINE_ARGS = { | ||
| "generation_type": "math_judge", | ||
| "model": "gemini-2.5-pro", | ||
| "server_type": "gemini", | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,45 @@ | ||
| # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. | ||
| # | ||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||
| # you may not use this file except in compliance with the License. | ||
| # You may obtain a copy of the License at | ||
| # | ||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, software | ||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
|
|
||
| import csv | ||
| import io | ||
| import json | ||
| import urllib.request | ||
| from pathlib import Path | ||
|
|
||
| if __name__ == "__main__": | ||
| data_dir = Path(__file__).absolute().parent | ||
| data_dir.mkdir(exist_ok=True) | ||
|
|
||
| base_url = "https://raw.githubusercontent.com/google-deepmind/superhuman/c1ee02e03d4cdb2ab21cd01ac927d895f5287fc8/imobench" | ||
| source_url = f"{base_url}/proofbench.csv" | ||
| output_file = data_dir / "test.jsonl" | ||
|
|
||
| with urllib.request.urlopen(source_url, timeout=30) as response: | ||
| content = response.read().decode("utf-8") | ||
| reader = csv.DictReader(io.StringIO(content)) | ||
|
|
||
| with open(output_file, "w", encoding="utf-8") as out: | ||
| for row in reader: | ||
| entry = { | ||
| "problem_id": row["Problem ID"], | ||
| "problem": row["Problem"], | ||
| "reference_solution": row["Solution"], | ||
| "rubric": row["Grading guidelines"], | ||
| "category": row["Category"], | ||
| "level": row["Level"], | ||
| "expected_answer": row["Short Answer"], | ||
| "source": row["Source"], | ||
| } | ||
| out.write(json.dumps(entry) + "\n") | ||
|
Comment on lines
+29
to
+45
Contributor
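There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The `urllib.request.urlopen()` call and CSV parsing lack error handling. Consider adding try-except blocks similar to the pattern used in `nemo_skills/dataset/mmlu-prox/prepare.py`. |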
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The `urllib.request.urlopen()` call and CSV parsing lack error handling. Network failures, HTTP errors, timeouts, or missing CSV columns will cause the script to crash with an unhelpful error message.
Consider adding try-except blocks to handle potential errors gracefully.
This matches the error handling pattern used in other dataset preparation scripts like `nemo_skills/dataset/mmlu-prox/prepare.py`.