diff --git a/nemo_skills/dataset/hle/__init__.py b/nemo_skills/dataset/hle/__init__.py index 435975c67f..a76ad58e49 100644 --- a/nemo_skills/dataset/hle/__init__.py +++ b/nemo_skills/dataset/hle/__init__.py @@ -20,7 +20,13 @@ GENERATION_ARGS = "" EVAL_SPLIT = "text" -# some answers are not possible to compare symbolically, so have to use a judge model -# setting openai judge by default, but can be overriden from command line for a locally hosted model -JUDGE_PIPELINE_ARGS = {"model": "gpt-4.1", "server_type": "openai", "server_address": "https://api.openai.com/v1"} +# Some answers are not possible to compare symbolically, so have to use a judge model +# Setting openai judge by default, but can be overridden from command line for a locally hosted model +# Currently using o3-mini-20250131 which is used by the official leaderboard - https://agi.safe.ai/ +# To approximate the Artificial Analysis Index results, we suggest using gpt-4o - https://artificialanalysis.ai/methodology/intelligence-benchmarking#evaluation-suite-details +JUDGE_PIPELINE_ARGS = { + "model": "o3-mini-20250131", + "server_type": "openai", + "server_address": "https://api.openai.com/v1", +} JUDGE_ARGS = "++prompt_config=judge/hle ++generation_key=judgement ++add_generation_stats=False" diff --git a/nemo_skills/dataset/hmmt_feb25/prepare.py b/nemo_skills/dataset/hmmt_feb25/prepare.py index c5622dbbbf..3e21c8aeb7 100644 --- a/nemo_skills/dataset/hmmt_feb25/prepare.py +++ b/nemo_skills/dataset/hmmt_feb25/prepare.py @@ -22,7 +22,6 @@ def write_data_to_file(output_file, data): with open(output_file, "wt", encoding="utf-8") as fout: for entry in tqdm(data, desc=f"Writing {output_file.name}"): - print(entry) entry['expected_answer'] = entry.pop('answer') json.dump(entry, fout) fout.write("\n") diff --git a/nemo_skills/inference/model/azure.py b/nemo_skills/inference/model/azure.py index 2af5643856..bede01e570 100644 --- a/nemo_skills/inference/model/azure.py +++ b/nemo_skills/inference/model/azure.py @@ 
-13,6 +13,7 @@ # limitations under the License. import os + from .openai import OpenAIModel @@ -23,7 +24,7 @@ def __init__( self, *args, api_key: str | None = None, - api_version: str = "2024-02-15-preview", + api_version: str = "2024-12-01-preview", **kwargs, ): if api_key is None: