NVIDIA-NeMo · shtoshni · Aug 14, 2025 · Aug 14, 2025 · Aug 14, 2025
diff --git a/nemo_skills/dataset/hle/__init__.py b/nemo_skills/dataset/hle/__init__.py
@@ -20,7 +20,13 @@
 GENERATION_ARGS = ""
 EVAL_SPLIT = "text"
 
-# some answers are not possible to compare symbolically, so have to use a judge model
-# setting openai judge by default, but can be overriden from command line for a locally hosted model
-JUDGE_PIPELINE_ARGS = {"model": "gpt-4.1", "server_type": "openai", "server_address": "https://api.openai.com/v1"}
+# Some answers are not possible to compare symbolically, so have to use a judge model
+# Setting openai judge by default, but can be overridden from command line for a locally hosted model
+# Currently using o3-mini-20250131 which is used by the official leaderboard - https://agi.safe.ai/
+# To approximate the Artificial Analysis Index results, we suggest using gpt-4o - https://artificialanalysis.ai/methodology/intelligence-benchmarking#evaluation-suite-details
+JUDGE_PIPELINE_ARGS = {
+    "model": "o3-mini-20250131",
+    "server_type": "openai",
+    "server_address": "https://api.openai.com/v1",
+}
 JUDGE_ARGS = "++prompt_config=judge/hle ++generation_key=judgement ++add_generation_stats=False"
diff --git a/nemo_skills/dataset/hmmt_feb25/prepare.py b/nemo_skills/dataset/hmmt_feb25/prepare.py
@@ -22,7 +22,6 @@
 def write_data_to_file(output_file, data):
     with open(output_file, "wt", encoding="utf-8") as fout:
         for entry in tqdm(data, desc=f"Writing {output_file.name}"):
-            print(entry)
             entry['expected_answer'] = entry.pop('answer')
             json.dump(entry, fout)
             fout.write("\n")

diff --git a/nemo_skills/inference/model/azure.py b/nemo_skills/inference/model/azure.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import os
+
 from .openai import OpenAIModel
 
 
@@ -23,7 +24,7 @@ def __init__(
         self,
         *args,
         api_key: str | None = None,
-        api_version: str = "2024-02-15-preview",
+        api_version: str = "2024-12-01-preview",
         **kwargs,
     ):
         if api_key is None: