From 9bc4a9cb9a12227e3e50491cbd4af3ff454625d9 Mon Sep 17 00:00:00 2001
From: bzantium <ryumin93@gmail.com>
Date: Wed, 31 Dec 2025 09:00:46 +0900
Subject: [PATCH] Fix OpenAI judge model name: o3-mini-20250131 -> o3-mini-2025-01-31

Signed-off-by: bzantium <ryumin93@gmail.com>
---
 docs/evaluation/natural-math.md                         | 2 +-
 docs/tutorials/posts/llama-nemotron-super-v1.5-evals.md | 6 +++---
 docs/tutorials/posts/nemotron-nano-v2-evals.md          | 4 ++--
 nemo_skills/dataset/hle/__init__.py                     | 4 ++--
 nemo_skills/dataset/simpleqa/__init__.py                | 4 ++--
 5 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/docs/evaluation/natural-math.md b/docs/evaluation/natural-math.md
index 9d0aac2d4e..c7a7cf5f58 100644
--- a/docs/evaluation/natural-math.md
+++ b/docs/evaluation/natural-math.md
@@ -104,7 +104,7 @@ For example, in a benchmark's `__init__.py` file, you can add default LLM-as-jud
 
 ```bash
 JUDGE_PIPELINE_ARGS = {
-    "model": "o3-mini-20250131",
+    "model": "o3-mini-2025-01-31",
     "server_type": "openai",
     "server_address": "https://api.openai.com/v1",
 }
diff --git a/docs/tutorials/posts/llama-nemotron-super-v1.5-evals.md b/docs/tutorials/posts/llama-nemotron-super-v1.5-evals.md
index 536b97a251..b447c61f83 100644
--- a/docs/tutorials/posts/llama-nemotron-super-v1.5-evals.md
+++ b/docs/tutorials/posts/llama-nemotron-super-v1.5-evals.md
@@ -158,7 +158,7 @@ ns eval \
 #### Command for HLE Eval (Reasoning on)
 
 
-For HLE, because symbolic comparison is not sufficient to determine the correctness of the output, we use the recommended `o3-mini-20250131` model as the judge. Note that this model is the default in Nemo-Skills, and we have just added this argument for illustration purposes. To evaluate for the [Artificial Analysis Index (AAI) setting, please use the gpt-4o-20240806 model as the judge](https://artificialanalysis.ai/methodology/intelligence-benchmarking#intelligence-index-evaluation-suite-overview){target="_blank"}.
+For HLE, because symbolic comparison is not sufficient to determine the correctness of the output, we use the recommended `o3-mini-2025-01-31` model as the judge. Note that this model is the default in Nemo-Skills, and we have just added this argument for illustration purposes. To evaluate for the [Artificial Analysis Index (AAI) setting, please use the gpt-4o-2024-08-06 model as the judge](https://artificialanalysis.ai/methodology/intelligence-benchmarking#intelligence-index-evaluation-suite-overview){target="_blank"}.
 
 Note that using any of the OpenAI hosted models requires `OPENAI_API_KEY`. Alternatively, a self-hosted judge model can also be used for judgement. For example, `--judge_model="/workspace/Llama-3_3-Nemotron-Super-49B-v1_5"`  in tandem with `--judge_server_type="vllm" --judge_server_gpus 2` will use the `Llama-3_3-Nemotron-Super-49B-v1_5` itself as a judge.
 
@@ -171,7 +171,7 @@ ns eval \
     --output_dir=/workspace/llama_nemotron_49b_1_5/ \
     --benchmarks=hle:16 \
     --server_gpus=2 \
-    --judge_model="o3-mini-20250131" \
+    --judge_model="o3-mini-2025-01-31" \
     --extra_judge_args="++inference.tokens_to_generate=4096 ++max_concurrent_requests=8" \
     ++parse_reasoning=True \
     ++inference.tokens_to_generate=65536 \
@@ -434,7 +434,7 @@ ns eval \
     --output_dir=/workspace/llama_nemotron_49b_1_5_reasoning_off/ \
     --benchmarks=hle:16 \
     --server_gpus=2 \
-    --judge_model="o3-mini-20250131" \
+    --judge_model="o3-mini-2025-01-31" \
     --extra_judge_args="++inference.tokens_to_generate=4096 ++max_concurrent_requests=8" \
     ++inference.tokens_to_generate=65536 \
     ++inference.temperature=0.0 \
diff --git a/docs/tutorials/posts/nemotron-nano-v2-evals.md b/docs/tutorials/posts/nemotron-nano-v2-evals.md
index 45fa9bb1c7..0e64342a8e 100644
--- a/docs/tutorials/posts/nemotron-nano-v2-evals.md
+++ b/docs/tutorials/posts/nemotron-nano-v2-evals.md
@@ -188,7 +188,7 @@ ns eval \
 
 #### Command for HLE Eval
 
-For HLE, because symbolic comparison is not sufficient to determine the correctness of the output, we use the recommended `o3-mini-20250131` model as the judge. Note that this model is the default in Nemo-Skills, and we have just added this argument for illustration purposes. To evaluate for the [Artificial Analysis Index (AAI) setting, please use the gpt-4o-20240806 model as the judge](https://artificialanalysis.ai/methodology/intelligence-benchmarking#intelligence-index-evaluation-suite-overview){target="_blank"}.
+For HLE, because symbolic comparison is not sufficient to determine the correctness of the output, we use the recommended `o3-mini-2025-01-31` model as the judge. Note that this model is the default in Nemo-Skills, and we have just added this argument for illustration purposes. To evaluate for the [Artificial Analysis Index (AAI) setting, please use the gpt-4o-2024-08-06 model as the judge](https://artificialanalysis.ai/methodology/intelligence-benchmarking#intelligence-index-evaluation-suite-overview){target="_blank"}.
 
 Note that using any of the OpenAI hosted models requires `OPENAI_API_KEY`. Alternatively, a self-hosted judge model can also be used for judgement. For example, `--judge_model="/workspace/NVIDIA-Nemotron-Nano-9B-v2"`  in tandem with `--judge_server_type="vllm" --judge_server_gpus 1` will use the `NVIDIA-Nemotron-Nano-9B-v2` itself as a judge.
 
@@ -202,7 +202,7 @@ ns eval \
     --server_type=vllm \
     --server_gpus=1 \
     --server_args="--mamba_ssm_cache_dtype float32 " \
-    --judge_model="o3-mini-20250131" \
+    --judge_model="o3-mini-2025-01-31" \
     --extra_judge_args="++inference.tokens_to_generate=4096 ++max_concurrent_requests=8" \
     ++parse_reasoning=True \
     ++inference.tokens_to_generate=32768 \
diff --git a/nemo_skills/dataset/hle/__init__.py b/nemo_skills/dataset/hle/__init__.py
index 51db80829d..d805fbf3e7 100644
--- a/nemo_skills/dataset/hle/__init__.py
+++ b/nemo_skills/dataset/hle/__init__.py
@@ -20,10 +20,10 @@
 
 # Some answers are not possible to compare symbolically, so have to use a judge model
 # Setting openai judge by default, but can be overriden from command line for a locally hosted model
-# Currently using o3-mini-20250131 which is used by the official leaderboard - https://agi.safe.ai/
+# Currently using o3-mini-2025-01-31 which is used by the official leaderboard - https://agi.safe.ai/
 # To approximate the Artificial Analysis Index results, we suggest using gpt-4o - https://artificialanalysis.ai/methodology/intelligence-benchmarking#evaluation-suite-details
 JUDGE_PIPELINE_ARGS = {
-    "model": "o3-mini-20250131",
+    "model": "o3-mini-2025-01-31",
     "server_type": "openai",
     "server_address": "https://api.openai.com/v1",
 }
diff --git a/nemo_skills/dataset/simpleqa/__init__.py b/nemo_skills/dataset/simpleqa/__init__.py
index d3829e4281..c1e7d82158 100644
--- a/nemo_skills/dataset/simpleqa/__init__.py
+++ b/nemo_skills/dataset/simpleqa/__init__.py
@@ -20,10 +20,10 @@
 
 # SimpleQA requires judge model for evaluating factual accuracy
 # Setting openai judge by default, but can be overridden from command line for a locally hosted model
-# Using o3-mini-20250131 as recommended for factual evaluation tasks
+# Using o3-mini-2025-01-31 as recommended for factual evaluation tasks
 
 JUDGE_PIPELINE_ARGS = {
-    "model": "o3-mini-20250131",
+    "model": "o3-mini-2025-01-31",
     "server_type": "openai",
     "server_address": "https://api.openai.com/v1",
 }