2 changes: 1 addition & 1 deletion docs/evaluation/natural-math.md
@@ -104,7 +104,7 @@ For example, in a benchmark's `__init__.py` file, you can add default LLM-as-judge

```bash
JUDGE_PIPELINE_ARGS = {
"model": "o3-mini-20250131",
"model": "o3-mini-2025-01-31",
"server_type": "openai",
"server_address": "https://api.openai.com/v1",
}
6 changes: 3 additions & 3 deletions docs/tutorials/posts/llama-nemotron-super-v1.5-evals.md
@@ -158,7 +158,7 @@ ns eval \
#### Command for HLE Eval (Reasoning on)


-For HLE, because symbolic comparison is not sufficient to determine the correctness of the output, we use the recommended `o3-mini-20250131` model as the judge. Note that this model is the default in Nemo-Skills, and we have just added this argument for illustration purposes. To evaluate for the [Artificial Analysis Index (AAI) setting, please use the gpt-4o-20240806 model as the judge](https://artificialanalysis.ai/methodology/intelligence-benchmarking#intelligence-index-evaluation-suite-overview){target="_blank"}.
+For HLE, because symbolic comparison is not sufficient to determine the correctness of the output, we use the recommended `o3-mini-2025-01-31` model as the judge. Note that this model is the default in Nemo-Skills, and we have just added this argument for illustration purposes. To evaluate for the [Artificial Analysis Index (AAI) setting, please use the gpt-4o-20240806 model as the judge](https://artificialanalysis.ai/methodology/intelligence-benchmarking#intelligence-index-evaluation-suite-overview){target="_blank"}.

Note that using any of the OpenAI hosted models requires `OPENAI_API_KEY`. Alternatively, a self-hosted judge model can also be used for judgement. For example, `--judge_model="/workspace/Llama-3_3-Nemotron-Super-49B-v1_5"` in tandem with `--judge_server_type="vllm" --judge_server_gpus 2` will use the `Llama-3_3-Nemotron-Super-49B-v1_5` itself as a judge.

@@ -171,7 +171,7 @@ ns eval \
--output_dir=/workspace/llama_nemotron_49b_1_5/ \
--benchmarks=hle:16 \
--server_gpus=2 \
-    --judge_model="o3-mini-20250131" \
+    --judge_model="o3-mini-2025-01-31" \
--extra_judge_args="++inference.tokens_to_generate=4096 ++max_concurrent_requests=8" \
++parse_reasoning=True \
++inference.tokens_to_generate=65536 \
@@ -434,7 +434,7 @@ ns eval \
--output_dir=/workspace/llama_nemotron_49b_1_5_reasoning_off/ \
--benchmarks=hle:16 \
--server_gpus=2 \
-    --judge_model="o3-mini-20250131" \
+    --judge_model="o3-mini-2025-01-31" \
--extra_judge_args="++inference.tokens_to_generate=4096 ++max_concurrent_requests=8" \
++inference.tokens_to_generate=65536 \
++inference.temperature=0.0 \
4 changes: 2 additions & 2 deletions docs/tutorials/posts/nemotron-nano-v2-evals.md
@@ -188,7 +188,7 @@ ns eval \

#### Command for HLE Eval

-For HLE, because symbolic comparison is not sufficient to determine the correctness of the output, we use the recommended `o3-mini-20250131` model as the judge. Note that this model is the default in Nemo-Skills, and we have just added this argument for illustration purposes. To evaluate for the [Artificial Analysis Index (AAI) setting, please use the gpt-4o-20240806 model as the judge](https://artificialanalysis.ai/methodology/intelligence-benchmarking#intelligence-index-evaluation-suite-overview){target="_blank"}.
+For HLE, because symbolic comparison is not sufficient to determine the correctness of the output, we use the recommended `o3-mini-2025-01-31` model as the judge. Note that this model is the default in Nemo-Skills, and we have just added this argument for illustration purposes. To evaluate for the [Artificial Analysis Index (AAI) setting, please use the gpt-4o-20240806 model as the judge](https://artificialanalysis.ai/methodology/intelligence-benchmarking#intelligence-index-evaluation-suite-overview){target="_blank"}.

Note that using any of the OpenAI hosted models requires `OPENAI_API_KEY`. Alternatively, a self-hosted judge model can also be used for judgement. For example, `--judge_model="/workspace/NVIDIA-Nemotron-Nano-9B-v2"` in tandem with `--judge_server_type="vllm" --judge_server_gpus 1` will use the `NVIDIA-Nemotron-Nano-9B-v2` itself as a judge.

@@ -202,7 +202,7 @@ ns eval \
--server_type=vllm \
--server_gpus=1 \
--server_args="--mamba_ssm_cache_dtype float32 " \
-    --judge_model="o3-mini-20250131" \
+    --judge_model="o3-mini-2025-01-31" \
--extra_judge_args="++inference.tokens_to_generate=4096 ++max_concurrent_requests=8" \
++parse_reasoning=True \
++inference.tokens_to_generate=32768 \
4 changes: 2 additions & 2 deletions nemo_skills/dataset/hle/__init__.py
@@ -20,10 +20,10 @@

# Some answers are not possible to compare symbolically, so have to use a judge model
# Setting openai judge by default, but can be overridden from command line for a locally hosted model
-# Currently using o3-mini-20250131 which is used by the official leaderboard - https://agi.safe.ai/
+# Currently using o3-mini-2025-01-31 which is used by the official leaderboard - https://agi.safe.ai/
# To approximate the Artificial Analysis Index results, we suggest using gpt-4o - https://artificialanalysis.ai/methodology/intelligence-benchmarking#evaluation-suite-details
JUDGE_PIPELINE_ARGS = {
"model": "o3-mini-20250131",
"model": "o3-mini-2025-01-31",
"server_type": "openai",
"server_address": "https://api.openai.com/v1",
}
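As the comment in this file notes, the OpenAI judge is only a default and can be overridden for a locally hosted model. A minimal sketch of what such an override could look like, assuming a self-hosted vLLM judge; the model path is taken from the tutorial docs in this PR, while the local server address is an illustrative placeholder, not something this PR specifies:

```python
# Hypothetical override: point the judge at a self-hosted vLLM server
# instead of the OpenAI API. The address below is an assumed local endpoint.
JUDGE_PIPELINE_ARGS = {
    "model": "/workspace/Llama-3_3-Nemotron-Super-49B-v1_5",  # local model path
    "server_type": "vllm",  # self-hosted judge, no OPENAI_API_KEY needed
    "server_address": "http://localhost:5000/v1",  # assumed vLLM endpoint
}
```

With a configuration like this, the same flags shown in the tutorials (`--judge_server_type="vllm" --judge_server_gpus 2`) would host the judge model alongside the evaluation.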
4 changes: 2 additions & 2 deletions nemo_skills/dataset/simpleqa/__init__.py
@@ -20,10 +20,10 @@

# SimpleQA requires judge model for evaluating factual accuracy
# Setting openai judge by default, but can be overridden from command line for a locally hosted model
-# Using o3-mini-20250131 as recommended for factual evaluation tasks
+# Using o3-mini-2025-01-31 as recommended for factual evaluation tasks

JUDGE_PIPELINE_ARGS = {
"model": "o3-mini-20250131",
"model": "o3-mini-2025-01-31",
"server_type": "openai",
"server_address": "https://api.openai.com/v1",
}