From 9bc4a9cb9a12227e3e50491cbd4af3ff454625d9 Mon Sep 17 00:00:00 2001 From: bzantium Date: Wed, 31 Dec 2025 09:00:46 +0900 Subject: [PATCH] o3-mini-20250131 -> o3-mini-2025-01-31 Signed-off-by: bzantium --- docs/evaluation/natural-math.md | 2 +- docs/tutorials/posts/llama-nemotron-super-v1.5-evals.md | 6 +++--- docs/tutorials/posts/nemotron-nano-v2-evals.md | 4 ++-- nemo_skills/dataset/hle/__init__.py | 4 ++-- nemo_skills/dataset/simpleqa/__init__.py | 4 ++-- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/docs/evaluation/natural-math.md b/docs/evaluation/natural-math.md index 9d0aac2d4e..c7a7cf5f58 100644 --- a/docs/evaluation/natural-math.md +++ b/docs/evaluation/natural-math.md @@ -104,7 +104,7 @@ For example, in a benchmark's `__init__.py` file, you can add default LLM-as-jud ```bash JUDGE_PIPELINE_ARGS = { - "model": "o3-mini-20250131", + "model": "o3-mini-2025-01-31", "server_type": "openai", "server_address": "https://api.openai.com/v1", } diff --git a/docs/tutorials/posts/llama-nemotron-super-v1.5-evals.md b/docs/tutorials/posts/llama-nemotron-super-v1.5-evals.md index 536b97a251..b447c61f83 100644 --- a/docs/tutorials/posts/llama-nemotron-super-v1.5-evals.md +++ b/docs/tutorials/posts/llama-nemotron-super-v1.5-evals.md @@ -158,7 +158,7 @@ ns eval \ #### Command for HLE Eval (Reasoning on) -For HLE, because symbolic comparison is not sufficient to determine the correctness of the output, we use the recommended `o3-mini-20250131` model as the judge. Note that this model is the default in Nemo-Skills, and we have just added this argument for illustration purposes. To evaluate for the [Artificial Analysis Index (AAI) setting, please use the gpt-4o-20240806 model as the judge](https://artificialanalysis.ai/methodology/intelligence-benchmarking#intelligence-index-evaluation-suite-overview){target="_blank"}. 
+For HLE, because symbolic comparison is not sufficient to determine the correctness of the output, we use the recommended `o3-mini-2025-01-31` model as the judge. Note that this model is the default in Nemo-Skills, and we have just added this argument for illustration purposes. To evaluate for the [Artificial Analysis Index (AAI) setting, please use the gpt-4o-2024-08-06 model as the judge](https://artificialanalysis.ai/methodology/intelligence-benchmarking#intelligence-index-evaluation-suite-overview){target="_blank"}. Note that using any of the OpenAI hosted models requires `OPENAI_API_KEY`. Alternatively, a self-hosted judge model can also be used for judgement. For example, `--judge_model="/workspace/Llama-3_3-Nemotron-Super-49B-v1_5"` in tandem with `--judge_server_type="vllm" --judge_server_gpus 2` will use the `Llama-3_3-Nemotron-Super-49B-v1_5` itself as a judge. @@ -171,7 +171,7 @@ ns eval \ --output_dir=/workspace/llama_nemotron_49b_1_5/ \ --benchmarks=hle:16 \ --server_gpus=2 \ - --judge_model="o3-mini-20250131" \ + --judge_model="o3-mini-2025-01-31" \ --extra_judge_args="++inference.tokens_to_generate=4096 ++max_concurrent_requests=8" \ ++parse_reasoning=True \ ++inference.tokens_to_generate=65536 \ @@ -434,7 +434,7 @@ ns eval \ --output_dir=/workspace/llama_nemotron_49b_1_5_reasoning_off/ \ --benchmarks=hle:16 \ --server_gpus=2 \ - --judge_model="o3-mini-20250131" \ + --judge_model="o3-mini-2025-01-31" \ --extra_judge_args="++inference.tokens_to_generate=4096 ++max_concurrent_requests=8" \ ++inference.tokens_to_generate=65536 \ ++inference.temperature=0.0 \ diff --git a/docs/tutorials/posts/nemotron-nano-v2-evals.md b/docs/tutorials/posts/nemotron-nano-v2-evals.md index 45fa9bb1c7..0e64342a8e 100644 --- a/docs/tutorials/posts/nemotron-nano-v2-evals.md +++ b/docs/tutorials/posts/nemotron-nano-v2-evals.md @@ -188,7 +188,7 @@ ns eval \ #### Command for HLE Eval -For HLE, because symbolic comparison is not sufficient to determine the correctness of the 
output, we use the recommended `o3-mini-20250131` model as the judge. Note that this model is the default in Nemo-Skills, and we have just added this argument for illustration purposes. To evaluate for the [Artificial Analysis Index (AAI) setting, please use the gpt-4o-20240806 model as the judge](https://artificialanalysis.ai/methodology/intelligence-benchmarking#intelligence-index-evaluation-suite-overview){target="_blank"}. +For HLE, because symbolic comparison is not sufficient to determine the correctness of the output, we use the recommended `o3-mini-2025-01-31` model as the judge. Note that this model is the default in Nemo-Skills, and we have just added this argument for illustration purposes. To evaluate for the [Artificial Analysis Index (AAI) setting, please use the gpt-4o-2024-08-06 model as the judge](https://artificialanalysis.ai/methodology/intelligence-benchmarking#intelligence-index-evaluation-suite-overview){target="_blank"}. Note that using any of the OpenAI hosted models requires `OPENAI_API_KEY`. Alternatively, a self-hosted judge model can also be used for judgement. For example, `--judge_model="/workspace/NVIDIA-Nemotron-Nano-9B-v2"` in tandem with `--judge_server_type="vllm" --judge_server_gpus 1` will use the `NVIDIA-Nemotron-Nano-9B-v2` itself as a judge. 
@@ -202,7 +202,7 @@ ns eval \ --server_type=vllm \ --server_gpus=1 \ --server_args="--mamba_ssm_cache_dtype float32 " \ - --judge_model="o3-mini-20250131" \ + --judge_model="o3-mini-2025-01-31" \ --extra_judge_args="++inference.tokens_to_generate=4096 ++max_concurrent_requests=8" \ ++parse_reasoning=True \ ++inference.tokens_to_generate=32768 \ diff --git a/nemo_skills/dataset/hle/__init__.py b/nemo_skills/dataset/hle/__init__.py index 51db80829d..d805fbf3e7 100644 --- a/nemo_skills/dataset/hle/__init__.py +++ b/nemo_skills/dataset/hle/__init__.py @@ -20,10 +20,10 @@ # Some answers are not possible to compare symbolically, so have to use a judge model # Setting openai judge by default, but can be overriden from command line for a locally hosted model -# Currently using o3-mini-20250131 which is used by the official leaderboard - https://agi.safe.ai/ +# Currently using o3-mini-2025-01-31 which is used by the official leaderboard - https://agi.safe.ai/ # To approximate the Artificial Analysis Index results, we suggest using gpt-4o - https://artificialanalysis.ai/methodology/intelligence-benchmarking#evaluation-suite-details JUDGE_PIPELINE_ARGS = { - "model": "o3-mini-20250131", + "model": "o3-mini-2025-01-31", "server_type": "openai", "server_address": "https://api.openai.com/v1", } diff --git a/nemo_skills/dataset/simpleqa/__init__.py b/nemo_skills/dataset/simpleqa/__init__.py index d3829e4281..c1e7d82158 100644 --- a/nemo_skills/dataset/simpleqa/__init__.py +++ b/nemo_skills/dataset/simpleqa/__init__.py @@ -20,10 +20,10 @@ # SimpleQA requires judge model for evaluating factual accuracy # Setting openai judge by default, but can be overridden from command line for a locally hosted model -# Using o3-mini-20250131 as recommended for factual evaluation tasks +# Using o3-mini-2025-01-31 as recommended for factual evaluation tasks JUDGE_PIPELINE_ARGS = { - "model": "o3-mini-20250131", + "model": "o3-mini-2025-01-31", "server_type": "openai", "server_address": 
"https://api.openai.com/v1", }