Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
26f94c1
Remove incorrect presence-penalty setting (#1259)
Kipok Feb 20, 2026
305a3a8
Add reference to internal benchmarks repo (#1261)
Kipok Feb 20, 2026
d72cf82
Rename custom -> external benchmarks (#1262)
Kipok Feb 20, 2026
598c841
Add aime26 (#1256)
bzantium Feb 20, 2026
0857d88
fix deps (#1258)
yqwangustc Feb 20, 2026
1d6035c
feat: add custom judge type support for external repo integration
peri044 Feb 20, 2026
1f40be5
chore: Address review comments
peri044 Feb 21, 2026
9c738a2
Remove deprecated dataset group (#1263)
Kipok Feb 20, 2026
57761a9
chore: Fix argument
peri044 Feb 21, 2026
9604fd5
chore: address review comments
peri044 Feb 21, 2026
285cbbc
chore: address review comments
peri044 Feb 24, 2026
25633a0
chore: rename judge_step to judge_path
peri044 Feb 24, 2026
39f25ce
Fix incorrect prompt tokens count due to HF api update (#1264)
Kipok Feb 20, 2026
f90490e
Fix no_answer metric overcounting in _compute_pass_at_k (#1245)
sgunasekar Feb 21, 2026
9c6800d
Add CritPt benchmark (#1200)
jiacheng-xu Feb 21, 2026
d9b1649
Add DSBench-DA evaluation (#1254)
sgunasekar Feb 22, 2026
b7b7dbe
Numb3rs ds addition (#1174)
Jorjeous Feb 23, 2026
af7454a
style: fix ruff format for judge_path line
peri044 Feb 24, 2026
4111e16
Small renaming
Kipok Feb 24, 2026
a1fdfe2
Update tests with new api
Kipok Feb 24, 2026
d27e339
Fix judge assignment
Kipok Feb 24, 2026
31f6ff7
Merge branch 'main' into peri044/external_judge
Kipok Feb 24, 2026
b397ca8
Year change
Kipok Feb 24, 2026
6984468
Merge branch 'main' into peri044/external_judge
Kipok Feb 24, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/evaluation/multilingual.md
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,6 @@ By default, we compute [BLEU score](https://github.com/mjpost/sacrebleu) to eval
```bash
ns eval \
... \
--judge_type=comet \
--judge_step_fn="nemo_skills.pipeline.judges.comet_judge::create_judge_tasks" \
--judge_model=[path_to_comet_checkpoint]
```
2 changes: 1 addition & 1 deletion nemo_skills/dataset/mmau-pro/closed_form/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,5 +18,5 @@

# NVEmbed judge configuration for closed-form evaluation
JUDGE_PIPELINE_ARGS = {
"judge_type": "nvembed",
"judge_step_fn": "nemo_skills.pipeline.judges.nvembed_judge::create_judge_tasks",
}
2 changes: 1 addition & 1 deletion nemo_skills/pipeline/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down
230 changes: 27 additions & 203 deletions nemo_skills/pipeline/eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,177 +41,6 @@ class SingleNodeMode(str, enum.Enum):
parallel = "parallel"


def _create_comet_judge_tasks(
    exp,
    expname,
    benchmark,
    judge_pipeline_args,
    rerun_done,
    log_dir,
    server_parameters,
    cluster_config,
    judge_server_gpus,
    judge_server_nodes,
    partition,
    run_after,
    reuse_code_exp,
    reuse_code,
    dependent_tasks,
    all_tasks,
    _task_dependencies,
    installation_command,
    skip_hf_home_check,
    sbatch_kwargs,
):
    """Schedule the xCOMET-XXL judge job for ``benchmark``.

    Reads ``output_dir``, ``input_file`` / ``input_dir``, ``judge_model`` and
    ``num_random_seeds`` out of ``judge_pipeline_args``, skips scheduling when
    every expected output and ``.done`` marker already exists, and otherwise
    submits a single GPU task that runs the comet evaluator script.

    Returns:
        A one-element list with the created task, or an empty list when there
        is nothing left to judge.
    """
    from nemo_skills.pipeline.utils.generation import get_remaining_jobs

    output_dir_path = judge_pipeline_args.get("output_dir")
    input_file = judge_pipeline_args.get("input_file")
    comet_model_path = judge_pipeline_args.get("judge_model")

    # With an explicit input file there is exactly one judging job; otherwise
    # one per random seed of the main generation.
    if input_file is not None:
        random_seeds = [None]
    else:
        num_seeds = judge_pipeline_args.get("num_random_seeds", 1)
        random_seeds = list(range(num_seeds))

    remaining_jobs = get_remaining_jobs(
        cluster_config=cluster_config,
        output_dir=output_dir_path,
        random_seeds=random_seeds,
        chunk_ids=[None],  # the judge task is never chunked
        rerun_done=rerun_done,
    )

    # Nothing to do if every seed already has its outputs / .done marker.
    if not any(remaining_jobs.values()):
        LOG.info(f"Skipping Comet judge for {benchmark} - all output files and .done markers exist")
        return []

    # Assemble the CLI for the xCOMET-XXL judge script.
    args = f"--output-dir {output_dir_path} --comet-model-path {comet_model_path}"
    if input_file is not None:
        args += f" --input-file {input_file}"
    else:
        args += f" --input-dir {judge_pipeline_args.get('input_dir')}"
        args += f" --num-seeds {num_seeds}"

    run_cmd = (
        "pip install unbabel-comet && "
        f"python3 -I /nemo_run/code/nemo_skills/evaluation/evaluator/comet.py {args}"
    )

    # Comet needs GPUs, so default to a single-GPU / single-node allocation.
    task = pipeline_utils.add_task(
        exp,
        cmd=run_cmd,
        task_name=f"{expname}-{benchmark}-comet-judge",
        log_dir=log_dir + "/judge",
        container=cluster_config["containers"]["vllm"],
        cluster_config=cluster_config,
        num_gpus=judge_server_gpus or 1,
        num_nodes=judge_server_nodes or 1,
        partition=partition,
        run_after=run_after,
        reuse_code_exp=reuse_code_exp,
        reuse_code=reuse_code,
        task_dependencies=(
            dependent_tasks if cluster_config["executor"] == "slurm" else all_tasks + _task_dependencies
        ),
        installation_command=installation_command,
        skip_hf_home_check=skip_hf_home_check,
        sbatch_kwargs=sbatch_kwargs,
    )
    return [task]


def _create_nvembed_judge_tasks(
    exp,
    expname,
    benchmark,
    judge_pipeline_args,
    rerun_done,
    log_dir,
    server_parameters,
    cluster_config,
    judge_server_gpus,
    judge_server_nodes,
    partition,
    run_after,
    reuse_code_exp,
    reuse_code,
    dependent_tasks,
    all_tasks,
    _task_dependencies,
    installation_command,
    skip_hf_home_check,
    sbatch_kwargs,
):
    """Schedule the NVEmbed judge job for ``benchmark``.

    Mirrors the Comet judge scheduling: inspects ``judge_pipeline_args`` for
    ``output_dir``, ``input_file`` / ``input_dir`` and ``num_random_seeds``,
    returns early when all outputs and ``.done`` markers already exist, and
    otherwise submits one GPU task running the NVEmbed judge script.

    Returns:
        A one-element list with the created task, or an empty list when there
        is nothing left to judge.
    """
    from nemo_skills.pipeline.utils.generation import get_remaining_jobs

    output_dir_path = judge_pipeline_args.get("output_dir")
    input_file = judge_pipeline_args.get("input_file")

    # One judging job per random seed unless a single input file is given.
    if input_file is not None:
        random_seeds = [None]
    else:
        num_seeds = judge_pipeline_args.get("num_random_seeds", 1)
        random_seeds = list(range(num_seeds))

    remaining_jobs = get_remaining_jobs(
        cluster_config=cluster_config,
        output_dir=output_dir_path,
        random_seeds=random_seeds,
        chunk_ids=[None],  # the judge task is never chunked
        rerun_done=rerun_done,
    )

    # Nothing to do if every seed already has its outputs / .done marker.
    if not any(remaining_jobs.values()):
        LOG.info(f"Skipping NVEmbed judge for {benchmark} - all output files and .done markers exist")
        return []

    # Assemble the CLI for the NVEmbed judge script.
    args = f"--output-dir {output_dir_path}"
    if input_file is not None:
        args += f" --input-file {input_file}"
    else:
        args += f" --input-dir {judge_pipeline_args.get('input_dir')}"
        args += f" --num-seeds {num_seeds}"

    # Unless a full rerun was requested, let the script skip finished outputs.
    if not rerun_done:
        args += " --skip-existing"

    run_cmd = f"python3 -I /nemo_run/code/nemo_skills/evaluation/evaluator/nvembed_judge.py {args}"

    # NVEmbed needs GPUs, so default to a single-GPU / single-node allocation.
    task = pipeline_utils.add_task(
        exp,
        cmd=run_cmd,
        task_name=f"{expname}-{benchmark}-nvembed-judge",
        log_dir=log_dir + "/judge",
        container=cluster_config["containers"]["vllm"],
        cluster_config=cluster_config,
        num_gpus=judge_server_gpus or 1,
        num_nodes=judge_server_nodes or 1,
        partition=partition,
        run_after=run_after,
        reuse_code_exp=reuse_code_exp,
        reuse_code=reuse_code,
        task_dependencies=(
            dependent_tasks if cluster_config["executor"] == "slurm" else all_tasks + _task_dependencies
        ),
        installation_command=installation_command,
        skip_hf_home_check=skip_hf_home_check,
        sbatch_kwargs=sbatch_kwargs,
    )
    return [task]


def _create_llm_judge_tasks(
ctx,
expname,
Expand Down Expand Up @@ -325,7 +154,11 @@ def eval(
help="Path to the entrypoint of the server. "
"If not specified, will use the default entrypoint for the server type.",
),
judge_type: str = typer.Option("llm", help="Type of judge to use: 'llm' (default) or 'nvembed'"),
judge_step_fn: str = typer.Option(
None,
help="Path to the judge step creator function to use for the judge (locate() convention). "
"Eg: nemo_skills.pipeline.judges.nvembed_judge::create_judge_tasks. Can also accept callable directly.",
),
judge_model: str = typer.Option(None, help="Path to the model to be used as a judge (if applicable)"),
judge_server_address: str = typer.Option(None, help="Address of the server hosting the judge model"),
judge_server_type: pipeline_utils.SupportedServers = typer.Option(
Expand Down Expand Up @@ -519,7 +352,7 @@ def eval(
"generation_type": judge_generation_type,
"generation_module": judge_generation_module,
}
eval_requires_judge = any(param_value for param_value in cli_judge_pipeline_args.values()) or judge_type != "llm"
eval_requires_judge = any(param_value for param_value in cli_judge_pipeline_args.values()) or judge_step_fn

# Prepare cluster config and mount paths
cluster_config = pipeline_utils.get_cluster_config(cluster, config_dir)
Expand Down Expand Up @@ -643,43 +476,34 @@ def eval(
benchmark_args.eval_subfolder = benchmark_args.eval_subfolder[4:]
judge_pipeline_args["output_dir"] = str(Path(output_dir) / benchmark_args.eval_subfolder)

# Check for per-benchmark judge_type, fall back to global judge_type
benchmark_judge_type = judge_pipeline_args.pop("judge_type", judge_type)
# judge_step_fn is a :: path to the judge creator function (locate() convention).
# Could be set directly in JUDGE_PIPELINE_ARGS; falls back to None for LLM judge.
judge_step_fn = judge_pipeline_args.pop("judge_step_fn", judge_step_fn)

# Create judge tasks based on judge type
if benchmark_judge_type == "nvembed":
judge_tasks = _create_nvembed_judge_tasks(
exp=exp,
expname=expname,
benchmark=benchmark,
judge_pipeline_args=judge_pipeline_args,
rerun_done=rerun_done,
log_dir=log_dir,
server_parameters=server_parameters,
cluster_config=cluster_config,
judge_server_gpus=judge_server_gpus,
judge_server_nodes=judge_server_nodes,
partition=partition,
run_after=run_after,
reuse_code_exp=reuse_code_exp,
reuse_code=reuse_code,
dependent_tasks=dependent_tasks,
all_tasks=all_tasks,
_task_dependencies=_task_dependencies,
installation_command=installation_command,
skip_hf_home_check=skip_hf_home_check,
sbatch_kwargs=sbatch_kwargs,
)
elif benchmark_judge_type == "comet":
judge_pipeline_args["judge_model"] = judge_model
judge_tasks = _create_comet_judge_tasks(
# TODO: we should rework the interface here to have consistent parameters between main llm and custom
# judge creation steps. E.g. things like judge_model assignment below shouldn't be necessary

if judge_step_fn:
has_tasks = True
if not callable(judge_step_fn):
Comment on lines +486 to +488
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

has_tasks = True is set before confirming the judge creator returns tasks.

If judge_step_fn() returns [] (e.g., all outputs already exist), has_tasks is still True. In an edge case where this is the only scheduled work, run_exp would be called on an experiment with no tasks. Consider moving has_tasks = True inside the if judge_tasks: guard at line 554, consistent with how _generate returning None is handled.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@nemo_skills/pipeline/eval.py` around lines 486 - 488, The flag has_tasks is
being set unconditionally when a judge_step_fn exists, which can be incorrect if
judge_step_fn() returns an empty list; change the logic in the block handling
judge_step_fn so that you only set has_tasks = True after calling judge_step_fn
and confirming it returned a non-empty list (e.g., inside the if judge_tasks:
guard that processes the returned tasks), mirroring the existing pattern used
when _generate returns None; ensure run_exp is only triggered when has_tasks is
true after this check.

# Use locate() to dynamically load judge creator function
from nemo_skills.dataset.utils import locate

judge_step_fn = locate(judge_step_fn)
Comment on lines +479 to +492
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

Bug: judge_step_fn leaks across benchmark iterations.

On line 481, judge_step_fn is reassigned from judge_pipeline_args.pop("judge_step_fn", judge_step_fn). Since this is inside the for benchmark ... loop (line 457), the fallback value on subsequent iterations is whatever was set by the previous benchmark — not the original CLI value.

Example: If benchmark A's JUDGE_PIPELINE_ARGS defines judge_step_fn = "...nvembed_judge::create_judge_tasks" but benchmark B does not, benchmark B will incorrectly inherit benchmark A's judge_step_fn instead of falling back to the CLI default (likely None → LLM judge).

Save the original CLI value before the loop and use it as the fallback:

Proposed fix
     all_tasks = []
     if _task_dependencies is None:
         _task_dependencies = []
+    cli_judge_step_fn = judge_step_fn
     with pipeline_utils.get_exp(expname, cluster_config, _reuse_exp) as exp:
         # scheduling main eval jobs
         ...
         # scheduling judge jobs if needed
         for idx, (benchmark, benchmark_args) in enumerate(benchmarks_dict.items()):
             ...
-            judge_step_fn = judge_pipeline_args.pop("judge_step_fn", judge_step_fn)
+            benchmark_judge_step_fn = judge_pipeline_args.pop("judge_step_fn", cli_judge_step_fn)
             ...
-            if judge_step_fn:
+            if benchmark_judge_step_fn:
                 has_tasks = True
-                if not callable(judge_step_fn):
+                if not callable(benchmark_judge_step_fn):
                     from nemo_skills.dataset.utils import locate
-                    judge_step_fn = locate(judge_step_fn)
+                    benchmark_judge_step_fn = locate(benchmark_judge_step_fn)
                 ...
-                judge_tasks = judge_step_fn(
+                judge_tasks = benchmark_judge_step_fn(
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@nemo_skills/pipeline/eval.py` around lines 479 - 492, The bug is that
judge_step_fn is mutated inside the benchmark loop by using
judge_pipeline_args.pop("judge_step_fn", judge_step_fn), causing the previous
iteration's value to be used as the fallback; to fix, capture the original
CLI/default value (e.g., orig_judge_step_fn = judge_step_fn) before entering the
loop and inside the loop use a local variable or call pop with that original as
the default (judge_pipeline_args.pop("judge_step_fn", orig_judge_step_fn)), and
ensure you don't assign back to the outer-scope judge_step_fn so the CLI default
isn't overwritten across iterations; keep the locate() dynamic-loading logic
(from nemo_skills.dataset.utils import locate) but apply it to the loop-local
variable only.


# Pass judge_model through so judge implementations can access it if needed (e.g. comet)
if judge_model:
judge_pipeline_args.setdefault("judge_model", judge_model)

# Call with standardized parameters
judge_tasks = judge_step_fn(
exp=exp,
expname=expname,
benchmark=benchmark,
judge_pipeline_args=judge_pipeline_args,
rerun_done=rerun_done,
log_dir=log_dir,
server_parameters=server_parameters,
output_dir=output_dir,
cluster_config=cluster_config,
judge_server_gpus=judge_server_gpus,
judge_server_nodes=judge_server_nodes,
Expand Down
15 changes: 15 additions & 0 deletions nemo_skills/pipeline/judges/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Judge implementations for evaluation pipeline."""
Loading