Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions nemo_skills/pipeline/eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -372,6 +372,12 @@ def eval(
None,
help="Number of chunks to split the dataset into. If None, will not chunk the dataset.",
),
gpus_per_node: int = typer.Option(
1,
help="Number of GPUs per node for multi-instance mode. "
"When > 1, launches multiple server instances (one per GPU) within a single job. "
"Requires num_chunks to be a multiple of gpus_per_node.",
),
chunk_ids: str = typer.Option(
None,
help="List of explicit chunk ids to run. Separate with , or .. to specify range. "
Expand Down Expand Up @@ -581,6 +587,7 @@ def eval(
eval_requires_judge=eval_requires_judge,
generation_type=generation_type,
generation_module=generation_module,
gpus_per_node=gpus_per_node,
)

sbatch_kwargs = parse_kwargs(sbatch_kwargs, exclusive=exclusive, qos=qos, time_min=time_min)
Expand All @@ -605,9 +612,14 @@ def eval(
job_server_address,
job_server_command,
job_sandbox_env_overrides,
job_gpus_per_node,
) = job_args
prev_tasks = _task_dependencies

# Add gpus_per_node to server config for multi-instance mode
if job_server_config and job_gpus_per_node > 1:
job_server_config["gpus_per_node"] = job_gpus_per_node

for _ in range(dependent_jobs + 1):
has_tasks = True
new_task = pipeline_utils.add_task(
Expand All @@ -617,6 +629,7 @@ def eval(
log_dir=log_dir,
container=cluster_config["containers"]["nemo-skills"],
cluster_config=cluster_config,
num_tasks=job_gpus_per_node,
partition=partition,
server_config=job_server_config,
with_sandbox=job_needs_sandbox or with_sandbox,
Expand Down
39 changes: 36 additions & 3 deletions nemo_skills/pipeline/utils/eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,11 +267,20 @@ def prepare_eval_commands(
eval_requires_judge,
generation_type=None,
generation_module=None,
gpus_per_node: int = 1,
):
# TODO: there is a bit too much code duplication here and logic is quite dense, should try to refactor

# TODO: should we allow setting num chunks per benchmark when not using groups? Maybe benchmark:rs_num:num_chunks?

# Validate gpus_per_node for multi-instance mode
if gpus_per_node > 1:
if num_chunks is None:
raise ValueError("gpus_per_node > 1 requires num_chunks to be specified")
if num_chunks % gpus_per_node != 0:
raise ValueError(f"num_chunks ({num_chunks}) must be a multiple of gpus_per_node ({gpus_per_node})")
LOG.info(f"Multi-instance mode: {gpus_per_node} GPUs per node, {num_chunks // gpus_per_node} jobs")

if generation_type is not None:
if generation_module is not None:
raise ValueError("Cannot specify both generation_module and generation_type. ")
Expand Down Expand Up @@ -354,7 +363,12 @@ def prepare_eval_commands(
rerun_done=rerun_done,
)
for seed_idx, (seed, benchmark_chunk_ids) in enumerate(benchmark_args.remaining_jobs.items()):
total_evals += len(benchmark_chunk_ids)
# Multi-instance mode: count unique base chunks (each base chunk = 1 job)
if gpus_per_node > 1:
base_chunks = set((cid // gpus_per_node) * gpus_per_node for cid in benchmark_chunk_ids)
total_evals += len(base_chunks)
else:
total_evals += len(benchmark_chunk_ids)

if num_jobs < 0:
# if num_jobs is -1, we run all benchmarks in parallel
Expand All @@ -376,6 +390,7 @@ def prepare_eval_commands(
**server_parameters,
extra_arguments=extra_arguments,
get_random_port=get_random_port,
gpus_per_node=gpus_per_node,
)

cur_eval = 0
Expand All @@ -398,7 +413,18 @@ def prepare_eval_commands(
random_seed=seed,
chunk_id=None,
)
for chunk_id in benchmark_chunk_ids:
# Multi-instance mode: compute which base chunks need to run
# If ANY chunk in a batch is incomplete, we run the entire batch (base_chunk)
if gpus_per_node > 1:
base_chunks_to_run = set()
for cid in benchmark_chunk_ids:
base_chunk = (cid // gpus_per_node) * gpus_per_node
base_chunks_to_run.add(base_chunk)
chunks_to_process = sorted(base_chunks_to_run)
else:
chunks_to_process = benchmark_chunk_ids

for chunk_id in chunks_to_process:
job_benchmarks.add(benchmark)

effective_generation_module = generation_module or benchmark_args.generation_module
Expand Down Expand Up @@ -431,12 +457,17 @@ def prepare_eval_commands(
f"{job_extra_arguments} "
)

# Multi-instance mode: use shell expression for chunk_id
effective_chunk_id = chunk_id
if gpus_per_node > 1:
effective_chunk_id = f"$(({chunk_id} + $SLURM_LOCALID))"

cmd = pipeline_utils.get_generation_cmd(
input_file=benchmark_args.input_file,
output_dir=benchmark_output_dir,
extra_arguments=full_extra_arguments,
random_seed=seed,
chunk_id=chunk_id,
chunk_id=effective_chunk_id,
num_chunks=benchmark_args.num_chunks,
script=generation_module or benchmark_args.generation_module,
requirements=requirements,
Expand Down Expand Up @@ -480,12 +511,14 @@ def prepare_eval_commands(
# a check above guarantees that this is the same for all tasks in a job
generation_task.get_server_command_fn(),
job_sandbox_env_overrides,
gpus_per_node, # client num_tasks for multi-instance mode
)
)
job_server_config, job_server_address, job_extra_arguments = pipeline_utils.configure_client(
**server_parameters,
extra_arguments=extra_arguments,
get_random_port=get_random_port,
gpus_per_node=gpus_per_node,
)
for job_benchmark in job_benchmarks:
benchmarks_dict[job_benchmark].job_ids.append(cur_job_idx)
Expand Down
25 changes: 21 additions & 4 deletions nemo_skills/pipeline/utils/exp.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,15 +127,15 @@ def stdout(self) -> Path:

@property
def srun_stdout(self) -> Path:
return Path(self.folder) / f"{self.srun_prefix}%j_srun.log"
return Path(self.folder) / f"{self.srun_prefix}%j_%t_srun.log"

@property
def stderr(self) -> Path:
return Path(self.folder) / f"{self.sbatch_prefix}%j_sbatch.log"

@property
def srun_stderr(self) -> Path:
return Path(self.folder) / f"{self.srun_prefix}%j_srun.log"
return Path(self.folder) / f"{self.srun_prefix}%j_%t_srun.log"

@property
def ls_term(self) -> str:
Expand All @@ -144,7 +144,7 @@ def ls_term(self) -> str:
The command used to list the files is ls -1 {ls_term} 2> /dev/null
"""
assert self.folder
return os.path.join(self.folder, "*%j_srun.log")
return os.path.join(self.folder, "*%j_*_srun.log")


@dataclass(kw_only=True)
Expand Down Expand Up @@ -314,13 +314,30 @@ def get_executor(
srun_args = [
"--no-container-mount-home",
"--mpi=pmix",
"--wait=10",
# we need to be explicit about this in srun as commands might need to run in parallel
f"--ntasks-per-node={tasks_per_node}",
f"--nodes={num_nodes}",
# NeMo-run should take care of this, but we'll put it here temporarily
f"--container-env={','.join([k.strip() for k in env_vars.keys()])}",
]
# IMPORTANT:
# Slurm's `srun --wait=<sec>` terminates the job step if other tasks are still
# running <sec> seconds after the first task exits.
#
# `nemo_run` adds `--wait=60` by default; for multi-instance runs (e.g., chunked
# evaluation) tasks can finish at very different times (some may exit quickly
# due to `++skip_filled=True`), which causes Slurm to kill still-running tasks.
#
# We override this with a large wait by default for multi-instance mode.
# You can customize via cluster config:
# srun_wait_seconds: <int>
srun_wait_seconds = cluster_config.get("srun_wait_seconds")
if srun_wait_seconds is None and tasks_per_node > 1:
# Use a reasonably large wait (1 hour) so long-running ranks aren't killed just
# because other ranks finished earlier.
srun_wait_seconds = 60 * 60
if srun_wait_seconds is not None:
srun_args.append(f"--wait={int(srun_wait_seconds)}")
if overlap:
srun_args.append("--overlap")
if not cluster_config.get("disable_gpus_per_node", False) and gpus_per_node is not None:
Expand Down
45 changes: 37 additions & 8 deletions nemo_skills/pipeline/utils/generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -495,8 +495,15 @@ def get_generation_cmd(
cmd += "++wait_for_sandbox=true "

if chunk_id is not None:
cmd += f" ++num_chunks={num_chunks} ++chunk_id={chunk_id} "
output_file = get_chunked_rs_filename(output_dir, random_seed=random_seed, chunk_id=chunk_id)
# Check if chunk_id is a shell expression (e.g., "$((0 + $SLURM_LOCALID))")
is_shell_expr = isinstance(chunk_id, str) and "$" in str(chunk_id)

if is_shell_expr:
# For shell expressions, use double quotes so shell expands the expression
cmd += f' ++num_chunks={num_chunks} "++chunk_id={chunk_id}" '
else:
cmd += f" ++num_chunks={num_chunks} ++chunk_id={chunk_id} "

donefiles = []
# we are always waiting for all chunks in num_chunks, no matter chunk_ids in
# the current run (as we don't want to merge partial jobs)
Expand All @@ -505,10 +512,23 @@ def get_generation_cmd(
donefile = f"{filename}.done"
donefiles.append(donefile)

if job_end_cmd:
job_end_cmd += f" && touch {donefiles[chunk_id]} "
if is_shell_expr:
# For shell expression, compute the donefile path at runtime
# Get the base pattern with _chunk_0 and replace with shell expression
base_donefile = donefiles[0] # e.g., /path/output_chunk_0.jsonl.done
# Replace "_chunk_0.jsonl" with "_chunk_$((expr)).jsonl" where expr is expanded by shell
# Extract the expression part (e.g., "0 + $SLURM_LOCALID" from "$((0 + $SLURM_LOCALID))")
donefile_pattern = base_donefile.replace("_chunk_0.jsonl", f"_chunk_{chunk_id}.jsonl")
if job_end_cmd:
job_end_cmd += f' && touch "{donefile_pattern}" '
else:
job_end_cmd = f'touch "{donefile_pattern}" '
Comment on lines +515 to +525
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# Find and examine get_chunked_filename function
rg -n "def get_chunked_filename" --type py -A 10

# Also search for where donefiles is constructed
rg -n "donefiles\s*=" --type py | head -20

Repository: NVIDIA-NeMo/Skills

Length of output: 792


🏁 Script executed:

#!/bin/bash
# Read the specific file and lines mentioned in the review
cat -n nemo_skills/pipeline/utils/generation.py | sed -n '490,540p'

Repository: NVIDIA-NeMo/Skills

Length of output: 2945


🏁 Script executed:

#!/bin/bash
# Check how donefiles is built and what patterns are used
rg -n "donefiles" nemo_skills/pipeline/utils/generation.py -B 2 -A 2 | head -60

Repository: NVIDIA-NeMo/Skills

Length of output: 1663


🏁 Script executed:

#!/bin/bash
# Find get_chunked_rs_filename function
rg -n "def get_chunked_rs_filename" --type py -A 8

Repository: NVIDIA-NeMo/Skills

Length of output: 704


🏁 Script executed:

#!/bin/bash
# Also check imports in generation.py to see where functions come from
head -50 nemo_skills/pipeline/utils/generation.py | grep -E "^import|^from"

Repository: NVIDIA-NeMo/Skills

Length of output: 502


🏁 Script executed:

#!/bin/bash
# Read the full get_chunked_rs_filename implementation
cat -n nemo_skills/pipeline/utils/generation.py | sed -n '152,175p'

Repository: NVIDIA-NeMo/Skills

Length of output: 947


🏁 Script executed:

#!/bin/bash
# Verify the exact pattern by checking what donefiles[0] would look like
# with different random_seed values
cat -n nemo_skills/pipeline/utils/generation.py | sed -n '510,525p'

Repository: NVIDIA-NeMo/Skills

Length of output: 1159


🏁 Script executed:

#!/bin/bash
# Check if there are any tests or usages that show actual donefile paths
rg "donefile" --type py -A 2 -B 2 | head -40

Repository: NVIDIA-NeMo/Skills

Length of output: 3276


🏁 Script executed:

#!/bin/bash
# Let me trace through the exact pattern by simulating what get_chunked_filename produces
cat -n nemo_skills/utils.py | sed -n '383,386p'

Repository: NVIDIA-NeMo/Skills

Length of output: 244


🏁 Script executed:

#!/bin/bash
# Verify if there could be any edge cases where _chunk_0.jsonl appears elsewhere
rg "_chunk_0" --type py

Repository: NVIDIA-NeMo/Skills

Length of output: 694


Fragile string replacement for shell-expression donefile path — no error if pattern is absent.

base_donefile.replace("_chunk_0.jsonl", f"_chunk_{chunk_id}.jsonl") silently returns the original string when the pattern isn't found (e.g., naming convention change) and could replace in a directory component if the output path itself contains _chunk_0.jsonl. The touch would then create a file at the wrong path and the subsequent merge_chunks would stall waiting for a done file that never appears.

Limit the replacement to the basename only and validate the pattern was found:

🛡️ Proposed fix
-        base_donefile = donefiles[0]  # e.g., /path/output_chunk_0.jsonl.done
-        # Replace "_chunk_0.jsonl" with "_chunk_$((expr)).jsonl" where expr is expanded by shell
-        # Extract the expression part (e.g., "0 + $SLURM_LOCALID" from "$((0 + $SLURM_LOCALID))")
-        donefile_pattern = base_donefile.replace("_chunk_0.jsonl", f"_chunk_{chunk_id}.jsonl")
+        import os as _os
+        base_donefile = donefiles[0]
+        _dir, _fname = _os.path.split(base_donefile)
+        _new_fname = _fname.replace("_chunk_0.jsonl", f"_chunk_{chunk_id}.jsonl", 1)
+        if _new_fname == _fname:
+            raise RuntimeError(
+                f"Could not build shell-expression donefile from {base_donefile!r}; "
+                "'_chunk_0.jsonl' pattern not found in filename."
+            )
+        donefile_pattern = _os.path.join(_dir, _new_fname)
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@nemo_skills/pipeline/utils/generation.py` around lines 515 - 525, The current
logic in the is_shell_expr branch does a blind string replace on base_donefile
which can silently fail or modify directory components; change it to operate on
the filename only: extract the directory and basename from donefiles[0] (use the
basename for the replacement of "_chunk_0.jsonl" -> f"_chunk_{chunk_id}.jsonl"),
verify the basename contains the expected "_chunk_0.jsonl" pattern and raise or
log an error if not found, then rejoin dir + modified basename to form
donefile_pattern, and finally append that safe donefile_pattern to job_end_cmd
(symbols: is_shell_expr, base_donefile, donefiles, donefile_pattern, chunk_id,
job_end_cmd).

else:
job_end_cmd = f"touch {donefiles[chunk_id]} "
output_file = get_chunked_rs_filename(output_dir, random_seed=random_seed, chunk_id=chunk_id)
if job_end_cmd:
job_end_cmd += f" && touch {donefiles[chunk_id]} "
else:
job_end_cmd = f"touch {donefiles[chunk_id]} "

# getting file name as if there is no chunking since that's where we want to merge
merged_output_file = get_chunked_rs_filename(output_dir=output_dir, random_seed=random_seed)
Expand Down Expand Up @@ -582,6 +602,7 @@ def configure_client(
get_random_port: bool,
extra_arguments: str,
server_container: str | None = None,
gpus_per_node: int = 1,
):
"""
Utility function to configure a client for the model inference server.
Expand All @@ -597,6 +618,7 @@ def configure_client(
get_random_port: Whether to get a random port for the server.
extra_arguments: Extra arguments to pass to the command.
server_container: Container to use for the server.
gpus_per_node: Number of GPUs per node for multi-instance mode.

Returns:
A tuple containing:
Expand Down Expand Up @@ -625,9 +647,16 @@ def configure_client(
}
if server_container:
server_config["container"] = server_container
extra_arguments = (
f"++server.host=127.0.0.1 ++server.port={server_port} ++server.model={model} {extra_arguments}"
)
if gpus_per_node > 1:
# Multi-instance mode: port is computed at runtime based on SLURM_LOCALID
extra_arguments = (
f"++server.host=127.0.0.1 "
f'"++server.port=$(({server_port} + $SLURM_LOCALID))" ++server.model={model} {extra_arguments}'
)
Comment on lines +650 to +655
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Inconsistent SLURM_LOCALID fallback between server and client port computation.

server.py (line 222) uses ${SLURM_LOCALID:-0} (with fallback to 0), but here the client uses bare $SLURM_LOCALID without a default. If SLURM_LOCALID is unset, the client expression expands to $((<port> + )), which the shell rejects as an arithmetic syntax error (and with set -u active the expansion itself aborts), so the client command will fail while the server succeeds.

🛡️ Proposed fix — use consistent fallback
-                f'"++server.port=$(({server_port} + $SLURM_LOCALID))" ++server.model={model} {extra_arguments}'
+                f'"++server.port=$(({server_port} + ${{SLURM_LOCALID:-0}}))" ++server.model={model} {extra_arguments}'
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
if gpus_per_node > 1:
# Multi-instance mode: port is computed at runtime based on SLURM_LOCALID
extra_arguments = (
f"++server.host=127.0.0.1 "
f'"++server.port=$(({server_port} + $SLURM_LOCALID))" ++server.model={model} {extra_arguments}'
)
if gpus_per_node > 1:
# Multi-instance mode: port is computed at runtime based on SLURM_LOCALID
extra_arguments = (
f"++server.host=127.0.0.1 "
f'"++server.port=$(({server_port} + ${{SLURM_LOCALID:-0}}))" ++server.model={model} {extra_arguments}'
)
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@nemo_skills/pipeline/utils/generation.py` around lines 650 - 655, The
client-side port math in generation.py uses bare $SLURM_LOCALID which can fail
under set -u; update the string building that produces extra_arguments (the
branch where gpus_per_node > 1) to use the same fallback syntax as server.py,
replacing $SLURM_LOCALID with ${SLURM_LOCALID:-0} in the client port expression
(the reference symbols are extra_arguments, server_port and model) so the
computed port is robust when SLURM_LOCALID is unset.

else:
extra_arguments = (
f"++server.host=127.0.0.1 ++server.port={server_port} ++server.model={model} {extra_arguments}"
)
else: # model is hosted elsewhere
server_config = None
extra_arguments = f"++server.base_url={server_address} ++server.model={model} {extra_arguments}"
Expand Down
40 changes: 31 additions & 9 deletions nemo_skills/pipeline/utils/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,9 +120,17 @@ def get_server_command(
server_port: int,
server_args: str = "",
server_entrypoint: str | None = None,
gpus_per_node: int = 1,
):
num_tasks = num_gpus

if gpus_per_node > 1 and server_type != "generic":
raise ValueError(
f"Multi-instance mode (gpus_per_node={gpus_per_node}) is only supported for "
f"server_type='generic', but got server_type='{server_type}'. "
f"Use gpus_per_node=1 or switch to server_type='generic'."
)

# check if the model path is mounted if not vllm, sglang, or trtllm;
# vllm, sglang, trtllm can also pass model name as "model_path" so we need special processing
if server_type not in ["vllm", "sglang", "trtllm", "generic"]:
Expand Down Expand Up @@ -209,15 +217,29 @@ def get_server_command(
elif server_type == "generic":
if not server_entrypoint:
raise ValueError("For 'generic' server type, 'server_entrypoint' must be specified.")
server_start_cmd = (
f"{server_entrypoint} "
f" --model {model_path} "
f" --num_gpus {num_gpus} "
f" --num_nodes {num_nodes} "
f" --port {server_port} "
f" {server_args} "
)
num_tasks = 1
if gpus_per_node > 1:
# Multi-instance mode: each SLURM task gets its own GPU and port
server_start_cmd = (
f"echo 'SLURM_LOCALID='$SLURM_LOCALID' SLURM_PROCID='$SLURM_PROCID && "
f"export CUDA_VISIBLE_DEVICES=${{SLURM_LOCALID:-0}} && "
f"{server_entrypoint} "
f" --model {model_path} "
f" --num_gpus 1 "
f" --num_nodes 1 "
f" --port $(({server_port} + ${{SLURM_LOCALID:-0}})) "
f" {server_args} "
)
num_tasks = gpus_per_node
else:
server_start_cmd = (
f"{server_entrypoint} "
f" --model {model_path} "
f" --num_gpus {num_gpus} "
f" --num_nodes {num_nodes} "
f" --port {server_port} "
f" {server_args} "
)
num_tasks = 1
Comment on lines +220 to +242
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# Check the current file structure and the changes
cd /tmp/repo 2>/dev/null || cd . 

# 1. Find the server.py file and verify it exists
find . -name "server.py" -path "*/pipeline/utils/*" 2>/dev/null | head -5

# 2. Check git diff if in a repo
if git rev-parse --git-dir > /dev/null 2>&1; then
  echo "=== Git status ==="
  git status --short
  echo "=== Checking git diff for server.py ==="
  git diff HEAD -- nemo_skills/pipeline/utils/server.py 2>/dev/null | head -150
else
  echo "Not in a git repo"
fi

Repository: NVIDIA-NeMo/Skills

Length of output: 172


🏁 Script executed:

#!/bin/bash
# 3. Search for gpus_per_node in the codebase to see if it's validated
echo "=== Searching for gpus_per_node usage ==="
rg "gpus_per_node" --type py -B 3 -A 3

# 4. Search for any assertions or validations related to GPU counts
echo "=== Searching for GPU count validations ==="
rg "(gpus_per_node|num_gpus)" --type py | rg -E "(assert|raise|if.*<=|if.*>=|if.*<|if.*>)" -B 1 -A 1

Repository: NVIDIA-NeMo/Skills

Length of output: 21948


🏁 Script executed:

#!/bin/bash
# 5. Check the server.py file around lines 220-242 to see current state
echo "=== Reading server.py around lines 220-242 ==="
if [ -f "nemo_skills/pipeline/utils/server.py" ]; then
  sed -n '210,250p' nemo_skills/pipeline/utils/server.py | cat -n
else
  echo "File not found, searching..."
  find . -name "server.py" -path "*/pipeline/utils/*" -exec sed -n '210,250p' {} \; 2>/dev/null
fi

Repository: NVIDIA-NeMo/Skills

Length of output: 2001


🏁 Script executed:

#!/bin/bash
# 6. Check how num_tasks is used after being set
echo "=== Checking num_tasks usage in server.py ==="
rg "num_tasks" nemo_skills/pipeline/utils/server.py -B 2 -A 2

# 7. Check exp.py for num_tasks usage
echo "=== Checking num_tasks in exp.py ==="
rg "num_tasks" nemo_skills/pipeline/utils/exp.py -B 2 -A 2 2>/dev/null || echo "exp.py patterns not found"

# 8. Search broader for num_tasks usage
echo "=== Broader num_tasks search ==="
rg "num_tasks" --type py -l 2>/dev/null | head -10

Repository: NVIDIA-NeMo/Skills

Length of output: 2518


🏁 Script executed:

#!/bin/bash
# 9. Look at the full function context around lines 220-242 to understand what this is
echo "=== Full context of the function ==="
if [ -f "nemo_skills/pipeline/utils/server.py" ]; then
  sed -n '100,260p' nemo_skills/pipeline/utils/server.py | cat -n
fi

Repository: NVIDIA-NeMo/Skills

Length of output: 7162


Add validation for gpus_per_node <= num_gpus in multi-instance mode.

The num_tasks = 1 assignment for single-instance generic (line 242) is intentional and consistent with vllm, sglang, and trtllm implementations—no issue here.

However, the multi-instance path (lines 220–232) lacks validation that gpus_per_node ≤ num_gpus. If a user passes gpus_per_node=8 but num_gpus=4, the code launches 8 SLURM tasks with CUDA_VISIBLE_DEVICES=${SLURM_LOCALID:-0}, where SLURM_LOCALID ranges 0–7. Tasks with IDs 4–7 will fail at runtime trying to access non-existent GPUs. Add an explicit guard to fail fast with a clear error message:

if gpus_per_node > num_gpus:
    raise ValueError(
        f"gpus_per_node ({gpus_per_node}) cannot exceed num_gpus ({num_gpus})"
    )

Place this check immediately after the gpus_per_node > 1 and server_type != "generic" validation block.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@nemo_skills/pipeline/utils/server.py` around lines 220 - 242, Add a fast-fail
validation immediately after the gpus_per_node > 1 and server_type != "generic"
guard to ensure gpus_per_node does not exceed num_gpus: check the variables
gpus_per_node and num_gpus and raise a ValueError with a clear message like
"gpus_per_node (X) cannot exceed num_gpus (Y)" if gpus_per_node > num_gpus, so
you avoid launching extra SLURM tasks that bind to non-existent GPUs (this
relates to the multi-instance path that builds server_start_cmd using
server_entrypoint and CUDA_VISIBLE_DEVICES).

else:
raise ValueError(f"Server type '{server_type}' not supported for model inference.")

Expand Down