From b1c206679ad704810ea116b3427e3fca6a48baf1 Mon Sep 17 00:00:00 2001 From: Igor Gitman Date: Tue, 3 Feb 2026 11:52:58 -0800 Subject: [PATCH 01/13] Add an option to specify non-default slurm account Signed-off-by: Igor Gitman --- nemo_skills/pipeline/convert.py | 2 ++ nemo_skills/pipeline/eval.py | 11 +++++++++++ nemo_skills/pipeline/generate.py | 4 ++++ nemo_skills/pipeline/nemo_evaluator.py | 21 +++++++++++++++++++-- nemo_skills/pipeline/run_cmd.py | 2 ++ nemo_skills/pipeline/start_server.py | 2 ++ nemo_skills/pipeline/utils/declarative.py | 2 ++ nemo_skills/pipeline/utils/exp.py | 11 ++++++++++- 8 files changed, 52 insertions(+), 3 deletions(-) diff --git a/nemo_skills/pipeline/convert.py b/nemo_skills/pipeline/convert.py index 97c7e52597..68564b39f0 100644 --- a/nemo_skills/pipeline/convert.py +++ b/nemo_skills/pipeline/convert.py @@ -180,6 +180,7 @@ def convert( partition: str = typer.Option( None, help="Can specify if need interactive jobs or a specific non-default partition" ), + account: str = typer.Option(None, help="Can specify a non-default Slurm account"), qos: str = typer.Option(None, help="Specify Slurm QoS, e.g. 
to request interactive nodes"), time_min: str = typer.Option(None, help="If specified, will use as a time-min slurm parameter"), mount_paths: str = typer.Option(None, help="Comma separated list of paths to mount on the remote machine"), @@ -326,6 +327,7 @@ def convert( num_tasks=1, cluster_config=cluster_config, partition=partition, + account=account, run_after=run_after, reuse_code=reuse_code, reuse_code_exp=reuse_code_exp, diff --git a/nemo_skills/pipeline/eval.py b/nemo_skills/pipeline/eval.py index fac2f884bd..28c68f87f3 100644 --- a/nemo_skills/pipeline/eval.py +++ b/nemo_skills/pipeline/eval.py @@ -54,6 +54,7 @@ def _create_comet_judge_tasks( judge_server_gpus, judge_server_nodes, partition, + account, run_after, reuse_code_exp, reuse_code, @@ -113,6 +114,7 @@ def _create_comet_judge_tasks( num_gpus=judge_server_gpus or 1, num_nodes=judge_server_nodes or 1, partition=partition, + account=account, run_after=run_after, reuse_code_exp=reuse_code_exp, reuse_code=reuse_code, @@ -138,6 +140,7 @@ def _create_nvembed_judge_tasks( judge_server_gpus, judge_server_nodes, partition, + account, run_after, reuse_code_exp, reuse_code, @@ -200,6 +203,7 @@ def _create_nvembed_judge_tasks( num_gpus=judge_server_gpus or 1, num_nodes=judge_server_nodes or 1, partition=partition, + account=account, run_after=run_after, reuse_code_exp=reuse_code_exp, reuse_code=reuse_code, @@ -227,6 +231,7 @@ def _create_llm_judge_tasks( cluster, config_dir, partition, + account, with_sandbox, keep_mounts_for_sandbox, run_after, @@ -267,6 +272,7 @@ def _create_llm_judge_tasks( cluster=cluster, config_dir=config_dir, partition=partition, + account=account, with_sandbox=with_sandbox, keep_mounts_for_sandbox=keep_mounts_for_sandbox, run_after=run_after, @@ -378,6 +384,7 @@ def eval( "Can provide a list directly when using through Python", ), partition: str = typer.Option(None, help="Cluster partition to use"), + account: str = typer.Option(None, help="Can specify a non-default Slurm account"), qos: 
str = typer.Option(None, help="Specify Slurm QoS, e.g. to request interactive nodes"), time_min: str = typer.Option(None, help="If specified, will use as a time-min slurm parameter"), mount_paths: str = typer.Option(None, help="Comma separated list of paths to mount on the remote machine"), @@ -614,6 +621,7 @@ def eval( container=cluster_config["containers"]["nemo-skills"], cluster_config=cluster_config, partition=partition, + account=account, server_config=job_server_config, with_sandbox=job_needs_sandbox or with_sandbox, keep_mounts_for_sandbox=job_needs_sandbox_to_keep_mounts or keep_mounts_for_sandbox, @@ -675,6 +683,7 @@ def eval( judge_server_gpus=judge_server_gpus, judge_server_nodes=judge_server_nodes, partition=partition, + account=account, run_after=run_after, reuse_code_exp=reuse_code_exp, reuse_code=reuse_code, @@ -699,6 +708,7 @@ def eval( judge_server_gpus=judge_server_gpus, judge_server_nodes=judge_server_nodes, partition=partition, + account=account, run_after=run_after, reuse_code_exp=reuse_code_exp, reuse_code=reuse_code, @@ -726,6 +736,7 @@ def eval( cluster=cluster, config_dir=config_dir, partition=partition, + account=account, with_sandbox=with_sandbox, keep_mounts_for_sandbox=keep_mounts_for_sandbox, run_after=run_after, diff --git a/nemo_skills/pipeline/generate.py b/nemo_skills/pipeline/generate.py index a1b96f556a..d908fabfcc 100644 --- a/nemo_skills/pipeline/generate.py +++ b/nemo_skills/pipeline/generate.py @@ -55,6 +55,7 @@ def _create_job_unified( installation_command: Optional[str], with_sandbox: bool, partition: Optional[str], + account: Optional[str], keep_mounts_for_sandbox: bool, task_name: str, log_dir: str, @@ -191,6 +192,7 @@ def _create_job_unified( commands=components, hardware=HardwareConfig( partition=partition, + account=account, num_gpus=group_gpus, num_nodes=group_nodes, num_tasks=group_tasks, @@ -297,6 +299,7 @@ def generate( partition: str = typer.Option( None, help="Can specify if need interactive jobs or a specific 
non-default partition" ), + account: str = typer.Option(None, help="Can specify a non-default Slurm account"), qos: str = typer.Option(None, help="Specify Slurm QoS, e.g. to request interactive nodes"), time_min: str = typer.Option(None, help="If specified, will use as a time-min slurm parameter"), run_after: List[str] = typer.Option( @@ -589,6 +592,7 @@ def convert_server_type_to_string(server_type): installation_command=installation_command, with_sandbox=with_sandbox, partition=partition, + account=account, keep_mounts_for_sandbox=keep_mounts_for_sandbox, task_name=task_name, log_dir=log_dir, diff --git a/nemo_skills/pipeline/nemo_evaluator.py b/nemo_skills/pipeline/nemo_evaluator.py index 39838737ed..f3e9057c0e 100644 --- a/nemo_skills/pipeline/nemo_evaluator.py +++ b/nemo_skills/pipeline/nemo_evaluator.py @@ -125,6 +125,7 @@ def nemo_evaluator( job_gpus: int = typer.Option(0, help="GPUs to allocate for the evaluator client when no servers are hosted"), job_nodes: int = typer.Option(1, help="Nodes to allocate for the evaluator job"), partition: str = typer.Option(None, help="Cluster partition to use"), + account: str = typer.Option(None, help="Can specify a non-default Slurm account"), qos: str = typer.Option(None, help="Slurm QoS"), mount_paths: str = typer.Option(None, help="Comma separated list of paths to mount on the remote machine"), log_dir: str = typer.Option(None, help="Custom location for logs"), @@ -325,6 +326,7 @@ def nemo_evaluator( job_nodes=job_nodes, cluster_config=cluster_config, partition=partition, + account=account, qos=qos, exclusive=exclusive, ) @@ -346,6 +348,7 @@ def nemo_evaluator( commands=[main_server_cmd, client_cmd], hardware=_hardware_for_group( task_ctx.partition, + task_ctx.account, task_ctx.server_gpus or None, task_ctx.server_nodes or 1, task_ctx.qos, @@ -358,6 +361,7 @@ def nemo_evaluator( commands=[judge_server_cmd], hardware=_hardware_for_group( task_ctx.partition, + task_ctx.account, task_ctx.judge_server_gpus or None, 
task_ctx.judge_server_nodes or 1, task_ctx.qos, @@ -393,7 +397,12 @@ def nemo_evaluator( CommandGroup( commands=sg_cmds, hardware=_hardware_for_group( - task_ctx.partition, group_num_gpus, group_num_nodes, task_ctx.qos, task_ctx.exclusive + task_ctx.partition, + task_ctx.account, + group_num_gpus, + group_num_nodes, + task_ctx.qos, + task_ctx.exclusive, ), name=f"{task_ctx.expname}-{task_ctx.idx}", log_dir=log_dir, @@ -543,17 +552,24 @@ class _TaskCreationContext: job_nodes: int cluster_config: Dict partition: Optional[str] + account: Optional[str] qos: Optional[str] exclusive: bool def _hardware_for_group( - partition: Optional[str], num_gpus: Optional[int], num_nodes: int, qos: Optional[str], exclusive: bool + partition: Optional[str], + account: Optional[str], + num_gpus: Optional[int], + num_nodes: int, + qos: Optional[str], + exclusive: bool, ) -> HardwareConfig: """Create HardwareConfig for a CommandGroup. Args: partition: SLURM partition name + account: SLURM account name num_gpus: Number of GPUs (None means no GPU allocation) num_nodes: Number of nodes qos: SLURM QoS setting @@ -564,6 +580,7 @@ def _hardware_for_group( """ return HardwareConfig( partition=partition, + account=account, num_gpus=num_gpus, num_nodes=num_nodes, sbatch_kwargs={ diff --git a/nemo_skills/pipeline/run_cmd.py b/nemo_skills/pipeline/run_cmd.py index 5706d516e6..9e383ac9ab 100644 --- a/nemo_skills/pipeline/run_cmd.py +++ b/nemo_skills/pipeline/run_cmd.py @@ -58,6 +58,7 @@ def run_cmd( partition: str = typer.Option( None, help="Can specify if need interactive jobs or a specific non-default partition" ), + account: str = typer.Option(None, help="Can specify a non-default Slurm account"), qos: str = typer.Option(None, help="Specify Slurm QoS, e.g. 
to request interactive nodes"), time_min: str = typer.Option(None, help="If specified, will use as a time-min slurm parameter"), num_gpus: int | None = typer.Option(None, help="Number of GPUs per node to use"), @@ -197,6 +198,7 @@ def run_cmd( container=containers, cluster_config=cluster_config, partition=partition, + account=account, server_config=server_config, with_sandbox=with_sandbox, keep_mounts_for_sandbox=keep_mounts_for_sandbox, diff --git a/nemo_skills/pipeline/start_server.py b/nemo_skills/pipeline/start_server.py index 71eae6bcb0..c875e0d7fe 100644 --- a/nemo_skills/pipeline/start_server.py +++ b/nemo_skills/pipeline/start_server.py @@ -128,6 +128,7 @@ def start_server( ), server_container: str = typer.Option(None, help="Override container image for the hosted server"), partition: str = typer.Option(None, help="Cluster partition to use"), + account: str = typer.Option(None, help="Can specify a non-default Slurm account"), qos: str = typer.Option(None, help="Specify Slurm QoS, e.g. 
to request interactive nodes"), time_min: str = typer.Option(None, help="If specified, will use as a time-min slurm parameter"), mount_paths: str = typer.Option(None, help="Comma separated list of paths to mount on the remote machine"), @@ -203,6 +204,7 @@ def start_server( container=cluster_config["containers"]["nemo-skills"], cluster_config=cluster_config, partition=partition, + account=account, server_config=server_config, with_sandbox=with_sandbox, keep_mounts_for_sandbox=keep_mounts_for_sandbox, diff --git a/nemo_skills/pipeline/utils/declarative.py b/nemo_skills/pipeline/utils/declarative.py index 7029dcc638..f47067ee99 100644 --- a/nemo_skills/pipeline/utils/declarative.py +++ b/nemo_skills/pipeline/utils/declarative.py @@ -264,6 +264,7 @@ class HardwareConfig: """Hardware configuration for a group of tasks.""" partition: Optional[str] = None + account: Optional[str] = None num_gpus: Optional[int] = None num_nodes: Optional[int] = None num_tasks: Optional[int] = 1 @@ -585,6 +586,7 @@ def _create_executor( log_dir=log_dir, log_prefix=exec_config["log_prefix"], partition=hardware.partition if hardware else None, + account=hardware.account if hardware else None, heterogeneous=heterogeneous, het_group=het_group, total_het_groups=total_het_groups, diff --git a/nemo_skills/pipeline/utils/exp.py b/nemo_skills/pipeline/utils/exp.py index 3bce2eb864..c22700652c 100644 --- a/nemo_skills/pipeline/utils/exp.py +++ b/nemo_skills/pipeline/utils/exp.py @@ -168,6 +168,7 @@ def get_executor( log_prefix: str = "main", mounts=None, partition=None, + account=None, dependencies=None, extra_package_dirs: tuple[str] | None = None, heterogeneous=False, @@ -210,6 +211,7 @@ def get_executor( taken from `cluster_config`. partition: SLURM partition override. If omitted, inferred from `gpus_per_node` and `cluster_config`. + account: SLURM account override. If omitted, uses `cluster_config["account"]`. dependencies: SLURM job handles to depend on. 
The dependency type is taken from `cluster_config['dependency_type']` (default: "afterany"). extra_package_dirs: Additional directories to package with the code for remote @@ -328,9 +330,12 @@ def get_executor( dependency_type = cluster_config.get("dependency_type", "afterany") job_details_class = CustomJobDetailsRay if with_ray else CustomJobDetails + # Resolve account with fallback to cluster_config + account = account or cluster_config.get("account") + # Build executor parameters as a dictionary to avoid duplicate parameters executor_params = { - "account": cluster_config["account"], + "account": account, "partition": partition, "nodes": num_nodes, "ntasks_per_node": tasks_per_node, @@ -429,6 +434,7 @@ def add_task( num_nodes=1, log_dir=None, partition=None, + account=None, with_sandbox=False, keep_mounts_for_sandbox=False, sandbox_port: int | None = None, @@ -538,6 +544,7 @@ def add_task( tasks_per_node=num_server_tasks, gpus_per_node=server_config["num_gpus"], partition=partition, + account=account, dependencies=dependencies, job_name=task_name, log_dir=log_dir, @@ -582,6 +589,7 @@ def add_task( tasks_per_node=cur_tasks, gpus_per_node=num_gpus if server_config is None else 0, partition=partition, + account=account, dependencies=dependencies, job_name=task_name, log_dir=log_dir, @@ -626,6 +634,7 @@ def add_task( tasks_per_node=1, gpus_per_node=0, partition=partition, + account=account, mounts=None if keep_mounts_for_sandbox else [], dependencies=dependencies, job_name=task_name, From 5a89b6835b2ce2b6f444fcb3aee3e8f2cc0e6858 Mon Sep 17 00:00:00 2001 From: Igor Gitman Date: Tue, 3 Feb 2026 14:00:36 -0800 Subject: [PATCH 02/13] Add overrides for containers Signed-off-by: Igor Gitman --- nemo_skills/pipeline/convert.py | 3 ++- nemo_skills/pipeline/eval.py | 24 +++++++++++++++++++++--- nemo_skills/pipeline/generate.py | 10 ++++++++-- nemo_skills/pipeline/run_cmd.py | 2 ++ nemo_skills/pipeline/start_server.py | 5 ++++- nemo_skills/pipeline/utils/exp.py | 3 ++- 6 
files changed, 39 insertions(+), 8 deletions(-) diff --git a/nemo_skills/pipeline/convert.py b/nemo_skills/pipeline/convert.py index 68564b39f0..99ca445552 100644 --- a/nemo_skills/pipeline/convert.py +++ b/nemo_skills/pipeline/convert.py @@ -181,6 +181,7 @@ def convert( None, help="Can specify if need interactive jobs or a specific non-default partition" ), account: str = typer.Option(None, help="Can specify a non-default Slurm account"), + container: str = typer.Option(None, help="Override container image for the conversion job"), qos: str = typer.Option(None, help="Specify Slurm QoS, e.g. to request interactive nodes"), time_min: str = typer.Option(None, help="If specified, will use as a time-min slurm parameter"), mount_paths: str = typer.Option(None, help="Comma separated list of paths to mount on the remote machine"), @@ -321,7 +322,7 @@ def convert( cmd=conversion_cmd, task_name=expname, log_dir=log_dir, - container=container_map[(convert_from, convert_to)], + container=container or container_map[(convert_from, convert_to)], num_gpus=num_gpus, num_nodes=1, # always running on a single node, might need to change that in the future num_tasks=1, diff --git a/nemo_skills/pipeline/eval.py b/nemo_skills/pipeline/eval.py index 28c68f87f3..1907da1311 100644 --- a/nemo_skills/pipeline/eval.py +++ b/nemo_skills/pipeline/eval.py @@ -55,6 +55,7 @@ def _create_comet_judge_tasks( judge_server_nodes, partition, account, + judge_container, run_after, reuse_code_exp, reuse_code, @@ -109,7 +110,7 @@ def _create_comet_judge_tasks( cmd=run_cmd, task_name=f"{expname}-{benchmark}-comet-judge", log_dir=log_dir + "/judge", - container=cluster_config["containers"]["vllm"], + container=judge_container or cluster_config["containers"]["vllm"], cluster_config=cluster_config, num_gpus=judge_server_gpus or 1, num_nodes=judge_server_nodes or 1, @@ -141,6 +142,7 @@ def _create_nvembed_judge_tasks( judge_server_nodes, partition, account, + judge_container, run_after, reuse_code_exp, 
reuse_code, @@ -198,7 +200,7 @@ def _create_nvembed_judge_tasks( cmd=run_cmd, task_name=f"{expname}-{benchmark}-nvembed-judge", log_dir=log_dir + "/judge", - container=cluster_config["containers"]["vllm"], + container=judge_container or cluster_config["containers"]["vllm"], cluster_config=cluster_config, num_gpus=judge_server_gpus or 1, num_nodes=judge_server_nodes or 1, @@ -232,6 +234,8 @@ def _create_llm_judge_tasks( config_dir, partition, account, + main_container, + sandbox_container, with_sandbox, keep_mounts_for_sandbox, run_after, @@ -273,6 +277,8 @@ def _create_llm_judge_tasks( config_dir=config_dir, partition=partition, account=account, + main_container=main_container, + sandbox_container=sandbox_container, with_sandbox=with_sandbox, keep_mounts_for_sandbox=keep_mounts_for_sandbox, run_after=run_after, @@ -358,6 +364,12 @@ def eval( server_container: str = typer.Option( None, help="Override container image for the hosted server (if server_gpus is set)" ), + main_container: str = typer.Option(None, help="Override container image for the main evaluation client"), + sandbox_container: str = typer.Option(None, help="Override container image for the sandbox"), + judge_container: str = typer.Option(None, help="Override container image for GPU-based judges (comet, nvembed)"), + judge_server_container: str = typer.Option( + None, help="Override container image for the hosted judge server (if judge_server_gpus is set)" + ), extra_judge_args: str = typer.Option( "", help="Additional arguments for judge (passed to generate script, so should start with ++)" ), @@ -529,6 +541,7 @@ def eval( "server_nodes": judge_server_nodes, "server_args": judge_server_args, "server_entrypoint": judge_server_entrypoint, + "server_container": judge_server_container, "generation_type": judge_generation_type, "generation_module": judge_generation_module, } @@ -618,7 +631,7 @@ def eval( cmd=pipeline_utils.wrap_python_path(cmd=combine_cmds(cmds, single_node_mode)), 
task_name=f"{expname}-{'-'.join(job_benchmarks)}", log_dir=log_dir, - container=cluster_config["containers"]["nemo-skills"], + container=main_container or cluster_config["containers"]["nemo-skills"], cluster_config=cluster_config, partition=partition, account=account, @@ -627,6 +640,7 @@ def eval( keep_mounts_for_sandbox=job_needs_sandbox_to_keep_mounts or keep_mounts_for_sandbox, sandbox_port=None if get_random_port else 6000, sandbox_env_overrides=job_sandbox_env_overrides, + sandbox_container=sandbox_container, run_after=run_after, reuse_code_exp=reuse_code_exp, reuse_code=reuse_code, @@ -684,6 +698,7 @@ def eval( judge_server_nodes=judge_server_nodes, partition=partition, account=account, + judge_container=judge_container, run_after=run_after, reuse_code_exp=reuse_code_exp, reuse_code=reuse_code, @@ -709,6 +724,7 @@ def eval( judge_server_nodes=judge_server_nodes, partition=partition, account=account, + judge_container=judge_container, run_after=run_after, reuse_code_exp=reuse_code_exp, reuse_code=reuse_code, @@ -737,6 +753,8 @@ def eval( config_dir=config_dir, partition=partition, account=account, + main_container=main_container, + sandbox_container=sandbox_container, with_sandbox=with_sandbox, keep_mounts_for_sandbox=keep_mounts_for_sandbox, run_after=run_after, diff --git a/nemo_skills/pipeline/generate.py b/nemo_skills/pipeline/generate.py index d908fabfcc..187a563cc7 100644 --- a/nemo_skills/pipeline/generate.py +++ b/nemo_skills/pipeline/generate.py @@ -61,6 +61,8 @@ def _create_job_unified( log_dir: str, sbatch_kwargs: Optional[Dict] = None, sandbox_env_overrides: Optional[List[str]] = None, + main_container: Optional[str] = None, + sandbox_container: Optional[str] = None, ) -> List[CommandGroup]: """ Create CommandGroups for n models (unified for n=1 and n>1). 
@@ -148,7 +150,7 @@ def _create_job_unified( sandbox_cmd = Command( script=sandbox_script, - container=cluster_config["containers"]["sandbox"], + container=sandbox_container or cluster_config["containers"]["sandbox"], name=f"{task_name}_sandbox", ) components.append(sandbox_cmd) @@ -179,7 +181,7 @@ def _create_job_unified( client_cmd = Command( script=client_script, - container=cluster_config["containers"]["nemo-skills"], + container=main_container or cluster_config["containers"]["nemo-skills"], name=f"{task_name}", ) components.append(client_cmd) @@ -274,6 +276,8 @@ def generate( help="Container image(s). CLI: space-separated. Python API: string or list. " "Single value broadcasts to all models.", ), + main_container: str = typer.Option(None, help="Override container image for the main generation client"), + sandbox_container: str = typer.Option(None, help="Override container image for the sandbox"), dependent_jobs: int = typer.Option(0, help="Specify this to launch that number of dependent jobs"), mount_paths: str = typer.Option(None, help="Comma separated list of paths to mount on the remote machine"), num_random_seeds: int = typer.Option( @@ -598,6 +602,8 @@ def convert_server_type_to_string(server_type): log_dir=log_dir, sbatch_kwargs=sbatch_kwargs, sandbox_env_overrides=sandbox_env_overrides, + main_container=main_container, + sandbox_container=sandbox_container, ) # Use unique internal job name for dependency tracking, but same task_name diff --git a/nemo_skills/pipeline/run_cmd.py b/nemo_skills/pipeline/run_cmd.py index 9e383ac9ab..2628302234 100644 --- a/nemo_skills/pipeline/run_cmd.py +++ b/nemo_skills/pipeline/run_cmd.py @@ -78,6 +78,7 @@ def run_cmd( server_container: str = typer.Option( None, help="Override container image for the hosted server (if server_gpus is set)" ), + sandbox_container: str = typer.Option(None, help="Override container image for the sandbox"), dependent_jobs: int = typer.Option(0, help="Specify this to launch that number of 
dependent jobs"), mount_paths: str = typer.Option(None, help="Comma separated list of paths to mount on the remote machine"), run_after: List[str] = typer.Option( @@ -203,6 +204,7 @@ def run_cmd( with_sandbox=with_sandbox, keep_mounts_for_sandbox=keep_mounts_for_sandbox, sandbox_port=None if get_random_port else 6000, + sandbox_container=sandbox_container, run_after=run_after, reuse_code=reuse_code, reuse_code_exp=reuse_code_exp, diff --git a/nemo_skills/pipeline/start_server.py b/nemo_skills/pipeline/start_server.py index c875e0d7fe..cd557659e3 100644 --- a/nemo_skills/pipeline/start_server.py +++ b/nemo_skills/pipeline/start_server.py @@ -127,6 +127,8 @@ def start_server( "If not specified, will use the default entrypoint for the server type.", ), server_container: str = typer.Option(None, help="Override container image for the hosted server"), + main_container: str = typer.Option(None, help="Override container image for the main task (e.g., chat interface)"), + sandbox_container: str = typer.Option(None, help="Override container image for the sandbox"), partition: str = typer.Option(None, help="Cluster partition to use"), account: str = typer.Option(None, help="Can specify a non-default Slurm account"), qos: str = typer.Option(None, help="Specify Slurm QoS, e.g. 
to request interactive nodes"), @@ -201,7 +203,7 @@ def start_server( cmd=cmd, task_name="server", log_dir=log_dir, - container=cluster_config["containers"]["nemo-skills"], + container=main_container or cluster_config["containers"]["nemo-skills"], cluster_config=cluster_config, partition=partition, account=account, @@ -209,6 +211,7 @@ def start_server( with_sandbox=with_sandbox, keep_mounts_for_sandbox=keep_mounts_for_sandbox, sandbox_port=sandbox_port, + sandbox_container=sandbox_container, sbatch_kwargs=parse_kwargs(sbatch_kwargs, exclusive=exclusive, qos=qos, time_min=time_min), ) diff --git a/nemo_skills/pipeline/utils/exp.py b/nemo_skills/pipeline/utils/exp.py index c22700652c..949bef372f 100644 --- a/nemo_skills/pipeline/utils/exp.py +++ b/nemo_skills/pipeline/utils/exp.py @@ -436,6 +436,7 @@ def add_task( partition=None, account=None, with_sandbox=False, + sandbox_container=None, keep_mounts_for_sandbox=False, sandbox_port: int | None = None, server_config=None, @@ -629,7 +630,7 @@ def add_task( commands.append(get_sandbox_command(cluster_config)) sandbox_executor = get_executor( cluster_config=cluster_config, - container=cluster_config["containers"]["sandbox"], + container=sandbox_container or cluster_config["containers"]["sandbox"], num_nodes=executors[0].nodes if cluster_config["executor"] == "slurm" else 1, tasks_per_node=1, gpus_per_node=0, From be1b70004fdec4329217a1ed0fb59563dfa6b2c8 Mon Sep 17 00:00:00 2001 From: Igor Gitman Date: Tue, 3 Feb 2026 16:05:10 -0800 Subject: [PATCH 03/13] Debugging Signed-off-by: Igor Gitman --- nemo_skills/inference/model/base.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/nemo_skills/inference/model/base.py b/nemo_skills/inference/model/base.py index 81398a6587..59b4679d3c 100644 --- a/nemo_skills/inference/model/base.py +++ b/nemo_skills/inference/model/base.py @@ -279,7 +279,16 @@ async def generate_async( if endpoint_type == EndpointType.chat: assert isinstance(prompt, list), "Chat completion requests 
must be a list of messages." request_params = self._build_chat_request_params(messages=prompt, stream=stream, **kwargs) + print("MAX TOKENS ASKED", request_params["max_tokens"]) + print("FULL REQUEST PARAMS", request_params) + print("FULL LITELLM KWARGS", self.litellm_kwargs) response = await litellm.acompletion(**request_params, **self.litellm_kwargs) + print("TOKENS RECEIVED", response.usage.completion_tokens) + print("FULL RESPONSE", response) + print("FULL GENERATION", response.choices[0].message.content) + print( + "FULL REASONING CONTENT", getattr(response.choices[0].message, "reasoning_content", None) + ) if stream: result = self._stream_chat_chunks_async(response) else: From 3d7aebd493cb02d1a2929e83893a5d66cae1d482 Mon Sep 17 00:00:00 2001 From: Igor Gitman Date: Tue, 3 Feb 2026 19:40:42 -0800 Subject: [PATCH 04/13] Client side counting Signed-off-by: Igor Gitman --- nemo_skills/inference/model/tool_call.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/nemo_skills/inference/model/tool_call.py b/nemo_skills/inference/model/tool_call.py index 8d25cbf762..0e0d3e7a35 100644 --- a/nemo_skills/inference/model/tool_call.py +++ b/nemo_skills/inference/model/tool_call.py @@ -142,6 +142,15 @@ async def generate_async( endpoint_type=endpoint_type, **generation_kwargs, ) + + # Recount tokens client-side for accuracy + # (vLLM reports wrong completion_tokens when finish_reason='length') + if hasattr(self.model, "tokenizer") and self.model.tokenizer: + text_to_count = generation.get("generation", "") + if generation.get("reasoning_content"): + text_to_count = generation["reasoning_content"] + text_to_count + generation["num_generated_tokens"] = len(self.model.tokenizer.encode(text_to_count)) + if isinstance(tokens_to_generate, int): tokens_to_generate -= generation["num_generated_tokens"] From ef451ddf3d2b278cd1f1bf624243bd07e834b22c Mon Sep 17 00:00:00 2001 From: Igor Gitman Date: Tue, 3 Feb 2026 21:12:19 -0800 Subject: [PATCH 05/13] Fixed tokenizer 
Signed-off-by: Igor Gitman --- nemo_skills/inference/model/tool_call.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/nemo_skills/inference/model/tool_call.py b/nemo_skills/inference/model/tool_call.py index 0e0d3e7a35..83a25e4331 100644 --- a/nemo_skills/inference/model/tool_call.py +++ b/nemo_skills/inference/model/tool_call.py @@ -145,11 +145,10 @@ async def generate_async( # Recount tokens client-side for accuracy # (vLLM reports wrong completion_tokens when finish_reason='length') - if hasattr(self.model, "tokenizer") and self.model.tokenizer: - text_to_count = generation.get("generation", "") - if generation.get("reasoning_content"): - text_to_count = generation["reasoning_content"] + text_to_count - generation["num_generated_tokens"] = len(self.model.tokenizer.encode(text_to_count)) + text_to_count = generation.get("generation", "") + if generation.get("reasoning_content"): + text_to_count = generation["reasoning_content"] + text_to_count + generation["num_generated_tokens"] = len(self.model.tokenizer.encode(text_to_count)) if isinstance(tokens_to_generate, int): tokens_to_generate -= generation["num_generated_tokens"] From 9a9ba456a431edadbd9977759ee7495b61911437 Mon Sep 17 00:00:00 2001 From: Igor Gitman Date: Tue, 3 Feb 2026 21:18:47 -0800 Subject: [PATCH 06/13] Debugging Signed-off-by: Igor Gitman --- nemo_skills/inference/model/tool_call.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/nemo_skills/inference/model/tool_call.py b/nemo_skills/inference/model/tool_call.py index 83a25e4331..cf239db3bc 100644 --- a/nemo_skills/inference/model/tool_call.py +++ b/nemo_skills/inference/model/tool_call.py @@ -143,12 +143,12 @@ async def generate_async( **generation_kwargs, ) - # Recount tokens client-side for accuracy - # (vLLM reports wrong completion_tokens when finish_reason='length') - text_to_count = generation.get("generation", "") - if generation.get("reasoning_content"): - text_to_count = 
generation["reasoning_content"] + text_to_count - generation["num_generated_tokens"] = len(self.model.tokenizer.encode(text_to_count)) + # # Recount tokens client-side for accuracy + # # (vLLM reports wrong completion_tokens when finish_reason='length') + # text_to_count = generation.get("generation", "") + # if generation.get("reasoning_content"): + # text_to_count = generation["reasoning_content"] + text_to_count + # generation["num_generated_tokens"] = len(self.model.tokenizer.encode(text_to_count)) if isinstance(tokens_to_generate, int): tokens_to_generate -= generation["num_generated_tokens"] From dd0d94eabdbaee76dbd73f7ff31ffd98856b3879 Mon Sep 17 00:00:00 2001 From: George Armstrong Date: Wed, 25 Feb 2026 15:01:33 -0800 Subject: [PATCH 07/13] fix: add missing account= arg to test_server_metadata_from_num_tasks The test calls _create_job_unified() which now requires account as a positional argument after the addition of the --account CLI option. Signed-off-by: George Armstrong --- tests/test_generation.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_generation.py b/tests/test_generation.py index 440a1f1446..cc7be52be8 100644 --- a/tests/test_generation.py +++ b/tests/test_generation.py @@ -181,6 +181,7 @@ def test_server_metadata_from_num_tasks(tmp_path): installation_command=None, with_sandbox=False, partition=None, + account=None, keep_mounts_for_sandbox=False, task_name="test-task", log_dir="/tmp/logs", From 316352b7b5ab03efca95a31b141a6d67dbef2958 Mon Sep 17 00:00:00 2001 From: Igor Gitman Date: Wed, 25 Feb 2026 18:04:00 -0800 Subject: [PATCH 08/13] Roll-back tmp changes Signed-off-by: Igor Gitman --- nemo_skills/inference/model/base.py | 9 --------- nemo_skills/inference/model/tool_call.py | 7 ------- 2 files changed, 16 deletions(-) diff --git a/nemo_skills/inference/model/base.py b/nemo_skills/inference/model/base.py index 86fccaebb4..f705e7e06f 100644 --- a/nemo_skills/inference/model/base.py +++ b/nemo_skills/inference/model/base.py 
@@ -281,16 +281,7 @@ async def generate_async( if endpoint_type == EndpointType.chat: assert isinstance(prompt, list), "Chat completion requests must be a list of messages." request_params = self._build_chat_request_params(messages=prompt, stream=stream, **kwargs) - print("MAX TOKENS ASKED", request_params["max_tokens"]) - print("FULL REQUEST PARAMS", request_params) - print("FULL LITELLM KWARGS", self.litellm_kwargs) response = await litellm.acompletion(**request_params, **self.litellm_kwargs) - print("TOKENS RECEIVED", response.usage.completion_tokens) - print("FULL RESPONSE", response) - print("FULL GENERATION", response.choices[0].message.content) - print( - "FULL REASONING CONTENT", getattr(response.choices[0].message, "reasoning_content", None) - ) if stream: result = self._stream_chat_chunks_async(response) else: diff --git a/nemo_skills/inference/model/tool_call.py b/nemo_skills/inference/model/tool_call.py index e912b2bb0a..00d6c2ca4d 100644 --- a/nemo_skills/inference/model/tool_call.py +++ b/nemo_skills/inference/model/tool_call.py @@ -146,13 +146,6 @@ async def generate_async( **generation_kwargs, ) - # # Recount tokens client-side for accuracy - # # (vLLM reports wrong completion_tokens when finish_reason='length') - # text_to_count = generation.get("generation", "") - # if generation.get("reasoning_content"): - # text_to_count = generation["reasoning_content"] + text_to_count - # generation["num_generated_tokens"] = len(self.model.tokenizer.encode(text_to_count)) - if isinstance(tokens_to_generate, int): tokens_to_generate -= generation["num_generated_tokens"] From dbea2cafafc39aa4282b21c20566c3156767f126 Mon Sep 17 00:00:00 2001 From: Igor Gitman Date: Wed, 25 Feb 2026 18:04:34 -0800 Subject: [PATCH 09/13] .get -> [] Signed-off-by: Igor Gitman --- nemo_skills/pipeline/utils/exp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo_skills/pipeline/utils/exp.py b/nemo_skills/pipeline/utils/exp.py index e0b9307b1b..728eda86ab 100644 
--- a/nemo_skills/pipeline/utils/exp.py +++ b/nemo_skills/pipeline/utils/exp.py @@ -332,7 +332,7 @@ def get_executor( job_details_class = CustomJobDetailsRay if with_ray else CustomJobDetails # Resolve account with fallback to cluster_config - account = account or cluster_config.get("account") + account = account or cluster_config["account"] # Build executor parameters as a dictionary to avoid duplicate parameters executor_params = { From a7cac9df8e751f035c9ccbd0424ad57eea049379 Mon Sep 17 00:00:00 2001 From: George Armstrong Date: Thu, 26 Feb 2026 09:36:18 -0800 Subject: [PATCH 10/13] fix: add timeouts to API-calling tests to prevent CI hangs Per-request inference timeout (120s) and pytest-level test timeout (300s) for test_eval_gsm8k_api and test_eval_judge_api. Prevents external API hangs from blocking CI for 8+ minutes. Signed-off-by: George Armstrong --- tests/test_generation.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/test_generation.py b/tests/test_generation.py index cc7be52be8..69adb5e921 100644 --- a/tests/test_generation.py +++ b/tests/test_generation.py @@ -24,6 +24,7 @@ from nemo_skills.pipeline.utils.scripts import ServerScript +@pytest.mark.timeout(300) def test_eval_gsm8k_api(tmp_path): cmd = ( f"ns eval " @@ -33,6 +34,7 @@ def test_eval_gsm8k_api(tmp_path): f" --benchmarks=gsm8k " f" --output_dir={tmp_path} " f" ++max_samples=2 " + f" ++inference.timeout=120 " ) subprocess.run(cmd, shell=True, check=True) @@ -51,6 +53,7 @@ def test_eval_gsm8k_api(tmp_path): assert metrics["symbolic_correct"] >= 80 +@pytest.mark.timeout(300) def test_eval_judge_api(tmp_path): cmd = ( f"ns eval " @@ -64,6 +67,7 @@ def test_eval_judge_api(tmp_path): f" --judge_server_type=openai " f" --judge_generation_type=math_judge " f" ++max_samples=2 " + f" ++inference.timeout=120 " ) subprocess.run(cmd, shell=True, check=True) From 5e7965ac9308ce7bc658ca1c43ca5b700fa92c60 Mon Sep 17 00:00:00 2001 From: George Armstrong Date: Thu, 26 Feb 2026 10:50:17 -0800 
Subject: [PATCH 11/13] fix: also pass inference timeout to judge subprocess in test The judge step in test_eval_judge_api runs as a separate nemo-run job and doesn't inherit ++inference.timeout from the main generation step. Pass it via --extra_judge_args to prevent judge hangs too. Signed-off-by: George Armstrong --- tests/test_generation.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_generation.py b/tests/test_generation.py index 69adb5e921..7f9dff18a6 100644 --- a/tests/test_generation.py +++ b/tests/test_generation.py @@ -66,6 +66,7 @@ def test_eval_judge_api(tmp_path): f" --judge_server_address=https://inference-api.nvidia.com/v1/ " f" --judge_server_type=openai " f" --judge_generation_type=math_judge " + f" --extra_judge_args='++inference.timeout=120' " f" ++max_samples=2 " f" ++inference.timeout=120 " ) From 4374543094ba18a7007502da0cf4c04c672d330f Mon Sep 17 00:00:00 2001 From: George Armstrong Date: Thu, 26 Feb 2026 10:57:18 -0800 Subject: [PATCH 12/13] fix: disable litellm retries in API tests to prevent timeout compounding MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Root cause: litellm max_retries=3 (default) compounds with inference.timeout — a single hanging request can take up to timeout * (max_retries + 1) = 120s * 4 = 480s, exceeding the 300s pytest timeout. Setting max_retries=0 ensures a timeout fails immediately without silent retries. 
Signed-off-by: George Armstrong --- tests/test_generation.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test_generation.py b/tests/test_generation.py index 7f9dff18a6..55cbd79383 100644 --- a/tests/test_generation.py +++ b/tests/test_generation.py @@ -35,6 +35,7 @@ def test_eval_gsm8k_api(tmp_path): f" --output_dir={tmp_path} " f" ++max_samples=2 " f" ++inference.timeout=120 " + f" ++server.max_retries=0 " ) subprocess.run(cmd, shell=True, check=True) @@ -66,9 +67,10 @@ def test_eval_judge_api(tmp_path): f" --judge_server_address=https://inference-api.nvidia.com/v1/ " f" --judge_server_type=openai " f" --judge_generation_type=math_judge " - f" --extra_judge_args='++inference.timeout=120' " + f" --extra_judge_args='++inference.timeout=120 ++server.max_retries=0' " f" ++max_samples=2 " f" ++inference.timeout=120 " + f" ++server.max_retries=0 " ) subprocess.run(cmd, shell=True, check=True) From 92600fb2b11d92e6e2a0824b5961a45e909bf29a Mon Sep 17 00:00:00 2001 From: George Armstrong Date: Fri, 27 Feb 2026 09:11:10 -0800 Subject: [PATCH 13/13] tst: update test parameters to limit retries and use a different model name Signed-off-by: George Armstrong --- tests/test_generation.py | 38 ++++++++++++++++++++++++++------------ 1 file changed, 26 insertions(+), 12 deletions(-) diff --git a/tests/test_generation.py b/tests/test_generation.py index 55cbd79383..77a73ca15d 100644 --- a/tests/test_generation.py +++ b/tests/test_generation.py @@ -29,13 +29,14 @@ def test_eval_gsm8k_api(tmp_path): cmd = ( f"ns eval " f" --server_type=openai " - f" --model=nvidia/nvidia/Nemotron-3-Nano-30B-A3B " + f" --model=nvidia/nvidia/nemotron-nano-30b-v3 " f" --server_address=https://inference-api.nvidia.com/v1/ " f" --benchmarks=gsm8k " f" --output_dir={tmp_path} " f" ++max_samples=2 " + f" ++max_concurrent_requests=1 " f" ++inference.timeout=120 " - f" ++server.max_retries=0 " + f" ++server.max_retries=1 " ) subprocess.run(cmd, shell=True, check=True) @@ -59,18 
+60,19 @@ def test_eval_judge_api(tmp_path): cmd = ( f"ns eval " f" --server_type=openai " - f" --model=nvidia/nvidia/Nemotron-3-Nano-30B-A3B " + f" --model=nvidia/nvidia/nemotron-nano-30b-v3 " f" --server_address=https://inference-api.nvidia.com/v1/ " f" --benchmarks=math-500 " f" --output_dir={tmp_path} " - f" --judge_model=nvidia/nvidia/Nemotron-3-Nano-30B-A3B " + f" --judge_model=nvidia/nvidia/nemotron-nano-30b-v3 " f" --judge_server_address=https://inference-api.nvidia.com/v1/ " f" --judge_server_type=openai " f" --judge_generation_type=math_judge " - f" --extra_judge_args='++inference.timeout=120 ++server.max_retries=0' " + f" --extra_judge_args='++max_concurrent_requests=1 ++inference.timeout=120 ++server.max_retries=1' " f" ++max_samples=2 " + f" ++max_concurrent_requests=1 " f" ++inference.timeout=120 " - f" ++server.max_retries=0 " + f" ++server.max_retries=1 " ) subprocess.run(cmd, shell=True, check=True) @@ -94,7 +96,7 @@ def test_fail_on_api_key_env_var(tmp_path): cmd = ( f"ns eval " f" --server_type=openai " - f" --model=nvidia/nvidia/Nemotron-3-Nano-30B-A3B " + f" --model=nvidia/nvidia/nemotron-nano-30b-v3 " f" --server_address=https://inference-api.nvidia.com/v1/ " f" --benchmarks=gsm8k " f" --output_dir={tmp_path} " @@ -109,17 +111,21 @@ def test_fail_on_api_key_env_var(tmp_path): ), result.stdout.decode() +@pytest.mark.timeout(300) def test_succeed_on_api_key_env_var(tmp_path): cmd = ( f"export MY_CUSTOM_KEY=$NVIDIA_API_KEY && " f"unset NVIDIA_API_KEY && " f"ns eval " f" --server_type=openai " - f" --model=nvidia/nvidia/Nemotron-3-Nano-30B-A3B " + f" --model=nvidia/nvidia/nemotron-nano-30b-v3 " f" --server_address=https://inference-api.nvidia.com/v1/ " f" --benchmarks=gsm8k " f" --output_dir={tmp_path} " f" ++max_samples=2 " + f" ++max_concurrent_requests=1 " + f" ++inference.timeout=120 " + f" ++server.max_retries=1 " f" ++server.api_key_env_var=MY_CUSTOM_KEY " ) subprocess.run(cmd, shell=True, check=True) @@ -139,16 +145,20 @@ def 
test_succeed_on_api_key_env_var(tmp_path): assert metrics["symbolic_correct"] >= 80 +@pytest.mark.timeout(300) @pytest.mark.parametrize("format", ["list", "dict"]) def test_generate_openai_format(tmp_path, format): cmd = ( f"ns generate " f" --server_type=openai " - f" --model=nvidia/nvidia/Nemotron-3-Nano-30B-A3B " + f" --model=nvidia/nvidia/nemotron-nano-30b-v3 " f" --server_address=https://inference-api.nvidia.com/v1/ " f" --input_file=/nemo_run/code/tests/data/openai-input-{format}.test " f" --output_dir={tmp_path} " f" ++prompt_format=openai " + f" ++max_concurrent_requests=1 " + f" ++inference.timeout=120 " + f" ++server.max_retries=1 " ) subprocess.run(cmd, shell=True, check=True) @@ -202,20 +212,24 @@ def test_server_metadata_from_num_tasks(tmp_path): assert groups[0].hardware.num_tasks == server_cmd.script.num_tasks +@pytest.mark.timeout(300) def test_judge_generations_with_structured_output(tmp_path): cmd = ( f"ns eval " f" --server_type=openai " - f" --model=nvidia/nvidia/Nemotron-3-Nano-30B-A3B " + f" --model=nvidia/nvidia/nemotron-nano-30b-v3 " f" --server_address=https://inference-api.nvidia.com/v1/ " f" --benchmarks=hle " f" --output_dir={tmp_path} " - f" --judge_model=nvidia/nvidia/Nemotron-3-Nano-30B-A3B " + f" --judge_model=nvidia/nvidia/nemotron-nano-30b-v3 " f" --judge_server_address=https://inference-api.nvidia.com/v1/ " f" --judge_server_type=openai " f" --metric_type=hle-aa " - f' --extra_judge_args="++structured_output=HLE_JUDGE_AA" ' + f' --extra_judge_args="++structured_output=HLE_JUDGE_AA ++max_concurrent_requests=1 ++inference.timeout=120 ++server.max_retries=1" ' f" ++max_samples=2 " + f" ++max_concurrent_requests=1 " + f" ++inference.timeout=120 " + f" ++server.max_retries=1 " f" ++inference.tokens_to_generate=1024 " # to make test go fast ) subprocess.run(cmd, shell=True, check=True)