From b1c206679ad704810ea116b3427e3fca6a48baf1 Mon Sep 17 00:00:00 2001 From: Igor Gitman Date: Tue, 3 Feb 2026 11:52:58 -0800 Subject: [PATCH 01/13] Add an option to specify non-default slurm account Signed-off-by: Igor Gitman --- nemo_skills/pipeline/convert.py | 2 ++ nemo_skills/pipeline/eval.py | 11 +++++++++++ nemo_skills/pipeline/generate.py | 4 ++++ nemo_skills/pipeline/nemo_evaluator.py | 21 +++++++++++++++++++-- nemo_skills/pipeline/run_cmd.py | 2 ++ nemo_skills/pipeline/start_server.py | 2 ++ nemo_skills/pipeline/utils/declarative.py | 2 ++ nemo_skills/pipeline/utils/exp.py | 11 ++++++++++- 8 files changed, 52 insertions(+), 3 deletions(-) diff --git a/nemo_skills/pipeline/convert.py b/nemo_skills/pipeline/convert.py index 97c7e52597..68564b39f0 100644 --- a/nemo_skills/pipeline/convert.py +++ b/nemo_skills/pipeline/convert.py @@ -180,6 +180,7 @@ def convert( partition: str = typer.Option( None, help="Can specify if need interactive jobs or a specific non-default partition" ), + account: str = typer.Option(None, help="Can specify a non-default Slurm account"), qos: str = typer.Option(None, help="Specify Slurm QoS, e.g. 
to request interactive nodes"), time_min: str = typer.Option(None, help="If specified, will use as a time-min slurm parameter"), mount_paths: str = typer.Option(None, help="Comma separated list of paths to mount on the remote machine"), @@ -326,6 +327,7 @@ def convert( num_tasks=1, cluster_config=cluster_config, partition=partition, + account=account, run_after=run_after, reuse_code=reuse_code, reuse_code_exp=reuse_code_exp, diff --git a/nemo_skills/pipeline/eval.py b/nemo_skills/pipeline/eval.py index fac2f884bd..28c68f87f3 100644 --- a/nemo_skills/pipeline/eval.py +++ b/nemo_skills/pipeline/eval.py @@ -54,6 +54,7 @@ def _create_comet_judge_tasks( judge_server_gpus, judge_server_nodes, partition, + account, run_after, reuse_code_exp, reuse_code, @@ -113,6 +114,7 @@ def _create_comet_judge_tasks( num_gpus=judge_server_gpus or 1, num_nodes=judge_server_nodes or 1, partition=partition, + account=account, run_after=run_after, reuse_code_exp=reuse_code_exp, reuse_code=reuse_code, @@ -138,6 +140,7 @@ def _create_nvembed_judge_tasks( judge_server_gpus, judge_server_nodes, partition, + account, run_after, reuse_code_exp, reuse_code, @@ -200,6 +203,7 @@ def _create_nvembed_judge_tasks( num_gpus=judge_server_gpus or 1, num_nodes=judge_server_nodes or 1, partition=partition, + account=account, run_after=run_after, reuse_code_exp=reuse_code_exp, reuse_code=reuse_code, @@ -227,6 +231,7 @@ def _create_llm_judge_tasks( cluster, config_dir, partition, + account, with_sandbox, keep_mounts_for_sandbox, run_after, @@ -267,6 +272,7 @@ def _create_llm_judge_tasks( cluster=cluster, config_dir=config_dir, partition=partition, + account=account, with_sandbox=with_sandbox, keep_mounts_for_sandbox=keep_mounts_for_sandbox, run_after=run_after, @@ -378,6 +384,7 @@ def eval( "Can provide a list directly when using through Python", ), partition: str = typer.Option(None, help="Cluster partition to use"), + account: str = typer.Option(None, help="Can specify a non-default Slurm account"), qos: 
str = typer.Option(None, help="Specify Slurm QoS, e.g. to request interactive nodes"), time_min: str = typer.Option(None, help="If specified, will use as a time-min slurm parameter"), mount_paths: str = typer.Option(None, help="Comma separated list of paths to mount on the remote machine"), @@ -614,6 +621,7 @@ def eval( container=cluster_config["containers"]["nemo-skills"], cluster_config=cluster_config, partition=partition, + account=account, server_config=job_server_config, with_sandbox=job_needs_sandbox or with_sandbox, keep_mounts_for_sandbox=job_needs_sandbox_to_keep_mounts or keep_mounts_for_sandbox, @@ -675,6 +683,7 @@ def eval( judge_server_gpus=judge_server_gpus, judge_server_nodes=judge_server_nodes, partition=partition, + account=account, run_after=run_after, reuse_code_exp=reuse_code_exp, reuse_code=reuse_code, @@ -699,6 +708,7 @@ def eval( judge_server_gpus=judge_server_gpus, judge_server_nodes=judge_server_nodes, partition=partition, + account=account, run_after=run_after, reuse_code_exp=reuse_code_exp, reuse_code=reuse_code, @@ -726,6 +736,7 @@ def eval( cluster=cluster, config_dir=config_dir, partition=partition, + account=account, with_sandbox=with_sandbox, keep_mounts_for_sandbox=keep_mounts_for_sandbox, run_after=run_after, diff --git a/nemo_skills/pipeline/generate.py b/nemo_skills/pipeline/generate.py index a1b96f556a..d908fabfcc 100644 --- a/nemo_skills/pipeline/generate.py +++ b/nemo_skills/pipeline/generate.py @@ -55,6 +55,7 @@ def _create_job_unified( installation_command: Optional[str], with_sandbox: bool, partition: Optional[str], + account: Optional[str], keep_mounts_for_sandbox: bool, task_name: str, log_dir: str, @@ -191,6 +192,7 @@ def _create_job_unified( commands=components, hardware=HardwareConfig( partition=partition, + account=account, num_gpus=group_gpus, num_nodes=group_nodes, num_tasks=group_tasks, @@ -297,6 +299,7 @@ def generate( partition: str = typer.Option( None, help="Can specify if need interactive jobs or a specific 
non-default partition" ), + account: str = typer.Option(None, help="Can specify a non-default Slurm account"), qos: str = typer.Option(None, help="Specify Slurm QoS, e.g. to request interactive nodes"), time_min: str = typer.Option(None, help="If specified, will use as a time-min slurm parameter"), run_after: List[str] = typer.Option( @@ -589,6 +592,7 @@ def convert_server_type_to_string(server_type): installation_command=installation_command, with_sandbox=with_sandbox, partition=partition, + account=account, keep_mounts_for_sandbox=keep_mounts_for_sandbox, task_name=task_name, log_dir=log_dir, diff --git a/nemo_skills/pipeline/nemo_evaluator.py b/nemo_skills/pipeline/nemo_evaluator.py index 39838737ed..f3e9057c0e 100644 --- a/nemo_skills/pipeline/nemo_evaluator.py +++ b/nemo_skills/pipeline/nemo_evaluator.py @@ -125,6 +125,7 @@ def nemo_evaluator( job_gpus: int = typer.Option(0, help="GPUs to allocate for the evaluator client when no servers are hosted"), job_nodes: int = typer.Option(1, help="Nodes to allocate for the evaluator job"), partition: str = typer.Option(None, help="Cluster partition to use"), + account: str = typer.Option(None, help="Can specify a non-default Slurm account"), qos: str = typer.Option(None, help="Slurm QoS"), mount_paths: str = typer.Option(None, help="Comma separated list of paths to mount on the remote machine"), log_dir: str = typer.Option(None, help="Custom location for logs"), @@ -325,6 +326,7 @@ def nemo_evaluator( job_nodes=job_nodes, cluster_config=cluster_config, partition=partition, + account=account, qos=qos, exclusive=exclusive, ) @@ -346,6 +348,7 @@ def nemo_evaluator( commands=[main_server_cmd, client_cmd], hardware=_hardware_for_group( task_ctx.partition, + task_ctx.account, task_ctx.server_gpus or None, task_ctx.server_nodes or 1, task_ctx.qos, @@ -358,6 +361,7 @@ def nemo_evaluator( commands=[judge_server_cmd], hardware=_hardware_for_group( task_ctx.partition, + task_ctx.account, task_ctx.judge_server_gpus or None, 
task_ctx.judge_server_nodes or 1, task_ctx.qos, @@ -393,7 +397,12 @@ def nemo_evaluator( CommandGroup( commands=sg_cmds, hardware=_hardware_for_group( - task_ctx.partition, group_num_gpus, group_num_nodes, task_ctx.qos, task_ctx.exclusive + task_ctx.partition, + task_ctx.account, + group_num_gpus, + group_num_nodes, + task_ctx.qos, + task_ctx.exclusive, ), name=f"{task_ctx.expname}-{task_ctx.idx}", log_dir=log_dir, @@ -543,17 +552,24 @@ class _TaskCreationContext: job_nodes: int cluster_config: Dict partition: Optional[str] + account: Optional[str] qos: Optional[str] exclusive: bool def _hardware_for_group( - partition: Optional[str], num_gpus: Optional[int], num_nodes: int, qos: Optional[str], exclusive: bool + partition: Optional[str], + account: Optional[str], + num_gpus: Optional[int], + num_nodes: int, + qos: Optional[str], + exclusive: bool, ) -> HardwareConfig: """Create HardwareConfig for a CommandGroup. Args: partition: SLURM partition name + account: SLURM account name num_gpus: Number of GPUs (None means no GPU allocation) num_nodes: Number of nodes qos: SLURM QoS setting @@ -564,6 +580,7 @@ def _hardware_for_group( """ return HardwareConfig( partition=partition, + account=account, num_gpus=num_gpus, num_nodes=num_nodes, sbatch_kwargs={ diff --git a/nemo_skills/pipeline/run_cmd.py b/nemo_skills/pipeline/run_cmd.py index 5706d516e6..9e383ac9ab 100644 --- a/nemo_skills/pipeline/run_cmd.py +++ b/nemo_skills/pipeline/run_cmd.py @@ -58,6 +58,7 @@ def run_cmd( partition: str = typer.Option( None, help="Can specify if need interactive jobs or a specific non-default partition" ), + account: str = typer.Option(None, help="Can specify a non-default Slurm account"), qos: str = typer.Option(None, help="Specify Slurm QoS, e.g. 
to request interactive nodes"), time_min: str = typer.Option(None, help="If specified, will use as a time-min slurm parameter"), num_gpus: int | None = typer.Option(None, help="Number of GPUs per node to use"), @@ -197,6 +198,7 @@ def run_cmd( container=containers, cluster_config=cluster_config, partition=partition, + account=account, server_config=server_config, with_sandbox=with_sandbox, keep_mounts_for_sandbox=keep_mounts_for_sandbox, diff --git a/nemo_skills/pipeline/start_server.py b/nemo_skills/pipeline/start_server.py index 71eae6bcb0..c875e0d7fe 100644 --- a/nemo_skills/pipeline/start_server.py +++ b/nemo_skills/pipeline/start_server.py @@ -128,6 +128,7 @@ def start_server( ), server_container: str = typer.Option(None, help="Override container image for the hosted server"), partition: str = typer.Option(None, help="Cluster partition to use"), + account: str = typer.Option(None, help="Can specify a non-default Slurm account"), qos: str = typer.Option(None, help="Specify Slurm QoS, e.g. 
to request interactive nodes"), time_min: str = typer.Option(None, help="If specified, will use as a time-min slurm parameter"), mount_paths: str = typer.Option(None, help="Comma separated list of paths to mount on the remote machine"), @@ -203,6 +204,7 @@ def start_server( container=cluster_config["containers"]["nemo-skills"], cluster_config=cluster_config, partition=partition, + account=account, server_config=server_config, with_sandbox=with_sandbox, keep_mounts_for_sandbox=keep_mounts_for_sandbox, diff --git a/nemo_skills/pipeline/utils/declarative.py b/nemo_skills/pipeline/utils/declarative.py index 7029dcc638..f47067ee99 100644 --- a/nemo_skills/pipeline/utils/declarative.py +++ b/nemo_skills/pipeline/utils/declarative.py @@ -264,6 +264,7 @@ class HardwareConfig: """Hardware configuration for a group of tasks.""" partition: Optional[str] = None + account: Optional[str] = None num_gpus: Optional[int] = None num_nodes: Optional[int] = None num_tasks: Optional[int] = 1 @@ -585,6 +586,7 @@ def _create_executor( log_dir=log_dir, log_prefix=exec_config["log_prefix"], partition=hardware.partition if hardware else None, + account=hardware.account if hardware else None, heterogeneous=heterogeneous, het_group=het_group, total_het_groups=total_het_groups, diff --git a/nemo_skills/pipeline/utils/exp.py b/nemo_skills/pipeline/utils/exp.py index 3bce2eb864..c22700652c 100644 --- a/nemo_skills/pipeline/utils/exp.py +++ b/nemo_skills/pipeline/utils/exp.py @@ -168,6 +168,7 @@ def get_executor( log_prefix: str = "main", mounts=None, partition=None, + account=None, dependencies=None, extra_package_dirs: tuple[str] | None = None, heterogeneous=False, @@ -210,6 +211,7 @@ def get_executor( taken from `cluster_config`. partition: SLURM partition override. If omitted, inferred from `gpus_per_node` and `cluster_config`. + account: SLURM account override. If omitted, uses `cluster_config["account"]`. dependencies: SLURM job handles to depend on. 
The dependency type is taken from `cluster_config['dependency_type']` (default: "afterany"). extra_package_dirs: Additional directories to package with the code for remote @@ -328,9 +330,12 @@ def get_executor( dependency_type = cluster_config.get("dependency_type", "afterany") job_details_class = CustomJobDetailsRay if with_ray else CustomJobDetails + # Resolve account with fallback to cluster_config + account = account or cluster_config.get("account") + # Build executor parameters as a dictionary to avoid duplicate parameters executor_params = { - "account": cluster_config["account"], + "account": account, "partition": partition, "nodes": num_nodes, "ntasks_per_node": tasks_per_node, @@ -429,6 +434,7 @@ def add_task( num_nodes=1, log_dir=None, partition=None, + account=None, with_sandbox=False, keep_mounts_for_sandbox=False, sandbox_port: int | None = None, @@ -538,6 +544,7 @@ def add_task( tasks_per_node=num_server_tasks, gpus_per_node=server_config["num_gpus"], partition=partition, + account=account, dependencies=dependencies, job_name=task_name, log_dir=log_dir, @@ -582,6 +589,7 @@ def add_task( tasks_per_node=cur_tasks, gpus_per_node=num_gpus if server_config is None else 0, partition=partition, + account=account, dependencies=dependencies, job_name=task_name, log_dir=log_dir, @@ -626,6 +634,7 @@ def add_task( tasks_per_node=1, gpus_per_node=0, partition=partition, + account=account, mounts=None if keep_mounts_for_sandbox else [], dependencies=dependencies, job_name=task_name, From 5a89b6835b2ce2b6f444fcb3aee3e8f2cc0e6858 Mon Sep 17 00:00:00 2001 From: Igor Gitman Date: Tue, 3 Feb 2026 14:00:36 -0800 Subject: [PATCH 02/13] Add overrides for containers Signed-off-by: Igor Gitman --- nemo_skills/pipeline/convert.py | 3 ++- nemo_skills/pipeline/eval.py | 24 +++++++++++++++++++++--- nemo_skills/pipeline/generate.py | 10 ++++++++-- nemo_skills/pipeline/run_cmd.py | 2 ++ nemo_skills/pipeline/start_server.py | 5 ++++- nemo_skills/pipeline/utils/exp.py | 3 ++- 6 
files changed, 39 insertions(+), 8 deletions(-) diff --git a/nemo_skills/pipeline/convert.py b/nemo_skills/pipeline/convert.py index 68564b39f0..99ca445552 100644 --- a/nemo_skills/pipeline/convert.py +++ b/nemo_skills/pipeline/convert.py @@ -181,6 +181,7 @@ def convert( None, help="Can specify if need interactive jobs or a specific non-default partition" ), account: str = typer.Option(None, help="Can specify a non-default Slurm account"), + container: str = typer.Option(None, help="Override container image for the conversion job"), qos: str = typer.Option(None, help="Specify Slurm QoS, e.g. to request interactive nodes"), time_min: str = typer.Option(None, help="If specified, will use as a time-min slurm parameter"), mount_paths: str = typer.Option(None, help="Comma separated list of paths to mount on the remote machine"), @@ -321,7 +322,7 @@ def convert( cmd=conversion_cmd, task_name=expname, log_dir=log_dir, - container=container_map[(convert_from, convert_to)], + container=container or container_map[(convert_from, convert_to)], num_gpus=num_gpus, num_nodes=1, # always running on a single node, might need to change that in the future num_tasks=1, diff --git a/nemo_skills/pipeline/eval.py b/nemo_skills/pipeline/eval.py index 28c68f87f3..1907da1311 100644 --- a/nemo_skills/pipeline/eval.py +++ b/nemo_skills/pipeline/eval.py @@ -55,6 +55,7 @@ def _create_comet_judge_tasks( judge_server_nodes, partition, account, + judge_container, run_after, reuse_code_exp, reuse_code, @@ -109,7 +110,7 @@ def _create_comet_judge_tasks( cmd=run_cmd, task_name=f"{expname}-{benchmark}-comet-judge", log_dir=log_dir + "/judge", - container=cluster_config["containers"]["vllm"], + container=judge_container or cluster_config["containers"]["vllm"], cluster_config=cluster_config, num_gpus=judge_server_gpus or 1, num_nodes=judge_server_nodes or 1, @@ -141,6 +142,7 @@ def _create_nvembed_judge_tasks( judge_server_nodes, partition, account, + judge_container, run_after, reuse_code_exp, 
reuse_code, @@ -198,7 +200,7 @@ def _create_nvembed_judge_tasks( cmd=run_cmd, task_name=f"{expname}-{benchmark}-nvembed-judge", log_dir=log_dir + "/judge", - container=cluster_config["containers"]["vllm"], + container=judge_container or cluster_config["containers"]["vllm"], cluster_config=cluster_config, num_gpus=judge_server_gpus or 1, num_nodes=judge_server_nodes or 1, @@ -232,6 +234,8 @@ def _create_llm_judge_tasks( config_dir, partition, account, + main_container, + sandbox_container, with_sandbox, keep_mounts_for_sandbox, run_after, @@ -273,6 +277,8 @@ def _create_llm_judge_tasks( config_dir=config_dir, partition=partition, account=account, + main_container=main_container, + sandbox_container=sandbox_container, with_sandbox=with_sandbox, keep_mounts_for_sandbox=keep_mounts_for_sandbox, run_after=run_after, @@ -358,6 +364,12 @@ def eval( server_container: str = typer.Option( None, help="Override container image for the hosted server (if server_gpus is set)" ), + main_container: str = typer.Option(None, help="Override container image for the main evaluation client"), + sandbox_container: str = typer.Option(None, help="Override container image for the sandbox"), + judge_container: str = typer.Option(None, help="Override container image for GPU-based judges (comet, nvembed)"), + judge_server_container: str = typer.Option( + None, help="Override container image for the hosted judge server (if judge_server_gpus is set)" + ), extra_judge_args: str = typer.Option( "", help="Additional arguments for judge (passed to generate script, so should start with ++)" ), @@ -529,6 +541,7 @@ def eval( "server_nodes": judge_server_nodes, "server_args": judge_server_args, "server_entrypoint": judge_server_entrypoint, + "server_container": judge_server_container, "generation_type": judge_generation_type, "generation_module": judge_generation_module, } @@ -618,7 +631,7 @@ def eval( cmd=pipeline_utils.wrap_python_path(cmd=combine_cmds(cmds, single_node_mode)), 
task_name=f"{expname}-{'-'.join(job_benchmarks)}", log_dir=log_dir, - container=cluster_config["containers"]["nemo-skills"], + container=main_container or cluster_config["containers"]["nemo-skills"], cluster_config=cluster_config, partition=partition, account=account, @@ -627,6 +640,7 @@ def eval( keep_mounts_for_sandbox=job_needs_sandbox_to_keep_mounts or keep_mounts_for_sandbox, sandbox_port=None if get_random_port else 6000, sandbox_env_overrides=job_sandbox_env_overrides, + sandbox_container=sandbox_container, run_after=run_after, reuse_code_exp=reuse_code_exp, reuse_code=reuse_code, @@ -684,6 +698,7 @@ def eval( judge_server_nodes=judge_server_nodes, partition=partition, account=account, + judge_container=judge_container, run_after=run_after, reuse_code_exp=reuse_code_exp, reuse_code=reuse_code, @@ -709,6 +724,7 @@ def eval( judge_server_nodes=judge_server_nodes, partition=partition, account=account, + judge_container=judge_container, run_after=run_after, reuse_code_exp=reuse_code_exp, reuse_code=reuse_code, @@ -737,6 +753,8 @@ def eval( config_dir=config_dir, partition=partition, account=account, + main_container=main_container, + sandbox_container=sandbox_container, with_sandbox=with_sandbox, keep_mounts_for_sandbox=keep_mounts_for_sandbox, run_after=run_after, diff --git a/nemo_skills/pipeline/generate.py b/nemo_skills/pipeline/generate.py index d908fabfcc..187a563cc7 100644 --- a/nemo_skills/pipeline/generate.py +++ b/nemo_skills/pipeline/generate.py @@ -61,6 +61,8 @@ def _create_job_unified( log_dir: str, sbatch_kwargs: Optional[Dict] = None, sandbox_env_overrides: Optional[List[str]] = None, + main_container: Optional[str] = None, + sandbox_container: Optional[str] = None, ) -> List[CommandGroup]: """ Create CommandGroups for n models (unified for n=1 and n>1). 
@@ -148,7 +150,7 @@ def _create_job_unified( sandbox_cmd = Command( script=sandbox_script, - container=cluster_config["containers"]["sandbox"], + container=sandbox_container or cluster_config["containers"]["sandbox"], name=f"{task_name}_sandbox", ) components.append(sandbox_cmd) @@ -179,7 +181,7 @@ def _create_job_unified( client_cmd = Command( script=client_script, - container=cluster_config["containers"]["nemo-skills"], + container=main_container or cluster_config["containers"]["nemo-skills"], name=f"{task_name}", ) components.append(client_cmd) @@ -274,6 +276,8 @@ def generate( help="Container image(s). CLI: space-separated. Python API: string or list. " "Single value broadcasts to all models.", ), + main_container: str = typer.Option(None, help="Override container image for the main generation client"), + sandbox_container: str = typer.Option(None, help="Override container image for the sandbox"), dependent_jobs: int = typer.Option(0, help="Specify this to launch that number of dependent jobs"), mount_paths: str = typer.Option(None, help="Comma separated list of paths to mount on the remote machine"), num_random_seeds: int = typer.Option( @@ -598,6 +602,8 @@ def convert_server_type_to_string(server_type): log_dir=log_dir, sbatch_kwargs=sbatch_kwargs, sandbox_env_overrides=sandbox_env_overrides, + main_container=main_container, + sandbox_container=sandbox_container, ) # Use unique internal job name for dependency tracking, but same task_name diff --git a/nemo_skills/pipeline/run_cmd.py b/nemo_skills/pipeline/run_cmd.py index 9e383ac9ab..2628302234 100644 --- a/nemo_skills/pipeline/run_cmd.py +++ b/nemo_skills/pipeline/run_cmd.py @@ -78,6 +78,7 @@ def run_cmd( server_container: str = typer.Option( None, help="Override container image for the hosted server (if server_gpus is set)" ), + sandbox_container: str = typer.Option(None, help="Override container image for the sandbox"), dependent_jobs: int = typer.Option(0, help="Specify this to launch that number of 
dependent jobs"), mount_paths: str = typer.Option(None, help="Comma separated list of paths to mount on the remote machine"), run_after: List[str] = typer.Option( @@ -203,6 +204,7 @@ def run_cmd( with_sandbox=with_sandbox, keep_mounts_for_sandbox=keep_mounts_for_sandbox, sandbox_port=None if get_random_port else 6000, + sandbox_container=sandbox_container, run_after=run_after, reuse_code=reuse_code, reuse_code_exp=reuse_code_exp, diff --git a/nemo_skills/pipeline/start_server.py b/nemo_skills/pipeline/start_server.py index c875e0d7fe..cd557659e3 100644 --- a/nemo_skills/pipeline/start_server.py +++ b/nemo_skills/pipeline/start_server.py @@ -127,6 +127,8 @@ def start_server( "If not specified, will use the default entrypoint for the server type.", ), server_container: str = typer.Option(None, help="Override container image for the hosted server"), + main_container: str = typer.Option(None, help="Override container image for the main task (e.g., chat interface)"), + sandbox_container: str = typer.Option(None, help="Override container image for the sandbox"), partition: str = typer.Option(None, help="Cluster partition to use"), account: str = typer.Option(None, help="Can specify a non-default Slurm account"), qos: str = typer.Option(None, help="Specify Slurm QoS, e.g. 
to request interactive nodes"), @@ -201,7 +203,7 @@ def start_server( cmd=cmd, task_name="server", log_dir=log_dir, - container=cluster_config["containers"]["nemo-skills"], + container=main_container or cluster_config["containers"]["nemo-skills"], cluster_config=cluster_config, partition=partition, account=account, @@ -209,6 +211,7 @@ def start_server( with_sandbox=with_sandbox, keep_mounts_for_sandbox=keep_mounts_for_sandbox, sandbox_port=sandbox_port, + sandbox_container=sandbox_container, sbatch_kwargs=parse_kwargs(sbatch_kwargs, exclusive=exclusive, qos=qos, time_min=time_min), ) diff --git a/nemo_skills/pipeline/utils/exp.py b/nemo_skills/pipeline/utils/exp.py index c22700652c..949bef372f 100644 --- a/nemo_skills/pipeline/utils/exp.py +++ b/nemo_skills/pipeline/utils/exp.py @@ -436,6 +436,7 @@ def add_task( partition=None, account=None, with_sandbox=False, + sandbox_container=None, keep_mounts_for_sandbox=False, sandbox_port: int | None = None, server_config=None, @@ -629,7 +630,7 @@ def add_task( commands.append(get_sandbox_command(cluster_config)) sandbox_executor = get_executor( cluster_config=cluster_config, - container=cluster_config["containers"]["sandbox"], + container=sandbox_container or cluster_config["containers"]["sandbox"], num_nodes=executors[0].nodes if cluster_config["executor"] == "slurm" else 1, tasks_per_node=1, gpus_per_node=0, From be1b70004fdec4329217a1ed0fb59563dfa6b2c8 Mon Sep 17 00:00:00 2001 From: Igor Gitman Date: Tue, 3 Feb 2026 16:05:10 -0800 Subject: [PATCH 03/13] Debugging Signed-off-by: Igor Gitman --- nemo_skills/inference/model/base.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/nemo_skills/inference/model/base.py b/nemo_skills/inference/model/base.py index 81398a6587..59b4679d3c 100644 --- a/nemo_skills/inference/model/base.py +++ b/nemo_skills/inference/model/base.py @@ -279,7 +279,16 @@ async def generate_async( if endpoint_type == EndpointType.chat: assert isinstance(prompt, list), "Chat completion requests 
must be a list of messages." request_params = self._build_chat_request_params(messages=prompt, stream=stream, **kwargs) + print("MAX TOKENS ASKED", request_params["max_tokens"]) + print("FULL REQUEST PARAMS", request_params) + print("FULL LITELLM KWARGS", self.litellm_kwargs) response = await litellm.acompletion(**request_params, **self.litellm_kwargs) + print("TOKENS RECEIVED", response.usage.completion_tokens) + print("FULL RESPONSE", response) + print("FULL GENERATION", response.choices[0].message.content) + print( + "FULL REASONING CONTENT", getattr(response.choices[0].message, "reasoning_content", None) + ) if stream: result = self._stream_chat_chunks_async(response) else: From 3d7aebd493cb02d1a2929e83893a5d66cae1d482 Mon Sep 17 00:00:00 2001 From: Igor Gitman Date: Tue, 3 Feb 2026 19:40:42 -0800 Subject: [PATCH 04/13] Client side counting Signed-off-by: Igor Gitman --- nemo_skills/inference/model/tool_call.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/nemo_skills/inference/model/tool_call.py b/nemo_skills/inference/model/tool_call.py index 8d25cbf762..0e0d3e7a35 100644 --- a/nemo_skills/inference/model/tool_call.py +++ b/nemo_skills/inference/model/tool_call.py @@ -142,6 +142,15 @@ async def generate_async( endpoint_type=endpoint_type, **generation_kwargs, ) + + # Recount tokens client-side for accuracy + # (vLLM reports wrong completion_tokens when finish_reason='length') + if hasattr(self.model, "tokenizer") and self.model.tokenizer: + text_to_count = generation.get("generation", "") + if generation.get("reasoning_content"): + text_to_count = generation["reasoning_content"] + text_to_count + generation["num_generated_tokens"] = len(self.model.tokenizer.encode(text_to_count)) + if isinstance(tokens_to_generate, int): tokens_to_generate -= generation["num_generated_tokens"] From ef451ddf3d2b278cd1f1bf624243bd07e834b22c Mon Sep 17 00:00:00 2001 From: Igor Gitman Date: Tue, 3 Feb 2026 21:12:19 -0800 Subject: [PATCH 05/13] Fixed tokenizer 
Signed-off-by: Igor Gitman --- nemo_skills/inference/model/tool_call.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/nemo_skills/inference/model/tool_call.py b/nemo_skills/inference/model/tool_call.py index 0e0d3e7a35..83a25e4331 100644 --- a/nemo_skills/inference/model/tool_call.py +++ b/nemo_skills/inference/model/tool_call.py @@ -145,11 +145,10 @@ async def generate_async( # Recount tokens client-side for accuracy # (vLLM reports wrong completion_tokens when finish_reason='length') - if hasattr(self.model, "tokenizer") and self.model.tokenizer: - text_to_count = generation.get("generation", "") - if generation.get("reasoning_content"): - text_to_count = generation["reasoning_content"] + text_to_count - generation["num_generated_tokens"] = len(self.model.tokenizer.encode(text_to_count)) + text_to_count = generation.get("generation", "") + if generation.get("reasoning_content"): + text_to_count = generation["reasoning_content"] + text_to_count + generation["num_generated_tokens"] = len(self.model.tokenizer.encode(text_to_count)) if isinstance(tokens_to_generate, int): tokens_to_generate -= generation["num_generated_tokens"] From 9a9ba456a431edadbd9977759ee7495b61911437 Mon Sep 17 00:00:00 2001 From: Igor Gitman Date: Tue, 3 Feb 2026 21:18:47 -0800 Subject: [PATCH 06/13] Debugging Signed-off-by: Igor Gitman --- nemo_skills/inference/model/tool_call.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/nemo_skills/inference/model/tool_call.py b/nemo_skills/inference/model/tool_call.py index 83a25e4331..cf239db3bc 100644 --- a/nemo_skills/inference/model/tool_call.py +++ b/nemo_skills/inference/model/tool_call.py @@ -143,12 +143,12 @@ async def generate_async( **generation_kwargs, ) - # Recount tokens client-side for accuracy - # (vLLM reports wrong completion_tokens when finish_reason='length') - text_to_count = generation.get("generation", "") - if generation.get("reasoning_content"): - text_to_count = 
generation["reasoning_content"] + text_to_count - generation["num_generated_tokens"] = len(self.model.tokenizer.encode(text_to_count)) + # # Recount tokens client-side for accuracy + # # (vLLM reports wrong completion_tokens when finish_reason='length') + # text_to_count = generation.get("generation", "") + # if generation.get("reasoning_content"): + # text_to_count = generation["reasoning_content"] + text_to_count + # generation["num_generated_tokens"] = len(self.model.tokenizer.encode(text_to_count)) if isinstance(tokens_to_generate, int): tokens_to_generate -= generation["num_generated_tokens"] From dd0d94eabdbaee76dbd73f7ff31ffd98856b3879 Mon Sep 17 00:00:00 2001 From: George Armstrong Date: Wed, 25 Feb 2026 15:01:33 -0800 Subject: [PATCH 07/13] fix: add missing account= arg to test_server_metadata_from_num_tasks The test calls _create_job_unified() which now requires account as a positional argument after the addition of the --account CLI option. Signed-off-by: George Armstrong --- tests/test_generation.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_generation.py b/tests/test_generation.py index 440a1f1446..cc7be52be8 100644 --- a/tests/test_generation.py +++ b/tests/test_generation.py @@ -181,6 +181,7 @@ def test_server_metadata_from_num_tasks(tmp_path): installation_command=None, with_sandbox=False, partition=None, + account=None, keep_mounts_for_sandbox=False, task_name="test-task", log_dir="/tmp/logs", From 316352b7b5ab03efca95a31b141a6d67dbef2958 Mon Sep 17 00:00:00 2001 From: Igor Gitman Date: Wed, 25 Feb 2026 18:04:00 -0800 Subject: [PATCH 08/13] Roll-back tmp changes Signed-off-by: Igor Gitman --- nemo_skills/inference/model/base.py | 9 --------- nemo_skills/inference/model/tool_call.py | 7 ------- 2 files changed, 16 deletions(-) diff --git a/nemo_skills/inference/model/base.py b/nemo_skills/inference/model/base.py index 86fccaebb4..f705e7e06f 100644 --- a/nemo_skills/inference/model/base.py +++ b/nemo_skills/inference/model/base.py 
@@ -281,16 +281,7 @@ async def generate_async( if endpoint_type == EndpointType.chat: assert isinstance(prompt, list), "Chat completion requests must be a list of messages." request_params = self._build_chat_request_params(messages=prompt, stream=stream, **kwargs) - print("MAX TOKENS ASKED", request_params["max_tokens"]) - print("FULL REQUEST PARAMS", request_params) - print("FULL LITELLM KWARGS", self.litellm_kwargs) response = await litellm.acompletion(**request_params, **self.litellm_kwargs) - print("TOKENS RECEIVED", response.usage.completion_tokens) - print("FULL RESPONSE", response) - print("FULL GENERATION", response.choices[0].message.content) - print( - "FULL REASONING CONTENT", getattr(response.choices[0].message, "reasoning_content", None) - ) if stream: result = self._stream_chat_chunks_async(response) else: diff --git a/nemo_skills/inference/model/tool_call.py b/nemo_skills/inference/model/tool_call.py index e912b2bb0a..00d6c2ca4d 100644 --- a/nemo_skills/inference/model/tool_call.py +++ b/nemo_skills/inference/model/tool_call.py @@ -146,13 +146,6 @@ async def generate_async( **generation_kwargs, ) - # # Recount tokens client-side for accuracy - # # (vLLM reports wrong completion_tokens when finish_reason='length') - # text_to_count = generation.get("generation", "") - # if generation.get("reasoning_content"): - # text_to_count = generation["reasoning_content"] + text_to_count - # generation["num_generated_tokens"] = len(self.model.tokenizer.encode(text_to_count)) - if isinstance(tokens_to_generate, int): tokens_to_generate -= generation["num_generated_tokens"] From dbea2cafafc39aa4282b21c20566c3156767f126 Mon Sep 17 00:00:00 2001 From: Igor Gitman Date: Wed, 25 Feb 2026 18:04:34 -0800 Subject: [PATCH 09/13] .get -> [] Signed-off-by: Igor Gitman --- nemo_skills/pipeline/utils/exp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo_skills/pipeline/utils/exp.py b/nemo_skills/pipeline/utils/exp.py index e0b9307b1b..728eda86ab 100644 
--- a/nemo_skills/pipeline/utils/exp.py +++ b/nemo_skills/pipeline/utils/exp.py @@ -332,7 +332,7 @@ def get_executor( job_details_class = CustomJobDetailsRay if with_ray else CustomJobDetails # Resolve account with fallback to cluster_config - account = account or cluster_config.get("account") + account = account or cluster_config["account"] # Build executor parameters as a dictionary to avoid duplicate parameters executor_params = { From a7cac9df8e751f035c9ccbd0424ad57eea049379 Mon Sep 17 00:00:00 2001 From: George Armstrong Date: Thu, 26 Feb 2026 09:36:18 -0800 Subject: [PATCH 10/13] fix: add timeouts to API-calling tests to prevent CI hangs Per-request inference timeout (120s) and pytest-level test timeout (300s) for test_eval_gsm8k_api and test_eval_judge_api. Prevents external API hangs from blocking CI for 8+ minutes. Signed-off-by: George Armstrong --- tests/test_generation.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/test_generation.py b/tests/test_generation.py index cc7be52be8..69adb5e921 100644 --- a/tests/test_generation.py +++ b/tests/test_generation.py @@ -24,6 +24,7 @@ from nemo_skills.pipeline.utils.scripts import ServerScript +@pytest.mark.timeout(300) def test_eval_gsm8k_api(tmp_path): cmd = ( f"ns eval " @@ -33,6 +34,7 @@ def test_eval_gsm8k_api(tmp_path): f" --benchmarks=gsm8k " f" --output_dir={tmp_path} " f" ++max_samples=2 " + f" ++inference.timeout=120 " ) subprocess.run(cmd, shell=True, check=True) @@ -51,6 +53,7 @@ def test_eval_gsm8k_api(tmp_path): assert metrics["symbolic_correct"] >= 80 +@pytest.mark.timeout(300) def test_eval_judge_api(tmp_path): cmd = ( f"ns eval " @@ -64,6 +67,7 @@ def test_eval_judge_api(tmp_path): f" --judge_server_type=openai " f" --judge_generation_type=math_judge " f" ++max_samples=2 " + f" ++inference.timeout=120 " ) subprocess.run(cmd, shell=True, check=True) From 5e7965ac9308ce7bc658ca1c43ca5b700fa92c60 Mon Sep 17 00:00:00 2001 From: George Armstrong Date: Thu, 26 Feb 2026 10:50:17 -0800 
Subject: [PATCH 11/13] fix: also pass inference timeout to judge subprocess in test The judge step in test_eval_judge_api runs as a separate nemo-run job and doesn't inherit ++inference.timeout from the main generation step. Pass it via --extra_judge_args to prevent judge hangs too. Signed-off-by: George Armstrong --- tests/test_generation.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_generation.py b/tests/test_generation.py index 69adb5e921..7f9dff18a6 100644 --- a/tests/test_generation.py +++ b/tests/test_generation.py @@ -66,6 +66,7 @@ def test_eval_judge_api(tmp_path): f" --judge_server_address=https://inference-api.nvidia.com/v1/ " f" --judge_server_type=openai " f" --judge_generation_type=math_judge " + f" --extra_judge_args='++inference.timeout=120' " f" ++max_samples=2 " f" ++inference.timeout=120 " ) From 4374543094ba18a7007502da0cf4c04c672d330f Mon Sep 17 00:00:00 2001 From: George Armstrong Date: Thu, 26 Feb 2026 10:57:18 -0800 Subject: [PATCH 12/13] fix: disable litellm retries in API tests to prevent timeout compounding MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Root cause: litellm max_retries=3 (default) compounds with inference.timeout — a single hanging request can take up to timeout * (max_retries + 1) = 120s * 4 = 480s, exceeding the 300s pytest timeout. Setting max_retries=0 ensures a timeout fails immediately without silent retries. 
Signed-off-by: George Armstrong --- tests/test_generation.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test_generation.py b/tests/test_generation.py index 7f9dff18a6..55cbd79383 100644 --- a/tests/test_generation.py +++ b/tests/test_generation.py @@ -35,6 +35,7 @@ def test_eval_gsm8k_api(tmp_path): f" --output_dir={tmp_path} " f" ++max_samples=2 " f" ++inference.timeout=120 " + f" ++server.max_retries=0 " ) subprocess.run(cmd, shell=True, check=True) @@ -66,9 +67,10 @@ def test_eval_judge_api(tmp_path): f" --judge_server_address=https://inference-api.nvidia.com/v1/ " f" --judge_server_type=openai " f" --judge_generation_type=math_judge " - f" --extra_judge_args='++inference.timeout=120' " + f" --extra_judge_args='++inference.timeout=120 ++server.max_retries=0' " f" ++max_samples=2 " f" ++inference.timeout=120 " + f" ++server.max_retries=0 " ) subprocess.run(cmd, shell=True, check=True) From 92600fb2b11d92e6e2a0824b5961a45e909bf29a Mon Sep 17 00:00:00 2001 From: George Armstrong Date: Fri, 27 Feb 2026 09:11:10 -0800 Subject: [PATCH 13/13] tst: update test parameters to limit retries and use a different model name Signed-off-by: George Armstrong --- tests/test_generation.py | 38 ++++++++++++++++++++++++++------------ 1 file changed, 26 insertions(+), 12 deletions(-) diff --git a/tests/test_generation.py b/tests/test_generation.py index 55cbd79383..77a73ca15d 100644 --- a/tests/test_generation.py +++ b/tests/test_generation.py @@ -29,13 +29,14 @@ def test_eval_gsm8k_api(tmp_path): cmd = ( f"ns eval " f" --server_type=openai " - f" --model=nvidia/nvidia/Nemotron-3-Nano-30B-A3B " + f" --model=nvidia/nvidia/nemotron-nano-30b-v3 " f" --server_address=https://inference-api.nvidia.com/v1/ " f" --benchmarks=gsm8k " f" --output_dir={tmp_path} " f" ++max_samples=2 " + f" ++max_concurrent_requests=1 " f" ++inference.timeout=120 " - f" ++server.max_retries=0 " + f" ++server.max_retries=1 " ) subprocess.run(cmd, shell=True, check=True) @@ -59,18 
+60,19 @@ def test_eval_judge_api(tmp_path): cmd = ( f"ns eval " f" --server_type=openai " - f" --model=nvidia/nvidia/Nemotron-3-Nano-30B-A3B " + f" --model=nvidia/nvidia/nemotron-nano-30b-v3 " f" --server_address=https://inference-api.nvidia.com/v1/ " f" --benchmarks=math-500 " f" --output_dir={tmp_path} " - f" --judge_model=nvidia/nvidia/Nemotron-3-Nano-30B-A3B " + f" --judge_model=nvidia/nvidia/nemotron-nano-30b-v3 " f" --judge_server_address=https://inference-api.nvidia.com/v1/ " f" --judge_server_type=openai " f" --judge_generation_type=math_judge " - f" --extra_judge_args='++inference.timeout=120 ++server.max_retries=0' " + f" --extra_judge_args='++max_concurrent_requests=1 ++inference.timeout=120 ++server.max_retries=1' " f" ++max_samples=2 " + f" ++max_concurrent_requests=1 " f" ++inference.timeout=120 " - f" ++server.max_retries=0 " + f" ++server.max_retries=1 " ) subprocess.run(cmd, shell=True, check=True) @@ -94,7 +96,7 @@ def test_fail_on_api_key_env_var(tmp_path): cmd = ( f"ns eval " f" --server_type=openai " - f" --model=nvidia/nvidia/Nemotron-3-Nano-30B-A3B " + f" --model=nvidia/nvidia/nemotron-nano-30b-v3 " f" --server_address=https://inference-api.nvidia.com/v1/ " f" --benchmarks=gsm8k " f" --output_dir={tmp_path} " @@ -109,17 +111,21 @@ def test_fail_on_api_key_env_var(tmp_path): ), result.stdout.decode() +@pytest.mark.timeout(300) def test_succeed_on_api_key_env_var(tmp_path): cmd = ( f"export MY_CUSTOM_KEY=$NVIDIA_API_KEY && " f"unset NVIDIA_API_KEY && " f"ns eval " f" --server_type=openai " - f" --model=nvidia/nvidia/Nemotron-3-Nano-30B-A3B " + f" --model=nvidia/nvidia/nemotron-nano-30b-v3 " f" --server_address=https://inference-api.nvidia.com/v1/ " f" --benchmarks=gsm8k " f" --output_dir={tmp_path} " f" ++max_samples=2 " + f" ++max_concurrent_requests=1 " + f" ++inference.timeout=120 " + f" ++server.max_retries=1 " f" ++server.api_key_env_var=MY_CUSTOM_KEY " ) subprocess.run(cmd, shell=True, check=True) @@ -139,16 +145,20 @@ def 
test_succeed_on_api_key_env_var(tmp_path): assert metrics["symbolic_correct"] >= 80 +@pytest.mark.timeout(300) @pytest.mark.parametrize("format", ["list", "dict"]) def test_generate_openai_format(tmp_path, format): cmd = ( f"ns generate " f" --server_type=openai " - f" --model=nvidia/nvidia/Nemotron-3-Nano-30B-A3B " + f" --model=nvidia/nvidia/nemotron-nano-30b-v3 " f" --server_address=https://inference-api.nvidia.com/v1/ " f" --input_file=/nemo_run/code/tests/data/openai-input-{format}.test " f" --output_dir={tmp_path} " f" ++prompt_format=openai " + f" ++max_concurrent_requests=1 " + f" ++inference.timeout=120 " + f" ++server.max_retries=1 " ) subprocess.run(cmd, shell=True, check=True) @@ -202,20 +212,24 @@ def test_server_metadata_from_num_tasks(tmp_path): assert groups[0].hardware.num_tasks == server_cmd.script.num_tasks +@pytest.mark.timeout(300) def test_judge_generations_with_structured_output(tmp_path): cmd = ( f"ns eval " f" --server_type=openai " - f" --model=nvidia/nvidia/Nemotron-3-Nano-30B-A3B " + f" --model=nvidia/nvidia/nemotron-nano-30b-v3 " f" --server_address=https://inference-api.nvidia.com/v1/ " f" --benchmarks=hle " f" --output_dir={tmp_path} " - f" --judge_model=nvidia/nvidia/Nemotron-3-Nano-30B-A3B " + f" --judge_model=nvidia/nvidia/nemotron-nano-30b-v3 " f" --judge_server_address=https://inference-api.nvidia.com/v1/ " f" --judge_server_type=openai " f" --metric_type=hle-aa " - f' --extra_judge_args="++structured_output=HLE_JUDGE_AA" ' + f' --extra_judge_args="++structured_output=HLE_JUDGE_AA ++max_concurrent_requests=1 ++inference.timeout=120 ++server.max_retries=1" ' f" ++max_samples=2 " + f" ++max_concurrent_requests=1 " + f" ++inference.timeout=120 " + f" ++server.max_retries=1 " f" ++inference.tokens_to_generate=1024 " # to make test go fast ) subprocess.run(cmd, shell=True, check=True)