diff --git a/nemo_skills/inference/model/tool_call.py b/nemo_skills/inference/model/tool_call.py
index 313b88b057..00d6c2ca4d 100644
--- a/nemo_skills/inference/model/tool_call.py
+++ b/nemo_skills/inference/model/tool_call.py
@@ -145,6 +145,7 @@ async def generate_async(
             endpoint_type=endpoint_type,
             **generation_kwargs,
         )
+
         if isinstance(tokens_to_generate, int):
             tokens_to_generate -= generation["num_generated_tokens"]
diff --git a/nemo_skills/pipeline/convert.py b/nemo_skills/pipeline/convert.py
index 97c7e52597..99ca445552 100644
--- a/nemo_skills/pipeline/convert.py
+++ b/nemo_skills/pipeline/convert.py
@@ -180,6 +180,8 @@ def convert(
     partition: str = typer.Option(
         None, help="Can specify if need interactive jobs or a specific non-default partition"
     ),
+    account: str = typer.Option(None, help="Can specify a non-default Slurm account"),
+    container: str = typer.Option(None, help="Override container image for the conversion job"),
     qos: str = typer.Option(None, help="Specify Slurm QoS, e.g. to request interactive nodes"),
     time_min: str = typer.Option(None, help="If specified, will use as a time-min slurm parameter"),
     mount_paths: str = typer.Option(None, help="Comma separated list of paths to mount on the remote machine"),
@@ -320,12 +322,13 @@ def convert(
         cmd=conversion_cmd,
         task_name=expname,
         log_dir=log_dir,
-        container=container_map[(convert_from, convert_to)],
+        container=container or container_map[(convert_from, convert_to)],
         num_gpus=num_gpus,
         num_nodes=1,  # always running on a single node, might need to change that in the future
         num_tasks=1,
         cluster_config=cluster_config,
         partition=partition,
+        account=account,
         run_after=run_after,
         reuse_code=reuse_code,
         reuse_code_exp=reuse_code_exp,
diff --git a/nemo_skills/pipeline/eval.py b/nemo_skills/pipeline/eval.py
index 75aa1ae70b..37e9c38095 100644
--- a/nemo_skills/pipeline/eval.py
+++ b/nemo_skills/pipeline/eval.py
@@ -55,6 +55,9 @@ def _create_llm_judge_tasks(
     cluster,
     config_dir,
     partition,
+    account,
+    main_container,
+    sandbox_container,
     with_sandbox,
     keep_mounts_for_sandbox,
     run_after,
@@ -95,6 +98,9 @@ def _create_llm_judge_tasks(
         cluster=cluster,
         config_dir=config_dir,
         partition=partition,
+        account=account,
+        main_container=main_container,
+        sandbox_container=sandbox_container,
         with_sandbox=with_sandbox,
         keep_mounts_for_sandbox=keep_mounts_for_sandbox,
         run_after=run_after,
@@ -184,6 +190,12 @@ def eval(
     server_container: str = typer.Option(
         None, help="Override container image for the hosted server (if server_gpus is set)"
     ),
+    main_container: str = typer.Option(None, help="Override container image for the main evaluation client"),
+    sandbox_container: str = typer.Option(None, help="Override container image for the sandbox"),
+    judge_container: str = typer.Option(None, help="Override container image for GPU-based judges (comet, nvembed)"),
+    judge_server_container: str = typer.Option(
+        None, help="Override container image for the hosted judge server (if judge_server_gpus is set)"
+    ),
     extra_judge_args: str = typer.Option(
         "", help="Additional arguments for judge (passed to generate script, so should start with ++)"
     ),
@@ -210,6 +222,7 @@ def eval(
         "Can provide a list directly when using through Python",
     ),
     partition: str = typer.Option(None, help="Cluster partition to use"),
+    account: str = typer.Option(None, help="Can specify a non-default Slurm account"),
     qos: str = typer.Option(None, help="Specify Slurm QoS, e.g. to request interactive nodes"),
     time_min: str = typer.Option(None, help="If specified, will use as a time-min slurm parameter"),
     mount_paths: str = typer.Option(None, help="Comma separated list of paths to mount on the remote machine"),
@@ -349,6 +362,7 @@ def eval(
         "server_nodes": judge_server_nodes,
         "server_args": judge_server_args,
         "server_entrypoint": judge_server_entrypoint,
+        "server_container": judge_server_container,
         "generation_type": judge_generation_type,
         "generation_module": judge_generation_module,
     }
@@ -430,14 +444,16 @@ def eval(
         cmd=pipeline_utils.wrap_python_path(cmd=combine_cmds(cmds, single_node_mode)),
         task_name=f"{expname}-{'-'.join(job_benchmarks)}",
         log_dir=log_dir,
-        container=cluster_config["containers"]["nemo-skills"],
+        container=main_container or cluster_config["containers"]["nemo-skills"],
         cluster_config=cluster_config,
         partition=partition,
+        account=account,
         server_config=job_server_config,
         with_sandbox=job_needs_sandbox or with_sandbox,
         keep_mounts_for_sandbox=job_needs_sandbox_to_keep_mounts or keep_mounts_for_sandbox,
         sandbox_port=None if get_random_port else 6000,
         sandbox_env_overrides=job_sandbox_env_overrides,
+        sandbox_container=sandbox_container,
         run_after=run_after,
         reuse_code_exp=reuse_code_exp,
         reuse_code=reuse_code,
@@ -508,6 +524,8 @@ def eval(
         judge_server_gpus=judge_server_gpus,
         judge_server_nodes=judge_server_nodes,
         partition=partition,
+        account=account,
+        judge_container=judge_container,
         run_after=run_after,
         reuse_code_exp=reuse_code_exp,
         reuse_code=reuse_code,
@@ -535,6 +553,9 @@ def eval(
         cluster=cluster,
         config_dir=config_dir,
         partition=partition,
+        account=account,
+        main_container=main_container,
+        sandbox_container=sandbox_container,
         with_sandbox=with_sandbox,
         keep_mounts_for_sandbox=keep_mounts_for_sandbox,
         run_after=run_after,
diff --git a/nemo_skills/pipeline/generate.py b/nemo_skills/pipeline/generate.py
index a1b96f556a..187a563cc7 100644
--- a/nemo_skills/pipeline/generate.py
+++ b/nemo_skills/pipeline/generate.py
@@ -55,11 +55,14 @@ def _create_job_unified(
     installation_command: Optional[str],
     with_sandbox: bool,
     partition: Optional[str],
+    account: Optional[str],
     keep_mounts_for_sandbox: bool,
     task_name: str,
     log_dir: str,
     sbatch_kwargs: Optional[Dict] = None,
     sandbox_env_overrides: Optional[List[str]] = None,
+    main_container: Optional[str] = None,
+    sandbox_container: Optional[str] = None,
 ) -> List[CommandGroup]:
     """
     Create CommandGroups for n models (unified for n=1 and n>1).
@@ -147,7 +150,7 @@ def _create_job_unified(
         sandbox_cmd = Command(
             script=sandbox_script,
-            container=cluster_config["containers"]["sandbox"],
+            container=sandbox_container or cluster_config["containers"]["sandbox"],
             name=f"{task_name}_sandbox",
         )
         components.append(sandbox_cmd)
 
@@ -178,7 +181,7 @@ def _create_job_unified(
         client_cmd = Command(
             script=client_script,
-            container=cluster_config["containers"]["nemo-skills"],
+            container=main_container or cluster_config["containers"]["nemo-skills"],
             name=f"{task_name}",
         )
         components.append(client_cmd)
 
@@ -191,6 +194,7 @@ def _create_job_unified(
         commands=components,
         hardware=HardwareConfig(
             partition=partition,
+            account=account,
             num_gpus=group_gpus,
             num_nodes=group_nodes,
             num_tasks=group_tasks,
@@ -272,6 +276,8 @@ def generate(
         help="Container image(s). CLI: space-separated. Python API: string or list. "
         "Single value broadcasts to all models.",
     ),
+    main_container: str = typer.Option(None, help="Override container image for the main generation client"),
+    sandbox_container: str = typer.Option(None, help="Override container image for the sandbox"),
     dependent_jobs: int = typer.Option(0, help="Specify this to launch that number of dependent jobs"),
     mount_paths: str = typer.Option(None, help="Comma separated list of paths to mount on the remote machine"),
     num_random_seeds: int = typer.Option(
@@ -297,6 +303,7 @@ def generate(
     partition: str = typer.Option(
         None, help="Can specify if need interactive jobs or a specific non-default partition"
     ),
+    account: str = typer.Option(None, help="Can specify a non-default Slurm account"),
     qos: str = typer.Option(None, help="Specify Slurm QoS, e.g. to request interactive nodes"),
     time_min: str = typer.Option(None, help="If specified, will use as a time-min slurm parameter"),
     run_after: List[str] = typer.Option(
@@ -589,11 +596,14 @@ def convert_server_type_to_string(server_type):
         installation_command=installation_command,
         with_sandbox=with_sandbox,
         partition=partition,
+        account=account,
         keep_mounts_for_sandbox=keep_mounts_for_sandbox,
         task_name=task_name,
         log_dir=log_dir,
         sbatch_kwargs=sbatch_kwargs,
         sandbox_env_overrides=sandbox_env_overrides,
+        main_container=main_container,
+        sandbox_container=sandbox_container,
     )
 
     # Use unique internal job name for dependency tracking, but same task_name
diff --git a/nemo_skills/pipeline/nemo_evaluator.py b/nemo_skills/pipeline/nemo_evaluator.py
index 39838737ed..f3e9057c0e 100644
--- a/nemo_skills/pipeline/nemo_evaluator.py
+++ b/nemo_skills/pipeline/nemo_evaluator.py
@@ -125,6 +125,7 @@ def nemo_evaluator(
     job_gpus: int = typer.Option(0, help="GPUs to allocate for the evaluator client when no servers are hosted"),
     job_nodes: int = typer.Option(1, help="Nodes to allocate for the evaluator job"),
     partition: str = typer.Option(None, help="Cluster partition to use"),
+    account: str = typer.Option(None, help="Can specify a non-default Slurm account"),
     qos: str = typer.Option(None, help="Slurm QoS"),
     mount_paths: str = typer.Option(None, help="Comma separated list of paths to mount on the remote machine"),
     log_dir: str = typer.Option(None, help="Custom location for logs"),
@@ -325,6 +326,7 @@ def nemo_evaluator(
         job_nodes=job_nodes,
         cluster_config=cluster_config,
         partition=partition,
+        account=account,
         qos=qos,
         exclusive=exclusive,
     )
@@ -346,6 +348,7 @@ def nemo_evaluator(
             commands=[main_server_cmd, client_cmd],
             hardware=_hardware_for_group(
                 task_ctx.partition,
+                task_ctx.account,
                 task_ctx.server_gpus or None,
                 task_ctx.server_nodes or 1,
                 task_ctx.qos,
@@ -358,6 +361,7 @@ def nemo_evaluator(
             commands=[judge_server_cmd],
             hardware=_hardware_for_group(
                 task_ctx.partition,
+                task_ctx.account,
                 task_ctx.judge_server_gpus or None,
                 task_ctx.judge_server_nodes or 1,
                 task_ctx.qos,
@@ -393,7 +397,12 @@ def nemo_evaluator(
         CommandGroup(
             commands=sg_cmds,
             hardware=_hardware_for_group(
-                task_ctx.partition, group_num_gpus, group_num_nodes, task_ctx.qos, task_ctx.exclusive
+                task_ctx.partition,
+                task_ctx.account,
+                group_num_gpus,
+                group_num_nodes,
+                task_ctx.qos,
+                task_ctx.exclusive,
             ),
             name=f"{task_ctx.expname}-{task_ctx.idx}",
             log_dir=log_dir,
@@ -543,17 +552,24 @@ class _TaskCreationContext:
     job_nodes: int
     cluster_config: Dict
     partition: Optional[str]
+    account: Optional[str]
     qos: Optional[str]
     exclusive: bool
 
 
 def _hardware_for_group(
-    partition: Optional[str], num_gpus: Optional[int], num_nodes: int, qos: Optional[str], exclusive: bool
+    partition: Optional[str],
+    account: Optional[str],
+    num_gpus: Optional[int],
+    num_nodes: int,
+    qos: Optional[str],
+    exclusive: bool,
 ) -> HardwareConfig:
     """Create HardwareConfig for a CommandGroup.
 
     Args:
         partition: SLURM partition name
+        account: SLURM account name
         num_gpus: Number of GPUs (None means no GPU allocation)
         num_nodes: Number of nodes
         qos: SLURM QoS setting
@@ -564,6 +580,7 @@ def _hardware_for_group(
     """
     return HardwareConfig(
         partition=partition,
+        account=account,
         num_gpus=num_gpus,
         num_nodes=num_nodes,
         sbatch_kwargs={
diff --git a/nemo_skills/pipeline/run_cmd.py b/nemo_skills/pipeline/run_cmd.py
index 5706d516e0..2628302234 100644
--- a/nemo_skills/pipeline/run_cmd.py
+++ b/nemo_skills/pipeline/run_cmd.py
@@ -58,6 +58,7 @@ def run_cmd(
     partition: str = typer.Option(
         None, help="Can specify if need interactive jobs or a specific non-default partition"
     ),
+    account: str = typer.Option(None, help="Can specify a non-default Slurm account"),
     qos: str = typer.Option(None, help="Specify Slurm QoS, e.g. to request interactive nodes"),
     time_min: str = typer.Option(None, help="If specified, will use as a time-min slurm parameter"),
     num_gpus: int | None = typer.Option(None, help="Number of GPUs per node to use"),
@@ -77,6 +78,7 @@ def run_cmd(
     server_container: str = typer.Option(
         None, help="Override container image for the hosted server (if server_gpus is set)"
     ),
+    sandbox_container: str = typer.Option(None, help="Override container image for the sandbox"),
     dependent_jobs: int = typer.Option(0, help="Specify this to launch that number of dependent jobs"),
     mount_paths: str = typer.Option(None, help="Comma separated list of paths to mount on the remote machine"),
     run_after: List[str] = typer.Option(
@@ -197,10 +199,12 @@ def run_cmd(
         container=containers,
         cluster_config=cluster_config,
         partition=partition,
+        account=account,
         server_config=server_config,
         with_sandbox=with_sandbox,
         keep_mounts_for_sandbox=keep_mounts_for_sandbox,
         sandbox_port=None if get_random_port else 6000,
+        sandbox_container=sandbox_container,
         run_after=run_after,
         reuse_code=reuse_code,
         reuse_code_exp=reuse_code_exp,
diff --git a/nemo_skills/pipeline/start_server.py b/nemo_skills/pipeline/start_server.py
index 571cfad433..195278e099 100644
--- a/nemo_skills/pipeline/start_server.py
+++ b/nemo_skills/pipeline/start_server.py
@@ -125,10 +125,13 @@ def launch_server(
     tail_logs=False,
     cmd="",
     partition=None,
+    account=None,
     with_sandbox=False,
     keep_mounts_for_sandbox=False,
     server_port=None,
     sandbox_port=None,
+    main_container=None,
+    sandbox_container=None,
     sbatch_kwargs=None,
 ):
     """Launch a model server in the background.
@@ -174,13 +177,15 @@ def launch_server(
         cmd=cmd,
         task_name="server",
         log_dir=log_dir,
-        container=cluster_config["containers"]["nemo-skills"],
+        container=main_container or cluster_config["containers"]["nemo-skills"],
         cluster_config=cluster_config,
         partition=partition,
+        account=account,
         server_config=server_config,
         with_sandbox=with_sandbox,
         keep_mounts_for_sandbox=keep_mounts_for_sandbox,
         sandbox_port=sandbox_port,
+        sandbox_container=sandbox_container,
         sbatch_kwargs=sbatch_kwargs,
     )
     exp.run(detach=True, tail_logs=tail_logs)
@@ -213,7 +218,10 @@ def start_server(
         "If not specified, will use the default entrypoint for the server type.",
     ),
     server_container: str = typer.Option(None, help="Override container image for the hosted server"),
+    main_container: str = typer.Option(None, help="Override container image for the main task (e.g., chat interface)"),
+    sandbox_container: str = typer.Option(None, help="Override container image for the sandbox"),
     partition: str = typer.Option(None, help="Cluster partition to use"),
+    account: str = typer.Option(None, help="Can specify a non-default Slurm account"),
     qos: str = typer.Option(None, help="Specify Slurm QoS, e.g. to request interactive nodes"),
     time_min: str = typer.Option(None, help="If specified, will use as a time-min slurm parameter"),
     mount_paths: str = typer.Option(None, help="Comma separated list of paths to mount on the remote machine"),
@@ -274,10 +282,13 @@ def start_server(
         tail_logs=True,
         cmd=cmd,
         partition=partition,
+        account=account,
         with_sandbox=with_sandbox,
         keep_mounts_for_sandbox=keep_mounts_for_sandbox,
         server_port=server_port,
         sandbox_port=sandbox_port,
+        main_container=main_container,
+        sandbox_container=sandbox_container,
         sbatch_kwargs=parse_kwargs(sbatch_kwargs, exclusive=exclusive, qos=qos, time_min=time_min),
     )
 
diff --git a/nemo_skills/pipeline/utils/declarative.py b/nemo_skills/pipeline/utils/declarative.py
index 7029dcc638..f47067ee99 100644
--- a/nemo_skills/pipeline/utils/declarative.py
+++ b/nemo_skills/pipeline/utils/declarative.py
@@ -264,6 +264,7 @@ class HardwareConfig:
     """Hardware configuration for a group of tasks."""
 
     partition: Optional[str] = None
+    account: Optional[str] = None
     num_gpus: Optional[int] = None
     num_nodes: Optional[int] = None
     num_tasks: Optional[int] = 1
@@ -585,6 +586,7 @@ def _create_executor(
         log_dir=log_dir,
         log_prefix=exec_config["log_prefix"],
         partition=hardware.partition if hardware else None,
+        account=hardware.account if hardware else None,
         heterogeneous=heterogeneous,
         het_group=het_group,
         total_het_groups=total_het_groups,
diff --git a/nemo_skills/pipeline/utils/exp.py b/nemo_skills/pipeline/utils/exp.py
index 0f9ec7f123..728eda86ab 100644
--- a/nemo_skills/pipeline/utils/exp.py
+++ b/nemo_skills/pipeline/utils/exp.py
@@ -169,6 +169,7 @@ def get_executor(
     log_prefix: str = "main",
     mounts=None,
     partition=None,
+    account=None,
     dependencies=None,
     extra_package_dirs: tuple[str] | None = None,
     heterogeneous=False,
@@ -211,6 +212,7 @@ def get_executor(
             taken from `cluster_config`.
         partition: SLURM partition override. If omitted, inferred from
             `gpus_per_node` and `cluster_config`.
+        account: SLURM account override. If omitted, uses `cluster_config["account"]`.
         dependencies: SLURM job handles to depend on. The dependency type is
             taken from `cluster_config['dependency_type']` (default: "afterany").
         extra_package_dirs: Additional directories to package with the code for remote
@@ -329,9 +331,12 @@ def get_executor(
     dependency_type = cluster_config.get("dependency_type", "afterany")
     job_details_class = CustomJobDetailsRay if with_ray else CustomJobDetails
 
+    # Resolve account with fallback to cluster_config
+    account = account or cluster_config["account"]
+
     # Build executor parameters as a dictionary to avoid duplicate parameters
     executor_params = {
-        "account": cluster_config["account"],
+        "account": account,
         "partition": partition,
         "nodes": num_nodes,
         "ntasks_per_node": tasks_per_node,
@@ -430,7 +435,9 @@ def add_task(
     num_nodes=1,
     log_dir=None,
     partition=None,
+    account=None,
     with_sandbox=False,
+    sandbox_container=None,
     keep_mounts_for_sandbox=False,
     sandbox_port: int | None = None,
     server_config=None,
@@ -541,6 +548,7 @@ def add_task(
             tasks_per_node=num_server_tasks,
             gpus_per_node=server_config["num_gpus"],
             partition=partition,
+            account=account,
             dependencies=dependencies,
             job_name=task_name,
             log_dir=log_dir,
@@ -585,6 +593,7 @@ def add_task(
             tasks_per_node=cur_tasks,
             gpus_per_node=num_gpus if server_config is None else 0,
             partition=partition,
+            account=account,
             dependencies=dependencies,
             job_name=task_name,
             log_dir=log_dir,
@@ -624,11 +633,12 @@ def add_task(
         commands.append(get_sandbox_command(cluster_config))
         sandbox_executor = get_executor(
             cluster_config=cluster_config,
-            container=cluster_config["containers"]["sandbox"],
+            container=sandbox_container or cluster_config["containers"]["sandbox"],
             num_nodes=executors[0].nodes if cluster_config["executor"] == "slurm" else 1,
             tasks_per_node=1,
             gpus_per_node=0,
             partition=partition,
+            account=account,
             mounts=None if keep_mounts_for_sandbox else [],
             dependencies=dependencies,
             job_name=task_name,
diff --git a/tests/test_generation.py b/tests/test_generation.py
index 440a1f1446..77a73ca15d 100644
--- a/tests/test_generation.py
+++ b/tests/test_generation.py
@@ -24,15 +24,19 @@
 from nemo_skills.pipeline.utils.scripts import ServerScript
 
 
+@pytest.mark.timeout(300)
 def test_eval_gsm8k_api(tmp_path):
     cmd = (
         f"ns eval "
         f" --server_type=openai "
-        f" --model=nvidia/nvidia/Nemotron-3-Nano-30B-A3B "
+        f" --model=nvidia/nvidia/nemotron-nano-30b-v3 "
         f" --server_address=https://inference-api.nvidia.com/v1/ "
         f" --benchmarks=gsm8k "
         f" --output_dir={tmp_path} "
         f" ++max_samples=2 "
+        f" ++max_concurrent_requests=1 "
+        f" ++inference.timeout=120 "
+        f" ++server.max_retries=1 "
     )
     subprocess.run(cmd, shell=True, check=True)
 
@@ -51,19 +55,24 @@ def test_eval_gsm8k_api(tmp_path):
     assert metrics["symbolic_correct"] >= 80
 
 
+@pytest.mark.timeout(300)
 def test_eval_judge_api(tmp_path):
     cmd = (
         f"ns eval "
         f" --server_type=openai "
-        f" --model=nvidia/nvidia/Nemotron-3-Nano-30B-A3B "
+        f" --model=nvidia/nvidia/nemotron-nano-30b-v3 "
         f" --server_address=https://inference-api.nvidia.com/v1/ "
         f" --benchmarks=math-500 "
         f" --output_dir={tmp_path} "
-        f" --judge_model=nvidia/nvidia/Nemotron-3-Nano-30B-A3B "
+        f" --judge_model=nvidia/nvidia/nemotron-nano-30b-v3 "
         f" --judge_server_address=https://inference-api.nvidia.com/v1/ "
         f" --judge_server_type=openai "
         f" --judge_generation_type=math_judge "
+        f" --extra_judge_args='++max_concurrent_requests=1 ++inference.timeout=120 ++server.max_retries=1' "
         f" ++max_samples=2 "
+        f" ++max_concurrent_requests=1 "
+        f" ++inference.timeout=120 "
+        f" ++server.max_retries=1 "
     )
     subprocess.run(cmd, shell=True, check=True)
 
@@ -87,7 +96,7 @@ def test_fail_on_api_key_env_var(tmp_path):
     cmd = (
         f"ns eval "
         f" --server_type=openai "
-        f" --model=nvidia/nvidia/Nemotron-3-Nano-30B-A3B "
+        f" --model=nvidia/nvidia/nemotron-nano-30b-v3 "
         f" --server_address=https://inference-api.nvidia.com/v1/ "
         f" --benchmarks=gsm8k "
         f" --output_dir={tmp_path} "
@@ -102,17 +111,21 @@ def test_fail_on_api_key_env_var(tmp_path):
     ), result.stdout.decode()
 
 
+@pytest.mark.timeout(300)
 def test_succeed_on_api_key_env_var(tmp_path):
     cmd = (
         f"export MY_CUSTOM_KEY=$NVIDIA_API_KEY && "
         f"unset NVIDIA_API_KEY && "
         f"ns eval "
         f" --server_type=openai "
-        f" --model=nvidia/nvidia/Nemotron-3-Nano-30B-A3B "
+        f" --model=nvidia/nvidia/nemotron-nano-30b-v3 "
         f" --server_address=https://inference-api.nvidia.com/v1/ "
         f" --benchmarks=gsm8k "
         f" --output_dir={tmp_path} "
         f" ++max_samples=2 "
+        f" ++max_concurrent_requests=1 "
+        f" ++inference.timeout=120 "
+        f" ++server.max_retries=1 "
         f" ++server.api_key_env_var=MY_CUSTOM_KEY "
     )
     subprocess.run(cmd, shell=True, check=True)
 
@@ -132,16 +145,20 @@ def test_succeed_on_api_key_env_var(tmp_path):
     assert metrics["symbolic_correct"] >= 80
 
 
+@pytest.mark.timeout(300)
 @pytest.mark.parametrize("format", ["list", "dict"])
 def test_generate_openai_format(tmp_path, format):
     cmd = (
         f"ns generate "
         f" --server_type=openai "
-        f" --model=nvidia/nvidia/Nemotron-3-Nano-30B-A3B "
+        f" --model=nvidia/nvidia/nemotron-nano-30b-v3 "
         f" --server_address=https://inference-api.nvidia.com/v1/ "
         f" --input_file=/nemo_run/code/tests/data/openai-input-{format}.test "
         f" --output_dir={tmp_path} "
         f" ++prompt_format=openai "
+        f" ++max_concurrent_requests=1 "
+        f" ++inference.timeout=120 "
+        f" ++server.max_retries=1 "
     )
     subprocess.run(cmd, shell=True, check=True)
 
@@ -181,6 +198,7 @@ def test_server_metadata_from_num_tasks(tmp_path):
         installation_command=None,
         with_sandbox=False,
         partition=None,
+        account=None,
         keep_mounts_for_sandbox=False,
         task_name="test-task",
         log_dir="/tmp/logs",
@@ -194,20 +212,24 @@ def test_server_metadata_from_num_tasks(tmp_path):
     assert groups[0].hardware.num_tasks == server_cmd.script.num_tasks
 
 
+@pytest.mark.timeout(300)
 def test_judge_generations_with_structured_output(tmp_path):
     cmd = (
         f"ns eval "
         f" --server_type=openai "
-        f" --model=nvidia/nvidia/Nemotron-3-Nano-30B-A3B "
+        f" --model=nvidia/nvidia/nemotron-nano-30b-v3 "
         f" --server_address=https://inference-api.nvidia.com/v1/ "
         f" --benchmarks=hle "
         f" --output_dir={tmp_path} "
-        f" --judge_model=nvidia/nvidia/Nemotron-3-Nano-30B-A3B "
+        f" --judge_model=nvidia/nvidia/nemotron-nano-30b-v3 "
        f" --judge_server_address=https://inference-api.nvidia.com/v1/ "
         f" --judge_server_type=openai "
         f" --metric_type=hle-aa "
-        f' --extra_judge_args="++structured_output=HLE_JUDGE_AA" '
+        f' --extra_judge_args="++structured_output=HLE_JUDGE_AA ++max_concurrent_requests=1 ++inference.timeout=120 ++server.max_retries=1" '
         f" ++max_samples=2 "
+        f" ++max_concurrent_requests=1 "
+        f" ++inference.timeout=120 "
+        f" ++server.max_retries=1 "
         f" ++inference.tokens_to_generate=1024 "  # to make test go fast
     )
     subprocess.run(cmd, shell=True, check=True)
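
A hypothetical invocation exercising the new overrides is sketched below. The account name, endpoint, and image references are illustrative placeholders, not values from this diff; only the flag names themselves (--account, --main_container, --sandbox_container) are introduced by this change.

# Minimal usage sketch: per-run Slurm account and container overrides.
# Omitted flags keep the existing behavior: account falls back to
# cluster_config["account"], and the client/sandbox images fall back to
# cluster_config["containers"]["nemo-skills"] / ["sandbox"].
ns eval \
    --server_type=openai \
    --model=my-model \
    --server_address=https://my-endpoint/v1/ \
    --benchmarks=gsm8k \
    --output_dir=/workspace/results \
    --account=my_slurm_account \
    --main_container=my-registry/nemo-skills:custom \
    --sandbox_container=my-registry/sandbox:custom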