From 69c02f20a3f38f72e44c0d60f47993db4793c20b Mon Sep 17 00:00:00 2001 From: George Armstrong Date: Mon, 29 Sep 2025 13:43:17 -0700 Subject: [PATCH 01/29] ENH refactor pipeline to be extensible Signed-off-by: George Armstrong WIP add more declarative syntax Signed-off-by: George Armstrong add crossgroup sandbox Signed-off-by: George Armstrong WIP make cross group built in Signed-off-by: George Armstrong add run_cmd Signed-off-by: George Armstrong small fixes Signed-off-by: George Armstrong WIP add variable definition and getting utilities Signed-off-by: George Armstrong get het group working again Signed-off-by: George Armstrong MAINT add simple example WIP fixing hetgroup -- better but host is broken Signed-off-by: George Armstrong multi hetgroup example Signed-off-by: George Armstrong cleanup Signed-off-by: George Armstrong FIX het group cross references Signed-off-by: George Armstrong MAINT simplify declarative Signed-off-by: George Armstrong Maint remove stuff MAINT more simplification and improvements Signed-off-by: George Armstrong MAINT make dependencies work Signed-off-by: George Armstrong make example simplistic but functional Signed-off-by: George Armstrong MAINT remove v2 examples WIP working send to remote Signed-off-by: George Armstrong add commands Signed-off-by: George Armstrong MAINT renaming and additional docs Signed-off-by: George Armstrong MAINT remove examples MAINT remove unused files MAINT fix the dep suffix Signed-off-by: George Armstrong FIXED timing issue on creation Signed-off-by: George Armstrong resolve more differences Signed-off-by: George Armstrong MAINT revert generate.py to refactored? Signed-off-by: George Armstrong MAINT respect log dir Signed-off-by: George Armstrong MAINT cleanup and FIX run_after Signed-off-by: George Armstrong MAINT rename stage back to pipeline MAINT rename stage back to pipeline FIX log dir and output dir setting Signed-off-by: George Armstrong MAINT simplify declarative Signed-off-by: George Armstrong MAINT remvoe extra abstraction leak move imports to top MAINT support using single group job if groups is only one item Signed-off-by: George Armstrong simplify unneeded path Signed-off-by: George Armstrong remove special sandbox handling refactor pipeline Signed-off-by: George Armstrong fix pipeline Signed-off-by: George Armstrong remove extra generate MAINT move import Signed-off-by: George Armstrong --- nemo_skills/pipeline/generate.py | 267 ++++++-- nemo_skills/pipeline/utils/commands.py | 144 +++++ nemo_skills/pipeline/utils/declarative.py | 738 ++++++++++++++++++++++ 3 files changed, 1106 insertions(+), 43 deletions(-) create mode 100644 nemo_skills/pipeline/utils/commands.py create mode 100644 nemo_skills/pipeline/utils/declarative.py diff --git a/nemo_skills/pipeline/generate.py b/nemo_skills/pipeline/generate.py index 5543c9c6c8..626fe62fb1 100644 --- a/nemo_skills/pipeline/generate.py +++ b/nemo_skills/pipeline/generate.py @@ -14,7 +14,7 @@ import importlib import logging import os -from typing import List +from typing import Callable, Dict, List, Optional import typer @@ -22,6 +22,8 @@ from nemo_skills.dataset.utils import import_from_path from nemo_skills.inference import GENERATION_MODULE_MAP, GenerationType from nemo_skills.pipeline.app import app, typer_unpacker +from nemo_skills.pipeline.utils.commands import sandbox_command +from nemo_skills.pipeline.utils.declarative import Command, CommandGroup, HardwareConfig, Pipeline from nemo_skills.utils import ( compute_chunk_ids, get_logger_name, @@ -35,6 +37,113 @@ # TODO: add num_jobs here for consistency with eval? +def _create_commandgroup_from_config( + generation_cmd: str, + server_config: Optional[Dict], + with_sandbox: bool, + sandbox_port: Optional[int], + cluster_config: Dict, + installation_command: Optional[str], + get_server_command_fn: Callable, + partition: Optional[str], + time_min: Optional[str], + exclusive: bool, + task_name: str, + log_dir: str, +) -> CommandGroup: + """Create a CommandGroup from server_config, matching add_task logic. + + This function replicates the component ordering from add_task: + 1. Server (if server_config provided) + 2. Main client command + 3. Sandbox (if with_sandbox=True) + """ + + components = [] + + # 1. Add server if server_config is provided (matches add_task lines 433-463) + if server_config is not None and int(server_config["num_gpus"]) > 0: + # Extract server container (matches line 435) + server_type = server_config["server_type"] + server_container = server_config.pop("container", cluster_config["containers"][server_type]) + + # Use the EXISTING get_server_command function via lambda + # This ensures we use the same server command construction as add_task + server_config_copy = server_config.copy() + + def make_server_cmd(cfg): + cmd, num_tasks = get_server_command_fn(**server_config_copy, cluster_config=cfg) + # Include log_prefix in metadata to match old add_task behavior + return ( + cmd, + { + "num_tasks": num_tasks, + "gpus": server_config_copy["num_gpus"], + "nodes": server_config_copy["num_nodes"], + "log_prefix": "server", # Explicitly set log prefix + }, + ) + + server_cmd = Command( + command=make_server_cmd, + container=server_container, + gpus=server_config["num_gpus"], + nodes=server_config["num_nodes"], + name=task_name, # Use base task_name, not with _server suffix + ) + components.append(server_cmd) + + # 2. Add main generation command (matches add_task lines 466-580) + client_cmd = Command( + command=generation_cmd, # Already built with get_generation_cmd! + container=cluster_config["containers"]["nemo-skills"], + name=task_name, # Use base task_name, not with _client suffix + installation_command=installation_command, + metadata={"log_prefix": "main"}, # Set log prefix to match old add_task + ) + components.append(client_cmd) + + # 3. Add sandbox if requested (matches add_task lines 527-565) + if with_sandbox: + # Use existing sandbox_command builder + # Wrap to ensure log_prefix is set in returned metadata + def make_sandbox_cmd(cfg): + # sandbox_command returns (callable, metadata), so we need to call the callable + cmd_builder, initial_metadata = sandbox_command(port=sandbox_port) + # Call the builder to get the actual command string + cmd_string, runtime_metadata = cmd_builder(cfg) + # Merge metadata + metadata = initial_metadata.copy() + metadata.update(runtime_metadata) + metadata["log_prefix"] = "sandbox" # Explicitly set log prefix + return (cmd_string, metadata) + + sandbox_cmd = Command( + command=make_sandbox_cmd, + container=cluster_config["containers"]["sandbox"], + name=task_name, # Use base task_name, not with _sandbox suffix + ) + components.append(sandbox_cmd) + + # Find MAXIMUM GPUs needed by any component for the HardwareConfig + # This is critical for multi-component jobs - the job-level request must be the max + max_gpus = max((comp.gpus or 0) for comp in components) + max_nodes = max((comp.nodes or 1) for comp in components) + + return CommandGroup( + commands=components, + hardware=HardwareConfig( + partition=partition, + time_min=time_min, + exclusive=exclusive, + num_gpus=max_gpus, + num_nodes=max_nodes, + ), + name=task_name, + log_dir=log_dir, + ) + + @app.command(context_settings={"allow_extra_args": True, "ignore_unknown_options": True}) @typer_unpacker def generate( @@ -257,46 +366,101 @@ def generate( chunk_ids=chunk_ids, rerun_done=rerun_done, ) - has_tasks = False - all_tasks = [] + if _task_dependencies is None: _task_dependencies = [] - with pipeline_utils.get_exp(expname, cluster_config, _reuse_exp) as exp: - for seed_idx, (seed, chunk_ids) in enumerate(remaining_jobs.items()): - if wandb_parameters: - # no need for chunks as it will run after merging - wandb_parameters["samples_file"] = pipeline_utils.get_chunked_rs_filename( - output_dir, - random_seed=seed, - chunk_id=None, - ) - for chunk_id in chunk_ids: - has_tasks = True - server_config, server_address, extra_arguments = pipeline_utils.configure_client( - model=model, - server_type=server_type, - server_address=original_server_address, - server_gpus=server_gpus, - server_nodes=server_nodes, - server_args=server_args, - server_entrypoint=server_entrypoint, - server_container=server_container, - extra_arguments=extra_arguments_original, - get_random_port=get_random_port, + + # Build jobs list using declarative interface + jobs = [] + all_job_names = [] + + for seed_idx, (seed, chunk_ids) in enumerate(remaining_jobs.items()): + if wandb_parameters: + # no need for chunks as it will run after merging + wandb_parameters["samples_file"] = pipeline_utils.get_chunked_rs_filename( + output_dir, + random_seed=seed, + chunk_id=None, + ) + for chunk_id in chunk_ids: + # Configure client (same as before) + server_config, server_address, extra_arguments = pipeline_utils.configure_client( + model=model, + server_type=server_type, + server_address=original_server_address, + server_gpus=server_gpus, + server_nodes=server_nodes, + server_args=server_args, + server_entrypoint=server_entrypoint, + server_container=server_container, + extra_arguments=extra_arguments_original, + get_random_port=get_random_port, + ) + + # Build generation command (same as before) + cmd = pipeline_utils.get_generation_cmd( + input_file=input_file, + input_dir=input_dir, + random_seed=seed, + output_dir=output_dir, + extra_arguments=extra_arguments, + eval_args=eval_args, + chunk_id=chunk_id, + num_chunks=num_chunks, + preprocess_cmd=preprocess_cmd, + postprocess_cmd=postprocess_cmd, + wandb_parameters=wandb_parameters if seed_idx == 0 else None, + script=generation_module, + ) + cmd = pipeline_utils.wrap_python_path(cmd=cmd) + + # Base task name (same for all dependent jobs in chain - matches original!) + task_name = f"{expname}-rs{seed}" if seed is not None else expname + if chunk_id is not None: + task_name += f"-chunk{chunk_id}" + + # Handle dependent_jobs chain (matching add_task behavior) + # Note: run_after will be handled by Stage for jobs that don't have task_dependencies + dependencies = _task_dependencies.copy() if _task_dependencies else [] + + for dep_idx in range(dependent_jobs + 1): + # Create CommandGroup for this task + cmd_group = _create_commandgroup_from_config( + generation_cmd=cmd, + server_config=server_config.copy() if server_config else None, + with_sandbox=with_sandbox, + sandbox_port=None if get_random_port else 6000, + cluster_config=cluster_config, + installation_command=installation_command, + get_server_command_fn=generation_task.get_server_command_fn(), + partition=partition, + time_min=time_min, + exclusive=exclusive, + task_name=task_name, + log_dir=log_dir, ) - cmd = pipeline_utils.get_generation_cmd( - input_file=input_file, - input_dir=input_dir, - random_seed=seed, - output_dir=output_dir, - extra_arguments=extra_arguments, - eval_args=eval_args, - chunk_id=chunk_id, - num_chunks=num_chunks, - preprocess_cmd=preprocess_cmd, - postprocess_cmd=postprocess_cmd, - wandb_parameters=wandb_parameters if seed_idx == 0 else None, - script=generation_module, + + # Use unique internal job name for dependency tracking, but same task_name + internal_job_name = f"{task_name}-dep{dep_idx}" if dep_idx > 0 else task_name + + # Build dependencies: first job gets _task_dependencies+run_after, rest get previous in chain + if dep_idx == 0: + # First job: add run_after if no task_dependencies (matching add_task logic) + job_deps = dependencies.copy() if dependencies else [] + if not dependencies and run_after: + run_after_list = run_after if isinstance(run_after, list) else [run_after] + job_deps.extend(run_after_list) + job_deps = job_deps if job_deps else None + else: + # Subsequent jobs in chain depend on previous + job_deps = [f"{task_name}-dep{dep_idx - 1}"] + + jobs.append( + { + "name": internal_job_name, + "group": cmd_group, + "dependencies": job_deps, + } ) prev_tasks = _task_dependencies for _ in range(dependent_jobs + 1): @@ -333,13 +497,30 @@ def generate( if has_tasks and not _reuse_exp: # if we are reusing an experiment, the tasks will run from there pipeline_utils.run_exp(exp, cluster_config, dry_run=dry_run) - if _reuse_exp: - return all_tasks - else: - if has_tasks: - return exp + all_job_names.append(internal_job_name) + + # If no jobs to run, return early + if not jobs: return None + # Handle _reuse_exp case + if _reuse_exp: + # For internal use: return job names for dependency tracking + return all_job_names + + # Create and run pipeline (with performance optimizations) + pipeline = Pipeline( + name=expname, + cluster=cluster, + jobs=jobs, + reuse_code=reuse_code, # PERFORMANCE: Enable code reuse + reuse_code_exp=reuse_code_exp, # PERFORMANCE: Reuse from specific exp + skip_hf_home_check=skip_hf_home_check, # Validation control + ) + + result = pipeline.run(cluster_config=cluster_config, dry_run=dry_run) + return result + if __name__ == "__main__": typer.main.get_command_name = lambda name: name diff --git a/nemo_skills/pipeline/utils/commands.py b/nemo_skills/pipeline/utils/commands.py new file mode 100644 index 0000000000..cfebb8753b --- /dev/null +++ b/nemo_skills/pipeline/utils/commands.py @@ -0,0 +1,144 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Minimal command builders for declarative pipeline interface. + +These are thin wrappers around existing command construction utilities +in server.py and other modules, designed to work with Command objects. +""" + +from typing import Callable, Dict, Optional, Tuple + +from nemo_skills.pipeline.utils.exp import get_sandbox_command +from nemo_skills.pipeline.utils.server import get_free_port, get_server_command + + +def vllm_server_command( + model: str, + port: Optional[int] = None, + server_type: str = "vllm", + gpus: int = 8, + nodes: int = 1, + args: str = "", + entrypoint: Optional[str] = None, + **kwargs, +) -> Tuple[Callable, Dict]: + """Build vLLM server command. + + Returns a lambda that will build the command when cluster_config is available. + + Args: + model: Model path or name + port: Port to use (if None, will use get_free_port) + server_type: Type of server (vllm, sglang, trtllm, megatron) + gpus: Number of GPUs + nodes: Number of nodes + args: Additional server arguments + entrypoint: Custom entrypoint script + + Returns: + Tuple of (lambda, metadata_dict) + """ + if port is None: + port = get_free_port(strategy="random") + + metadata = { + "port": port, + "log_prefix": "server", + } + + # Return lambda that will call get_server_command when cluster_config is available + def server_cmd_builder(cfg): + cmd, num_tasks = get_server_command( + server_type=server_type, + num_gpus=gpus, + num_nodes=nodes, + model_path=model, + cluster_config=cfg, + server_port=port, + server_args=args, + server_entrypoint=entrypoint, + ) + return (cmd, {"num_tasks": num_tasks}) + + return server_cmd_builder, metadata + + +def sandbox_command(port: Optional[int] = None, **kwargs) -> Tuple[Callable, Dict]: + """Build sandbox command. + + Returns a lambda that will build the command when cluster_config is available. + + Args: + port: Port to use for sandbox + + Returns: + Tuple of (lambda, metadata_dict) + """ + if port is None: + port = get_free_port(strategy="random") + + metadata = { + "port": port, + "log_prefix": "sandbox", + "mounts": [], # Sandbox doesn't mount anything + "environment": { + "LISTEN_PORT": str(port), + "NGINX_PORT": str(port), + }, + } + + # Return lambda that will be evaluated when cluster_config is available + def sandbox_cmd_with_env(cfg): + cmd = get_sandbox_command(cfg) + # Build PYTHONPATH from cluster config + pythonpath_env = {} + for env_var in cfg.get("env_vars", []): + if "PYTHONPATH" in env_var: + pythonpath = env_var[11:] if env_var.startswith("PYTHONPATH=") else env_var + pythonpath_env["PYTHONPATH"] = pythonpath + ":/app" + break + # Return command + additional metadata + return (cmd, {"environment": pythonpath_env}) + + return sandbox_cmd_with_env, metadata + + +def wrap_command(command: str, working_dir: str = "/nemo_run/code", env_vars: Optional[Dict[str, str]] = None) -> str: + """Wrap command with working directory and environment variable setup. + + Args: + command: The command to wrap + working_dir: Working directory to cd into + env_vars: Environment variables to export + + Returns: + Wrapped command string + """ + parts = [] + + # Export environment variables + if env_vars: + for key, value in env_vars.items(): + parts.append(f"export {key}={value}") + + # Change to working directory + if working_dir: + parts.append(f"cd {working_dir}") + + # Add the actual command + parts.append(command) + + return " && ".join(parts) diff --git a/nemo_skills/pipeline/utils/declarative.py b/nemo_skills/pipeline/utils/declarative.py new file mode 100644 index 0000000000..af2bdae4e5 --- /dev/null +++ b/nemo_skills/pipeline/utils/declarative.py @@ -0,0 +1,738 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Simplified declarative pipeline system using only Command for all task types. + +Basic Example (Single job with multiple commands): + from nemo_skills.pipeline.utils.commands import vllm_server_command, sandbox_command + from nemo_skills.pipeline.utils.declarative import Command, CommandGroup, HardwareConfig, Pipeline + + # Commands that run together in one SLURM job + server = Command(command=vllm_server_command(model="Qwen/Qwen3-8B"), gpus=8, name="server") + sandbox = Command(command=sandbox_command(), name="sandbox") + client = Command( + command=lambda: f"curl {server.hostname_ref()}:{server.meta_ref('port')}/health", + name="client" + ) + + # Group them together + inference_group = CommandGroup( + commands=[server, sandbox, client], + hardware=HardwareConfig(partition="batch"), + name="inference" + ) + + # Create and run pipeline + pipeline = Pipeline( + name="my_inference", + cluster="local", + groups=[inference_group] # Legacy mode: single job + ) + pipeline.run() + +Advanced Example (Multiple jobs with dependencies and heterogeneous components): + log_dir = "/experiments/full_pipeline/logs" + # Job 1: Preprocessing + preprocess = Command( + command="python preprocess.py --input data.jsonl --output processed.jsonl", + gpus=0, + name="preprocess" + ) + prep_group = CommandGroup( + commands=[preprocess], + hardware=HardwareConfig(partition="cpu"), + name="prep", + log_dir=log_dir + ) + + # Job 2: Two different model servers (HETEROGENEOUS SLURM job with 2 het components) + server_8b = Command(command=vllm_server_command(model="Qwen/Qwen3-8B"), gpus=8, name="server_8b") + sandbox_8b = Command(command=sandbox_command(), name="sandbox_8b") + eval_8b = Command(command="python eval.py --model 8b", gpus=1, name="eval_8b") + + server_32b = Command(command=vllm_server_command(model="Qwen/Qwen3-32B"), gpus=8, name="server_32b") + sandbox_32b = Command(command=sandbox_command(), name="sandbox_32b") + eval_32b = Command(command="python eval.py --model 32b", gpus=1, name="eval_32b") + + group_8b = CommandGroup(commands=[server_8b, sandbox_8b, eval_8b], name="eval_8b", log_dir=log_dir) + group_32b = CommandGroup(commands=[server_32b, sandbox_32b, eval_32b], name="eval_32b", log_dir=log_dir) + + # Job 3: Report generation (depends on both evaluations) + report = Command( + command="python generate_report.py --output report.txt", + gpus=0, + name="report" + ) + report_group = CommandGroup(commands=[report], name="report", log_dir=log_dir) + + # Create pipeline with dependency graph + pipeline = Pipeline( + name="full_pipeline", + cluster="slurm", + jobs=[ + {"name": "prep", "group": prep_group}, + + # Multi-group heterogeneous job (both eval groups in ONE SLURM job with 2 het components) + {"name": "evals", "groups": [group_8b, group_32b], "dependencies": ["prep"]}, + + # Report depends on the multi-group eval job + {"name": "report", "group": report_group, "dependencies": ["evals"]}, + pipeline.run() +""" + +import inspect +import logging +import shlex +from contextlib import nullcontext +from dataclasses import dataclass, field +from typing import Callable, Dict, List, Optional, Tuple, Union + +import nemo_run as run + +from nemo_skills.pipeline.utils import ( + get_cluster_config, + get_env_variables, + get_executor, + get_exp, + get_exp_handles, + get_tunnel, + run_exp, + temporary_env_update, +) +from nemo_skills.pipeline.utils.commands import wrap_command +from nemo_skills.pipeline.utils.exp import REUSE_CODE_EXP, get_packaging_job_key, install_packages_wrap, tunnel_hash +from nemo_skills.pipeline.utils.mounts import is_mounted_filepath +from nemo_skills.pipeline.utils.packager import get_registered_external_repo +from nemo_skills.utils import get_logger_name + +LOG = logging.getLogger(get_logger_name(__file__)) + + +@dataclass +class Command: + """Declarative command for running tasks in containers. + + The command can be either: + - A string: evaluated immediately + - A callable (lambda): evaluated lazily when the task is prepared + - A tuple (command, metadata): command with metadata like port + - A callable returning (command, metadata): lazy evaluation with metadata + + Using a lambda allows references to work correctly in heterogeneous jobs: + Command(command=lambda: f"curl {server.hostname_ref()}:5000") + + Metadata from command builders (like port) can be referenced: + server = Command(command=vllm_server_command(...)) + client = Command(command=lambda: f"curl {server.hostname_ref()}:{server.meta_ref('port')}") + """ + + command: Union[ + str, + Callable[[], str], # Lambda for cross-group refs + Callable[[Dict], str], # Lambda needing cluster_config (e.g., sandbox) + Tuple[Optional[str], Dict], # Command builder result + Callable[[], Tuple[Optional[str], Dict]], # Lambda returning command builder result + None, + ] + container: str = "nemo-skills" + gpus: Optional[int] = None + nodes: int = 1 + name: Optional[str] = None + working_dir: str = "/nemo_run/code" + env_vars: Dict[str, str] = field(default_factory=dict) + installation_command: Optional[str] = None + port: Optional[int] = None # Can be set from metadata + metadata: Dict[str, any] = field(default_factory=dict) # Stores metadata from command builders + het_group_index: Optional[int] = None # Set automatically by Pipeline + + def __post_init__(self): + # Initialize component (merged from old Component class) + if self.name is None: + self.name = "command" + + # Extract metadata if command is a tuple + self._extract_metadata() + + # Wrap plain strings with environment setup + if isinstance(self.command, str) and (self.env_vars or self.working_dir): + self.command = wrap_command(self.command, self.working_dir, self.env_vars) + + def _extract_metadata(self): + """Extract metadata from command if it's a tuple.""" + if not callable(self.command): + if isinstance(self.command, tuple): + cmd, self.metadata = self.command + self.command = cmd # Can be None for sandbox + + def hostname_ref(self) -> str: + """Get hostname reference for hetjob cross-component communication.""" + if self.het_group_index is None: + return "127.0.0.1" # Local fallback + # For heterogeneous SLURM jobs, resolve nodelist to actual hostname + return f"$(scontrol show hostnames $SLURM_JOB_NODELIST_HET_GROUP_{self.het_group_index} | head -n1)" + + def meta_ref(self, key: str) -> str: + """Get metadata value (like port). Fails if key not found.""" + if key not in self.metadata: + raise KeyError( + f"Metadata key '{key}' not found in Command '{self.name}'. " + f"Available keys: {list(self.metadata.keys())}" + ) + return str(self.metadata[key]) + + def prepare_for_execution(self, cluster_config: Dict) -> Tuple[str, Dict]: + """Prepare command for execution. + + This method: + 1. Evaluates callables (resolves cross-group references and cluster_config-dependent commands) + 2. Adds cross-group environment variables + 3. Wraps with installation_command if provided + + Returns: + Tuple of (final_command, execution_config) + """ + # 1. Evaluate if callable (resolves cross-group references or cluster_config needs) + if callable(self.command): + # Check if lambda needs cluster_config (for sandbox) + + sig = inspect.signature(self.command) + if len(sig.parameters) > 0: + # Lambda expects cluster_config + result = self.command(cluster_config) + else: + # Regular lambda (cross-group refs) + result = self.command() + + if isinstance(result, tuple): + final_command, runtime_metadata = result + # Deep merge metadata, especially environment dict + for key, value in runtime_metadata.items(): + if key == "environment" and key in self.metadata: + # Merge environment dicts instead of replacing + self.metadata[key].update(value) + else: + self.metadata[key] = value + else: + final_command = result + else: + final_command = self.command + + # 2. Wrap with installation_command if provided (matches add_task behavior) + if self.installation_command: + final_command = install_packages_wrap(final_command, self.installation_command) + + # 3. Build execution config from metadata + execution_config = { + "num_tasks": self.metadata.get("num_tasks", 1), + "num_gpus": self.metadata.get("gpus", self.gpus or 0), + "num_nodes": self.metadata.get("nodes", self.nodes), + "environment": self.metadata.get("environment", {}), + "log_prefix": self.metadata.get("log_prefix", "main"), + "mounts": self.metadata.get("mounts"), + "container": self.metadata.get("container", self.container), # Use container from metadata if available + } + + return final_command, execution_config + + def get_name(self) -> str: + return self.name + + +@dataclass +class HardwareConfig: + """Hardware configuration for a group of tasks.""" + + partition: Optional[str] = None + time_min: Optional[str] = None + exclusive: bool = False + num_gpus: Optional[int] = None + num_nodes: Optional[int] = None + + +class CommandGroup: + """Command group where commands run together with shared resource requirements.""" + + def __init__( + self, + commands: List[Command], + hardware: Optional[HardwareConfig] = None, + name: Optional[str] = None, + log_dir: Optional[str] = None, + ): + self.components = commands # Keep as self.components internally for backwards compatibility + self.hardware = hardware or HardwareConfig() + self.name = name + self.log_dir = log_dir + + +class Pipeline: + """Top-level pipeline that composes command groups with dependency support. + + Supports two modes: + 1. Groups: groups=[cmdgroup1, cmdgroup2] - combines all into one job + 2. Jobs: jobs=[{...}, {...}] - supports dependencies and multi-group jobs + """ + + def __init__( + self, + name: str, + cluster: Optional[str] = None, + groups: Optional[List[CommandGroup]] = None, # Legacy mode + jobs: Optional[List[Dict]] = None, # New mode with dependencies + reuse_code: bool = True, + reuse_code_exp: Optional[str] = None, + skip_hf_home_check: bool = False, + with_ray: bool = False, + run_after: Optional[Union[str, List[str]]] = None, # Pipeline-level dependency on other experiments + ): + self.name = name + self.cluster = cluster + self.reuse_code = reuse_code + self.reuse_code_exp = reuse_code_exp + self.skip_hf_home_check = skip_hf_home_check + self.with_ray = with_ray + self.run_after = run_after + self._cluster_config: Optional[Dict] = None + + if groups is not None and jobs is not None: + raise ValueError("Cannot specify both 'groups' and 'jobs'.") + + if groups is not None: + self.jobs = [{"group": g} for g in groups] + self._legacy_mode = True + elif jobs is not None: + self.jobs = jobs + self._legacy_mode = False + else: + raise ValueError("Must specify either 'groups' or 'jobs'") + + self._assign_het_group_indices() + + def _assign_het_group_indices(self): + """Assign het_group_index to all components for cross-group references.""" + # Collect all groups from jobs + all_groups = [] + for job_spec in self.jobs: + if "group" in job_spec: + all_groups.append(job_spec["group"]) + elif "groups" in job_spec: + all_groups.extend(job_spec["groups"]) + + # Assign indices + for group_idx, group in enumerate(all_groups): + for component in group.components: + component.het_group_index = group_idx + LOG.debug(f"Assigned het_group_index {group_idx} to {component.get_name()}") + + def _get_cluster_config(self) -> Dict: + """Get cluster configuration, loading it if necessary.""" + if self._cluster_config is None: + if self.cluster is None: + raise ValueError("Must specify cluster either in Pipeline() or run() method") + + self._cluster_config = get_cluster_config(self.cluster) + return self._cluster_config + + def run( + self, + cluster_config: Optional[Dict] = None, + cluster: Optional[str] = None, + dry_run: bool = False, + log_dir: Optional[str] = None, + ): + """Execute the pipeline by calling NeMo-Run directly. + + Args: + cluster_config: Cluster configuration dict (optional, can use cluster name instead) + cluster: Cluster name to load config from (optional if cluster_config provided) + dry_run: If True, validate without executing + log_dir: Default log directory for groups that don't specify one (optional) + """ + if not self.jobs: + LOG.info("No jobs to execute") + return None + + # Determine cluster config to use + if cluster_config is not None: + final_cluster_config = cluster_config + elif cluster is not None: + final_cluster_config = get_cluster_config(cluster) + else: + final_cluster_config = self._get_cluster_config() + + # Validate HF_HOME (matching add_task validation) + if final_cluster_config["executor"] != "none" and not self.skip_hf_home_check: + env_vars = get_env_variables(final_cluster_config) + if "HF_HOME" not in env_vars: + raise RuntimeError( + "Invalid cluster_config: HF_HOME is missing from env_vars while skip_hf_home_check=False.\n" + f"Current env_vars: {final_cluster_config.get('env_vars', [])}\n" + "Please add a new variable: HF_HOME=/mounted/path/to/your/hf_home" + ) + if not is_mounted_filepath(final_cluster_config, env_vars["HF_HOME"]): + raise RuntimeError(f"Invalid cluster_config: HF_HOME={env_vars['HF_HOME']} is not a mounted path.") + + # Track job name -> task handle for dependency resolution + job_name_to_handle = {} + + with get_exp(self.name, final_cluster_config) as exp: + # Process each job in order + for job_spec in self.jobs: + job_name = job_spec.get("name", "unnamed") + + # Resolve dependencies to task handles (matching add_task lines 390-404) + run_after_deps = [] + + # Handle dependencies from job spec + job_dependencies = job_spec.get("dependencies", []) + # Handle explicit None (when dependencies key exists but value is None) + if job_dependencies is None: + job_dependencies = [] + + # If no dependencies and Pipeline has run_after, apply it (matching add_task behavior) + if not job_dependencies and self.run_after: + run_after_list = self.run_after if isinstance(self.run_after, list) else [self.run_after] + job_dependencies = run_after_list + + for dep in job_dependencies: + if isinstance(dep, str): + # String dependency - look up by name + if dep in job_name_to_handle: + # Internal pipeline dependency - add the handle + run_after_deps.append(job_name_to_handle[dep]) + LOG.info(f"Job '{job_name}' depends on '{dep}' (handle: {job_name_to_handle[dep]})") + else: + # External experiment name - will be resolved by get_exp_handles below + if final_cluster_config["executor"] == "slurm": + exp_handles = get_exp_handles(dep) + if len(exp_handles) == 0: + LOG.warning( + f"No pending or running tasks found for experiment {dep}, cannot set dependencies." + ) + run_after_deps.extend(exp_handles) + LOG.info( + f"Job '{job_name}' depends on external experiment '{dep}' ({len(exp_handles)} tasks)" + ) + + # Convert empty list to None (matching add_task line 402) + if len(run_after_deps) == 0: + run_after_deps = None + + # Check if this is a multi-group job or single group + if "groups" in job_spec: + # If only one group in list, use single group job for efficiency + if len(job_spec["groups"]) == 1: + task_handle = self._add_single_group_job( + exp, + job_spec["groups"][0], + final_cluster_config, + default_log_dir=log_dir, + run_after=run_after_deps if run_after_deps else None, + ) + else: + # True multi-group: combine multiple groups into one heterogeneous SLURM job + task_handle = self._add_multi_group_job( + exp, + job_spec["groups"], + final_cluster_config, + default_log_dir=log_dir, + run_after=run_after_deps if run_after_deps else None, + ) + elif "group" in job_spec: + # Single group job + task_handle = self._add_single_group_job( + exp, + job_spec["group"], + final_cluster_config, + default_log_dir=log_dir, + run_after=run_after_deps if run_after_deps else None, + ) + else: + raise ValueError(f"Job spec must have either 'group' or 'groups': {job_spec}") + + # Track task handle for this job + job_name_to_handle[job_name] = task_handle + LOG.info(f"Added job '{job_name}' with task_handle={task_handle}") + + if not dry_run: + run_exp(exp, final_cluster_config) + + # Cache experiment for code reuse in future runs (matching add_task behavior) + if final_cluster_config["executor"] != "none": + tunnel = get_tunnel(final_cluster_config) + cur_tunnel_hash = tunnel_hash(tunnel) + if cur_tunnel_hash not in REUSE_CODE_EXP: + REUSE_CODE_EXP[cur_tunnel_hash] = exp + LOG.info("Cached experiment for future code reuse") + + return exp + + def _prepare_command(self, command, cluster_config: Dict) -> Tuple[str, Dict]: + """Prepare command and handle mpirun wrapping.""" + final_cmd, exec_config = command.prepare_for_execution(cluster_config) + + # Handle mpirun wrapping for non-SLURM executors + num_tasks = exec_config["num_tasks"] + if cluster_config["executor"] != "slurm" and num_tasks > 1: + final_cmd = f"mpirun --allow-run-as-root -np {num_tasks} bash -c {shlex.quote(final_cmd)}" + + return final_cmd, exec_config + + def _resolve_container(self, exec_config: Dict, command, cluster_config: Dict) -> str: + """Resolve container name to image path.""" + container_name = exec_config.get("container", command.container) + if container_name in cluster_config.get("containers", {}): + return cluster_config["containers"][container_name] + return container_name + + def _create_executor( + self, + command, + exec_config: Dict, + container_image: str, + cluster_config: Dict, + log_dir: str, + hardware: HardwareConfig, + heterogeneous: bool, + het_group: int, + total_het_groups: int, + overlap: bool, + ): + """Create executor with optional environment update.""" + env_context = ( + temporary_env_update(cluster_config, exec_config["environment"]) + if exec_config.get("environment") + else nullcontext() + ) + + with env_context: + return get_executor( + cluster_config=cluster_config, + container=container_image, + num_nodes=exec_config["num_nodes"], + tasks_per_node=exec_config["num_tasks"], + gpus_per_node=exec_config["num_gpus"], + job_name=command.name, + log_dir=log_dir, + log_prefix=exec_config["log_prefix"], + partition=hardware.partition if hardware else None, + time_min=hardware.time_min if hardware else None, + heterogeneous=heterogeneous, + het_group=het_group, + total_het_groups=total_het_groups, + overlap=overlap, + mounts=exec_config.get("mounts"), + with_ray=self.with_ray, + slurm_kwargs={"exclusive": hardware.exclusive} if (hardware and hardware.exclusive) else None, + ) + + def _plan_and_add_job( + self, + exp, + groups: List[CommandGroup], + cluster_config: Dict, + default_log_dir: Optional[str] = None, + run_after: Optional[List] = None, + heterogeneous: bool = False, + ) -> str: + """Plan commands/executors for one or more groups and add to experiment. + + This encapsulates shared logic between single-group and multi-group jobs. Behavior + differences are controlled by the 'heterogeneous' flag and the provided 'groups'. + """ + + # Dependencies are only passed through for SLURM + dependencies = run_after if (run_after and cluster_config["executor"] == "slurm") else None + + # Resolve log directory (use first group's log_dir if present) + log_dir = groups[0].log_dir or default_log_dir + if log_dir is None: + raise ValueError(f"CommandGroup '{groups[0].name}' must have log_dir set, or provide it to pipeline.run()") + + commands: List[str] = [] + executors: List = [] + het_group_indices: List[int] = [] + + # In heterogeneous jobs, collect environment from all commands for cross-component refs + shared_env_vars: Dict[str, str] = {} + if heterogeneous: + for het_idx, group in enumerate(groups): + for command in group.components: + _, exec_config_probe = command.prepare_for_execution(cluster_config) + shared_env_vars.update(exec_config_probe.get("environment", {})) + + # PERFORMANCE: Track first packager to share across executors (single-group only) + shared_packager = None + + # Build commands and executors + for het_idx, group in enumerate(groups): + has_multiple_components = len(group.components) > 1 + total_het_groups = ( + len(groups) if heterogeneous else (len(group.components) if has_multiple_components else 1) + ) + + # For single-group jobs with multiple components, allow job-level GPU override for sbatch allocation + job_level_gpus = ( + group.hardware.num_gpus if (not heterogeneous and has_multiple_components and group.hardware) else None + ) + + for comp_idx, command in enumerate(group.components): + final_cmd, exec_config = self._prepare_command(command, cluster_config) + commands.append(final_cmd) + + # Adjust GPU allocation (first component gets job-level GPUs for sbatch) for single-group jobs + exec_config["num_gpus"] = exec_config["num_gpus"] or 0 + if (not heterogeneous) and (comp_idx == 0) and (job_level_gpus is not None): + exec_config["num_gpus"] = job_level_gpus + + # Merge shared environment for heterogeneous jobs + if heterogeneous and shared_env_vars: + exec_config["environment"].update(shared_env_vars) + + # Resolve container and create executor + container_image = self._resolve_container(exec_config, command, cluster_config) + executor = self._create_executor( + command, + exec_config, + container_image, + cluster_config, + log_dir, + group.hardware, + heterogeneous, + het_idx if heterogeneous else comp_idx, + total_het_groups, + (len(group.components) > 1), + ) + + # Share packager across executors for single-group jobs + if not heterogeneous: + if comp_idx == 0 and het_idx == 0: + shared_packager = executor.packager + else: + executor.packager = shared_packager + + executors.append(executor) + if heterogeneous: + het_group_indices.append(het_idx) + + # For heterogeneous jobs, set het_group_indices on the first executor + if heterogeneous and executors: + executors[0].het_group_indices = het_group_indices + + # PERFORMANCE: Handle code reuse from previous experiments (single-group only) + if (not heterogeneous) and cluster_config["executor"] != "none": + tunnel = get_tunnel(cluster_config) + if self.reuse_code: + reuse_exp = self.reuse_code_exp or REUSE_CODE_EXP.get(tunnel_hash(tunnel)) + if reuse_exp is not None: + if isinstance(reuse_exp, str): + try: + reuse_exp = run.Experiment.from_id(reuse_exp) + except Exception: + try: + reuse_exp = run.Experiment.from_title(reuse_exp) + except Exception: + LOG.warning(f"Failed to load experiment {reuse_exp} for code reuse") + reuse_exp = None + if reuse_exp is not None: + LOG.info(f"Trying to reuse code from experiment {reuse_exp._title}") + reuse_key = get_packaging_job_key(reuse_exp._id, "nemo-run") + if reuse_key in reuse_exp.tunnels[tunnel.key].packaging_jobs: + reuse_dir = reuse_exp.tunnels[tunnel.key].packaging_jobs[reuse_key].dst_path + for executor in executors: + executor.packager.symlink_from_remote_dir = reuse_dir + LOG.info(f"Successfully reused code from {reuse_key}") + else: + LOG.warning(f"Relevant packaging job not found for experiment {reuse_exp._title}") + else: + # If reuse_code=False, clear cache (matching original behavior) + REUSE_CODE_EXP.pop(tunnel_hash(tunnel), None) + + # Handle executor="none" path replacements (single-group only) + if (not heterogeneous) and cluster_config["executor"] == "none": + for idx in range(len(commands)): + commands[idx] = commands[idx].replace( + "/nemo_run/code/nemo_skills", str(get_registered_external_repo("nemo_skills").path) + ) + commands[idx] = commands[idx].replace("/nemo_run/code", "./") + + # Ray metadata handling + if self.with_ray and cluster_config["executor"] == "slurm": + metadata = {"use_with_ray_cluster": True} + else: + metadata = None + + # Add to experiment and return task ID + if (not heterogeneous) and len(commands) == 1: + task_id = exp.add( + run.Script(inline=commands[0], metadata=metadata), + executor=executors[0], + name="nemo-run", + dependencies=dependencies, + ) + else: + task_id = exp.add( + [ + run.Script(inline=cmd, metadata=(metadata if idx == 0 else None)) + for idx, cmd in enumerate(commands) + ], + executor=executors, + name="nemo-run", + dependencies=dependencies, + ) + + return task_id + + def _add_single_group_job( + self, + exp, + group: CommandGroup, + cluster_config: Dict, + default_log_dir: Optional[str] = None, + run_after: Optional[List] = None, + ) -> str: + """Add a single CommandGroup as one job and return its task handle.""" + + return self._plan_and_add_job( + exp=exp, + groups=[group], + cluster_config=cluster_config, + default_log_dir=default_log_dir, + run_after=run_after, + heterogeneous=False, + ) + + def _add_multi_group_job( + self, + exp, + groups: List[CommandGroup], + cluster_config: Dict, + default_log_dir: Optional[str] = None, + run_after: Optional[List] = None, + ) -> str: + """Add multiple CommandGroups as a single heterogeneous SLURM job and return task handle.""" + + return self._plan_and_add_job( + exp=exp, + groups=groups, + cluster_config=cluster_config, + default_log_dir=default_log_dir, + run_after=run_after, + heterogeneous=True, + ) + + +# Backwards compatibility alias +Pipeline = Pipeline From 47308a0429a3628e9c99f5a7b9d06ea915b1c88e Mon Sep 17 00:00:00 2001 From: George Armstrong Date: Fri, 3 Oct 2025 12:39:30 -0700 Subject: [PATCH 02/29] MAINT fix merge Signed-off-by: George Armstrong --- nemo_skills/pipeline/generate.py | 41 ++++------------------- nemo_skills/pipeline/utils/commands.py | 5 +-- nemo_skills/pipeline/utils/declarative.py | 2 ++ 3 files changed, 11 insertions(+), 37 deletions(-) diff --git a/nemo_skills/pipeline/generate.py b/nemo_skills/pipeline/generate.py index 626fe62fb1..e18d7e2543 100644 --- a/nemo_skills/pipeline/generate.py +++ b/nemo_skills/pipeline/generate.py @@ -46,8 +46,10 @@ def _create_commandgroup_from_config( installation_command: Optional[str], get_server_command_fn: Callable, partition: Optional[str], + qos: Optional[str], time_min: Optional[str], exclusive: bool, + keep_mounts_for_sandbox: bool, task_name: str, log_dir: str, ) -> CommandGroup: @@ -109,7 +111,7 @@ def make_server_cmd(cfg): # Wrap to ensure log_prefix is set in returned metadata def make_sandbox_cmd(cfg): # sandbox_command returns (callable, metadata), so we need to call the callable - cmd_builder, initial_metadata = sandbox_command(port=sandbox_port) + cmd_builder, initial_metadata = sandbox_command(port=sandbox_port, keep_mounts=keep_mounts_for_sandbox) # Call the builder to get the actual command string cmd_string, runtime_metadata = cmd_builder(cfg) # Merge metadata @@ -134,6 +136,7 @@ def make_sandbox_cmd(cfg): commands=components, hardware=HardwareConfig( partition=partition, + qos=qos, time_min=time_min, exclusive=exclusive, num_gpus=max_gpus, @@ -434,8 +437,10 @@ def generate( installation_command=installation_command, get_server_command_fn=generation_task.get_server_command_fn(), partition=partition, + qos=qos, time_min=time_min, exclusive=exclusive, + keep_mounts_for_sandbox=keep_mounts_for_sandbox, task_name=task_name, log_dir=log_dir, ) @@ -462,40 +467,6 @@ def generate( "dependencies": job_deps, } ) - prev_tasks = _task_dependencies - for _ in range(dependent_jobs + 1): - task_name = f"{expname}-rs{seed}" if seed is not None else expname - if chunk_id is not None: - task_name += f"-chunk{chunk_id}" - new_task = pipeline_utils.add_task( - exp, - cmd=pipeline_utils.wrap_python_path(cmd=cmd), - task_name=task_name, - log_dir=log_dir, - container=cluster_config["containers"]["nemo-skills"], - cluster_config=cluster_config, - partition=partition, - qos=qos, - time_min=time_min, - server_config=server_config, - with_sandbox=with_sandbox, - keep_mounts_for_sandbox=keep_mounts_for_sandbox, - sandbox_port=None if get_random_port else 6000, - run_after=run_after, - reuse_code=reuse_code, - reuse_code_exp=reuse_code_exp, - task_dependencies=( - prev_tasks if cluster_config["executor"] == "slurm" else all_tasks + _task_dependencies - ), - get_server_command=generation_task.get_server_command_fn(), - slurm_kwargs={"exclusive": exclusive} if exclusive else None, - installation_command=installation_command, - skip_hf_home_check=skip_hf_home_check, - ) - prev_tasks = [new_task] - all_tasks.append(new_task) - if has_tasks and not _reuse_exp: # if we are reusing an experiment, the tasks will run from there - pipeline_utils.run_exp(exp, cluster_config, dry_run=dry_run) all_job_names.append(internal_job_name) diff --git a/nemo_skills/pipeline/utils/commands.py b/nemo_skills/pipeline/utils/commands.py index cfebb8753b..f2402c590b 100644 --- a/nemo_skills/pipeline/utils/commands.py +++ b/nemo_skills/pipeline/utils/commands.py @@ -76,13 +76,14 @@ def server_cmd_builder(cfg): return server_cmd_builder, metadata -def sandbox_command(port: Optional[int] = None, **kwargs) -> Tuple[Callable, Dict]: +def sandbox_command(port: Optional[int] = None, keep_mounts: bool = False, **kwargs) -> Tuple[Callable, Dict]: """Build sandbox command. Returns a lambda that will build the command when cluster_config is available. Args: port: Port to use for sandbox + keep_mounts: If True, keep mounts from cluster config. If False, use empty mounts for safety. Returns: Tuple of (lambda, metadata_dict) @@ -93,7 +94,7 @@ def sandbox_command(port: Optional[int] = None, **kwargs) -> Tuple[Callable, Dic metadata = { "port": port, "log_prefix": "sandbox", - "mounts": [], # Sandbox doesn't mount anything + "mounts": None if keep_mounts else [], # None means use cluster config mounts, [] means no mounts "environment": { "LISTEN_PORT": str(port), "NGINX_PORT": str(port), diff --git a/nemo_skills/pipeline/utils/declarative.py b/nemo_skills/pipeline/utils/declarative.py index af2bdae4e5..2ef4f1e03f 100644 --- a/nemo_skills/pipeline/utils/declarative.py +++ b/nemo_skills/pipeline/utils/declarative.py @@ -255,6 +255,7 @@ class HardwareConfig: """Hardware configuration for a group of tasks.""" partition: Optional[str] = None + qos: Optional[str] = None time_min: Optional[str] = None exclusive: bool = False num_gpus: Optional[int] = None @@ -528,6 +529,7 @@ def _create_executor( log_dir=log_dir, log_prefix=exec_config["log_prefix"], partition=hardware.partition if hardware else None, + qos=hardware.qos if hardware else None, time_min=hardware.time_min if hardware else None, heterogeneous=heterogeneous, het_group=het_group, From 0e3688d15d74e8016665d116d6a6a90ada9fe89c Mon Sep 17 00:00:00 2001 From: George Armstrong Date: Fri, 3 Oct 2025 12:53:40 -0700 Subject: [PATCH 03/29] MAINT fix hetgroup indexing Signed-off-by: George Armstrong --- nemo_skills/pipeline/utils/declarative.py | 27 ++++++++--------------- 1 file changed, 9 insertions(+), 18 deletions(-) diff --git a/nemo_skills/pipeline/utils/declarative.py b/nemo_skills/pipeline/utils/declarative.py index 2ef4f1e03f..415f542457 100644 --- a/nemo_skills/pipeline/utils/declarative.py +++ b/nemo_skills/pipeline/utils/declarative.py @@ -155,7 +155,7 @@ class Command: installation_command: Optional[str] = None port: Optional[int] = None # Can be set from metadata metadata: Dict[str, any] = field(default_factory=dict) # Stores metadata from command builders - het_group_index: Optional[int] = None # Set automatically by Pipeline + het_group_index: Optional[int] = None # Set per-job by Pipeline (not global) def __post_init__(self): # Initialize component (merged from old Component class) @@ -319,23 +319,7 @@ def __init__( else: raise ValueError("Must specify either 'groups' or 'jobs'") - self._assign_het_group_indices() - - def _assign_het_group_indices(self): - """Assign het_group_index to all components for cross-group references.""" - # Collect all groups from jobs - all_groups = [] - for job_spec in self.jobs: - if "group" in job_spec: - all_groups.append(job_spec["group"]) - elif "groups" in job_spec: - all_groups.extend(job_spec["groups"]) - - # Assign indices - for group_idx, group in enumerate(all_groups): - for component in group.components: - component.het_group_index = group_idx - LOG.debug(f"Assigned het_group_index {group_idx} to {component.get_name()}") + # Note: het_group_indices are assigned per-job in _plan_and_add_job, not globally def _get_cluster_config(self) -> Dict: """Get cluster configuration, loading it if necessary.""" @@ -591,6 +575,13 @@ def _plan_and_add_job( ) for comp_idx, command in enumerate(group.components): + # Assign het_group_index ONLY for heterogeneous jobs (per-job, not global) + # Non-heterogeneous jobs use localhost, so het_group_index should remain None + if heterogeneous: + command.het_group_index = het_idx + else: + command.het_group_index = None + final_cmd, exec_config = self._prepare_command(command, cluster_config) commands.append(final_cmd) From 849253f282fb85af72c7ebd6ac763cce3237ae52 Mon Sep 17 00:00:00 2001 From: George Armstrong Date: Fri, 3 Oct 2025 12:57:04 -0700 Subject: [PATCH 04/29] MAINT cleanup Signed-off-by: George Armstrong --- nemo_skills/pipeline/generate.py | 57 +++++++++++------------ nemo_skills/pipeline/utils/declarative.py | 28 +++++------ 2 files changed, 38 insertions(+), 47 deletions(-) diff --git a/nemo_skills/pipeline/generate.py b/nemo_skills/pipeline/generate.py index e18d7e2543..959635f8bc 100644 --- a/nemo_skills/pipeline/generate.py +++ b/nemo_skills/pipeline/generate.py @@ -53,36 +53,33 @@ def _create_commandgroup_from_config( task_name: str, log_dir: str, ) -> CommandGroup: - """Create a CommandGroup from server_config, matching add_task logic. + """Create a CommandGroup from server_config. - This function replicates the component ordering from add_task: + Component ordering: 1. Server (if server_config provided) - 2. Main client command + 2. Client command 3. Sandbox (if with_sandbox=True) """ components = [] - # 1. Add server if server_config is provided (matches add_task lines 433-463) + # 1. Add server if server_config is provided if server_config is not None and int(server_config["num_gpus"]) > 0: - # Extract server container (matches line 435) server_type = server_config["server_type"] server_container = server_config.pop("container", cluster_config["containers"][server_type]) - # Use the EXISTING get_server_command function via lambda - # This ensures we use the same server command construction as add_task + # Create server command builder that defers execution until cluster_config is available server_config_copy = server_config.copy() def make_server_cmd(cfg): cmd, num_tasks = get_server_command_fn(**server_config_copy, cluster_config=cfg) - # Include log_prefix in metadata to match old add_task behavior return ( cmd, { "num_tasks": num_tasks, "gpus": server_config_copy["num_gpus"], "nodes": server_config_copy["num_nodes"], - "log_prefix": "server", # Explicitly set log prefix + "log_prefix": "server", }, ) @@ -91,24 +88,23 @@ def make_server_cmd(cfg): container=server_container, gpus=server_config["num_gpus"], nodes=server_config["num_nodes"], - name=task_name, # Use base task_name, not with _server suffix + name=task_name, ) components.append(server_cmd) - # 2. Add main generation command (matches add_task lines 466-580) + # 2. Add main generation command client_cmd = Command( - command=generation_cmd, # Already built with get_generation_cmd! + command=generation_cmd, container=cluster_config["containers"]["nemo-skills"], - name=task_name, # Use base task_name, not with _client suffix + name=task_name, installation_command=installation_command, - metadata={"log_prefix": "main"}, # Set log prefix to match old add_task + metadata={"log_prefix": "main"}, ) components.append(client_cmd) - # 3. Add sandbox if requested (matches add_task lines 527-565) + # 3. Add sandbox if requested if with_sandbox: - # Use existing sandbox_command builder - # Wrap to ensure log_prefix is set in returned metadata + def make_sandbox_cmd(cfg): # sandbox_command returns (callable, metadata), so we need to call the callable cmd_builder, initial_metadata = sandbox_command(port=sandbox_port, keep_mounts=keep_mounts_for_sandbox) @@ -117,18 +113,18 @@ def make_sandbox_cmd(cfg): # Merge metadata metadata = initial_metadata.copy() metadata.update(runtime_metadata) - metadata["log_prefix"] = "sandbox" # Explicitly set log prefix + metadata["log_prefix"] = "sandbox" return (cmd_string, metadata) sandbox_cmd = Command( command=make_sandbox_cmd, container=cluster_config["containers"]["sandbox"], - name=task_name, # Use base task_name, not with _sandbox suffix + name=task_name, ) components.append(sandbox_cmd) - # Find MAXIMUM GPUs needed by any component for the HardwareConfig - # This is critical for multi-component jobs - the job-level request must be the max + # Find maximum GPUs/nodes needed by any component for the HardwareConfig + # The job-level resource request must be the maximum across all components max_gpus = max((comp.gpus or 0) for comp in components) max_nodes = max((comp.nodes or 1) for comp in components) @@ -417,13 +413,12 @@ def generate( ) cmd = pipeline_utils.wrap_python_path(cmd=cmd) - # Base task name (same for all dependent jobs in chain - matches original!) + # Base task name (shared across all dependent jobs in the chain) task_name = f"{expname}-rs{seed}" if seed is not None else expname if chunk_id is not None: task_name += f"-chunk{chunk_id}" - # Handle dependent_jobs chain (matching add_task behavior) - # Note: run_after will be handled by Stage for jobs that don't have task_dependencies + # Handle dependent_jobs chain dependencies = _task_dependencies.copy() if _task_dependencies else [] for dep_idx in range(dependent_jobs + 1): @@ -448,16 +443,16 @@ def generate( # Use unique internal job name for dependency tracking, but same task_name internal_job_name = f"{task_name}-dep{dep_idx}" if dep_idx > 0 else task_name - # Build dependencies: first job gets _task_dependencies+run_after, rest get previous in chain + # Build dependencies: first job in chain gets external dependencies, rest chain to previous if dep_idx == 0: - # First job: add run_after if no task_dependencies (matching add_task logic) + # First job: add run_after if no task_dependencies job_deps = dependencies.copy() if dependencies else [] if not dependencies and run_after: run_after_list = run_after if isinstance(run_after, list) else [run_after] job_deps.extend(run_after_list) job_deps = job_deps if job_deps else None else: - # Subsequent jobs in chain depend on previous + # Subsequent jobs in chain depend on previous job job_deps = [f"{task_name}-dep{dep_idx - 1}"] jobs.append( @@ -479,14 +474,14 @@ def generate( # For internal use: return job names for dependency tracking return all_job_names - # Create and run pipeline (with performance optimizations) + # Create and run pipeline pipeline = Pipeline( name=expname, cluster=cluster, jobs=jobs, - reuse_code=reuse_code, # PERFORMANCE: Enable code reuse - reuse_code_exp=reuse_code_exp, # PERFORMANCE: Reuse from specific exp - skip_hf_home_check=skip_hf_home_check, # Validation control + reuse_code=reuse_code, + reuse_code_exp=reuse_code_exp, + skip_hf_home_check=skip_hf_home_check, ) result = pipeline.run(cluster_config=cluster_config, dry_run=dry_run) diff --git a/nemo_skills/pipeline/utils/declarative.py b/nemo_skills/pipeline/utils/declarative.py index 415f542457..8c604c4655 100644 --- a/nemo_skills/pipeline/utils/declarative.py +++ b/nemo_skills/pipeline/utils/declarative.py @@ -38,7 +38,7 @@ pipeline = Pipeline( name="my_inference", cluster="local", - groups=[inference_group] # Legacy mode: single job + groups=[inference_group] ) pipeline.run() @@ -158,7 +158,7 @@ class Command: het_group_index: Optional[int] = None # Set per-job by Pipeline (not global) def __post_init__(self): - # Initialize component (merged from old Component class) + # Initialize defaults if self.name is None: self.name = "command" @@ -229,7 +229,7 @@ def prepare_for_execution(self, cluster_config: Dict) -> Tuple[str, Dict]: else: final_command = self.command - # 2. Wrap with installation_command if provided (matches add_task behavior) + # 2. Wrap with installation_command if provided if self.installation_command: final_command = install_packages_wrap(final_command, self.installation_command) @@ -272,7 +272,7 @@ def __init__( name: Optional[str] = None, log_dir: Optional[str] = None, ): - self.components = commands # Keep as self.components internally for backwards compatibility + self.components = commands self.hardware = hardware or HardwareConfig() self.name = name self.log_dir = log_dir @@ -357,7 +357,7 @@ def run( else: final_cluster_config = self._get_cluster_config() - # Validate HF_HOME (matching add_task validation) + # Validate HF_HOME environment variable if final_cluster_config["executor"] != "none" and not self.skip_hf_home_check: env_vars = get_env_variables(final_cluster_config) if "HF_HOME" not in env_vars: @@ -377,7 +377,7 @@ def run( for job_spec in self.jobs: job_name = job_spec.get("name", "unnamed") - # Resolve dependencies to task handles (matching add_task lines 390-404) + # Resolve dependencies to task handles run_after_deps = [] # Handle dependencies from job spec @@ -386,7 +386,7 @@ def run( if job_dependencies is None: job_dependencies = [] - # If no dependencies and Pipeline has run_after, apply it (matching add_task behavior) + # If no job-level dependencies, apply pipeline-level run_after if not job_dependencies and self.run_after: run_after_list = self.run_after if isinstance(self.run_after, list) else [self.run_after] job_dependencies = run_after_list @@ -411,7 +411,7 @@ def run( f"Job '{job_name}' depends on external experiment '{dep}' ({len(exp_handles)} tasks)" ) - # Convert empty list to None (matching add_task line 402) + # Convert empty list to None for cleaner handling if len(run_after_deps) == 0: run_after_deps = None @@ -454,7 +454,7 @@ def run( if not dry_run: run_exp(exp, final_cluster_config) - # Cache experiment for code reuse in future runs (matching add_task behavior) + # Cache experiment for code reuse in future runs if final_cluster_config["executor"] != "none": tunnel = get_tunnel(final_cluster_config) cur_tunnel_hash = tunnel_hash(tunnel) @@ -559,7 +559,7 @@ def _plan_and_add_job( _, exec_config_probe = command.prepare_for_execution(cluster_config) shared_env_vars.update(exec_config_probe.get("environment", {})) - # PERFORMANCE: Track first packager to share across executors (single-group only) + # Share packager across executors for efficiency (single-group only) shared_packager = None # Build commands and executors @@ -624,7 +624,7 @@ def _plan_and_add_job( if heterogeneous and executors: executors[0].het_group_indices = het_group_indices - # PERFORMANCE: Handle code reuse from previous experiments (single-group only) + # Handle code reuse from previous experiments (single-group only) if (not heterogeneous) and cluster_config["executor"] != "none": tunnel = get_tunnel(cluster_config) if self.reuse_code: @@ -650,7 +650,7 @@ def _plan_and_add_job( else: LOG.warning(f"Relevant packaging job not found for experiment {reuse_exp._title}") else: - # If reuse_code=False, clear cache (matching original behavior) + # If reuse_code=False, clear cache REUSE_CODE_EXP.pop(tunnel_hash(tunnel), None) # Handle executor="none" path replacements (single-group only) @@ -725,7 +725,3 @@ def _add_multi_group_job( run_after=run_after, heterogeneous=True, ) - - -# Backwards compatibility alias -Pipeline = Pipeline From 934f3344f496f2e9f33c50103a20e733a3b0162c Mon Sep 17 00:00:00 2001 From: George Armstrong Date: Fri, 3 Oct 2025 13:01:48 -0700 Subject: [PATCH 05/29] TST first round of tests Signed-off-by: George Armstrong --- tests/test_declarative_pipeline.py | 600 +++++++++++++++++++++++++++++ 1 file changed, 600 insertions(+) create mode 100644 tests/test_declarative_pipeline.py diff --git a/tests/test_declarative_pipeline.py b/tests/test_declarative_pipeline.py new file mode 100644 index 0000000000..6af115675f --- /dev/null +++ b/tests/test_declarative_pipeline.py @@ -0,0 +1,600 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for the declarative pipeline system.""" + +from unittest.mock import MagicMock, patch + +import pytest + +from nemo_skills.pipeline.utils.declarative import Command, CommandGroup, HardwareConfig, Pipeline + + +class TestCommand: + """Test Command class functionality.""" + + def test_command_basic_string(self): + """Test creating a Command with a simple string.""" + cmd = Command(command="echo hello", name="test") + assert cmd.name == "test" + assert cmd.container == "nemo-skills" + assert cmd.gpus is None + assert cmd.nodes == 1 + + def test_command_with_metadata(self): + """Test Command with metadata tuple.""" + cmd = Command(command=("echo hello", {"port": 8080, "log_prefix": "server"}), name="server") + assert cmd.metadata["port"] == 8080 + assert cmd.metadata["log_prefix"] == "server" + # Command gets wrapped with working_dir by default + assert "echo hello" in cmd.command + + def test_command_with_callable(self): + """Test Command with callable that returns tuple.""" + + def make_cmd(): + return ("echo world", {"port": 5000}) + + cmd = Command(command=make_cmd, name="dynamic") + assert callable(cmd.command) + assert cmd.name == "dynamic" + + def test_command_prepare_for_execution_string(self): + """Test prepare_for_execution with string command.""" + cmd = Command(command="python script.py", gpus=2, name="test") + cluster_config = {"executor": "local", "containers": {}} + + final_cmd, exec_config = cmd.prepare_for_execution(cluster_config) + + assert "python script.py" in final_cmd + assert exec_config["num_gpus"] == 2 + assert exec_config["num_nodes"] == 1 + assert exec_config["num_tasks"] == 1 + + def test_command_prepare_for_execution_callable(self): + """Test prepare_for_execution with callable command.""" + + def make_cmd(): + return "echo test" + + cmd = Command(command=make_cmd, name="test") + cluster_config = {"executor": "local", "containers": {}} + + final_cmd, exec_config = cmd.prepare_for_execution(cluster_config) + + assert final_cmd == "echo test" + + def test_command_prepare_for_execution_callable_with_metadata(self): + """Test prepare_for_execution with callable returning tuple.""" + + def make_cmd(): + return ("echo metadata", {"num_tasks": 4, "environment": {"VAR": "value"}}) + + cmd = Command(command=make_cmd, name="test") + cluster_config = {"executor": "local", "containers": {}} + + final_cmd, exec_config = cmd.prepare_for_execution(cluster_config) + + assert final_cmd == "echo metadata" + assert exec_config["num_tasks"] == 4 + assert exec_config["environment"]["VAR"] == "value" + + def test_command_prepare_for_execution_with_cluster_config(self): + """Test prepare_for_execution with callable needing cluster_config.""" + + def make_cmd(cfg): + return f"echo {cfg['test_param']}" + + cmd = Command(command=make_cmd, name="test") + cluster_config = {"executor": "local", "containers": {}, "test_param": "value123"} + + final_cmd, exec_config = cmd.prepare_for_execution(cluster_config) + + assert final_cmd == "echo value123" + + def test_command_meta_ref(self): + """Test meta_ref for accessing metadata.""" + cmd = Command(command=("echo test", {"port": 8080, "host": "localhost"}), name="server") + + assert cmd.meta_ref("port") == "8080" + assert cmd.meta_ref("host") == "localhost" + + def test_command_meta_ref_missing_key(self): + """Test meta_ref with missing key raises KeyError.""" + cmd = Command(command="echo test", name="test") + + with pytest.raises(KeyError, match="Metadata key 'port' not found"): + cmd.meta_ref("port") + + def test_command_hostname_ref_none(self): + """Test hostname_ref returns localhost when het_group_index is None.""" + cmd = Command(command="echo test", name="test") + assert cmd.het_group_index is None + assert cmd.hostname_ref() == "127.0.0.1" + + def test_command_hostname_ref_heterogeneous(self): + """Test hostname_ref returns SLURM variable when het_group_index is set.""" + cmd = Command(command="echo test", name="test") + cmd.het_group_index = 2 + + hostname = cmd.hostname_ref() + assert "$SLURM_JOB_NODELIST_HET_GROUP_2" in hostname + assert "scontrol" in hostname + + def test_command_with_installation_command(self): + """Test Command with installation_command.""" + cmd = Command(command="python script.py", installation_command="pip install package", name="test") + cluster_config = {"executor": "local", "containers": {}} + + final_cmd, _ = cmd.prepare_for_execution(cluster_config) + + # Installation command should be wrapped around the main command + assert "pip install package" in final_cmd + assert "python script.py" in final_cmd + + def test_command_env_vars_wrapping(self): + """Test that env_vars and working_dir are applied to string commands.""" + cmd = Command( + command="python script.py", + env_vars={"MY_VAR": "value"}, + working_dir="/custom/path", + name="test", + ) + + # The command should be wrapped with env setup + assert "export MY_VAR=value" in cmd.command + assert "cd /custom/path" in cmd.command + + +class TestCommandGroup: + """Test CommandGroup class functionality.""" + + def test_commandgroup_basic(self): + """Test creating a basic CommandGroup.""" + cmd1 = Command(command="echo 1", name="cmd1") + cmd2 = Command(command="echo 2", name="cmd2") + + group = CommandGroup(commands=[cmd1, cmd2], name="test_group") + + assert group.name == "test_group" + assert len(group.components) == 2 + assert group.hardware is not None + + def test_commandgroup_with_hardware(self): + """Test CommandGroup with HardwareConfig.""" + cmd = Command(command="echo test", name="cmd") + hardware = HardwareConfig(partition="batch", time_min="01:00:00", num_gpus=8) + + group = CommandGroup(commands=[cmd], hardware=hardware, name="gpu_group") + + assert group.hardware.partition == "batch" + assert group.hardware.time_min == "01:00:00" + assert group.hardware.num_gpus == 8 + + def test_commandgroup_with_log_dir(self): + """Test CommandGroup with log_dir.""" + cmd = Command(command="echo test", name="cmd") + group = CommandGroup(commands=[cmd], log_dir="/logs/test", name="group") + + assert group.log_dir == "/logs/test" + + +class TestPipeline: + """Test Pipeline class functionality.""" + + def test_pipeline_with_groups_legacy_mode(self): + """Test Pipeline with groups parameter (legacy mode).""" + cmd = Command(command="echo test", name="cmd") + group = CommandGroup(commands=[cmd], name="group") + + pipeline = Pipeline(name="test_pipeline", cluster="local", groups=[group]) + + assert pipeline.name == "test_pipeline" + assert pipeline._legacy_mode is True + assert len(pipeline.jobs) == 1 + assert "group" in pipeline.jobs[0] + + def test_pipeline_with_jobs(self): + """Test Pipeline with jobs parameter.""" + cmd1 = Command(command="echo 1", name="cmd1") + group1 = CommandGroup(commands=[cmd1], name="group1", log_dir="/logs") + + cmd2 = Command(command="echo 2", name="cmd2") + group2 = CommandGroup(commands=[cmd2], name="group2", log_dir="/logs") + + jobs = [ + {"name": "job1", "group": group1}, + {"name": "job2", "group": group2, "dependencies": ["job1"]}, + ] + + pipeline = Pipeline(name="test_pipeline", cluster="local", jobs=jobs) + + assert pipeline.name == "test_pipeline" + assert pipeline._legacy_mode is False + assert len(pipeline.jobs) == 2 + + def test_pipeline_cannot_specify_both_groups_and_jobs(self): + """Test that Pipeline raises error when both groups and jobs are specified.""" + cmd = Command(command="echo test", name="cmd") + group = CommandGroup(commands=[cmd], name="group") + jobs = [{"name": "job", "group": group}] + + with pytest.raises(ValueError, match="Cannot specify both 'groups' and 'jobs'"): + Pipeline(name="test", cluster="local", groups=[group], jobs=jobs) + + def test_pipeline_must_specify_either_groups_or_jobs(self): + """Test that Pipeline raises error when neither groups nor jobs are specified.""" + with pytest.raises(ValueError, match="Must specify either 'groups' or 'jobs'"): + Pipeline(name="test", cluster="local") + + def test_pipeline_with_run_after(self): + """Test Pipeline with run_after parameter.""" + cmd = Command(command="echo test", name="cmd") + group = CommandGroup(commands=[cmd], name="group") + + pipeline = Pipeline(name="test", cluster="local", groups=[group], run_after="other_exp") + + assert pipeline.run_after == "other_exp" + + def test_pipeline_with_run_after_list(self): + """Test Pipeline with run_after as list.""" + cmd = Command(command="echo test", name="cmd") + group = CommandGroup(commands=[cmd], name="group") + + pipeline = Pipeline(name="test", cluster="local", groups=[group], run_after=["exp1", "exp2"]) + + assert pipeline.run_after == ["exp1", "exp2"] + + @patch("nemo_skills.pipeline.utils.declarative.get_cluster_config") + def test_pipeline_get_cluster_config(self, mock_get_config): + """Test _get_cluster_config method.""" + mock_config = {"executor": "local", "containers": {}} + mock_get_config.return_value = mock_config + + cmd = Command(command="echo test", name="cmd") + group = CommandGroup(commands=[cmd], name="group") + pipeline = Pipeline(name="test", cluster="local", groups=[group]) + + config = pipeline._get_cluster_config() + + assert config == mock_config + mock_get_config.assert_called_once_with("local") + + @patch("nemo_skills.pipeline.utils.declarative.get_cluster_config") + def test_pipeline_missing_cluster_config(self, mock_get_config): + """Test that Pipeline raises error when cluster is not specified.""" + cmd = Command(command="echo test", name="cmd") + group = CommandGroup(commands=[cmd], name="group") + pipeline = Pipeline(name="test", groups=[group]) # No cluster specified + + with pytest.raises(ValueError, match="Must specify cluster"): + pipeline._get_cluster_config() + + +class TestPipelineExecution: + """Test Pipeline execution and job management.""" + + @patch("nemo_skills.pipeline.utils.declarative.get_exp") + @patch("nemo_skills.pipeline.utils.declarative.get_cluster_config") + @patch("nemo_skills.pipeline.utils.declarative.get_env_variables") + @patch("nemo_skills.pipeline.utils.declarative.run_exp") + def test_pipeline_run_basic(self, mock_run_exp, mock_env_vars, mock_get_config, mock_get_exp): + """Test basic pipeline execution.""" + # Setup mocks + mock_config = { + "executor": "none", + "containers": {"nemo-skills": "container:latest"}, + } + mock_get_config.return_value = mock_config + mock_env_vars.return_value = {"HF_HOME": "/hf"} + + mock_exp = MagicMock() + mock_exp.add.return_value = "task_handle_1" + mock_get_exp.return_value.__enter__.return_value = mock_exp + + # Create pipeline + cmd = Command(command="echo test", name="cmd") + group = CommandGroup(commands=[cmd], name="group", log_dir="/logs") + pipeline = Pipeline(name="test", cluster="local", groups=[group], skip_hf_home_check=True) + + # Run pipeline + result = pipeline.run(cluster_config=mock_config, dry_run=True) + + # Verify + assert result == mock_exp + mock_exp.add.assert_called_once() + + @patch("nemo_skills.pipeline.utils.declarative.get_exp") + @patch("nemo_skills.pipeline.utils.declarative.get_cluster_config") + @patch("nemo_skills.pipeline.utils.declarative.get_env_variables") + @patch("nemo_skills.pipeline.utils.declarative.run_exp") + def test_pipeline_run_with_dependencies(self, mock_run_exp, mock_env_vars, mock_get_config, mock_get_exp): + """Test pipeline execution with job dependencies.""" + # Setup mocks + mock_config = { + "executor": "none", + "containers": {"nemo-skills": "container:latest"}, + } + mock_get_config.return_value = mock_config + mock_env_vars.return_value = {"HF_HOME": "/hf"} + + mock_exp = MagicMock() + mock_exp.add.side_effect = ["handle_1", "handle_2"] + mock_get_exp.return_value.__enter__.return_value = mock_exp + + # Create pipeline with dependencies + cmd1 = Command(command="echo 1", name="cmd1") + group1 = CommandGroup(commands=[cmd1], name="group1", log_dir="/logs") + + cmd2 = Command(command="echo 2", name="cmd2") + group2 = CommandGroup(commands=[cmd2], name="group2", log_dir="/logs") + + jobs = [ + {"name": "job1", "group": group1}, + {"name": "job2", "group": group2, "dependencies": ["job1"]}, + ] + pipeline = Pipeline(name="test", cluster="local", jobs=jobs, skip_hf_home_check=True) + + # Run pipeline + pipeline.run(cluster_config=mock_config, dry_run=True) + + # Verify both jobs were added + assert mock_exp.add.call_count == 2 + + @patch("nemo_skills.pipeline.utils.declarative.get_exp") + @patch("nemo_skills.pipeline.utils.declarative.get_cluster_config") + @patch("nemo_skills.pipeline.utils.declarative.get_env_variables") + @patch("nemo_skills.pipeline.utils.declarative.is_mounted_filepath") + @patch("nemo_skills.pipeline.utils.declarative.get_executor") + def test_pipeline_hf_home_validation( + self, mock_get_executor, mock_is_mounted, mock_env_vars, mock_get_config, mock_get_exp + ): + """Test HF_HOME validation.""" + mock_config = { + "executor": "slurm", + "containers": {"nemo-skills": "container:latest"}, + "account": "test_account", + } + mock_get_config.return_value = mock_config + mock_env_vars.return_value = {"HF_HOME": "/hf"} + mock_is_mounted.return_value = True + mock_get_executor.return_value = MagicMock() + + mock_exp = MagicMock() + mock_exp.add.return_value = "handle" + mock_get_exp.return_value.__enter__.return_value = mock_exp + + cmd = Command(command="echo test", name="cmd") + group = CommandGroup(commands=[cmd], name="group", log_dir="/logs") + pipeline = Pipeline(name="test", cluster="local", groups=[group]) + + # Should not raise + pipeline.run(cluster_config=mock_config, dry_run=True) + + # Verify executor was created + assert mock_get_executor.called + + @patch("nemo_skills.pipeline.utils.declarative.get_cluster_config") + @patch("nemo_skills.pipeline.utils.declarative.get_env_variables") + def test_pipeline_hf_home_missing(self, mock_env_vars, mock_get_config): + """Test that missing HF_HOME raises error.""" + mock_config = {"executor": "slurm", "containers": {}} + mock_get_config.return_value = mock_config + mock_env_vars.return_value = {} # No HF_HOME + + cmd = Command(command="echo test", name="cmd") + group = CommandGroup(commands=[cmd], name="group", log_dir="/logs") + pipeline = Pipeline(name="test", cluster="local", groups=[group]) + + with pytest.raises(RuntimeError, match="HF_HOME is missing"): + pipeline.run(cluster_config=mock_config, dry_run=True) + + @patch("nemo_skills.pipeline.utils.declarative.get_cluster_config") + @patch("nemo_skills.pipeline.utils.declarative.get_env_variables") + @patch("nemo_skills.pipeline.utils.declarative.is_mounted_filepath") + def test_pipeline_hf_home_not_mounted(self, mock_is_mounted, mock_env_vars, mock_get_config): + """Test that non-mounted HF_HOME raises error.""" + mock_config = {"executor": "slurm", "containers": {}} + mock_get_config.return_value = mock_config + mock_env_vars.return_value = {"HF_HOME": "/hf"} + mock_is_mounted.return_value = False + + cmd = Command(command="echo test", name="cmd") + group = CommandGroup(commands=[cmd], name="group", log_dir="/logs") + pipeline = Pipeline(name="test", cluster="local", groups=[group]) + + with pytest.raises(RuntimeError, match="is not a mounted path"): + pipeline.run(cluster_config=mock_config, dry_run=True) + + +class TestHetGroupIndices: + """Test het_group_index assignment.""" + + @patch("nemo_skills.pipeline.utils.declarative.get_exp") + @patch("nemo_skills.pipeline.utils.declarative.get_env_variables") + def test_het_group_index_non_heterogeneous(self, mock_env_vars, mock_get_exp): + """Test that non-heterogeneous jobs have het_group_index=None.""" + mock_config = { + "executor": "none", + "containers": {"nemo-skills": "container:latest"}, + } + mock_env_vars.return_value = {"HF_HOME": "/hf"} + + mock_exp = MagicMock() + mock_exp.add.return_value = "handle" + mock_get_exp.return_value.__enter__.return_value = mock_exp + + # Create single-group job with multiple components + cmd1 = Command(command="echo 1", name="cmd1") + cmd2 = Command(command="echo 2", name="cmd2") + group = CommandGroup(commands=[cmd1, cmd2], name="group", log_dir="/logs") + + pipeline = Pipeline(name="test", cluster="local", groups=[group], skip_hf_home_check=True) + pipeline.run(cluster_config=mock_config, dry_run=True) + + # Both commands should have None het_group_index (localhost communication) + assert cmd1.het_group_index is None + assert cmd2.het_group_index is None + assert cmd1.hostname_ref() == "127.0.0.1" + assert cmd2.hostname_ref() == "127.0.0.1" + + @patch("nemo_skills.pipeline.utils.declarative.get_exp") + @patch("nemo_skills.pipeline.utils.declarative.get_env_variables") + def test_het_group_index_heterogeneous(self, mock_env_vars, mock_get_exp): + """Test that heterogeneous jobs get per-job het_group_index.""" + mock_config = { + "executor": "none", + "containers": {"nemo-skills": "container:latest"}, + } + mock_env_vars.return_value = {"HF_HOME": "/hf"} + + mock_exp = MagicMock() + mock_exp.add.return_value = "handle" + mock_get_exp.return_value.__enter__.return_value = mock_exp + + # Create multi-group heterogeneous job + cmd1 = Command(command="echo 1", name="cmd1") + group1 = CommandGroup(commands=[cmd1], name="group1", log_dir="/logs") + + cmd2 = Command(command="echo 2", name="cmd2") + group2 = CommandGroup(commands=[cmd2], name="group2", log_dir="/logs") + + jobs = [{"name": "hetjob", "groups": [group1, group2]}] + pipeline = Pipeline(name="test", cluster="local", jobs=jobs, skip_hf_home_check=True) + pipeline.run(cluster_config=mock_config, dry_run=True) + + # Commands should have het_group_index 0 and 1 + assert cmd1.het_group_index == 0 + assert cmd2.het_group_index == 1 + assert "$SLURM_JOB_NODELIST_HET_GROUP_0" in cmd1.hostname_ref() + assert "$SLURM_JOB_NODELIST_HET_GROUP_1" in cmd2.hostname_ref() + + @patch("nemo_skills.pipeline.utils.declarative.get_exp") + @patch("nemo_skills.pipeline.utils.declarative.get_env_variables") + def test_het_group_index_per_job_not_global(self, mock_env_vars, mock_get_exp): + """Test that het_group_index is per-job, not global across pipeline.""" + mock_config = { + "executor": "none", + "containers": {"nemo-skills": "container:latest"}, + } + mock_env_vars.return_value = {"HF_HOME": "/hf"} + + mock_exp = MagicMock() + mock_exp.add.side_effect = ["handle1", "handle2"] + mock_get_exp.return_value.__enter__.return_value = mock_exp + + # Create two separate heterogeneous jobs + cmd1 = Command(command="echo 1", name="cmd1") + group1 = CommandGroup(commands=[cmd1], name="group1", log_dir="/logs") + + cmd2 = Command(command="echo 2", name="cmd2") + group2 = CommandGroup(commands=[cmd2], name="group2", log_dir="/logs") + + cmd3 = Command(command="echo 3", name="cmd3") + group3 = CommandGroup(commands=[cmd3], name="group3", log_dir="/logs") + + cmd4 = Command(command="echo 4", name="cmd4") + group4 = CommandGroup(commands=[cmd4], name="group4", log_dir="/logs") + + jobs = [ + {"name": "hetjob1", "groups": [group1, group2]}, + {"name": "hetjob2", "groups": [group3, group4]}, + ] + pipeline = Pipeline(name="test", cluster="local", jobs=jobs, skip_hf_home_check=True) + pipeline.run(cluster_config=mock_config, dry_run=True) + + # Both jobs should have het_group_index starting from 0 + assert cmd1.het_group_index == 0 + assert cmd2.het_group_index == 1 + assert cmd3.het_group_index == 0 # Starts from 0 again! + assert cmd4.het_group_index == 1 + + +class TestDependencyResolution: + """Test dependency resolution in Pipeline.""" + + @patch("nemo_skills.pipeline.utils.declarative.get_exp") + @patch("nemo_skills.pipeline.utils.declarative.get_env_variables") + def test_dependency_none_handling(self, mock_env_vars, mock_get_exp): + """Test that explicit None dependencies are handled correctly.""" + mock_config = { + "executor": "none", + "containers": {"nemo-skills": "container:latest"}, + } + mock_env_vars.return_value = {"HF_HOME": "/hf"} + + mock_exp = MagicMock() + mock_exp.add.return_value = "handle" + mock_get_exp.return_value.__enter__.return_value = mock_exp + + cmd = Command(command="echo test", name="cmd") + group = CommandGroup(commands=[cmd], name="group", log_dir="/logs") + + jobs = [{"name": "job", "group": group, "dependencies": None}] + pipeline = Pipeline(name="test", cluster="local", jobs=jobs, skip_hf_home_check=True) + + # Should not raise + pipeline.run(cluster_config=mock_config, dry_run=True) + + @patch("nemo_skills.pipeline.utils.declarative.get_exp") + @patch("nemo_skills.pipeline.utils.declarative.get_env_variables") + def test_pipeline_run_after_applies_to_jobs(self, mock_env_vars, mock_get_exp): + """Test that pipeline-level run_after applies to jobs without dependencies.""" + mock_config = { + "executor": "none", + "containers": {"nemo-skills": "container:latest"}, + } + mock_env_vars.return_value = {"HF_HOME": "/hf"} + + mock_exp = MagicMock() + mock_exp.add.return_value = "handle" + mock_get_exp.return_value.__enter__.return_value = mock_exp + + cmd = Command(command="echo test", name="cmd") + group = CommandGroup(commands=[cmd], name="group", log_dir="/logs") + + pipeline = Pipeline( + name="test", cluster="local", groups=[group], run_after="other_exp", skip_hf_home_check=True + ) + + # Should not raise and should apply run_after + pipeline.run(cluster_config=mock_config, dry_run=True) + + +class TestErrorHandling: + """Test error handling in Pipeline.""" + + def test_pipeline_job_missing_group_or_groups(self): + """Test that job spec without group or groups raises error.""" + jobs = [{"name": "bad_job"}] # Missing 'group' or 'groups' + + with pytest.raises(ValueError, match="must have either 'group' or 'groups'"): + pipeline = Pipeline(name="test", cluster="local", jobs=jobs) + mock_config = {"executor": "none", "containers": {}} + pipeline.run(cluster_config=mock_config, dry_run=True) + + def test_commandgroup_missing_log_dir(self): + """Test that CommandGroup without log_dir raises error during execution.""" + cmd = Command(command="echo test", name="cmd") + group = CommandGroup(commands=[cmd], name="group") # No log_dir + + pipeline = Pipeline(name="test", cluster="local", groups=[group]) + mock_config = {"executor": "none", "containers": {}} + + with pytest.raises(ValueError, match="must have log_dir set"): + pipeline.run(cluster_config=mock_config, dry_run=True) + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) From 7005d4e0e64b3083797ac2cb53a01c0ad1fbd14d Mon Sep 17 00:00:00 2001 From: George Armstrong Date: Fri, 3 Oct 2025 14:31:51 -0700 Subject: [PATCH 06/29] FIX _resuse_exp behavior Signed-off-by: George Armstrong --- nemo_skills/pipeline/generate.py | 8 ++--- nemo_skills/pipeline/utils/declarative.py | 43 +++++++++++++++++------ 2 files changed, 35 insertions(+), 16 deletions(-) diff --git a/nemo_skills/pipeline/generate.py b/nemo_skills/pipeline/generate.py index 959635f8bc..180fe4d22f 100644 --- a/nemo_skills/pipeline/generate.py +++ b/nemo_skills/pipeline/generate.py @@ -469,11 +469,6 @@ def generate( if not jobs: return None - # Handle _reuse_exp case - if _reuse_exp: - # For internal use: return job names for dependency tracking - return all_job_names - # Create and run pipeline pipeline = Pipeline( name=expname, @@ -484,7 +479,8 @@ def generate( skip_hf_home_check=skip_hf_home_check, ) - result = pipeline.run(cluster_config=cluster_config, dry_run=dry_run) + # Pass _reuse_exp to pipeline.run() to add jobs to existing experiment + result = pipeline.run(cluster_config=cluster_config, dry_run=dry_run, _reuse_exp=_reuse_exp) return result diff --git a/nemo_skills/pipeline/utils/declarative.py b/nemo_skills/pipeline/utils/declarative.py index 8c604c4655..7e55ac8d69 100644 --- a/nemo_skills/pipeline/utils/declarative.py +++ b/nemo_skills/pipeline/utils/declarative.py @@ -336,6 +336,7 @@ def run( cluster: Optional[str] = None, dry_run: bool = False, log_dir: Optional[str] = None, + _reuse_exp=None, ): """Execute the pipeline by calling NeMo-Run directly. @@ -344,6 +345,7 @@ def run( cluster: Cluster name to load config from (optional if cluster_config provided) dry_run: If True, validate without executing log_dir: Default log directory for groups that don't specify one (optional) + _reuse_exp: Internal - reuse existing experiment object (for eval.py integration) """ if not self.jobs: LOG.info("No jobs to execute") @@ -372,7 +374,7 @@ def run( # Track job name -> task handle for dependency resolution job_name_to_handle = {} - with get_exp(self.name, final_cluster_config) as exp: + with get_exp(self.name, final_cluster_config, _reuse_exp) as exp: # Process each job in order for job_spec in self.jobs: job_name = job_spec.get("name", "unnamed") @@ -393,23 +395,39 @@ def run( for dep in job_dependencies: if isinstance(dep, str): - # String dependency - look up by name + # String dependency - could be job name, task handle, or experiment name if dep in job_name_to_handle: # Internal pipeline dependency - add the handle run_after_deps.append(job_name_to_handle[dep]) LOG.info(f"Job '{job_name}' depends on '{dep}' (handle: {job_name_to_handle[dep]})") else: - # External experiment name - will be resolved by get_exp_handles below + # Could be external experiment name OR task handle from _reuse_exp case + # Try to get as experiment name first if final_cluster_config["executor"] == "slurm": exp_handles = get_exp_handles(dep) if len(exp_handles) == 0: LOG.warning( f"No pending or running tasks found for experiment {dep}, cannot set dependencies." ) - run_after_deps.extend(exp_handles) - LOG.info( - f"Job '{job_name}' depends on external experiment '{dep}' ({len(exp_handles)} tasks)" - ) + # If no experiment found, treat as direct task handle (for _reuse_exp case) + if _reuse_exp: + run_after_deps.append(dep) + LOG.info( + f"Job '{job_name}' depends on task handle '{dep}' (from reused experiment)" + ) + else: + run_after_deps.extend(exp_handles) + LOG.info( + f"Job '{job_name}' depends on external experiment '{dep}' ({len(exp_handles)} tasks)" + ) + elif _reuse_exp: + # For non-SLURM executors with _reuse_exp, treat as task handle + run_after_deps.append(dep) + LOG.info(f"Job '{job_name}' depends on task handle '{dep}'") + else: + # Direct task handle object (not string) + run_after_deps.append(dep) + LOG.info(f"Job '{job_name}' depends on task handle (object)") # Convert empty list to None for cleaner handling if len(run_after_deps) == 0: @@ -451,7 +469,8 @@ def run( job_name_to_handle[job_name] = task_handle LOG.info(f"Added job '{job_name}' with task_handle={task_handle}") - if not dry_run: + # Only run if not using existing experiment (matching generate_v0.py line 331) + if not dry_run and not _reuse_exp: run_exp(exp, final_cluster_config) # Cache experiment for code reuse in future runs @@ -462,6 +481,10 @@ def run( REUSE_CODE_EXP[cur_tunnel_hash] = exp LOG.info("Cached experiment for future code reuse") + # When reusing experiment, return list of task handles (matching generate_v0.py line 335) + if _reuse_exp: + return list(job_name_to_handle.values()) + return exp def _prepare_command(self, command, cluster_config: Dict) -> Tuple[str, Dict]: @@ -539,8 +562,8 @@ def _plan_and_add_job( differences are controlled by the 'heterogeneous' flag and the provided 'groups'. """ - # Dependencies are only passed through for SLURM - dependencies = run_after if (run_after and cluster_config["executor"] == "slurm") else None + # Pass dependencies through (nemo-run handles executor-specific behavior) + dependencies = run_after if run_after else None # Resolve log directory (use first group's log_dir if present) log_dir = groups[0].log_dir or default_log_dir From c6157786d776f1b6ad9d7984ca6d0644a5ee034a Mon Sep 17 00:00:00 2001 From: George Armstrong Date: Fri, 3 Oct 2025 14:36:46 -0700 Subject: [PATCH 07/29] Update nemo_skills/pipeline/generate.py Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> Signed-off-by: George Armstrong --- nemo_skills/pipeline/generate.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/nemo_skills/pipeline/generate.py b/nemo_skills/pipeline/generate.py index 180fe4d22f..26a0953e2a 100644 --- a/nemo_skills/pipeline/generate.py +++ b/nemo_skills/pipeline/generate.py @@ -110,12 +110,17 @@ def make_sandbox_cmd(cfg): cmd_builder, initial_metadata = sandbox_command(port=sandbox_port, keep_mounts=keep_mounts_for_sandbox) # Call the builder to get the actual command string cmd_string, runtime_metadata = cmd_builder(cfg) - # Merge metadata + # Merge metadata (deep-merge environment) metadata = initial_metadata.copy() - metadata.update(runtime_metadata) + if "environment" in runtime_metadata: + env = metadata.get("environment", {}).copy() + env.update(runtime_metadata["environment"]) + metadata["environment"] = env + for k, v in runtime_metadata.items(): + if k != "environment": + metadata[k] = v metadata["log_prefix"] = "sandbox" return (cmd_string, metadata) - sandbox_cmd = Command( command=make_sandbox_cmd, container=cluster_config["containers"]["sandbox"], From 62948b1d6441bf905c9aa3bcf9504704a9ce36ba Mon Sep 17 00:00:00 2001 From: George Armstrong Date: Fri, 3 Oct 2025 14:41:15 -0700 Subject: [PATCH 08/29] lint Signed-off-by: George Armstrong --- nemo_skills/pipeline/generate.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/nemo_skills/pipeline/generate.py b/nemo_skills/pipeline/generate.py index 26a0953e2a..bebee08c32 100644 --- a/nemo_skills/pipeline/generate.py +++ b/nemo_skills/pipeline/generate.py @@ -121,11 +121,13 @@ def make_sandbox_cmd(cfg): metadata[k] = v metadata["log_prefix"] = "sandbox" return (cmd_string, metadata) + sandbox_cmd = Command( command=make_sandbox_cmd, container=cluster_config["containers"]["sandbox"], name=task_name, ) + components.append(sandbox_cmd) # Find maximum GPUs/nodes needed by any component for the HardwareConfig From b1ef852f2772d308a212b22e9c44280f935cae13 Mon Sep 17 00:00:00 2001 From: George Armstrong Date: Mon, 6 Oct 2025 15:01:24 -0700 Subject: [PATCH 09/29] MAINT remove lambdas dependent on cluster_cfg Signed-off-by: George Armstrong --- nemo_skills/pipeline/generate.py | 51 ++++++---------- nemo_skills/pipeline/utils/commands.py | 73 ++++++++++------------- nemo_skills/pipeline/utils/declarative.py | 36 +++++++---- 3 files changed, 76 insertions(+), 84 deletions(-) diff --git a/nemo_skills/pipeline/generate.py b/nemo_skills/pipeline/generate.py index bebee08c32..3d7b0ea15c 100644 --- a/nemo_skills/pipeline/generate.py +++ b/nemo_skills/pipeline/generate.py @@ -68,27 +68,25 @@ def _create_commandgroup_from_config( server_type = server_config["server_type"] server_container = server_config.pop("container", cluster_config["containers"][server_type]) - # Create server command builder that defers execution until cluster_config is available - server_config_copy = server_config.copy() - - def make_server_cmd(cfg): - cmd, num_tasks = get_server_command_fn(**server_config_copy, cluster_config=cfg) - return ( - cmd, - { - "num_tasks": num_tasks, - "gpus": server_config_copy["num_gpus"], - "nodes": server_config_copy["num_nodes"], - "log_prefix": "server", - }, - ) + # Call server command builder directly with cluster_config + cmd, metadata = get_server_command_fn(**server_config, cluster_config=cluster_config) + + # Add additional metadata + metadata.update( + { + "gpus": server_config["num_gpus"], + "nodes": server_config["num_nodes"], + "log_prefix": "server", + } + ) server_cmd = Command( - command=make_server_cmd, + command=cmd, container=server_container, gpus=server_config["num_gpus"], nodes=server_config["num_nodes"], name=task_name, + metadata=metadata, ) components.append(server_cmd) @@ -104,28 +102,15 @@ def make_server_cmd(cfg): # 3. Add sandbox if requested if with_sandbox: - - def make_sandbox_cmd(cfg): - # sandbox_command returns (callable, metadata), so we need to call the callable - cmd_builder, initial_metadata = sandbox_command(port=sandbox_port, keep_mounts=keep_mounts_for_sandbox) - # Call the builder to get the actual command string - cmd_string, runtime_metadata = cmd_builder(cfg) - # Merge metadata (deep-merge environment) - metadata = initial_metadata.copy() - if "environment" in runtime_metadata: - env = metadata.get("environment", {}).copy() - env.update(runtime_metadata["environment"]) - metadata["environment"] = env - for k, v in runtime_metadata.items(): - if k != "environment": - metadata[k] = v - metadata["log_prefix"] = "sandbox" - return (cmd_string, metadata) + # Call sandbox command builder directly with cluster_config + cmd, metadata = sandbox_command(cluster_config=cluster_config, port=sandbox_port) + metadata["log_prefix"] = "sandbox" sandbox_cmd = Command( - command=make_sandbox_cmd, + command=cmd, container=cluster_config["containers"]["sandbox"], name=task_name, + metadata=metadata, ) components.append(sandbox_cmd) diff --git a/nemo_skills/pipeline/utils/commands.py b/nemo_skills/pipeline/utils/commands.py index f2402c590b..12413a5677 100644 --- a/nemo_skills/pipeline/utils/commands.py +++ b/nemo_skills/pipeline/utils/commands.py @@ -19,13 +19,14 @@ in server.py and other modules, designed to work with Command objects. """ -from typing import Callable, Dict, Optional, Tuple +from typing import Dict, Optional, Tuple from nemo_skills.pipeline.utils.exp import get_sandbox_command from nemo_skills.pipeline.utils.server import get_free_port, get_server_command def vllm_server_command( + cluster_config: Dict, model: str, port: Optional[int] = None, server_type: str = "vllm", @@ -34,12 +35,11 @@ def vllm_server_command( args: str = "", entrypoint: Optional[str] = None, **kwargs, -) -> Tuple[Callable, Dict]: +) -> Tuple[str, Dict]: """Build vLLM server command. - Returns a lambda that will build the command when cluster_config is available. - Args: + cluster_config: Cluster configuration dictionary model: Model path or name port: Port to use (if None, will use get_free_port) server_type: Type of server (vllm, sglang, trtllm, megatron) @@ -49,72 +49,65 @@ def vllm_server_command( entrypoint: Custom entrypoint script Returns: - Tuple of (lambda, metadata_dict) + Tuple of (command_string, metadata_dict) """ if port is None: port = get_free_port(strategy="random") + cmd, num_tasks = get_server_command( + server_type=server_type, + num_gpus=gpus, + num_nodes=nodes, + model_path=model, + cluster_config=cluster_config, + server_port=port, + server_args=args, + server_entrypoint=entrypoint, + ) + metadata = { "port": port, "log_prefix": "server", + "num_tasks": num_tasks, } - # Return lambda that will call get_server_command when cluster_config is available - def server_cmd_builder(cfg): - cmd, num_tasks = get_server_command( - server_type=server_type, - num_gpus=gpus, - num_nodes=nodes, - model_path=model, - cluster_config=cfg, - server_port=port, - server_args=args, - server_entrypoint=entrypoint, - ) - return (cmd, {"num_tasks": num_tasks}) - - return server_cmd_builder, metadata + return cmd, metadata -def sandbox_command(port: Optional[int] = None, keep_mounts: bool = False, **kwargs) -> Tuple[Callable, Dict]: +def sandbox_command(cluster_config: Dict, port: Optional[int] = None, **kwargs) -> Tuple[str, Dict]: """Build sandbox command. - Returns a lambda that will build the command when cluster_config is available. - Args: + cluster_config: Cluster configuration dictionary port: Port to use for sandbox - keep_mounts: If True, keep mounts from cluster config. If False, use empty mounts for safety. Returns: - Tuple of (lambda, metadata_dict) + Tuple of (command_string, metadata_dict) """ if port is None: port = get_free_port(strategy="random") + cmd = get_sandbox_command(cluster_config) + + # Build PYTHONPATH from cluster config + pythonpath_env = {} + for env_var in cluster_config.get("env_vars", []): + if "PYTHONPATH" in env_var: + pythonpath = env_var[11:] if env_var.startswith("PYTHONPATH=") else env_var + pythonpath_env["PYTHONPATH"] = pythonpath + ":/app" + break + metadata = { "port": port, "log_prefix": "sandbox", - "mounts": None if keep_mounts else [], # None means use cluster config mounts, [] means no mounts "environment": { "LISTEN_PORT": str(port), "NGINX_PORT": str(port), + **pythonpath_env, }, } - # Return lambda that will be evaluated when cluster_config is available - def sandbox_cmd_with_env(cfg): - cmd = get_sandbox_command(cfg) - # Build PYTHONPATH from cluster config - pythonpath_env = {} - for env_var in cfg.get("env_vars", []): - if "PYTHONPATH" in env_var: - pythonpath = env_var[11:] if env_var.startswith("PYTHONPATH=") else env_var - pythonpath_env["PYTHONPATH"] = pythonpath + ":/app" - break - # Return command + additional metadata - return (cmd, {"environment": pythonpath_env}) - - return sandbox_cmd_with_env, metadata + return cmd, metadata def wrap_command(command: str, working_dir: str = "/nemo_run/code", env_vars: Optional[Dict[str, str]] = None) -> str: diff --git a/nemo_skills/pipeline/utils/declarative.py b/nemo_skills/pipeline/utils/declarative.py index 7e55ac8d69..9560f418d3 100644 --- a/nemo_skills/pipeline/utils/declarative.py +++ b/nemo_skills/pipeline/utils/declarative.py @@ -20,8 +20,15 @@ from nemo_skills.pipeline.utils.declarative import Command, CommandGroup, HardwareConfig, Pipeline # Commands that run together in one SLURM job - server = Command(command=vllm_server_command(model="Qwen/Qwen3-8B"), gpus=8, name="server") - sandbox = Command(command=sandbox_command(), name="sandbox") + # Note: Lambdas are needed for cross-component references (hostname_ref, meta_ref) + # which aren't resolved until het_group_index is assigned at pipeline execution time. + server = Command( + command=vllm_server_command(cluster_cfg, model="Qwen/Qwen3-8B"), + gpus=8, + name="server" + ) + sandbox = Command(command=sandbox_command(cluster_cfg), name="sandbox") + # This lambda is ESSENTIAL - server.hostname_ref() and meta_ref() aren't available until runtime client = Command( command=lambda: f"curl {server.hostname_ref()}:{server.meta_ref('port')}/health", name="client" @@ -58,12 +65,20 @@ ) # Job 2: Two different model servers (HETEROGENEOUS SLURM job with 2 het components) - server_8b = Command(command=vllm_server_command(model="Qwen/Qwen3-8B"), gpus=8, name="server_8b") - sandbox_8b = Command(command=sandbox_command(), name="sandbox_8b") + server_8b = Command( + command=lambda cfg: vllm_server_command(cfg, model="Qwen/Qwen3-8B"), + gpus=8, + name="server_8b" + ) + sandbox_8b = Command(command=lambda cfg: sandbox_command(cfg), name="sandbox_8b") eval_8b = Command(command="python eval.py --model 8b", gpus=1, name="eval_8b") - server_32b = Command(command=vllm_server_command(model="Qwen/Qwen3-32B"), gpus=8, name="server_32b") - sandbox_32b = Command(command=sandbox_command(), name="sandbox_32b") + server_32b = Command( + command=lambda cfg: vllm_server_command(cfg, model="Qwen/Qwen3-32B"), + gpus=8, + name="server_32b" + ) + sandbox_32b = Command(command=lambda cfg: sandbox_command(cfg), name="sandbox_32b") eval_32b = Command(command="python eval.py --model 32b", gpus=1, name="eval_32b") group_8b = CommandGroup(commands=[server_8b, sandbox_8b, eval_8b], name="eval_8b", log_dir=log_dir) @@ -130,11 +145,10 @@ class Command: - A tuple (command, metadata): command with metadata like port - A callable returning (command, metadata): lazy evaluation with metadata - Using a lambda allows references to work correctly in heterogeneous jobs: - Command(command=lambda: f"curl {server.hostname_ref()}:5000") - - Metadata from command builders (like port) can be referenced: - server = Command(command=vllm_server_command(...)) + Lambdas are needed for cross-component references (hostname_ref, meta_ref). + The het_group_index isn't assigned until pipeline execution, so these must be lazy: + server = Command(command=lambda cfg: vllm_server_command(cfg, ...)) + # This lambda is ESSENTIAL - server.hostname_ref() and meta_ref() don't exist yet client = Command(command=lambda: f"curl {server.hostname_ref()}:{server.meta_ref('port')}") """ From fd7e3c66c6f1167beb9458963149fab388cec716 Mon Sep 17 00:00:00 2001 From: George Armstrong Date: Mon, 6 Oct 2025 15:18:35 -0700 Subject: [PATCH 10/29] MAINT simplify command typing Signed-off-by: George Armstrong --- nemo_skills/pipeline/utils/declarative.py | 23 +++-------------------- tests/test_declarative_pipeline.py | 12 ++++++------ 2 files changed, 9 insertions(+), 26 deletions(-) diff --git a/nemo_skills/pipeline/utils/declarative.py b/nemo_skills/pipeline/utils/declarative.py index 9560f418d3..9db837bf9d 100644 --- a/nemo_skills/pipeline/utils/declarative.py +++ b/nemo_skills/pipeline/utils/declarative.py @@ -142,8 +142,6 @@ class Command: The command can be either: - A string: evaluated immediately - A callable (lambda): evaluated lazily when the task is prepared - - A tuple (command, metadata): command with metadata like port - - A callable returning (command, metadata): lazy evaluation with metadata Lambdas are needed for cross-component references (hostname_ref, meta_ref). The het_group_index isn't assigned until pipeline execution, so these must be lazy: @@ -152,14 +150,9 @@ class Command: client = Command(command=lambda: f"curl {server.hostname_ref()}:{server.meta_ref('port')}") """ - command: Union[ - str, - Callable[[], str], # Lambda for cross-group refs - Callable[[Dict], str], # Lambda needing cluster_config (e.g., sandbox) - Tuple[Optional[str], Dict], # Command builder result - Callable[[], Tuple[Optional[str], Dict]], # Lambda returning command builder result - None, - ] + # Command can be a string or callable (lambda). + # Lambdas are primarily used for cross-component references (hostname_ref, meta_ref). + command: Union[str, Callable] container: str = "nemo-skills" gpus: Optional[int] = None nodes: int = 1 @@ -176,20 +169,10 @@ def __post_init__(self): if self.name is None: self.name = "command" - # Extract metadata if command is a tuple - self._extract_metadata() - # Wrap plain strings with environment setup if isinstance(self.command, str) and (self.env_vars or self.working_dir): self.command = wrap_command(self.command, self.working_dir, self.env_vars) - def _extract_metadata(self): - """Extract metadata from command if it's a tuple.""" - if not callable(self.command): - if isinstance(self.command, tuple): - cmd, self.metadata = self.command - self.command = cmd # Can be None for sandbox - def hostname_ref(self) -> str: """Get hostname reference for hetjob cross-component communication.""" if self.het_group_index is None: diff --git a/tests/test_declarative_pipeline.py b/tests/test_declarative_pipeline.py index 6af115675f..cfe36c9f96 100644 --- a/tests/test_declarative_pipeline.py +++ b/tests/test_declarative_pipeline.py @@ -33,8 +33,8 @@ def test_command_basic_string(self): assert cmd.nodes == 1 def test_command_with_metadata(self): - """Test Command with metadata tuple.""" - cmd = Command(command=("echo hello", {"port": 8080, "log_prefix": "server"}), name="server") + """Test Command with metadata passed separately.""" + cmd = Command(command="echo hello", name="server", metadata={"port": 8080, "log_prefix": "server"}) assert cmd.metadata["port"] == 8080 assert cmd.metadata["log_prefix"] == "server" # Command gets wrapped with working_dir by default @@ -43,7 +43,7 @@ def test_command_with_metadata(self): def test_command_with_callable(self): """Test Command with callable that returns tuple.""" - def make_cmd(): + def make_cmd(cfg): return ("echo world", {"port": 5000}) cmd = Command(command=make_cmd, name="dynamic") @@ -65,7 +65,7 @@ def test_command_prepare_for_execution_string(self): def test_command_prepare_for_execution_callable(self): """Test prepare_for_execution with callable command.""" - def make_cmd(): + def make_cmd(cfg): return "echo test" cmd = Command(command=make_cmd, name="test") @@ -78,7 +78,7 @@ def make_cmd(): def test_command_prepare_for_execution_callable_with_metadata(self): """Test prepare_for_execution with callable returning tuple.""" - def make_cmd(): + def make_cmd(cfg): return ("echo metadata", {"num_tasks": 4, "environment": {"VAR": "value"}}) cmd = Command(command=make_cmd, name="test") @@ -105,7 +105,7 @@ def make_cmd(cfg): def test_command_meta_ref(self): """Test meta_ref for accessing metadata.""" - cmd = Command(command=("echo test", {"port": 8080, "host": "localhost"}), name="server") + cmd = Command(command="echo test", name="server", metadata={"port": 8080, "host": "localhost"}) assert cmd.meta_ref("port") == "8080" assert cmd.meta_ref("host") == "localhost" From 3629e096569aa5d036c696be906d9db2b10ad1b7 Mon Sep 17 00:00:00 2001 From: George Armstrong Date: Mon, 6 Oct 2025 15:21:04 -0700 Subject: [PATCH 11/29] set default name Signed-off-by: George Armstrong --- nemo_skills/pipeline/utils/declarative.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/nemo_skills/pipeline/utils/declarative.py b/nemo_skills/pipeline/utils/declarative.py index 9db837bf9d..1d3fd5a7c9 100644 --- a/nemo_skills/pipeline/utils/declarative.py +++ b/nemo_skills/pipeline/utils/declarative.py @@ -156,7 +156,7 @@ class Command: container: str = "nemo-skills" gpus: Optional[int] = None nodes: int = 1 - name: Optional[str] = None + name: str = "command" working_dir: str = "/nemo_run/code" env_vars: Dict[str, str] = field(default_factory=dict) installation_command: Optional[str] = None @@ -165,10 +165,6 @@ class Command: het_group_index: Optional[int] = None # Set per-job by Pipeline (not global) def __post_init__(self): - # Initialize defaults - if self.name is None: - self.name = "command" - # Wrap plain strings with environment setup if isinstance(self.command, str) and (self.env_vars or self.working_dir): self.command = wrap_command(self.command, self.working_dir, self.env_vars) From f286644e3bd591f3691487398cc8ebc804b4bfa3 Mon Sep 17 00:00:00 2001 From: George Armstrong Date: Mon, 6 Oct 2025 15:25:15 -0700 Subject: [PATCH 12/29] remove cluster config lambda resolution Signed-off-by: George Armstrong --- nemo_skills/pipeline/utils/declarative.py | 45 ++++++++--------------- tests/test_declarative_pipeline.py | 19 ++-------- 2 files changed, 19 insertions(+), 45 deletions(-) diff --git a/nemo_skills/pipeline/utils/declarative.py b/nemo_skills/pipeline/utils/declarative.py index 1d3fd5a7c9..18b25e06c8 100644 --- a/nemo_skills/pipeline/utils/declarative.py +++ b/nemo_skills/pipeline/utils/declarative.py @@ -65,20 +65,18 @@ ) # Job 2: Two different model servers (HETEROGENEOUS SLURM job with 2 het components) - server_8b = Command( - command=lambda cfg: vllm_server_command(cfg, model="Qwen/Qwen3-8B"), - gpus=8, - name="server_8b" - ) - sandbox_8b = Command(command=lambda cfg: sandbox_command(cfg), name="sandbox_8b") + # Build commands with cluster_config + server_8b_cmd, server_8b_meta = vllm_server_command(cluster_config, model="Qwen/Qwen3-8B") + sandbox_8b_cmd, sandbox_8b_meta = sandbox_command(cluster_config) + server_32b_cmd, server_32b_meta = vllm_server_command(cluster_config, model="Qwen/Qwen3-32B") + sandbox_32b_cmd, sandbox_32b_meta = sandbox_command(cluster_config) + + server_8b = Command(command=server_8b_cmd, gpus=8, name="server_8b", metadata=server_8b_meta) + sandbox_8b = Command(command=sandbox_8b_cmd, name="sandbox_8b", metadata=sandbox_8b_meta) eval_8b = Command(command="python eval.py --model 8b", gpus=1, name="eval_8b") - server_32b = Command( - command=lambda cfg: vllm_server_command(cfg, model="Qwen/Qwen3-32B"), - gpus=8, - name="server_32b" - ) - sandbox_32b = Command(command=lambda cfg: sandbox_command(cfg), name="sandbox_32b") + server_32b = Command(command=server_32b_cmd, gpus=8, name="server_32b", metadata=server_32b_meta) + sandbox_32b = Command(command=sandbox_32b_cmd, name="sandbox_32b", metadata=sandbox_32b_meta) eval_32b = Command(command="python eval.py --model 32b", gpus=1, name="eval_32b") group_8b = CommandGroup(commands=[server_8b, sandbox_8b, eval_8b], name="eval_8b", log_dir=log_dir) @@ -107,7 +105,6 @@ pipeline.run() """ -import inspect import logging import shlex from contextlib import nullcontext @@ -143,10 +140,9 @@ class Command: - A string: evaluated immediately - A callable (lambda): evaluated lazily when the task is prepared - Lambdas are needed for cross-component references (hostname_ref, meta_ref). + Lambdas are ONLY needed for cross-component references (hostname_ref, meta_ref). The het_group_index isn't assigned until pipeline execution, so these must be lazy: - server = Command(command=lambda cfg: vllm_server_command(cfg, ...)) - # This lambda is ESSENTIAL - server.hostname_ref() and meta_ref() don't exist yet + # Lambda is ESSENTIAL here - server.hostname_ref() and meta_ref() don't exist yet client = Command(command=lambda: f"curl {server.hostname_ref()}:{server.meta_ref('port')}") """ @@ -189,24 +185,15 @@ def prepare_for_execution(self, cluster_config: Dict) -> Tuple[str, Dict]: """Prepare command for execution. This method: - 1. Evaluates callables (resolves cross-group references and cluster_config-dependent commands) - 2. Adds cross-group environment variables - 3. Wraps with installation_command if provided + 1. Evaluates callables (resolves cross-component references) + 2. Wraps with installation_command if provided Returns: Tuple of (final_command, execution_config) """ - # 1. Evaluate if callable (resolves cross-group references or cluster_config needs) + # 1. Evaluate if callable (for cross-component references like hostname_ref) if callable(self.command): - # Check if lambda needs cluster_config (for sandbox) - - sig = inspect.signature(self.command) - if len(sig.parameters) > 0: - # Lambda expects cluster_config - result = self.command(cluster_config) - else: - # Regular lambda (cross-group refs) - result = self.command() + result = self.command() if isinstance(result, tuple): final_command, runtime_metadata = result diff --git a/tests/test_declarative_pipeline.py b/tests/test_declarative_pipeline.py index cfe36c9f96..d3e862c9c0 100644 --- a/tests/test_declarative_pipeline.py +++ b/tests/test_declarative_pipeline.py @@ -43,7 +43,7 @@ def test_command_with_metadata(self): def test_command_with_callable(self): """Test Command with callable that returns tuple.""" - def make_cmd(cfg): + def make_cmd(): return ("echo world", {"port": 5000}) cmd = Command(command=make_cmd, name="dynamic") @@ -65,7 +65,7 @@ def test_command_prepare_for_execution_string(self): def test_command_prepare_for_execution_callable(self): """Test prepare_for_execution with callable command.""" - def make_cmd(cfg): + def make_cmd(): return "echo test" cmd = Command(command=make_cmd, name="test") @@ -78,7 +78,7 @@ def make_cmd(cfg): def test_command_prepare_for_execution_callable_with_metadata(self): """Test prepare_for_execution with callable returning tuple.""" - def make_cmd(cfg): + def make_cmd(): return ("echo metadata", {"num_tasks": 4, "environment": {"VAR": "value"}}) cmd = Command(command=make_cmd, name="test") @@ -90,19 +90,6 @@ def make_cmd(cfg): assert exec_config["num_tasks"] == 4 assert exec_config["environment"]["VAR"] == "value" - def test_command_prepare_for_execution_with_cluster_config(self): - """Test prepare_for_execution with callable needing cluster_config.""" - - def make_cmd(cfg): - return f"echo {cfg['test_param']}" - - cmd = Command(command=make_cmd, name="test") - cluster_config = {"executor": "local", "containers": {}, "test_param": "value123"} - - final_cmd, exec_config = cmd.prepare_for_execution(cluster_config) - - assert final_cmd == "echo value123" - def test_command_meta_ref(self): """Test meta_ref for accessing metadata.""" cmd = Command(command="echo test", name="server", metadata={"port": 8080, "host": "localhost"}) From db30066e819977b5dfe74d13863e0cb3688984ab Mon Sep 17 00:00:00 2001 From: George Armstrong Date: Mon, 6 Oct 2025 15:42:57 -0700 Subject: [PATCH 13/29] ENH simplify cluster config prep Signed-off-by: George Armstrong --- nemo_skills/pipeline/generate.py | 4 +- nemo_skills/pipeline/utils/declarative.py | 55 ++++--------- tests/test_declarative_pipeline.py | 96 +++++++++++------------ 3 files changed, 62 insertions(+), 93 deletions(-) diff --git a/nemo_skills/pipeline/generate.py b/nemo_skills/pipeline/generate.py index 3d7b0ea15c..a0e07da31c 100644 --- a/nemo_skills/pipeline/generate.py +++ b/nemo_skills/pipeline/generate.py @@ -464,7 +464,7 @@ def generate( # Create and run pipeline pipeline = Pipeline( name=expname, - cluster=cluster, + cluster_config=cluster_config, jobs=jobs, reuse_code=reuse_code, reuse_code_exp=reuse_code_exp, @@ -472,7 +472,7 @@ def generate( ) # Pass _reuse_exp to pipeline.run() to add jobs to existing experiment - result = pipeline.run(cluster_config=cluster_config, dry_run=dry_run, _reuse_exp=_reuse_exp) + result = pipeline.run(dry_run=dry_run, _reuse_exp=_reuse_exp) return result diff --git a/nemo_skills/pipeline/utils/declarative.py b/nemo_skills/pipeline/utils/declarative.py index 18b25e06c8..75ddff922e 100644 --- a/nemo_skills/pipeline/utils/declarative.py +++ b/nemo_skills/pipeline/utils/declarative.py @@ -44,7 +44,7 @@ # Create and run pipeline pipeline = Pipeline( name="my_inference", - cluster="local", + cluster_config=cluster_config, groups=[inference_group] ) pipeline.run() @@ -93,7 +93,7 @@ # Create pipeline with dependency graph pipeline = Pipeline( name="full_pipeline", - cluster="slurm", + cluster_config=cluster_config, jobs=[ {"name": "prep", "group": prep_group}, @@ -114,7 +114,6 @@ import nemo_run as run from nemo_skills.pipeline.utils import ( - get_cluster_config, get_env_variables, get_executor, get_exp, @@ -269,7 +268,7 @@ class Pipeline: def __init__( self, name: str, - cluster: Optional[str] = None, + cluster_config: Dict, groups: Optional[List[CommandGroup]] = None, # Legacy mode jobs: Optional[List[Dict]] = None, # New mode with dependencies reuse_code: bool = True, @@ -279,13 +278,12 @@ def __init__( run_after: Optional[Union[str, List[str]]] = None, # Pipeline-level dependency on other experiments ): self.name = name - self.cluster = cluster + self.cluster_config = cluster_config self.reuse_code = reuse_code self.reuse_code_exp = reuse_code_exp self.skip_hf_home_check = skip_hf_home_check self.with_ray = with_ray self.run_after = run_after - self._cluster_config: Optional[Dict] = None if groups is not None and jobs is not None: raise ValueError("Cannot specify both 'groups' and 'jobs'.") @@ -301,19 +299,8 @@ def __init__( # Note: het_group_indices are assigned per-job in _plan_and_add_job, not globally - def _get_cluster_config(self) -> Dict: - """Get cluster configuration, loading it if necessary.""" - if self._cluster_config is None: - if self.cluster is None: - raise ValueError("Must specify cluster either in Pipeline() or run() method") - - self._cluster_config = get_cluster_config(self.cluster) - return self._cluster_config - def run( self, - cluster_config: Optional[Dict] = None, - cluster: Optional[str] = None, dry_run: bool = False, log_dir: Optional[str] = None, _reuse_exp=None, @@ -321,8 +308,6 @@ def run( """Execute the pipeline by calling NeMo-Run directly. Args: - cluster_config: Cluster configuration dict (optional, can use cluster name instead) - cluster: Cluster name to load config from (optional if cluster_config provided) dry_run: If True, validate without executing log_dir: Default log directory for groups that don't specify one (optional) _reuse_exp: Internal - reuse existing experiment object (for eval.py integration) @@ -331,30 +316,22 @@ def run( LOG.info("No jobs to execute") return None - # Determine cluster config to use - if cluster_config is not None: - final_cluster_config = cluster_config - elif cluster is not None: - final_cluster_config = get_cluster_config(cluster) - else: - final_cluster_config = self._get_cluster_config() - # Validate HF_HOME environment variable - if final_cluster_config["executor"] != "none" and not self.skip_hf_home_check: - env_vars = get_env_variables(final_cluster_config) + if self.cluster_config["executor"] != "none" and not self.skip_hf_home_check: + env_vars = get_env_variables(self.cluster_config) if "HF_HOME" not in env_vars: raise RuntimeError( "Invalid cluster_config: HF_HOME is missing from env_vars while skip_hf_home_check=False.\n" - f"Current env_vars: {final_cluster_config.get('env_vars', [])}\n" + f"Current env_vars: {self.cluster_config.get('env_vars', [])}\n" "Please add a new variable: HF_HOME=/mounted/path/to/your/hf_home" ) - if not is_mounted_filepath(final_cluster_config, env_vars["HF_HOME"]): + if not is_mounted_filepath(self.cluster_config, env_vars["HF_HOME"]): raise RuntimeError(f"Invalid cluster_config: HF_HOME={env_vars['HF_HOME']} is not a mounted path.") # Track job name -> task handle for dependency resolution job_name_to_handle = {} - with get_exp(self.name, final_cluster_config, _reuse_exp) as exp: + with get_exp(self.name, self.cluster_config, _reuse_exp) as exp: # Process each job in order for job_spec in self.jobs: job_name = job_spec.get("name", "unnamed") @@ -383,7 +360,7 @@ def run( else: # Could be external experiment name OR task handle from _reuse_exp case # Try to get as experiment name first - if final_cluster_config["executor"] == "slurm": + if self.cluster_config["executor"] == "slurm": exp_handles = get_exp_handles(dep) if len(exp_handles) == 0: LOG.warning( @@ -420,7 +397,7 @@ def run( task_handle = self._add_single_group_job( exp, job_spec["groups"][0], - final_cluster_config, + self.cluster_config, default_log_dir=log_dir, run_after=run_after_deps if run_after_deps else None, ) @@ -429,7 +406,7 @@ def run( task_handle = self._add_multi_group_job( exp, job_spec["groups"], - final_cluster_config, + self.cluster_config, default_log_dir=log_dir, run_after=run_after_deps if run_after_deps else None, ) @@ -438,7 +415,7 @@ def run( task_handle = self._add_single_group_job( exp, job_spec["group"], - final_cluster_config, + self.cluster_config, default_log_dir=log_dir, run_after=run_after_deps if run_after_deps else None, ) @@ -451,11 +428,11 @@ def run( # Only run if not using existing experiment (matching generate_v0.py line 331) if not dry_run and not _reuse_exp: - run_exp(exp, final_cluster_config) + run_exp(exp, self.cluster_config) # Cache experiment for code reuse in future runs - if final_cluster_config["executor"] != "none": - tunnel = get_tunnel(final_cluster_config) + if self.cluster_config["executor"] != "none": + tunnel = get_tunnel(self.cluster_config) cur_tunnel_hash = tunnel_hash(tunnel) if cur_tunnel_hash not in REUSE_CODE_EXP: REUSE_CODE_EXP[cur_tunnel_hash] = exp diff --git a/tests/test_declarative_pipeline.py b/tests/test_declarative_pipeline.py index d3e862c9c0..093d38bf4c 100644 --- a/tests/test_declarative_pipeline.py +++ b/tests/test_declarative_pipeline.py @@ -184,8 +184,9 @@ def test_pipeline_with_groups_legacy_mode(self): """Test Pipeline with groups parameter (legacy mode).""" cmd = Command(command="echo test", name="cmd") group = CommandGroup(commands=[cmd], name="group") + cluster_config = {"executor": "local", "containers": {}} - pipeline = Pipeline(name="test_pipeline", cluster="local", groups=[group]) + pipeline = Pipeline(name="test_pipeline", cluster_config=cluster_config, groups=[group]) assert pipeline.name == "test_pipeline" assert pipeline._legacy_mode is True @@ -204,8 +205,9 @@ def test_pipeline_with_jobs(self): {"name": "job1", "group": group1}, {"name": "job2", "group": group2, "dependencies": ["job1"]}, ] + cluster_config = {"executor": "local", "containers": {}} - pipeline = Pipeline(name="test_pipeline", cluster="local", jobs=jobs) + pipeline = Pipeline(name="test_pipeline", cluster_config=cluster_config, jobs=jobs) assert pipeline.name == "test_pipeline" assert pipeline._legacy_mode is False @@ -216,21 +218,24 @@ def test_pipeline_cannot_specify_both_groups_and_jobs(self): cmd = Command(command="echo test", name="cmd") group = CommandGroup(commands=[cmd], name="group") jobs = [{"name": "job", "group": group}] + cluster_config = {"executor": "local", "containers": {}} with pytest.raises(ValueError, match="Cannot specify both 'groups' and 'jobs'"): - Pipeline(name="test", cluster="local", groups=[group], jobs=jobs) + Pipeline(name="test", cluster_config=cluster_config, groups=[group], jobs=jobs) def test_pipeline_must_specify_either_groups_or_jobs(self): """Test that Pipeline raises error when neither groups nor jobs are specified.""" + cluster_config = {"executor": "local", "containers": {}} with pytest.raises(ValueError, match="Must specify either 'groups' or 'jobs'"): - Pipeline(name="test", cluster="local") + Pipeline(name="test", cluster_config=cluster_config) def test_pipeline_with_run_after(self): """Test Pipeline with run_after parameter.""" cmd = Command(command="echo test", name="cmd") group = CommandGroup(commands=[cmd], name="group") + cluster_config = {"executor": "local", "containers": {}} - pipeline = Pipeline(name="test", cluster="local", groups=[group], run_after="other_exp") + pipeline = Pipeline(name="test", cluster_config=cluster_config, groups=[group], run_after="other_exp") assert pipeline.run_after == "other_exp" @@ -238,35 +243,22 @@ def test_pipeline_with_run_after_list(self): """Test Pipeline with run_after as list.""" cmd = Command(command="echo test", name="cmd") group = CommandGroup(commands=[cmd], name="group") + cluster_config = {"executor": "local", "containers": {}} - pipeline = Pipeline(name="test", cluster="local", groups=[group], run_after=["exp1", "exp2"]) + pipeline = Pipeline(name="test", cluster_config=cluster_config, groups=[group], run_after=["exp1", "exp2"]) assert pipeline.run_after == ["exp1", "exp2"] - @patch("nemo_skills.pipeline.utils.declarative.get_cluster_config") - def test_pipeline_get_cluster_config(self, mock_get_config): - """Test _get_cluster_config method.""" - mock_config = {"executor": "local", "containers": {}} - mock_get_config.return_value = mock_config - + def test_pipeline_cluster_config_passed_directly(self): + """Test that cluster_config is passed directly (no more string resolution).""" cmd = Command(command="echo test", name="cmd") group = CommandGroup(commands=[cmd], name="group") - pipeline = Pipeline(name="test", cluster="local", groups=[group]) - - config = pipeline._get_cluster_config() + cluster_config = {"executor": "local", "containers": {}} - assert config == mock_config - mock_get_config.assert_called_once_with("local") + pipeline = Pipeline(name="test", cluster_config=cluster_config, groups=[group]) - @patch("nemo_skills.pipeline.utils.declarative.get_cluster_config") - def test_pipeline_missing_cluster_config(self, mock_get_config): - """Test that Pipeline raises error when cluster is not specified.""" - cmd = Command(command="echo test", name="cmd") - group = CommandGroup(commands=[cmd], name="group") - pipeline = Pipeline(name="test", groups=[group]) # No cluster specified - - with pytest.raises(ValueError, match="Must specify cluster"): - pipeline._get_cluster_config() + # cluster_config is stored as-is + assert pipeline.cluster_config == cluster_config class TestPipelineExecution: @@ -293,10 +285,10 @@ def test_pipeline_run_basic(self, mock_run_exp, mock_env_vars, mock_get_config, # Create pipeline cmd = Command(command="echo test", name="cmd") group = CommandGroup(commands=[cmd], name="group", log_dir="/logs") - pipeline = Pipeline(name="test", cluster="local", groups=[group], skip_hf_home_check=True) + pipeline = Pipeline(name="test", cluster_config=mock_config, groups=[group], skip_hf_home_check=True) # Run pipeline - result = pipeline.run(cluster_config=mock_config, dry_run=True) + result = pipeline.run(dry_run=True) # Verify assert result == mock_exp @@ -331,10 +323,10 @@ def test_pipeline_run_with_dependencies(self, mock_run_exp, mock_env_vars, mock_ {"name": "job1", "group": group1}, {"name": "job2", "group": group2, "dependencies": ["job1"]}, ] - pipeline = Pipeline(name="test", cluster="local", jobs=jobs, skip_hf_home_check=True) + pipeline = Pipeline(name="test", cluster_config=mock_config, jobs=jobs, skip_hf_home_check=True) # Run pipeline - pipeline.run(cluster_config=mock_config, dry_run=True) + pipeline.run(dry_run=True) # Verify both jobs were added assert mock_exp.add.call_count == 2 @@ -364,10 +356,10 @@ def test_pipeline_hf_home_validation( cmd = Command(command="echo test", name="cmd") group = CommandGroup(commands=[cmd], name="group", log_dir="/logs") - pipeline = Pipeline(name="test", cluster="local", groups=[group]) + pipeline = Pipeline(name="test", cluster_config=mock_config, groups=[group]) # Should not raise - pipeline.run(cluster_config=mock_config, dry_run=True) + pipeline.run(dry_run=True) # Verify executor was created assert mock_get_executor.called @@ -382,10 +374,10 @@ def test_pipeline_hf_home_missing(self, mock_env_vars, mock_get_config): cmd = Command(command="echo test", name="cmd") group = CommandGroup(commands=[cmd], name="group", log_dir="/logs") - pipeline = Pipeline(name="test", cluster="local", groups=[group]) + pipeline = Pipeline(name="test", cluster_config=mock_config, groups=[group]) with pytest.raises(RuntimeError, match="HF_HOME is missing"): - pipeline.run(cluster_config=mock_config, dry_run=True) + pipeline.run(dry_run=True) @patch("nemo_skills.pipeline.utils.declarative.get_cluster_config") @patch("nemo_skills.pipeline.utils.declarative.get_env_variables") @@ -399,10 +391,10 @@ def test_pipeline_hf_home_not_mounted(self, mock_is_mounted, mock_env_vars, mock cmd = Command(command="echo test", name="cmd") group = CommandGroup(commands=[cmd], name="group", log_dir="/logs") - pipeline = Pipeline(name="test", cluster="local", groups=[group]) + pipeline = Pipeline(name="test", cluster_config=mock_config, groups=[group]) with pytest.raises(RuntimeError, match="is not a mounted path"): - pipeline.run(cluster_config=mock_config, dry_run=True) + pipeline.run(dry_run=True) class TestHetGroupIndices: @@ -427,8 +419,8 @@ def test_het_group_index_non_heterogeneous(self, mock_env_vars, mock_get_exp): cmd2 = Command(command="echo 2", name="cmd2") group = CommandGroup(commands=[cmd1, cmd2], name="group", log_dir="/logs") - pipeline = Pipeline(name="test", cluster="local", groups=[group], skip_hf_home_check=True) - pipeline.run(cluster_config=mock_config, dry_run=True) + pipeline = Pipeline(name="test", cluster_config=mock_config, groups=[group], skip_hf_home_check=True) + pipeline.run(dry_run=True) # Both commands should have None het_group_index (localhost communication) assert cmd1.het_group_index is None @@ -458,8 +450,8 @@ def test_het_group_index_heterogeneous(self, mock_env_vars, mock_get_exp): group2 = CommandGroup(commands=[cmd2], name="group2", log_dir="/logs") jobs = [{"name": "hetjob", "groups": [group1, group2]}] - pipeline = Pipeline(name="test", cluster="local", jobs=jobs, skip_hf_home_check=True) - pipeline.run(cluster_config=mock_config, dry_run=True) + pipeline = Pipeline(name="test", cluster_config=mock_config, jobs=jobs, skip_hf_home_check=True) + pipeline.run(dry_run=True) # Commands should have het_group_index 0 and 1 assert cmd1.het_group_index == 0 @@ -498,8 +490,8 @@ def test_het_group_index_per_job_not_global(self, mock_env_vars, mock_get_exp): {"name": "hetjob1", "groups": [group1, group2]}, {"name": "hetjob2", "groups": [group3, group4]}, ] - pipeline = Pipeline(name="test", cluster="local", jobs=jobs, skip_hf_home_check=True) - pipeline.run(cluster_config=mock_config, dry_run=True) + pipeline = Pipeline(name="test", cluster_config=mock_config, jobs=jobs, skip_hf_home_check=True) + pipeline.run(dry_run=True) # Both jobs should have het_group_index starting from 0 assert cmd1.het_group_index == 0 @@ -529,10 +521,10 @@ def test_dependency_none_handling(self, mock_env_vars, mock_get_exp): group = CommandGroup(commands=[cmd], name="group", log_dir="/logs") jobs = [{"name": "job", "group": group, "dependencies": None}] - pipeline = Pipeline(name="test", cluster="local", jobs=jobs, skip_hf_home_check=True) + pipeline = Pipeline(name="test", cluster_config=mock_config, jobs=jobs, skip_hf_home_check=True) # Should not raise - pipeline.run(cluster_config=mock_config, dry_run=True) + pipeline.run(dry_run=True) @patch("nemo_skills.pipeline.utils.declarative.get_exp") @patch("nemo_skills.pipeline.utils.declarative.get_env_variables") @@ -552,11 +544,11 @@ def test_pipeline_run_after_applies_to_jobs(self, mock_env_vars, mock_get_exp): group = CommandGroup(commands=[cmd], name="group", log_dir="/logs") pipeline = Pipeline( - name="test", cluster="local", groups=[group], run_after="other_exp", skip_hf_home_check=True + name="test", cluster_config=mock_config, groups=[group], run_after="other_exp", skip_hf_home_check=True ) # Should not raise and should apply run_after - pipeline.run(cluster_config=mock_config, dry_run=True) + pipeline.run(dry_run=True) class TestErrorHandling: @@ -564,23 +556,23 @@ class TestErrorHandling: def test_pipeline_job_missing_group_or_groups(self): """Test that job spec without group or groups raises error.""" + mock_config = {"executor": "none", "containers": {}} jobs = [{"name": "bad_job"}] # Missing 'group' or 'groups' with pytest.raises(ValueError, match="must have either 'group' or 'groups'"): - pipeline = Pipeline(name="test", cluster="local", jobs=jobs) - mock_config = {"executor": "none", "containers": {}} - pipeline.run(cluster_config=mock_config, dry_run=True) + pipeline = Pipeline(name="test", cluster_config=mock_config, jobs=jobs) + pipeline.run(dry_run=True) def test_commandgroup_missing_log_dir(self): """Test that CommandGroup without log_dir raises error during execution.""" + mock_config = {"executor": "none", "containers": {}} cmd = Command(command="echo test", name="cmd") group = CommandGroup(commands=[cmd], name="group") # No log_dir - pipeline = Pipeline(name="test", cluster="local", groups=[group]) - mock_config = {"executor": "none", "containers": {}} + pipeline = Pipeline(name="test", cluster_config=mock_config, groups=[group]) with pytest.raises(ValueError, match="must have log_dir set"): - pipeline.run(cluster_config=mock_config, dry_run=True) + pipeline.run(dry_run=True) if __name__ == "__main__": From 876913f04f9d25285bb67b6cd703ce49f4d16b3f Mon Sep 17 00:00:00 2001 From: George Armstrong Date: Mon, 6 Oct 2025 16:01:31 -0700 Subject: [PATCH 14/29] TST fix tests Signed-off-by: George Armstrong --- tests/test_declarative_pipeline.py | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) diff --git a/tests/test_declarative_pipeline.py b/tests/test_declarative_pipeline.py index 093d38bf4c..494a00e8c4 100644 --- a/tests/test_declarative_pipeline.py +++ b/tests/test_declarative_pipeline.py @@ -265,17 +265,15 @@ class TestPipelineExecution: """Test Pipeline execution and job management.""" @patch("nemo_skills.pipeline.utils.declarative.get_exp") - @patch("nemo_skills.pipeline.utils.declarative.get_cluster_config") @patch("nemo_skills.pipeline.utils.declarative.get_env_variables") @patch("nemo_skills.pipeline.utils.declarative.run_exp") - def test_pipeline_run_basic(self, mock_run_exp, mock_env_vars, mock_get_config, mock_get_exp): + def test_pipeline_run_basic(self, mock_run_exp, mock_env_vars, mock_get_exp): """Test basic pipeline execution.""" # Setup mocks mock_config = { "executor": "none", "containers": {"nemo-skills": "container:latest"}, } - mock_get_config.return_value = mock_config mock_env_vars.return_value = {"HF_HOME": "/hf"} mock_exp = MagicMock() @@ -295,17 +293,15 @@ def test_pipeline_run_basic(self, mock_run_exp, mock_env_vars, mock_get_config, mock_exp.add.assert_called_once() @patch("nemo_skills.pipeline.utils.declarative.get_exp") - @patch("nemo_skills.pipeline.utils.declarative.get_cluster_config") @patch("nemo_skills.pipeline.utils.declarative.get_env_variables") @patch("nemo_skills.pipeline.utils.declarative.run_exp") - def test_pipeline_run_with_dependencies(self, mock_run_exp, mock_env_vars, mock_get_config, mock_get_exp): + def test_pipeline_run_with_dependencies(self, mock_run_exp, mock_env_vars, mock_get_exp): """Test pipeline execution with job dependencies.""" # Setup mocks mock_config = { "executor": "none", "containers": {"nemo-skills": "container:latest"}, } - mock_get_config.return_value = mock_config mock_env_vars.return_value = {"HF_HOME": "/hf"} mock_exp = MagicMock() @@ -332,20 +328,16 @@ def test_pipeline_run_with_dependencies(self, mock_run_exp, mock_env_vars, mock_ assert mock_exp.add.call_count == 2 @patch("nemo_skills.pipeline.utils.declarative.get_exp") - @patch("nemo_skills.pipeline.utils.declarative.get_cluster_config") @patch("nemo_skills.pipeline.utils.declarative.get_env_variables") @patch("nemo_skills.pipeline.utils.declarative.is_mounted_filepath") @patch("nemo_skills.pipeline.utils.declarative.get_executor") - def test_pipeline_hf_home_validation( - self, mock_get_executor, mock_is_mounted, mock_env_vars, mock_get_config, mock_get_exp - ): + def test_pipeline_hf_home_validation(self, mock_get_executor, mock_is_mounted, mock_env_vars, mock_get_exp): """Test HF_HOME validation.""" mock_config = { "executor": "slurm", "containers": {"nemo-skills": "container:latest"}, "account": "test_account", } - mock_get_config.return_value = mock_config mock_env_vars.return_value = {"HF_HOME": "/hf"} mock_is_mounted.return_value = True mock_get_executor.return_value = MagicMock() @@ -364,12 +356,10 @@ def test_pipeline_hf_home_validation( # Verify executor was created assert mock_get_executor.called - @patch("nemo_skills.pipeline.utils.declarative.get_cluster_config") @patch("nemo_skills.pipeline.utils.declarative.get_env_variables") - def test_pipeline_hf_home_missing(self, mock_env_vars, mock_get_config): + def test_pipeline_hf_home_missing(self, mock_env_vars): """Test that missing HF_HOME raises error.""" mock_config = {"executor": "slurm", "containers": {}} - mock_get_config.return_value = mock_config mock_env_vars.return_value = {} # No HF_HOME cmd = Command(command="echo test", name="cmd") @@ -379,13 +369,11 @@ def test_pipeline_hf_home_missing(self, mock_env_vars, mock_get_config): with pytest.raises(RuntimeError, match="HF_HOME is missing"): pipeline.run(dry_run=True) - @patch("nemo_skills.pipeline.utils.declarative.get_cluster_config") @patch("nemo_skills.pipeline.utils.declarative.get_env_variables") @patch("nemo_skills.pipeline.utils.declarative.is_mounted_filepath") - def test_pipeline_hf_home_not_mounted(self, mock_is_mounted, mock_env_vars, mock_get_config): + def test_pipeline_hf_home_not_mounted(self, mock_is_mounted, mock_env_vars): """Test that non-mounted HF_HOME raises error.""" mock_config = {"executor": "slurm", "containers": {}} - mock_get_config.return_value = mock_config mock_env_vars.return_value = {"HF_HOME": "/hf"} mock_is_mounted.return_value = False From 55d79702dc61604b97704173cb191b56c6ae8cba Mon Sep 17 00:00:00 2001 From: George Armstrong Date: Mon, 6 Oct 2025 16:23:28 -0700 Subject: [PATCH 15/29] FIX change legacy wording Signed-off-by: George Armstrong --- nemo_skills/pipeline/utils/declarative.py | 12 +++++------- tests/test_declarative_pipeline.py | 8 +++----- 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/nemo_skills/pipeline/utils/declarative.py b/nemo_skills/pipeline/utils/declarative.py index 75ddff922e..36cc7fb8a7 100644 --- a/nemo_skills/pipeline/utils/declarative.py +++ b/nemo_skills/pipeline/utils/declarative.py @@ -260,17 +260,17 @@ def __init__( class Pipeline: """Top-level pipeline that composes command groups with dependency support. - Supports two modes: - 1. Groups: groups=[cmdgroup1, cmdgroup2] - combines all into one job - 2. Jobs: jobs=[{...}, {...}] - supports dependencies and multi-group jobs + Supports two input formats (both converted to jobs internally): + 1. Groups: groups=[cmdgroup1, cmdgroup2] - shorthand, each becomes a job + 2. Jobs: jobs=[{...}, {...}] - full control with dependencies and multi-group jobs """ def __init__( self, name: str, cluster_config: Dict, - groups: Optional[List[CommandGroup]] = None, # Legacy mode - jobs: Optional[List[Dict]] = None, # New mode with dependencies + groups: Optional[List[CommandGroup]] = None, + jobs: Optional[List[Dict]] = None, reuse_code: bool = True, reuse_code_exp: Optional[str] = None, skip_hf_home_check: bool = False, @@ -290,10 +290,8 @@ def __init__( if groups is not None: self.jobs = [{"group": g} for g in groups] - self._legacy_mode = True elif jobs is not None: self.jobs = jobs - self._legacy_mode = False else: raise ValueError("Must specify either 'groups' or 'jobs'") diff --git a/tests/test_declarative_pipeline.py b/tests/test_declarative_pipeline.py index 494a00e8c4..2cedd3293b 100644 --- a/tests/test_declarative_pipeline.py +++ b/tests/test_declarative_pipeline.py @@ -180,8 +180,8 @@ def test_commandgroup_with_log_dir(self): class TestPipeline: """Test Pipeline class functionality.""" - def test_pipeline_with_groups_legacy_mode(self): - """Test Pipeline with groups parameter (legacy mode).""" + def test_pipeline_with_groups(self): + """Test Pipeline with groups parameter (shorthand format).""" cmd = Command(command="echo test", name="cmd") group = CommandGroup(commands=[cmd], name="group") cluster_config = {"executor": "local", "containers": {}} @@ -189,12 +189,11 @@ def test_pipeline_with_groups_legacy_mode(self): pipeline = Pipeline(name="test_pipeline", cluster_config=cluster_config, groups=[group]) assert pipeline.name == "test_pipeline" - assert pipeline._legacy_mode is True assert len(pipeline.jobs) == 1 assert "group" in pipeline.jobs[0] def test_pipeline_with_jobs(self): - """Test Pipeline with jobs parameter.""" + """Test Pipeline with jobs parameter (full format with dependencies).""" cmd1 = Command(command="echo 1", name="cmd1") group1 = CommandGroup(commands=[cmd1], name="group1", log_dir="/logs") @@ -210,7 +209,6 @@ def test_pipeline_with_jobs(self): pipeline = Pipeline(name="test_pipeline", cluster_config=cluster_config, jobs=jobs) assert pipeline.name == "test_pipeline" - assert pipeline._legacy_mode is False assert len(pipeline.jobs) == 2 def test_pipeline_cannot_specify_both_groups_and_jobs(self): From d8d6af114372f575cb031d708a7bdb188fa036bc Mon Sep 17 00:00:00 2001 From: George Armstrong Date: Mon, 6 Oct 2025 16:27:37 -0700 Subject: [PATCH 16/29] components -> commands Signed-off-by: George Armstrong --- nemo_skills/pipeline/utils/declarative.py | 12 ++++++------ tests/test_declarative_pipeline.py | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/nemo_skills/pipeline/utils/declarative.py b/nemo_skills/pipeline/utils/declarative.py index 36cc7fb8a7..c67c662cc7 100644 --- a/nemo_skills/pipeline/utils/declarative.py +++ b/nemo_skills/pipeline/utils/declarative.py @@ -251,7 +251,7 @@ def __init__( name: Optional[str] = None, log_dir: Optional[str] = None, ): - self.components = commands + self.commands = commands self.hardware = hardware or HardwareConfig() self.name = name self.log_dir = log_dir @@ -533,7 +533,7 @@ def _plan_and_add_job( shared_env_vars: Dict[str, str] = {} if heterogeneous: for het_idx, group in enumerate(groups): - for command in group.components: + for command in group.commands: _, exec_config_probe = command.prepare_for_execution(cluster_config) shared_env_vars.update(exec_config_probe.get("environment", {})) @@ -542,9 +542,9 @@ def _plan_and_add_job( # Build commands and executors for het_idx, group in enumerate(groups): - has_multiple_components = len(group.components) > 1 + has_multiple_components = len(group.commands) > 1 total_het_groups = ( - len(groups) if heterogeneous else (len(group.components) if has_multiple_components else 1) + len(groups) if heterogeneous else (len(group.commands) if has_multiple_components else 1) ) # For single-group jobs with multiple components, allow job-level GPU override for sbatch allocation @@ -552,7 +552,7 @@ def _plan_and_add_job( group.hardware.num_gpus if (not heterogeneous and has_multiple_components and group.hardware) else None ) - for comp_idx, command in enumerate(group.components): + for comp_idx, command in enumerate(group.commands): # Assign het_group_index ONLY for heterogeneous jobs (per-job, not global) # Non-heterogeneous jobs use localhost, so het_group_index should remain None if heterogeneous: @@ -584,7 +584,7 @@ def _plan_and_add_job( heterogeneous, het_idx if heterogeneous else comp_idx, total_het_groups, - (len(group.components) > 1), + (len(group.commands) > 1), ) # Share packager across executors for single-group jobs diff --git a/tests/test_declarative_pipeline.py b/tests/test_declarative_pipeline.py index 2cedd3293b..56623d3300 100644 --- a/tests/test_declarative_pipeline.py +++ b/tests/test_declarative_pipeline.py @@ -155,7 +155,7 @@ def test_commandgroup_basic(self): group = CommandGroup(commands=[cmd1, cmd2], name="test_group") assert group.name == "test_group" - assert len(group.components) == 2 + assert len(group.commands) == 2 assert group.hardware is not None def test_commandgroup_with_hardware(self): From b8275c93a880698476ec104878dc6de501d7f7b0 Mon Sep 17 00:00:00 2001 From: George Armstrong Date: Mon, 6 Oct 2025 17:15:50 -0700 Subject: [PATCH 17/29] ENH fix name dependencies Signed-off-by: George Armstrong --- nemo_skills/pipeline/generate.py | 21 +++--- nemo_skills/pipeline/utils/declarative.py | 91 ++++++++++++++--------- tests/test_declarative_pipeline.py | 18 ++--- 3 files changed, 73 insertions(+), 57 deletions(-) diff --git a/nemo_skills/pipeline/generate.py b/nemo_skills/pipeline/generate.py index a0e07da31c..4d07c2f889 100644 --- a/nemo_skills/pipeline/generate.py +++ b/nemo_skills/pipeline/generate.py @@ -412,6 +412,7 @@ def generate( # Handle dependent_jobs chain dependencies = _task_dependencies.copy() if _task_dependencies else [] + prev_job = None for dep_idx in range(dependent_jobs + 1): # Create CommandGroup for this task @@ -444,16 +445,16 @@ def generate( job_deps.extend(run_after_list) job_deps = job_deps if job_deps else None else: - # Subsequent jobs in chain depend on previous job - job_deps = [f"{task_name}-dep{dep_idx - 1}"] - - jobs.append( - { - "name": internal_job_name, - "group": cmd_group, - "dependencies": job_deps, - } - ) + # Subsequent jobs in chain depend on previous job (use job object, not string) + job_deps = [prev_job] + + job_spec = { + "name": internal_job_name, + "group": cmd_group, + "dependencies": job_deps, + } + jobs.append(job_spec) + prev_job = job_spec # Track for next iteration all_job_names.append(internal_job_name) diff --git a/nemo_skills/pipeline/utils/declarative.py b/nemo_skills/pipeline/utils/declarative.py index c67c662cc7..4108fab6fe 100644 --- a/nemo_skills/pipeline/utils/declarative.py +++ b/nemo_skills/pipeline/utils/declarative.py @@ -63,6 +63,7 @@ name="prep", log_dir=log_dir ) + prep_job = {"name": "prep", "group": prep_group} # Job 2: Two different model servers (HETEROGENEOUS SLURM job with 2 het components) # Build commands with cluster_config @@ -82,6 +83,8 @@ group_8b = CommandGroup(commands=[server_8b, sandbox_8b, eval_8b], name="eval_8b", log_dir=log_dir) group_32b = CommandGroup(commands=[server_32b, sandbox_32b, eval_32b], name="eval_32b", log_dir=log_dir) + evals_job = {"name": "evals", "groups": [group_8b, group_32b], "dependencies": [prep_job]} + # Job 3: Report generation (depends on both evaluations) report = Command( command="python generate_report.py --output report.txt", @@ -95,13 +98,12 @@ name="full_pipeline", cluster_config=cluster_config, jobs=[ - {"name": "prep", "group": prep_group}, - - # Multi-group heterogeneous job (both eval groups in ONE SLURM job with 2 het components) - {"name": "evals", "groups": [group_8b, group_32b], "dependencies": ["prep"]}, - - # Report depends on the multi-group eval job - {"name": "report", "group": report_group, "dependencies": ["evals"]}, + prep_job, + evals_job, + # Report depends on the eval job (internal) and some external experiment (string) + {"name": "report", "group": report_group, "dependencies": [evals_job, "external_training_exp"]}, + ] + ) pipeline.run() """ @@ -263,6 +265,10 @@ class Pipeline: Supports two input formats (both converted to jobs internally): 1. Groups: groups=[cmdgroup1, cmdgroup2] - shorthand, each becomes a job 2. Jobs: jobs=[{...}, {...}] - full control with dependencies and multi-group jobs + + Dependency types: + - Job dict objects: Internal dependencies on jobs in the same pipeline + - Strings: External dependencies on other experiments """ def __init__( @@ -289,7 +295,8 @@ def __init__( raise ValueError("Cannot specify both 'groups' and 'jobs'.") if groups is not None: - self.jobs = [{"group": g} for g in groups] + # Auto-generate job names from group names + self.jobs = [{"name": g.name or f"job_{idx}", "group": g} for idx, g in enumerate(groups)] elif jobs is not None: self.jobs = jobs else: @@ -332,7 +339,9 @@ def run( with get_exp(self.name, self.cluster_config, _reuse_exp) as exp: # Process each job in order for job_spec in self.jobs: - job_name = job_spec.get("name", "unnamed") + job_name = job_spec.get("name") + if not job_name: + raise ValueError(f"Job spec must have a 'name' field: {job_spec}") # Resolve dependencies to task handles run_after_deps = [] @@ -350,37 +359,45 @@ def run( for dep in job_dependencies: if isinstance(dep, str): - # String dependency - could be job name, task handle, or experiment name - if dep in job_name_to_handle: - # Internal pipeline dependency - add the handle - run_after_deps.append(job_name_to_handle[dep]) - LOG.info(f"Job '{job_name}' depends on '{dep}' (handle: {job_name_to_handle[dep]})") - else: - # Could be external experiment name OR task handle from _reuse_exp case - # Try to get as experiment name first - if self.cluster_config["executor"] == "slurm": - exp_handles = get_exp_handles(dep) - if len(exp_handles) == 0: - LOG.warning( - f"No pending or running tasks found for experiment {dep}, cannot set dependencies." - ) - # If no experiment found, treat as direct task handle (for _reuse_exp case) - if _reuse_exp: - run_after_deps.append(dep) - LOG.info( - f"Job '{job_name}' depends on task handle '{dep}' (from reused experiment)" - ) - else: - run_after_deps.extend(exp_handles) + # String dependency = external experiment name + if self.cluster_config["executor"] == "slurm": + exp_handles = get_exp_handles(dep) + if len(exp_handles) == 0: + LOG.warning( + f"No pending or running tasks found for experiment {dep}, cannot set dependencies." + ) + # If no experiment found, treat as direct task handle (for _reuse_exp case) + if _reuse_exp: + run_after_deps.append(dep) LOG.info( - f"Job '{job_name}' depends on external experiment '{dep}' ({len(exp_handles)} tasks)" + f"Job '{job_name}' depends on task handle '{dep}' (from reused experiment)" ) - elif _reuse_exp: - # For non-SLURM executors with _reuse_exp, treat as task handle - run_after_deps.append(dep) - LOG.info(f"Job '{job_name}' depends on task handle '{dep}'") + else: + run_after_deps.extend(exp_handles) + LOG.info( + f"Job '{job_name}' depends on external experiment '{dep}' ({len(exp_handles)} tasks)" + ) + elif _reuse_exp: + # For non-SLURM executors with _reuse_exp, treat as task handle + run_after_deps.append(dep) + LOG.info(f"Job '{job_name}' depends on task handle '{dep}'") + elif isinstance(dep, dict): + # Dict dependency = internal job reference (by job spec object) + dep_name = dep.get("name") + if not dep_name: + raise ValueError(f"Job dependency must have a 'name' field: {dep}") + if dep_name in job_name_to_handle: + run_after_deps.append(job_name_to_handle[dep_name]) + LOG.info( + f"Job '{job_name}' depends on internal job '{dep_name}' (handle: {job_name_to_handle[dep_name]})" + ) + else: + raise ValueError( + f"Job '{job_name}' depends on job '{dep_name}' which hasn't been processed yet. " + f"Make sure dependencies are listed before the jobs that depend on them in the jobs list." + ) else: - # Direct task handle object (not string) + # Direct task handle object (not string or dict) run_after_deps.append(dep) LOG.info(f"Job '{job_name}' depends on task handle (object)") diff --git a/tests/test_declarative_pipeline.py b/tests/test_declarative_pipeline.py index 56623d3300..9a001770a6 100644 --- a/tests/test_declarative_pipeline.py +++ b/tests/test_declarative_pipeline.py @@ -200,13 +200,12 @@ def test_pipeline_with_jobs(self): cmd2 = Command(command="echo 2", name="cmd2") group2 = CommandGroup(commands=[cmd2], name="group2", log_dir="/logs") - jobs = [ - {"name": "job1", "group": group1}, - {"name": "job2", "group": group2, "dependencies": ["job1"]}, - ] + job1 = {"name": "job1", "group": group1} + job2 = {"name": "job2", "group": group2, "dependencies": [job1]} + cluster_config = {"executor": "local", "containers": {}} - pipeline = Pipeline(name="test_pipeline", cluster_config=cluster_config, jobs=jobs) + pipeline = Pipeline(name="test_pipeline", cluster_config=cluster_config, jobs=[job1, job2]) assert pipeline.name == "test_pipeline" assert len(pipeline.jobs) == 2 @@ -313,11 +312,10 @@ def test_pipeline_run_with_dependencies(self, mock_run_exp, mock_env_vars, mock_ cmd2 = Command(command="echo 2", name="cmd2") group2 = CommandGroup(commands=[cmd2], name="group2", log_dir="/logs") - jobs = [ - {"name": "job1", "group": group1}, - {"name": "job2", "group": group2, "dependencies": ["job1"]}, - ] - pipeline = Pipeline(name="test", cluster_config=mock_config, jobs=jobs, skip_hf_home_check=True) + job1 = {"name": "job1", "group": group1, "dependencies": []} + job2 = {"name": "job2", "group": group2, "dependencies": [job1]} + + pipeline = Pipeline(name="test", cluster_config=mock_config, jobs=[job1, job2], skip_hf_home_check=True) # Run pipeline pipeline.run(dry_run=True) From 78e1a9ca439cd1970a05988b4117c7100ce6175a Mon Sep 17 00:00:00 2001 From: George Armstrong Date: Mon, 6 Oct 2025 17:27:24 -0700 Subject: [PATCH 18/29] MAINT remove groups syntax Signed-off-by: George Armstrong --- nemo_skills/pipeline/utils/declarative.py | 71 +++++++++---------- tests/test_declarative_pipeline.py | 84 +++++++++++++++-------- 2 files changed, 90 insertions(+), 65 deletions(-) diff --git a/nemo_skills/pipeline/utils/declarative.py b/nemo_skills/pipeline/utils/declarative.py index 4108fab6fe..64d942c70c 100644 --- a/nemo_skills/pipeline/utils/declarative.py +++ b/nemo_skills/pipeline/utils/declarative.py @@ -45,7 +45,7 @@ pipeline = Pipeline( name="my_inference", cluster_config=cluster_config, - groups=[inference_group] + jobs=[{"name": "inference", "group": inference_group}] ) pipeline.run() @@ -262,9 +262,7 @@ def __init__( class Pipeline: """Top-level pipeline that composes command groups with dependency support. - Supports two input formats (both converted to jobs internally): - 1. Groups: groups=[cmdgroup1, cmdgroup2] - shorthand, each becomes a job - 2. Jobs: jobs=[{...}, {...}] - full control with dependencies and multi-group jobs + Jobs format: jobs=[{...}, {...}] - list of job dicts with dependencies and groups Dependency types: - Job dict objects: Internal dependencies on jobs in the same pipeline @@ -275,8 +273,7 @@ def __init__( self, name: str, cluster_config: Dict, - groups: Optional[List[CommandGroup]] = None, - jobs: Optional[List[Dict]] = None, + jobs: List[Dict], reuse_code: bool = True, reuse_code_exp: Optional[str] = None, skip_hf_home_check: bool = False, @@ -290,38 +287,31 @@ def __init__( self.skip_hf_home_check = skip_hf_home_check self.with_ray = with_ray self.run_after = run_after + self.jobs = jobs - if groups is not None and jobs is not None: - raise ValueError("Cannot specify both 'groups' and 'jobs'.") - - if groups is not None: - # Auto-generate job names from group names - self.jobs = [{"name": g.name or f"job_{idx}", "group": g} for idx, g in enumerate(groups)] - elif jobs is not None: - self.jobs = jobs - else: - raise ValueError("Must specify either 'groups' or 'jobs'") + # Validate configuration early + self._validate() # Note: het_group_indices are assigned per-job in _plan_and_add_job, not globally - def run( - self, - dry_run: bool = False, - log_dir: Optional[str] = None, - _reuse_exp=None, - ): - """Execute the pipeline by calling NeMo-Run directly. - - Args: - dry_run: If True, validate without executing - log_dir: Default log directory for groups that don't specify one (optional) - _reuse_exp: Internal - reuse existing experiment object (for eval.py integration) - """ + def _validate(self): + """Validate pipeline configuration early in __init__.""" + # Validate jobs if not self.jobs: - LOG.info("No jobs to execute") - return None + raise ValueError("Pipeline requires at least one job") - # Validate HF_HOME environment variable + for idx, job_spec in enumerate(self.jobs): + job_name = job_spec.get("name") + if not job_name: + raise ValueError(f"Job at index {idx} must have a 'name' field: {job_spec}") + + # Validate cluster_config has required fields + if "executor" not in self.cluster_config: + raise ValueError("cluster_config must have 'executor' field") + if "containers" not in self.cluster_config: + raise ValueError("cluster_config must have 'containers' field") + + # Validate HF_HOME if needed if self.cluster_config["executor"] != "none" and not self.skip_hf_home_check: env_vars = get_env_variables(self.cluster_config) if "HF_HOME" not in env_vars: @@ -333,15 +323,26 @@ def run( if not is_mounted_filepath(self.cluster_config, env_vars["HF_HOME"]): raise RuntimeError(f"Invalid cluster_config: HF_HOME={env_vars['HF_HOME']} is not a mounted path.") + def run( + self, + dry_run: bool = False, + log_dir: Optional[str] = None, + _reuse_exp=None, + ): + """Execute the pipeline by calling NeMo-Run directly. + + Args: + dry_run: If True, validate without executing + log_dir: Default log directory for groups that don't specify one (optional) + _reuse_exp: Internal - reuse existing experiment object (for eval.py integration) + """ # Track job name -> task handle for dependency resolution job_name_to_handle = {} with get_exp(self.name, self.cluster_config, _reuse_exp) as exp: # Process each job in order for job_spec in self.jobs: - job_name = job_spec.get("name") - if not job_name: - raise ValueError(f"Job spec must have a 'name' field: {job_spec}") + job_name = job_spec["name"] # Already validated in _validate() # Resolve dependencies to task handles run_after_deps = [] diff --git a/tests/test_declarative_pipeline.py b/tests/test_declarative_pipeline.py index 9a001770a6..b13b6f1ed1 100644 --- a/tests/test_declarative_pipeline.py +++ b/tests/test_declarative_pipeline.py @@ -180,13 +180,18 @@ def test_commandgroup_with_log_dir(self): class TestPipeline: """Test Pipeline class functionality.""" - def test_pipeline_with_groups(self): - """Test Pipeline with groups parameter (shorthand format).""" + def test_pipeline_with_single_job(self): + """Test Pipeline with single job.""" cmd = Command(command="echo test", name="cmd") group = CommandGroup(commands=[cmd], name="group") cluster_config = {"executor": "local", "containers": {}} - pipeline = Pipeline(name="test_pipeline", cluster_config=cluster_config, groups=[group]) + pipeline = Pipeline( + name="test_pipeline", + cluster_config=cluster_config, + jobs=[{"name": "job1", "group": group}], + skip_hf_home_check=True, + ) assert pipeline.name == "test_pipeline" assert len(pipeline.jobs) == 1 @@ -205,25 +210,19 @@ def test_pipeline_with_jobs(self): cluster_config = {"executor": "local", "containers": {}} - pipeline = Pipeline(name="test_pipeline", cluster_config=cluster_config, jobs=[job1, job2]) + pipeline = Pipeline( + name="test_pipeline", cluster_config=cluster_config, jobs=[job1, job2], skip_hf_home_check=True + ) assert pipeline.name == "test_pipeline" assert len(pipeline.jobs) == 2 - def test_pipeline_cannot_specify_both_groups_and_jobs(self): - """Test that Pipeline raises error when both groups and jobs are specified.""" - cmd = Command(command="echo test", name="cmd") - group = CommandGroup(commands=[cmd], name="group") - jobs = [{"name": "job", "group": group}] + def test_pipeline_requires_jobs(self): + """Test that Pipeline requires jobs parameter.""" cluster_config = {"executor": "local", "containers": {}} - with pytest.raises(ValueError, match="Cannot specify both 'groups' and 'jobs'"): - Pipeline(name="test", cluster_config=cluster_config, groups=[group], jobs=jobs) - - def test_pipeline_must_specify_either_groups_or_jobs(self): - """Test that Pipeline raises error when neither groups nor jobs are specified.""" - cluster_config = {"executor": "local", "containers": {}} - with pytest.raises(ValueError, match="Must specify either 'groups' or 'jobs'"): + # Missing jobs parameter should fail + with pytest.raises(TypeError): Pipeline(name="test", cluster_config=cluster_config) def test_pipeline_with_run_after(self): @@ -232,7 +231,13 @@ def test_pipeline_with_run_after(self): group = CommandGroup(commands=[cmd], name="group") cluster_config = {"executor": "local", "containers": {}} - pipeline = Pipeline(name="test", cluster_config=cluster_config, groups=[group], run_after="other_exp") + pipeline = Pipeline( + name="test", + cluster_config=cluster_config, + jobs=[{"name": "job1", "group": group}], + run_after="other_exp", + skip_hf_home_check=True, + ) assert pipeline.run_after == "other_exp" @@ -242,7 +247,13 @@ def test_pipeline_with_run_after_list(self): group = CommandGroup(commands=[cmd], name="group") cluster_config = {"executor": "local", "containers": {}} - pipeline = Pipeline(name="test", cluster_config=cluster_config, groups=[group], run_after=["exp1", "exp2"]) + pipeline = Pipeline( + name="test", + cluster_config=cluster_config, + jobs=[{"name": "job1", "group": group}], + run_after=["exp1", "exp2"], + skip_hf_home_check=True, + ) assert pipeline.run_after == ["exp1", "exp2"] @@ -252,7 +263,12 @@ def test_pipeline_cluster_config_passed_directly(self): group = CommandGroup(commands=[cmd], name="group") cluster_config = {"executor": "local", "containers": {}} - pipeline = Pipeline(name="test", cluster_config=cluster_config, groups=[group]) + pipeline = Pipeline( + name="test", + cluster_config=cluster_config, + jobs=[{"name": "job1", "group": group}], + skip_hf_home_check=True, + ) # cluster_config is stored as-is assert pipeline.cluster_config == cluster_config @@ -280,7 +296,9 @@ def test_pipeline_run_basic(self, mock_run_exp, mock_env_vars, mock_get_exp): # Create pipeline cmd = Command(command="echo test", name="cmd") group = CommandGroup(commands=[cmd], name="group", log_dir="/logs") - pipeline = Pipeline(name="test", cluster_config=mock_config, groups=[group], skip_hf_home_check=True) + pipeline = Pipeline( + name="test", cluster_config=mock_config, jobs=[{"name": "job1", "group": group}], skip_hf_home_check=True + ) # Run pipeline result = pipeline.run(dry_run=True) @@ -344,7 +362,7 @@ def test_pipeline_hf_home_validation(self, mock_get_executor, mock_is_mounted, m cmd = Command(command="echo test", name="cmd") group = CommandGroup(commands=[cmd], name="group", log_dir="/logs") - pipeline = Pipeline(name="test", cluster_config=mock_config, groups=[group]) + pipeline = Pipeline(name="test", cluster_config=mock_config, jobs=[{"name": "job1", "group": group}]) # Should not raise pipeline.run(dry_run=True) @@ -354,31 +372,31 @@ def test_pipeline_hf_home_validation(self, mock_get_executor, mock_is_mounted, m @patch("nemo_skills.pipeline.utils.declarative.get_env_variables") def test_pipeline_hf_home_missing(self, mock_env_vars): - """Test that missing HF_HOME raises error.""" + """Test that missing HF_HOME raises error in __init__.""" mock_config = {"executor": "slurm", "containers": {}} mock_env_vars.return_value = {} # No HF_HOME cmd = Command(command="echo test", name="cmd") group = CommandGroup(commands=[cmd], name="group", log_dir="/logs") - pipeline = Pipeline(name="test", cluster_config=mock_config, groups=[group]) + # Should raise in __init__ now, not run() with pytest.raises(RuntimeError, match="HF_HOME is missing"): - pipeline.run(dry_run=True) + Pipeline(name="test", cluster_config=mock_config, jobs=[{"name": "job1", "group": group}]) @patch("nemo_skills.pipeline.utils.declarative.get_env_variables") @patch("nemo_skills.pipeline.utils.declarative.is_mounted_filepath") def test_pipeline_hf_home_not_mounted(self, mock_is_mounted, mock_env_vars): - """Test that non-mounted HF_HOME raises error.""" + """Test that non-mounted HF_HOME raises error in __init__.""" mock_config = {"executor": "slurm", "containers": {}} mock_env_vars.return_value = {"HF_HOME": "/hf"} mock_is_mounted.return_value = False cmd = Command(command="echo test", name="cmd") group = CommandGroup(commands=[cmd], name="group", log_dir="/logs") - pipeline = Pipeline(name="test", cluster_config=mock_config, groups=[group]) + # Should raise in __init__ now, not run() with pytest.raises(RuntimeError, match="is not a mounted path"): - pipeline.run(dry_run=True) + Pipeline(name="test", cluster_config=mock_config, jobs=[{"name": "job1", "group": group}]) class TestHetGroupIndices: @@ -403,7 +421,9 @@ def test_het_group_index_non_heterogeneous(self, mock_env_vars, mock_get_exp): cmd2 = Command(command="echo 2", name="cmd2") group = CommandGroup(commands=[cmd1, cmd2], name="group", log_dir="/logs") - pipeline = Pipeline(name="test", cluster_config=mock_config, groups=[group], skip_hf_home_check=True) + pipeline = Pipeline( + name="test", cluster_config=mock_config, jobs=[{"name": "job1", "group": group}], skip_hf_home_check=True + ) pipeline.run(dry_run=True) # Both commands should have None het_group_index (localhost communication) @@ -528,7 +548,11 @@ def test_pipeline_run_after_applies_to_jobs(self, mock_env_vars, mock_get_exp): group = CommandGroup(commands=[cmd], name="group", log_dir="/logs") pipeline = Pipeline( - name="test", cluster_config=mock_config, groups=[group], run_after="other_exp", skip_hf_home_check=True + name="test", + cluster_config=mock_config, + jobs=[{"name": "job1", "group": group}], + run_after="other_exp", + skip_hf_home_check=True, ) # Should not raise and should apply run_after @@ -553,7 +577,7 @@ def test_commandgroup_missing_log_dir(self): cmd = Command(command="echo test", name="cmd") group = CommandGroup(commands=[cmd], name="group") # No log_dir - pipeline = Pipeline(name="test", cluster_config=mock_config, groups=[group]) + pipeline = Pipeline(name="test", cluster_config=mock_config, jobs=[{"name": "job1", "group": group}]) with pytest.raises(ValueError, match="must have log_dir set"): pipeline.run(dry_run=True) From bc59c722976d5674e52a9ed3116da5399e7d8ddf Mon Sep 17 00:00:00 2001 From: George Armstrong Date: Wed, 8 Oct 2025 17:12:20 -0700 Subject: [PATCH 19/29] TST harden mcp client tests Signed-off-by: George Armstrong --- tests/test_mcp_clients.py | 46 +++++++++++++++++++++++++-------------- 1 file changed, 30 insertions(+), 16 deletions(-) diff --git a/tests/test_mcp_clients.py b/tests/test_mcp_clients.py index 879dd60d1c..bd31bc4524 100644 --- a/tests/test_mcp_clients.py +++ b/tests/test_mcp_clients.py @@ -205,6 +205,32 @@ async def execute(self, tool_name: str, arguments: dict, extra_args: dict | None return {"unknown": tool_name, "args": arguments} +# Helper class for test_tool_manager_cache_and_duplicate_detection +# Defined at module level so it can be imported via locate() +class CountingTool(DummyTool): + # Class variable to track calls across instances + _call_count = 0 + + async def list_tools(self): + CountingTool._call_count += 1 + return await super().list_tools() + + @classmethod + def reset_count(cls): + cls._call_count = 0 + + @classmethod + def get_count(cls): + return cls._call_count + + +# Helper class for duplicate tool detection test +class DupTool(DummyTool): + async def list_tools(self): + lst = await super().list_tools() + return [lst[0], lst[0]] # duplicate names within same tool + + @pytest.mark.asyncio async def test_tool_manager_list_and_execute_with_class_locator(): # Register this test module's DummyTool via module locator @@ -219,30 +245,18 @@ async def test_tool_manager_list_and_execute_with_class_locator(): @pytest.mark.asyncio async def test_tool_manager_cache_and_duplicate_detection(): - calls = {"n": 0} - - class CountingTool(DummyTool): - async def list_tools(self): - calls["n"] += 1 - return await super().list_tools() + # Reset counter before test + CountingTool.reset_count() - # Expose CountingTool from this module for locate - globals()["CountingTool"] = CountingTool tm = ToolManager(module_specs=["tests.test_mcp_clients::CountingTool"], overrides={}, context={}) _ = await tm.list_all_tools(use_cache=True) _ = await tm.list_all_tools(use_cache=True) - assert calls["n"] == 1 + assert CountingTool.get_count() == 1 with pytest.raises(ValueError) as excinfo: _ = await tm.list_all_tools(use_cache=False) assert "Duplicate raw tool name across providers: 'execute'" in str(excinfo.value) - assert calls["n"] == 2 - - class DupTool(DummyTool): - async def list_tools(self): - lst = await super().list_tools() - return [lst[0], lst[0]] # duplicate names within same tool + assert CountingTool.get_count() == 2 - globals()["DupTool"] = DupTool tm2 = ToolManager(module_specs=["tests.test_mcp_clients::DupTool"], overrides={}, context={}) tools2 = await tm2.list_all_tools(use_cache=False) names2 = sorted(t["name"] for t in tools2) From 32cc0149ce873599cb8fbef239401fe76b1bdf9e Mon Sep 17 00:00:00 2001 From: George Armstrong Date: Wed, 8 Oct 2025 17:30:28 -0700 Subject: [PATCH 20/29] TST harden mcp client tests Signed-off-by: George Armstrong --- tests/test_mcp_clients.py | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/tests/test_mcp_clients.py b/tests/test_mcp_clients.py index bd31bc4524..878a65c112 100644 --- a/tests/test_mcp_clients.py +++ b/tests/test_mcp_clients.py @@ -205,24 +205,18 @@ async def execute(self, tool_name: str, arguments: dict, extra_args: dict | None return {"unknown": tool_name, "args": arguments} +# Shared state for CountingTool - must be a module-level mutable object +# so that instances created via locate() will reference the same object +_counting_tool_state = {"call_count": 0} + + # Helper class for test_tool_manager_cache_and_duplicate_detection # Defined at module level so it can be imported via locate() class CountingTool(DummyTool): - # Class variable to track calls across instances - _call_count = 0 - async def list_tools(self): - CountingTool._call_count += 1 + _counting_tool_state["call_count"] += 1 return await super().list_tools() - @classmethod - def reset_count(cls): - cls._call_count = 0 - - @classmethod - def get_count(cls): - return cls._call_count - # Helper class for duplicate tool detection test class DupTool(DummyTool): @@ -246,16 +240,16 @@ async def test_tool_manager_list_and_execute_with_class_locator(): @pytest.mark.asyncio async def test_tool_manager_cache_and_duplicate_detection(): # Reset counter before test - CountingTool.reset_count() + _counting_tool_state["call_count"] = 0 tm = ToolManager(module_specs=["tests.test_mcp_clients::CountingTool"], overrides={}, context={}) _ = await tm.list_all_tools(use_cache=True) _ = await tm.list_all_tools(use_cache=True) - assert CountingTool.get_count() == 1 + assert _counting_tool_state["call_count"] == 1 with pytest.raises(ValueError) as excinfo: _ = await tm.list_all_tools(use_cache=False) assert "Duplicate raw tool name across providers: 'execute'" in str(excinfo.value) - assert CountingTool.get_count() == 2 + assert _counting_tool_state["call_count"] == 2 tm2 = ToolManager(module_specs=["tests.test_mcp_clients::DupTool"], overrides={}, context={}) tools2 = await tm2.list_all_tools(use_cache=False) From 4248725836feaeb37e347e4b9f3f6986ab116c80 Mon Sep 17 00:00:00 2001 From: George Armstrong Date: Wed, 8 Oct 2025 17:46:57 -0700 Subject: [PATCH 21/29] TST harden mcp client tests WITH DEBUG Signed-off-by: George Armstrong --- tests/test_mcp_clients.py | 37 +++++++++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/tests/test_mcp_clients.py b/tests/test_mcp_clients.py index 6e850eece8..a2ef34155e 100644 --- a/tests/test_mcp_clients.py +++ b/tests/test_mcp_clients.py @@ -219,16 +219,19 @@ async def execute(self, tool_name: str, arguments: dict, extra_args: dict | None return {"unknown": tool_name, "args": arguments} -# Shared state for CountingTool - must be a module-level mutable object -# so that instances created via locate() will reference the same object -_counting_tool_state = {"call_count": 0} - - # Helper class for test_tool_manager_cache_and_duplicate_detection # Defined at module level so it can be imported via locate() class CountingTool(DummyTool): + # Use a class variable that's mutable to track calls + # This will be shared across all instances + call_count = 0 + + def __init__(self) -> None: + super().__init__() + async def list_tools(self): - _counting_tool_state["call_count"] += 1 + # Increment the class variable + CountingTool.call_count += 1 return await super().list_tools() @@ -253,17 +256,31 @@ async def test_tool_manager_list_and_execute_with_class_locator(): @pytest.mark.asyncio async def test_tool_manager_cache_and_duplicate_detection(): - # Reset counter before test - _counting_tool_state["call_count"] = 0 + import importlib + import sys + + # Reset counter before test - access via sys.modules to ensure we get the right class + this_module = sys.modules[__name__] + CountingToolClass = getattr(this_module, "CountingTool") + CountingToolClass.call_count = 0 + + # Debug: Check what locate() will see + imported_module = importlib.import_module("tests.test_mcp_clients") + imported_class = getattr(imported_module, "CountingTool") + print(f"DEBUG: Same module? {this_module is imported_module}") + print(f"DEBUG: Same class? {CountingToolClass is imported_class}") + print(f"DEBUG: Class IDs: {id(CountingToolClass)} vs {id(imported_class)}") tm = ToolManager(module_specs=["tests.test_mcp_clients::CountingTool"], overrides={}, context={}) _ = await tm.list_all_tools(use_cache=True) _ = await tm.list_all_tools(use_cache=True) - assert _counting_tool_state["call_count"] == 1 + print(f"DEBUG: Count after cached calls: {CountingToolClass.call_count}") + print(f"DEBUG: Imported class count: {imported_class.call_count}") + assert CountingToolClass.call_count == 1, f"Expected 1 call, got {CountingToolClass.call_count}" with pytest.raises(ValueError) as excinfo: _ = await tm.list_all_tools(use_cache=False) assert "Duplicate raw tool name across providers: 'execute'" in str(excinfo.value) - assert _counting_tool_state["call_count"] == 2 + assert CountingToolClass.call_count == 2, f"Expected 2 calls, got {CountingToolClass.call_count}" tm2 = ToolManager(module_specs=["tests.test_mcp_clients::DupTool"], overrides={}, context={}) tools2 = await tm2.list_all_tools(use_cache=False) From c140260186643dbf9f6bfbde923772b896b8e992 Mon Sep 17 00:00:00 2001 From: George Armstrong Date: Wed, 8 Oct 2025 17:55:12 -0700 Subject: [PATCH 22/29] TST add test for generation issue Signed-off-by: George Armstrong --- tests/test_generation.py | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/tests/test_generation.py b/tests/test_generation.py index 0d0f42c586..99440ca4cd 100644 --- a/tests/test_generation.py +++ b/tests/test_generation.py @@ -16,10 +16,12 @@ # running most things through subprocess since that's how it's usually used import subprocess +from unittest.mock import MagicMock import pytest from nemo_skills.evaluation.metrics import ComputeMetrics +from nemo_skills.pipeline.generate import _create_commandgroup_from_config def test_eval_gsm8k_api(tmp_path): @@ -149,3 +151,41 @@ def test_generate_openai_format(tmp_path, format): assert len(data) == 2 assert len(data[0]["generation"]) > 0 assert len(data[1]["generation"]) > 0 + + +def test_server_metadata_from_num_tasks(): + """Test that metadata dict is properly created from server command returning (cmd, num_tasks).""" + mock_server_fn = MagicMock(return_value=("python server.py", 4)) + cluster_config = { + "containers": {"vllm": "nvcr.io/nvidia/nemo:vllm", "nemo-skills": "nvcr.io/nvidia/nemo:skills"}, + "executor": "slurm", + } + server_config = { + "server_type": "vllm", + "num_gpus": 8, + "num_nodes": 1, + "model_path": "/models/test", + "server_port": 5000, + } + + cmd_group = _create_commandgroup_from_config( + generation_cmd="python generate.py", + server_config=server_config, + with_sandbox=False, + sandbox_port=None, + cluster_config=cluster_config, + installation_command=None, + get_server_command_fn=mock_server_fn, + partition=None, + qos=None, + time_min=None, + exclusive=False, + keep_mounts_for_sandbox=False, + task_name="test-task", + log_dir="/tmp/logs", + ) + + server_cmd = cmd_group.commands[0] + assert isinstance(server_cmd.metadata, dict) + assert server_cmd.metadata["num_tasks"] == 4 + assert server_cmd.metadata["gpus"] == 8 From 2116744dc9ffd816234dc7fb172ebf51a0a9dacf Mon Sep 17 00:00:00 2001 From: George Armstrong Date: Wed, 8 Oct 2025 17:56:05 -0700 Subject: [PATCH 23/29] FIX generation pipelin metadata Signed-off-by: George Armstrong --- nemo_skills/pipeline/generate.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/nemo_skills/pipeline/generate.py b/nemo_skills/pipeline/generate.py index 4d07c2f889..436089504e 100644 --- a/nemo_skills/pipeline/generate.py +++ b/nemo_skills/pipeline/generate.py @@ -69,16 +69,15 @@ def _create_commandgroup_from_config( server_container = server_config.pop("container", cluster_config["containers"][server_type]) # Call server command builder directly with cluster_config - cmd, metadata = get_server_command_fn(**server_config, cluster_config=cluster_config) - - # Add additional metadata - metadata.update( - { - "gpus": server_config["num_gpus"], - "nodes": server_config["num_nodes"], - "log_prefix": "server", - } - ) + cmd, num_tasks = get_server_command_fn(**server_config, cluster_config=cluster_config) + + # Create metadata dict + metadata = { + "num_tasks": num_tasks, + "gpus": server_config["num_gpus"], + "nodes": server_config["num_nodes"], + "log_prefix": "server", + } server_cmd = Command( command=cmd, From 5f25397a901068575d7205558bcfa07d5b8e8e7e Mon Sep 17 00:00:00 2001 From: George Armstrong Date: Wed, 8 Oct 2025 18:09:29 -0700 Subject: [PATCH 24/29] TST harden mcp client tests Signed-off-by: George Armstrong --- tests/test_mcp_clients.py | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/tests/test_mcp_clients.py b/tests/test_mcp_clients.py index a2ef34155e..bf893e1e19 100644 --- a/tests/test_mcp_clients.py +++ b/tests/test_mcp_clients.py @@ -245,7 +245,8 @@ async def list_tools(self): @pytest.mark.asyncio async def test_tool_manager_list_and_execute_with_class_locator(): # Register this test module's DummyTool via module locator - tm = ToolManager(module_specs=["tests.test_mcp_clients::DummyTool"], overrides={}, context={}) + # Use __name__ to get actual module path (works in both local and CI) + tm = ToolManager(module_specs=[f"{__name__}::DummyTool"], overrides={}, context={}) tools = await tm.list_all_tools(use_cache=False) names = sorted(t["name"] for t in tools) assert names == ["echo", "execute"] @@ -256,7 +257,6 @@ async def test_tool_manager_list_and_execute_with_class_locator(): @pytest.mark.asyncio async def test_tool_manager_cache_and_duplicate_detection(): - import importlib import sys # Reset counter before test - access via sys.modules to ensure we get the right class @@ -264,25 +264,18 @@ async def test_tool_manager_cache_and_duplicate_detection(): CountingToolClass = getattr(this_module, "CountingTool") CountingToolClass.call_count = 0 - # Debug: Check what locate() will see - imported_module = importlib.import_module("tests.test_mcp_clients") - imported_class = getattr(imported_module, "CountingTool") - print(f"DEBUG: Same module? {this_module is imported_module}") - print(f"DEBUG: Same class? {CountingToolClass is imported_class}") - print(f"DEBUG: Class IDs: {id(CountingToolClass)} vs {id(imported_class)}") - - tm = ToolManager(module_specs=["tests.test_mcp_clients::CountingTool"], overrides={}, context={}) + # Use __name__ to get the actual module path (works in both local and CI environments) + module_path = __name__ + tm = ToolManager(module_specs=[f"{module_path}::CountingTool"], overrides={}, context={}) _ = await tm.list_all_tools(use_cache=True) _ = await tm.list_all_tools(use_cache=True) - print(f"DEBUG: Count after cached calls: {CountingToolClass.call_count}") - print(f"DEBUG: Imported class count: {imported_class.call_count}") assert CountingToolClass.call_count == 1, f"Expected 1 call, got {CountingToolClass.call_count}" with pytest.raises(ValueError) as excinfo: _ = await tm.list_all_tools(use_cache=False) assert "Duplicate raw tool name across providers: 'execute'" in str(excinfo.value) assert CountingToolClass.call_count == 2, f"Expected 2 calls, got {CountingToolClass.call_count}" - tm2 = ToolManager(module_specs=["tests.test_mcp_clients::DupTool"], overrides={}, context={}) + tm2 = ToolManager(module_specs=[f"{module_path}::DupTool"], overrides={}, context={}) tools2 = await tm2.list_all_tools(use_cache=False) names2 = sorted(t["name"] for t in tools2) assert names2 == ["execute"] From 5d8ddd2223027b41223ef32271393d57208cbf2a Mon Sep 17 00:00:00 2001 From: George Armstrong Date: Thu, 9 Oct 2025 13:04:52 -0700 Subject: [PATCH 25/29] add test for dependency issue Signed-off-by: George Armstrong add test for dependency issue Signed-off-by: George Armstrong add test for dependency issue Signed-off-by: George Armstrong --- tests/test_declarative_pipeline.py | 200 +++++++++++++++++++++++++++++ 1 file changed, 200 insertions(+) diff --git a/tests/test_declarative_pipeline.py b/tests/test_declarative_pipeline.py index b13b6f1ed1..deb209e128 100644 --- a/tests/test_declarative_pipeline.py +++ b/tests/test_declarative_pipeline.py @@ -14,10 +14,14 @@ """Tests for the declarative pipeline system.""" +import json from unittest.mock import MagicMock, patch import pytest +from nemo_skills.pipeline.cli import wrap_arguments +from nemo_skills.pipeline.generate import generate +from nemo_skills.pipeline.run_cmd import run_cmd from nemo_skills.pipeline.utils.declarative import Command, CommandGroup, HardwareConfig, Pipeline @@ -583,5 +587,201 @@ def test_commandgroup_missing_log_dir(self): pipeline.run(dry_run=True) +class TestJobDependencies: + """Test job dependencies across experiments.""" + + def test_dependencies_separated_internal_vs_external(self): + """Test that internal and external dependencies are handled differently. + + This verifies the fix for the bug where exp.add() was receiving external + experiment dependencies, causing an assertion error. + + The fix: + - Internal deps (task handles from same experiment) → passed to exp.add() + - External deps (SLURM job IDs from other experiments) → passed to executor + """ + import nemo_run as run + + # Mock get_exp_handles to return fake SLURM job IDs + with patch("nemo_skills.pipeline.utils.declarative.get_exp_handles") as mock_get_handles: + mock_get_handles.return_value = ["slurm_job_12345"] + + # Mock get_exp to avoid actually creating experiments + with patch("nemo_skills.pipeline.utils.declarative.get_exp") as mock_get_exp: + mock_exp = MagicMock(spec=run.Experiment) + mock_exp.__enter__ = MagicMock(return_value=mock_exp) + mock_exp.__exit__ = MagicMock(return_value=False) + # Return different handles for each job + mock_exp.add = MagicMock(side_effect=["task_handle_1", "task_handle_2"]) + mock_get_exp.return_value = mock_exp + + # Mock get_executor to capture what dependencies are passed to it + captured_executor_calls = [] + + def mock_get_executor(**kwargs): + captured_executor_calls.append(kwargs.get("dependencies")) + mock_executor = MagicMock() + mock_executor.packager = MagicMock() + return mock_executor + + with patch("nemo_skills.pipeline.utils.declarative.get_executor", side_effect=mock_get_executor): + # Mock run_exp to avoid actually running + with patch("nemo_skills.pipeline.utils.declarative.run_exp"): + cluster_config = { + "executor": "slurm", + "containers": {"nemo-skills": "test/container"}, + "account": "test", + "env_vars": {"HF_HOME": "/mounted/hf_home"}, + "mounts": ["/mounted/hf_home:/mounted/hf_home"], + } + + # Job 1: depends on external experiment + cmd1 = Command(command="echo job1", name="job1") + group1 = CommandGroup(commands=[cmd1], name="group1", log_dir="/tmp/logs") + + # Job 2: depends on job1 (internal) AND external experiment + cmd2 = Command(command="echo job2", name="job2") + group2 = CommandGroup(commands=[cmd2], name="group2", log_dir="/tmp/logs") + + job1_spec = { + "name": "job1", + "group": group1, + "dependencies": ["external_experiment"], # External only + } + + job2_spec = { + "name": "job2", + "group": group2, + "dependencies": [job1_spec, "another_external_experiment"], # Both internal and external + } + + pipeline = Pipeline( + name="test_pipeline", + cluster_config=cluster_config, + jobs=[job1_spec, job2_spec], + skip_hf_home_check=True, + ) + + # Run the pipeline (mocked) + pipeline.run(dry_run=False) + + # Verify executor calls + assert len(captured_executor_calls) == 2 + + # Job 1: should have external deps passed to executor + assert captured_executor_calls[0] == ["slurm_job_12345"] + + # Job 2: should have external deps passed to executor (from another_external_experiment) + assert captured_executor_calls[1] == ["slurm_job_12345"] # From another_external_experiment + + # Verify exp.add calls + assert mock_exp.add.call_count == 2 + + # Job 1: should have no internal deps + call1_kwargs = mock_exp.add.call_args_list[0][1] + assert call1_kwargs["dependencies"] is None + + # Job 2: should have internal deps (task_handle_1 from job1) + call2_kwargs = mock_exp.add.call_args_list[1][1] + assert call2_kwargs["dependencies"] == ["task_handle_1"] + + def test_run_after_dependencies_across_experiments(self, tmp_path): + """Test that run_after dependencies work when chaining multiple generate/run_cmd calls. + + This test verifies that when you call: + 1. generate() with expname="exp1" + 2. run_cmd() with expname="exp2" and run_after=["exp1"] + 3. generate() with expname="exp3" and run_after=["exp2"] + + The dependencies are correctly set up and exp3 waits for exp2, which waits for exp1. + """ + # Setup + output_dir = str(tmp_path) + input_file = f"{tmp_path}/input.jsonl" + + # Create dummy input file + with open(input_file, "w") as f: + f.write(json.dumps({"problem": "test"}) + "\n") + + # Step 1: First generation task + # Without the fix, this would work (no external deps) + exp1 = generate( + ctx=wrap_arguments("++max_samples=1"), + cluster="local", + input_file=input_file, + output_dir=f"{output_dir}/step1/", + model="nvidia/nvidia-nemotron-nano-9b-v2", + server_type="openai", + server_address="https://integrate.api.nvidia.com/v1", + expname="test_exp1", + reuse_code=False, # Disable code reuse for simpler test + dry_run=True, + ) + + # Step 2: Run command that depends on exp1 + # This tests that dependencies work across separate function calls + exp2 = run_cmd( + ctx=wrap_arguments("echo 'processing'"), + cluster="local", + log_dir=f"{output_dir}/step2-logs", + expname="test_exp2", + run_after=["test_exp1"], + reuse_code=False, # Disable code reuse for simpler test + dry_run=True, + ) + + # Step 3: Second generation that depends on exp2 + # This further tests chaining of dependencies + exp3 = generate( + ctx=wrap_arguments("++max_samples=1"), + cluster="local", + input_file=f"{output_dir}/step1/output.jsonl", + output_dir=f"{output_dir}/step3/", + model="nvidia/nvidia-nemotron-nano-9b-v2", + server_type="openai", + server_address="https://integrate.api.nvidia.com/v1", + expname="test_exp3", + run_after=["test_exp2"], + reuse_code=False, # Disable code reuse for simpler test + dry_run=True, + ) + + # Verify all experiments were created successfully + # The key test is that NO errors were raised above + # (Detailed dependency routing is verified in test_dependencies_separated_internal_vs_external) + assert exp1 is not None + assert exp2 is not None + assert exp3 is not None + + def test_run_after_with_nonexistent_experiment(self): + """Test that using run_after with a non-existent experiment gives a proper warning.""" + from nemo_skills.pipeline.utils.exp import get_exp_handles + + # This should return an empty list and log a warning + handles = get_exp_handles("nonexistent_experiment_12345", ignore_exp_not_exists=True) + assert handles == [] + + def test_run_after_with_experiment_object(self): + """Test that run_after can accept an experiment object directly.""" + import nemo_run as run + + from nemo_skills.pipeline.utils.exp import get_exp_handles + + # Create a mock experiment + mock_exp = MagicMock(spec=run.Experiment) + mock_exp.status.return_value = {"task1": {"status": "RUNNING", "handle": "slurm_job_123"}} + + # Test that we can get handles from an experiment object + with patch("nemo_skills.pipeline.utils.exp.AppState") as mock_app_state: + mock_app_state.RUNNING = "RUNNING" + mock_app_state.PENDING = "PENDING" + mock_app_state.SUBMITTED = "SUBMITTED" + mock_app_state.UNKNOWN = "UNKNOWN" + + handles = get_exp_handles(mock_exp, ignore_finished=True) + assert len(handles) == 1 + assert handles[0] == "slurm_job_123" + + if __name__ == "__main__": pytest.main([__file__, "-v"]) From 60543216f595305e1c347ae82f73cb2772e7a503 Mon Sep 17 00:00:00 2001 From: George Armstrong Date: Thu, 9 Oct 2025 13:10:10 -0700 Subject: [PATCH 26/29] FIx internal vs. external deps Signed-off-by: George Armstrong --- nemo_skills/pipeline/utils/declarative.py | 65 +++++++++++++++-------- 1 file changed, 42 insertions(+), 23 deletions(-) diff --git a/nemo_skills/pipeline/utils/declarative.py b/nemo_skills/pipeline/utils/declarative.py index 64d942c70c..7d9b1bfe7c 100644 --- a/nemo_skills/pipeline/utils/declarative.py +++ b/nemo_skills/pipeline/utils/declarative.py @@ -344,8 +344,11 @@ def run( for job_spec in self.jobs: job_name = job_spec["name"] # Already validated in _validate() - # Resolve dependencies to task handles - run_after_deps = [] + # Separate internal and external dependencies from the start + # - Internal deps (task handles from current experiment) go to exp.add() + # - External deps (SLURM job IDs from other experiments) go to executor + internal_deps = [] + external_deps = [] # Handle dependencies from job spec job_dependencies = job_spec.get("dependencies", []) @@ -369,18 +372,18 @@ def run( ) # If no experiment found, treat as direct task handle (for _reuse_exp case) if _reuse_exp: - run_after_deps.append(dep) + external_deps.append(dep) LOG.info( f"Job '{job_name}' depends on task handle '{dep}' (from reused experiment)" ) else: - run_after_deps.extend(exp_handles) + external_deps.extend(exp_handles) LOG.info( f"Job '{job_name}' depends on external experiment '{dep}' ({len(exp_handles)} tasks)" ) elif _reuse_exp: # For non-SLURM executors with _reuse_exp, treat as task handle - run_after_deps.append(dep) + external_deps.append(dep) LOG.info(f"Job '{job_name}' depends on task handle '{dep}'") elif isinstance(dep, dict): # Dict dependency = internal job reference (by job spec object) @@ -388,7 +391,7 @@ def run( if not dep_name: raise ValueError(f"Job dependency must have a 'name' field: {dep}") if dep_name in job_name_to_handle: - run_after_deps.append(job_name_to_handle[dep_name]) + internal_deps.append(job_name_to_handle[dep_name]) LOG.info( f"Job '{job_name}' depends on internal job '{dep_name}' (handle: {job_name_to_handle[dep_name]})" ) @@ -399,12 +402,12 @@ def run( ) else: # Direct task handle object (not string or dict) - run_after_deps.append(dep) + internal_deps.append(dep) LOG.info(f"Job '{job_name}' depends on task handle (object)") - # Convert empty list to None for cleaner handling - if len(run_after_deps) == 0: - run_after_deps = None + # Convert empty lists to None for cleaner handling + internal_deps = internal_deps if internal_deps else None + external_deps = external_deps if external_deps else None # Check if this is a multi-group job or single group if "groups" in job_spec: @@ -415,7 +418,8 @@ def run( job_spec["groups"][0], self.cluster_config, default_log_dir=log_dir, - run_after=run_after_deps if run_after_deps else None, + internal_deps=internal_deps, + external_deps=external_deps, ) else: # True multi-group: combine multiple groups into one heterogeneous SLURM job @@ -424,7 +428,8 @@ def run( job_spec["groups"], self.cluster_config, default_log_dir=log_dir, - run_after=run_after_deps if run_after_deps else None, + internal_deps=internal_deps, + external_deps=external_deps, ) elif "group" in job_spec: # Single group job @@ -433,7 +438,8 @@ def run( job_spec["group"], self.cluster_config, default_log_dir=log_dir, - run_after=run_after_deps if run_after_deps else None, + internal_deps=internal_deps, + external_deps=external_deps, ) else: raise ValueError(f"Job spec must have either 'group' or 'groups': {job_spec}") @@ -490,6 +496,7 @@ def _create_executor( het_group: int, total_het_groups: int, overlap: bool, + dependencies: Optional[List] = None, ): """Create executor with optional environment update.""" env_context = ( @@ -518,6 +525,7 @@ def _create_executor( mounts=exec_config.get("mounts"), with_ray=self.with_ray, slurm_kwargs={"exclusive": hardware.exclusive} if (hardware and hardware.exclusive) else None, + dependencies=dependencies, ) def _plan_and_add_job( @@ -526,17 +534,19 @@ def _plan_and_add_job( groups: List[CommandGroup], cluster_config: Dict, default_log_dir: Optional[str] = None, - run_after: Optional[List] = None, + internal_deps: Optional[List] = None, + external_deps: Optional[List] = None, heterogeneous: bool = False, ) -> str: """Plan commands/executors for one or more groups and add to experiment. This encapsulates shared logic between single-group and multi-group jobs. Behavior differences are controlled by the 'heterogeneous' flag and the provided 'groups'. - """ - # Pass dependencies through (nemo-run handles executor-specific behavior) - dependencies = run_after if run_after else None + Args: + internal_deps: Task handles from same experiment (passed to exp.add()) + external_deps: SLURM job IDs from other experiments (passed to executor) + """ # Resolve log directory (use first group's log_dir if present) log_dir = groups[0].log_dir or default_log_dir @@ -592,6 +602,8 @@ def _plan_and_add_job( # Resolve container and create executor container_image = self._resolve_container(exec_config, command, cluster_config) + # Pass external dependencies only to the first executor (SLURM doesn't support per-component dependencies in hetjobs) + exec_dependencies = external_deps if (het_idx == 0 and comp_idx == 0) else None executor = self._create_executor( command, exec_config, @@ -603,6 +615,7 @@ def _plan_and_add_job( het_idx if heterogeneous else comp_idx, total_het_groups, (len(group.commands) > 1), + dependencies=exec_dependencies, ) # Share packager across executors for single-group jobs @@ -664,12 +677,14 @@ def _plan_and_add_job( metadata = None # Add to experiment and return task ID + # Note: Internal dependencies (task handles from same experiment) go to exp.add() + # External dependencies (SLURM job IDs from other experiments) go to executor if (not heterogeneous) and len(commands) == 1: task_id = exp.add( run.Script(inline=commands[0], metadata=metadata), executor=executors[0], name="nemo-run", - dependencies=dependencies, + dependencies=internal_deps, ) else: task_id = exp.add( @@ -679,7 +694,7 @@ def _plan_and_add_job( ], executor=executors, name="nemo-run", - dependencies=dependencies, + dependencies=internal_deps, ) return task_id @@ -690,7 +705,8 @@ def _add_single_group_job( group: CommandGroup, cluster_config: Dict, default_log_dir: Optional[str] = None, - run_after: Optional[List] = None, + internal_deps: Optional[List] = None, + external_deps: Optional[List] = None, ) -> str: """Add a single CommandGroup as one job and return its task handle.""" @@ -699,7 +715,8 @@ def _add_single_group_job( groups=[group], cluster_config=cluster_config, default_log_dir=default_log_dir, - run_after=run_after, + internal_deps=internal_deps, + external_deps=external_deps, heterogeneous=False, ) @@ -709,7 +726,8 @@ def _add_multi_group_job( groups: List[CommandGroup], cluster_config: Dict, default_log_dir: Optional[str] = None, - run_after: Optional[List] = None, + internal_deps: Optional[List] = None, + external_deps: Optional[List] = None, ) -> str: """Add multiple CommandGroups as a single heterogeneous SLURM job and return task handle.""" @@ -718,6 +736,7 @@ def _add_multi_group_job( groups=groups, cluster_config=cluster_config, default_log_dir=default_log_dir, - run_after=run_after, + internal_deps=internal_deps, + external_deps=external_deps, heterogeneous=True, ) From bdfd79f3b8ce0b85848394dc717d0957852d30af Mon Sep 17 00:00:00 2001 From: George Armstrong Date: Thu, 9 Oct 2025 13:37:26 -0700 Subject: [PATCH 27/29] TST fix and improve tests Signed-off-by: George Armstrong --- tests/test_declarative_pipeline.py | 100 +++++++++++++++++++++++++++-- 1 file changed, 93 insertions(+), 7 deletions(-) diff --git a/tests/test_declarative_pipeline.py b/tests/test_declarative_pipeline.py index deb209e128..ffc3a13622 100644 --- a/tests/test_declarative_pipeline.py +++ b/tests/test_declarative_pipeline.py @@ -15,6 +15,7 @@ """Tests for the declarative pipeline system.""" import json +import os from unittest.mock import MagicMock, patch import pytest @@ -315,7 +316,7 @@ def test_pipeline_run_basic(self, mock_run_exp, mock_env_vars, mock_get_exp): @patch("nemo_skills.pipeline.utils.declarative.get_env_variables") @patch("nemo_skills.pipeline.utils.declarative.run_exp") def test_pipeline_run_with_dependencies(self, mock_run_exp, mock_env_vars, mock_get_exp): - """Test pipeline execution with job dependencies.""" + """Test pipeline execution with internal job dependencies.""" # Setup mocks mock_config = { "executor": "none", @@ -327,7 +328,7 @@ def test_pipeline_run_with_dependencies(self, mock_run_exp, mock_env_vars, mock_ mock_exp.add.side_effect = ["handle_1", "handle_2"] mock_get_exp.return_value.__enter__.return_value = mock_exp - # Create pipeline with dependencies + # Create pipeline with internal dependencies cmd1 = Command(command="echo 1", name="cmd1") group1 = CommandGroup(commands=[cmd1], name="group1", log_dir="/logs") @@ -335,7 +336,7 @@ def test_pipeline_run_with_dependencies(self, mock_run_exp, mock_env_vars, mock_ group2 = CommandGroup(commands=[cmd2], name="group2", log_dir="/logs") job1 = {"name": "job1", "group": group1, "dependencies": []} - job2 = {"name": "job2", "group": group2, "dependencies": [job1]} + job2 = {"name": "job2", "group": group2, "dependencies": [job1]} # Internal dependency pipeline = Pipeline(name="test", cluster_config=mock_config, jobs=[job1, job2], skip_hf_home_check=True) @@ -345,6 +346,16 @@ def test_pipeline_run_with_dependencies(self, mock_run_exp, mock_env_vars, mock_ # Verify both jobs were added assert mock_exp.add.call_count == 2 + # Verify job1 has no dependencies + call1_kwargs = mock_exp.add.call_args_list[0][1] + assert call1_kwargs["dependencies"] is None or call1_kwargs["dependencies"] == [] + + # Verify job2 has internal dependency on job1 (handle_1) + call2_kwargs = mock_exp.add.call_args_list[1][1] + assert call2_kwargs["dependencies"] == ["handle_1"], ( + f"Job2 should depend on job1's handle, got {call2_kwargs['dependencies']}" + ) + @patch("nemo_skills.pipeline.utils.declarative.get_exp") @patch("nemo_skills.pipeline.utils.declarative.get_env_variables") @patch("nemo_skills.pipeline.utils.declarative.is_mounted_filepath") @@ -590,6 +601,74 @@ def test_commandgroup_missing_log_dir(self): class TestJobDependencies: """Test job dependencies across experiments.""" + def test_multiple_internal_dependencies(self): + """Test that a job can depend on multiple internal jobs.""" + import nemo_run as run + + with patch("nemo_skills.pipeline.utils.declarative.get_exp") as mock_get_exp: + mock_exp = MagicMock(spec=run.Experiment) + mock_exp.__enter__ = MagicMock(return_value=mock_exp) + mock_exp.__exit__ = MagicMock(return_value=False) + # Return handles for 3 jobs + mock_exp.add = MagicMock(side_effect=["handle_1", "handle_2", "handle_3"]) + mock_get_exp.return_value = mock_exp + + with patch("nemo_skills.pipeline.utils.declarative.get_executor") as mock_executor: + mock_executor.return_value = MagicMock(packager=MagicMock()) + + with patch("nemo_skills.pipeline.utils.declarative.run_exp"): + cluster_config = { + "executor": "slurm", + "containers": {"nemo-skills": "test/container"}, + "account": "test", + "env_vars": {"HF_HOME": "/mounted/hf_home"}, + "mounts": ["/mounted/hf_home:/mounted/hf_home"], + } + + # Job 1 and Job 2: independent + cmd1 = Command(command="echo job1", name="job1") + group1 = CommandGroup(commands=[cmd1], name="group1", log_dir="/tmp/logs") + + cmd2 = Command(command="echo job2", name="job2") + group2 = CommandGroup(commands=[cmd2], name="group2", log_dir="/tmp/logs") + + # Job 3: depends on both job1 and job2 + cmd3 = Command(command="echo job3", name="job3") + group3 = CommandGroup(commands=[cmd3], name="group3", log_dir="/tmp/logs") + + job1_spec = {"name": "job1", "group": group1} + job2_spec = {"name": "job2", "group": group2} + job3_spec = { + "name": "job3", + "group": group3, + "dependencies": [job1_spec, job2_spec], # Multiple internal dependencies + } + + pipeline = Pipeline( + name="test_pipeline", + cluster_config=cluster_config, + jobs=[job1_spec, job2_spec, job3_spec], + skip_hf_home_check=True, + reuse_code=False, # Disable code reuse to avoid mock issues + ) + + pipeline.run(dry_run=True) + + # Verify all jobs were added + assert mock_exp.add.call_count == 3 + + # Job 1 and 2 should have no internal dependencies + call1_kwargs = mock_exp.add.call_args_list[0][1] + call2_kwargs = mock_exp.add.call_args_list[1][1] + assert call1_kwargs["dependencies"] is None + assert call2_kwargs["dependencies"] is None + + # Job 3 should depend on both job1 and job2 + call3_kwargs = mock_exp.add.call_args_list[2][1] + assert call3_kwargs["dependencies"] == ["handle_1", "handle_2"], ( + f"Job3 should depend on both handle_1 and handle_2, got {call3_kwargs['dependencies']}" + ) + def test_dependencies_separated_internal_vs_external(self): """Test that internal and external dependencies are handled differently. @@ -660,10 +739,11 @@ def mock_get_executor(**kwargs): cluster_config=cluster_config, jobs=[job1_spec, job2_spec], skip_hf_home_check=True, + reuse_code=False, # Disable code reuse to avoid mock issues ) # Run the pipeline (mocked) - pipeline.run(dry_run=False) + pipeline.run(dry_run=True) # Verify executor calls assert len(captured_executor_calls) == 2 @@ -703,11 +783,15 @@ def test_run_after_dependencies_across_experiments(self, tmp_path): with open(input_file, "w") as f: f.write(json.dumps({"problem": "test"}) + "\n") + # Use test-local cluster config for CI compatibility + test_config_dir = os.path.join(os.path.dirname(__file__), "gpu-tests") + # Step 1: First generation task # Without the fix, this would work (no external deps) exp1 = generate( ctx=wrap_arguments("++max_samples=1"), - cluster="local", + cluster="test-local", + config_dir=test_config_dir, input_file=input_file, output_dir=f"{output_dir}/step1/", model="nvidia/nvidia-nemotron-nano-9b-v2", @@ -722,7 +806,8 @@ def test_run_after_dependencies_across_experiments(self, tmp_path): # This tests that dependencies work across separate function calls exp2 = run_cmd( ctx=wrap_arguments("echo 'processing'"), - cluster="local", + cluster="test-local", + config_dir=test_config_dir, log_dir=f"{output_dir}/step2-logs", expname="test_exp2", run_after=["test_exp1"], @@ -734,7 +819,8 @@ def test_run_after_dependencies_across_experiments(self, tmp_path): # This further tests chaining of dependencies exp3 = generate( ctx=wrap_arguments("++max_samples=1"), - cluster="local", + cluster="test-local", + config_dir=test_config_dir, input_file=f"{output_dir}/step1/output.jsonl", output_dir=f"{output_dir}/step3/", model="nvidia/nvidia-nemotron-nano-9b-v2", From 2563cf38c4f62775b5b8d81f4be39acf094b88c9 Mon Sep 17 00:00:00 2001 From: George Armstrong Date: Thu, 9 Oct 2025 13:57:53 -0700 Subject: [PATCH 28/29] FIX reuse_exp Signed-off-by: George Armstrong --- nemo_skills/pipeline/utils/declarative.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/nemo_skills/pipeline/utils/declarative.py b/nemo_skills/pipeline/utils/declarative.py index 7d9b1bfe7c..f9d7db3bc4 100644 --- a/nemo_skills/pipeline/utils/declarative.py +++ b/nemo_skills/pipeline/utils/declarative.py @@ -372,7 +372,7 @@ def run( ) # If no experiment found, treat as direct task handle (for _reuse_exp case) if _reuse_exp: - external_deps.append(dep) + internal_deps.append(dep) LOG.info( f"Job '{job_name}' depends on task handle '{dep}' (from reused experiment)" ) @@ -382,9 +382,9 @@ def run( f"Job '{job_name}' depends on external experiment '{dep}' ({len(exp_handles)} tasks)" ) elif _reuse_exp: - # For non-SLURM executors with _reuse_exp, treat as task handle - external_deps.append(dep) - LOG.info(f"Job '{job_name}' depends on task handle '{dep}'") + # For non-SLURM executors with _reuse_exp, string deps are internal task handles + internal_deps.append(dep) + LOG.info(f"Job '{job_name}' depends on task handle '{dep}' (from reused experiment)") elif isinstance(dep, dict): # Dict dependency = internal job reference (by job spec object) dep_name = dep.get("name") From d3e83163d20eca2a167c3e454e474a4f9ae6965e Mon Sep 17 00:00:00 2001 From: George Armstrong Date: Thu, 9 Oct 2025 17:05:50 -0700 Subject: [PATCH 29/29] MAINT cascade skip_hf_home_check Signed-off-by: George Armstrong --- nemo_skills/pipeline/utils/declarative.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/nemo_skills/pipeline/utils/declarative.py b/nemo_skills/pipeline/utils/declarative.py index f9d7db3bc4..956dcbd08b 100644 --- a/nemo_skills/pipeline/utils/declarative.py +++ b/nemo_skills/pipeline/utils/declarative.py @@ -276,7 +276,7 @@ def __init__( jobs: List[Dict], reuse_code: bool = True, reuse_code_exp: Optional[str] = None, - skip_hf_home_check: bool = False, + skip_hf_home_check: bool | None = None, with_ray: bool = False, run_after: Optional[Union[str, List[str]]] = None, # Pipeline-level dependency on other experiments ): @@ -284,6 +284,9 @@ def __init__( self.cluster_config = cluster_config self.reuse_code = reuse_code self.reuse_code_exp = reuse_code_exp + # If not explicitly set, resolve from cluster config (matching exp.py behavior) + if skip_hf_home_check is None: + skip_hf_home_check = cluster_config.get("skip_hf_home_check", False) self.skip_hf_home_check = skip_hf_home_check self.with_ray = with_ray self.run_after = run_after