From 69c02f20a3f38f72e44c0d60f47993db4793c20b Mon Sep 17 00:00:00 2001
From: George Armstrong <georgea@nvidia.com>
Date: Mon, 29 Sep 2025 13:43:17 -0700
Subject: [PATCH 01/29] ENH refactor pipeline to be extensible

Signed-off-by: George Armstrong <georgea@nvidia.com>

WIP add more declarative syntax

Signed-off-by: George Armstrong <georgea@nvidia.com>

add crossgroup sandbox

Signed-off-by: George Armstrong <georgea@nvidia.com>

WIP make cross group built in

Signed-off-by: George Armstrong <georgea@nvidia.com>

add run_cmd

Signed-off-by: George Armstrong <georgea@nvidia.com>

small fixes

Signed-off-by: George Armstrong <georgea@nvidia.com>

WIP add variable definition and getting utilities

Signed-off-by: George Armstrong <georgea@nvidia.com>

get het group working again

Signed-off-by: George Armstrong <georgea@nvidia.com>

MAINT add simple example

WIP fixing hetgroup -- better but host is broken

Signed-off-by: George Armstrong <georgea@nvidia.com>

multi hetgroup example

Signed-off-by: George Armstrong <georgea@nvidia.com>

cleanup

Signed-off-by: George Armstrong <georgea@nvidia.com>

FIX het group cross references

Signed-off-by: George Armstrong <georgea@nvidia.com>

MAINT simplify declarative

Signed-off-by: George Armstrong <georgea@nvidia.com>

Maint remove stuff

MAINT more simplification and improvements

Signed-off-by: George Armstrong <georgea@nvidia.com>

MAINT make dependencies work

Signed-off-by: George Armstrong <georgea@nvidia.com>

make example simplistic but functional

Signed-off-by: George Armstrong <georgea@nvidia.com>

MAINT remove v2 examples

WIP working send to remote

Signed-off-by: George Armstrong <georgea@nvidia.com>

add commands

Signed-off-by: George Armstrong <georgea@nvidia.com>

MAINT renaming and additional docs

Signed-off-by: George Armstrong <georgea@nvidia.com>

MAINT remove examples

MAINT remove unused files

MAINT fix the dep suffix

Signed-off-by: George Armstrong <georgea@nvidia.com>

FIXED timing issue on creation

Signed-off-by: George Armstrong <georgea@nvidia.com>

resolve more differences

Signed-off-by: George Armstrong <georgea@nvidia.com>

MAINT revert generate.py to refactored?

Signed-off-by: George Armstrong <georgea@nvidia.com>

MAINT respect log dir

Signed-off-by: George Armstrong <georgea@nvidia.com>

MAINT cleanup and FIX run_after

Signed-off-by: George Armstrong <georgea@nvidia.com>

MAINT rename stage back to pipeline

MAINT rename stage back to pipeline

FIX log dir and output dir setting

Signed-off-by: George Armstrong <georgea@nvidia.com>

MAINT simplify declarative

Signed-off-by: George Armstrong <georgea@nvidia.com>

MAINT remvoe extra abstraction leak

move imports to top

MAINT support using single group job if groups is only one item

Signed-off-by: George Armstrong <georgea@nvidia.com>

simplify unneeded path

Signed-off-by: George Armstrong <georgea@nvidia.com>

remove special sandbox handling

refactor pipeline

Signed-off-by: George Armstrong <georgea@nvidia.com>

fix pipeline

Signed-off-by: George Armstrong <georgea@nvidia.com>

remove extra generate

MAINT move import

Signed-off-by: George Armstrong <georgea@nvidia.com>
---
 nemo_skills/pipeline/generate.py          | 267 ++++++--
 nemo_skills/pipeline/utils/commands.py    | 144 +++++
 nemo_skills/pipeline/utils/declarative.py | 738 ++++++++++++++++++++++
 3 files changed, 1106 insertions(+), 43 deletions(-)
 create mode 100644 nemo_skills/pipeline/utils/commands.py
 create mode 100644 nemo_skills/pipeline/utils/declarative.py

diff --git a/nemo_skills/pipeline/generate.py b/nemo_skills/pipeline/generate.py
index 5543c9c6c8..626fe62fb1 100644
--- a/nemo_skills/pipeline/generate.py
+++ b/nemo_skills/pipeline/generate.py
@@ -14,7 +14,7 @@
 import importlib
 import logging
 import os
-from typing import List
+from typing import Callable, Dict, List, Optional
 
 import typer
 
@@ -22,6 +22,8 @@
 from nemo_skills.dataset.utils import import_from_path
 from nemo_skills.inference import GENERATION_MODULE_MAP, GenerationType
 from nemo_skills.pipeline.app import app, typer_unpacker
+from nemo_skills.pipeline.utils.commands import sandbox_command
+from nemo_skills.pipeline.utils.declarative import Command, CommandGroup, HardwareConfig, Pipeline
 from nemo_skills.utils import (
     compute_chunk_ids,
     get_logger_name,
@@ -35,6 +37,113 @@
 # TODO: add num_jobs here for consistency with eval?
 
 
+def _create_commandgroup_from_config(
+    generation_cmd: str,
+    server_config: Optional[Dict],
+    with_sandbox: bool,
+    sandbox_port: Optional[int],
+    cluster_config: Dict,
+    installation_command: Optional[str],
+    get_server_command_fn: Callable,
+    partition: Optional[str],
+    time_min: Optional[str],
+    exclusive: bool,
+    task_name: str,
+    log_dir: str,
+) -> CommandGroup:
+    """Create a CommandGroup from server_config, matching add_task logic.
+
+    This function replicates the component ordering from add_task:
+    1. Server (if server_config provided)
+    2. Main client command
+    3. Sandbox (if with_sandbox=True)
+    """
+
+    components = []
+
+    # 1. Add server if server_config is provided (matches add_task lines 433-463)
+    if server_config is not None and int(server_config["num_gpus"]) > 0:
+        # Extract server container (matches line 435)
+        server_type = server_config["server_type"]
+        server_container = server_config.pop("container", cluster_config["containers"][server_type])
+
+        # Use the EXISTING get_server_command function via lambda
+        # This ensures we use the same server command construction as add_task
+        server_config_copy = server_config.copy()
+
+        def make_server_cmd(cfg):
+            cmd, num_tasks = get_server_command_fn(**server_config_copy, cluster_config=cfg)
+            # Include log_prefix in metadata to match old add_task behavior
+            return (
+                cmd,
+                {
+                    "num_tasks": num_tasks,
+                    "gpus": server_config_copy["num_gpus"],
+                    "nodes": server_config_copy["num_nodes"],
+                    "log_prefix": "server",  # Explicitly set log prefix
+                },
+            )
+
+        server_cmd = Command(
+            command=make_server_cmd,
+            container=server_container,
+            gpus=server_config["num_gpus"],
+            nodes=server_config["num_nodes"],
+            name=task_name,  # Use base task_name, not with _server suffix
+        )
+        components.append(server_cmd)
+
+    # 2. Add main generation command (matches add_task lines 466-580)
+    client_cmd = Command(
+        command=generation_cmd,  # Already built with get_generation_cmd!
+        container=cluster_config["containers"]["nemo-skills"],
+        name=task_name,  # Use base task_name, not with _client suffix
+        installation_command=installation_command,
+        metadata={"log_prefix": "main"},  # Set log prefix to match old add_task
+    )
+    components.append(client_cmd)
+
+    # 3. Add sandbox if requested (matches add_task lines 527-565)
+    if with_sandbox:
+        # Use existing sandbox_command builder
+        # Wrap to ensure log_prefix is set in returned metadata
+        def make_sandbox_cmd(cfg):
+            # sandbox_command returns (callable, metadata), so we need to call the callable
+            cmd_builder, initial_metadata = sandbox_command(port=sandbox_port)
+            # Call the builder to get the actual command string
+            cmd_string, runtime_metadata = cmd_builder(cfg)
+            # Merge metadata
+            metadata = initial_metadata.copy()
+            metadata.update(runtime_metadata)
+            metadata["log_prefix"] = "sandbox"  # Explicitly set log prefix
+            return (cmd_string, metadata)
+
+        sandbox_cmd = Command(
+            command=make_sandbox_cmd,
+            container=cluster_config["containers"]["sandbox"],
+            name=task_name,  # Use base task_name, not with _sandbox suffix
+        )
+        components.append(sandbox_cmd)
+
+    # Find MAXIMUM GPUs needed by any component for the HardwareConfig
+    # This is critical for multi-component jobs - the job-level request must be the max
+    max_gpus = max((comp.gpus or 0) for comp in components)
+    max_nodes = max((comp.nodes or 1) for comp in components)
+
+    return CommandGroup(
+        commands=components,
+        hardware=HardwareConfig(
+            partition=partition,
+            time_min=time_min,
+            exclusive=exclusive,
+            num_gpus=max_gpus,
+            num_nodes=max_nodes,
+        ),
+        name=task_name,
+        log_dir=log_dir,
+    )
+
+
 @app.command(context_settings={"allow_extra_args": True, "ignore_unknown_options": True})
 @typer_unpacker
 def generate(
@@ -257,46 +366,101 @@ def generate(
         chunk_ids=chunk_ids,
         rerun_done=rerun_done,
     )
-    has_tasks = False
-    all_tasks = []
+
     if _task_dependencies is None:
         _task_dependencies = []
-    with pipeline_utils.get_exp(expname, cluster_config, _reuse_exp) as exp:
-        for seed_idx, (seed, chunk_ids) in enumerate(remaining_jobs.items()):
-            if wandb_parameters:
-                # no need for chunks as it will run after merging
-                wandb_parameters["samples_file"] = pipeline_utils.get_chunked_rs_filename(
-                    output_dir,
-                    random_seed=seed,
-                    chunk_id=None,
-                )
-            for chunk_id in chunk_ids:
-                has_tasks = True
-                server_config, server_address, extra_arguments = pipeline_utils.configure_client(
-                    model=model,
-                    server_type=server_type,
-                    server_address=original_server_address,
-                    server_gpus=server_gpus,
-                    server_nodes=server_nodes,
-                    server_args=server_args,
-                    server_entrypoint=server_entrypoint,
-                    server_container=server_container,
-                    extra_arguments=extra_arguments_original,
-                    get_random_port=get_random_port,
+
+    # Build jobs list using declarative interface
+    jobs = []
+    all_job_names = []
+
+    for seed_idx, (seed, chunk_ids) in enumerate(remaining_jobs.items()):
+        if wandb_parameters:
+            # no need for chunks as it will run after merging
+            wandb_parameters["samples_file"] = pipeline_utils.get_chunked_rs_filename(
+                output_dir,
+                random_seed=seed,
+                chunk_id=None,
+            )
+        for chunk_id in chunk_ids:
+            # Configure client (same as before)
+            server_config, server_address, extra_arguments = pipeline_utils.configure_client(
+                model=model,
+                server_type=server_type,
+                server_address=original_server_address,
+                server_gpus=server_gpus,
+                server_nodes=server_nodes,
+                server_args=server_args,
+                server_entrypoint=server_entrypoint,
+                server_container=server_container,
+                extra_arguments=extra_arguments_original,
+                get_random_port=get_random_port,
+            )
+
+            # Build generation command (same as before)
+            cmd = pipeline_utils.get_generation_cmd(
+                input_file=input_file,
+                input_dir=input_dir,
+                random_seed=seed,
+                output_dir=output_dir,
+                extra_arguments=extra_arguments,
+                eval_args=eval_args,
+                chunk_id=chunk_id,
+                num_chunks=num_chunks,
+                preprocess_cmd=preprocess_cmd,
+                postprocess_cmd=postprocess_cmd,
+                wandb_parameters=wandb_parameters if seed_idx == 0 else None,
+                script=generation_module,
+            )
+            cmd = pipeline_utils.wrap_python_path(cmd=cmd)
+
+            # Base task name (same for all dependent jobs in chain - matches original!)
+            task_name = f"{expname}-rs{seed}" if seed is not None else expname
+            if chunk_id is not None:
+                task_name += f"-chunk{chunk_id}"
+
+            # Handle dependent_jobs chain (matching add_task behavior)
+            # Note: run_after will be handled by Stage for jobs that don't have task_dependencies
+            dependencies = _task_dependencies.copy() if _task_dependencies else []
+
+            for dep_idx in range(dependent_jobs + 1):
+                # Create CommandGroup for this task
+                cmd_group = _create_commandgroup_from_config(
+                    generation_cmd=cmd,
+                    server_config=server_config.copy() if server_config else None,
+                    with_sandbox=with_sandbox,
+                    sandbox_port=None if get_random_port else 6000,
+                    cluster_config=cluster_config,
+                    installation_command=installation_command,
+                    get_server_command_fn=generation_task.get_server_command_fn(),
+                    partition=partition,
+                    time_min=time_min,
+                    exclusive=exclusive,
+                    task_name=task_name,
+                    log_dir=log_dir,
                 )
-                cmd = pipeline_utils.get_generation_cmd(
-                    input_file=input_file,
-                    input_dir=input_dir,
-                    random_seed=seed,
-                    output_dir=output_dir,
-                    extra_arguments=extra_arguments,
-                    eval_args=eval_args,
-                    chunk_id=chunk_id,
-                    num_chunks=num_chunks,
-                    preprocess_cmd=preprocess_cmd,
-                    postprocess_cmd=postprocess_cmd,
-                    wandb_parameters=wandb_parameters if seed_idx == 0 else None,
-                    script=generation_module,
+
+                # Use unique internal job name for dependency tracking, but same task_name
+                internal_job_name = f"{task_name}-dep{dep_idx}" if dep_idx > 0 else task_name
+
+                # Build dependencies: first job gets _task_dependencies+run_after, rest get previous in chain
+                if dep_idx == 0:
+                    # First job: add run_after if no task_dependencies (matching add_task logic)
+                    job_deps = dependencies.copy() if dependencies else []
+                    if not dependencies and run_after:
+                        run_after_list = run_after if isinstance(run_after, list) else [run_after]
+                        job_deps.extend(run_after_list)
+                    job_deps = job_deps if job_deps else None
+                else:
+                    # Subsequent jobs in chain depend on previous
+                    job_deps = [f"{task_name}-dep{dep_idx - 1}"]
+
+                jobs.append(
+                    {
+                        "name": internal_job_name,
+                        "group": cmd_group,
+                        "dependencies": job_deps,
+                    }
                 )
                 prev_tasks = _task_dependencies
                 for _ in range(dependent_jobs + 1):
@@ -333,13 +497,30 @@ def generate(
         if has_tasks and not _reuse_exp:  # if we are reusing an experiment, the tasks will run from there
             pipeline_utils.run_exp(exp, cluster_config, dry_run=dry_run)
 
-    if _reuse_exp:
-        return all_tasks
-    else:
-        if has_tasks:
-            return exp
+                all_job_names.append(internal_job_name)
+
+    # If no jobs to run, return early
+    if not jobs:
         return None
 
+    # Handle _reuse_exp case
+    if _reuse_exp:
+        # For internal use: return job names for dependency tracking
+        return all_job_names
+
+    # Create and run pipeline (with performance optimizations)
+    pipeline = Pipeline(
+        name=expname,
+        cluster=cluster,
+        jobs=jobs,
+        reuse_code=reuse_code,  # PERFORMANCE: Enable code reuse
+        reuse_code_exp=reuse_code_exp,  # PERFORMANCE: Reuse from specific exp
+        skip_hf_home_check=skip_hf_home_check,  # Validation control
+    )
+
+    result = pipeline.run(cluster_config=cluster_config, dry_run=dry_run)
+    return result
+
 
 if __name__ == "__main__":
     typer.main.get_command_name = lambda name: name
diff --git a/nemo_skills/pipeline/utils/commands.py b/nemo_skills/pipeline/utils/commands.py
new file mode 100644
index 0000000000..cfebb8753b
--- /dev/null
+++ b/nemo_skills/pipeline/utils/commands.py
@@ -0,0 +1,144 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Minimal command builders for declarative pipeline interface.
+
+These are thin wrappers around existing command construction utilities
+in server.py and other modules, designed to work with Command objects.
+"""
+
+from typing import Callable, Dict, Optional, Tuple
+
+from nemo_skills.pipeline.utils.exp import get_sandbox_command
+from nemo_skills.pipeline.utils.server import get_free_port, get_server_command
+
+
+def vllm_server_command(
+    model: str,
+    port: Optional[int] = None,
+    server_type: str = "vllm",
+    gpus: int = 8,
+    nodes: int = 1,
+    args: str = "",
+    entrypoint: Optional[str] = None,
+    **kwargs,
+) -> Tuple[Callable, Dict]:
+    """Build vLLM server command.
+
+    Returns a lambda that will build the command when cluster_config is available.
+
+    Args:
+        model: Model path or name
+        port: Port to use (if None, will use get_free_port)
+        server_type: Type of server (vllm, sglang, trtllm, megatron)
+        gpus: Number of GPUs
+        nodes: Number of nodes
+        args: Additional server arguments
+        entrypoint: Custom entrypoint script
+
+    Returns:
+        Tuple of (lambda, metadata_dict)
+    """
+    if port is None:
+        port = get_free_port(strategy="random")
+
+    metadata = {
+        "port": port,
+        "log_prefix": "server",
+    }
+
+    # Return lambda that will call get_server_command when cluster_config is available
+    def server_cmd_builder(cfg):
+        cmd, num_tasks = get_server_command(
+            server_type=server_type,
+            num_gpus=gpus,
+            num_nodes=nodes,
+            model_path=model,
+            cluster_config=cfg,
+            server_port=port,
+            server_args=args,
+            server_entrypoint=entrypoint,
+        )
+        return (cmd, {"num_tasks": num_tasks})
+
+    return server_cmd_builder, metadata
+
+
+def sandbox_command(port: Optional[int] = None, **kwargs) -> Tuple[Callable, Dict]:
+    """Build sandbox command.
+
+    Returns a lambda that will build the command when cluster_config is available.
+
+    Args:
+        port: Port to use for sandbox
+
+    Returns:
+        Tuple of (lambda, metadata_dict)
+    """
+    if port is None:
+        port = get_free_port(strategy="random")
+
+    metadata = {
+        "port": port,
+        "log_prefix": "sandbox",
+        "mounts": [],  # Sandbox doesn't mount anything
+        "environment": {
+            "LISTEN_PORT": str(port),
+            "NGINX_PORT": str(port),
+        },
+    }
+
+    # Return lambda that will be evaluated when cluster_config is available
+    def sandbox_cmd_with_env(cfg):
+        cmd = get_sandbox_command(cfg)
+        # Build PYTHONPATH from cluster config
+        pythonpath_env = {}
+        for env_var in cfg.get("env_vars", []):
+            if "PYTHONPATH" in env_var:
+                pythonpath = env_var[11:] if env_var.startswith("PYTHONPATH=") else env_var
+                pythonpath_env["PYTHONPATH"] = pythonpath + ":/app"
+                break
+        # Return command + additional metadata
+        return (cmd, {"environment": pythonpath_env})
+
+    return sandbox_cmd_with_env, metadata
+
+
+def wrap_command(command: str, working_dir: str = "/nemo_run/code", env_vars: Optional[Dict[str, str]] = None) -> str:
+    """Wrap command with working directory and environment variable setup.
+
+    Args:
+        command: The command to wrap
+        working_dir: Working directory to cd into
+        env_vars: Environment variables to export
+
+    Returns:
+        Wrapped command string
+    """
+    parts = []
+
+    # Export environment variables
+    if env_vars:
+        for key, value in env_vars.items():
+            parts.append(f"export {key}={value}")
+
+    # Change to working directory
+    if working_dir:
+        parts.append(f"cd {working_dir}")
+
+    # Add the actual command
+    parts.append(command)
+
+    return " && ".join(parts)
diff --git a/nemo_skills/pipeline/utils/declarative.py b/nemo_skills/pipeline/utils/declarative.py
new file mode 100644
index 0000000000..af2bdae4e5
--- /dev/null
+++ b/nemo_skills/pipeline/utils/declarative.py
@@ -0,0 +1,738 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Simplified declarative pipeline system using only Command for all task types.
+
+Basic Example (Single job with multiple commands):
+    from nemo_skills.pipeline.utils.commands import vllm_server_command, sandbox_command
+    from nemo_skills.pipeline.utils.declarative import Command, CommandGroup, HardwareConfig, Pipeline
+
+    # Commands that run together in one SLURM job
+    server = Command(command=vllm_server_command(model="Qwen/Qwen3-8B"), gpus=8, name="server")
+    sandbox = Command(command=sandbox_command(), name="sandbox")
+    client = Command(
+        command=lambda: f"curl {server.hostname_ref()}:{server.meta_ref('port')}/health",
+        name="client"
+    )
+
+    # Group them together
+    inference_group = CommandGroup(
+        commands=[server, sandbox, client],
+        hardware=HardwareConfig(partition="batch"),
+        name="inference"
+    )
+
+    # Create and run pipeline
+    pipeline = Pipeline(
+        name="my_inference",
+        cluster="local",
+        groups=[inference_group]  # Legacy mode: single job
+    )
+    pipeline.run()
+
+Advanced Example (Multiple jobs with dependencies and heterogeneous components):
+    log_dir = "/experiments/full_pipeline/logs"
+    # Job 1: Preprocessing
+    preprocess = Command(
+        command="python preprocess.py --input data.jsonl --output processed.jsonl",
+        gpus=0,
+        name="preprocess"
+    )
+    prep_group = CommandGroup(
+        commands=[preprocess],
+        hardware=HardwareConfig(partition="cpu"),
+        name="prep",
+        log_dir=log_dir
+    )
+
+    # Job 2: Two different model servers (HETEROGENEOUS SLURM job with 2 het components)
+    server_8b = Command(command=vllm_server_command(model="Qwen/Qwen3-8B"), gpus=8, name="server_8b")
+    sandbox_8b = Command(command=sandbox_command(), name="sandbox_8b")
+    eval_8b = Command(command="python eval.py --model 8b", gpus=1, name="eval_8b")
+
+    server_32b = Command(command=vllm_server_command(model="Qwen/Qwen3-32B"), gpus=8, name="server_32b")
+    sandbox_32b = Command(command=sandbox_command(), name="sandbox_32b")
+    eval_32b = Command(command="python eval.py --model 32b", gpus=1, name="eval_32b")
+
+    group_8b = CommandGroup(commands=[server_8b, sandbox_8b, eval_8b], name="eval_8b", log_dir=log_dir)
+    group_32b = CommandGroup(commands=[server_32b, sandbox_32b, eval_32b], name="eval_32b", log_dir=log_dir)
+
+    # Job 3: Report generation (depends on both evaluations)
+    report = Command(
+        command="python generate_report.py --output report.txt",
+        gpus=0,
+        name="report"
+    )
+    report_group = CommandGroup(commands=[report], name="report", log_dir=log_dir)
+
+    # Create pipeline with dependency graph
+    pipeline = Pipeline(
+        name="full_pipeline",
+        cluster="slurm",
+        jobs=[
+            {"name": "prep", "group": prep_group},
+
+            # Multi-group heterogeneous job (both eval groups in ONE SLURM job with 2 het components)
+            {"name": "evals", "groups": [group_8b, group_32b], "dependencies": ["prep"]},
+
+            # Report depends on the multi-group eval job
+            {"name": "report", "group": report_group, "dependencies": ["evals"]},
+    pipeline.run()
+"""
+
+import inspect
+import logging
+import shlex
+from contextlib import nullcontext
+from dataclasses import dataclass, field
+from typing import Callable, Dict, List, Optional, Tuple, Union
+
+import nemo_run as run
+
+from nemo_skills.pipeline.utils import (
+    get_cluster_config,
+    get_env_variables,
+    get_executor,
+    get_exp,
+    get_exp_handles,
+    get_tunnel,
+    run_exp,
+    temporary_env_update,
+)
+from nemo_skills.pipeline.utils.commands import wrap_command
+from nemo_skills.pipeline.utils.exp import REUSE_CODE_EXP, get_packaging_job_key, install_packages_wrap, tunnel_hash
+from nemo_skills.pipeline.utils.mounts import is_mounted_filepath
+from nemo_skills.pipeline.utils.packager import get_registered_external_repo
+from nemo_skills.utils import get_logger_name
+
+LOG = logging.getLogger(get_logger_name(__file__))
+
+
+@dataclass
+class Command:
+    """Declarative command for running tasks in containers.
+
+    The command can be either:
+    - A string: evaluated immediately
+    - A callable (lambda): evaluated lazily when the task is prepared
+    - A tuple (command, metadata): command with metadata like port
+    - A callable returning (command, metadata): lazy evaluation with metadata
+
+    Using a lambda allows references to work correctly in heterogeneous jobs:
+        Command(command=lambda: f"curl {server.hostname_ref()}:5000")
+
+    Metadata from command builders (like port) can be referenced:
+        server = Command(command=vllm_server_command(...))
+        client = Command(command=lambda: f"curl {server.hostname_ref()}:{server.meta_ref('port')}")
+    """
+
+    command: Union[
+        str,
+        Callable[[], str],  # Lambda for cross-group refs
+        Callable[[Dict], str],  # Lambda needing cluster_config (e.g., sandbox)
+        Tuple[Optional[str], Dict],  # Command builder result
+        Callable[[], Tuple[Optional[str], Dict]],  # Lambda returning command builder result
+        None,
+    ]
+    container: str = "nemo-skills"
+    gpus: Optional[int] = None
+    nodes: int = 1
+    name: Optional[str] = None
+    working_dir: str = "/nemo_run/code"
+    env_vars: Dict[str, str] = field(default_factory=dict)
+    installation_command: Optional[str] = None
+    port: Optional[int] = None  # Can be set from metadata
+    metadata: Dict[str, any] = field(default_factory=dict)  # Stores metadata from command builders
+    het_group_index: Optional[int] = None  # Set automatically by Pipeline
+
+    def __post_init__(self):
+        # Initialize component (merged from old Component class)
+        if self.name is None:
+            self.name = "command"
+
+        # Extract metadata if command is a tuple
+        self._extract_metadata()
+
+        # Wrap plain strings with environment setup
+        if isinstance(self.command, str) and (self.env_vars or self.working_dir):
+            self.command = wrap_command(self.command, self.working_dir, self.env_vars)
+
+    def _extract_metadata(self):
+        """Extract metadata from command if it's a tuple."""
+        if not callable(self.command):
+            if isinstance(self.command, tuple):
+                cmd, self.metadata = self.command
+                self.command = cmd  # Can be None for sandbox
+
+    def hostname_ref(self) -> str:
+        """Get hostname reference for hetjob cross-component communication."""
+        if self.het_group_index is None:
+            return "127.0.0.1"  # Local fallback
+        # For heterogeneous SLURM jobs, resolve nodelist to actual hostname
+        return f"$(scontrol show hostnames $SLURM_JOB_NODELIST_HET_GROUP_{self.het_group_index} | head -n1)"
+
+    def meta_ref(self, key: str) -> str:
+        """Get metadata value (like port). Fails if key not found."""
+        if key not in self.metadata:
+            raise KeyError(
+                f"Metadata key '{key}' not found in Command '{self.name}'. "
+                f"Available keys: {list(self.metadata.keys())}"
+            )
+        return str(self.metadata[key])
+
+    def prepare_for_execution(self, cluster_config: Dict) -> Tuple[str, Dict]:
+        """Prepare command for execution.
+
+        This method:
+        1. Evaluates callables (resolves cross-group references and cluster_config-dependent commands)
+        2. Adds cross-group environment variables
+        3. Wraps with installation_command if provided
+
+        Returns:
+            Tuple of (final_command, execution_config)
+        """
+        # 1. Evaluate if callable (resolves cross-group references or cluster_config needs)
+        if callable(self.command):
+            # Check if lambda needs cluster_config (for sandbox)
+
+            sig = inspect.signature(self.command)
+            if len(sig.parameters) > 0:
+                # Lambda expects cluster_config
+                result = self.command(cluster_config)
+            else:
+                # Regular lambda (cross-group refs)
+                result = self.command()
+
+            if isinstance(result, tuple):
+                final_command, runtime_metadata = result
+                # Deep merge metadata, especially environment dict
+                for key, value in runtime_metadata.items():
+                    if key == "environment" and key in self.metadata:
+                        # Merge environment dicts instead of replacing
+                        self.metadata[key].update(value)
+                    else:
+                        self.metadata[key] = value
+            else:
+                final_command = result
+        else:
+            final_command = self.command
+
+        # 2. Wrap with installation_command if provided (matches add_task behavior)
+        if self.installation_command:
+            final_command = install_packages_wrap(final_command, self.installation_command)
+
+        # 3. Build execution config from metadata
+        execution_config = {
+            "num_tasks": self.metadata.get("num_tasks", 1),
+            "num_gpus": self.metadata.get("gpus", self.gpus or 0),
+            "num_nodes": self.metadata.get("nodes", self.nodes),
+            "environment": self.metadata.get("environment", {}),
+            "log_prefix": self.metadata.get("log_prefix", "main"),
+            "mounts": self.metadata.get("mounts"),
+            "container": self.metadata.get("container", self.container),  # Use container from metadata if available
+        }
+
+        return final_command, execution_config
+
+    def get_name(self) -> str:
+        return self.name
+
+
+@dataclass
+class HardwareConfig:
+    """Hardware configuration for a group of tasks."""
+
+    partition: Optional[str] = None
+    time_min: Optional[str] = None
+    exclusive: bool = False
+    num_gpus: Optional[int] = None
+    num_nodes: Optional[int] = None
+
+
+class CommandGroup:
+    """Command group where commands run together with shared resource requirements."""
+
+    def __init__(
+        self,
+        commands: List[Command],
+        hardware: Optional[HardwareConfig] = None,
+        name: Optional[str] = None,
+        log_dir: Optional[str] = None,
+    ):
+        self.components = commands  # Keep as self.components internally for backwards compatibility
+        self.hardware = hardware or HardwareConfig()
+        self.name = name
+        self.log_dir = log_dir
+
+
+class Pipeline:
+    """Top-level pipeline that composes command groups with dependency support.
+
+    Supports two modes:
+    1. Groups: groups=[cmdgroup1, cmdgroup2] - combines all into one job
+    2. Jobs: jobs=[{...}, {...}] - supports dependencies and multi-group jobs
+    """
+
+    def __init__(
+        self,
+        name: str,
+        cluster: Optional[str] = None,
+        groups: Optional[List[CommandGroup]] = None,  # Legacy mode
+        jobs: Optional[List[Dict]] = None,  # New mode with dependencies
+        reuse_code: bool = True,
+        reuse_code_exp: Optional[str] = None,
+        skip_hf_home_check: bool = False,
+        with_ray: bool = False,
+        run_after: Optional[Union[str, List[str]]] = None,  # Pipeline-level dependency on other experiments
+    ):
+        self.name = name
+        self.cluster = cluster
+        self.reuse_code = reuse_code
+        self.reuse_code_exp = reuse_code_exp
+        self.skip_hf_home_check = skip_hf_home_check
+        self.with_ray = with_ray
+        self.run_after = run_after
+        self._cluster_config: Optional[Dict] = None
+
+        if groups is not None and jobs is not None:
+            raise ValueError("Cannot specify both 'groups' and 'jobs'.")
+
+        if groups is not None:
+            self.jobs = [{"group": g} for g in groups]
+            self._legacy_mode = True
+        elif jobs is not None:
+            self.jobs = jobs
+            self._legacy_mode = False
+        else:
+            raise ValueError("Must specify either 'groups' or 'jobs'")
+
+        self._assign_het_group_indices()
+
+    def _assign_het_group_indices(self):
+        """Assign het_group_index to all components for cross-group references."""
+        # Collect all groups from jobs
+        all_groups = []
+        for job_spec in self.jobs:
+            if "group" in job_spec:
+                all_groups.append(job_spec["group"])
+            elif "groups" in job_spec:
+                all_groups.extend(job_spec["groups"])
+
+        # Assign indices
+        for group_idx, group in enumerate(all_groups):
+            for component in group.components:
+                component.het_group_index = group_idx
+                LOG.debug(f"Assigned het_group_index {group_idx} to {component.get_name()}")
+
+    def _get_cluster_config(self) -> Dict:
+        """Get cluster configuration, loading it if necessary."""
+        if self._cluster_config is None:
+            if self.cluster is None:
+                raise ValueError("Must specify cluster either in Pipeline() or run() method")
+
+            self._cluster_config = get_cluster_config(self.cluster)
+        return self._cluster_config
+
+    def run(
+        self,
+        cluster_config: Optional[Dict] = None,
+        cluster: Optional[str] = None,
+        dry_run: bool = False,
+        log_dir: Optional[str] = None,
+    ):
+        """Execute the pipeline by calling NeMo-Run directly.
+
+        Args:
+            cluster_config: Cluster configuration dict (optional, can use cluster name instead)
+            cluster: Cluster name to load config from (optional if cluster_config provided)
+            dry_run: If True, validate without executing
+            log_dir: Default log directory for groups that don't specify one (optional)
+        """
+        if not self.jobs:
+            LOG.info("No jobs to execute")
+            return None
+
+        # Determine cluster config to use
+        if cluster_config is not None:
+            final_cluster_config = cluster_config
+        elif cluster is not None:
+            final_cluster_config = get_cluster_config(cluster)
+        else:
+            final_cluster_config = self._get_cluster_config()
+
+        # Validate HF_HOME (matching add_task validation)
+        if final_cluster_config["executor"] != "none" and not self.skip_hf_home_check:
+            env_vars = get_env_variables(final_cluster_config)
+            if "HF_HOME" not in env_vars:
+                raise RuntimeError(
+                    "Invalid cluster_config: HF_HOME is missing from env_vars while skip_hf_home_check=False.\n"
+                    f"Current env_vars: {final_cluster_config.get('env_vars', [])}\n"
+                    "Please add a new variable: HF_HOME=/mounted/path/to/your/hf_home"
+                )
+            if not is_mounted_filepath(final_cluster_config, env_vars["HF_HOME"]):
+                raise RuntimeError(f"Invalid cluster_config: HF_HOME={env_vars['HF_HOME']} is not a mounted path.")
+
+        # Track job name -> task handle for dependency resolution
+        job_name_to_handle = {}
+
+        with get_exp(self.name, final_cluster_config) as exp:
+            # Process each job in order
+            for job_spec in self.jobs:
+                job_name = job_spec.get("name", "unnamed")
+
+                # Resolve dependencies to task handles (matching add_task lines 390-404)
+                run_after_deps = []
+
+                # Handle dependencies from job spec
+                job_dependencies = job_spec.get("dependencies", [])
+                # Handle explicit None (when dependencies key exists but value is None)
+                if job_dependencies is None:
+                    job_dependencies = []
+
+                # If no dependencies and Pipeline has run_after, apply it (matching add_task behavior)
+                if not job_dependencies and self.run_after:
+                    run_after_list = self.run_after if isinstance(self.run_after, list) else [self.run_after]
+                    job_dependencies = run_after_list
+
+                for dep in job_dependencies:
+                    if isinstance(dep, str):
+                        # String dependency - look up by name
+                        if dep in job_name_to_handle:
+                            # Internal pipeline dependency - add the handle
+                            run_after_deps.append(job_name_to_handle[dep])
+                            LOG.info(f"Job '{job_name}' depends on '{dep}' (handle: {job_name_to_handle[dep]})")
+                        else:
+                            # External experiment name - will be resolved by get_exp_handles below
+                            if final_cluster_config["executor"] == "slurm":
+                                exp_handles = get_exp_handles(dep)
+                                if len(exp_handles) == 0:
+                                    LOG.warning(
+                                        f"No pending or running tasks found for experiment {dep}, cannot set dependencies."
+                                    )
+                                run_after_deps.extend(exp_handles)
+                                LOG.info(
+                                    f"Job '{job_name}' depends on external experiment '{dep}' ({len(exp_handles)} tasks)"
+                                )
+
+                # Convert empty list to None (matching add_task line 402)
+                if len(run_after_deps) == 0:
+                    run_after_deps = None
+
+                # Check if this is a multi-group job or single group
+                if "groups" in job_spec:
+                    # If only one group in list, use single group job for efficiency
+                    if len(job_spec["groups"]) == 1:
+                        task_handle = self._add_single_group_job(
+                            exp,
+                            job_spec["groups"][0],
+                            final_cluster_config,
+                            default_log_dir=log_dir,
+                            run_after=run_after_deps if run_after_deps else None,
+                        )
+                    else:
+                        # True multi-group: combine multiple groups into one heterogeneous SLURM job
+                        task_handle = self._add_multi_group_job(
+                            exp,
+                            job_spec["groups"],
+                            final_cluster_config,
+                            default_log_dir=log_dir,
+                            run_after=run_after_deps if run_after_deps else None,
+                        )
+                elif "group" in job_spec:
+                    # Single group job
+                    task_handle = self._add_single_group_job(
+                        exp,
+                        job_spec["group"],
+                        final_cluster_config,
+                        default_log_dir=log_dir,
+                        run_after=run_after_deps if run_after_deps else None,
+                    )
+                else:
+                    raise ValueError(f"Job spec must have either 'group' or 'groups': {job_spec}")
+
+                # Track task handle for this job
+                job_name_to_handle[job_name] = task_handle
+                LOG.info(f"Added job '{job_name}' with task_handle={task_handle}")
+
+            if not dry_run:
+                run_exp(exp, final_cluster_config)
+
+                # Cache experiment for code reuse in future runs (matching add_task behavior)
+                if final_cluster_config["executor"] != "none":
+                    tunnel = get_tunnel(final_cluster_config)
+                    cur_tunnel_hash = tunnel_hash(tunnel)
+                    if cur_tunnel_hash not in REUSE_CODE_EXP:
+                        REUSE_CODE_EXP[cur_tunnel_hash] = exp
+                        LOG.info("Cached experiment for future code reuse")
+
+            return exp
+
+    def _prepare_command(self, command, cluster_config: Dict) -> Tuple[str, Dict]:
+        """Prepare command and handle mpirun wrapping."""
+        final_cmd, exec_config = command.prepare_for_execution(cluster_config)
+
+        # Handle mpirun wrapping for non-SLURM executors
+        num_tasks = exec_config["num_tasks"]
+        if cluster_config["executor"] != "slurm" and num_tasks > 1:
+            final_cmd = f"mpirun --allow-run-as-root -np {num_tasks} bash -c {shlex.quote(final_cmd)}"
+
+        return final_cmd, exec_config
+
+    def _resolve_container(self, exec_config: Dict, command, cluster_config: Dict) -> str:
+        """Resolve container name to image path."""
+        container_name = exec_config.get("container", command.container)
+        if container_name in cluster_config.get("containers", {}):
+            return cluster_config["containers"][container_name]
+        return container_name
+
+    def _create_executor(
+        self,
+        command,
+        exec_config: Dict,
+        container_image: str,
+        cluster_config: Dict,
+        log_dir: str,
+        hardware: HardwareConfig,
+        heterogeneous: bool,
+        het_group: int,
+        total_het_groups: int,
+        overlap: bool,
+    ):
+        """Create executor with optional environment update."""
+        env_context = (
+            temporary_env_update(cluster_config, exec_config["environment"])
+            if exec_config.get("environment")
+            else nullcontext()
+        )
+
+        with env_context:
+            return get_executor(
+                cluster_config=cluster_config,
+                container=container_image,
+                num_nodes=exec_config["num_nodes"],
+                tasks_per_node=exec_config["num_tasks"],
+                gpus_per_node=exec_config["num_gpus"],
+                job_name=command.name,
+                log_dir=log_dir,
+                log_prefix=exec_config["log_prefix"],
+                partition=hardware.partition if hardware else None,
+                time_min=hardware.time_min if hardware else None,
+                heterogeneous=heterogeneous,
+                het_group=het_group,
+                total_het_groups=total_het_groups,
+                overlap=overlap,
+                mounts=exec_config.get("mounts"),
+                with_ray=self.with_ray,
+                slurm_kwargs={"exclusive": hardware.exclusive} if (hardware and hardware.exclusive) else None,
+            )
+
+    def _plan_and_add_job(
+        self,
+        exp,
+        groups: List[CommandGroup],
+        cluster_config: Dict,
+        default_log_dir: Optional[str] = None,
+        run_after: Optional[List] = None,
+        heterogeneous: bool = False,
+    ) -> str:
+        """Plan commands/executors for one or more groups and add to experiment.
+
+        This encapsulates shared logic between single-group and multi-group jobs. Behavior
+        differences are controlled by the 'heterogeneous' flag and the provided 'groups'.
+        """
+
+        # Dependencies are only passed through for SLURM
+        dependencies = run_after if (run_after and cluster_config["executor"] == "slurm") else None
+
+        # Resolve log directory (use first group's log_dir if present)
+        log_dir = groups[0].log_dir or default_log_dir
+        if log_dir is None:
+            raise ValueError(f"CommandGroup '{groups[0].name}' must have log_dir set, or provide it to pipeline.run()")
+
+        commands: List[str] = []
+        executors: List = []
+        het_group_indices: List[int] = []
+
+        # In heterogeneous jobs, collect environment from all commands for cross-component refs
+        shared_env_vars: Dict[str, str] = {}
+        if heterogeneous:
+            for het_idx, group in enumerate(groups):
+                for command in group.components:
+                    _, exec_config_probe = command.prepare_for_execution(cluster_config)
+                    shared_env_vars.update(exec_config_probe.get("environment", {}))
+
+        # PERFORMANCE: Track first packager to share across executors (single-group only)
+        shared_packager = None
+
+        # Build commands and executors
+        for het_idx, group in enumerate(groups):
+            has_multiple_components = len(group.components) > 1
+            total_het_groups = (
+                len(groups) if heterogeneous else (len(group.components) if has_multiple_components else 1)
+            )
+
+            # For single-group jobs with multiple components, allow job-level GPU override for sbatch allocation
+            job_level_gpus = (
+                group.hardware.num_gpus if (not heterogeneous and has_multiple_components and group.hardware) else None
+            )
+
+            for comp_idx, command in enumerate(group.components):
+                final_cmd, exec_config = self._prepare_command(command, cluster_config)
+                commands.append(final_cmd)
+
+                # Adjust GPU allocation (first component gets job-level GPUs for sbatch) for single-group jobs
+                exec_config["num_gpus"] = exec_config["num_gpus"] or 0
+                if (not heterogeneous) and (comp_idx == 0) and (job_level_gpus is not None):
+                    exec_config["num_gpus"] = job_level_gpus
+
+                # Merge shared environment for heterogeneous jobs
+                if heterogeneous and shared_env_vars:
+                    exec_config["environment"].update(shared_env_vars)
+
+                # Resolve container and create executor
+                container_image = self._resolve_container(exec_config, command, cluster_config)
+                executor = self._create_executor(
+                    command,
+                    exec_config,
+                    container_image,
+                    cluster_config,
+                    log_dir,
+                    group.hardware,
+                    heterogeneous,
+                    het_idx if heterogeneous else comp_idx,
+                    total_het_groups,
+                    (len(group.components) > 1),
+                )
+
+                # Share packager across executors for single-group jobs
+                if not heterogeneous:
+                    if comp_idx == 0 and het_idx == 0:
+                        shared_packager = executor.packager
+                    else:
+                        executor.packager = shared_packager
+
+                executors.append(executor)
+                if heterogeneous:
+                    het_group_indices.append(het_idx)
+
+        # For heterogeneous jobs, set het_group_indices on the first executor
+        if heterogeneous and executors:
+            executors[0].het_group_indices = het_group_indices
+
+        # PERFORMANCE: Handle code reuse from previous experiments (single-group only)
+        if (not heterogeneous) and cluster_config["executor"] != "none":
+            tunnel = get_tunnel(cluster_config)
+            if self.reuse_code:
+                reuse_exp = self.reuse_code_exp or REUSE_CODE_EXP.get(tunnel_hash(tunnel))
+                if reuse_exp is not None:
+                    if isinstance(reuse_exp, str):
+                        try:
+                            reuse_exp = run.Experiment.from_id(reuse_exp)
+                        except Exception:
+                            try:
+                                reuse_exp = run.Experiment.from_title(reuse_exp)
+                            except Exception:
+                                LOG.warning(f"Failed to load experiment {reuse_exp} for code reuse")
+                                reuse_exp = None
+                    if reuse_exp is not None:
+                        LOG.info(f"Trying to reuse code from experiment {reuse_exp._title}")
+                        reuse_key = get_packaging_job_key(reuse_exp._id, "nemo-run")
+                        if reuse_key in reuse_exp.tunnels[tunnel.key].packaging_jobs:
+                            reuse_dir = reuse_exp.tunnels[tunnel.key].packaging_jobs[reuse_key].dst_path
+                            for executor in executors:
+                                executor.packager.symlink_from_remote_dir = reuse_dir
+                            LOG.info(f"Successfully reused code from {reuse_key}")
+                        else:
+                            LOG.warning(f"Relevant packaging job not found for experiment {reuse_exp._title}")
+            else:
+                # If reuse_code=False, clear cache (matching original behavior)
+                REUSE_CODE_EXP.pop(tunnel_hash(tunnel), None)
+
+        # Handle executor="none" path replacements (single-group only)
+        if (not heterogeneous) and cluster_config["executor"] == "none":
+            for idx in range(len(commands)):
+                commands[idx] = commands[idx].replace(
+                    "/nemo_run/code/nemo_skills", str(get_registered_external_repo("nemo_skills").path)
+                )
+                commands[idx] = commands[idx].replace("/nemo_run/code", "./")
+
+        # Ray metadata handling
+        if self.with_ray and cluster_config["executor"] == "slurm":
+            metadata = {"use_with_ray_cluster": True}
+        else:
+            metadata = None
+
+        # Add to experiment and return task ID
+        if (not heterogeneous) and len(commands) == 1:
+            task_id = exp.add(
+                run.Script(inline=commands[0], metadata=metadata),
+                executor=executors[0],
+                name="nemo-run",
+                dependencies=dependencies,
+            )
+        else:
+            task_id = exp.add(
+                [
+                    run.Script(inline=cmd, metadata=(metadata if idx == 0 else None))
+                    for idx, cmd in enumerate(commands)
+                ],
+                executor=executors,
+                name="nemo-run",
+                dependencies=dependencies,
+            )
+
+        return task_id
+
+    def _add_single_group_job(
+        self,
+        exp,
+        group: CommandGroup,
+        cluster_config: Dict,
+        default_log_dir: Optional[str] = None,
+        run_after: Optional[List] = None,
+    ) -> str:
+        """Add a single CommandGroup as one job and return its task handle."""
+
+        return self._plan_and_add_job(
+            exp=exp,
+            groups=[group],
+            cluster_config=cluster_config,
+            default_log_dir=default_log_dir,
+            run_after=run_after,
+            heterogeneous=False,
+        )
+
+    def _add_multi_group_job(
+        self,
+        exp,
+        groups: List[CommandGroup],
+        cluster_config: Dict,
+        default_log_dir: Optional[str] = None,
+        run_after: Optional[List] = None,
+    ) -> str:
+        """Add multiple CommandGroups as a single heterogeneous SLURM job and return task handle."""
+
+        return self._plan_and_add_job(
+            exp=exp,
+            groups=groups,
+            cluster_config=cluster_config,
+            default_log_dir=default_log_dir,
+            run_after=run_after,
+            heterogeneous=True,
+        )
+
+
+# Backwards compatibility alias
+Pipeline = Pipeline

From 47308a0429a3628e9c99f5a7b9d06ea915b1c88e Mon Sep 17 00:00:00 2001
From: George Armstrong <georgea@nvidia.com>
Date: Fri, 3 Oct 2025 12:39:30 -0700
Subject: [PATCH 02/29] MAINT fix merge

Signed-off-by: George Armstrong <georgea@nvidia.com>
---
 nemo_skills/pipeline/generate.py          | 41 ++++-------------------
 nemo_skills/pipeline/utils/commands.py    |  5 +--
 nemo_skills/pipeline/utils/declarative.py |  2 ++
 3 files changed, 11 insertions(+), 37 deletions(-)

diff --git a/nemo_skills/pipeline/generate.py b/nemo_skills/pipeline/generate.py
index 626fe62fb1..e18d7e2543 100644
--- a/nemo_skills/pipeline/generate.py
+++ b/nemo_skills/pipeline/generate.py
@@ -46,8 +46,10 @@ def _create_commandgroup_from_config(
     installation_command: Optional[str],
     get_server_command_fn: Callable,
     partition: Optional[str],
+    qos: Optional[str],
     time_min: Optional[str],
     exclusive: bool,
+    keep_mounts_for_sandbox: bool,
     task_name: str,
     log_dir: str,
 ) -> CommandGroup:
@@ -109,7 +111,7 @@ def make_server_cmd(cfg):
         # Wrap to ensure log_prefix is set in returned metadata
         def make_sandbox_cmd(cfg):
             # sandbox_command returns (callable, metadata), so we need to call the callable
-            cmd_builder, initial_metadata = sandbox_command(port=sandbox_port)
+            cmd_builder, initial_metadata = sandbox_command(port=sandbox_port, keep_mounts=keep_mounts_for_sandbox)
             # Call the builder to get the actual command string
             cmd_string, runtime_metadata = cmd_builder(cfg)
             # Merge metadata
@@ -134,6 +136,7 @@ def make_sandbox_cmd(cfg):
         commands=components,
         hardware=HardwareConfig(
             partition=partition,
+            qos=qos,
             time_min=time_min,
             exclusive=exclusive,
             num_gpus=max_gpus,
@@ -434,8 +437,10 @@ def generate(
                     installation_command=installation_command,
                     get_server_command_fn=generation_task.get_server_command_fn(),
                     partition=partition,
+                    qos=qos,
                     time_min=time_min,
                     exclusive=exclusive,
+                    keep_mounts_for_sandbox=keep_mounts_for_sandbox,
                     task_name=task_name,
                     log_dir=log_dir,
                 )
@@ -462,40 +467,6 @@ def generate(
                         "dependencies": job_deps,
                     }
                 )
-                prev_tasks = _task_dependencies
-                for _ in range(dependent_jobs + 1):
-                    task_name = f"{expname}-rs{seed}" if seed is not None else expname
-                    if chunk_id is not None:
-                        task_name += f"-chunk{chunk_id}"
-                    new_task = pipeline_utils.add_task(
-                        exp,
-                        cmd=pipeline_utils.wrap_python_path(cmd=cmd),
-                        task_name=task_name,
-                        log_dir=log_dir,
-                        container=cluster_config["containers"]["nemo-skills"],
-                        cluster_config=cluster_config,
-                        partition=partition,
-                        qos=qos,
-                        time_min=time_min,
-                        server_config=server_config,
-                        with_sandbox=with_sandbox,
-                        keep_mounts_for_sandbox=keep_mounts_for_sandbox,
-                        sandbox_port=None if get_random_port else 6000,
-                        run_after=run_after,
-                        reuse_code=reuse_code,
-                        reuse_code_exp=reuse_code_exp,
-                        task_dependencies=(
-                            prev_tasks if cluster_config["executor"] == "slurm" else all_tasks + _task_dependencies
-                        ),
-                        get_server_command=generation_task.get_server_command_fn(),
-                        slurm_kwargs={"exclusive": exclusive} if exclusive else None,
-                        installation_command=installation_command,
-                        skip_hf_home_check=skip_hf_home_check,
-                    )
-                    prev_tasks = [new_task]
-                    all_tasks.append(new_task)
-        if has_tasks and not _reuse_exp:  # if we are reusing an experiment, the tasks will run from there
-            pipeline_utils.run_exp(exp, cluster_config, dry_run=dry_run)
 
                 all_job_names.append(internal_job_name)
 
diff --git a/nemo_skills/pipeline/utils/commands.py b/nemo_skills/pipeline/utils/commands.py
index cfebb8753b..f2402c590b 100644
--- a/nemo_skills/pipeline/utils/commands.py
+++ b/nemo_skills/pipeline/utils/commands.py
@@ -76,13 +76,14 @@ def server_cmd_builder(cfg):
     return server_cmd_builder, metadata
 
 
-def sandbox_command(port: Optional[int] = None, **kwargs) -> Tuple[Callable, Dict]:
+def sandbox_command(port: Optional[int] = None, keep_mounts: bool = False, **kwargs) -> Tuple[Callable, Dict]:
     """Build sandbox command.
 
     Returns a lambda that will build the command when cluster_config is available.
 
     Args:
         port: Port to use for sandbox
+        keep_mounts: If True, keep mounts from cluster config. If False, use empty mounts for safety.
 
     Returns:
         Tuple of (lambda, metadata_dict)
@@ -93,7 +94,7 @@ def sandbox_command(port: Optional[int] = None, **kwargs) -> Tuple[Callable, Dic
     metadata = {
         "port": port,
         "log_prefix": "sandbox",
-        "mounts": [],  # Sandbox doesn't mount anything
+        "mounts": None if keep_mounts else [],  # None means use cluster config mounts, [] means no mounts
         "environment": {
             "LISTEN_PORT": str(port),
             "NGINX_PORT": str(port),
diff --git a/nemo_skills/pipeline/utils/declarative.py b/nemo_skills/pipeline/utils/declarative.py
index af2bdae4e5..2ef4f1e03f 100644
--- a/nemo_skills/pipeline/utils/declarative.py
+++ b/nemo_skills/pipeline/utils/declarative.py
@@ -255,6 +255,7 @@ class HardwareConfig:
     """Hardware configuration for a group of tasks."""
 
     partition: Optional[str] = None
+    qos: Optional[str] = None
     time_min: Optional[str] = None
     exclusive: bool = False
     num_gpus: Optional[int] = None
@@ -528,6 +529,7 @@ def _create_executor(
                 log_dir=log_dir,
                 log_prefix=exec_config["log_prefix"],
                 partition=hardware.partition if hardware else None,
+                qos=hardware.qos if hardware else None,
                 time_min=hardware.time_min if hardware else None,
                 heterogeneous=heterogeneous,
                 het_group=het_group,

From 0e3688d15d74e8016665d116d6a6a90ada9fe89c Mon Sep 17 00:00:00 2001
From: George Armstrong <georgea@nvidia.com>
Date: Fri, 3 Oct 2025 12:53:40 -0700
Subject: [PATCH 03/29] MAINT fix hetgroup indexing

Signed-off-by: George Armstrong <georgea@nvidia.com>
---
 nemo_skills/pipeline/utils/declarative.py | 27 ++++++++---------------
 1 file changed, 9 insertions(+), 18 deletions(-)

diff --git a/nemo_skills/pipeline/utils/declarative.py b/nemo_skills/pipeline/utils/declarative.py
index 2ef4f1e03f..415f542457 100644
--- a/nemo_skills/pipeline/utils/declarative.py
+++ b/nemo_skills/pipeline/utils/declarative.py
@@ -155,7 +155,7 @@ class Command:
     installation_command: Optional[str] = None
     port: Optional[int] = None  # Can be set from metadata
     metadata: Dict[str, any] = field(default_factory=dict)  # Stores metadata from command builders
-    het_group_index: Optional[int] = None  # Set automatically by Pipeline
+    het_group_index: Optional[int] = None  # Set per-job by Pipeline (not global)
 
     def __post_init__(self):
         # Initialize component (merged from old Component class)
@@ -319,23 +319,7 @@ def __init__(
         else:
             raise ValueError("Must specify either 'groups' or 'jobs'")
 
-        self._assign_het_group_indices()
-
-    def _assign_het_group_indices(self):
-        """Assign het_group_index to all components for cross-group references."""
-        # Collect all groups from jobs
-        all_groups = []
-        for job_spec in self.jobs:
-            if "group" in job_spec:
-                all_groups.append(job_spec["group"])
-            elif "groups" in job_spec:
-                all_groups.extend(job_spec["groups"])
-
-        # Assign indices
-        for group_idx, group in enumerate(all_groups):
-            for component in group.components:
-                component.het_group_index = group_idx
-                LOG.debug(f"Assigned het_group_index {group_idx} to {component.get_name()}")
+        # Note: het_group_indices are assigned per-job in _plan_and_add_job, not globally
 
     def _get_cluster_config(self) -> Dict:
         """Get cluster configuration, loading it if necessary."""
@@ -591,6 +575,13 @@ def _plan_and_add_job(
             )
 
             for comp_idx, command in enumerate(group.components):
+                # Assign het_group_index ONLY for heterogeneous jobs (per-job, not global)
+                # Non-heterogeneous jobs use localhost, so het_group_index should remain None
+                if heterogeneous:
+                    command.het_group_index = het_idx
+                else:
+                    command.het_group_index = None
+
                 final_cmd, exec_config = self._prepare_command(command, cluster_config)
                 commands.append(final_cmd)
 

From 849253f282fb85af72c7ebd6ac763cce3237ae52 Mon Sep 17 00:00:00 2001
From: George Armstrong <georgea@nvidia.com>
Date: Fri, 3 Oct 2025 12:57:04 -0700
Subject: [PATCH 04/29] MAINT cleanup

Signed-off-by: George Armstrong <georgea@nvidia.com>
---
 nemo_skills/pipeline/generate.py          | 57 +++++++++++------------
 nemo_skills/pipeline/utils/declarative.py | 28 +++++------
 2 files changed, 38 insertions(+), 47 deletions(-)

diff --git a/nemo_skills/pipeline/generate.py b/nemo_skills/pipeline/generate.py
index e18d7e2543..959635f8bc 100644
--- a/nemo_skills/pipeline/generate.py
+++ b/nemo_skills/pipeline/generate.py
@@ -53,36 +53,33 @@ def _create_commandgroup_from_config(
     task_name: str,
     log_dir: str,
 ) -> CommandGroup:
-    """Create a CommandGroup from server_config, matching add_task logic.
+    """Create a CommandGroup from server_config.
 
-    This function replicates the component ordering from add_task:
+    Component ordering:
     1. Server (if server_config provided)
-    2. Main client command
+    2. Client command
     3. Sandbox (if with_sandbox=True)
     """
 
     components = []
 
-    # 1. Add server if server_config is provided (matches add_task lines 433-463)
+    # 1. Add server if server_config is provided
     if server_config is not None and int(server_config["num_gpus"]) > 0:
-        # Extract server container (matches line 435)
         server_type = server_config["server_type"]
         server_container = server_config.pop("container", cluster_config["containers"][server_type])
 
-        # Use the EXISTING get_server_command function via lambda
-        # This ensures we use the same server command construction as add_task
+        # Create server command builder that defers execution until cluster_config is available
         server_config_copy = server_config.copy()
 
         def make_server_cmd(cfg):
             cmd, num_tasks = get_server_command_fn(**server_config_copy, cluster_config=cfg)
-            # Include log_prefix in metadata to match old add_task behavior
             return (
                 cmd,
                 {
                     "num_tasks": num_tasks,
                     "gpus": server_config_copy["num_gpus"],
                     "nodes": server_config_copy["num_nodes"],
-                    "log_prefix": "server",  # Explicitly set log prefix
+                    "log_prefix": "server",
                 },
             )
 
@@ -91,24 +88,23 @@ def make_server_cmd(cfg):
             container=server_container,
             gpus=server_config["num_gpus"],
             nodes=server_config["num_nodes"],
-            name=task_name,  # Use base task_name, not with _server suffix
+            name=task_name,
         )
         components.append(server_cmd)
 
-    # 2. Add main generation command (matches add_task lines 466-580)
+    # 2. Add main generation command
     client_cmd = Command(
-        command=generation_cmd,  # Already built with get_generation_cmd!
+        command=generation_cmd,
         container=cluster_config["containers"]["nemo-skills"],
-        name=task_name,  # Use base task_name, not with _client suffix
+        name=task_name,
         installation_command=installation_command,
-        metadata={"log_prefix": "main"},  # Set log prefix to match old add_task
+        metadata={"log_prefix": "main"},
     )
     components.append(client_cmd)
 
-    # 3. Add sandbox if requested (matches add_task lines 527-565)
+    # 3. Add sandbox if requested
     if with_sandbox:
-        # Use existing sandbox_command builder
-        # Wrap to ensure log_prefix is set in returned metadata
+
         def make_sandbox_cmd(cfg):
             # sandbox_command returns (callable, metadata), so we need to call the callable
             cmd_builder, initial_metadata = sandbox_command(port=sandbox_port, keep_mounts=keep_mounts_for_sandbox)
@@ -117,18 +113,18 @@ def make_sandbox_cmd(cfg):
             # Merge metadata
             metadata = initial_metadata.copy()
             metadata.update(runtime_metadata)
-            metadata["log_prefix"] = "sandbox"  # Explicitly set log prefix
+            metadata["log_prefix"] = "sandbox"
             return (cmd_string, metadata)
 
         sandbox_cmd = Command(
             command=make_sandbox_cmd,
             container=cluster_config["containers"]["sandbox"],
-            name=task_name,  # Use base task_name, not with _sandbox suffix
+            name=task_name,
         )
         components.append(sandbox_cmd)
 
-    # Find MAXIMUM GPUs needed by any component for the HardwareConfig
-    # This is critical for multi-component jobs - the job-level request must be the max
+    # Find maximum GPUs/nodes needed by any component for the HardwareConfig
+    # The job-level resource request must be the maximum across all components
     max_gpus = max((comp.gpus or 0) for comp in components)
     max_nodes = max((comp.nodes or 1) for comp in components)
 
@@ -417,13 +413,12 @@ def generate(
             )
             cmd = pipeline_utils.wrap_python_path(cmd=cmd)
 
-            # Base task name (same for all dependent jobs in chain - matches original!)
+            # Base task name (shared across all dependent jobs in the chain)
             task_name = f"{expname}-rs{seed}" if seed is not None else expname
             if chunk_id is not None:
                 task_name += f"-chunk{chunk_id}"
 
-            # Handle dependent_jobs chain (matching add_task behavior)
-            # Note: run_after will be handled by Stage for jobs that don't have task_dependencies
+            # Handle dependent_jobs chain
             dependencies = _task_dependencies.copy() if _task_dependencies else []
 
             for dep_idx in range(dependent_jobs + 1):
@@ -448,16 +443,16 @@ def generate(
                 # Use unique internal job name for dependency tracking, but same task_name
                 internal_job_name = f"{task_name}-dep{dep_idx}" if dep_idx > 0 else task_name
 
-                # Build dependencies: first job gets _task_dependencies+run_after, rest get previous in chain
+                # Build dependencies: first job in chain gets external dependencies, rest chain to previous
                 if dep_idx == 0:
-                    # First job: add run_after if no task_dependencies (matching add_task logic)
+                    # First job: add run_after if no task_dependencies
                     job_deps = dependencies.copy() if dependencies else []
                     if not dependencies and run_after:
                         run_after_list = run_after if isinstance(run_after, list) else [run_after]
                         job_deps.extend(run_after_list)
                     job_deps = job_deps if job_deps else None
                 else:
-                    # Subsequent jobs in chain depend on previous
+                    # Subsequent jobs in chain depend on previous job
                     job_deps = [f"{task_name}-dep{dep_idx - 1}"]
 
                 jobs.append(
@@ -479,14 +474,14 @@ def generate(
         # For internal use: return job names for dependency tracking
         return all_job_names
 
-    # Create and run pipeline (with performance optimizations)
+    # Create and run pipeline
     pipeline = Pipeline(
         name=expname,
         cluster=cluster,
         jobs=jobs,
-        reuse_code=reuse_code,  # PERFORMANCE: Enable code reuse
-        reuse_code_exp=reuse_code_exp,  # PERFORMANCE: Reuse from specific exp
-        skip_hf_home_check=skip_hf_home_check,  # Validation control
+        reuse_code=reuse_code,
+        reuse_code_exp=reuse_code_exp,
+        skip_hf_home_check=skip_hf_home_check,
     )
 
     result = pipeline.run(cluster_config=cluster_config, dry_run=dry_run)
diff --git a/nemo_skills/pipeline/utils/declarative.py b/nemo_skills/pipeline/utils/declarative.py
index 415f542457..8c604c4655 100644
--- a/nemo_skills/pipeline/utils/declarative.py
+++ b/nemo_skills/pipeline/utils/declarative.py
@@ -38,7 +38,7 @@
     pipeline = Pipeline(
         name="my_inference",
         cluster="local",
-        groups=[inference_group]  # Legacy mode: single job
+        groups=[inference_group]
     )
     pipeline.run()
 
@@ -158,7 +158,7 @@ class Command:
     het_group_index: Optional[int] = None  # Set per-job by Pipeline (not global)
 
     def __post_init__(self):
-        # Initialize component (merged from old Component class)
+        # Initialize defaults
         if self.name is None:
             self.name = "command"
 
@@ -229,7 +229,7 @@ def prepare_for_execution(self, cluster_config: Dict) -> Tuple[str, Dict]:
         else:
             final_command = self.command
 
-        # 2. Wrap with installation_command if provided (matches add_task behavior)
+        # 2. Wrap with installation_command if provided
         if self.installation_command:
             final_command = install_packages_wrap(final_command, self.installation_command)
 
@@ -272,7 +272,7 @@ def __init__(
         name: Optional[str] = None,
         log_dir: Optional[str] = None,
     ):
-        self.components = commands  # Keep as self.components internally for backwards compatibility
+        self.components = commands
         self.hardware = hardware or HardwareConfig()
         self.name = name
         self.log_dir = log_dir
@@ -357,7 +357,7 @@ def run(
         else:
             final_cluster_config = self._get_cluster_config()
 
-        # Validate HF_HOME (matching add_task validation)
+        # Validate HF_HOME environment variable
         if final_cluster_config["executor"] != "none" and not self.skip_hf_home_check:
             env_vars = get_env_variables(final_cluster_config)
             if "HF_HOME" not in env_vars:
@@ -377,7 +377,7 @@ def run(
             for job_spec in self.jobs:
                 job_name = job_spec.get("name", "unnamed")
 
-                # Resolve dependencies to task handles (matching add_task lines 390-404)
+                # Resolve dependencies to task handles
                 run_after_deps = []
 
                 # Handle dependencies from job spec
@@ -386,7 +386,7 @@ def run(
                 if job_dependencies is None:
                     job_dependencies = []
 
-                # If no dependencies and Pipeline has run_after, apply it (matching add_task behavior)
+                # If no job-level dependencies, apply pipeline-level run_after
                 if not job_dependencies and self.run_after:
                     run_after_list = self.run_after if isinstance(self.run_after, list) else [self.run_after]
                     job_dependencies = run_after_list
@@ -411,7 +411,7 @@ def run(
                                     f"Job '{job_name}' depends on external experiment '{dep}' ({len(exp_handles)} tasks)"
                                 )
 
-                # Convert empty list to None (matching add_task line 402)
+                # Convert empty list to None for cleaner handling
                 if len(run_after_deps) == 0:
                     run_after_deps = None
 
@@ -454,7 +454,7 @@ def run(
             if not dry_run:
                 run_exp(exp, final_cluster_config)
 
-                # Cache experiment for code reuse in future runs (matching add_task behavior)
+                # Cache experiment for code reuse in future runs
                 if final_cluster_config["executor"] != "none":
                     tunnel = get_tunnel(final_cluster_config)
                     cur_tunnel_hash = tunnel_hash(tunnel)
@@ -559,7 +559,7 @@ def _plan_and_add_job(
                     _, exec_config_probe = command.prepare_for_execution(cluster_config)
                     shared_env_vars.update(exec_config_probe.get("environment", {}))
 
-        # PERFORMANCE: Track first packager to share across executors (single-group only)
+        # Share packager across executors for efficiency (single-group only)
         shared_packager = None
 
         # Build commands and executors
@@ -624,7 +624,7 @@ def _plan_and_add_job(
         if heterogeneous and executors:
             executors[0].het_group_indices = het_group_indices
 
-        # PERFORMANCE: Handle code reuse from previous experiments (single-group only)
+        # Handle code reuse from previous experiments (single-group only)
         if (not heterogeneous) and cluster_config["executor"] != "none":
             tunnel = get_tunnel(cluster_config)
             if self.reuse_code:
@@ -650,7 +650,7 @@ def _plan_and_add_job(
                         else:
                             LOG.warning(f"Relevant packaging job not found for experiment {reuse_exp._title}")
             else:
-                # If reuse_code=False, clear cache (matching original behavior)
+                # If reuse_code=False, clear cache
                 REUSE_CODE_EXP.pop(tunnel_hash(tunnel), None)
 
         # Handle executor="none" path replacements (single-group only)
@@ -725,7 +725,3 @@ def _add_multi_group_job(
             run_after=run_after,
             heterogeneous=True,
         )
-
-
-# Backwards compatibility alias
-Pipeline = Pipeline

From 934f3344f496f2e9f33c50103a20e733a3b0162c Mon Sep 17 00:00:00 2001
From: George Armstrong <georgea@nvidia.com>
Date: Fri, 3 Oct 2025 13:01:48 -0700
Subject: [PATCH 05/29] TST first round of tests

Signed-off-by: George Armstrong <georgea@nvidia.com>
---
 tests/test_declarative_pipeline.py | 600 +++++++++++++++++++++++++++++
 1 file changed, 600 insertions(+)
 create mode 100644 tests/test_declarative_pipeline.py

diff --git a/tests/test_declarative_pipeline.py b/tests/test_declarative_pipeline.py
new file mode 100644
index 0000000000..6af115675f
--- /dev/null
+++ b/tests/test_declarative_pipeline.py
@@ -0,0 +1,600 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for the declarative pipeline system."""
+
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from nemo_skills.pipeline.utils.declarative import Command, CommandGroup, HardwareConfig, Pipeline
+
+
+class TestCommand:
+    """Test Command class functionality."""
+
+    def test_command_basic_string(self):
+        """Test creating a Command with a simple string."""
+        cmd = Command(command="echo hello", name="test")
+        assert cmd.name == "test"
+        assert cmd.container == "nemo-skills"
+        assert cmd.gpus is None
+        assert cmd.nodes == 1
+
+    def test_command_with_metadata(self):
+        """Test Command with metadata tuple."""
+        cmd = Command(command=("echo hello", {"port": 8080, "log_prefix": "server"}), name="server")
+        assert cmd.metadata["port"] == 8080
+        assert cmd.metadata["log_prefix"] == "server"
+        # Command gets wrapped with working_dir by default
+        assert "echo hello" in cmd.command
+
+    def test_command_with_callable(self):
+        """Test Command with callable that returns tuple."""
+
+        def make_cmd():
+            return ("echo world", {"port": 5000})
+
+        cmd = Command(command=make_cmd, name="dynamic")
+        assert callable(cmd.command)
+        assert cmd.name == "dynamic"
+
+    def test_command_prepare_for_execution_string(self):
+        """Test prepare_for_execution with string command."""
+        cmd = Command(command="python script.py", gpus=2, name="test")
+        cluster_config = {"executor": "local", "containers": {}}
+
+        final_cmd, exec_config = cmd.prepare_for_execution(cluster_config)
+
+        assert "python script.py" in final_cmd
+        assert exec_config["num_gpus"] == 2
+        assert exec_config["num_nodes"] == 1
+        assert exec_config["num_tasks"] == 1
+
+    def test_command_prepare_for_execution_callable(self):
+        """Test prepare_for_execution with callable command."""
+
+        def make_cmd():
+            return "echo test"
+
+        cmd = Command(command=make_cmd, name="test")
+        cluster_config = {"executor": "local", "containers": {}}
+
+        final_cmd, exec_config = cmd.prepare_for_execution(cluster_config)
+
+        assert final_cmd == "echo test"
+
+    def test_command_prepare_for_execution_callable_with_metadata(self):
+        """Test prepare_for_execution with callable returning tuple."""
+
+        def make_cmd():
+            return ("echo metadata", {"num_tasks": 4, "environment": {"VAR": "value"}})
+
+        cmd = Command(command=make_cmd, name="test")
+        cluster_config = {"executor": "local", "containers": {}}
+
+        final_cmd, exec_config = cmd.prepare_for_execution(cluster_config)
+
+        assert final_cmd == "echo metadata"
+        assert exec_config["num_tasks"] == 4
+        assert exec_config["environment"]["VAR"] == "value"
+
+    def test_command_prepare_for_execution_with_cluster_config(self):
+        """Test prepare_for_execution with callable needing cluster_config."""
+
+        def make_cmd(cfg):
+            return f"echo {cfg['test_param']}"
+
+        cmd = Command(command=make_cmd, name="test")
+        cluster_config = {"executor": "local", "containers": {}, "test_param": "value123"}
+
+        final_cmd, exec_config = cmd.prepare_for_execution(cluster_config)
+
+        assert final_cmd == "echo value123"
+
+    def test_command_meta_ref(self):
+        """Test meta_ref for accessing metadata."""
+        cmd = Command(command=("echo test", {"port": 8080, "host": "localhost"}), name="server")
+
+        assert cmd.meta_ref("port") == "8080"
+        assert cmd.meta_ref("host") == "localhost"
+
+    def test_command_meta_ref_missing_key(self):
+        """Test meta_ref with missing key raises KeyError."""
+        cmd = Command(command="echo test", name="test")
+
+        with pytest.raises(KeyError, match="Metadata key 'port' not found"):
+            cmd.meta_ref("port")
+
+    def test_command_hostname_ref_none(self):
+        """Test hostname_ref returns localhost when het_group_index is None."""
+        cmd = Command(command="echo test", name="test")
+        assert cmd.het_group_index is None
+        assert cmd.hostname_ref() == "127.0.0.1"
+
+    def test_command_hostname_ref_heterogeneous(self):
+        """Test hostname_ref returns SLURM variable when het_group_index is set."""
+        cmd = Command(command="echo test", name="test")
+        cmd.het_group_index = 2
+
+        hostname = cmd.hostname_ref()
+        assert "$SLURM_JOB_NODELIST_HET_GROUP_2" in hostname
+        assert "scontrol" in hostname
+
+    def test_command_with_installation_command(self):
+        """Test Command with installation_command."""
+        cmd = Command(command="python script.py", installation_command="pip install package", name="test")
+        cluster_config = {"executor": "local", "containers": {}}
+
+        final_cmd, _ = cmd.prepare_for_execution(cluster_config)
+
+        # Installation command should be wrapped around the main command
+        assert "pip install package" in final_cmd
+        assert "python script.py" in final_cmd
+
+    def test_command_env_vars_wrapping(self):
+        """Test that env_vars and working_dir are applied to string commands."""
+        cmd = Command(
+            command="python script.py",
+            env_vars={"MY_VAR": "value"},
+            working_dir="/custom/path",
+            name="test",
+        )
+
+        # The command should be wrapped with env setup
+        assert "export MY_VAR=value" in cmd.command
+        assert "cd /custom/path" in cmd.command
+
+
+class TestCommandGroup:
+    """Test CommandGroup class functionality."""
+
+    def test_commandgroup_basic(self):
+        """Test creating a basic CommandGroup."""
+        cmd1 = Command(command="echo 1", name="cmd1")
+        cmd2 = Command(command="echo 2", name="cmd2")
+
+        group = CommandGroup(commands=[cmd1, cmd2], name="test_group")
+
+        assert group.name == "test_group"
+        assert len(group.components) == 2
+        assert group.hardware is not None
+
+    def test_commandgroup_with_hardware(self):
+        """Test CommandGroup with HardwareConfig."""
+        cmd = Command(command="echo test", name="cmd")
+        hardware = HardwareConfig(partition="batch", time_min="01:00:00", num_gpus=8)
+
+        group = CommandGroup(commands=[cmd], hardware=hardware, name="gpu_group")
+
+        assert group.hardware.partition == "batch"
+        assert group.hardware.time_min == "01:00:00"
+        assert group.hardware.num_gpus == 8
+
+    def test_commandgroup_with_log_dir(self):
+        """Test CommandGroup with log_dir."""
+        cmd = Command(command="echo test", name="cmd")
+        group = CommandGroup(commands=[cmd], log_dir="/logs/test", name="group")
+
+        assert group.log_dir == "/logs/test"
+
+
+class TestPipeline:
+    """Test Pipeline class functionality."""
+
+    def test_pipeline_with_groups_legacy_mode(self):
+        """Test Pipeline with groups parameter (legacy mode)."""
+        cmd = Command(command="echo test", name="cmd")
+        group = CommandGroup(commands=[cmd], name="group")
+
+        pipeline = Pipeline(name="test_pipeline", cluster="local", groups=[group])
+
+        assert pipeline.name == "test_pipeline"
+        assert pipeline._legacy_mode is True
+        assert len(pipeline.jobs) == 1
+        assert "group" in pipeline.jobs[0]
+
+    def test_pipeline_with_jobs(self):
+        """Test Pipeline with jobs parameter."""
+        cmd1 = Command(command="echo 1", name="cmd1")
+        group1 = CommandGroup(commands=[cmd1], name="group1", log_dir="/logs")
+
+        cmd2 = Command(command="echo 2", name="cmd2")
+        group2 = CommandGroup(commands=[cmd2], name="group2", log_dir="/logs")
+
+        jobs = [
+            {"name": "job1", "group": group1},
+            {"name": "job2", "group": group2, "dependencies": ["job1"]},
+        ]
+
+        pipeline = Pipeline(name="test_pipeline", cluster="local", jobs=jobs)
+
+        assert pipeline.name == "test_pipeline"
+        assert pipeline._legacy_mode is False
+        assert len(pipeline.jobs) == 2
+
+    def test_pipeline_cannot_specify_both_groups_and_jobs(self):
+        """Test that Pipeline raises error when both groups and jobs are specified."""
+        cmd = Command(command="echo test", name="cmd")
+        group = CommandGroup(commands=[cmd], name="group")
+        jobs = [{"name": "job", "group": group}]
+
+        with pytest.raises(ValueError, match="Cannot specify both 'groups' and 'jobs'"):
+            Pipeline(name="test", cluster="local", groups=[group], jobs=jobs)
+
+    def test_pipeline_must_specify_either_groups_or_jobs(self):
+        """Test that Pipeline raises error when neither groups nor jobs are specified."""
+        with pytest.raises(ValueError, match="Must specify either 'groups' or 'jobs'"):
+            Pipeline(name="test", cluster="local")
+
+    def test_pipeline_with_run_after(self):
+        """Test Pipeline with run_after parameter."""
+        cmd = Command(command="echo test", name="cmd")
+        group = CommandGroup(commands=[cmd], name="group")
+
+        pipeline = Pipeline(name="test", cluster="local", groups=[group], run_after="other_exp")
+
+        assert pipeline.run_after == "other_exp"
+
+    def test_pipeline_with_run_after_list(self):
+        """Test Pipeline with run_after as list."""
+        cmd = Command(command="echo test", name="cmd")
+        group = CommandGroup(commands=[cmd], name="group")
+
+        pipeline = Pipeline(name="test", cluster="local", groups=[group], run_after=["exp1", "exp2"])
+
+        assert pipeline.run_after == ["exp1", "exp2"]
+
+    @patch("nemo_skills.pipeline.utils.declarative.get_cluster_config")
+    def test_pipeline_get_cluster_config(self, mock_get_config):
+        """Test _get_cluster_config method."""
+        mock_config = {"executor": "local", "containers": {}}
+        mock_get_config.return_value = mock_config
+
+        cmd = Command(command="echo test", name="cmd")
+        group = CommandGroup(commands=[cmd], name="group")
+        pipeline = Pipeline(name="test", cluster="local", groups=[group])
+
+        config = pipeline._get_cluster_config()
+
+        assert config == mock_config
+        mock_get_config.assert_called_once_with("local")
+
+    @patch("nemo_skills.pipeline.utils.declarative.get_cluster_config")
+    def test_pipeline_missing_cluster_config(self, mock_get_config):
+        """Test that Pipeline raises error when cluster is not specified."""
+        cmd = Command(command="echo test", name="cmd")
+        group = CommandGroup(commands=[cmd], name="group")
+        pipeline = Pipeline(name="test", groups=[group])  # No cluster specified
+
+        with pytest.raises(ValueError, match="Must specify cluster"):
+            pipeline._get_cluster_config()
+
+
+class TestPipelineExecution:
+    """Test Pipeline execution and job management."""
+
+    @patch("nemo_skills.pipeline.utils.declarative.get_exp")
+    @patch("nemo_skills.pipeline.utils.declarative.get_cluster_config")
+    @patch("nemo_skills.pipeline.utils.declarative.get_env_variables")
+    @patch("nemo_skills.pipeline.utils.declarative.run_exp")
+    def test_pipeline_run_basic(self, mock_run_exp, mock_env_vars, mock_get_config, mock_get_exp):
+        """Test basic pipeline execution."""
+        # Setup mocks
+        mock_config = {
+            "executor": "none",
+            "containers": {"nemo-skills": "container:latest"},
+        }
+        mock_get_config.return_value = mock_config
+        mock_env_vars.return_value = {"HF_HOME": "/hf"}
+
+        mock_exp = MagicMock()
+        mock_exp.add.return_value = "task_handle_1"
+        mock_get_exp.return_value.__enter__.return_value = mock_exp
+
+        # Create pipeline
+        cmd = Command(command="echo test", name="cmd")
+        group = CommandGroup(commands=[cmd], name="group", log_dir="/logs")
+        pipeline = Pipeline(name="test", cluster="local", groups=[group], skip_hf_home_check=True)
+
+        # Run pipeline
+        result = pipeline.run(cluster_config=mock_config, dry_run=True)
+
+        # Verify
+        assert result == mock_exp
+        mock_exp.add.assert_called_once()
+
+    @patch("nemo_skills.pipeline.utils.declarative.get_exp")
+    @patch("nemo_skills.pipeline.utils.declarative.get_cluster_config")
+    @patch("nemo_skills.pipeline.utils.declarative.get_env_variables")
+    @patch("nemo_skills.pipeline.utils.declarative.run_exp")
+    def test_pipeline_run_with_dependencies(self, mock_run_exp, mock_env_vars, mock_get_config, mock_get_exp):
+        """Test pipeline execution with job dependencies."""
+        # Setup mocks
+        mock_config = {
+            "executor": "none",
+            "containers": {"nemo-skills": "container:latest"},
+        }
+        mock_get_config.return_value = mock_config
+        mock_env_vars.return_value = {"HF_HOME": "/hf"}
+
+        mock_exp = MagicMock()
+        mock_exp.add.side_effect = ["handle_1", "handle_2"]
+        mock_get_exp.return_value.__enter__.return_value = mock_exp
+
+        # Create pipeline with dependencies
+        cmd1 = Command(command="echo 1", name="cmd1")
+        group1 = CommandGroup(commands=[cmd1], name="group1", log_dir="/logs")
+
+        cmd2 = Command(command="echo 2", name="cmd2")
+        group2 = CommandGroup(commands=[cmd2], name="group2", log_dir="/logs")
+
+        jobs = [
+            {"name": "job1", "group": group1},
+            {"name": "job2", "group": group2, "dependencies": ["job1"]},
+        ]
+        pipeline = Pipeline(name="test", cluster="local", jobs=jobs, skip_hf_home_check=True)
+
+        # Run pipeline
+        pipeline.run(cluster_config=mock_config, dry_run=True)
+
+        # Verify both jobs were added
+        assert mock_exp.add.call_count == 2
+
+    @patch("nemo_skills.pipeline.utils.declarative.get_exp")
+    @patch("nemo_skills.pipeline.utils.declarative.get_cluster_config")
+    @patch("nemo_skills.pipeline.utils.declarative.get_env_variables")
+    @patch("nemo_skills.pipeline.utils.declarative.is_mounted_filepath")
+    @patch("nemo_skills.pipeline.utils.declarative.get_executor")
+    def test_pipeline_hf_home_validation(
+        self, mock_get_executor, mock_is_mounted, mock_env_vars, mock_get_config, mock_get_exp
+    ):
+        """Test HF_HOME validation."""
+        mock_config = {
+            "executor": "slurm",
+            "containers": {"nemo-skills": "container:latest"},
+            "account": "test_account",
+        }
+        mock_get_config.return_value = mock_config
+        mock_env_vars.return_value = {"HF_HOME": "/hf"}
+        mock_is_mounted.return_value = True
+        mock_get_executor.return_value = MagicMock()
+
+        mock_exp = MagicMock()
+        mock_exp.add.return_value = "handle"
+        mock_get_exp.return_value.__enter__.return_value = mock_exp
+
+        cmd = Command(command="echo test", name="cmd")
+        group = CommandGroup(commands=[cmd], name="group", log_dir="/logs")
+        pipeline = Pipeline(name="test", cluster="local", groups=[group])
+
+        # Should not raise
+        pipeline.run(cluster_config=mock_config, dry_run=True)
+
+        # Verify executor was created
+        assert mock_get_executor.called
+
+    @patch("nemo_skills.pipeline.utils.declarative.get_cluster_config")
+    @patch("nemo_skills.pipeline.utils.declarative.get_env_variables")
+    def test_pipeline_hf_home_missing(self, mock_env_vars, mock_get_config):
+        """Test that missing HF_HOME raises error."""
+        mock_config = {"executor": "slurm", "containers": {}}
+        mock_get_config.return_value = mock_config
+        mock_env_vars.return_value = {}  # No HF_HOME
+
+        cmd = Command(command="echo test", name="cmd")
+        group = CommandGroup(commands=[cmd], name="group", log_dir="/logs")
+        pipeline = Pipeline(name="test", cluster="local", groups=[group])
+
+        with pytest.raises(RuntimeError, match="HF_HOME is missing"):
+            pipeline.run(cluster_config=mock_config, dry_run=True)
+
+    @patch("nemo_skills.pipeline.utils.declarative.get_cluster_config")
+    @patch("nemo_skills.pipeline.utils.declarative.get_env_variables")
+    @patch("nemo_skills.pipeline.utils.declarative.is_mounted_filepath")
+    def test_pipeline_hf_home_not_mounted(self, mock_is_mounted, mock_env_vars, mock_get_config):
+        """Test that non-mounted HF_HOME raises error."""
+        mock_config = {"executor": "slurm", "containers": {}}
+        mock_get_config.return_value = mock_config
+        mock_env_vars.return_value = {"HF_HOME": "/hf"}
+        mock_is_mounted.return_value = False
+
+        cmd = Command(command="echo test", name="cmd")
+        group = CommandGroup(commands=[cmd], name="group", log_dir="/logs")
+        pipeline = Pipeline(name="test", cluster="local", groups=[group])
+
+        with pytest.raises(RuntimeError, match="is not a mounted path"):
+            pipeline.run(cluster_config=mock_config, dry_run=True)
+
+
+class TestHetGroupIndices:
+    """Test het_group_index assignment."""
+
+    @patch("nemo_skills.pipeline.utils.declarative.get_exp")
+    @patch("nemo_skills.pipeline.utils.declarative.get_env_variables")
+    def test_het_group_index_non_heterogeneous(self, mock_env_vars, mock_get_exp):
+        """Test that non-heterogeneous jobs have het_group_index=None."""
+        mock_config = {
+            "executor": "none",
+            "containers": {"nemo-skills": "container:latest"},
+        }
+        mock_env_vars.return_value = {"HF_HOME": "/hf"}
+
+        mock_exp = MagicMock()
+        mock_exp.add.return_value = "handle"
+        mock_get_exp.return_value.__enter__.return_value = mock_exp
+
+        # Create single-group job with multiple components
+        cmd1 = Command(command="echo 1", name="cmd1")
+        cmd2 = Command(command="echo 2", name="cmd2")
+        group = CommandGroup(commands=[cmd1, cmd2], name="group", log_dir="/logs")
+
+        pipeline = Pipeline(name="test", cluster="local", groups=[group], skip_hf_home_check=True)
+        pipeline.run(cluster_config=mock_config, dry_run=True)
+
+        # Both commands should have None het_group_index (localhost communication)
+        assert cmd1.het_group_index is None
+        assert cmd2.het_group_index is None
+        assert cmd1.hostname_ref() == "127.0.0.1"
+        assert cmd2.hostname_ref() == "127.0.0.1"
+
+    @patch("nemo_skills.pipeline.utils.declarative.get_exp")
+    @patch("nemo_skills.pipeline.utils.declarative.get_env_variables")
+    def test_het_group_index_heterogeneous(self, mock_env_vars, mock_get_exp):
+        """Test that heterogeneous jobs get per-job het_group_index."""
+        mock_config = {
+            "executor": "none",
+            "containers": {"nemo-skills": "container:latest"},
+        }
+        mock_env_vars.return_value = {"HF_HOME": "/hf"}
+
+        mock_exp = MagicMock()
+        mock_exp.add.return_value = "handle"
+        mock_get_exp.return_value.__enter__.return_value = mock_exp
+
+        # Create multi-group heterogeneous job
+        cmd1 = Command(command="echo 1", name="cmd1")
+        group1 = CommandGroup(commands=[cmd1], name="group1", log_dir="/logs")
+
+        cmd2 = Command(command="echo 2", name="cmd2")
+        group2 = CommandGroup(commands=[cmd2], name="group2", log_dir="/logs")
+
+        jobs = [{"name": "hetjob", "groups": [group1, group2]}]
+        pipeline = Pipeline(name="test", cluster="local", jobs=jobs, skip_hf_home_check=True)
+        pipeline.run(cluster_config=mock_config, dry_run=True)
+
+        # Commands should have het_group_index 0 and 1
+        assert cmd1.het_group_index == 0
+        assert cmd2.het_group_index == 1
+        assert "$SLURM_JOB_NODELIST_HET_GROUP_0" in cmd1.hostname_ref()
+        assert "$SLURM_JOB_NODELIST_HET_GROUP_1" in cmd2.hostname_ref()
+
+    @patch("nemo_skills.pipeline.utils.declarative.get_exp")
+    @patch("nemo_skills.pipeline.utils.declarative.get_env_variables")
+    def test_het_group_index_per_job_not_global(self, mock_env_vars, mock_get_exp):
+        """Test that het_group_index is per-job, not global across pipeline."""
+        mock_config = {
+            "executor": "none",
+            "containers": {"nemo-skills": "container:latest"},
+        }
+        mock_env_vars.return_value = {"HF_HOME": "/hf"}
+
+        mock_exp = MagicMock()
+        mock_exp.add.side_effect = ["handle1", "handle2"]
+        mock_get_exp.return_value.__enter__.return_value = mock_exp
+
+        # Create two separate heterogeneous jobs
+        cmd1 = Command(command="echo 1", name="cmd1")
+        group1 = CommandGroup(commands=[cmd1], name="group1", log_dir="/logs")
+
+        cmd2 = Command(command="echo 2", name="cmd2")
+        group2 = CommandGroup(commands=[cmd2], name="group2", log_dir="/logs")
+
+        cmd3 = Command(command="echo 3", name="cmd3")
+        group3 = CommandGroup(commands=[cmd3], name="group3", log_dir="/logs")
+
+        cmd4 = Command(command="echo 4", name="cmd4")
+        group4 = CommandGroup(commands=[cmd4], name="group4", log_dir="/logs")
+
+        jobs = [
+            {"name": "hetjob1", "groups": [group1, group2]},
+            {"name": "hetjob2", "groups": [group3, group4]},
+        ]
+        pipeline = Pipeline(name="test", cluster="local", jobs=jobs, skip_hf_home_check=True)
+        pipeline.run(cluster_config=mock_config, dry_run=True)
+
+        # Both jobs should have het_group_index starting from 0
+        assert cmd1.het_group_index == 0
+        assert cmd2.het_group_index == 1
+        assert cmd3.het_group_index == 0  # Starts from 0 again!
+        assert cmd4.het_group_index == 1
+
+
+class TestDependencyResolution:
+    """Test dependency resolution in Pipeline."""
+
+    @patch("nemo_skills.pipeline.utils.declarative.get_exp")
+    @patch("nemo_skills.pipeline.utils.declarative.get_env_variables")
+    def test_dependency_none_handling(self, mock_env_vars, mock_get_exp):
+        """Test that explicit None dependencies are handled correctly."""
+        mock_config = {
+            "executor": "none",
+            "containers": {"nemo-skills": "container:latest"},
+        }
+        mock_env_vars.return_value = {"HF_HOME": "/hf"}
+
+        mock_exp = MagicMock()
+        mock_exp.add.return_value = "handle"
+        mock_get_exp.return_value.__enter__.return_value = mock_exp
+
+        cmd = Command(command="echo test", name="cmd")
+        group = CommandGroup(commands=[cmd], name="group", log_dir="/logs")
+
+        jobs = [{"name": "job", "group": group, "dependencies": None}]
+        pipeline = Pipeline(name="test", cluster="local", jobs=jobs, skip_hf_home_check=True)
+
+        # Should not raise
+        pipeline.run(cluster_config=mock_config, dry_run=True)
+
+    @patch("nemo_skills.pipeline.utils.declarative.get_exp")
+    @patch("nemo_skills.pipeline.utils.declarative.get_env_variables")
+    def test_pipeline_run_after_applies_to_jobs(self, mock_env_vars, mock_get_exp):
+        """Test that pipeline-level run_after applies to jobs without dependencies."""
+        mock_config = {
+            "executor": "none",
+            "containers": {"nemo-skills": "container:latest"},
+        }
+        mock_env_vars.return_value = {"HF_HOME": "/hf"}
+
+        mock_exp = MagicMock()
+        mock_exp.add.return_value = "handle"
+        mock_get_exp.return_value.__enter__.return_value = mock_exp
+
+        cmd = Command(command="echo test", name="cmd")
+        group = CommandGroup(commands=[cmd], name="group", log_dir="/logs")
+
+        pipeline = Pipeline(
+            name="test", cluster="local", groups=[group], run_after="other_exp", skip_hf_home_check=True
+        )
+
+        # Should not raise and should apply run_after
+        pipeline.run(cluster_config=mock_config, dry_run=True)
+
+
+class TestErrorHandling:
+    """Test error handling in Pipeline."""
+
+    def test_pipeline_job_missing_group_or_groups(self):
+        """Test that job spec without group or groups raises error."""
+        jobs = [{"name": "bad_job"}]  # Missing 'group' or 'groups'
+
+        with pytest.raises(ValueError, match="must have either 'group' or 'groups'"):
+            pipeline = Pipeline(name="test", cluster="local", jobs=jobs)
+            mock_config = {"executor": "none", "containers": {}}
+            pipeline.run(cluster_config=mock_config, dry_run=True)
+
+    def test_commandgroup_missing_log_dir(self):
+        """Test that CommandGroup without log_dir raises error during execution."""
+        cmd = Command(command="echo test", name="cmd")
+        group = CommandGroup(commands=[cmd], name="group")  # No log_dir
+
+        pipeline = Pipeline(name="test", cluster="local", groups=[group])
+        mock_config = {"executor": "none", "containers": {}}
+
+        with pytest.raises(ValueError, match="must have log_dir set"):
+            pipeline.run(cluster_config=mock_config, dry_run=True)
+
+
+if __name__ == "__main__":
+    pytest.main([__file__, "-v"])

From 7005d4e0e64b3083797ac2cb53a01c0ad1fbd14d Mon Sep 17 00:00:00 2001
From: George Armstrong <georgea@nvidia.com>
Date: Fri, 3 Oct 2025 14:31:51 -0700
Subject: [PATCH 06/29] FIX _resuse_exp behavior

Signed-off-by: George Armstrong <georgea@nvidia.com>
---
 nemo_skills/pipeline/generate.py          |  8 ++---
 nemo_skills/pipeline/utils/declarative.py | 43 +++++++++++++++++------
 2 files changed, 35 insertions(+), 16 deletions(-)

diff --git a/nemo_skills/pipeline/generate.py b/nemo_skills/pipeline/generate.py
index 959635f8bc..180fe4d22f 100644
--- a/nemo_skills/pipeline/generate.py
+++ b/nemo_skills/pipeline/generate.py
@@ -469,11 +469,6 @@ def generate(
     if not jobs:
         return None
 
-    # Handle _reuse_exp case
-    if _reuse_exp:
-        # For internal use: return job names for dependency tracking
-        return all_job_names
-
     # Create and run pipeline
     pipeline = Pipeline(
         name=expname,
@@ -484,7 +479,8 @@ def generate(
         skip_hf_home_check=skip_hf_home_check,
     )
 
-    result = pipeline.run(cluster_config=cluster_config, dry_run=dry_run)
+    # Pass _reuse_exp to pipeline.run() to add jobs to existing experiment
+    result = pipeline.run(cluster_config=cluster_config, dry_run=dry_run, _reuse_exp=_reuse_exp)
     return result
 
 
diff --git a/nemo_skills/pipeline/utils/declarative.py b/nemo_skills/pipeline/utils/declarative.py
index 8c604c4655..7e55ac8d69 100644
--- a/nemo_skills/pipeline/utils/declarative.py
+++ b/nemo_skills/pipeline/utils/declarative.py
@@ -336,6 +336,7 @@ def run(
         cluster: Optional[str] = None,
         dry_run: bool = False,
         log_dir: Optional[str] = None,
+        _reuse_exp=None,
     ):
         """Execute the pipeline by calling NeMo-Run directly.
 
@@ -344,6 +345,7 @@ def run(
             cluster: Cluster name to load config from (optional if cluster_config provided)
             dry_run: If True, validate without executing
             log_dir: Default log directory for groups that don't specify one (optional)
+            _reuse_exp: Internal - reuse existing experiment object (for eval.py integration)
         """
         if not self.jobs:
             LOG.info("No jobs to execute")
@@ -372,7 +374,7 @@ def run(
         # Track job name -> task handle for dependency resolution
         job_name_to_handle = {}
 
-        with get_exp(self.name, final_cluster_config) as exp:
+        with get_exp(self.name, final_cluster_config, _reuse_exp) as exp:
             # Process each job in order
             for job_spec in self.jobs:
                 job_name = job_spec.get("name", "unnamed")
@@ -393,23 +395,39 @@ def run(
 
                 for dep in job_dependencies:
                     if isinstance(dep, str):
-                        # String dependency - look up by name
+                        # String dependency - could be job name, task handle, or experiment name
                         if dep in job_name_to_handle:
                             # Internal pipeline dependency - add the handle
                             run_after_deps.append(job_name_to_handle[dep])
                             LOG.info(f"Job '{job_name}' depends on '{dep}' (handle: {job_name_to_handle[dep]})")
                         else:
-                            # External experiment name - will be resolved by get_exp_handles below
+                            # Could be external experiment name OR task handle from _reuse_exp case
+                            # Try to get as experiment name first
                             if final_cluster_config["executor"] == "slurm":
                                 exp_handles = get_exp_handles(dep)
                                 if len(exp_handles) == 0:
                                     LOG.warning(
                                         f"No pending or running tasks found for experiment {dep}, cannot set dependencies."
                                     )
-                                run_after_deps.extend(exp_handles)
-                                LOG.info(
-                                    f"Job '{job_name}' depends on external experiment '{dep}' ({len(exp_handles)} tasks)"
-                                )
+                                    # If no experiment found, treat as direct task handle (for _reuse_exp case)
+                                    if _reuse_exp:
+                                        run_after_deps.append(dep)
+                                        LOG.info(
+                                            f"Job '{job_name}' depends on task handle '{dep}' (from reused experiment)"
+                                        )
+                                else:
+                                    run_after_deps.extend(exp_handles)
+                                    LOG.info(
+                                        f"Job '{job_name}' depends on external experiment '{dep}' ({len(exp_handles)} tasks)"
+                                    )
+                            elif _reuse_exp:
+                                # For non-SLURM executors with _reuse_exp, treat as task handle
+                                run_after_deps.append(dep)
+                                LOG.info(f"Job '{job_name}' depends on task handle '{dep}'")
+                    else:
+                        # Direct task handle object (not string)
+                        run_after_deps.append(dep)
+                        LOG.info(f"Job '{job_name}' depends on task handle (object)")
 
                 # Convert empty list to None for cleaner handling
                 if len(run_after_deps) == 0:
@@ -451,7 +469,8 @@ def run(
                 job_name_to_handle[job_name] = task_handle
                 LOG.info(f"Added job '{job_name}' with task_handle={task_handle}")
 
-            if not dry_run:
+            # Only run if not using existing experiment (matching generate_v0.py line 331)
+            if not dry_run and not _reuse_exp:
                 run_exp(exp, final_cluster_config)
 
                 # Cache experiment for code reuse in future runs
@@ -462,6 +481,10 @@ def run(
                         REUSE_CODE_EXP[cur_tunnel_hash] = exp
                         LOG.info("Cached experiment for future code reuse")
 
+            # When reusing experiment, return list of task handles (matching generate_v0.py line 335)
+            if _reuse_exp:
+                return list(job_name_to_handle.values())
+
             return exp
 
     def _prepare_command(self, command, cluster_config: Dict) -> Tuple[str, Dict]:
@@ -539,8 +562,8 @@ def _plan_and_add_job(
         differences are controlled by the 'heterogeneous' flag and the provided 'groups'.
         """
 
-        # Dependencies are only passed through for SLURM
-        dependencies = run_after if (run_after and cluster_config["executor"] == "slurm") else None
+        # Pass dependencies through (nemo-run handles executor-specific behavior)
+        dependencies = run_after if run_after else None
 
         # Resolve log directory (use first group's log_dir if present)
         log_dir = groups[0].log_dir or default_log_dir

From c6157786d776f1b6ad9d7984ca6d0644a5ee034a Mon Sep 17 00:00:00 2001
From: George Armstrong <georgea@nvidia.com>
Date: Fri, 3 Oct 2025 14:36:46 -0700
Subject: [PATCH 07/29] Update nemo_skills/pipeline/generate.py

Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
Signed-off-by: George Armstrong <georgea@nvidia.com>
---
 nemo_skills/pipeline/generate.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/nemo_skills/pipeline/generate.py b/nemo_skills/pipeline/generate.py
index 180fe4d22f..26a0953e2a 100644
--- a/nemo_skills/pipeline/generate.py
+++ b/nemo_skills/pipeline/generate.py
@@ -110,12 +110,17 @@ def make_sandbox_cmd(cfg):
             cmd_builder, initial_metadata = sandbox_command(port=sandbox_port, keep_mounts=keep_mounts_for_sandbox)
             # Call the builder to get the actual command string
             cmd_string, runtime_metadata = cmd_builder(cfg)
-            # Merge metadata
+            # Merge metadata (deep-merge environment)
             metadata = initial_metadata.copy()
-            metadata.update(runtime_metadata)
+            if "environment" in runtime_metadata:
+                env = metadata.get("environment", {}).copy()
+                env.update(runtime_metadata["environment"])
+                metadata["environment"] = env
+            for k, v in runtime_metadata.items():
+                if k != "environment":
+                    metadata[k] = v
             metadata["log_prefix"] = "sandbox"
             return (cmd_string, metadata)
-
         sandbox_cmd = Command(
             command=make_sandbox_cmd,
             container=cluster_config["containers"]["sandbox"],

From 62948b1d6441bf905c9aa3bcf9504704a9ce36ba Mon Sep 17 00:00:00 2001
From: George Armstrong <georgea@nvidia.com>
Date: Fri, 3 Oct 2025 14:41:15 -0700
Subject: [PATCH 08/29] lint

Signed-off-by: George Armstrong <georgea@nvidia.com>
---
 nemo_skills/pipeline/generate.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/nemo_skills/pipeline/generate.py b/nemo_skills/pipeline/generate.py
index 26a0953e2a..bebee08c32 100644
--- a/nemo_skills/pipeline/generate.py
+++ b/nemo_skills/pipeline/generate.py
@@ -121,11 +121,13 @@ def make_sandbox_cmd(cfg):
                     metadata[k] = v
             metadata["log_prefix"] = "sandbox"
             return (cmd_string, metadata)
+
         sandbox_cmd = Command(
             command=make_sandbox_cmd,
             container=cluster_config["containers"]["sandbox"],
             name=task_name,
         )
+
         components.append(sandbox_cmd)
 
     # Find maximum GPUs/nodes needed by any component for the HardwareConfig

From b1ef852f2772d308a212b22e9c44280f935cae13 Mon Sep 17 00:00:00 2001
From: George Armstrong <georgea@nvidia.com>
Date: Mon, 6 Oct 2025 15:01:24 -0700
Subject: [PATCH 09/29] MAINT remove lambdas dependent on cluster_cfg

Signed-off-by: George Armstrong <georgea@nvidia.com>
---
 nemo_skills/pipeline/generate.py          | 51 ++++++----------
 nemo_skills/pipeline/utils/commands.py    | 73 ++++++++++-------------
 nemo_skills/pipeline/utils/declarative.py | 36 +++++++----
 3 files changed, 76 insertions(+), 84 deletions(-)

diff --git a/nemo_skills/pipeline/generate.py b/nemo_skills/pipeline/generate.py
index bebee08c32..3d7b0ea15c 100644
--- a/nemo_skills/pipeline/generate.py
+++ b/nemo_skills/pipeline/generate.py
@@ -68,27 +68,25 @@ def _create_commandgroup_from_config(
         server_type = server_config["server_type"]
         server_container = server_config.pop("container", cluster_config["containers"][server_type])
 
-        # Create server command builder that defers execution until cluster_config is available
-        server_config_copy = server_config.copy()
-
-        def make_server_cmd(cfg):
-            cmd, num_tasks = get_server_command_fn(**server_config_copy, cluster_config=cfg)
-            return (
-                cmd,
-                {
-                    "num_tasks": num_tasks,
-                    "gpus": server_config_copy["num_gpus"],
-                    "nodes": server_config_copy["num_nodes"],
-                    "log_prefix": "server",
-                },
-            )
+        # Call server command builder directly with cluster_config
+        cmd, metadata = get_server_command_fn(**server_config, cluster_config=cluster_config)
+
+        # Add additional metadata
+        metadata.update(
+            {
+                "gpus": server_config["num_gpus"],
+                "nodes": server_config["num_nodes"],
+                "log_prefix": "server",
+            }
+        )
 
         server_cmd = Command(
-            command=make_server_cmd,
+            command=cmd,
             container=server_container,
             gpus=server_config["num_gpus"],
             nodes=server_config["num_nodes"],
             name=task_name,
+            metadata=metadata,
         )
         components.append(server_cmd)
 
@@ -104,28 +102,15 @@ def make_server_cmd(cfg):
 
     # 3. Add sandbox if requested
     if with_sandbox:
-
-        def make_sandbox_cmd(cfg):
-            # sandbox_command returns (callable, metadata), so we need to call the callable
-            cmd_builder, initial_metadata = sandbox_command(port=sandbox_port, keep_mounts=keep_mounts_for_sandbox)
-            # Call the builder to get the actual command string
-            cmd_string, runtime_metadata = cmd_builder(cfg)
-            # Merge metadata (deep-merge environment)
-            metadata = initial_metadata.copy()
-            if "environment" in runtime_metadata:
-                env = metadata.get("environment", {}).copy()
-                env.update(runtime_metadata["environment"])
-                metadata["environment"] = env
-            for k, v in runtime_metadata.items():
-                if k != "environment":
-                    metadata[k] = v
-            metadata["log_prefix"] = "sandbox"
-            return (cmd_string, metadata)
+        # Call sandbox command builder directly with cluster_config
+        cmd, metadata = sandbox_command(cluster_config=cluster_config, port=sandbox_port)
+        metadata["log_prefix"] = "sandbox"
 
         sandbox_cmd = Command(
-            command=make_sandbox_cmd,
+            command=cmd,
             container=cluster_config["containers"]["sandbox"],
             name=task_name,
+            metadata=metadata,
         )
 
         components.append(sandbox_cmd)
diff --git a/nemo_skills/pipeline/utils/commands.py b/nemo_skills/pipeline/utils/commands.py
index f2402c590b..12413a5677 100644
--- a/nemo_skills/pipeline/utils/commands.py
+++ b/nemo_skills/pipeline/utils/commands.py
@@ -19,13 +19,14 @@
 in server.py and other modules, designed to work with Command objects.
 """
 
-from typing import Callable, Dict, Optional, Tuple
+from typing import Dict, Optional, Tuple
 
 from nemo_skills.pipeline.utils.exp import get_sandbox_command
 from nemo_skills.pipeline.utils.server import get_free_port, get_server_command
 
 
 def vllm_server_command(
+    cluster_config: Dict,
     model: str,
     port: Optional[int] = None,
     server_type: str = "vllm",
@@ -34,12 +35,11 @@ def vllm_server_command(
     args: str = "",
     entrypoint: Optional[str] = None,
     **kwargs,
-) -> Tuple[Callable, Dict]:
+) -> Tuple[str, Dict]:
     """Build vLLM server command.
 
-    Returns a lambda that will build the command when cluster_config is available.
-
     Args:
+        cluster_config: Cluster configuration dictionary
         model: Model path or name
         port: Port to use (if None, will use get_free_port)
         server_type: Type of server (vllm, sglang, trtllm, megatron)
@@ -49,72 +49,65 @@ def vllm_server_command(
         entrypoint: Custom entrypoint script
 
     Returns:
-        Tuple of (lambda, metadata_dict)
+        Tuple of (command_string, metadata_dict)
     """
     if port is None:
         port = get_free_port(strategy="random")
 
+    cmd, num_tasks = get_server_command(
+        server_type=server_type,
+        num_gpus=gpus,
+        num_nodes=nodes,
+        model_path=model,
+        cluster_config=cluster_config,
+        server_port=port,
+        server_args=args,
+        server_entrypoint=entrypoint,
+    )
+
     metadata = {
         "port": port,
         "log_prefix": "server",
+        "num_tasks": num_tasks,
     }
 
-    # Return lambda that will call get_server_command when cluster_config is available
-    def server_cmd_builder(cfg):
-        cmd, num_tasks = get_server_command(
-            server_type=server_type,
-            num_gpus=gpus,
-            num_nodes=nodes,
-            model_path=model,
-            cluster_config=cfg,
-            server_port=port,
-            server_args=args,
-            server_entrypoint=entrypoint,
-        )
-        return (cmd, {"num_tasks": num_tasks})
-
-    return server_cmd_builder, metadata
+    return cmd, metadata
 
 
-def sandbox_command(port: Optional[int] = None, keep_mounts: bool = False, **kwargs) -> Tuple[Callable, Dict]:
+def sandbox_command(cluster_config: Dict, port: Optional[int] = None, **kwargs) -> Tuple[str, Dict]:
     """Build sandbox command.
 
-    Returns a lambda that will build the command when cluster_config is available.
-
     Args:
+        cluster_config: Cluster configuration dictionary
         port: Port to use for sandbox
-        keep_mounts: If True, keep mounts from cluster config. If False, use empty mounts for safety.
 
     Returns:
-        Tuple of (lambda, metadata_dict)
+        Tuple of (command_string, metadata_dict)
     """
     if port is None:
         port = get_free_port(strategy="random")
 
+    cmd = get_sandbox_command(cluster_config)
+
+    # Build PYTHONPATH from cluster config
+    pythonpath_env = {}
+    for env_var in cluster_config.get("env_vars", []):
+        if "PYTHONPATH" in env_var:
+            pythonpath = env_var[11:] if env_var.startswith("PYTHONPATH=") else env_var
+            pythonpath_env["PYTHONPATH"] = pythonpath + ":/app"
+            break
+
     metadata = {
         "port": port,
         "log_prefix": "sandbox",
-        "mounts": None if keep_mounts else [],  # None means use cluster config mounts, [] means no mounts
         "environment": {
             "LISTEN_PORT": str(port),
             "NGINX_PORT": str(port),
+            **pythonpath_env,
         },
     }
 
-    # Return lambda that will be evaluated when cluster_config is available
-    def sandbox_cmd_with_env(cfg):
-        cmd = get_sandbox_command(cfg)
-        # Build PYTHONPATH from cluster config
-        pythonpath_env = {}
-        for env_var in cfg.get("env_vars", []):
-            if "PYTHONPATH" in env_var:
-                pythonpath = env_var[11:] if env_var.startswith("PYTHONPATH=") else env_var
-                pythonpath_env["PYTHONPATH"] = pythonpath + ":/app"
-                break
-        # Return command + additional metadata
-        return (cmd, {"environment": pythonpath_env})
-
-    return sandbox_cmd_with_env, metadata
+    return cmd, metadata
 
 
 def wrap_command(command: str, working_dir: str = "/nemo_run/code", env_vars: Optional[Dict[str, str]] = None) -> str:
diff --git a/nemo_skills/pipeline/utils/declarative.py b/nemo_skills/pipeline/utils/declarative.py
index 7e55ac8d69..9560f418d3 100644
--- a/nemo_skills/pipeline/utils/declarative.py
+++ b/nemo_skills/pipeline/utils/declarative.py
@@ -20,8 +20,15 @@
     from nemo_skills.pipeline.utils.declarative import Command, CommandGroup, HardwareConfig, Pipeline
 
     # Commands that run together in one SLURM job
-    server = Command(command=vllm_server_command(model="Qwen/Qwen3-8B"), gpus=8, name="server")
-    sandbox = Command(command=sandbox_command(), name="sandbox")
+    # Note: Lambdas are needed for cross-component references (hostname_ref, meta_ref)
+    # which aren't resolved until het_group_index is assigned at pipeline execution time.
+    server = Command(
+        command=vllm_server_command(cluster_cfg, model="Qwen/Qwen3-8B"),
+        gpus=8,
+        name="server"
+    )
+    sandbox = Command(command=sandbox_command(cluster_cfg), name="sandbox")
+    # This lambda is ESSENTIAL - server.hostname_ref() and meta_ref() aren't available until runtime
     client = Command(
         command=lambda: f"curl {server.hostname_ref()}:{server.meta_ref('port')}/health",
         name="client"
@@ -58,12 +65,20 @@
     )
 
     # Job 2: Two different model servers (HETEROGENEOUS SLURM job with 2 het components)
-    server_8b = Command(command=vllm_server_command(model="Qwen/Qwen3-8B"), gpus=8, name="server_8b")
-    sandbox_8b = Command(command=sandbox_command(), name="sandbox_8b")
+    server_8b = Command(
+        command=lambda cfg: vllm_server_command(cfg, model="Qwen/Qwen3-8B"),
+        gpus=8,
+        name="server_8b"
+    )
+    sandbox_8b = Command(command=lambda cfg: sandbox_command(cfg), name="sandbox_8b")
     eval_8b = Command(command="python eval.py --model 8b", gpus=1, name="eval_8b")
 
-    server_32b = Command(command=vllm_server_command(model="Qwen/Qwen3-32B"), gpus=8, name="server_32b")
-    sandbox_32b = Command(command=sandbox_command(), name="sandbox_32b")
+    server_32b = Command(
+        command=lambda cfg: vllm_server_command(cfg, model="Qwen/Qwen3-32B"),
+        gpus=8,
+        name="server_32b"
+    )
+    sandbox_32b = Command(command=lambda cfg: sandbox_command(cfg), name="sandbox_32b")
     eval_32b = Command(command="python eval.py --model 32b", gpus=1, name="eval_32b")
 
     group_8b = CommandGroup(commands=[server_8b, sandbox_8b, eval_8b], name="eval_8b", log_dir=log_dir)
@@ -130,11 +145,10 @@ class Command:
     - A tuple (command, metadata): command with metadata like port
     - A callable returning (command, metadata): lazy evaluation with metadata
 
-    Using a lambda allows references to work correctly in heterogeneous jobs:
-        Command(command=lambda: f"curl {server.hostname_ref()}:5000")
-
-    Metadata from command builders (like port) can be referenced:
-        server = Command(command=vllm_server_command(...))
+    Lambdas are needed for cross-component references (hostname_ref, meta_ref).
+    The het_group_index isn't assigned until pipeline execution, so these must be lazy:
+        server = Command(command=lambda cfg: vllm_server_command(cfg, ...))
+        # This lambda is ESSENTIAL - server.hostname_ref() and meta_ref() don't exist yet
         client = Command(command=lambda: f"curl {server.hostname_ref()}:{server.meta_ref('port')}")
     """
 

From fd7e3c66c6f1167beb9458963149fab388cec716 Mon Sep 17 00:00:00 2001
From: George Armstrong <georgea@nvidia.com>
Date: Mon, 6 Oct 2025 15:18:35 -0700
Subject: [PATCH 10/29] MAINT simplify command typing

Signed-off-by: George Armstrong <georgea@nvidia.com>
---
 nemo_skills/pipeline/utils/declarative.py | 23 +++--------------------
 tests/test_declarative_pipeline.py        | 12 ++++++------
 2 files changed, 9 insertions(+), 26 deletions(-)

diff --git a/nemo_skills/pipeline/utils/declarative.py b/nemo_skills/pipeline/utils/declarative.py
index 9560f418d3..9db837bf9d 100644
--- a/nemo_skills/pipeline/utils/declarative.py
+++ b/nemo_skills/pipeline/utils/declarative.py
@@ -142,8 +142,6 @@ class Command:
     The command can be either:
     - A string: evaluated immediately
     - A callable (lambda): evaluated lazily when the task is prepared
-    - A tuple (command, metadata): command with metadata like port
-    - A callable returning (command, metadata): lazy evaluation with metadata
 
     Lambdas are needed for cross-component references (hostname_ref, meta_ref).
     The het_group_index isn't assigned until pipeline execution, so these must be lazy:
@@ -152,14 +150,9 @@ class Command:
         client = Command(command=lambda: f"curl {server.hostname_ref()}:{server.meta_ref('port')}")
     """
 
-    command: Union[
-        str,
-        Callable[[], str],  # Lambda for cross-group refs
-        Callable[[Dict], str],  # Lambda needing cluster_config (e.g., sandbox)
-        Tuple[Optional[str], Dict],  # Command builder result
-        Callable[[], Tuple[Optional[str], Dict]],  # Lambda returning command builder result
-        None,
-    ]
+    # Command can be a string or callable (lambda).
+    # Lambdas are primarily used for cross-component references (hostname_ref, meta_ref).
+    command: Union[str, Callable]
     container: str = "nemo-skills"
     gpus: Optional[int] = None
     nodes: int = 1
@@ -176,20 +169,10 @@ def __post_init__(self):
         if self.name is None:
             self.name = "command"
 
-        # Extract metadata if command is a tuple
-        self._extract_metadata()
-
         # Wrap plain strings with environment setup
         if isinstance(self.command, str) and (self.env_vars or self.working_dir):
             self.command = wrap_command(self.command, self.working_dir, self.env_vars)
 
-    def _extract_metadata(self):
-        """Extract metadata from command if it's a tuple."""
-        if not callable(self.command):
-            if isinstance(self.command, tuple):
-                cmd, self.metadata = self.command
-                self.command = cmd  # Can be None for sandbox
-
     def hostname_ref(self) -> str:
         """Get hostname reference for hetjob cross-component communication."""
         if self.het_group_index is None:
diff --git a/tests/test_declarative_pipeline.py b/tests/test_declarative_pipeline.py
index 6af115675f..cfe36c9f96 100644
--- a/tests/test_declarative_pipeline.py
+++ b/tests/test_declarative_pipeline.py
@@ -33,8 +33,8 @@ def test_command_basic_string(self):
         assert cmd.nodes == 1
 
     def test_command_with_metadata(self):
-        """Test Command with metadata tuple."""
-        cmd = Command(command=("echo hello", {"port": 8080, "log_prefix": "server"}), name="server")
+        """Test Command with metadata passed separately."""
+        cmd = Command(command="echo hello", name="server", metadata={"port": 8080, "log_prefix": "server"})
         assert cmd.metadata["port"] == 8080
         assert cmd.metadata["log_prefix"] == "server"
         # Command gets wrapped with working_dir by default
@@ -43,7 +43,7 @@ def test_command_with_metadata(self):
     def test_command_with_callable(self):
         """Test Command with callable that returns tuple."""
 
-        def make_cmd():
+        def make_cmd(cfg):
             return ("echo world", {"port": 5000})
 
         cmd = Command(command=make_cmd, name="dynamic")
@@ -65,7 +65,7 @@ def test_command_prepare_for_execution_string(self):
     def test_command_prepare_for_execution_callable(self):
         """Test prepare_for_execution with callable command."""
 
-        def make_cmd():
+        def make_cmd(cfg):
             return "echo test"
 
         cmd = Command(command=make_cmd, name="test")
@@ -78,7 +78,7 @@ def make_cmd():
     def test_command_prepare_for_execution_callable_with_metadata(self):
         """Test prepare_for_execution with callable returning tuple."""
 
-        def make_cmd():
+        def make_cmd(cfg):
             return ("echo metadata", {"num_tasks": 4, "environment": {"VAR": "value"}})
 
         cmd = Command(command=make_cmd, name="test")
@@ -105,7 +105,7 @@ def make_cmd(cfg):
 
     def test_command_meta_ref(self):
         """Test meta_ref for accessing metadata."""
-        cmd = Command(command=("echo test", {"port": 8080, "host": "localhost"}), name="server")
+        cmd = Command(command="echo test", name="server", metadata={"port": 8080, "host": "localhost"})
 
         assert cmd.meta_ref("port") == "8080"
         assert cmd.meta_ref("host") == "localhost"

From 3629e096569aa5d036c696be906d9db2b10ad1b7 Mon Sep 17 00:00:00 2001
From: George Armstrong <georgea@nvidia.com>
Date: Mon, 6 Oct 2025 15:21:04 -0700
Subject: [PATCH 11/29] set default name

Signed-off-by: George Armstrong <georgea@nvidia.com>
---
 nemo_skills/pipeline/utils/declarative.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/nemo_skills/pipeline/utils/declarative.py b/nemo_skills/pipeline/utils/declarative.py
index 9db837bf9d..1d3fd5a7c9 100644
--- a/nemo_skills/pipeline/utils/declarative.py
+++ b/nemo_skills/pipeline/utils/declarative.py
@@ -156,7 +156,7 @@ class Command:
     container: str = "nemo-skills"
     gpus: Optional[int] = None
     nodes: int = 1
-    name: Optional[str] = None
+    name: str = "command"
     working_dir: str = "/nemo_run/code"
     env_vars: Dict[str, str] = field(default_factory=dict)
     installation_command: Optional[str] = None
@@ -165,10 +165,6 @@ class Command:
     het_group_index: Optional[int] = None  # Set per-job by Pipeline (not global)
 
     def __post_init__(self):
-        # Initialize defaults
-        if self.name is None:
-            self.name = "command"
-
         # Wrap plain strings with environment setup
         if isinstance(self.command, str) and (self.env_vars or self.working_dir):
             self.command = wrap_command(self.command, self.working_dir, self.env_vars)

From f286644e3bd591f3691487398cc8ebc804b4bfa3 Mon Sep 17 00:00:00 2001
From: George Armstrong <georgea@nvidia.com>
Date: Mon, 6 Oct 2025 15:25:15 -0700
Subject: [PATCH 12/29] remove cluster config lambda resolution

Signed-off-by: George Armstrong <georgea@nvidia.com>
---
 nemo_skills/pipeline/utils/declarative.py | 45 ++++++++---------------
 tests/test_declarative_pipeline.py        | 19 ++--------
 2 files changed, 19 insertions(+), 45 deletions(-)

diff --git a/nemo_skills/pipeline/utils/declarative.py b/nemo_skills/pipeline/utils/declarative.py
index 1d3fd5a7c9..18b25e06c8 100644
--- a/nemo_skills/pipeline/utils/declarative.py
+++ b/nemo_skills/pipeline/utils/declarative.py
@@ -65,20 +65,18 @@
     )
 
     # Job 2: Two different model servers (HETEROGENEOUS SLURM job with 2 het components)
-    server_8b = Command(
-        command=lambda cfg: vllm_server_command(cfg, model="Qwen/Qwen3-8B"),
-        gpus=8,
-        name="server_8b"
-    )
-    sandbox_8b = Command(command=lambda cfg: sandbox_command(cfg), name="sandbox_8b")
+    # Build commands with cluster_config
+    server_8b_cmd, server_8b_meta = vllm_server_command(cluster_config, model="Qwen/Qwen3-8B")
+    sandbox_8b_cmd, sandbox_8b_meta = sandbox_command(cluster_config)
+    server_32b_cmd, server_32b_meta = vllm_server_command(cluster_config, model="Qwen/Qwen3-32B")
+    sandbox_32b_cmd, sandbox_32b_meta = sandbox_command(cluster_config)
+
+    server_8b = Command(command=server_8b_cmd, gpus=8, name="server_8b", metadata=server_8b_meta)
+    sandbox_8b = Command(command=sandbox_8b_cmd, name="sandbox_8b", metadata=sandbox_8b_meta)
     eval_8b = Command(command="python eval.py --model 8b", gpus=1, name="eval_8b")
 
-    server_32b = Command(
-        command=lambda cfg: vllm_server_command(cfg, model="Qwen/Qwen3-32B"),
-        gpus=8,
-        name="server_32b"
-    )
-    sandbox_32b = Command(command=lambda cfg: sandbox_command(cfg), name="sandbox_32b")
+    server_32b = Command(command=server_32b_cmd, gpus=8, name="server_32b", metadata=server_32b_meta)
+    sandbox_32b = Command(command=sandbox_32b_cmd, name="sandbox_32b", metadata=sandbox_32b_meta)
     eval_32b = Command(command="python eval.py --model 32b", gpus=1, name="eval_32b")
 
     group_8b = CommandGroup(commands=[server_8b, sandbox_8b, eval_8b], name="eval_8b", log_dir=log_dir)
@@ -107,7 +105,6 @@
     pipeline.run()
 """
 
-import inspect
 import logging
 import shlex
 from contextlib import nullcontext
@@ -143,10 +140,9 @@ class Command:
     - A string: evaluated immediately
     - A callable (lambda): evaluated lazily when the task is prepared
 
-    Lambdas are needed for cross-component references (hostname_ref, meta_ref).
+    Lambdas are ONLY needed for cross-component references (hostname_ref, meta_ref).
     The het_group_index isn't assigned until pipeline execution, so these must be lazy:
-        server = Command(command=lambda cfg: vllm_server_command(cfg, ...))
-        # This lambda is ESSENTIAL - server.hostname_ref() and meta_ref() don't exist yet
+        # Lambda is ESSENTIAL here - server.hostname_ref() and meta_ref() don't exist yet
         client = Command(command=lambda: f"curl {server.hostname_ref()}:{server.meta_ref('port')}")
     """
 
@@ -189,24 +185,15 @@ def prepare_for_execution(self, cluster_config: Dict) -> Tuple[str, Dict]:
         """Prepare command for execution.
 
         This method:
-        1. Evaluates callables (resolves cross-group references and cluster_config-dependent commands)
-        2. Adds cross-group environment variables
-        3. Wraps with installation_command if provided
+        1. Evaluates callables (resolves cross-component references)
+        2. Wraps with installation_command if provided
 
         Returns:
             Tuple of (final_command, execution_config)
         """
-        # 1. Evaluate if callable (resolves cross-group references or cluster_config needs)
+        # 1. Evaluate if callable (for cross-component references like hostname_ref)
         if callable(self.command):
-            # Check if lambda needs cluster_config (for sandbox)
-
-            sig = inspect.signature(self.command)
-            if len(sig.parameters) > 0:
-                # Lambda expects cluster_config
-                result = self.command(cluster_config)
-            else:
-                # Regular lambda (cross-group refs)
-                result = self.command()
+            result = self.command()
 
             if isinstance(result, tuple):
                 final_command, runtime_metadata = result
diff --git a/tests/test_declarative_pipeline.py b/tests/test_declarative_pipeline.py
index cfe36c9f96..d3e862c9c0 100644
--- a/tests/test_declarative_pipeline.py
+++ b/tests/test_declarative_pipeline.py
@@ -43,7 +43,7 @@ def test_command_with_metadata(self):
     def test_command_with_callable(self):
         """Test Command with callable that returns tuple."""
 
-        def make_cmd(cfg):
+        def make_cmd():
             return ("echo world", {"port": 5000})
 
         cmd = Command(command=make_cmd, name="dynamic")
@@ -65,7 +65,7 @@ def test_command_prepare_for_execution_string(self):
     def test_command_prepare_for_execution_callable(self):
         """Test prepare_for_execution with callable command."""
 
-        def make_cmd(cfg):
+        def make_cmd():
             return "echo test"
 
         cmd = Command(command=make_cmd, name="test")
@@ -78,7 +78,7 @@ def make_cmd(cfg):
     def test_command_prepare_for_execution_callable_with_metadata(self):
         """Test prepare_for_execution with callable returning tuple."""
 
-        def make_cmd(cfg):
+        def make_cmd():
             return ("echo metadata", {"num_tasks": 4, "environment": {"VAR": "value"}})
 
         cmd = Command(command=make_cmd, name="test")
@@ -90,19 +90,6 @@ def make_cmd(cfg):
         assert exec_config["num_tasks"] == 4
         assert exec_config["environment"]["VAR"] == "value"
 
-    def test_command_prepare_for_execution_with_cluster_config(self):
-        """Test prepare_for_execution with callable needing cluster_config."""
-
-        def make_cmd(cfg):
-            return f"echo {cfg['test_param']}"
-
-        cmd = Command(command=make_cmd, name="test")
-        cluster_config = {"executor": "local", "containers": {}, "test_param": "value123"}
-
-        final_cmd, exec_config = cmd.prepare_for_execution(cluster_config)
-
-        assert final_cmd == "echo value123"
-
     def test_command_meta_ref(self):
         """Test meta_ref for accessing metadata."""
         cmd = Command(command="echo test", name="server", metadata={"port": 8080, "host": "localhost"})

From db30066e819977b5dfe74d13863e0cb3688984ab Mon Sep 17 00:00:00 2001
From: George Armstrong <georgea@nvidia.com>
Date: Mon, 6 Oct 2025 15:42:57 -0700
Subject: [PATCH 13/29] ENH simplify cluster config prep

Signed-off-by: George Armstrong <georgea@nvidia.com>
---
 nemo_skills/pipeline/generate.py          |  4 +-
 nemo_skills/pipeline/utils/declarative.py | 55 ++++---------
 tests/test_declarative_pipeline.py        | 96 +++++++++++------------
 3 files changed, 62 insertions(+), 93 deletions(-)

diff --git a/nemo_skills/pipeline/generate.py b/nemo_skills/pipeline/generate.py
index 3d7b0ea15c..a0e07da31c 100644
--- a/nemo_skills/pipeline/generate.py
+++ b/nemo_skills/pipeline/generate.py
@@ -464,7 +464,7 @@ def generate(
     # Create and run pipeline
     pipeline = Pipeline(
         name=expname,
-        cluster=cluster,
+        cluster_config=cluster_config,
         jobs=jobs,
         reuse_code=reuse_code,
         reuse_code_exp=reuse_code_exp,
@@ -472,7 +472,7 @@ def generate(
     )
 
     # Pass _reuse_exp to pipeline.run() to add jobs to existing experiment
-    result = pipeline.run(cluster_config=cluster_config, dry_run=dry_run, _reuse_exp=_reuse_exp)
+    result = pipeline.run(dry_run=dry_run, _reuse_exp=_reuse_exp)
     return result
 
 
diff --git a/nemo_skills/pipeline/utils/declarative.py b/nemo_skills/pipeline/utils/declarative.py
index 18b25e06c8..75ddff922e 100644
--- a/nemo_skills/pipeline/utils/declarative.py
+++ b/nemo_skills/pipeline/utils/declarative.py
@@ -44,7 +44,7 @@
     # Create and run pipeline
     pipeline = Pipeline(
         name="my_inference",
-        cluster="local",
+        cluster_config=cluster_config,
         groups=[inference_group]
     )
     pipeline.run()
@@ -93,7 +93,7 @@
     # Create pipeline with dependency graph
     pipeline = Pipeline(
         name="full_pipeline",
-        cluster="slurm",
+        cluster_config=cluster_config,
         jobs=[
             {"name": "prep", "group": prep_group},
 
@@ -114,7 +114,6 @@
 import nemo_run as run
 
 from nemo_skills.pipeline.utils import (
-    get_cluster_config,
     get_env_variables,
     get_executor,
     get_exp,
@@ -269,7 +268,7 @@ class Pipeline:
     def __init__(
         self,
         name: str,
-        cluster: Optional[str] = None,
+        cluster_config: Dict,
         groups: Optional[List[CommandGroup]] = None,  # Legacy mode
         jobs: Optional[List[Dict]] = None,  # New mode with dependencies
         reuse_code: bool = True,
@@ -279,13 +278,12 @@ def __init__(
         run_after: Optional[Union[str, List[str]]] = None,  # Pipeline-level dependency on other experiments
     ):
         self.name = name
-        self.cluster = cluster
+        self.cluster_config = cluster_config
         self.reuse_code = reuse_code
         self.reuse_code_exp = reuse_code_exp
         self.skip_hf_home_check = skip_hf_home_check
         self.with_ray = with_ray
         self.run_after = run_after
-        self._cluster_config: Optional[Dict] = None
 
         if groups is not None and jobs is not None:
             raise ValueError("Cannot specify both 'groups' and 'jobs'.")
@@ -301,19 +299,8 @@ def __init__(
 
         # Note: het_group_indices are assigned per-job in _plan_and_add_job, not globally
 
-    def _get_cluster_config(self) -> Dict:
-        """Get cluster configuration, loading it if necessary."""
-        if self._cluster_config is None:
-            if self.cluster is None:
-                raise ValueError("Must specify cluster either in Pipeline() or run() method")
-
-            self._cluster_config = get_cluster_config(self.cluster)
-        return self._cluster_config
-
     def run(
         self,
-        cluster_config: Optional[Dict] = None,
-        cluster: Optional[str] = None,
         dry_run: bool = False,
         log_dir: Optional[str] = None,
         _reuse_exp=None,
@@ -321,8 +308,6 @@ def run(
         """Execute the pipeline by calling NeMo-Run directly.
 
         Args:
-            cluster_config: Cluster configuration dict (optional, can use cluster name instead)
-            cluster: Cluster name to load config from (optional if cluster_config provided)
             dry_run: If True, validate without executing
             log_dir: Default log directory for groups that don't specify one (optional)
             _reuse_exp: Internal - reuse existing experiment object (for eval.py integration)
@@ -331,30 +316,22 @@ def run(
             LOG.info("No jobs to execute")
             return None
 
-        # Determine cluster config to use
-        if cluster_config is not None:
-            final_cluster_config = cluster_config
-        elif cluster is not None:
-            final_cluster_config = get_cluster_config(cluster)
-        else:
-            final_cluster_config = self._get_cluster_config()
-
         # Validate HF_HOME environment variable
-        if final_cluster_config["executor"] != "none" and not self.skip_hf_home_check:
-            env_vars = get_env_variables(final_cluster_config)
+        if self.cluster_config["executor"] != "none" and not self.skip_hf_home_check:
+            env_vars = get_env_variables(self.cluster_config)
             if "HF_HOME" not in env_vars:
                 raise RuntimeError(
                     "Invalid cluster_config: HF_HOME is missing from env_vars while skip_hf_home_check=False.\n"
-                    f"Current env_vars: {final_cluster_config.get('env_vars', [])}\n"
+                    f"Current env_vars: {self.cluster_config.get('env_vars', [])}\n"
                     "Please add a new variable: HF_HOME=/mounted/path/to/your/hf_home"
                 )
-            if not is_mounted_filepath(final_cluster_config, env_vars["HF_HOME"]):
+            if not is_mounted_filepath(self.cluster_config, env_vars["HF_HOME"]):
                 raise RuntimeError(f"Invalid cluster_config: HF_HOME={env_vars['HF_HOME']} is not a mounted path.")
 
         # Track job name -> task handle for dependency resolution
         job_name_to_handle = {}
 
-        with get_exp(self.name, final_cluster_config, _reuse_exp) as exp:
+        with get_exp(self.name, self.cluster_config, _reuse_exp) as exp:
             # Process each job in order
             for job_spec in self.jobs:
                 job_name = job_spec.get("name", "unnamed")
@@ -383,7 +360,7 @@ def run(
                         else:
                             # Could be external experiment name OR task handle from _reuse_exp case
                             # Try to get as experiment name first
-                            if final_cluster_config["executor"] == "slurm":
+                            if self.cluster_config["executor"] == "slurm":
                                 exp_handles = get_exp_handles(dep)
                                 if len(exp_handles) == 0:
                                     LOG.warning(
@@ -420,7 +397,7 @@ def run(
                         task_handle = self._add_single_group_job(
                             exp,
                             job_spec["groups"][0],
-                            final_cluster_config,
+                            self.cluster_config,
                             default_log_dir=log_dir,
                             run_after=run_after_deps if run_after_deps else None,
                         )
@@ -429,7 +406,7 @@ def run(
                         task_handle = self._add_multi_group_job(
                             exp,
                             job_spec["groups"],
-                            final_cluster_config,
+                            self.cluster_config,
                             default_log_dir=log_dir,
                             run_after=run_after_deps if run_after_deps else None,
                         )
@@ -438,7 +415,7 @@ def run(
                     task_handle = self._add_single_group_job(
                         exp,
                         job_spec["group"],
-                        final_cluster_config,
+                        self.cluster_config,
                         default_log_dir=log_dir,
                         run_after=run_after_deps if run_after_deps else None,
                     )
@@ -451,11 +428,11 @@ def run(
 
             # Only run if not using existing experiment (matching generate_v0.py line 331)
             if not dry_run and not _reuse_exp:
-                run_exp(exp, final_cluster_config)
+                run_exp(exp, self.cluster_config)
 
                 # Cache experiment for code reuse in future runs
-                if final_cluster_config["executor"] != "none":
-                    tunnel = get_tunnel(final_cluster_config)
+                if self.cluster_config["executor"] != "none":
+                    tunnel = get_tunnel(self.cluster_config)
                     cur_tunnel_hash = tunnel_hash(tunnel)
                     if cur_tunnel_hash not in REUSE_CODE_EXP:
                         REUSE_CODE_EXP[cur_tunnel_hash] = exp
diff --git a/tests/test_declarative_pipeline.py b/tests/test_declarative_pipeline.py
index d3e862c9c0..093d38bf4c 100644
--- a/tests/test_declarative_pipeline.py
+++ b/tests/test_declarative_pipeline.py
@@ -184,8 +184,9 @@ def test_pipeline_with_groups_legacy_mode(self):
         """Test Pipeline with groups parameter (legacy mode)."""
         cmd = Command(command="echo test", name="cmd")
         group = CommandGroup(commands=[cmd], name="group")
+        cluster_config = {"executor": "local", "containers": {}}
 
-        pipeline = Pipeline(name="test_pipeline", cluster="local", groups=[group])
+        pipeline = Pipeline(name="test_pipeline", cluster_config=cluster_config, groups=[group])
 
         assert pipeline.name == "test_pipeline"
         assert pipeline._legacy_mode is True
@@ -204,8 +205,9 @@ def test_pipeline_with_jobs(self):
             {"name": "job1", "group": group1},
             {"name": "job2", "group": group2, "dependencies": ["job1"]},
         ]
+        cluster_config = {"executor": "local", "containers": {}}
 
-        pipeline = Pipeline(name="test_pipeline", cluster="local", jobs=jobs)
+        pipeline = Pipeline(name="test_pipeline", cluster_config=cluster_config, jobs=jobs)
 
         assert pipeline.name == "test_pipeline"
         assert pipeline._legacy_mode is False
@@ -216,21 +218,24 @@ def test_pipeline_cannot_specify_both_groups_and_jobs(self):
         cmd = Command(command="echo test", name="cmd")
         group = CommandGroup(commands=[cmd], name="group")
         jobs = [{"name": "job", "group": group}]
+        cluster_config = {"executor": "local", "containers": {}}
 
         with pytest.raises(ValueError, match="Cannot specify both 'groups' and 'jobs'"):
-            Pipeline(name="test", cluster="local", groups=[group], jobs=jobs)
+            Pipeline(name="test", cluster_config=cluster_config, groups=[group], jobs=jobs)
 
     def test_pipeline_must_specify_either_groups_or_jobs(self):
         """Test that Pipeline raises error when neither groups nor jobs are specified."""
+        cluster_config = {"executor": "local", "containers": {}}
         with pytest.raises(ValueError, match="Must specify either 'groups' or 'jobs'"):
-            Pipeline(name="test", cluster="local")
+            Pipeline(name="test", cluster_config=cluster_config)
 
     def test_pipeline_with_run_after(self):
         """Test Pipeline with run_after parameter."""
         cmd = Command(command="echo test", name="cmd")
         group = CommandGroup(commands=[cmd], name="group")
+        cluster_config = {"executor": "local", "containers": {}}
 
-        pipeline = Pipeline(name="test", cluster="local", groups=[group], run_after="other_exp")
+        pipeline = Pipeline(name="test", cluster_config=cluster_config, groups=[group], run_after="other_exp")
 
         assert pipeline.run_after == "other_exp"
 
@@ -238,35 +243,22 @@ def test_pipeline_with_run_after_list(self):
         """Test Pipeline with run_after as list."""
         cmd = Command(command="echo test", name="cmd")
         group = CommandGroup(commands=[cmd], name="group")
+        cluster_config = {"executor": "local", "containers": {}}
 
-        pipeline = Pipeline(name="test", cluster="local", groups=[group], run_after=["exp1", "exp2"])
+        pipeline = Pipeline(name="test", cluster_config=cluster_config, groups=[group], run_after=["exp1", "exp2"])
 
         assert pipeline.run_after == ["exp1", "exp2"]
 
-    @patch("nemo_skills.pipeline.utils.declarative.get_cluster_config")
-    def test_pipeline_get_cluster_config(self, mock_get_config):
-        """Test _get_cluster_config method."""
-        mock_config = {"executor": "local", "containers": {}}
-        mock_get_config.return_value = mock_config
-
+    def test_pipeline_cluster_config_passed_directly(self):
+        """Test that cluster_config is passed directly (no more string resolution)."""
         cmd = Command(command="echo test", name="cmd")
         group = CommandGroup(commands=[cmd], name="group")
-        pipeline = Pipeline(name="test", cluster="local", groups=[group])
-
-        config = pipeline._get_cluster_config()
+        cluster_config = {"executor": "local", "containers": {}}
 
-        assert config == mock_config
-        mock_get_config.assert_called_once_with("local")
+        pipeline = Pipeline(name="test", cluster_config=cluster_config, groups=[group])
 
-    @patch("nemo_skills.pipeline.utils.declarative.get_cluster_config")
-    def test_pipeline_missing_cluster_config(self, mock_get_config):
-        """Test that Pipeline raises error when cluster is not specified."""
-        cmd = Command(command="echo test", name="cmd")
-        group = CommandGroup(commands=[cmd], name="group")
-        pipeline = Pipeline(name="test", groups=[group])  # No cluster specified
-
-        with pytest.raises(ValueError, match="Must specify cluster"):
-            pipeline._get_cluster_config()
+        # cluster_config is stored as-is
+        assert pipeline.cluster_config == cluster_config
 
 
 class TestPipelineExecution:
@@ -293,10 +285,10 @@ def test_pipeline_run_basic(self, mock_run_exp, mock_env_vars, mock_get_config,
         # Create pipeline
         cmd = Command(command="echo test", name="cmd")
         group = CommandGroup(commands=[cmd], name="group", log_dir="/logs")
-        pipeline = Pipeline(name="test", cluster="local", groups=[group], skip_hf_home_check=True)
+        pipeline = Pipeline(name="test", cluster_config=mock_config, groups=[group], skip_hf_home_check=True)
 
         # Run pipeline
-        result = pipeline.run(cluster_config=mock_config, dry_run=True)
+        result = pipeline.run(dry_run=True)
 
         # Verify
         assert result == mock_exp
@@ -331,10 +323,10 @@ def test_pipeline_run_with_dependencies(self, mock_run_exp, mock_env_vars, mock_
             {"name": "job1", "group": group1},
             {"name": "job2", "group": group2, "dependencies": ["job1"]},
         ]
-        pipeline = Pipeline(name="test", cluster="local", jobs=jobs, skip_hf_home_check=True)
+        pipeline = Pipeline(name="test", cluster_config=mock_config, jobs=jobs, skip_hf_home_check=True)
 
         # Run pipeline
-        pipeline.run(cluster_config=mock_config, dry_run=True)
+        pipeline.run(dry_run=True)
 
         # Verify both jobs were added
         assert mock_exp.add.call_count == 2
@@ -364,10 +356,10 @@ def test_pipeline_hf_home_validation(
 
         cmd = Command(command="echo test", name="cmd")
         group = CommandGroup(commands=[cmd], name="group", log_dir="/logs")
-        pipeline = Pipeline(name="test", cluster="local", groups=[group])
+        pipeline = Pipeline(name="test", cluster_config=mock_config, groups=[group])
 
         # Should not raise
-        pipeline.run(cluster_config=mock_config, dry_run=True)
+        pipeline.run(dry_run=True)
 
         # Verify executor was created
         assert mock_get_executor.called
@@ -382,10 +374,10 @@ def test_pipeline_hf_home_missing(self, mock_env_vars, mock_get_config):
 
         cmd = Command(command="echo test", name="cmd")
         group = CommandGroup(commands=[cmd], name="group", log_dir="/logs")
-        pipeline = Pipeline(name="test", cluster="local", groups=[group])
+        pipeline = Pipeline(name="test", cluster_config=mock_config, groups=[group])
 
         with pytest.raises(RuntimeError, match="HF_HOME is missing"):
-            pipeline.run(cluster_config=mock_config, dry_run=True)
+            pipeline.run(dry_run=True)
 
     @patch("nemo_skills.pipeline.utils.declarative.get_cluster_config")
     @patch("nemo_skills.pipeline.utils.declarative.get_env_variables")
@@ -399,10 +391,10 @@ def test_pipeline_hf_home_not_mounted(self, mock_is_mounted, mock_env_vars, mock
 
         cmd = Command(command="echo test", name="cmd")
         group = CommandGroup(commands=[cmd], name="group", log_dir="/logs")
-        pipeline = Pipeline(name="test", cluster="local", groups=[group])
+        pipeline = Pipeline(name="test", cluster_config=mock_config, groups=[group])
 
         with pytest.raises(RuntimeError, match="is not a mounted path"):
-            pipeline.run(cluster_config=mock_config, dry_run=True)
+            pipeline.run(dry_run=True)
 
 
 class TestHetGroupIndices:
@@ -427,8 +419,8 @@ def test_het_group_index_non_heterogeneous(self, mock_env_vars, mock_get_exp):
         cmd2 = Command(command="echo 2", name="cmd2")
         group = CommandGroup(commands=[cmd1, cmd2], name="group", log_dir="/logs")
 
-        pipeline = Pipeline(name="test", cluster="local", groups=[group], skip_hf_home_check=True)
-        pipeline.run(cluster_config=mock_config, dry_run=True)
+        pipeline = Pipeline(name="test", cluster_config=mock_config, groups=[group], skip_hf_home_check=True)
+        pipeline.run(dry_run=True)
 
         # Both commands should have None het_group_index (localhost communication)
         assert cmd1.het_group_index is None
@@ -458,8 +450,8 @@ def test_het_group_index_heterogeneous(self, mock_env_vars, mock_get_exp):
         group2 = CommandGroup(commands=[cmd2], name="group2", log_dir="/logs")
 
         jobs = [{"name": "hetjob", "groups": [group1, group2]}]
-        pipeline = Pipeline(name="test", cluster="local", jobs=jobs, skip_hf_home_check=True)
-        pipeline.run(cluster_config=mock_config, dry_run=True)
+        pipeline = Pipeline(name="test", cluster_config=mock_config, jobs=jobs, skip_hf_home_check=True)
+        pipeline.run(dry_run=True)
 
         # Commands should have het_group_index 0 and 1
         assert cmd1.het_group_index == 0
@@ -498,8 +490,8 @@ def test_het_group_index_per_job_not_global(self, mock_env_vars, mock_get_exp):
             {"name": "hetjob1", "groups": [group1, group2]},
             {"name": "hetjob2", "groups": [group3, group4]},
         ]
-        pipeline = Pipeline(name="test", cluster="local", jobs=jobs, skip_hf_home_check=True)
-        pipeline.run(cluster_config=mock_config, dry_run=True)
+        pipeline = Pipeline(name="test", cluster_config=mock_config, jobs=jobs, skip_hf_home_check=True)
+        pipeline.run(dry_run=True)
 
         # Both jobs should have het_group_index starting from 0
         assert cmd1.het_group_index == 0
@@ -529,10 +521,10 @@ def test_dependency_none_handling(self, mock_env_vars, mock_get_exp):
         group = CommandGroup(commands=[cmd], name="group", log_dir="/logs")
 
         jobs = [{"name": "job", "group": group, "dependencies": None}]
-        pipeline = Pipeline(name="test", cluster="local", jobs=jobs, skip_hf_home_check=True)
+        pipeline = Pipeline(name="test", cluster_config=mock_config, jobs=jobs, skip_hf_home_check=True)
 
         # Should not raise
-        pipeline.run(cluster_config=mock_config, dry_run=True)
+        pipeline.run(dry_run=True)
 
     @patch("nemo_skills.pipeline.utils.declarative.get_exp")
     @patch("nemo_skills.pipeline.utils.declarative.get_env_variables")
@@ -552,11 +544,11 @@ def test_pipeline_run_after_applies_to_jobs(self, mock_env_vars, mock_get_exp):
         group = CommandGroup(commands=[cmd], name="group", log_dir="/logs")
 
         pipeline = Pipeline(
-            name="test", cluster="local", groups=[group], run_after="other_exp", skip_hf_home_check=True
+            name="test", cluster_config=mock_config, groups=[group], run_after="other_exp", skip_hf_home_check=True
         )
 
         # Should not raise and should apply run_after
-        pipeline.run(cluster_config=mock_config, dry_run=True)
+        pipeline.run(dry_run=True)
 
 
 class TestErrorHandling:
@@ -564,23 +556,23 @@ class TestErrorHandling:
 
     def test_pipeline_job_missing_group_or_groups(self):
         """Test that job spec without group or groups raises error."""
+        mock_config = {"executor": "none", "containers": {}}
         jobs = [{"name": "bad_job"}]  # Missing 'group' or 'groups'
 
         with pytest.raises(ValueError, match="must have either 'group' or 'groups'"):
-            pipeline = Pipeline(name="test", cluster="local", jobs=jobs)
-            mock_config = {"executor": "none", "containers": {}}
-            pipeline.run(cluster_config=mock_config, dry_run=True)
+            pipeline = Pipeline(name="test", cluster_config=mock_config, jobs=jobs)
+            pipeline.run(dry_run=True)
 
     def test_commandgroup_missing_log_dir(self):
         """Test that CommandGroup without log_dir raises error during execution."""
+        mock_config = {"executor": "none", "containers": {}}
         cmd = Command(command="echo test", name="cmd")
         group = CommandGroup(commands=[cmd], name="group")  # No log_dir
 
-        pipeline = Pipeline(name="test", cluster="local", groups=[group])
-        mock_config = {"executor": "none", "containers": {}}
+        pipeline = Pipeline(name="test", cluster_config=mock_config, groups=[group])
 
         with pytest.raises(ValueError, match="must have log_dir set"):
-            pipeline.run(cluster_config=mock_config, dry_run=True)
+            pipeline.run(dry_run=True)
 
 
 if __name__ == "__main__":

From 876913f04f9d25285bb67b6cd703ce49f4d16b3f Mon Sep 17 00:00:00 2001
From: George Armstrong <georgea@nvidia.com>
Date: Mon, 6 Oct 2025 16:01:31 -0700
Subject: [PATCH 14/29] TST fix tests

Signed-off-by: George Armstrong <georgea@nvidia.com>
---
 tests/test_declarative_pipeline.py | 22 +++++-----------------
 1 file changed, 5 insertions(+), 17 deletions(-)

diff --git a/tests/test_declarative_pipeline.py b/tests/test_declarative_pipeline.py
index 093d38bf4c..494a00e8c4 100644
--- a/tests/test_declarative_pipeline.py
+++ b/tests/test_declarative_pipeline.py
@@ -265,17 +265,15 @@ class TestPipelineExecution:
     """Test Pipeline execution and job management."""
 
     @patch("nemo_skills.pipeline.utils.declarative.get_exp")
-    @patch("nemo_skills.pipeline.utils.declarative.get_cluster_config")
     @patch("nemo_skills.pipeline.utils.declarative.get_env_variables")
     @patch("nemo_skills.pipeline.utils.declarative.run_exp")
-    def test_pipeline_run_basic(self, mock_run_exp, mock_env_vars, mock_get_config, mock_get_exp):
+    def test_pipeline_run_basic(self, mock_run_exp, mock_env_vars, mock_get_exp):
         """Test basic pipeline execution."""
         # Setup mocks
         mock_config = {
             "executor": "none",
             "containers": {"nemo-skills": "container:latest"},
         }
-        mock_get_config.return_value = mock_config
         mock_env_vars.return_value = {"HF_HOME": "/hf"}
 
         mock_exp = MagicMock()
@@ -295,17 +293,15 @@ def test_pipeline_run_basic(self, mock_run_exp, mock_env_vars, mock_get_config,
         mock_exp.add.assert_called_once()
 
     @patch("nemo_skills.pipeline.utils.declarative.get_exp")
-    @patch("nemo_skills.pipeline.utils.declarative.get_cluster_config")
     @patch("nemo_skills.pipeline.utils.declarative.get_env_variables")
     @patch("nemo_skills.pipeline.utils.declarative.run_exp")
-    def test_pipeline_run_with_dependencies(self, mock_run_exp, mock_env_vars, mock_get_config, mock_get_exp):
+    def test_pipeline_run_with_dependencies(self, mock_run_exp, mock_env_vars, mock_get_exp):
         """Test pipeline execution with job dependencies."""
         # Setup mocks
         mock_config = {
             "executor": "none",
             "containers": {"nemo-skills": "container:latest"},
         }
-        mock_get_config.return_value = mock_config
         mock_env_vars.return_value = {"HF_HOME": "/hf"}
 
         mock_exp = MagicMock()
@@ -332,20 +328,16 @@ def test_pipeline_run_with_dependencies(self, mock_run_exp, mock_env_vars, mock_
         assert mock_exp.add.call_count == 2
 
     @patch("nemo_skills.pipeline.utils.declarative.get_exp")
-    @patch("nemo_skills.pipeline.utils.declarative.get_cluster_config")
     @patch("nemo_skills.pipeline.utils.declarative.get_env_variables")
     @patch("nemo_skills.pipeline.utils.declarative.is_mounted_filepath")
     @patch("nemo_skills.pipeline.utils.declarative.get_executor")
-    def test_pipeline_hf_home_validation(
-        self, mock_get_executor, mock_is_mounted, mock_env_vars, mock_get_config, mock_get_exp
-    ):
+    def test_pipeline_hf_home_validation(self, mock_get_executor, mock_is_mounted, mock_env_vars, mock_get_exp):
         """Test HF_HOME validation."""
         mock_config = {
             "executor": "slurm",
             "containers": {"nemo-skills": "container:latest"},
             "account": "test_account",
         }
-        mock_get_config.return_value = mock_config
         mock_env_vars.return_value = {"HF_HOME": "/hf"}
         mock_is_mounted.return_value = True
         mock_get_executor.return_value = MagicMock()
@@ -364,12 +356,10 @@ def test_pipeline_hf_home_validation(
         # Verify executor was created
         assert mock_get_executor.called
 
-    @patch("nemo_skills.pipeline.utils.declarative.get_cluster_config")
     @patch("nemo_skills.pipeline.utils.declarative.get_env_variables")
-    def test_pipeline_hf_home_missing(self, mock_env_vars, mock_get_config):
+    def test_pipeline_hf_home_missing(self, mock_env_vars):
         """Test that missing HF_HOME raises error."""
         mock_config = {"executor": "slurm", "containers": {}}
-        mock_get_config.return_value = mock_config
         mock_env_vars.return_value = {}  # No HF_HOME
 
         cmd = Command(command="echo test", name="cmd")
@@ -379,13 +369,11 @@ def test_pipeline_hf_home_missing(self, mock_env_vars, mock_get_config):
         with pytest.raises(RuntimeError, match="HF_HOME is missing"):
             pipeline.run(dry_run=True)
 
-    @patch("nemo_skills.pipeline.utils.declarative.get_cluster_config")
     @patch("nemo_skills.pipeline.utils.declarative.get_env_variables")
     @patch("nemo_skills.pipeline.utils.declarative.is_mounted_filepath")
-    def test_pipeline_hf_home_not_mounted(self, mock_is_mounted, mock_env_vars, mock_get_config):
+    def test_pipeline_hf_home_not_mounted(self, mock_is_mounted, mock_env_vars):
         """Test that non-mounted HF_HOME raises error."""
         mock_config = {"executor": "slurm", "containers": {}}
-        mock_get_config.return_value = mock_config
         mock_env_vars.return_value = {"HF_HOME": "/hf"}
         mock_is_mounted.return_value = False
 

From 55d79702dc61604b97704173cb191b56c6ae8cba Mon Sep 17 00:00:00 2001
From: George Armstrong <georgea@nvidia.com>
Date: Mon, 6 Oct 2025 16:23:28 -0700
Subject: [PATCH 15/29] FIX change legacy wording

Signed-off-by: George Armstrong <georgea@nvidia.com>
---
 nemo_skills/pipeline/utils/declarative.py | 12 +++++-------
 tests/test_declarative_pipeline.py        |  8 +++-----
 2 files changed, 8 insertions(+), 12 deletions(-)

diff --git a/nemo_skills/pipeline/utils/declarative.py b/nemo_skills/pipeline/utils/declarative.py
index 75ddff922e..36cc7fb8a7 100644
--- a/nemo_skills/pipeline/utils/declarative.py
+++ b/nemo_skills/pipeline/utils/declarative.py
@@ -260,17 +260,17 @@ def __init__(
 class Pipeline:
     """Top-level pipeline that composes command groups with dependency support.
 
-    Supports two modes:
-    1. Groups: groups=[cmdgroup1, cmdgroup2] - combines all into one job
-    2. Jobs: jobs=[{...}, {...}] - supports dependencies and multi-group jobs
+    Supports two input formats (both converted to jobs internally):
+    1. Groups: groups=[cmdgroup1, cmdgroup2] - shorthand, each becomes a job
+    2. Jobs: jobs=[{...}, {...}] - full control with dependencies and multi-group jobs
     """
 
     def __init__(
         self,
         name: str,
         cluster_config: Dict,
-        groups: Optional[List[CommandGroup]] = None,  # Legacy mode
-        jobs: Optional[List[Dict]] = None,  # New mode with dependencies
+        groups: Optional[List[CommandGroup]] = None,
+        jobs: Optional[List[Dict]] = None,
         reuse_code: bool = True,
         reuse_code_exp: Optional[str] = None,
         skip_hf_home_check: bool = False,
@@ -290,10 +290,8 @@ def __init__(
 
         if groups is not None:
             self.jobs = [{"group": g} for g in groups]
-            self._legacy_mode = True
         elif jobs is not None:
             self.jobs = jobs
-            self._legacy_mode = False
         else:
             raise ValueError("Must specify either 'groups' or 'jobs'")
 
diff --git a/tests/test_declarative_pipeline.py b/tests/test_declarative_pipeline.py
index 494a00e8c4..2cedd3293b 100644
--- a/tests/test_declarative_pipeline.py
+++ b/tests/test_declarative_pipeline.py
@@ -180,8 +180,8 @@ def test_commandgroup_with_log_dir(self):
 class TestPipeline:
     """Test Pipeline class functionality."""
 
-    def test_pipeline_with_groups_legacy_mode(self):
-        """Test Pipeline with groups parameter (legacy mode)."""
+    def test_pipeline_with_groups(self):
+        """Test Pipeline with groups parameter (shorthand format)."""
         cmd = Command(command="echo test", name="cmd")
         group = CommandGroup(commands=[cmd], name="group")
         cluster_config = {"executor": "local", "containers": {}}
@@ -189,12 +189,11 @@ def test_pipeline_with_groups_legacy_mode(self):
         pipeline = Pipeline(name="test_pipeline", cluster_config=cluster_config, groups=[group])
 
         assert pipeline.name == "test_pipeline"
-        assert pipeline._legacy_mode is True
         assert len(pipeline.jobs) == 1
         assert "group" in pipeline.jobs[0]
 
     def test_pipeline_with_jobs(self):
-        """Test Pipeline with jobs parameter."""
+        """Test Pipeline with jobs parameter (full format with dependencies)."""
         cmd1 = Command(command="echo 1", name="cmd1")
         group1 = CommandGroup(commands=[cmd1], name="group1", log_dir="/logs")
 
@@ -210,7 +209,6 @@ def test_pipeline_with_jobs(self):
         pipeline = Pipeline(name="test_pipeline", cluster_config=cluster_config, jobs=jobs)
 
         assert pipeline.name == "test_pipeline"
-        assert pipeline._legacy_mode is False
         assert len(pipeline.jobs) == 2
 
     def test_pipeline_cannot_specify_both_groups_and_jobs(self):

From d8d6af114372f575cb031d708a7bdb188fa036bc Mon Sep 17 00:00:00 2001
From: George Armstrong <georgea@nvidia.com>
Date: Mon, 6 Oct 2025 16:27:37 -0700
Subject: [PATCH 16/29] components -> commands

Signed-off-by: George Armstrong <georgea@nvidia.com>
---
 nemo_skills/pipeline/utils/declarative.py | 12 ++++++------
 tests/test_declarative_pipeline.py        |  2 +-
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/nemo_skills/pipeline/utils/declarative.py b/nemo_skills/pipeline/utils/declarative.py
index 36cc7fb8a7..c67c662cc7 100644
--- a/nemo_skills/pipeline/utils/declarative.py
+++ b/nemo_skills/pipeline/utils/declarative.py
@@ -251,7 +251,7 @@ def __init__(
         name: Optional[str] = None,
         log_dir: Optional[str] = None,
     ):
-        self.components = commands
+        self.commands = commands
         self.hardware = hardware or HardwareConfig()
         self.name = name
         self.log_dir = log_dir
@@ -533,7 +533,7 @@ def _plan_and_add_job(
         shared_env_vars: Dict[str, str] = {}
         if heterogeneous:
             for het_idx, group in enumerate(groups):
-                for command in group.components:
+                for command in group.commands:
                     _, exec_config_probe = command.prepare_for_execution(cluster_config)
                     shared_env_vars.update(exec_config_probe.get("environment", {}))
 
@@ -542,9 +542,9 @@ def _plan_and_add_job(
 
         # Build commands and executors
         for het_idx, group in enumerate(groups):
-            has_multiple_components = len(group.components) > 1
+            has_multiple_components = len(group.commands) > 1
             total_het_groups = (
-                len(groups) if heterogeneous else (len(group.components) if has_multiple_components else 1)
+                len(groups) if heterogeneous else (len(group.commands) if has_multiple_components else 1)
             )
 
             # For single-group jobs with multiple components, allow job-level GPU override for sbatch allocation
@@ -552,7 +552,7 @@ def _plan_and_add_job(
                 group.hardware.num_gpus if (not heterogeneous and has_multiple_components and group.hardware) else None
             )
 
-            for comp_idx, command in enumerate(group.components):
+            for comp_idx, command in enumerate(group.commands):
                 # Assign het_group_index ONLY for heterogeneous jobs (per-job, not global)
                 # Non-heterogeneous jobs use localhost, so het_group_index should remain None
                 if heterogeneous:
@@ -584,7 +584,7 @@ def _plan_and_add_job(
                     heterogeneous,
                     het_idx if heterogeneous else comp_idx,
                     total_het_groups,
-                    (len(group.components) > 1),
+                    (len(group.commands) > 1),
                 )
 
                 # Share packager across executors for single-group jobs
diff --git a/tests/test_declarative_pipeline.py b/tests/test_declarative_pipeline.py
index 2cedd3293b..56623d3300 100644
--- a/tests/test_declarative_pipeline.py
+++ b/tests/test_declarative_pipeline.py
@@ -155,7 +155,7 @@ def test_commandgroup_basic(self):
         group = CommandGroup(commands=[cmd1, cmd2], name="test_group")
 
         assert group.name == "test_group"
-        assert len(group.components) == 2
+        assert len(group.commands) == 2
         assert group.hardware is not None
 
     def test_commandgroup_with_hardware(self):

From b8275c93a880698476ec104878dc6de501d7f7b0 Mon Sep 17 00:00:00 2001
From: George Armstrong <georgea@nvidia.com>
Date: Mon, 6 Oct 2025 17:15:50 -0700
Subject: [PATCH 17/29] ENH fix name dependencies

Signed-off-by: George Armstrong <georgea@nvidia.com>
---
 nemo_skills/pipeline/generate.py          | 21 +++---
 nemo_skills/pipeline/utils/declarative.py | 91 ++++++++++++++---------
 tests/test_declarative_pipeline.py        | 18 ++---
 3 files changed, 73 insertions(+), 57 deletions(-)

diff --git a/nemo_skills/pipeline/generate.py b/nemo_skills/pipeline/generate.py
index a0e07da31c..4d07c2f889 100644
--- a/nemo_skills/pipeline/generate.py
+++ b/nemo_skills/pipeline/generate.py
@@ -412,6 +412,7 @@ def generate(
 
             # Handle dependent_jobs chain
             dependencies = _task_dependencies.copy() if _task_dependencies else []
+            prev_job = None
 
             for dep_idx in range(dependent_jobs + 1):
                 # Create CommandGroup for this task
@@ -444,16 +445,16 @@ def generate(
                         job_deps.extend(run_after_list)
                     job_deps = job_deps if job_deps else None
                 else:
-                    # Subsequent jobs in chain depend on previous job
-                    job_deps = [f"{task_name}-dep{dep_idx - 1}"]
-
-                jobs.append(
-                    {
-                        "name": internal_job_name,
-                        "group": cmd_group,
-                        "dependencies": job_deps,
-                    }
-                )
+                    # Subsequent jobs in chain depend on previous job (use job object, not string)
+                    job_deps = [prev_job]
+
+                job_spec = {
+                    "name": internal_job_name,
+                    "group": cmd_group,
+                    "dependencies": job_deps,
+                }
+                jobs.append(job_spec)
+                prev_job = job_spec  # Track for next iteration
 
                 all_job_names.append(internal_job_name)
 
diff --git a/nemo_skills/pipeline/utils/declarative.py b/nemo_skills/pipeline/utils/declarative.py
index c67c662cc7..4108fab6fe 100644
--- a/nemo_skills/pipeline/utils/declarative.py
+++ b/nemo_skills/pipeline/utils/declarative.py
@@ -63,6 +63,7 @@
         name="prep",
         log_dir=log_dir
     )
+    prep_job = {"name": "prep", "group": prep_group}
 
     # Job 2: Two different model servers (HETEROGENEOUS SLURM job with 2 het components)
     # Build commands with cluster_config
@@ -82,6 +83,8 @@
     group_8b = CommandGroup(commands=[server_8b, sandbox_8b, eval_8b], name="eval_8b", log_dir=log_dir)
     group_32b = CommandGroup(commands=[server_32b, sandbox_32b, eval_32b], name="eval_32b", log_dir=log_dir)
 
+    evals_job = {"name": "evals", "groups": [group_8b, group_32b], "dependencies": [prep_job]}
+
     # Job 3: Report generation (depends on both evaluations)
     report = Command(
         command="python generate_report.py --output report.txt",
@@ -95,13 +98,12 @@
         name="full_pipeline",
         cluster_config=cluster_config,
         jobs=[
-            {"name": "prep", "group": prep_group},
-
-            # Multi-group heterogeneous job (both eval groups in ONE SLURM job with 2 het components)
-            {"name": "evals", "groups": [group_8b, group_32b], "dependencies": ["prep"]},
-
-            # Report depends on the multi-group eval job
-            {"name": "report", "group": report_group, "dependencies": ["evals"]},
+            prep_job,
+            evals_job,
+            # Report depends on the eval job (internal) and some external experiment (string)
+            {"name": "report", "group": report_group, "dependencies": [evals_job, "external_training_exp"]},
+        ]
+    )
     pipeline.run()
 """
 
@@ -263,6 +265,10 @@ class Pipeline:
     Supports two input formats (both converted to jobs internally):
     1. Groups: groups=[cmdgroup1, cmdgroup2] - shorthand, each becomes a job
     2. Jobs: jobs=[{...}, {...}] - full control with dependencies and multi-group jobs
+
+    Dependency types:
+    - Job dict objects: Internal dependencies on jobs in the same pipeline
+    - Strings: External dependencies on other experiments
     """
 
     def __init__(
@@ -289,7 +295,8 @@ def __init__(
             raise ValueError("Cannot specify both 'groups' and 'jobs'.")
 
         if groups is not None:
-            self.jobs = [{"group": g} for g in groups]
+            # Auto-generate job names from group names
+            self.jobs = [{"name": g.name or f"job_{idx}", "group": g} for idx, g in enumerate(groups)]
         elif jobs is not None:
             self.jobs = jobs
         else:
@@ -332,7 +339,9 @@ def run(
         with get_exp(self.name, self.cluster_config, _reuse_exp) as exp:
             # Process each job in order
             for job_spec in self.jobs:
-                job_name = job_spec.get("name", "unnamed")
+                job_name = job_spec.get("name")
+                if not job_name:
+                    raise ValueError(f"Job spec must have a 'name' field: {job_spec}")
 
                 # Resolve dependencies to task handles
                 run_after_deps = []
@@ -350,37 +359,45 @@ def run(
 
                 for dep in job_dependencies:
                     if isinstance(dep, str):
-                        # String dependency - could be job name, task handle, or experiment name
-                        if dep in job_name_to_handle:
-                            # Internal pipeline dependency - add the handle
-                            run_after_deps.append(job_name_to_handle[dep])
-                            LOG.info(f"Job '{job_name}' depends on '{dep}' (handle: {job_name_to_handle[dep]})")
-                        else:
-                            # Could be external experiment name OR task handle from _reuse_exp case
-                            # Try to get as experiment name first
-                            if self.cluster_config["executor"] == "slurm":
-                                exp_handles = get_exp_handles(dep)
-                                if len(exp_handles) == 0:
-                                    LOG.warning(
-                                        f"No pending or running tasks found for experiment {dep}, cannot set dependencies."
-                                    )
-                                    # If no experiment found, treat as direct task handle (for _reuse_exp case)
-                                    if _reuse_exp:
-                                        run_after_deps.append(dep)
-                                        LOG.info(
-                                            f"Job '{job_name}' depends on task handle '{dep}' (from reused experiment)"
-                                        )
-                                else:
-                                    run_after_deps.extend(exp_handles)
+                        # String dependency = external experiment name
+                        if self.cluster_config["executor"] == "slurm":
+                            exp_handles = get_exp_handles(dep)
+                            if len(exp_handles) == 0:
+                                LOG.warning(
+                                    f"No pending or running tasks found for experiment {dep}, cannot set dependencies."
+                                )
+                                # If no experiment found, treat as direct task handle (for _reuse_exp case)
+                                if _reuse_exp:
+                                    run_after_deps.append(dep)
                                     LOG.info(
-                                        f"Job '{job_name}' depends on external experiment '{dep}' ({len(exp_handles)} tasks)"
+                                        f"Job '{job_name}' depends on task handle '{dep}' (from reused experiment)"
                                     )
-                            elif _reuse_exp:
-                                # For non-SLURM executors with _reuse_exp, treat as task handle
-                                run_after_deps.append(dep)
-                                LOG.info(f"Job '{job_name}' depends on task handle '{dep}'")
+                            else:
+                                run_after_deps.extend(exp_handles)
+                                LOG.info(
+                                    f"Job '{job_name}' depends on external experiment '{dep}' ({len(exp_handles)} tasks)"
+                                )
+                        elif _reuse_exp:
+                            # For non-SLURM executors with _reuse_exp, treat as task handle
+                            run_after_deps.append(dep)
+                            LOG.info(f"Job '{job_name}' depends on task handle '{dep}'")
+                    elif isinstance(dep, dict):
+                        # Dict dependency = internal job reference (by job spec object)
+                        dep_name = dep.get("name")
+                        if not dep_name:
+                            raise ValueError(f"Job dependency must have a 'name' field: {dep}")
+                        if dep_name in job_name_to_handle:
+                            run_after_deps.append(job_name_to_handle[dep_name])
+                            LOG.info(
+                                f"Job '{job_name}' depends on internal job '{dep_name}' (handle: {job_name_to_handle[dep_name]})"
+                            )
+                        else:
+                            raise ValueError(
+                                f"Job '{job_name}' depends on job '{dep_name}' which hasn't been processed yet. "
+                                f"Make sure dependencies are listed before the jobs that depend on them in the jobs list."
+                            )
                     else:
-                        # Direct task handle object (not string)
+                        # Direct task handle object (not string or dict)
                         run_after_deps.append(dep)
                         LOG.info(f"Job '{job_name}' depends on task handle (object)")
 
diff --git a/tests/test_declarative_pipeline.py b/tests/test_declarative_pipeline.py
index 56623d3300..9a001770a6 100644
--- a/tests/test_declarative_pipeline.py
+++ b/tests/test_declarative_pipeline.py
@@ -200,13 +200,12 @@ def test_pipeline_with_jobs(self):
         cmd2 = Command(command="echo 2", name="cmd2")
         group2 = CommandGroup(commands=[cmd2], name="group2", log_dir="/logs")
 
-        jobs = [
-            {"name": "job1", "group": group1},
-            {"name": "job2", "group": group2, "dependencies": ["job1"]},
-        ]
+        job1 = {"name": "job1", "group": group1}
+        job2 = {"name": "job2", "group": group2, "dependencies": [job1]}
+
         cluster_config = {"executor": "local", "containers": {}}
 
-        pipeline = Pipeline(name="test_pipeline", cluster_config=cluster_config, jobs=jobs)
+        pipeline = Pipeline(name="test_pipeline", cluster_config=cluster_config, jobs=[job1, job2])
 
         assert pipeline.name == "test_pipeline"
         assert len(pipeline.jobs) == 2
@@ -313,11 +312,10 @@ def test_pipeline_run_with_dependencies(self, mock_run_exp, mock_env_vars, mock_
         cmd2 = Command(command="echo 2", name="cmd2")
         group2 = CommandGroup(commands=[cmd2], name="group2", log_dir="/logs")
 
-        jobs = [
-            {"name": "job1", "group": group1},
-            {"name": "job2", "group": group2, "dependencies": ["job1"]},
-        ]
-        pipeline = Pipeline(name="test", cluster_config=mock_config, jobs=jobs, skip_hf_home_check=True)
+        job1 = {"name": "job1", "group": group1, "dependencies": []}
+        job2 = {"name": "job2", "group": group2, "dependencies": [job1]}
+
+        pipeline = Pipeline(name="test", cluster_config=mock_config, jobs=[job1, job2], skip_hf_home_check=True)
 
         # Run pipeline
         pipeline.run(dry_run=True)

From 78e1a9ca439cd1970a05988b4117c7100ce6175a Mon Sep 17 00:00:00 2001
From: George Armstrong <georgea@nvidia.com>
Date: Mon, 6 Oct 2025 17:27:24 -0700
Subject: [PATCH 18/29] MAINT remove groups syntax

Signed-off-by: George Armstrong <georgea@nvidia.com>
---
 nemo_skills/pipeline/utils/declarative.py | 71 +++++++++----------
 tests/test_declarative_pipeline.py        | 84 +++++++++++++++--------
 2 files changed, 90 insertions(+), 65 deletions(-)

diff --git a/nemo_skills/pipeline/utils/declarative.py b/nemo_skills/pipeline/utils/declarative.py
index 4108fab6fe..64d942c70c 100644
--- a/nemo_skills/pipeline/utils/declarative.py
+++ b/nemo_skills/pipeline/utils/declarative.py
@@ -45,7 +45,7 @@
     pipeline = Pipeline(
         name="my_inference",
         cluster_config=cluster_config,
-        groups=[inference_group]
+        jobs=[{"name": "inference", "group": inference_group}]
     )
     pipeline.run()
 
@@ -262,9 +262,7 @@ def __init__(
 class Pipeline:
     """Top-level pipeline that composes command groups with dependency support.
 
-    Supports two input formats (both converted to jobs internally):
-    1. Groups: groups=[cmdgroup1, cmdgroup2] - shorthand, each becomes a job
-    2. Jobs: jobs=[{...}, {...}] - full control with dependencies and multi-group jobs
+    Jobs format: jobs=[{...}, {...}] - list of job dicts with dependencies and groups
 
     Dependency types:
     - Job dict objects: Internal dependencies on jobs in the same pipeline
@@ -275,8 +273,7 @@ def __init__(
         self,
         name: str,
         cluster_config: Dict,
-        groups: Optional[List[CommandGroup]] = None,
-        jobs: Optional[List[Dict]] = None,
+        jobs: List[Dict],
         reuse_code: bool = True,
         reuse_code_exp: Optional[str] = None,
         skip_hf_home_check: bool = False,
@@ -290,38 +287,31 @@ def __init__(
         self.skip_hf_home_check = skip_hf_home_check
         self.with_ray = with_ray
         self.run_after = run_after
+        self.jobs = jobs
 
-        if groups is not None and jobs is not None:
-            raise ValueError("Cannot specify both 'groups' and 'jobs'.")
-
-        if groups is not None:
-            # Auto-generate job names from group names
-            self.jobs = [{"name": g.name or f"job_{idx}", "group": g} for idx, g in enumerate(groups)]
-        elif jobs is not None:
-            self.jobs = jobs
-        else:
-            raise ValueError("Must specify either 'groups' or 'jobs'")
+        # Validate configuration early
+        self._validate()
 
         # Note: het_group_indices are assigned per-job in _plan_and_add_job, not globally
 
-    def run(
-        self,
-        dry_run: bool = False,
-        log_dir: Optional[str] = None,
-        _reuse_exp=None,
-    ):
-        """Execute the pipeline by calling NeMo-Run directly.
-
-        Args:
-            dry_run: If True, validate without executing
-            log_dir: Default log directory for groups that don't specify one (optional)
-            _reuse_exp: Internal - reuse existing experiment object (for eval.py integration)
-        """
+    def _validate(self):
+        """Validate pipeline configuration early in __init__."""
+        # Validate jobs
         if not self.jobs:
-            LOG.info("No jobs to execute")
-            return None
+            raise ValueError("Pipeline requires at least one job")
 
-        # Validate HF_HOME environment variable
+        for idx, job_spec in enumerate(self.jobs):
+            job_name = job_spec.get("name")
+            if not job_name:
+                raise ValueError(f"Job at index {idx} must have a 'name' field: {job_spec}")
+
+        # Validate cluster_config has required fields
+        if "executor" not in self.cluster_config:
+            raise ValueError("cluster_config must have 'executor' field")
+        if "containers" not in self.cluster_config:
+            raise ValueError("cluster_config must have 'containers' field")
+
+        # Validate HF_HOME if needed
         if self.cluster_config["executor"] != "none" and not self.skip_hf_home_check:
             env_vars = get_env_variables(self.cluster_config)
             if "HF_HOME" not in env_vars:
@@ -333,15 +323,26 @@ def run(
             if not is_mounted_filepath(self.cluster_config, env_vars["HF_HOME"]):
                 raise RuntimeError(f"Invalid cluster_config: HF_HOME={env_vars['HF_HOME']} is not a mounted path.")
 
+    def run(
+        self,
+        dry_run: bool = False,
+        log_dir: Optional[str] = None,
+        _reuse_exp=None,
+    ):
+        """Execute the pipeline by calling NeMo-Run directly.
+
+        Args:
+            dry_run: If True, validate without executing
+            log_dir: Default log directory for groups that don't specify one (optional)
+            _reuse_exp: Internal - reuse existing experiment object (for eval.py integration)
+        """
         # Track job name -> task handle for dependency resolution
         job_name_to_handle = {}
 
         with get_exp(self.name, self.cluster_config, _reuse_exp) as exp:
             # Process each job in order
             for job_spec in self.jobs:
-                job_name = job_spec.get("name")
-                if not job_name:
-                    raise ValueError(f"Job spec must have a 'name' field: {job_spec}")
+                job_name = job_spec["name"]  # Already validated in _validate()
 
                 # Resolve dependencies to task handles
                 run_after_deps = []
diff --git a/tests/test_declarative_pipeline.py b/tests/test_declarative_pipeline.py
index 9a001770a6..b13b6f1ed1 100644
--- a/tests/test_declarative_pipeline.py
+++ b/tests/test_declarative_pipeline.py
@@ -180,13 +180,18 @@ def test_commandgroup_with_log_dir(self):
 class TestPipeline:
     """Test Pipeline class functionality."""
 
-    def test_pipeline_with_groups(self):
-        """Test Pipeline with groups parameter (shorthand format)."""
+    def test_pipeline_with_single_job(self):
+        """Test Pipeline with single job."""
         cmd = Command(command="echo test", name="cmd")
         group = CommandGroup(commands=[cmd], name="group")
         cluster_config = {"executor": "local", "containers": {}}
 
-        pipeline = Pipeline(name="test_pipeline", cluster_config=cluster_config, groups=[group])
+        pipeline = Pipeline(
+            name="test_pipeline",
+            cluster_config=cluster_config,
+            jobs=[{"name": "job1", "group": group}],
+            skip_hf_home_check=True,
+        )
 
         assert pipeline.name == "test_pipeline"
         assert len(pipeline.jobs) == 1
@@ -205,25 +210,19 @@ def test_pipeline_with_jobs(self):
 
         cluster_config = {"executor": "local", "containers": {}}
 
-        pipeline = Pipeline(name="test_pipeline", cluster_config=cluster_config, jobs=[job1, job2])
+        pipeline = Pipeline(
+            name="test_pipeline", cluster_config=cluster_config, jobs=[job1, job2], skip_hf_home_check=True
+        )
 
         assert pipeline.name == "test_pipeline"
         assert len(pipeline.jobs) == 2
 
-    def test_pipeline_cannot_specify_both_groups_and_jobs(self):
-        """Test that Pipeline raises error when both groups and jobs are specified."""
-        cmd = Command(command="echo test", name="cmd")
-        group = CommandGroup(commands=[cmd], name="group")
-        jobs = [{"name": "job", "group": group}]
+    def test_pipeline_requires_jobs(self):
+        """Test that Pipeline requires jobs parameter."""
         cluster_config = {"executor": "local", "containers": {}}
 
-        with pytest.raises(ValueError, match="Cannot specify both 'groups' and 'jobs'"):
-            Pipeline(name="test", cluster_config=cluster_config, groups=[group], jobs=jobs)
-
-    def test_pipeline_must_specify_either_groups_or_jobs(self):
-        """Test that Pipeline raises error when neither groups nor jobs are specified."""
-        cluster_config = {"executor": "local", "containers": {}}
-        with pytest.raises(ValueError, match="Must specify either 'groups' or 'jobs'"):
+        # Missing jobs parameter should fail
+        with pytest.raises(TypeError):
             Pipeline(name="test", cluster_config=cluster_config)
 
     def test_pipeline_with_run_after(self):
@@ -232,7 +231,13 @@ def test_pipeline_with_run_after(self):
         group = CommandGroup(commands=[cmd], name="group")
         cluster_config = {"executor": "local", "containers": {}}
 
-        pipeline = Pipeline(name="test", cluster_config=cluster_config, groups=[group], run_after="other_exp")
+        pipeline = Pipeline(
+            name="test",
+            cluster_config=cluster_config,
+            jobs=[{"name": "job1", "group": group}],
+            run_after="other_exp",
+            skip_hf_home_check=True,
+        )
 
         assert pipeline.run_after == "other_exp"
 
@@ -242,7 +247,13 @@ def test_pipeline_with_run_after_list(self):
         group = CommandGroup(commands=[cmd], name="group")
         cluster_config = {"executor": "local", "containers": {}}
 
-        pipeline = Pipeline(name="test", cluster_config=cluster_config, groups=[group], run_after=["exp1", "exp2"])
+        pipeline = Pipeline(
+            name="test",
+            cluster_config=cluster_config,
+            jobs=[{"name": "job1", "group": group}],
+            run_after=["exp1", "exp2"],
+            skip_hf_home_check=True,
+        )
 
         assert pipeline.run_after == ["exp1", "exp2"]
 
@@ -252,7 +263,12 @@ def test_pipeline_cluster_config_passed_directly(self):
         group = CommandGroup(commands=[cmd], name="group")
         cluster_config = {"executor": "local", "containers": {}}
 
-        pipeline = Pipeline(name="test", cluster_config=cluster_config, groups=[group])
+        pipeline = Pipeline(
+            name="test",
+            cluster_config=cluster_config,
+            jobs=[{"name": "job1", "group": group}],
+            skip_hf_home_check=True,
+        )
 
         # cluster_config is stored as-is
         assert pipeline.cluster_config == cluster_config
@@ -280,7 +296,9 @@ def test_pipeline_run_basic(self, mock_run_exp, mock_env_vars, mock_get_exp):
         # Create pipeline
         cmd = Command(command="echo test", name="cmd")
         group = CommandGroup(commands=[cmd], name="group", log_dir="/logs")
-        pipeline = Pipeline(name="test", cluster_config=mock_config, groups=[group], skip_hf_home_check=True)
+        pipeline = Pipeline(
+            name="test", cluster_config=mock_config, jobs=[{"name": "job1", "group": group}], skip_hf_home_check=True
+        )
 
         # Run pipeline
         result = pipeline.run(dry_run=True)
@@ -344,7 +362,7 @@ def test_pipeline_hf_home_validation(self, mock_get_executor, mock_is_mounted, m
 
         cmd = Command(command="echo test", name="cmd")
         group = CommandGroup(commands=[cmd], name="group", log_dir="/logs")
-        pipeline = Pipeline(name="test", cluster_config=mock_config, groups=[group])
+        pipeline = Pipeline(name="test", cluster_config=mock_config, jobs=[{"name": "job1", "group": group}])
 
         # Should not raise
         pipeline.run(dry_run=True)
@@ -354,31 +372,31 @@ def test_pipeline_hf_home_validation(self, mock_get_executor, mock_is_mounted, m
 
     @patch("nemo_skills.pipeline.utils.declarative.get_env_variables")
     def test_pipeline_hf_home_missing(self, mock_env_vars):
-        """Test that missing HF_HOME raises error."""
+        """Test that missing HF_HOME raises error in __init__."""
         mock_config = {"executor": "slurm", "containers": {}}
         mock_env_vars.return_value = {}  # No HF_HOME
 
         cmd = Command(command="echo test", name="cmd")
         group = CommandGroup(commands=[cmd], name="group", log_dir="/logs")
-        pipeline = Pipeline(name="test", cluster_config=mock_config, groups=[group])
 
+        # Should raise in __init__ now, not run()
         with pytest.raises(RuntimeError, match="HF_HOME is missing"):
-            pipeline.run(dry_run=True)
+            Pipeline(name="test", cluster_config=mock_config, jobs=[{"name": "job1", "group": group}])
 
     @patch("nemo_skills.pipeline.utils.declarative.get_env_variables")
     @patch("nemo_skills.pipeline.utils.declarative.is_mounted_filepath")
     def test_pipeline_hf_home_not_mounted(self, mock_is_mounted, mock_env_vars):
-        """Test that non-mounted HF_HOME raises error."""
+        """Test that non-mounted HF_HOME raises error in __init__."""
         mock_config = {"executor": "slurm", "containers": {}}
         mock_env_vars.return_value = {"HF_HOME": "/hf"}
         mock_is_mounted.return_value = False
 
         cmd = Command(command="echo test", name="cmd")
         group = CommandGroup(commands=[cmd], name="group", log_dir="/logs")
-        pipeline = Pipeline(name="test", cluster_config=mock_config, groups=[group])
 
+        # Should raise in __init__ now, not run()
         with pytest.raises(RuntimeError, match="is not a mounted path"):
-            pipeline.run(dry_run=True)
+            Pipeline(name="test", cluster_config=mock_config, jobs=[{"name": "job1", "group": group}])
 
 
 class TestHetGroupIndices:
@@ -403,7 +421,9 @@ def test_het_group_index_non_heterogeneous(self, mock_env_vars, mock_get_exp):
         cmd2 = Command(command="echo 2", name="cmd2")
         group = CommandGroup(commands=[cmd1, cmd2], name="group", log_dir="/logs")
 
-        pipeline = Pipeline(name="test", cluster_config=mock_config, groups=[group], skip_hf_home_check=True)
+        pipeline = Pipeline(
+            name="test", cluster_config=mock_config, jobs=[{"name": "job1", "group": group}], skip_hf_home_check=True
+        )
         pipeline.run(dry_run=True)
 
         # Both commands should have None het_group_index (localhost communication)
@@ -528,7 +548,11 @@ def test_pipeline_run_after_applies_to_jobs(self, mock_env_vars, mock_get_exp):
         group = CommandGroup(commands=[cmd], name="group", log_dir="/logs")
 
         pipeline = Pipeline(
-            name="test", cluster_config=mock_config, groups=[group], run_after="other_exp", skip_hf_home_check=True
+            name="test",
+            cluster_config=mock_config,
+            jobs=[{"name": "job1", "group": group}],
+            run_after="other_exp",
+            skip_hf_home_check=True,
         )
 
         # Should not raise and should apply run_after
@@ -553,7 +577,7 @@ def test_commandgroup_missing_log_dir(self):
         cmd = Command(command="echo test", name="cmd")
         group = CommandGroup(commands=[cmd], name="group")  # No log_dir
 
-        pipeline = Pipeline(name="test", cluster_config=mock_config, groups=[group])
+        pipeline = Pipeline(name="test", cluster_config=mock_config, jobs=[{"name": "job1", "group": group}])
 
         with pytest.raises(ValueError, match="must have log_dir set"):
             pipeline.run(dry_run=True)

From bc59c722976d5674e52a9ed3116da5399e7d8ddf Mon Sep 17 00:00:00 2001
From: George Armstrong <georgea@nvidia.com>
Date: Wed, 8 Oct 2025 17:12:20 -0700
Subject: [PATCH 19/29] TST harden mcp client tests

Signed-off-by: George Armstrong <georgea@nvidia.com>
---
 tests/test_mcp_clients.py | 46 +++++++++++++++++++++++++--------------
 1 file changed, 30 insertions(+), 16 deletions(-)

diff --git a/tests/test_mcp_clients.py b/tests/test_mcp_clients.py
index 879dd60d1c..bd31bc4524 100644
--- a/tests/test_mcp_clients.py
+++ b/tests/test_mcp_clients.py
@@ -205,6 +205,32 @@ async def execute(self, tool_name: str, arguments: dict, extra_args: dict | None
         return {"unknown": tool_name, "args": arguments}
 
 
+# Helper class for test_tool_manager_cache_and_duplicate_detection
+# Defined at module level so it can be imported via locate()
+class CountingTool(DummyTool):
+    # Class variable to track calls across instances
+    _call_count = 0
+
+    async def list_tools(self):
+        CountingTool._call_count += 1
+        return await super().list_tools()
+
+    @classmethod
+    def reset_count(cls):
+        cls._call_count = 0
+
+    @classmethod
+    def get_count(cls):
+        return cls._call_count
+
+
+# Helper class for duplicate tool detection test
+class DupTool(DummyTool):
+    async def list_tools(self):
+        lst = await super().list_tools()
+        return [lst[0], lst[0]]  # duplicate names within same tool
+
+
 @pytest.mark.asyncio
 async def test_tool_manager_list_and_execute_with_class_locator():
     # Register this test module's DummyTool via module locator
@@ -219,30 +245,18 @@ async def test_tool_manager_list_and_execute_with_class_locator():
 
 @pytest.mark.asyncio
 async def test_tool_manager_cache_and_duplicate_detection():
-    calls = {"n": 0}
-
-    class CountingTool(DummyTool):
-        async def list_tools(self):
-            calls["n"] += 1
-            return await super().list_tools()
+    # Reset counter before test
+    CountingTool.reset_count()
 
-    # Expose CountingTool from this module for locate
-    globals()["CountingTool"] = CountingTool
     tm = ToolManager(module_specs=["tests.test_mcp_clients::CountingTool"], overrides={}, context={})
     _ = await tm.list_all_tools(use_cache=True)
     _ = await tm.list_all_tools(use_cache=True)
-    assert calls["n"] == 1
+    assert CountingTool.get_count() == 1
     with pytest.raises(ValueError) as excinfo:
         _ = await tm.list_all_tools(use_cache=False)
     assert "Duplicate raw tool name across providers: 'execute'" in str(excinfo.value)
-    assert calls["n"] == 2
-
-    class DupTool(DummyTool):
-        async def list_tools(self):
-            lst = await super().list_tools()
-            return [lst[0], lst[0]]  # duplicate names within same tool
+    assert CountingTool.get_count() == 2
 
-    globals()["DupTool"] = DupTool
     tm2 = ToolManager(module_specs=["tests.test_mcp_clients::DupTool"], overrides={}, context={})
     tools2 = await tm2.list_all_tools(use_cache=False)
     names2 = sorted(t["name"] for t in tools2)

From 32cc0149ce873599cb8fbef239401fe76b1bdf9e Mon Sep 17 00:00:00 2001
From: George Armstrong <georgea@nvidia.com>
Date: Wed, 8 Oct 2025 17:30:28 -0700
Subject: [PATCH 20/29] TST harden mcp client tests

Signed-off-by: George Armstrong <georgea@nvidia.com>
---
 tests/test_mcp_clients.py | 24 +++++++++---------------
 1 file changed, 9 insertions(+), 15 deletions(-)

diff --git a/tests/test_mcp_clients.py b/tests/test_mcp_clients.py
index bd31bc4524..878a65c112 100644
--- a/tests/test_mcp_clients.py
+++ b/tests/test_mcp_clients.py
@@ -205,24 +205,18 @@ async def execute(self, tool_name: str, arguments: dict, extra_args: dict | None
         return {"unknown": tool_name, "args": arguments}
 
 
+# Shared state for CountingTool - must be a module-level mutable object
+# so that instances created via locate() will reference the same object
+_counting_tool_state = {"call_count": 0}
+
+
 # Helper class for test_tool_manager_cache_and_duplicate_detection
 # Defined at module level so it can be imported via locate()
 class CountingTool(DummyTool):
-    # Class variable to track calls across instances
-    _call_count = 0
-
     async def list_tools(self):
-        CountingTool._call_count += 1
+        _counting_tool_state["call_count"] += 1
         return await super().list_tools()
 
-    @classmethod
-    def reset_count(cls):
-        cls._call_count = 0
-
-    @classmethod
-    def get_count(cls):
-        return cls._call_count
-
 
 # Helper class for duplicate tool detection test
 class DupTool(DummyTool):
@@ -246,16 +240,16 @@ async def test_tool_manager_list_and_execute_with_class_locator():
 @pytest.mark.asyncio
 async def test_tool_manager_cache_and_duplicate_detection():
     # Reset counter before test
-    CountingTool.reset_count()
+    _counting_tool_state["call_count"] = 0
 
     tm = ToolManager(module_specs=["tests.test_mcp_clients::CountingTool"], overrides={}, context={})
     _ = await tm.list_all_tools(use_cache=True)
     _ = await tm.list_all_tools(use_cache=True)
-    assert CountingTool.get_count() == 1
+    assert _counting_tool_state["call_count"] == 1
     with pytest.raises(ValueError) as excinfo:
         _ = await tm.list_all_tools(use_cache=False)
     assert "Duplicate raw tool name across providers: 'execute'" in str(excinfo.value)
-    assert CountingTool.get_count() == 2
+    assert _counting_tool_state["call_count"] == 2
 
     tm2 = ToolManager(module_specs=["tests.test_mcp_clients::DupTool"], overrides={}, context={})
     tools2 = await tm2.list_all_tools(use_cache=False)

From 4248725836feaeb37e347e4b9f3f6986ab116c80 Mon Sep 17 00:00:00 2001
From: George Armstrong <georgea@nvidia.com>
Date: Wed, 8 Oct 2025 17:46:57 -0700
Subject: [PATCH 21/29] TST harden mcp client tests WITH DEBUG

Signed-off-by: George Armstrong <georgea@nvidia.com>
---
 tests/test_mcp_clients.py | 37 +++++++++++++++++++++++++++----------
 1 file changed, 27 insertions(+), 10 deletions(-)

diff --git a/tests/test_mcp_clients.py b/tests/test_mcp_clients.py
index 6e850eece8..a2ef34155e 100644
--- a/tests/test_mcp_clients.py
+++ b/tests/test_mcp_clients.py
@@ -219,16 +219,19 @@ async def execute(self, tool_name: str, arguments: dict, extra_args: dict | None
         return {"unknown": tool_name, "args": arguments}
 
 
-# Shared state for CountingTool - must be a module-level mutable object
-# so that instances created via locate() will reference the same object
-_counting_tool_state = {"call_count": 0}
-
-
 # Helper class for test_tool_manager_cache_and_duplicate_detection
 # Defined at module level so it can be imported via locate()
 class CountingTool(DummyTool):
+    # Use a class variable that's mutable to track calls
+    # This will be shared across all instances
+    call_count = 0
+
+    def __init__(self) -> None:
+        super().__init__()
+
     async def list_tools(self):
-        _counting_tool_state["call_count"] += 1
+        # Increment the class variable
+        CountingTool.call_count += 1
         return await super().list_tools()
 
 
@@ -253,17 +256,31 @@ async def test_tool_manager_list_and_execute_with_class_locator():
 
 @pytest.mark.asyncio
 async def test_tool_manager_cache_and_duplicate_detection():
-    # Reset counter before test
-    _counting_tool_state["call_count"] = 0
+    import importlib
+    import sys
+
+    # Reset counter before test - access via sys.modules to ensure we get the right class
+    this_module = sys.modules[__name__]
+    CountingToolClass = getattr(this_module, "CountingTool")
+    CountingToolClass.call_count = 0
+
+    # Debug: Check what locate() will see
+    imported_module = importlib.import_module("tests.test_mcp_clients")
+    imported_class = getattr(imported_module, "CountingTool")
+    print(f"DEBUG: Same module? {this_module is imported_module}")
+    print(f"DEBUG: Same class? {CountingToolClass is imported_class}")
+    print(f"DEBUG: Class IDs: {id(CountingToolClass)} vs {id(imported_class)}")
 
     tm = ToolManager(module_specs=["tests.test_mcp_clients::CountingTool"], overrides={}, context={})
     _ = await tm.list_all_tools(use_cache=True)
     _ = await tm.list_all_tools(use_cache=True)
-    assert _counting_tool_state["call_count"] == 1
+    print(f"DEBUG: Count after cached calls: {CountingToolClass.call_count}")
+    print(f"DEBUG: Imported class count: {imported_class.call_count}")
+    assert CountingToolClass.call_count == 1, f"Expected 1 call, got {CountingToolClass.call_count}"
     with pytest.raises(ValueError) as excinfo:
         _ = await tm.list_all_tools(use_cache=False)
     assert "Duplicate raw tool name across providers: 'execute'" in str(excinfo.value)
-    assert _counting_tool_state["call_count"] == 2
+    assert CountingToolClass.call_count == 2, f"Expected 2 calls, got {CountingToolClass.call_count}"
 
     tm2 = ToolManager(module_specs=["tests.test_mcp_clients::DupTool"], overrides={}, context={})
     tools2 = await tm2.list_all_tools(use_cache=False)

From c140260186643dbf9f6bfbde923772b896b8e992 Mon Sep 17 00:00:00 2001
From: George Armstrong <georgea@nvidia.com>
Date: Wed, 8 Oct 2025 17:55:12 -0700
Subject: [PATCH 22/29] TST add test for generation issue

Signed-off-by: George Armstrong <georgea@nvidia.com>
---
 tests/test_generation.py | 40 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/tests/test_generation.py b/tests/test_generation.py
index 0d0f42c586..99440ca4cd 100644
--- a/tests/test_generation.py
+++ b/tests/test_generation.py
@@ -16,10 +16,12 @@
 
 # running most things through subprocess since that's how it's usually used
 import subprocess
+from unittest.mock import MagicMock
 
 import pytest
 
 from nemo_skills.evaluation.metrics import ComputeMetrics
+from nemo_skills.pipeline.generate import _create_commandgroup_from_config
 
 
 def test_eval_gsm8k_api(tmp_path):
@@ -149,3 +151,41 @@ def test_generate_openai_format(tmp_path, format):
     assert len(data) == 2
     assert len(data[0]["generation"]) > 0
     assert len(data[1]["generation"]) > 0
+
+
+def test_server_metadata_from_num_tasks():
+    """Test that metadata dict is properly created from server command returning (cmd, num_tasks)."""
+    mock_server_fn = MagicMock(return_value=("python server.py", 4))
+    cluster_config = {
+        "containers": {"vllm": "nvcr.io/nvidia/nemo:vllm", "nemo-skills": "nvcr.io/nvidia/nemo:skills"},
+        "executor": "slurm",
+    }
+    server_config = {
+        "server_type": "vllm",
+        "num_gpus": 8,
+        "num_nodes": 1,
+        "model_path": "/models/test",
+        "server_port": 5000,
+    }
+
+    cmd_group = _create_commandgroup_from_config(
+        generation_cmd="python generate.py",
+        server_config=server_config,
+        with_sandbox=False,
+        sandbox_port=None,
+        cluster_config=cluster_config,
+        installation_command=None,
+        get_server_command_fn=mock_server_fn,
+        partition=None,
+        qos=None,
+        time_min=None,
+        exclusive=False,
+        keep_mounts_for_sandbox=False,
+        task_name="test-task",
+        log_dir="/tmp/logs",
+    )
+
+    server_cmd = cmd_group.commands[0]
+    assert isinstance(server_cmd.metadata, dict)
+    assert server_cmd.metadata["num_tasks"] == 4
+    assert server_cmd.metadata["gpus"] == 8

From 2116744dc9ffd816234dc7fb172ebf51a0a9dacf Mon Sep 17 00:00:00 2001
From: George Armstrong <georgea@nvidia.com>
Date: Wed, 8 Oct 2025 17:56:05 -0700
Subject: [PATCH 23/29] FIX generation pipelin metadata

Signed-off-by: George Armstrong <georgea@nvidia.com>
---
 nemo_skills/pipeline/generate.py | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/nemo_skills/pipeline/generate.py b/nemo_skills/pipeline/generate.py
index 4d07c2f889..436089504e 100644
--- a/nemo_skills/pipeline/generate.py
+++ b/nemo_skills/pipeline/generate.py
@@ -69,16 +69,15 @@ def _create_commandgroup_from_config(
         server_container = server_config.pop("container", cluster_config["containers"][server_type])
 
         # Call server command builder directly with cluster_config
-        cmd, metadata = get_server_command_fn(**server_config, cluster_config=cluster_config)
-
-        # Add additional metadata
-        metadata.update(
-            {
-                "gpus": server_config["num_gpus"],
-                "nodes": server_config["num_nodes"],
-                "log_prefix": "server",
-            }
-        )
+        cmd, num_tasks = get_server_command_fn(**server_config, cluster_config=cluster_config)
+
+        # Create metadata dict
+        metadata = {
+            "num_tasks": num_tasks,
+            "gpus": server_config["num_gpus"],
+            "nodes": server_config["num_nodes"],
+            "log_prefix": "server",
+        }
 
         server_cmd = Command(
             command=cmd,

From 5f25397a901068575d7205558bcfa07d5b8e8e7e Mon Sep 17 00:00:00 2001
From: George Armstrong <georgea@nvidia.com>
Date: Wed, 8 Oct 2025 18:09:29 -0700
Subject: [PATCH 24/29] TST harden mcp client tests

Signed-off-by: George Armstrong <georgea@nvidia.com>
---
 tests/test_mcp_clients.py | 19 ++++++-------------
 1 file changed, 6 insertions(+), 13 deletions(-)

diff --git a/tests/test_mcp_clients.py b/tests/test_mcp_clients.py
index a2ef34155e..bf893e1e19 100644
--- a/tests/test_mcp_clients.py
+++ b/tests/test_mcp_clients.py
@@ -245,7 +245,8 @@ async def list_tools(self):
 @pytest.mark.asyncio
 async def test_tool_manager_list_and_execute_with_class_locator():
     # Register this test module's DummyTool via module locator
-    tm = ToolManager(module_specs=["tests.test_mcp_clients::DummyTool"], overrides={}, context={})
+    # Use __name__ to get actual module path (works in both local and CI)
+    tm = ToolManager(module_specs=[f"{__name__}::DummyTool"], overrides={}, context={})
     tools = await tm.list_all_tools(use_cache=False)
     names = sorted(t["name"] for t in tools)
     assert names == ["echo", "execute"]
@@ -256,7 +257,6 @@ async def test_tool_manager_list_and_execute_with_class_locator():
 
 @pytest.mark.asyncio
 async def test_tool_manager_cache_and_duplicate_detection():
-    import importlib
     import sys
 
     # Reset counter before test - access via sys.modules to ensure we get the right class
@@ -264,25 +264,18 @@ async def test_tool_manager_cache_and_duplicate_detection():
     CountingToolClass = getattr(this_module, "CountingTool")
     CountingToolClass.call_count = 0
 
-    # Debug: Check what locate() will see
-    imported_module = importlib.import_module("tests.test_mcp_clients")
-    imported_class = getattr(imported_module, "CountingTool")
-    print(f"DEBUG: Same module? {this_module is imported_module}")
-    print(f"DEBUG: Same class? {CountingToolClass is imported_class}")
-    print(f"DEBUG: Class IDs: {id(CountingToolClass)} vs {id(imported_class)}")
-
-    tm = ToolManager(module_specs=["tests.test_mcp_clients::CountingTool"], overrides={}, context={})
+    # Use __name__ to get the actual module path (works in both local and CI environments)
+    module_path = __name__
+    tm = ToolManager(module_specs=[f"{module_path}::CountingTool"], overrides={}, context={})
     _ = await tm.list_all_tools(use_cache=True)
     _ = await tm.list_all_tools(use_cache=True)
-    print(f"DEBUG: Count after cached calls: {CountingToolClass.call_count}")
-    print(f"DEBUG: Imported class count: {imported_class.call_count}")
     assert CountingToolClass.call_count == 1, f"Expected 1 call, got {CountingToolClass.call_count}"
     with pytest.raises(ValueError) as excinfo:
         _ = await tm.list_all_tools(use_cache=False)
     assert "Duplicate raw tool name across providers: 'execute'" in str(excinfo.value)
     assert CountingToolClass.call_count == 2, f"Expected 2 calls, got {CountingToolClass.call_count}"
 
-    tm2 = ToolManager(module_specs=["tests.test_mcp_clients::DupTool"], overrides={}, context={})
+    tm2 = ToolManager(module_specs=[f"{module_path}::DupTool"], overrides={}, context={})
     tools2 = await tm2.list_all_tools(use_cache=False)
     names2 = sorted(t["name"] for t in tools2)
     assert names2 == ["execute"]

From 5d8ddd2223027b41223ef32271393d57208cbf2a Mon Sep 17 00:00:00 2001
From: George Armstrong <georgea@nvidia.com>
Date: Thu, 9 Oct 2025 13:04:52 -0700
Subject: [PATCH 25/29] add test for dependency issue

Signed-off-by: George Armstrong <georgea@nvidia.com>

add test for dependency issue

Signed-off-by: George Armstrong <georgea@nvidia.com>

add test for dependency issue

Signed-off-by: George Armstrong <georgea@nvidia.com>
---
 tests/test_declarative_pipeline.py | 200 +++++++++++++++++++++++++++++
 1 file changed, 200 insertions(+)

diff --git a/tests/test_declarative_pipeline.py b/tests/test_declarative_pipeline.py
index b13b6f1ed1..deb209e128 100644
--- a/tests/test_declarative_pipeline.py
+++ b/tests/test_declarative_pipeline.py
@@ -14,10 +14,14 @@
 
 """Tests for the declarative pipeline system."""
 
+import json
 from unittest.mock import MagicMock, patch
 
 import pytest
 
+from nemo_skills.pipeline.cli import wrap_arguments
+from nemo_skills.pipeline.generate import generate
+from nemo_skills.pipeline.run_cmd import run_cmd
 from nemo_skills.pipeline.utils.declarative import Command, CommandGroup, HardwareConfig, Pipeline
 
 
@@ -583,5 +587,201 @@ def test_commandgroup_missing_log_dir(self):
             pipeline.run(dry_run=True)
 
 
+class TestJobDependencies:
+    """Test job dependencies across experiments."""
+
+    def test_dependencies_separated_internal_vs_external(self):
+        """Test that internal and external dependencies are handled differently.
+
+        This verifies the fix for the bug where exp.add() was receiving external
+        experiment dependencies, causing an assertion error.
+
+        The fix:
+        - Internal deps (task handles from same experiment) → passed to exp.add()
+        - External deps (SLURM job IDs from other experiments) → passed to executor
+        """
+        import nemo_run as run
+
+        # Mock get_exp_handles to return fake SLURM job IDs
+        with patch("nemo_skills.pipeline.utils.declarative.get_exp_handles") as mock_get_handles:
+            mock_get_handles.return_value = ["slurm_job_12345"]
+
+            # Mock get_exp to avoid actually creating experiments
+            with patch("nemo_skills.pipeline.utils.declarative.get_exp") as mock_get_exp:
+                mock_exp = MagicMock(spec=run.Experiment)
+                mock_exp.__enter__ = MagicMock(return_value=mock_exp)
+                mock_exp.__exit__ = MagicMock(return_value=False)
+                # Return different handles for each job
+                mock_exp.add = MagicMock(side_effect=["task_handle_1", "task_handle_2"])
+                mock_get_exp.return_value = mock_exp
+
+                # Mock get_executor to capture what dependencies are passed to it
+                captured_executor_calls = []
+
+                def mock_get_executor(**kwargs):
+                    captured_executor_calls.append(kwargs.get("dependencies"))
+                    mock_executor = MagicMock()
+                    mock_executor.packager = MagicMock()
+                    return mock_executor
+
+                with patch("nemo_skills.pipeline.utils.declarative.get_executor", side_effect=mock_get_executor):
+                    # Mock run_exp to avoid actually running
+                    with patch("nemo_skills.pipeline.utils.declarative.run_exp"):
+                        cluster_config = {
+                            "executor": "slurm",
+                            "containers": {"nemo-skills": "test/container"},
+                            "account": "test",
+                            "env_vars": {"HF_HOME": "/mounted/hf_home"},
+                            "mounts": ["/mounted/hf_home:/mounted/hf_home"],
+                        }
+
+                        # Job 1: depends on external experiment
+                        cmd1 = Command(command="echo job1", name="job1")
+                        group1 = CommandGroup(commands=[cmd1], name="group1", log_dir="/tmp/logs")
+
+                        # Job 2: depends on job1 (internal) AND external experiment
+                        cmd2 = Command(command="echo job2", name="job2")
+                        group2 = CommandGroup(commands=[cmd2], name="group2", log_dir="/tmp/logs")
+
+                        job1_spec = {
+                            "name": "job1",
+                            "group": group1,
+                            "dependencies": ["external_experiment"],  # External only
+                        }
+
+                        job2_spec = {
+                            "name": "job2",
+                            "group": group2,
+                            "dependencies": [job1_spec, "another_external_experiment"],  # Both internal and external
+                        }
+
+                        pipeline = Pipeline(
+                            name="test_pipeline",
+                            cluster_config=cluster_config,
+                            jobs=[job1_spec, job2_spec],
+                            skip_hf_home_check=True,
+                        )
+
+                        # Run the pipeline (mocked)
+                        pipeline.run(dry_run=False)
+
+                        # Verify executor calls
+                        assert len(captured_executor_calls) == 2
+
+                        # Job 1: should have external deps passed to executor
+                        assert captured_executor_calls[0] == ["slurm_job_12345"]
+
+                        # Job 2: should have external deps passed to executor (from another_external_experiment)
+                        assert captured_executor_calls[1] == ["slurm_job_12345"]  # From another_external_experiment
+
+                        # Verify exp.add calls
+                        assert mock_exp.add.call_count == 2
+
+                        # Job 1: should have no internal deps
+                        call1_kwargs = mock_exp.add.call_args_list[0][1]
+                        assert call1_kwargs["dependencies"] is None
+
+                        # Job 2: should have internal deps (task_handle_1 from job1)
+                        call2_kwargs = mock_exp.add.call_args_list[1][1]
+                        assert call2_kwargs["dependencies"] == ["task_handle_1"]
+
+    def test_run_after_dependencies_across_experiments(self, tmp_path):
+        """Test that run_after dependencies work when chaining multiple generate/run_cmd calls.
+
+        This test verifies that when you call:
+        1. generate() with expname="exp1"
+        2. run_cmd() with expname="exp2" and run_after=["exp1"]
+        3. generate() with expname="exp3" and run_after=["exp2"]
+
+        The dependencies are correctly set up and exp3 waits for exp2, which waits for exp1.
+        """
+        # Setup
+        output_dir = str(tmp_path)
+        input_file = f"{tmp_path}/input.jsonl"
+
+        # Create dummy input file
+        with open(input_file, "w") as f:
+            f.write(json.dumps({"problem": "test"}) + "\n")
+
+        # Step 1: First generation task
+        # Without the fix, this would work (no external deps)
+        exp1 = generate(
+            ctx=wrap_arguments("++max_samples=1"),
+            cluster="local",
+            input_file=input_file,
+            output_dir=f"{output_dir}/step1/",
+            model="nvidia/nvidia-nemotron-nano-9b-v2",
+            server_type="openai",
+            server_address="https://integrate.api.nvidia.com/v1",
+            expname="test_exp1",
+            reuse_code=False,  # Disable code reuse for simpler test
+            dry_run=True,
+        )
+
+        # Step 2: Run command that depends on exp1
+        # This tests that dependencies work across separate function calls
+        exp2 = run_cmd(
+            ctx=wrap_arguments("echo 'processing'"),
+            cluster="local",
+            log_dir=f"{output_dir}/step2-logs",
+            expname="test_exp2",
+            run_after=["test_exp1"],
+            reuse_code=False,  # Disable code reuse for simpler test
+            dry_run=True,
+        )
+
+        # Step 3: Second generation that depends on exp2
+        # This further tests chaining of dependencies
+        exp3 = generate(
+            ctx=wrap_arguments("++max_samples=1"),
+            cluster="local",
+            input_file=f"{output_dir}/step1/output.jsonl",
+            output_dir=f"{output_dir}/step3/",
+            model="nvidia/nvidia-nemotron-nano-9b-v2",
+            server_type="openai",
+            server_address="https://integrate.api.nvidia.com/v1",
+            expname="test_exp3",
+            run_after=["test_exp2"],
+            reuse_code=False,  # Disable code reuse for simpler test
+            dry_run=True,
+        )
+
+        # Verify all experiments were created successfully
+        # The key test is that NO errors were raised above
+        # (Detailed dependency routing is verified in test_dependencies_separated_internal_vs_external)
+        assert exp1 is not None
+        assert exp2 is not None
+        assert exp3 is not None
+
+    def test_run_after_with_nonexistent_experiment(self):
+        """Test that using run_after with a non-existent experiment gives a proper warning."""
+        from nemo_skills.pipeline.utils.exp import get_exp_handles
+
+        # This should return an empty list and log a warning
+        handles = get_exp_handles("nonexistent_experiment_12345", ignore_exp_not_exists=True)
+        assert handles == []
+
+    def test_run_after_with_experiment_object(self):
+        """Test that run_after can accept an experiment object directly."""
+        import nemo_run as run
+
+        from nemo_skills.pipeline.utils.exp import get_exp_handles
+
+        # Create a mock experiment
+        mock_exp = MagicMock(spec=run.Experiment)
+        mock_exp.status.return_value = {"task1": {"status": "RUNNING", "handle": "slurm_job_123"}}
+
+        # Test that we can get handles from an experiment object
+        with patch("nemo_skills.pipeline.utils.exp.AppState") as mock_app_state:
+            mock_app_state.RUNNING = "RUNNING"
+            mock_app_state.PENDING = "PENDING"
+            mock_app_state.SUBMITTED = "SUBMITTED"
+            mock_app_state.UNKNOWN = "UNKNOWN"
+
+            handles = get_exp_handles(mock_exp, ignore_finished=True)
+            assert len(handles) == 1
+            assert handles[0] == "slurm_job_123"
+
+
 if __name__ == "__main__":
     pytest.main([__file__, "-v"])

From 60543216f595305e1c347ae82f73cb2772e7a503 Mon Sep 17 00:00:00 2001
From: George Armstrong <georgea@nvidia.com>
Date: Thu, 9 Oct 2025 13:10:10 -0700
Subject: [PATCH 26/29] FIx internal vs. external deps

Signed-off-by: George Armstrong <georgea@nvidia.com>
---
 nemo_skills/pipeline/utils/declarative.py | 65 +++++++++++++++--------
 1 file changed, 42 insertions(+), 23 deletions(-)

diff --git a/nemo_skills/pipeline/utils/declarative.py b/nemo_skills/pipeline/utils/declarative.py
index 64d942c70c..7d9b1bfe7c 100644
--- a/nemo_skills/pipeline/utils/declarative.py
+++ b/nemo_skills/pipeline/utils/declarative.py
@@ -344,8 +344,11 @@ def run(
             for job_spec in self.jobs:
                 job_name = job_spec["name"]  # Already validated in _validate()
 
-                # Resolve dependencies to task handles
-                run_after_deps = []
+                # Separate internal and external dependencies from the start
+                # - Internal deps (task handles from current experiment) go to exp.add()
+                # - External deps (SLURM job IDs from other experiments) go to executor
+                internal_deps = []
+                external_deps = []
 
                 # Handle dependencies from job spec
                 job_dependencies = job_spec.get("dependencies", [])
@@ -369,18 +372,18 @@ def run(
                                 )
                                 # If no experiment found, treat as direct task handle (for _reuse_exp case)
                                 if _reuse_exp:
-                                    run_after_deps.append(dep)
+                                    external_deps.append(dep)
                                     LOG.info(
                                         f"Job '{job_name}' depends on task handle '{dep}' (from reused experiment)"
                                     )
                             else:
-                                run_after_deps.extend(exp_handles)
+                                external_deps.extend(exp_handles)
                                 LOG.info(
                                     f"Job '{job_name}' depends on external experiment '{dep}' ({len(exp_handles)} tasks)"
                                 )
                         elif _reuse_exp:
                             # For non-SLURM executors with _reuse_exp, treat as task handle
-                            run_after_deps.append(dep)
+                            external_deps.append(dep)
                             LOG.info(f"Job '{job_name}' depends on task handle '{dep}'")
                     elif isinstance(dep, dict):
                         # Dict dependency = internal job reference (by job spec object)
@@ -388,7 +391,7 @@ def run(
                         if not dep_name:
                             raise ValueError(f"Job dependency must have a 'name' field: {dep}")
                         if dep_name in job_name_to_handle:
-                            run_after_deps.append(job_name_to_handle[dep_name])
+                            internal_deps.append(job_name_to_handle[dep_name])
                             LOG.info(
                                 f"Job '{job_name}' depends on internal job '{dep_name}' (handle: {job_name_to_handle[dep_name]})"
                             )
@@ -399,12 +402,12 @@ def run(
                             )
                     else:
                         # Direct task handle object (not string or dict)
-                        run_after_deps.append(dep)
+                        internal_deps.append(dep)
                         LOG.info(f"Job '{job_name}' depends on task handle (object)")
 
-                # Convert empty list to None for cleaner handling
-                if len(run_after_deps) == 0:
-                    run_after_deps = None
+                # Convert empty lists to None for cleaner handling
+                internal_deps = internal_deps if internal_deps else None
+                external_deps = external_deps if external_deps else None
 
                 # Check if this is a multi-group job or single group
                 if "groups" in job_spec:
@@ -415,7 +418,8 @@ def run(
                             job_spec["groups"][0],
                             self.cluster_config,
                             default_log_dir=log_dir,
-                            run_after=run_after_deps if run_after_deps else None,
+                            internal_deps=internal_deps,
+                            external_deps=external_deps,
                         )
                     else:
                         # True multi-group: combine multiple groups into one heterogeneous SLURM job
@@ -424,7 +428,8 @@ def run(
                             job_spec["groups"],
                             self.cluster_config,
                             default_log_dir=log_dir,
-                            run_after=run_after_deps if run_after_deps else None,
+                            internal_deps=internal_deps,
+                            external_deps=external_deps,
                         )
                 elif "group" in job_spec:
                     # Single group job
@@ -433,7 +438,8 @@ def run(
                         job_spec["group"],
                         self.cluster_config,
                         default_log_dir=log_dir,
-                        run_after=run_after_deps if run_after_deps else None,
+                        internal_deps=internal_deps,
+                        external_deps=external_deps,
                     )
                 else:
                     raise ValueError(f"Job spec must have either 'group' or 'groups': {job_spec}")
@@ -490,6 +496,7 @@ def _create_executor(
         het_group: int,
         total_het_groups: int,
         overlap: bool,
+        dependencies: Optional[List] = None,
     ):
         """Create executor with optional environment update."""
         env_context = (
@@ -518,6 +525,7 @@ def _create_executor(
                 mounts=exec_config.get("mounts"),
                 with_ray=self.with_ray,
                 slurm_kwargs={"exclusive": hardware.exclusive} if (hardware and hardware.exclusive) else None,
+                dependencies=dependencies,
             )
 
     def _plan_and_add_job(
@@ -526,17 +534,19 @@ def _plan_and_add_job(
         groups: List[CommandGroup],
         cluster_config: Dict,
         default_log_dir: Optional[str] = None,
-        run_after: Optional[List] = None,
+        internal_deps: Optional[List] = None,
+        external_deps: Optional[List] = None,
         heterogeneous: bool = False,
     ) -> str:
         """Plan commands/executors for one or more groups and add to experiment.
 
         This encapsulates shared logic between single-group and multi-group jobs. Behavior
         differences are controlled by the 'heterogeneous' flag and the provided 'groups'.
-        """
 
-        # Pass dependencies through (nemo-run handles executor-specific behavior)
-        dependencies = run_after if run_after else None
+        Args:
+            internal_deps: Task handles from same experiment (passed to exp.add())
+            external_deps: SLURM job IDs from other experiments (passed to executor)
+        """
 
         # Resolve log directory (use first group's log_dir if present)
         log_dir = groups[0].log_dir or default_log_dir
@@ -592,6 +602,8 @@ def _plan_and_add_job(
 
                 # Resolve container and create executor
                 container_image = self._resolve_container(exec_config, command, cluster_config)
+                # Pass external dependencies only to the first executor (SLURM doesn't support per-component dependencies in hetjobs)
+                exec_dependencies = external_deps if (het_idx == 0 and comp_idx == 0) else None
                 executor = self._create_executor(
                     command,
                     exec_config,
@@ -603,6 +615,7 @@ def _plan_and_add_job(
                     het_idx if heterogeneous else comp_idx,
                     total_het_groups,
                     (len(group.commands) > 1),
+                    dependencies=exec_dependencies,
                 )
 
                 # Share packager across executors for single-group jobs
@@ -664,12 +677,14 @@ def _plan_and_add_job(
             metadata = None
 
         # Add to experiment and return task ID
+        # Note: Internal dependencies (task handles from same experiment) go to exp.add()
+        #       External dependencies (SLURM job IDs from other experiments) go to executor
         if (not heterogeneous) and len(commands) == 1:
             task_id = exp.add(
                 run.Script(inline=commands[0], metadata=metadata),
                 executor=executors[0],
                 name="nemo-run",
-                dependencies=dependencies,
+                dependencies=internal_deps,
             )
         else:
             task_id = exp.add(
@@ -679,7 +694,7 @@ def _plan_and_add_job(
                 ],
                 executor=executors,
                 name="nemo-run",
-                dependencies=dependencies,
+                dependencies=internal_deps,
             )
 
         return task_id
@@ -690,7 +705,8 @@ def _add_single_group_job(
         group: CommandGroup,
         cluster_config: Dict,
         default_log_dir: Optional[str] = None,
-        run_after: Optional[List] = None,
+        internal_deps: Optional[List] = None,
+        external_deps: Optional[List] = None,
     ) -> str:
         """Add a single CommandGroup as one job and return its task handle."""
 
@@ -699,7 +715,8 @@ def _add_single_group_job(
             groups=[group],
             cluster_config=cluster_config,
             default_log_dir=default_log_dir,
-            run_after=run_after,
+            internal_deps=internal_deps,
+            external_deps=external_deps,
             heterogeneous=False,
         )
 
@@ -709,7 +726,8 @@ def _add_multi_group_job(
         groups: List[CommandGroup],
         cluster_config: Dict,
         default_log_dir: Optional[str] = None,
-        run_after: Optional[List] = None,
+        internal_deps: Optional[List] = None,
+        external_deps: Optional[List] = None,
     ) -> str:
         """Add multiple CommandGroups as a single heterogeneous SLURM job and return task handle."""
 
@@ -718,6 +736,7 @@ def _add_multi_group_job(
             groups=groups,
             cluster_config=cluster_config,
             default_log_dir=default_log_dir,
-            run_after=run_after,
+            internal_deps=internal_deps,
+            external_deps=external_deps,
             heterogeneous=True,
         )

From bdfd79f3b8ce0b85848394dc717d0957852d30af Mon Sep 17 00:00:00 2001
From: George Armstrong <georgea@nvidia.com>
Date: Thu, 9 Oct 2025 13:37:26 -0700
Subject: [PATCH 27/29] TST fix and improve tests

Signed-off-by: George Armstrong <georgea@nvidia.com>
---
 tests/test_declarative_pipeline.py | 100 +++++++++++++++++++++++++++--
 1 file changed, 93 insertions(+), 7 deletions(-)

diff --git a/tests/test_declarative_pipeline.py b/tests/test_declarative_pipeline.py
index deb209e128..ffc3a13622 100644
--- a/tests/test_declarative_pipeline.py
+++ b/tests/test_declarative_pipeline.py
@@ -15,6 +15,7 @@
 """Tests for the declarative pipeline system."""
 
 import json
+import os
 from unittest.mock import MagicMock, patch
 
 import pytest
@@ -315,7 +316,7 @@ def test_pipeline_run_basic(self, mock_run_exp, mock_env_vars, mock_get_exp):
     @patch("nemo_skills.pipeline.utils.declarative.get_env_variables")
     @patch("nemo_skills.pipeline.utils.declarative.run_exp")
     def test_pipeline_run_with_dependencies(self, mock_run_exp, mock_env_vars, mock_get_exp):
-        """Test pipeline execution with job dependencies."""
+        """Test pipeline execution with internal job dependencies."""
         # Setup mocks
         mock_config = {
             "executor": "none",
@@ -327,7 +328,7 @@ def test_pipeline_run_with_dependencies(self, mock_run_exp, mock_env_vars, mock_
         mock_exp.add.side_effect = ["handle_1", "handle_2"]
         mock_get_exp.return_value.__enter__.return_value = mock_exp
 
-        # Create pipeline with dependencies
+        # Create pipeline with internal dependencies
         cmd1 = Command(command="echo 1", name="cmd1")
         group1 = CommandGroup(commands=[cmd1], name="group1", log_dir="/logs")
 
@@ -335,7 +336,7 @@ def test_pipeline_run_with_dependencies(self, mock_run_exp, mock_env_vars, mock_
         group2 = CommandGroup(commands=[cmd2], name="group2", log_dir="/logs")
 
         job1 = {"name": "job1", "group": group1, "dependencies": []}
-        job2 = {"name": "job2", "group": group2, "dependencies": [job1]}
+        job2 = {"name": "job2", "group": group2, "dependencies": [job1]}  # Internal dependency
 
         pipeline = Pipeline(name="test", cluster_config=mock_config, jobs=[job1, job2], skip_hf_home_check=True)
 
@@ -345,6 +346,16 @@ def test_pipeline_run_with_dependencies(self, mock_run_exp, mock_env_vars, mock_
         # Verify both jobs were added
         assert mock_exp.add.call_count == 2
 
+        # Verify job1 has no dependencies
+        call1_kwargs = mock_exp.add.call_args_list[0][1]
+        assert call1_kwargs["dependencies"] is None or call1_kwargs["dependencies"] == []
+
+        # Verify job2 has internal dependency on job1 (handle_1)
+        call2_kwargs = mock_exp.add.call_args_list[1][1]
+        assert call2_kwargs["dependencies"] == ["handle_1"], (
+            f"Job2 should depend on job1's handle, got {call2_kwargs['dependencies']}"
+        )
+
     @patch("nemo_skills.pipeline.utils.declarative.get_exp")
     @patch("nemo_skills.pipeline.utils.declarative.get_env_variables")
     @patch("nemo_skills.pipeline.utils.declarative.is_mounted_filepath")
@@ -590,6 +601,74 @@ def test_commandgroup_missing_log_dir(self):
 class TestJobDependencies:
     """Test job dependencies across experiments."""
 
+    def test_multiple_internal_dependencies(self):
+        """Test that a job can depend on multiple internal jobs."""
+        import nemo_run as run
+
+        with patch("nemo_skills.pipeline.utils.declarative.get_exp") as mock_get_exp:
+            mock_exp = MagicMock(spec=run.Experiment)
+            mock_exp.__enter__ = MagicMock(return_value=mock_exp)
+            mock_exp.__exit__ = MagicMock(return_value=False)
+            # Return handles for 3 jobs
+            mock_exp.add = MagicMock(side_effect=["handle_1", "handle_2", "handle_3"])
+            mock_get_exp.return_value = mock_exp
+
+            with patch("nemo_skills.pipeline.utils.declarative.get_executor") as mock_executor:
+                mock_executor.return_value = MagicMock(packager=MagicMock())
+
+                with patch("nemo_skills.pipeline.utils.declarative.run_exp"):
+                    cluster_config = {
+                        "executor": "slurm",
+                        "containers": {"nemo-skills": "test/container"},
+                        "account": "test",
+                        "env_vars": {"HF_HOME": "/mounted/hf_home"},
+                        "mounts": ["/mounted/hf_home:/mounted/hf_home"],
+                    }
+
+                    # Job 1 and Job 2: independent
+                    cmd1 = Command(command="echo job1", name="job1")
+                    group1 = CommandGroup(commands=[cmd1], name="group1", log_dir="/tmp/logs")
+
+                    cmd2 = Command(command="echo job2", name="job2")
+                    group2 = CommandGroup(commands=[cmd2], name="group2", log_dir="/tmp/logs")
+
+                    # Job 3: depends on both job1 and job2
+                    cmd3 = Command(command="echo job3", name="job3")
+                    group3 = CommandGroup(commands=[cmd3], name="group3", log_dir="/tmp/logs")
+
+                    job1_spec = {"name": "job1", "group": group1}
+                    job2_spec = {"name": "job2", "group": group2}
+                    job3_spec = {
+                        "name": "job3",
+                        "group": group3,
+                        "dependencies": [job1_spec, job2_spec],  # Multiple internal dependencies
+                    }
+
+                    pipeline = Pipeline(
+                        name="test_pipeline",
+                        cluster_config=cluster_config,
+                        jobs=[job1_spec, job2_spec, job3_spec],
+                        skip_hf_home_check=True,
+                        reuse_code=False,  # Disable code reuse to avoid mock issues
+                    )
+
+                    pipeline.run(dry_run=True)
+
+                    # Verify all jobs were added
+                    assert mock_exp.add.call_count == 3
+
+                    # Job 1 and 2 should have no internal dependencies
+                    call1_kwargs = mock_exp.add.call_args_list[0][1]
+                    call2_kwargs = mock_exp.add.call_args_list[1][1]
+                    assert call1_kwargs["dependencies"] is None
+                    assert call2_kwargs["dependencies"] is None
+
+                    # Job 3 should depend on both job1 and job2
+                    call3_kwargs = mock_exp.add.call_args_list[2][1]
+                    assert call3_kwargs["dependencies"] == ["handle_1", "handle_2"], (
+                        f"Job3 should depend on both handle_1 and handle_2, got {call3_kwargs['dependencies']}"
+                    )
+
     def test_dependencies_separated_internal_vs_external(self):
         """Test that internal and external dependencies are handled differently.
 
@@ -660,10 +739,11 @@ def mock_get_executor(**kwargs):
                             cluster_config=cluster_config,
                             jobs=[job1_spec, job2_spec],
                             skip_hf_home_check=True,
+                            reuse_code=False,  # Disable code reuse to avoid mock issues
                         )
 
                         # Run the pipeline (mocked)
-                        pipeline.run(dry_run=False)
+                        pipeline.run(dry_run=True)
 
                         # Verify executor calls
                         assert len(captured_executor_calls) == 2
@@ -703,11 +783,15 @@ def test_run_after_dependencies_across_experiments(self, tmp_path):
         with open(input_file, "w") as f:
             f.write(json.dumps({"problem": "test"}) + "\n")
 
+        # Use test-local cluster config for CI compatibility
+        test_config_dir = os.path.join(os.path.dirname(__file__), "gpu-tests")
+
         # Step 1: First generation task
         # Without the fix, this would work (no external deps)
         exp1 = generate(
             ctx=wrap_arguments("++max_samples=1"),
-            cluster="local",
+            cluster="test-local",
+            config_dir=test_config_dir,
             input_file=input_file,
             output_dir=f"{output_dir}/step1/",
             model="nvidia/nvidia-nemotron-nano-9b-v2",
@@ -722,7 +806,8 @@ def test_run_after_dependencies_across_experiments(self, tmp_path):
         # This tests that dependencies work across separate function calls
         exp2 = run_cmd(
             ctx=wrap_arguments("echo 'processing'"),
-            cluster="local",
+            cluster="test-local",
+            config_dir=test_config_dir,
             log_dir=f"{output_dir}/step2-logs",
             expname="test_exp2",
             run_after=["test_exp1"],
@@ -734,7 +819,8 @@ def test_run_after_dependencies_across_experiments(self, tmp_path):
         # This further tests chaining of dependencies
         exp3 = generate(
             ctx=wrap_arguments("++max_samples=1"),
-            cluster="local",
+            cluster="test-local",
+            config_dir=test_config_dir,
             input_file=f"{output_dir}/step1/output.jsonl",
             output_dir=f"{output_dir}/step3/",
             model="nvidia/nvidia-nemotron-nano-9b-v2",

From 2563cf38c4f62775b5b8d81f4be39acf094b88c9 Mon Sep 17 00:00:00 2001
From: George Armstrong <georgea@nvidia.com>
Date: Thu, 9 Oct 2025 13:57:53 -0700
Subject: [PATCH 28/29] FIX reuse_exp

Signed-off-by: George Armstrong <georgea@nvidia.com>
---
 nemo_skills/pipeline/utils/declarative.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/nemo_skills/pipeline/utils/declarative.py b/nemo_skills/pipeline/utils/declarative.py
index 7d9b1bfe7c..f9d7db3bc4 100644
--- a/nemo_skills/pipeline/utils/declarative.py
+++ b/nemo_skills/pipeline/utils/declarative.py
@@ -372,7 +372,7 @@ def run(
                                 )
                                 # If no experiment found, treat as direct task handle (for _reuse_exp case)
                                 if _reuse_exp:
-                                    external_deps.append(dep)
+                                    internal_deps.append(dep)
                                     LOG.info(
                                         f"Job '{job_name}' depends on task handle '{dep}' (from reused experiment)"
                                     )
@@ -382,9 +382,9 @@ def run(
                                     f"Job '{job_name}' depends on external experiment '{dep}' ({len(exp_handles)} tasks)"
                                 )
                         elif _reuse_exp:
-                            # For non-SLURM executors with _reuse_exp, treat as task handle
-                            external_deps.append(dep)
-                            LOG.info(f"Job '{job_name}' depends on task handle '{dep}'")
+                            # For non-SLURM executors with _reuse_exp, string deps are internal task handles
+                            internal_deps.append(dep)
+                            LOG.info(f"Job '{job_name}' depends on task handle '{dep}' (from reused experiment)")
                     elif isinstance(dep, dict):
                         # Dict dependency = internal job reference (by job spec object)
                         dep_name = dep.get("name")

From d3e83163d20eca2a167c3e454e474a4f9ae6965e Mon Sep 17 00:00:00 2001
From: George Armstrong <georgea@nvidia.com>
Date: Thu, 9 Oct 2025 17:05:50 -0700
Subject: [PATCH 29/29] MAINT cascade skip_hf_home_check

Signed-off-by: George Armstrong <georgea@nvidia.com>
---
 nemo_skills/pipeline/utils/declarative.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/nemo_skills/pipeline/utils/declarative.py b/nemo_skills/pipeline/utils/declarative.py
index f9d7db3bc4..956dcbd08b 100644
--- a/nemo_skills/pipeline/utils/declarative.py
+++ b/nemo_skills/pipeline/utils/declarative.py
@@ -276,7 +276,7 @@ def __init__(
         jobs: List[Dict],
         reuse_code: bool = True,
         reuse_code_exp: Optional[str] = None,
-        skip_hf_home_check: bool = False,
+        skip_hf_home_check: bool | None = None,
         with_ray: bool = False,
         run_after: Optional[Union[str, List[str]]] = None,  # Pipeline-level dependency on other experiments
     ):
@@ -284,6 +284,9 @@ def __init__(
         self.cluster_config = cluster_config
         self.reuse_code = reuse_code
         self.reuse_code_exp = reuse_code_exp
+        # If not explicitly set, resolve from cluster config (matching exp.py behavior)
+        if skip_hf_home_check is None:
+            skip_hf_home_check = cluster_config.get("skip_hf_home_check", False)
         self.skip_hf_home_check = skip_hf_home_check
         self.with_ray = with_ray
         self.run_after = run_after