From 409c8ee0246ad5b4748d388b567ec511beca7566 Mon Sep 17 00:00:00 2001
From: Valentin Mendelev <vmendelev@nvidia.com>
Date: Sun, 28 Dec 2025 09:46:38 -0800
Subject: [PATCH 1/7] EOS fix: run 8 chunks per node via multi-instance mode (gpus_per_node)

Signed-off-by: Fejgin, Roy <rfejgin@nvidia.com>
---
 nemo_skills/pipeline/eval.py             | 13 +++++++
 nemo_skills/pipeline/utils/eval.py       | 39 ++++++++++++++++++--
 nemo_skills/pipeline/utils/exp.py        |  8 ++---
 nemo_skills/pipeline/utils/generation.py | 45 +++++++++++++++++++-----
 nemo_skills/pipeline/utils/server.py     | 33 ++++++++++++-----
 5 files changed, 114 insertions(+), 24 deletions(-)

diff --git a/nemo_skills/pipeline/eval.py b/nemo_skills/pipeline/eval.py
index 1f1557a4f9..ac12716124 100644
--- a/nemo_skills/pipeline/eval.py
+++ b/nemo_skills/pipeline/eval.py
@@ -372,6 +372,12 @@ def eval(
         None,
         help="Number of chunks to split the dataset into. If None, will not chunk the dataset.",
     ),
+    gpus_per_node: int = typer.Option(
+        1,
+        help="Number of GPUs per node for multi-instance mode. "
+        "When > 1, launches multiple server instances (one per GPU) within a single job. "
+        "Requires num_chunks to be a multiple of gpus_per_node.",
+    ),
     chunk_ids: str = typer.Option(
         None,
         help="List of explicit chunk ids to run. Separate with , or .. to specify range. "
@@ -581,6 +587,7 @@ def eval(
         eval_requires_judge=eval_requires_judge,
         generation_type=generation_type,
         generation_module=generation_module,
+        gpus_per_node=gpus_per_node,
     )
 
     sbatch_kwargs = parse_kwargs(sbatch_kwargs, exclusive=exclusive, qos=qos, time_min=time_min)
@@ -605,9 +612,14 @@ def eval(
                 job_server_address,
                 job_server_command,
                 job_sandbox_env_overrides,
+                job_gpus_per_node,
             ) = job_args
             prev_tasks = _task_dependencies
 
+            # Add gpus_per_node to server config for multi-instance mode
+            if job_server_config and job_gpus_per_node > 1:
+                job_server_config["gpus_per_node"] = job_gpus_per_node
+
             for _ in range(dependent_jobs + 1):
                 has_tasks = True
                 new_task = pipeline_utils.add_task(
@@ -617,6 +629,7 @@ def eval(
                     log_dir=log_dir,
                     container=cluster_config["containers"]["nemo-skills"],
                     cluster_config=cluster_config,
+                    num_tasks=job_gpus_per_node,
                     partition=partition,
                     server_config=job_server_config,
                     with_sandbox=job_needs_sandbox or with_sandbox,
diff --git a/nemo_skills/pipeline/utils/eval.py b/nemo_skills/pipeline/utils/eval.py
index 9750659e5b..37c81825f7 100644
--- a/nemo_skills/pipeline/utils/eval.py
+++ b/nemo_skills/pipeline/utils/eval.py
@@ -267,11 +267,20 @@ def prepare_eval_commands(
     eval_requires_judge,
     generation_type=None,
     generation_module=None,
+    gpus_per_node: int = 1,
 ):
     # TODO: there is a bit too much code duplication here and logic is quite dense, should try to refactor
 
     # TODO: should we allow setting num chunks per benchmark when not using groups? Maybe benchmark:rs_num:num_chunks?
 
+    # Validate gpus_per_node for multi-instance mode
+    if gpus_per_node > 1:
+        if num_chunks is None:
+            raise ValueError("gpus_per_node > 1 requires num_chunks to be specified")
+        if num_chunks % gpus_per_node != 0:
+            raise ValueError(f"num_chunks ({num_chunks}) must be a multiple of gpus_per_node ({gpus_per_node})")
+        LOG.info(f"Multi-instance mode: {gpus_per_node} GPUs per node, {num_chunks // gpus_per_node} jobs")
+
     if generation_type is not None:
         if generation_module is not None:
             raise ValueError("Cannot specify both generation_module and generation_type. ")
@@ -354,7 +363,12 @@ def prepare_eval_commands(
             rerun_done=rerun_done,
         )
         for seed_idx, (seed, benchmark_chunk_ids) in enumerate(benchmark_args.remaining_jobs.items()):
-            total_evals += len(benchmark_chunk_ids)
+            # Multi-instance mode: count unique base chunks (each base chunk = 1 job)
+            if gpus_per_node > 1:
+                base_chunks = set((cid // gpus_per_node) * gpus_per_node for cid in benchmark_chunk_ids)
+                total_evals += len(base_chunks)
+            else:
+                total_evals += len(benchmark_chunk_ids)
 
     if num_jobs < 0:
         # if num_jobs is -1, we run all benchmarks in parallel
@@ -376,6 +390,7 @@ def prepare_eval_commands(
         **server_parameters,
         extra_arguments=extra_arguments,
         get_random_port=get_random_port,
+        gpus_per_node=gpus_per_node,
     )
 
     cur_eval = 0
@@ -398,7 +413,18 @@ def prepare_eval_commands(
                     random_seed=seed,
                     chunk_id=None,
                 )
-            for chunk_id in benchmark_chunk_ids:
+            # Multi-instance mode: compute which base chunks need to run
+            # If ANY chunk in a batch is incomplete, we run the entire batch (base_chunk)
+            if gpus_per_node > 1:
+                base_chunks_to_run = set()
+                for cid in benchmark_chunk_ids:
+                    base_chunk = (cid // gpus_per_node) * gpus_per_node
+                    base_chunks_to_run.add(base_chunk)
+                chunks_to_process = sorted(base_chunks_to_run)
+            else:
+                chunks_to_process = benchmark_chunk_ids
+
+            for chunk_id in chunks_to_process:
                 job_benchmarks.add(benchmark)
 
                 effective_generation_module = generation_module or benchmark_args.generation_module
@@ -431,12 +457,17 @@ def prepare_eval_commands(
                     f"{job_extra_arguments} "
                 )
 
+                # Multi-instance mode: use shell expression for chunk_id
+                effective_chunk_id = chunk_id
+                if gpus_per_node > 1:
+                    effective_chunk_id = f"$(({chunk_id} + $SLURM_LOCALID))"
+
                 cmd = pipeline_utils.get_generation_cmd(
                     input_file=benchmark_args.input_file,
                     output_dir=benchmark_output_dir,
                     extra_arguments=full_extra_arguments,
                     random_seed=seed,
-                    chunk_id=chunk_id,
+                    chunk_id=effective_chunk_id,
                     num_chunks=benchmark_args.num_chunks,
                     script=generation_module or benchmark_args.generation_module,
                     requirements=requirements,
@@ -480,12 +511,14 @@ def prepare_eval_commands(
                             # a check above guarantees that this is the same for all tasks in a job
                             generation_task.get_server_command_fn(),
                             job_sandbox_env_overrides,
+                            gpus_per_node,  # client num_tasks for multi-instance mode
                         )
                     )
                     job_server_config, job_server_address, job_extra_arguments = pipeline_utils.configure_client(
                         **server_parameters,
                         extra_arguments=extra_arguments,
                         get_random_port=get_random_port,
+                        gpus_per_node=gpus_per_node,
                     )
                     for job_benchmark in job_benchmarks:
                         benchmarks_dict[job_benchmark].job_ids.append(cur_job_idx)
diff --git a/nemo_skills/pipeline/utils/exp.py b/nemo_skills/pipeline/utils/exp.py
index 0f9ec7f123..e819349eaa 100644
--- a/nemo_skills/pipeline/utils/exp.py
+++ b/nemo_skills/pipeline/utils/exp.py
@@ -127,7 +127,7 @@ def stdout(self) -> Path:
 
     @property
     def srun_stdout(self) -> Path:
-        return Path(self.folder) / f"{self.srun_prefix}%j_srun.log"
+        return Path(self.folder) / f"{self.srun_prefix}%j_%t_srun.log"
 
     @property
     def stderr(self) -> Path:
@@ -135,7 +135,7 @@ def stderr(self) -> Path:
 
     @property
     def srun_stderr(self) -> Path:
-        return Path(self.folder) / f"{self.srun_prefix}%j_srun.log"
+        return Path(self.folder) / f"{self.srun_prefix}%j_%t_srun.log"
 
     @property
     def ls_term(self) -> str:
@@ -144,7 +144,7 @@ def ls_term(self) -> str:
         The command used to list the files is ls -1 {ls_term} 2> /dev/null
         """
         assert self.folder
-        return os.path.join(self.folder, "*%j_srun.log")
+        return os.path.join(self.folder, "*%j_*_srun.log")
 
 
 @dataclass(kw_only=True)
@@ -314,7 +314,7 @@ def get_executor(
     srun_args = [
         "--no-container-mount-home",
         "--mpi=pmix",
-        "--wait=10",
+        "--wait=240",  # wait up to 4 minutes for slower tasks to complete (important for multi-instance mode)
         # we need to be explicit about this in srun as commands might need to run in parallel
         f"--ntasks-per-node={tasks_per_node}",
         f"--nodes={num_nodes}",
diff --git a/nemo_skills/pipeline/utils/generation.py b/nemo_skills/pipeline/utils/generation.py
index 4432181cff..5d45159016 100644
--- a/nemo_skills/pipeline/utils/generation.py
+++ b/nemo_skills/pipeline/utils/generation.py
@@ -495,8 +495,15 @@ def get_generation_cmd(
         cmd += "++wait_for_sandbox=true "
 
     if chunk_id is not None:
-        cmd += f" ++num_chunks={num_chunks} ++chunk_id={chunk_id} "
-        output_file = get_chunked_rs_filename(output_dir, random_seed=random_seed, chunk_id=chunk_id)
+        # Check if chunk_id is a shell expression (e.g., "$((0 + $SLURM_LOCALID))")
+        is_shell_expr = isinstance(chunk_id, str) and "$" in str(chunk_id)
+
+        if is_shell_expr:
+            # For shell expressions, use double quotes so shell expands the expression
+            cmd += f' ++num_chunks={num_chunks} "++chunk_id={chunk_id}" '
+        else:
+            cmd += f" ++num_chunks={num_chunks} ++chunk_id={chunk_id} "
+
         donefiles = []
         # we are always waiting for all chunks in num_chunks, no matter chunk_ids in
         # the current run (as we don't want to merge partial jobs)
@@ -505,10 +512,23 @@ def get_generation_cmd(
             donefile = f"{filename}.done"
             donefiles.append(donefile)
 
-        if job_end_cmd:
-            job_end_cmd += f" && touch {donefiles[chunk_id]} "
+        if is_shell_expr:
+            # For shell expression, compute the donefile path at runtime
+            # Get the base pattern with _chunk_0 and replace with shell expression
+            base_donefile = donefiles[0]  # e.g., /path/output_chunk_0.jsonl.done
+            # Replace "_chunk_0.jsonl" with "_chunk_$((expr)).jsonl"; the shell expands $((expr)) at runtime.
+            # The whole chunk_id expression (e.g., "$((0 + $SLURM_LOCALID))") is substituted as-is.
+            donefile_pattern = base_donefile.replace("_chunk_0.jsonl", f"_chunk_{chunk_id}.jsonl")
+            if job_end_cmd:
+                job_end_cmd += f' && touch "{donefile_pattern}" '
+            else:
+                job_end_cmd = f'touch "{donefile_pattern}" '
         else:
-            job_end_cmd = f"touch {donefiles[chunk_id]} "
+            output_file = get_chunked_rs_filename(output_dir, random_seed=random_seed, chunk_id=chunk_id)
+            if job_end_cmd:
+                job_end_cmd += f" && touch {donefiles[chunk_id]} "
+            else:
+                job_end_cmd = f"touch {donefiles[chunk_id]} "
 
         # getting file name as if there is no chunking since that's where we want to merge
         merged_output_file = get_chunked_rs_filename(output_dir=output_dir, random_seed=random_seed)
@@ -582,6 +602,7 @@ def configure_client(
     get_random_port: bool,
     extra_arguments: str,
     server_container: str | None = None,
+    gpus_per_node: int = 1,
 ):
     """
     Utility function to configure a client for the model inference server.
@@ -597,6 +618,7 @@ def configure_client(
         get_random_port: Whether to get a random port for the server.
         extra_arguments: Extra arguments to pass to the command.
         server_container: Container to use for the server.
+        gpus_per_node: Number of GPUs per node for multi-instance mode.
 
     Returns:
         A tuple containing:
@@ -625,9 +647,16 @@ def configure_client(
         }
         if server_container:
             server_config["container"] = server_container
-        extra_arguments = (
-            f"++server.host=127.0.0.1 ++server.port={server_port} ++server.model={model} {extra_arguments}"
-        )
+        if gpus_per_node > 1:
+            # Multi-instance mode: port is computed at runtime based on SLURM_LOCALID
+            extra_arguments = (
+                f"++server.host=127.0.0.1 "
+                f'"++server.port=$(({server_port} + $SLURM_LOCALID))" ++server.model={model} {extra_arguments}'
+            )
+        else:
+            extra_arguments = (
+                f"++server.host=127.0.0.1 ++server.port={server_port} ++server.model={model} {extra_arguments}"
+            )
     else:  # model is hosted elsewhere
         server_config = None
         extra_arguments = f"++server.base_url={server_address} ++server.model={model} {extra_arguments}"
diff --git a/nemo_skills/pipeline/utils/server.py b/nemo_skills/pipeline/utils/server.py
index 87abca4a99..adfcbb54db 100644
--- a/nemo_skills/pipeline/utils/server.py
+++ b/nemo_skills/pipeline/utils/server.py
@@ -120,6 +120,7 @@ def get_server_command(
     server_port: int,
     server_args: str = "",
     server_entrypoint: str | None = None,
+    gpus_per_node: int = 1,
 ):
     num_tasks = num_gpus
 
@@ -209,15 +210,29 @@ def get_server_command(
     elif server_type == "generic":
         if not server_entrypoint:
             raise ValueError("For 'generic' server type, 'server_entrypoint' must be specified.")
-        server_start_cmd = (
-            f"{server_entrypoint} "
-            f"    --model {model_path} "
-            f"    --num_gpus {num_gpus} "
-            f"    --num_nodes {num_nodes} "
-            f"    --port {server_port} "
-            f"    {server_args} "
-        )
-        num_tasks = 1
+        if gpus_per_node > 1:
+            # Multi-instance mode: each SLURM task gets its own GPU and port
+            server_start_cmd = (
+                f"echo 'SLURM_LOCALID='$SLURM_LOCALID' SLURM_PROCID='$SLURM_PROCID && "
+                f"export CUDA_VISIBLE_DEVICES=${{SLURM_LOCALID:-0}} && "
+                f"{server_entrypoint} "
+                f"    --model {model_path} "
+                f"    --num_gpus 1 "
+                f"    --num_nodes 1 "
+                f"    --port $(({server_port} + ${{SLURM_LOCALID:-0}})) "
+                f"    {server_args} "
+            )
+            num_tasks = gpus_per_node
+        else:
+            server_start_cmd = (
+                f"{server_entrypoint} "
+                f"    --model {model_path} "
+                f"    --num_gpus {num_gpus} "
+                f"    --num_nodes {num_nodes} "
+                f"    --port {server_port} "
+                f"    {server_args} "
+            )
+            num_tasks = 1
     else:
         raise ValueError(f"Server type '{server_type}' not supported for model inference.")
 

From 1574526a7a85e5b9a9e1b1e2c5961dfc419726fd Mon Sep 17 00:00:00 2001
From: Valentin Mendelev <vmendelev@nvidia.com>
Date: Sun, 28 Dec 2025 12:08:20 -0800
Subject: [PATCH 2/7] eos config example

Signed-off-by: Fejgin, Roy <rfejgin@nvidia.com>
---
 cluster_configs/eos_example.yaml | 48 ++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)
 create mode 100644 cluster_configs/eos_example.yaml

diff --git a/cluster_configs/eos_example.yaml b/cluster_configs/eos_example.yaml
new file mode 100644
index 0000000000..124f85eb0d
--- /dev/null
+++ b/cluster_configs/eos_example.yaml
@@ -0,0 +1,48 @@
+executor: slurm
+
+ssh_tunnel:
+  host: login-eos.nvidia.com
+  # ------------------------------- Fill this up! -------------------------------
+  user: your_username
+  job_dir: /lustre/fsw/llmservice_nemo_speechlm/users/your_username/code/nemo-run
+  identity: ""
+  # -----------------------------------------------------------------------------
+
+# if you're running directly from cluster, you only need to define job_dir and shouldn't use ssh_tunnel
+# job_dir: <some location on slurm cluster to keep job metadata, uploaded code and generated sbatch files>
+
+account: llmservice_nemo_speechlm
+partition: batch
+job_name_prefix: ""
+
+disable_gpus_per_node: True
+
+containers:
+  trtllm: /lustre/share/llmservice_nemo_reasoning/images/nemo-skills-trtllm-latest.sqsh
+  vllm: /lustre/share/llmservice_nemo_reasoning/images/nemo-skills-vllm-latest.sqsh
+  sglang: /lustre/share/llmservice_nemo_reasoning/images/nemo-skills-sglang-latest.sqsh
+  nemo-rl: /lustre/share/llmservice_nemo_reasoning/images/nemo-skills-nemo-rl-latest.sqsh
+  megatron: /lustre/share/llmservice_nemo_reasoning/images/nemo-skills-megatron-latest.sqsh
+  sandbox: /lustre/share/llmservice_nemo_reasoning/images/nemo-skills-sandbox-latest.sqsh
+  nemo-skills: /lustre/share/llmservice_nemo_reasoning/images/nemo-skills-latest.sqsh
+  verl: /lustre/share/llmservice_nemo_reasoning/images/nemo-skills-verl-latest.sqsh
+
+mounts:
+  # - /lustre/fsw/llmservice_nemo_reasoning/hf_models:/hf_models
+  # - /lustre/fsw/llmservice_nemo_reasoning/images/swe-bench:/swe-bench-images
+  - /lustre/fsw/llmservice_nemo_speechlm:/lustre/fsw/llmservice_nemo_speechlm
+
+  # you also need to mount your own workspace folder (or any other folder you need)
+  # - /lustre/fsw/llmservice_nemo_reasoning/users/igitman/:/workspace
+
+env_vars:
+  # ------------------------------- Fill this up! -------------------------------
+  - HF_HOME=/lustre/fsw/llmservice_nemo_speechlm/users/your_username/hfcache
+  # -----------------------------------------------------------------------------
+
+timeouts:
+  batch: 04:00:00
+  interactive: 02:00:00
+
+mail_type: FAIL
+mail_user: # <your email goes here>

From 436e7b361adbed7ec3f85d73a7e2bfab59284fed Mon Sep 17 00:00:00 2001
From: Valentin Mendelev <vmendelev@nvidia.com>
Date: Fri, 30 Jan 2026 15:39:46 -0800
Subject: [PATCH 3/7] Avoid killing multi-instance tasks via srun --wait

Stop setting srun --wait by default; allow opt-in via cluster_config.srun_wait_seconds.

Signed-off-by: Fejgin, Roy <rfejgin@nvidia.com>
---
 nemo_skills/pipeline/utils/exp.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/nemo_skills/pipeline/utils/exp.py b/nemo_skills/pipeline/utils/exp.py
index e819349eaa..a527e6177c 100644
--- a/nemo_skills/pipeline/utils/exp.py
+++ b/nemo_skills/pipeline/utils/exp.py
@@ -314,13 +314,23 @@ def get_executor(
     srun_args = [
         "--no-container-mount-home",
         "--mpi=pmix",
-        "--wait=240",  # wait up to 4 minutes for slower tasks to complete (important for multi-instance mode)
         # we need to be explicit about this in srun as commands might need to run in parallel
         f"--ntasks-per-node={tasks_per_node}",
         f"--nodes={num_nodes}",
         # NeMo-run should take care of this, but we'll put it here temporarily
         f"--container-env={','.join([k.strip() for k in env_vars.keys()])}",
     ]
+    # IMPORTANT:
+    # Slurm's `srun --wait=<sec>` terminates the job step if other tasks are still
+    # running <sec> seconds after the first task exits. For multi-instance runs
+    # (e.g., chunked evaluation), task runtimes can differ widely, and a low wait
+    # will kill long-running tasks (observed with `--wait=240`).
+    #
+    # If you need this behavior, configure it explicitly in the cluster config:
+    #   srun_wait_seconds: <int>
+    srun_wait_seconds = cluster_config.get("srun_wait_seconds")
+    if srun_wait_seconds is not None:
+        srun_args.append(f"--wait={int(srun_wait_seconds)}")
     if overlap:
         srun_args.append("--overlap")
     if not cluster_config.get("disable_gpus_per_node", False) and gpus_per_node is not None:

From 83c2375112c3580e9a0b605d9160b18c40f63767 Mon Sep 17 00:00:00 2001
From: Valentin Mendelev <vmendelev@nvidia.com>
Date: Fri, 30 Jan 2026 15:53:19 -0800
Subject: [PATCH 4/7] Override srun wait for multi-instance jobs

Add a large srun --wait for multi-instance runs to override nemo_run's default --wait=60, preventing premature termination when some ranks finish earlier.

Signed-off-by: Fejgin, Roy <rfejgin@nvidia.com>
---
 nemo_skills/pipeline/utils/exp.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/nemo_skills/pipeline/utils/exp.py b/nemo_skills/pipeline/utils/exp.py
index a527e6177c..01693bddb9 100644
--- a/nemo_skills/pipeline/utils/exp.py
+++ b/nemo_skills/pipeline/utils/exp.py
@@ -322,13 +322,20 @@ def get_executor(
     ]
     # IMPORTANT:
     # Slurm's `srun --wait=<sec>` terminates the job step if other tasks are still
-    # running <sec> seconds after the first task exits. For multi-instance runs
-    # (e.g., chunked evaluation), task runtimes can differ widely, and a low wait
-    # will kill long-running tasks (observed with `--wait=240`).
+    # running <sec> seconds after the first task exits.
     #
-    # If you need this behavior, configure it explicitly in the cluster config:
+    # `nemo_run` adds `--wait=60` by default; for multi-instance runs (e.g., chunked
+    # evaluation) tasks can finish at very different times (some may exit quickly
+    # due to `++skip_filled=True`), which causes Slurm to kill still-running tasks.
+    #
+    # We override this with a large wait by default for multi-instance mode.
+    # You can customize via cluster config:
     #   srun_wait_seconds: <int>
     srun_wait_seconds = cluster_config.get("srun_wait_seconds")
+    if srun_wait_seconds is None and tasks_per_node > 1:
+        # Use a very large wait (1 day) so long-running ranks aren't killed just
+        # because other ranks finished earlier.
+        srun_wait_seconds = 24 * 60 * 60
     if srun_wait_seconds is not None:
         srun_args.append(f"--wait={int(srun_wait_seconds)}")
     if overlap:

From fbfdff9b781c8cfd214ed2052c26a0949ae3b173 Mon Sep 17 00:00:00 2001
From: Valentin Mendelev <vmendelev@nvidia.com>
Date: Fri, 30 Jan 2026 16:35:08 -0800
Subject: [PATCH 5/7] Set multi-instance srun wait to 1 hour

Use a 1-hour default srun --wait for multi-instance runs to avoid premature task termination when chunk runtimes differ.

Signed-off-by: Fejgin, Roy <rfejgin@nvidia.com>
---
 nemo_skills/pipeline/utils/exp.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/nemo_skills/pipeline/utils/exp.py b/nemo_skills/pipeline/utils/exp.py
index 01693bddb9..d22fddec6a 100644
--- a/nemo_skills/pipeline/utils/exp.py
+++ b/nemo_skills/pipeline/utils/exp.py
@@ -333,9 +333,9 @@ def get_executor(
     #   srun_wait_seconds: <int>
     srun_wait_seconds = cluster_config.get("srun_wait_seconds")
     if srun_wait_seconds is None and tasks_per_node > 1:
-        # Use a very large wait (1 day) so long-running ranks aren't killed just
+        # Use a reasonably large wait (1 hour) so long-running ranks aren't killed just
         # because other ranks finished earlier.
-        srun_wait_seconds = 24 * 60 * 60
+        srun_wait_seconds = 60 * 60
     if srun_wait_seconds is not None:
         srun_args.append(f"--wait={int(srun_wait_seconds)}")
     if overlap:

From db294bfc8cbd3d01641d8bd318be1b3b2662b7b6 Mon Sep 17 00:00:00 2001
From: "Fejgin, Roy" <rfejgin@nvidia.com>
Date: Wed, 18 Feb 2026 19:44:24 -0800
Subject: [PATCH 6/7] Remove internal config file

Signed-off-by: Fejgin, Roy <rfejgin@nvidia.com>
---
 cluster_configs/eos_example.yaml | 48 --------------------------------
 1 file changed, 48 deletions(-)
 delete mode 100644 cluster_configs/eos_example.yaml

diff --git a/cluster_configs/eos_example.yaml b/cluster_configs/eos_example.yaml
deleted file mode 100644
index 124f85eb0d..0000000000
--- a/cluster_configs/eos_example.yaml
+++ /dev/null
@@ -1,48 +0,0 @@
-executor: slurm
-
-ssh_tunnel:
-  host: login-eos.nvidia.com
-  # ------------------------------- Fill this up! -------------------------------
-  user: your_username
-  job_dir: /lustre/fsw/llmservice_nemo_speechlm/users/your_username/code/nemo-run
-  identity: ""
-  # -----------------------------------------------------------------------------
-
-# if you're running directly from cluster, you only need to define job_dir and shouldn't use ssh_tunnel
-# job_dir: <some location on slurm cluster to keep job metadata, uploaded code and generated sbatch files>
-
-account: llmservice_nemo_speechlm
-partition: batch
-job_name_prefix: ""
-
-disable_gpus_per_node: True
-
-containers:
-  trtllm: /lustre/share/llmservice_nemo_reasoning/images/nemo-skills-trtllm-latest.sqsh
-  vllm: /lustre/share/llmservice_nemo_reasoning/images/nemo-skills-vllm-latest.sqsh
-  sglang: /lustre/share/llmservice_nemo_reasoning/images/nemo-skills-sglang-latest.sqsh
-  nemo-rl: /lustre/share/llmservice_nemo_reasoning/images/nemo-skills-nemo-rl-latest.sqsh
-  megatron: /lustre/share/llmservice_nemo_reasoning/images/nemo-skills-megatron-latest.sqsh
-  sandbox: /lustre/share/llmservice_nemo_reasoning/images/nemo-skills-sandbox-latest.sqsh
-  nemo-skills: /lustre/share/llmservice_nemo_reasoning/images/nemo-skills-latest.sqsh
-  verl: /lustre/share/llmservice_nemo_reasoning/images/nemo-skills-verl-latest.sqsh
-
-mounts:
-  # - /lustre/fsw/llmservice_nemo_reasoning/hf_models:/hf_models
-  # - /lustre/fsw/llmservice_nemo_reasoning/images/swe-bench:/swe-bench-images
-  - /lustre/fsw/llmservice_nemo_speechlm:/lustre/fsw/llmservice_nemo_speechlm
-
-  # you also need to mount your own workspace folder (or any other folder you need)
-  # - /lustre/fsw/llmservice_nemo_reasoning/users/igitman/:/workspace
-
-env_vars:
-  # ------------------------------- Fill this up! -------------------------------
-  - HF_HOME=/lustre/fsw/llmservice_nemo_speechlm/users/your_username/hfcache
-  # -----------------------------------------------------------------------------
-
-timeouts:
-  batch: 04:00:00
-  interactive: 02:00:00
-
-mail_type: FAIL
-mail_user: # <your email goes here>

From 42eb13154168a6a7b22fcbb92353aac1411b8257 Mon Sep 17 00:00:00 2001
From: "Fejgin, Roy" <rfejgin@nvidia.com>
Date: Wed, 18 Feb 2026 20:10:04 -0800
Subject: [PATCH 7/7] Detect misconfiguration of gpus_per_node

Signed-off-by: Fejgin, Roy <rfejgin@nvidia.com>
---
 nemo_skills/pipeline/utils/server.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/nemo_skills/pipeline/utils/server.py b/nemo_skills/pipeline/utils/server.py
index adfcbb54db..124d3e7a6d 100644
--- a/nemo_skills/pipeline/utils/server.py
+++ b/nemo_skills/pipeline/utils/server.py
@@ -124,6 +124,13 @@ def get_server_command(
 ):
     num_tasks = num_gpus
 
+    if gpus_per_node > 1 and server_type != "generic":
+        raise ValueError(
+            f"Multi-instance mode (gpus_per_node={gpus_per_node}) is only supported for "
+            f"server_type='generic', but got server_type='{server_type}'. "
+            f"Use gpus_per_node=1 or switch to server_type='generic'."
+        )
+
     # check if the model path is mounted if not vllm, sglang, or trtllm;
     # vllm, sglang, trtllm can also pass model name as "model_path" so we need special processing
     if server_type not in ["vllm", "sglang", "trtllm", "generic"]: