Merged
16 changes: 12 additions & 4 deletions docs/evaluation/code.md
@@ -70,7 +70,7 @@ There are a few parameters specific to SWE-bench. They have to be specified with

- **++agent_framework_repo:** URL of the repository to use for SWE-agent/OpenHands. Allows you to pass in a custom fork of these repositories. If you do this, you may find it helpful to check [nemo_skills/inference/eval/swebench.py](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/inference/eval/swebench.py) to understand how the frameworks are used internally. This is passed directly as an argument to `git clone`. Defaults to the official repositories: [`https://github.com/SWE-agent/SWE-agent.git`](https://github.com/SWE-agent/SWE-agent) for SWE-agent, [`https://github.com/All-Hands-AI/OpenHands.git`](https://github.com/All-Hands-AI/OpenHands) for OpenHands.

- **++agent_framework_commit:** The commit hash to use when cloning agent_framework_repo. Allows you to pin SWE-agent/OpenHands to a specific version. Defaults to `HEAD`, i.e. the latest commit.
- **++agent_framework_commit:** The commit hash, branch, or tag to check out after cloning agent_framework_repo. Allows you to pin SWE-agent/OpenHands to a specific version. Defaults to `HEAD`, i.e. the latest commit.

- **++agent_config:** The config file to use for SWE-agent/OpenHands.
- For SWE-agent, this is a YAML file. See the [SWE-agent docs](https://swe-agent.com/latest/config/config/).
@@ -80,8 +80,16 @@ There are a few parameters specific to SWE-bench. They have to be specified with

- **++agent_max_turns:** The maximum number of turns the agent is allowed to take before the trajectory is forcibly terminated. Defaults to 100 for both SWE-agent and OpenHands.

- **++eval_harness_repo:** URL of the repository to use for the evaluation harness. This is passed directly as an argument to `git clone`. Defaults to [`https://github.com/Kipok/SWE-bench.git`](https://github.com/Kipok/SWE-bench), our fork of SWE-bench that supports local evaluation.

- **++eval_harness_commit:** The commit hash, branch, or tag to check out after cloning eval_harness_repo. Defaults to `HEAD`, i.e. the latest commit.

- **++swebench_tests_timeout:** The timeout for tests after applying the generated patch during evaluation, in seconds. Defaults to 1800, i.e. 30 minutes.

- **++max_retries:** How many times to try running inference and evaluation until a valid output file is produced. Defaults to 3.

- **++min_retry_interval, ++max_retry_interval:** The interval between retries, in seconds. Selected randomly between min and max on each retry. Defaults to 60 and 180 respectively.
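
Taken together, `max_retries`, `min_retry_interval`, and `max_retry_interval` drive a jittered retry loop. A minimal sketch of that behavior (hypothetical function names; the real logic lives in `nemo_skills/inference/eval/swebench.py` and sleeps asynchronously between attempts):

```python
import random

def pick_retry_interval(min_retry_interval=60, max_retry_interval=180):
    # A random interval in [min, max] spreads retries out so that many
    # failing instances don't all hit the network at the same time.
    return random.randint(min_retry_interval, max_retry_interval)

def run_with_retries(task, max_retries=3):
    # Try `task` up to max_retries times; between attempts the real code
    # awaits asyncio.sleep(pick_retry_interval()).
    for attempt in range(max_retries):
        try:
            return task()
        except Exception:
            if attempt == max_retries - 1:
                raise
            _interval = pick_retry_interval()  # sleep omitted in this sketch
```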

#### Inference parameters

For this benchmark, inference parameters work a bit differently. This is because it does not use the NeMo-Skills LLM client; instead, the interaction with the LLM server is handled by SWE-agent/OpenHands. Most inference parameters are not passed to the LLM by default if you don't explicitly specify them, and some parameters may be unsupported, e.g. when using OpenHands.
@@ -150,9 +158,9 @@ After all jobs are complete, you can check the results in `<OUTPUT_DIR>/eval-res
"pass@1": {
"num_entries": 500,
"gen_seconds": 7172,
"issues_resolved": 50.0,
"no_patch": 0.2,
"patch_cant_apply": 2.0
"issues_resolved": 45.0,
"no_patch": 0.4,
"patch_cant_apply": 0.8
}
}
}
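
Programmatically, the resolved rate can be pulled out of this file with plain `json` (a sketch; the key names follow the example above):

```python
import json

def read_resolved_rate(metrics_json: str) -> float:
    # metrics.json nests benchmark name -> aggregation mode -> metric values.
    data = json.loads(metrics_json)
    return float(data["swe-bench"]["pass@1"]["issues_resolved"])
```

The `check_results.py` test added in this PR does essentially this via a `get_nested_value` helper.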
56 changes: 37 additions & 19 deletions nemo_skills/inference/eval/swebench.py
@@ -17,6 +17,7 @@
import json
import logging
import os
import random
import shlex
import sys
from dataclasses import field
@@ -101,8 +102,21 @@ class SweBenchGenerationConfig:
agent_config: str | None = None
agent_max_turns: int = 100 # Max iterations for the agent

# URL of the evaluation harness repo to pass to git clone. Defaults to our fork of SWE-bench with local evaluation
eval_harness_repo: str = "https://github.com/Kipok/SWE-bench.git"
eval_harness_commit: str = "HEAD" # Which commit/branch/tag to check out after cloning the eval harness repo

swebench_tests_timeout: int = 60 * 30 # Timeout for the tests after applying the patch, in seconds

# How many times to try running inference & evaluation commands until they produce a valid output file
max_retries: int = 3

# Interval between retries, in seconds.
# Selected randomly between min_retry_interval and max_retry_interval every time an instance is retried,
# in order to avoid too many instances making network requests at the same time.
min_retry_interval: int = 60
max_retry_interval: int = 180

inference: SweBenchInferenceConfig = field(default_factory=SweBenchInferenceConfig) # LLM call parameters
# Inference server configuration {server_params}
server: dict = field(default_factory=dict)
@@ -170,9 +184,7 @@ async def apply_evaluation_hook(self, data_point):
# currently evaluation is done directly after generation already
return data_point

async def _execute_container_command(
self, data_point, command, expected_file_pattern, mode, max_retries=3, timeout=100000
):
async def _execute_container_command(self, data_point, command, expected_file_pattern, mode, timeout=100000):
"""Execute a command in an Apptainer container with retry logic."""
container_name = data_point["container_formatter"].format(
instance_id=data_point["instance_id"].replace("__", "_1776_")
@@ -194,12 +206,12 @@ async def _execute_container_command(
)

# Retry apptainer command up to max_retries times
for attempt in range(max_retries):
for attempt in range(self.cfg.max_retries):
log_file_path = logs_dir / f"{data_point['instance_id']}_{mode}_attempt{attempt + 1}.log"
LOG.info(
"Starting execution of an apptainer command (attempt %d of %d). Logs are available at %s",
attempt + 1,
max_retries,
self.cfg.max_retries,
log_file_path,
)

@@ -222,7 +234,7 @@
if process.returncode is None:
process.kill()
await process.wait()
attempt = max_retries # Force exit the loop on timeout
attempt = self.cfg.max_retries # Force exit the loop on timeout
raise ValueError("Command timed out")

# Look for the expected file
@@ -237,15 +249,21 @@
f"found {len(pred_files)}."
)
except Exception:
if attempt < max_retries - 1:
if attempt < self.cfg.max_retries - 1:
retry_interval = random.randint(self.cfg.min_retry_interval, self.cfg.max_retry_interval)
LOG.warning(
"Attempt %d failed for instance %s. Retrying...",
"Attempt %d failed for instance %s. Retrying in %d seconds...",
attempt + 1,
data_point["instance_id"],
retry_interval,
)
if retry_interval > 0:
await asyncio.sleep(retry_interval)
continue
else:
LOG.error("All %d attempts failed for instance %s", max_retries, data_point["instance_id"])
LOG.error(
"All %d attempts failed for instance %s", self.cfg.max_retries, data_point["instance_id"]
)
LOG.error("Apptainer command failed. Check logs at: %s", log_file_path)
raise ValueError(
f"Job failed for {data_point['instance_id']}. Check logs at: {log_file_path}. "
@@ -303,7 +321,9 @@ async def _run_swe_agent(self, data_point, api_base):
)

# Execute SWE-agent command
search_path = os.path.join(self.output_dir / "trajectories", "**", f"{data_point['instance_id']}.pred")
search_path = os.path.join(
self.output_dir, "trajectories", "*", "*", data_point["instance_id"], f"{data_point['instance_id']}.pred"
)
pred_file = await self._execute_container_command(data_point, swe_agent_cmd, search_path, mode="agent")

with open(pred_file, "r") as f:
@@ -408,7 +428,7 @@ async def _run_openhands(self, data_point, api_base):
)

# Execute OpenHands command
search_path = os.path.join(self.output_dir / "trajectories", "**", data_point["instance_id"], "output.jsonl")
search_path = os.path.join(self.output_dir, "trajectories", data_point["instance_id"], "output.jsonl")
out_file = await self._execute_container_command(data_point, openhands_cmd, search_path, mode="agent")

with open(out_file, "r") as f:
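
Both `search_path` changes in this file replace a recursive `**` component with fixed-depth wildcards. With plain `glob.glob`, `**` only recurses when `recursive=True` is passed (otherwise it behaves like a single `*`), so spelling out the directory levels is less error-prone. A sketch with a hypothetical layout:

```python
import glob
import os
import tempfile

# Hypothetical layout mirroring <output_dir>/trajectories/<instance_id>/output.jsonl
root = tempfile.mkdtemp()
inst = "astropy__astropy-12907"
os.makedirs(os.path.join(root, "trajectories", inst))
open(os.path.join(root, "trajectories", inst, "output.jsonl"), "w").close()

# Without recursive=True, "**" degrades to a single "*" path component,
# so it demands one extra directory level and misses the file:
loose = glob.glob(os.path.join(root, "trajectories", "**", inst, "output.jsonl"))

# Passing recursive=True lets "**" match zero or more directory levels:
rec = glob.glob(
    os.path.join(root, "trajectories", "**", inst, "output.jsonl"), recursive=True
)

# The fixed-depth pattern needs no flag and matches exactly one layout:
fixed = glob.glob(os.path.join(root, "trajectories", inst, "output.jsonl"))
```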
@@ -480,9 +500,10 @@ async def process_single_datapoint(self, data_point, data):
# first installing SWE-bench repo
"curl -LsSf https://astral.sh/uv/install.sh | sh && "
"source /root/.local/bin/env && "
"cd /root && "
"git clone https://github.com/Kipok/SWE-bench.git && "
"cd SWE-bench && "
"mkdir /root/SWE-bench && "
"cd /root/SWE-bench && "
f"git clone {self.cfg.eval_harness_repo} . && "
f"git checkout {self.cfg.eval_harness_commit} && "
"uv venv --python 3.12 venv && "
"source venv/bin/activate && "
"uv pip install -e . && "
@@ -492,15 +513,12 @@
f" --instance_ids {data_point['instance_id']} "
f" --run_id eval-outputs "
f" --timeout {self.cfg.swebench_tests_timeout} "
f" --dataset_name {data_point['dataset_name']} "
f" --split {data_point['split']} && "
f" --dataset_name {self.cfg.input_file} && "
f"cp -r logs/run_evaluation/eval-outputs /trajectories_mount/"
)

# Execute SWE-bench evaluation command
search_path = os.path.join(
self.output_dir, "eval-outputs", "**", f"{data_point['instance_id']}/report.json"
)
search_path = os.path.join(self.output_dir, "eval-outputs", "*", data_point["instance_id"], "report.json")
# TODO: should we fail on errors here? Seems that json isn't always generated
try:
report_file = await self._execute_container_command(
2 changes: 2 additions & 0 deletions tests/slurm-tests/README.md
@@ -17,3 +17,5 @@ You can change CURRENT_DATE to any value there to ensure you don't
accidentally overwrite the results of an existing pipeline.

See [./clone_and_run.sh](./clone_and_run.sh) for how to register tests to run on schedule with cron.

**Note on SWE-bench tests ([qwen3coder_30b_swebench](qwen3coder_30b_swebench)):** by default, `run_all.sh` assumes your cluster config has a mount called `/swe-bench-images` where the SWE-bench Verified images are downloaded. See the [SWE-bench docs](https://nvidia.github.io/NeMo-Skills/evaluation/code/#data-preparation) for more details. To use a different path, you can modify the test's `--container_formatter` parameter, or remove it entirely to pull the images from Docker Hub every time the test is run (not recommended).
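
The `--container_formatter` value is a template into which each instance's id is substituted; note that `swebench.py` first rewrites `__` to `_1776_` in the id before formatting. A sketch of the mapping (the template string is the one used in `run_all.sh`):

```python
def container_path(container_formatter: str, instance_id: str) -> str:
    # swebench.py replaces "__" with "_1776_" in the instance id
    # before substituting it into the container template.
    return container_formatter.format(instance_id=instance_id.replace("__", "_1776_"))

path = container_path(
    "/swe-bench-images/swebench_sweb.eval.x86_64.{instance_id}.sif",
    "astropy__astropy-12907",
)
```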
56 changes: 56 additions & 0 deletions tests/slurm-tests/qwen3coder_30b_swebench/check_results.py
@@ -0,0 +1,56 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import argparse
import os
import sys
from pathlib import Path

sys.path.append(str(Path(__file__).resolve().parent.parent)) # for utils.py
from utils import assert_all, get_nested_value, load_json, soft_assert # noqa: E402

METRIC_RANGES = {
# +/- 3 pts from scores measured on 2025-10-08 (avg of 6 runs for OpenHands, 3 for SWE-agent)
"openhands": {
("swe-bench", "pass@1", "issues_resolved"): (41.9, 47.9),
},
"swe_agent": {
("swe-bench", "pass@1", "issues_resolved"): (46.4, 52.4),
},
}


def check_results(eval_dir: str, agent_framework: str):
f = os.path.join(eval_dir, "eval-results", "swe-bench", "metrics.json")
data = load_json(f)
for category_tuple, expected_range in METRIC_RANGES[agent_framework].items():
val = float(get_nested_value(data, category_tuple))
lo, hi = expected_range
soft_assert(lo <= val <= hi, f"swe-bench ({agent_framework}) {category_tuple}={val} out of range [{lo},{hi}]")
Comment on lines +38 to +41
**coderabbitai bot** (Contributor) commented on Oct 9, 2025:

⚠️ Potential issue | 🟠 Major

Handle missing or non-numeric metrics without crashing

get_nested_value returns None when the metric is absent (e.g., evaluation failed), so float(None) raises and the script aborts before calling assert_all(). We should soft-assert in that case to report a clear failure instead of throwing a traceback.

-        val = float(get_nested_value(data, category_tuple))
-        lo, hi = expected_range
-        soft_assert(lo <= val <= hi, f"swe-bench ({agent_framework}) {category_tuple}={val} out of range [{lo},{hi}]")
+        raw_val = get_nested_value(data, category_tuple)
+        if raw_val is None:
+            soft_assert(False, f"swe-bench ({agent_framework}) {category_tuple} missing")
+            continue
+        try:
+            val = float(raw_val)
+        except (TypeError, ValueError):
+            soft_assert(False, f"swe-bench ({agent_framework}) {category_tuple}={raw_val} not numeric")
+            continue
+        lo, hi = expected_range
+        soft_assert(lo <= val <= hi, f"swe-bench ({agent_framework}) {category_tuple}={val} out of range [{lo},{hi}]")
**ludwig-n** (Collaborator, Author) replied:

doesn't really matter here, we're only checking one metric value, so it can only fail once regardless.

also we don't bother with this in the other check_results.py scripts...

**coderabbitai bot** (Contributor) replied:

@ludwig-n, that's fair. Given there's only one metric being checked and the existing pattern in other check_results.py scripts, the additional error handling isn't necessary. I appreciate the clarification!



def main():
ap = argparse.ArgumentParser()
ap.add_argument("--workspace", required=True, help="Workspace directory containing eval results")
ap.add_argument("--agent_framework", required=True, help="Agent framework used for the run")
args = ap.parse_args()

check_results(args.workspace, args.agent_framework)

assert_all()


if __name__ == "__main__":
main()
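
The script leans on four helpers imported from the shared `utils.py`, whose implementation isn't shown in this PR. A minimal sketch of the assumed semantics:

```python
import json

_failures = []

def load_json(path):
    with open(path) as f:
        return json.load(f)

def get_nested_value(data, keys):
    # Walk a tuple of keys into nested dicts, returning None if any is missing.
    for key in keys:
        if not isinstance(data, dict) or key not in data:
            return None
        data = data[key]
    return data

def soft_assert(condition, message):
    # Record the failure instead of raising, so all checks get reported.
    if not condition:
        _failures.append(message)

def assert_all():
    # Raise once at the end if any soft assertion failed.
    assert not _failures, "\n".join(_failures)
```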
90 changes: 90 additions & 0 deletions tests/slurm-tests/qwen3coder_30b_swebench/run_test.py
@@ -0,0 +1,90 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse

from nemo_skills.pipeline.cli import eval, prepare_data, run_cmd, wrap_arguments


def eval_qwen3coder(workspace, cluster, expname_prefix, wandb_project, agent_framework):
eval(
ctx=wrap_arguments(
f"++agent_framework={agent_framework} "
f"++inference.temperature=0.7 "
f"++inference.top_p=0.8 "
f"++inference.top_k=20 "
),
cluster=cluster,
model="Qwen/Qwen3-Coder-30B-A3B-Instruct",
server_type="vllm",
server_args="--enable-auto-tool-choice --tool-call-parser qwen3_coder",
server_nodes=1,
server_gpus=8,
benchmarks="swe-bench",
num_chunks=8,
dependent_jobs=2, # automatically rerun 2 times because it's common for some instances to fail
reuse_code=False, # otherwise the second run (swe-agent) tries to read the config file from the absolute cluster path and fails
**Collaborator** commented:

could you share an error please? This shouldn't be happening

**Collaborator** replied:

Merging for now, but we should address this in another pr

output_dir=workspace,
expname=expname_prefix,
wandb_project=wandb_project,
wandb_name=expname_prefix,
)


def main():
parser = argparse.ArgumentParser()
parser.add_argument("--workspace", required=True, help="Workspace directory containing all experiment data")
parser.add_argument("--cluster", required=True, help="Cluster name, e.g. oci")
parser.add_argument("--expname_prefix", required=True, help="Experiment name prefix")
parser.add_argument("--wandb_project", default="nemo-skills-slurm-ci", help="W&B project name")
parser.add_argument("--container_formatter", default=None, help="Container formatter for SWE-bench")

args = parser.parse_args()

if args.container_formatter is None:
prepare_data_args = "swe-bench"
else:
prepare_data_args = f"swe-bench --container_formatter {args.container_formatter}"
prepare_data(ctx=wrap_arguments(prepare_data_args))

for agent_framework in ["openhands", "swe_agent"]:
workspace = f"{args.workspace}/{agent_framework}"
expname_prefix = f"{args.expname_prefix}_{agent_framework}"

eval_qwen3coder(
workspace=workspace,
cluster=args.cluster,
expname_prefix=expname_prefix,
wandb_project=args.wandb_project,
agent_framework=agent_framework,
)

# schedule a dependent check job on the cluster and check if the results are as expected
checker_cmd = (
f"python tests/slurm-tests/qwen3coder_30b_swebench/check_results.py "
f" --workspace {workspace} "
f" --agent_framework {agent_framework} "
)

run_cmd(
ctx=wrap_arguments(checker_cmd),
cluster=args.cluster,
expname=f"{expname_prefix}-check-results",
log_dir=f"{workspace}/check-results-logs",
run_after=expname_prefix,
)


if __name__ == "__main__":
main()
2 changes: 2 additions & 0 deletions tests/slurm-tests/run_all.sh
@@ -11,4 +11,6 @@ sleep 10
python tests/slurm-tests/qwen3_4b_evals/run_test.py --cluster $CLUSTER --workspace /workspace/nemo-skills-slurm-ci/$CURRENT_DATE/qwen3_4b_evals --expname_prefix qwen3_4b_evals_$CURRENT_DATE &
sleep 10
python tests/slurm-tests/omr_simple_recipe/run_test.py --cluster $CLUSTER --backend nemo-rl --workspace /workspace/nemo-skills-slurm-ci/$CURRENT_DATE/omr_simple_recipe/nemo-rl --expname_prefix omr_simple_recipe_nemo_rl_$CURRENT_DATE &
sleep 10
python tests/slurm-tests/qwen3coder_30b_swebench/run_test.py --cluster $CLUSTER --workspace /workspace/nemo-skills-slurm-ci/$CURRENT_DATE/qwen3coder_30b_swebench --expname_prefix qwen3coder_30b_swebench_$CURRENT_DATE --container_formatter '/swe-bench-images/swebench_sweb.eval.x86_64.{instance_id}.sif' &
wait