Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
104 commits
Select commit Hold shift + click to select a range
537e8b3
some support
wasiahmad Nov 29, 2025
febd824
updating to support mini-swe-agent
wasiahmad Dec 21, 2025
783354b
updating to support mini-swe-agent
wasiahmad Dec 21, 2025
71167b0
updating to support mini-swe-agent
wasiahmad Dec 21, 2025
6422992
fixing a minor bug
wasiahmad Dec 21, 2025
583b9ab
fixing a minor bug
wasiahmad Dec 21, 2025
3d99c60
fixing a minor bug
wasiahmad Dec 21, 2025
3846964
fixing a minor bug
wasiahmad Dec 21, 2025
dac4bca
fixing a minor bug
wasiahmad Dec 21, 2025
9f3e234
fixing a minor bug
wasiahmad Dec 22, 2025
fa2e5b5
Add HMMT Nov 2025 dataset (#1061)
i-vainn Dec 1, 2025
4bd5f7d
Use docker build cache (#1056)
gwarmstrong Dec 2, 2025
aff6e4a
ci: Add CodeRabbit configuration file (#1063)
chtruong814 Dec 2, 2025
7c44a0d
FIX integration tests by escaping aalcr and adding judge args (#1062)
gwarmstrong Dec 2, 2025
e5bcd68
ENH add tool calling args (#1067)
gwarmstrong Dec 2, 2025
c74cd99
Fix sglang tool calling (#1070)
gwarmstrong Dec 4, 2025
e03f563
Network Blocking for Sandbox Code Execution (#1071)
gwarmstrong Dec 4, 2025
c376270
Fixes to support SWE-bench Multilingual (#1064)
ludwig-n Dec 4, 2025
1b1f66e
fix: IFBench error handling and build improvements (#1073)
gwarmstrong Dec 4, 2025
782b083
FIX math verify handle leading zeros and int literals cases (#1074)
gwarmstrong Dec 4, 2025
1545f73
build: move data preparation to beginning of gpu tests build (#1077)
gwarmstrong Dec 5, 2025
6594d4c
MAINT update langugage-data dependency (#1076)
gwarmstrong Dec 5, 2025
53f1056
MAINT: Add audio requirements to vllm image (#1081)
gwarmstrong Dec 5, 2025
7e35ddd
Add apex-shortlist dataset (#1080)
i-vainn Dec 8, 2025
0316807
Introduce regex for small differences of formatting from judge (#1082)
wprazuch Dec 9, 2025
0807259
Add LCB Prompts, fix regex bug in robust_eval, remove CR, make summar…
gnalbandyan Dec 9, 2025
b74c543
MAINT pin nemo-evaluator (#1095)
gwarmstrong Dec 10, 2025
5c15cf7
Update issue templates
gwarmstrong Dec 11, 2025
c4eb65f
Delete .github/ISSUE_TEMPLATE directory
gwarmstrong Dec 11, 2025
2d93252
enable blank issues (#1096)
gwarmstrong Dec 11, 2025
b40fff1
Fix input_file path handling when executor is "none" (#1089)
bzantium Dec 11, 2025
da79a43
TST for #1089 (#1097)
gwarmstrong Dec 11, 2025
e4aa660
Stepheng/prover cleanup (#1078)
stephencge Dec 11, 2025
7934476
add stem dependencies in main python sandbox (#1099)
jiacheng-xu Dec 11, 2025
eb5fe5a
Audiometrics unification (#1093)
Jorjeous Dec 11, 2025
56a3fa9
FEAT Add Tavily Search (#1085)
gwarmstrong Dec 11, 2025
f7e5479
updating code extraction logic (#1086)
wasiahmad Dec 11, 2025
56662d3
Sandbox add stem (#1101)
jiacheng-xu Dec 12, 2025
2007af2
Handle none output in wmtp24++ (#1091)
Froxyy-dev Dec 12, 2025
180f114
ENH enable sandbox env overrides in generate (#1107)
gwarmstrong Dec 12, 2025
637ce1f
Search Tool Parameter updates (#1112)
gwarmstrong Dec 15, 2025
3fb4e65
autoformalize cleanup (#1098)
stephencge Dec 15, 2025
c98b587
HF ASR Leaderboard Evaluation (#1104)
melllinia Dec 15, 2025
3ea7a17
Stepheng/nemotron math proofs docs (#1111)
stephencge Dec 16, 2025
e9ad754
Stepheng/prover gpt oss fix (#1114)
stephencge Dec 16, 2025
552af8c
add Nemotron-Math-V2.pdf (#1113)
wedu-nvidia Dec 16, 2025
dfc8e9a
SWE-bench: don't pass external environment variables into Apptainer c…
ludwig-n Dec 16, 2025
88ad93b
Adding clan PR with AudioBench and Librispeech PC. (#1103)
Jorjeous Dec 16, 2025
9b3c571
Schema overrides for tool-calling (#1118)
gwarmstrong Dec 16, 2025
cec7759
FIX tool call error handling and search tool errors (#1120)
gwarmstrong Dec 17, 2025
e7582a3
Use run.Script for generate pipeline (#1052)
gwarmstrong Dec 17, 2025
8e02df1
Port ICPC changes to IOI (#1046)
SeanNaren Dec 17, 2025
92a1bc9
replace raise error with LOG.warning in AA LCR dataset prepare (#1119)
anowaczynski-nvidia Dec 17, 2025
8e0c152
FIX tavily search results return type (#1123)
gwarmstrong Dec 17, 2025
9a042c1
Revert "Use run.Script for generate pipeline (#1052)" (#1125)
gwarmstrong Dec 18, 2025
667d56b
Fix: add serialized_output on bad request (#1127)
gwarmstrong Dec 18, 2025
21a4be4
update paper link (#1128)
wedu-nvidia Dec 18, 2025
25aae9e
update paper link, references to dataset, self-correction differences…
stephencge Dec 18, 2025
3754a9e
FIX ioi ignore (#1131)
gwarmstrong Dec 18, 2025
fb866ea
download AA-LCR_extracted-text.zip via hf_hub_download (#1126)
anowaczynski-nvidia Dec 18, 2025
c52c04e
Evaluation on Livecodebench-pro (#1115)
wasiahmad Dec 19, 2025
67d3493
Evaluation support for SWE-rebench (#1102)
wasiahmad Dec 24, 2025
46ecb38
Trust remote code in tokenizer (#1146)
Kipok Dec 27, 2025
b5fe5e0
Merge branch 'main' into mini_swe_agent
wasiahmad Feb 4, 2026
0a49ab3
adding mini-swe-agent in generation-task
wasiahmad Feb 4, 2026
d22eb1d
updating mini-swe-agent cmd
wasiahmad Feb 4, 2026
0a56152
updating mini-swe-agent cmd
wasiahmad Feb 4, 2026
1a815de
updating mini-swe-agent cmd
wasiahmad Feb 4, 2026
fbdaf2f
updating mini-swe-agent cmd
wasiahmad Feb 4, 2026
6ed5573
updating mini-swe-agent cmd
wasiahmad Feb 4, 2026
9338bde
updating mini-swe-agent cmd
wasiahmad Feb 4, 2026
961262e
updating mini-swe-agent cmd
wasiahmad Feb 4, 2026
4790ea9
updating mini-swe-agent cmd
wasiahmad Feb 4, 2026
08db911
updating mini-swe-agent cmd
wasiahmad Feb 4, 2026
a85041c
updating mini-swe-agent cmd
wasiahmad Feb 4, 2026
0288d9a
updating mini-swe-agent cmd
wasiahmad Feb 4, 2026
7bae25b
updating mini-swe-agent cmd
wasiahmad Feb 5, 2026
26d0e54
updating mini-swe-agent cmd
wasiahmad Feb 5, 2026
9bff948
updating mini-swe-agent cmd
wasiahmad Feb 5, 2026
241393d
updating mini-swe-agent cmd
wasiahmad Feb 5, 2026
c1fb80c
updating mini-swe-agent cmd
wasiahmad Feb 5, 2026
61c666d
updating mini-swe-agent cmd
wasiahmad Feb 5, 2026
aa08f7d
updating mini-swe-agent cmd
wasiahmad Feb 5, 2026
f3dc6df
updating mini-swe-agent cmd
wasiahmad Feb 5, 2026
9cde4e6
updating mini-swe-agent cmd
wasiahmad Feb 5, 2026
5a64c00
updating mini-swe-agent cmd
wasiahmad Feb 5, 2026
d911f9f
updating mini-swe-agent cmd
wasiahmad Feb 5, 2026
6dcba7b
updating mini-swe-agent cmd
wasiahmad Feb 5, 2026
894ca31
updating mini-swe-agent cmd
wasiahmad Feb 5, 2026
0ce1349
updating mini-swe-agent cmd
wasiahmad Feb 5, 2026
b685c0b
updating mini-swe-agent cmd
wasiahmad Feb 5, 2026
f27c50b
Fix getting patch
ludwig-n Feb 6, 2026
d567697
Save configs in separate folder
ludwig-n Feb 6, 2026
4c3e0db
Update docs
ludwig-n Feb 6, 2026
20f8580
Remove drop_params from configs
ludwig-n Feb 6, 2026
74922d7
supporting agent_max_turns
wasiahmad Feb 6, 2026
6615af3
downgrading rich to avoid issues with some instances
wasiahmad Feb 6, 2026
72f0d46
missing && added
wasiahmad Feb 7, 2026
8e0848a
Merge branch 'main' into mini_swe_agent
wasiahmad Feb 7, 2026
6251767
Merge branch 'main' into mini_swe_agent
wasiahmad Feb 7, 2026
8a095f7
adding reference
wasiahmad Feb 7, 2026
0acf4a4
Remove step_limit and set cost_limit=0 in all configs
ludwig-n Feb 10, 2026
4b05c9d
Merge branch 'main' into mini_swe_agent
wasiahmad Feb 10, 2026
276556c
Merge branch 'main' into mini_swe_agent
wasiahmad Feb 16, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 14 additions & 13 deletions docs/evaluation/code.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ More details are coming soon!
- Benchmark is defined in [`nemo_skills/dataset/swe-bench/__init__.py`](https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/dataset/swe-bench/__init__.py)
- Original benchmark source is [here](https://github.com/SWE-bench/SWE-bench).

Nemo-Skills can run inference (rollout) on SWE-bench-style datasets using 2 agentic frameworks: [SWE-agent](https://swe-agent.com/latest/) and [OpenHands](https://www.all-hands.dev/). It can then evaluate the generated patches on SWE-bench Verified/Lite/Full using the [official SWE-bench harness](https://www.swebench.com/SWE-bench/guides/evaluation/).
Nemo-Skills can run inference (rollout) on SWE-bench-style datasets using 3 agent frameworks: [SWE-agent](https://swe-agent.com/latest/), [mini-SWE-agent](https://mini-swe-agent.com/latest/) and [OpenHands](https://www.all-hands.dev/). It can then evaluate the generated patches on SWE-bench Verified/Lite/Full using the [official SWE-bench harness](https://www.swebench.com/SWE-bench/guides/evaluation/).

#### Data preparation

Expand Down Expand Up @@ -66,19 +66,19 @@ When this path is accessed during evaluation, `{instance_id}` will be replaced b

There are a few parameters specific to SWE-bench. They have to be specified with the `++` prefix. All of them are optional, except for `++agent_framework`.

- **++agent_framework:** which agentic framework to use. Must be either `swe_agent` or `openhands`. No default, must be specified explicitly.
- **++agent_framework:** which agent framework to use. Must be one of `swe_agent`, `mini_swe_agent` or `openhands`. No default, must be specified explicitly.

- **++agent_framework_repo:** URL of the repository to use for SWE-agent/OpenHands. Allows you to pass in a custom fork of these repositories. If you do this, you may find it helpful to check [nemo_skills/inference/eval/swebench.py](https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/inference/eval/swebench.py) to understand how the frameworks are used internally. This is passed directly as an argument to `git clone`. Defaults to the official repositories: [`https://github.com/SWE-agent/SWE-agent.git`](https://github.com/SWE-agent/SWE-agent) for SWE-agent, [`https://github.com/All-Hands-AI/OpenHands.git`](https://github.com/All-Hands-AI/OpenHands) for OpenHands.
- **++agent_framework_repo:** URL of the repository to use for SWE-agent/mini-SWE-agent/OpenHands. Allows you to pass in a custom fork of these repositories. If you do this, you may find it helpful to check [nemo_skills/inference/eval/swebench.py](https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/inference/eval/swebench.py) to understand how the frameworks are used internally. This is passed directly as an argument to `git clone`. Defaults to the official repositories: [`https://github.com/SWE-agent/SWE-agent.git`](https://github.com/SWE-agent/SWE-agent) for SWE-agent, [`https://github.com/SWE-agent/mini-swe-agent.git`](https://github.com/SWE-agent/mini-swe-agent) for mini-SWE-agent, [`https://github.com/All-Hands-AI/OpenHands.git`](https://github.com/All-Hands-AI/OpenHands) for OpenHands.

- **++agent_framework_commit:** The commit hash, branch or tag to checkout after cloning agent_framework_repo. Allows you to pin SWE-agent/OpenHands to a specific version. Defaults to `HEAD`, i.e. the latest commit.
- **++agent_framework_commit:** The commit hash, branch or tag to checkout after cloning agent_framework_repo. Allows you to pin SWE-agent/mini-SWE-agent/OpenHands to a specific version. Defaults to `HEAD` for SWE-agent & OpenHands and `v2.0` for mini-SWE-agent.

- **++agent_config:** The config file to use for SWE-agent/OpenHands.
- For SWE-agent, this is a YAML file. See the [SWE-agent docs](https://swe-agent.com/latest/config/config/).
- **++agent_config:** The config file to use for the agent framework.
- For SWE-agent and mini-SWE-agent, this is a YAML file. See the docs: [SWE-agent](https://swe-agent.com/latest/config/config/), [mini-SWE-agent](https://mini-swe-agent.com/latest/advanced/yaml_configuration/).
- For OpenHands, this is a TOML file. Nemo-Skills runs OpenHands via their SWE-bench evaluation script, so the only settings you can set are the LLM settings under the `[llm.model]` section. For more details, see the [OpenHands evaluation README](https://github.com/All-Hands-AI/OpenHands/blob/main/evaluation/README.md). Note that Nemo-Skills always uses the `[llm.model]` config section and therefore does not support multiple LLM configurations in one TOML file.
- Nemo-Skills overrides certain parameters, even if they are specified in the config file. These parameters are listed in a comment in the default config files below.
- Defaults to [eval/swe-bench/swe-agent/default](https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/prompt/config/eval/swe-bench/swe-agent/default.yaml) for SWE-agent, [eval/swe-bench/openhands/default](https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/prompt/config/eval/swe-bench/openhands/default.toml) for OpenHands. Note that if you store your configs in your local Nemo-Skills repo, then the path can be relative to the `nemo_skills/prompt` folder and the file extension is added automatically (same as how it works with regular [prompt configs](../basics/prompt-format.md)).
- Defaults to [eval/swe-bench/swe-agent/default](https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/prompt/config/eval/swe-bench/swe-agent/default.yaml) for SWE-agent, [eval/swe-bench/mini-swe-agent/swebench](https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/prompt/config/eval/swe-bench/mini-swe-agent/swebench.yaml) for mini-SWE-agent, [eval/swe-bench/openhands/default](https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/prompt/config/eval/swe-bench/openhands/default.toml) for OpenHands. Note that if you store your configs in your local Nemo-Skills repo, then the path can be relative to the `nemo_skills/prompt` folder and the file extension is added automatically (same as how it works with regular [prompt configs](../basics/prompt-format.md)).

- **++agent_max_turns:** The maximum number of turns the agent is allowed to take before the trajectory is forcibly terminated. Defaults to 100 for both SWE-agent and OpenHands.
- **++agent_max_turns:** The maximum number of turns the agent is allowed to take before the trajectory is forcibly terminated. Defaults to 100 for all agent frameworks.

- **++eval_harness_repo:** URL of the repository to use for the evaluation harness. This is passed directly as an argument to `git clone`. Defaults to [`https://github.com/Kipok/SWE-bench.git`](https://github.com/Kipok/SWE-bench), our fork of SWE-bench that supports local evaluation.

Expand All @@ -94,24 +94,25 @@ There are a few parameters specific to SWE-bench. They have to be specified with

#### Inference parameters

For this benchmark, inference parameters work a bit differently. This is because it does not use the Nemo-Skills LLM client, instead the interaction with the LLM server is handled by SWE-agent/OpenHands.
For this benchmark, inference parameters work a bit differently. This is because it does not use the Nemo-Skills LLM client, instead the interaction with the LLM server is handled by the agent framework.

Most inference parameters are not passed to the LLM by default if you don't explicitly specify them, with the exception of temperature (defaults to 0) and top_p (defaults to 0.95). Any parameters you set explicitly will be passed. Custom parameters can be set via extra_body like this: `++inference.extra_body.chat_template_kwargs.enable_thinking=False`. However, keep in mind certain parameters may not be supported by your LLM server.

It's worth noting that when using VLLM with a HuggingFace model, any parameters that are not passed to the server will be taken from the model's config on HuggingFace by default. This may or may not be what you want. To disable this, you can add `--generation-config vllm` to the `--server_args` parameter. See [VLLM docs](https://docs.vllm.ai/en/latest/configuration/engine_args.html#-generation-config).

#### Tool calling

SWE-bench requires models to call custom tools. By default SWE-agent & OpenHands expect that the LLM server supports *native tool calling*, which means the server can parse the model's tool calls and return them in a structured format separately from the rest of the model's output. This is convenient because the agentic framework doesn't have to know what the model's preferred tool call format is. In order to set this up, you need to add these arguments to `--server_args`:
SWE-bench requires models to call custom tools. By default agent frameworks expect that the LLM server supports *native tool calling*, which means the server can parse the model's tool calls and return them in a structured format separately from the rest of the model's output. This is convenient because the agent framework doesn't have to know what the model's preferred tool call format is. In order to set this up, you need to add these arguments to `--server_args`:

- for VLLM: `--enable-auto-tool-choice --tool-call-parser <PARSER_NAME>`
- for SGLang: `--tool-call-parser <PARSER_NAME>`

For more details and the list of supported parsers, see the docs: [VLLM](https://docs.vllm.ai/en/stable/features/tool_calling.html#automatic-function-calling), [SGLang](https://docs.sglang.ai/advanced_features/function_calling.html).

In addition, both SWE-agent and OpenHands can run without native tool calling. This means the tool calls will be parsed by the agentic framework itself. To try this out, you can use the following configs with the `++agent_config` parameter:
In addition, all supported agent frameworks can run without native tool calling. This means the tool calls will be parsed by the agent framework itself. To try this out, you can use the following configs with the `++agent_config` parameter:

- for SWE-agent: [eval/swe-bench/swe-agent/swe-agent-lm-32b](https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/prompt/config/eval/swe-bench/swe-agent/swe-agent-lm-32b.yaml). This was the config used for [SWE-agent-LM-32B](https://huggingface.co/SWE-bench/SWE-agent-LM-32B). Note that there are significant differences with the default config.
- for mini-SWE-agent: [eval/swe-bench/mini-swe-agent/swebench_xml](https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/prompt/config/eval/swe-bench/mini-swe-agent/swebench_xml.yaml) or [eval/swe-bench/mini-swe-agent/swebench_backticks](https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/prompt/config/eval/swe-bench/mini-swe-agent/swebench_backticks.yaml).
- for OpenHands: [eval/swe-bench/openhands/no-native-tool-calling](https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/prompt/config/eval/swe-bench/openhands/no-native-tool-calling.toml). This simply sets `native_tool_calling` to `false`.

Keep in mind that by default the tool call format expected by these frameworks will likely be different from the one that the model was trained on.
Expand Down Expand Up @@ -156,8 +157,8 @@ After all jobs are complete, you can check the results in `<OUTPUT_DIR>/eval-res
```
Keep in mind there is some variance between runs, so we recommend running evaluation multiple times and averaging out the resolve rate. To do that automatically, you can set `--benchmarks=swe-bench:N`, where N is your desired number of repeats.

To evaluate the same model with SWE-agent,
all you need to do is replace `openhands` with `swe_agent` in the command above.
To evaluate the same model with SWE-agent or mini-SWE-agent,
all you need to do is replace `openhands` with `swe_agent` or `mini_swe_agent` in the command above.

!!! note
There are some instances where the gold (ground truth) patches do not pass the evaluation tests. Therefore, it's likely that on those instances even patches that resolve the issue will be incorrectly evaluated as "unresolved". We have observed 11 such instances in SWE-bench Verified: `astropy__astropy-7606`, `astropy__astropy-8707`, `astropy__astropy-8872`, `django__django-10097`, `psf__requests-1724`, `psf__requests-1766`, `psf__requests-1921`, `psf__requests-2317`, `pylint-dev__pylint-6528`, `pylint-dev__pylint-7080`, `pylint-dev__pylint-7277`. Depending on your setup, this set of instances may be different.
Expand Down
121 changes: 121 additions & 0 deletions nemo_skills/inference/eval/swebench.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@

import hydra
import tomlkit
import yaml
from omegaconf import OmegaConf

from nemo_skills.inference.generate import GenerationTask
Expand All @@ -44,6 +45,7 @@
class SupportedAgentFrameworks(str, Enum):
    """Agent frameworks Nemo-Skills can use to roll out SWE-bench-style instances.

    Inherits from ``str`` so members compare equal to their plain-string values
    (convenient when the framework name arrives as a string from CLI/Hydra
    overrides such as ``++agent_framework=mini_swe_agent``).
    """

    swe_agent = "swe_agent"
    openhands = "openhands"
    mini_swe_agent = "mini_swe_agent"


# Like nemo_skills.inference.generate.InferenceConfig, except most parameters are not passed by default
Expand Down Expand Up @@ -254,6 +256,27 @@ def __init__(self, cfg: SweBenchGenerationConfig):
"uv pip install rich==14.2.0"
)

elif self.cfg.agent_framework == SupportedAgentFrameworks.mini_swe_agent:
if self.cfg.agent_framework_repo is None:
self.cfg.agent_framework_repo = "https://github.com/SWE-agent/mini-swe-agent.git"
if self.cfg.agent_framework_commit is None:
self.cfg.agent_framework_commit = "v2.0"
setup_commands.append(
# clone the mini-swe-agent repo
"rm -rf /root/mini-swe-agent && "
f"git clone {self.cfg.agent_framework_repo} /root/mini-swe-agent && "
"cd /root/mini-swe-agent && "
# Bypass the interactive setup wizard by pointing to the default config
"export MSWEA_MINI_CONFIG_PATH=/root/mini-swe-agent/src/minisweagent/config/benchmarks/swebench.yaml && "
f"git checkout {self.cfg.agent_framework_commit} && "
# make venv & install mini-swe-agent dependencies
"uv venv --python 3.12 --managed-python venv && "
"source venv/bin/activate && "
"uv pip install -e . && "
# force downgrade rich - newer versions cause the swe-agent logger to hang in some instances
"uv pip install rich==14.2.0"
)

elif self.cfg.agent_framework == SupportedAgentFrameworks.openhands:
if self.cfg.multilingual:
if self.cfg.agent_framework_repo is None:
Expand Down Expand Up @@ -532,6 +555,102 @@ async def _run_swe_agent(self, data_point, api_base):

return pred_jsonl_file

    async def _run_mini_swe_agent(self, data_point, api_base):
        """
        Runs mini-swe-agent on one instance.

        Builds a per-instance YAML config (base config + inference overrides), runs
        mini-swe-agent inside the instance container, then converts the resulting
        ``.traj.json`` trajectory into a one-line ``.jsonl`` prediction file.

        Args:
            data_point: A SWE-bench instance dict; ``instance_id`` and
                ``problem_statement`` keys are read here.
            api_base: Base URL of the LLM server, forwarded to litellm via
                ``model_kwargs``.

        Returns the absolute (not mounted) path to a .jsonl file in the SWE-bench evaluation format.
        """
        # Collect only the inference params the user explicitly set (None means
        # "not specified" and is deliberately not forwarded to the server),
        # mapped to their OpenAI-style parameter names.
        completion_kwargs = {
            openai_param: getattr(self.cfg.inference, ns_param)
            for ns_param, openai_param in NS_TO_OPENAI_PARAM.items()
            if getattr(self.cfg.inference, ns_param) is not None
        }
        # Any extra_body entries are passed through verbatim (and may override
        # the mapped params above).
        completion_kwargs.update(OmegaConf.to_container(self.cfg.inference.extra_body, resolve=True))
        if "top_logprobs" in completion_kwargs:
            # top_logprobs only takes effect when logprobs is enabled.
            completion_kwargs["logprobs"] = True
        if "reasoning_effort" in completion_kwargs:
            # NOTE(review): presumably needed so litellm forwards this
            # non-standard OpenAI param instead of dropping it — confirm.
            completion_kwargs["allowed_openai_params"] = ["reasoning_effort"]

        # Start from the user-provided agent config, or the bundled default.
        base_config_path = get_config_path(self.cfg.agent_config or "eval/swe-bench/mini-swe-agent/swebench")
        with open(base_config_path, "r") as f:
            full_config = yaml.safe_load(f)

        # Nemo-Skills always overrides the turn limit from ++agent_max_turns,
        # even if the config file sets its own value.
        if "agent" not in full_config:
            full_config["agent"] = {}
        full_config["agent"]["step_limit"] = self.cfg.agent_max_turns

        if "model" not in full_config:
            full_config["model"] = {}
        if "model_kwargs" not in full_config["model"]:
            full_config["model"]["model_kwargs"] = {}

        # Inject server endpoint and sampling params into the model section.
        # temperature/top_p are set unconditionally (they have Nemo-Skills
        # defaults), so they take precedence over the config file.
        full_config["model"]["model_kwargs"].update(
            {
                **completion_kwargs,
                "api_base": api_base,
                "temperature": self.cfg.inference.temperature,
                "top_p": self.cfg.inference.top_p,
            }
        )

        # Write the merged config under a per-instance filename so concurrent
        # instances don't clobber each other's configs.
        (self.output_dir / "configs").mkdir(parents=True, exist_ok=True)
        tmp_config_filename = f"configs/config_{data_point['instance_id']}.yaml"
        host_tmp_path = os.path.join(self.output_dir, tmp_config_filename)

        # Inside the container, this path maps to /trajectories_mount/
        container_tmp_path = os.path.join("/trajectories_mount", tmp_config_filename)

        with open(host_tmp_path, "w") as f:
            yaml.dump(full_config, f)

        try:
            mini_swe_agent_cmd = (
                # /root_mount holds the pre-built mini-swe-agent checkout and uv
                # install from setup_commands; copy them into the container's /root.
                "cp -r /root_mount/mini-swe-agent /root && "
                "cp -r /root_mount/uv /root && "
                "cd /root/mini-swe-agent && "
                # NOTE(review): MSWEA_CONFIGURED presumably skips mini-swe-agent's
                # interactive first-run setup wizard — confirm against upstream docs.
                "export MSWEA_CONFIGURED=true && "
                f"export MSWEA_MINI_CONFIG_PATH={container_tmp_path} && "
                # run the agent via the venv created in setup_commands
                f"/root/mini-swe-agent/venv/bin/python -m minisweagent.run.mini "
                f"--config {container_tmp_path} "
                f"--model hosted_vllm/{self.cfg.server.model} "
                # problem statement is shell-quoted since it is arbitrary text
                f"--task {shlex.quote(data_point['problem_statement'])} "
                f"--output trajectories/{data_point['instance_id']}.traj.json "
                f"--yolo "
                f"--exit-immediately && "
                # copy trajectories back out through the mounted output dir
                "mkdir -p /trajectories_mount/trajectories && cp -r trajectories/* /trajectories_mount/trajectories/"
            )

            # Execute mini-swe-agent command; search_path is the host-side file
            # the command is expected to produce.
            search_path = os.path.join(self.output_dir, "trajectories", f"{data_point['instance_id']}.traj.json")

            pred_file = await self._execute_container_command(
                data_point, mini_swe_agent_cmd, search_path, mode="agent"
            )

            with open(pred_file, "r") as f:
                trajectory_dict = json.loads(f.read().strip())

            # Convert the trajectory's "info" section into the single-line
            # prediction format expected by the SWE-bench evaluation harness.
            pred_jsonl_file = pred_file.replace(".traj.json", ".jsonl")
            with open(pred_jsonl_file, "w") as f:
                trajectory_info = trajectory_dict.get("info", {})
                trajectory_info["model_name_or_path"] = self.cfg.server.model
                trajectory_info["instance_id"] = data_point["instance_id"]

                # Rename "submission" -> "model_patch"; normalize an empty patch
                # to None and ensure a trailing newline so `git apply` accepts it.
                patch = trajectory_info.pop("submission", None)
                if not patch:
                    patch = None
                elif not patch.endswith("\n"):
                    patch += "\n"
                trajectory_info["model_patch"] = patch

                f.write(json.dumps(trajectory_info))

            return pred_jsonl_file

        finally:
            # Always remove the per-instance config, even if the agent run failed.
            if os.path.exists(host_tmp_path):
                os.remove(host_tmp_path)

async def _run_openhands(self, data_point, api_base):
"""
Runs OpenHands on one instance.
Expand Down Expand Up @@ -688,6 +807,8 @@ async def _process_single_datapoint_impl(self, data_point, data):

if self.cfg.agent_framework == SupportedAgentFrameworks.swe_agent:
pred_file = await self._run_swe_agent(data_point, api_base)
elif self.cfg.agent_framework == SupportedAgentFrameworks.mini_swe_agent:
pred_file = await self._run_mini_swe_agent(data_point, api_base)
elif self.cfg.agent_framework == SupportedAgentFrameworks.openhands:
pred_file = await self._run_openhands(data_point, api_base)
else:
Expand Down
Loading