diff --git a/evals/open-model-gym/agent-gym-report-2026-02-03.html b/evals/open-model-gym/agent-gym-report-2026-02-03.html deleted file mode 100644 index 5d3e63d292f0..000000000000 --- a/evals/open-model-gym/agent-gym-report-2026-02-03.html +++ /dev/null @@ -1,498 +0,0 @@ -
- - - -- 27 passed / - 8 failed / - 35 total -
-Agent Configurations: goose-full, opencode
| Model | -Agent Configuration | -everyday-app-automation | file-editing | multi-turn-edit | -
|---|---|---|---|---|
- ollama/glm-4.7-flash:latest
- |
-
- goose-full
- (goose)
- |
-
-
- file_exists: workflow-log.md file_not_empty: workflow-log.md tool_called: slack_search_messages tool_called: slack_get_user_info tool_called: jira_create_issue tool_called: calendar_create_event |
-
- user.rs exists display_name() added has return type email() preserved first_name() preserved main.rs exists main.rs unchanged cargo build |
-
- email() added compiles after turn 1 renamed to generated_email() old email() removed compiles after turn 2 |
-
- opencode
- (opencode)
- |
-
-
- file_exists: workflow-log.md file_not_empty: workflow-log.md tool_called: slack_search_messages tool_called: slack_get_user_info tool_called: jira_create_issue tool_called: calendar_create_event |
-
- user.rs exists display_name() added has return type email() preserved first_name() preserved main.rs exists main.rs unchanged cargo build | — |
- |
- ollama/gpt-oss:120b-cloud
- |
-
- goose-full
- (goose)
- |
-
-
- file_exists: workflow-log.md file_not_empty: workflow-log.md tool_called: slack_search_messages tool_called: slack_get_user_info tool_called: jira_create_issue tool_called: calendar_create_event |
-
- user.rs exists display_name() added has return type email() preserved first_name() preserved main.rs exists main.rs unchanged cargo build |
-
- email() added compiles after turn 1 renamed to generated_email() old email() removed compiles after turn 2 |
-
- opencode
- (opencode)
- |
-
-
- file_exists: workflow-log.md file_not_empty: workflow-log.md tool_called: slack_search_messages tool_called: slack_get_user_info tool_called: jira_create_issue tool_called: calendar_create_event |
-
- user.rs exists display_name() added has return type email() preserved first_name() preserved main.rs exists main.rs unchanged cargo build | — |
- |
- ollama/gpt-oss:20b
- |
-
- goose-full
- (goose)
- |
-
-
- file_exists: workflow-log.md file_not_empty: workflow-log.md tool_called: slack_search_messages tool_called: slack_get_user_info tool_called: jira_create_issue tool_called: calendar_create_event |
-
- user.rs exists display_name() added has return type email() preserved first_name() preserved main.rs exists main.rs unchanged cargo build |
-
- email() added compiles after turn 1 renamed to generated_email() old email() removed compiles after turn 2 |
-
- opencode
- (opencode)
- |
-
-
- file_exists: workflow-log.md file_not_empty: workflow-log.md tool_called: slack_search_messages tool_called: slack_get_user_info tool_called: jira_create_issue tool_called: calendar_create_event |
-
- user.rs exists display_name() added has return type email() preserved first_name() preserved main.rs exists main.rs unchanged cargo build | — |
- |
- ollama/kimi-k2.5:cloud
- |
-
- goose-full
- (goose)
- |
-
-
- file_exists: workflow-log.md file_not_empty: workflow-log.md tool_called: slack_search_messages tool_called: slack_get_user_info tool_called: jira_create_issue tool_called: calendar_create_event |
-
- user.rs exists display_name() added has return type email() preserved first_name() preserved main.rs exists main.rs unchanged cargo build |
-
- email() added compiles after turn 1 |
-
- opencode
- (opencode)
- |
-
-
- file_exists: workflow-log.md file_not_empty: workflow-log.md tool_called: slack_search_messages tool_called: slack_get_user_info tool_called: jira_create_issue tool_called: calendar_create_event |
-
- user.rs exists display_name() added has return type email() preserved first_name() preserved main.rs exists main.rs unchanged cargo build | — |
- |
- ollama/nemotron-3-nano:latest
- |
-
- goose-full
- (goose)
- |
-
-
- file_exists: workflow-log.md file_not_empty: workflow-log.md tool_called: slack_search_messages tool_called: slack_get_user_info tool_called: jira_create_issue tool_called: calendar_create_event |
-
- user.rs exists display_name() added has return type email() preserved first_name() preserved main.rs exists main.rs unchanged cargo build |
-
- email() added compiles after turn 1 |
-
- opencode
- (opencode)
- |
-
-
- file_exists: workflow-log.md file_not_empty: workflow-log.md tool_called: slack_search_messages tool_called: slack_get_user_info tool_called: jira_create_issue tool_called: calendar_create_event |
-
- user.rs exists display_name() added has return type email() preserved first_name() preserved main.rs exists main.rs unchanged cargo build | — |
- |
- anthropic/claude-opus-4-5-20251101
- |
-
- goose-full
- (goose)
- |
-
-
- file_exists: workflow-log.md file_not_empty: workflow-log.md tool_called: slack_search_messages tool_called: slack_get_user_info tool_called: jira_create_issue tool_called: calendar_create_event |
-
- user.rs exists display_name() added has return type email() preserved first_name() preserved main.rs exists main.rs unchanged cargo build |
-
- email() added compiles after turn 1 renamed to generated_email() old email() removed compiles after turn 2 |
-
- opencode
- (opencode)
- |
-
-
- file_exists: workflow-log.md file_not_empty: workflow-log.md tool_called: slack_search_messages tool_called: slack_get_user_info tool_called: jira_create_issue tool_called: calendar_create_event |
-
- user.rs exists display_name() added has return type email() preserved first_name() preserved main.rs exists main.rs unchanged cargo build | — |
- |
- ollama/qwen3-coder:64k
- |
-
- goose-full
- (goose)
- |
-
-
- file_exists: workflow-log.md file_not_empty: workflow-log.md tool_called: slack_search_messages tool_called: slack_get_user_info tool_called: jira_create_issue tool_called: calendar_create_event |
-
- user.rs exists display_name() added has return type email() preserved first_name() preserved main.rs exists main.rs unchanged cargo build |
-
- email() added compiles after turn 1 |
-
- opencode
- (opencode)
- |
-
-
- file_exists: workflow-log.md file_not_empty: workflow-log.md tool_called: slack_search_messages tool_called: slack_get_user_info tool_called: jira_create_issue tool_called: calendar_create_event |
-
- user.rs exists display_name() added has return type email() preserved first_name() preserved main.rs exists main.rs unchanged cargo build | — |
-