diff --git a/Cargo.lock b/Cargo.lock index 8ce0ef66c877..f3d6daa74423 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2201,7 +2201,7 @@ dependencies = [ [[package]] name = "goose-bench" -version = "1.0.10" +version = "1.0.12" dependencies = [ "anyhow", "async-trait", diff --git a/crates/goose-bench/src/eval_suites/core/create_file.rs b/crates/goose-bench/src/eval_suites/core/create_file.rs index 9b6b285c08c8..6fbe855ca4a3 100644 --- a/crates/goose-bench/src/eval_suites/core/create_file.rs +++ b/crates/goose-bench/src/eval_suites/core/create_file.rs @@ -1,6 +1,6 @@ // Create a new file called test.txt with the content 'Hello, World! -use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric}; +use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements}; use crate::register_evaluation; use crate::work_dir::WorkDir; use async_trait::async_trait; @@ -26,11 +26,11 @@ impl Evaluation for DeveloperCreateFile { ) -> anyhow::Result> { let mut metrics = Vec::new(); - // Send the prompt to list files + // Send the prompt to create and read file let messages = agent.prompt("Create a new file called test.txt in the current directory with the content 'Hello, World!'. Then read the contents of the new file to confirm.".to_string()).await?; - // println!("asdhflkahjsdflkasdfl"); - let valid_tool_call = messages.iter().any(|msg| { + // Check for write operation + let write_tool_call = messages.iter().any(|msg| { // Check if it's an assistant message msg.role == Role::Assistant && // Check if any content item is a tool request for creating a file @@ -60,9 +60,47 @@ impl Evaluation for DeveloperCreateFile { }) }); + // Check for read operation + let read_tool_call = messages.iter().any(|msg| { + // Check if it's an assistant message + msg.role == Role::Assistant && + // Check if any content item is a tool request for reading a file + msg.content.iter().any(|content| { + if let MessageContent::ToolRequest(tool_req) = content { + if let Ok(tool_call) = tool_req.tool_call.as_ref() { + // Check tool name is correct + if tool_call.name != "developer__text_editor" { + return false; + } + + // Parse the arguments as JSON + if let Ok(args) = serde_json::from_value::(tool_call.arguments.clone()) { + // Check all required parameters match exactly + args.get("command").and_then(Value::as_str) == Some("view") && + args.get("path").and_then(Value::as_str).is_some_and(|s| s.contains("test.txt")) + } else { + false + } + } else { + false + } + } else { + false + } + }) + }); + + metrics.push(( + "Create file".to_string(), + EvaluationMetric::Boolean(write_tool_call), + )); + metrics.push(( + "Read file".to_string(), + EvaluationMetric::Boolean(read_tool_call), + )); metrics.push(( - "Create files".to_string(), - EvaluationMetric::Boolean(valid_tool_call), + "Complete create and read".to_string(), + EvaluationMetric::Boolean(write_tool_call && read_tool_call), )); Ok(metrics) } @@ -71,8 +109,11 @@ impl Evaluation for DeveloperCreateFile { "developer_create_read_file" } - fn required_extensions(&self) -> Vec { - vec!["developer".to_string()] + fn required_extensions(&self) -> ExtensionRequirements { + ExtensionRequirements { + builtin: vec!["developer".to_string()], + external: Vec::new(), + } } } diff --git a/crates/goose-bench/src/eval_suites/core/example.rs b/crates/goose-bench/src/eval_suites/core/example.rs index 7661232c416a..dd676ac5569d 100644 --- a/crates/goose-bench/src/eval_suites/core/example.rs +++ b/crates/goose-bench/src/eval_suites/core/example.rs @@ -1,4 +1,4 @@ -use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric}; +use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements}; use crate::register_evaluation; use crate::work_dir::WorkDir; use async_trait::async_trait; @@ -36,8 +36,8 @@ impl Evaluation for ExampleEval { "example_eval" } - fn required_extensions(&self) -> Vec { - Vec::new() // Example eval doesn't require any extensions + fn required_extensions(&self) -> ExtensionRequirements { + ExtensionRequirements::default() // Example eval doesn't require any extensions } } diff --git a/crates/goose-bench/src/eval_suites/core/image.rs b/crates/goose-bench/src/eval_suites/core/image.rs index 7b361350e593..ef380b4f29dc 100644 --- a/crates/goose-bench/src/eval_suites/core/image.rs +++ b/crates/goose-bench/src/eval_suites/core/image.rs @@ -1,4 +1,4 @@ -use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric}; +use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements}; use crate::register_evaluation; use crate::work_dir::WorkDir; use async_trait::async_trait; @@ -88,8 +88,11 @@ impl Evaluation for DeveloperImage { "developer_image" } - fn required_extensions(&self) -> Vec { - vec!["developer".to_string()] + fn required_extensions(&self) -> ExtensionRequirements { + ExtensionRequirements { + builtin: vec!["developer".to_string()], + external: Vec::new(), + } } } diff --git a/crates/goose-bench/src/eval_suites/core/list_files.rs b/crates/goose-bench/src/eval_suites/core/list_files.rs index 6af400444755..9150fd4bf128 100644 --- a/crates/goose-bench/src/eval_suites/core/list_files.rs +++ b/crates/goose-bench/src/eval_suites/core/list_files.rs @@ -1,4 +1,4 @@ -use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric}; +use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements}; use crate::register_evaluation; use crate::work_dir::WorkDir; use async_trait::async_trait; @@ -72,8 +72,11 @@ impl Evaluation for DeveloperListFiles { "developer_list_files" } - fn required_extensions(&self) -> Vec { - vec!["developer".to_string()] + fn required_extensions(&self) -> ExtensionRequirements { + ExtensionRequirements { + builtin: vec!["developer".to_string()], + external: Vec::new(), + } } } diff --git a/crates/goose-bench/src/eval_suites/core/save_fact.rs b/crates/goose-bench/src/eval_suites/core/save_fact.rs index 3051bdea1488..4223bc9bf443 100644 --- a/crates/goose-bench/src/eval_suites/core/save_fact.rs +++ b/crates/goose-bench/src/eval_suites/core/save_fact.rs @@ -1,6 +1,6 @@ // Create a new file called test.txt with the content 'Hello, World! -use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric}; +use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements}; use crate::register_evaluation; use crate::work_dir::WorkDir; use async_trait::async_trait; @@ -71,8 +71,11 @@ impl Evaluation for MemoryRememberMemory { "memory_remember_memory" } - fn required_extensions(&self) -> Vec { - vec!["memory".to_string()] + fn required_extensions(&self) -> ExtensionRequirements { + ExtensionRequirements { + builtin: vec!["memory".to_string()], + external: Vec::new(), + } } } diff --git a/crates/goose-bench/src/eval_suites/core/script.rs b/crates/goose-bench/src/eval_suites/core/script.rs index 4a66a09c640b..a3a7ea538527 100644 --- a/crates/goose-bench/src/eval_suites/core/script.rs +++ b/crates/goose-bench/src/eval_suites/core/script.rs @@ -1,6 +1,6 @@ // Create a new file called test.txt with the content 'Hello, World! -use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric}; +use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements}; use crate::register_evaluation; use crate::work_dir::WorkDir; use async_trait::async_trait; @@ -69,8 +69,11 @@ impl Evaluation for ComputerControllerScript { "computercontroller_script" } - fn required_extensions(&self) -> Vec { - vec!["computercontroller".to_string()] + fn required_extensions(&self) -> ExtensionRequirements { + ExtensionRequirements { + builtin: vec!["computercontroller".to_string()], + external: Vec::new(), + } } } diff --git a/crates/goose-bench/src/eval_suites/core/search_replace.rs b/crates/goose-bench/src/eval_suites/core/search_replace.rs index 061cde024bcb..cedf23a1e689 100644 --- a/crates/goose-bench/src/eval_suites/core/search_replace.rs +++ b/crates/goose-bench/src/eval_suites/core/search_replace.rs @@ -1,4 +1,4 @@ -use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric}; +use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements}; use crate::register_evaluation; use crate::work_dir::WorkDir; use async_trait::async_trait; @@ -102,8 +102,11 @@ impl Evaluation for DeveloperSearchReplace { "developer_search_replace" } - fn required_extensions(&self) -> Vec { - vec!["developer".to_string()] + fn required_extensions(&self) -> ExtensionRequirements { + ExtensionRequirements { + builtin: vec!["developer".to_string()], + external: Vec::new(), + } } } diff --git a/crates/goose-bench/src/eval_suites/core/web_scrape.rs b/crates/goose-bench/src/eval_suites/core/web_scrape.rs index 7b20850cd536..99da65bc7bc3 100644 --- a/crates/goose-bench/src/eval_suites/core/web_scrape.rs +++ b/crates/goose-bench/src/eval_suites/core/web_scrape.rs @@ -1,6 +1,6 @@ // Create a new file called test.txt with the content 'Hello, World! -use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric}; +use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements}; use crate::register_evaluation; use crate::work_dir::WorkDir; use async_trait::async_trait; @@ -71,8 +71,11 @@ impl Evaluation for ComputerControllerWebScrape { "computercontroller_web_scrape" } - fn required_extensions(&self) -> Vec { - vec!["computercontroller".to_string()] + fn required_extensions(&self) -> ExtensionRequirements { + ExtensionRequirements { + builtin: vec!["computercontroller".to_string()], + external: Vec::new(), + } } } diff --git a/crates/goose-bench/src/eval_suites/evaluation.rs b/crates/goose-bench/src/eval_suites/evaluation.rs index 6e84916f5529..1163d1c77e49 100644 --- a/crates/goose-bench/src/eval_suites/evaluation.rs +++ b/crates/goose-bench/src/eval_suites/evaluation.rs @@ -23,6 +23,12 @@ pub enum EvaluationMetric { Boolean(bool), } +#[derive(Debug, Default)] +pub struct ExtensionRequirements { + pub builtin: Vec, + pub external: Vec, +} + #[async_trait] pub trait BenchAgent: Send + Sync { async fn prompt(&mut self, p: String) -> Result>; @@ -41,7 +47,10 @@ pub trait Evaluation: Send + Sync { fn name(&self) -> &str; - fn required_extensions(&self) -> Vec { - Vec::new() // Default implementation returns empty vec + fn required_extensions(&self) -> ExtensionRequirements { + ExtensionRequirements { + builtin: Vec::new(), + external: Vec::new(), + } } } diff --git a/crates/goose-cli/src/commands/bench.rs b/crates/goose-cli/src/commands/bench.rs index 823cd4ebc270..ad6ef4372a20 100644 --- a/crates/goose-cli/src/commands/bench.rs +++ b/crates/goose-cli/src/commands/bench.rs @@ -82,10 +82,11 @@ async fn run_eval( let mut result = EvaluationResult::new(evaluation.name().to_string()); if let Ok(work_dir) = work_dir.move_to(format!("./{}", &evaluation.name())) { - let required_extensions = evaluation.required_extensions(); + let requirements = evaluation.required_extensions(); // Create session with error capture - let base_session = build_session(None, false, Vec::new(), required_extensions).await; + let base_session = + build_session(None, false, requirements.external, requirements.builtin).await; let bench_session = Arc::new(Mutex::new(BenchSession::new(base_session))); let bench_session_clone = bench_session.clone();