Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

57 changes: 49 additions & 8 deletions crates/goose-bench/src/eval_suites/core/create_file.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
// Create a new file called test.txt with the content 'Hello, World!', then read it back to confirm.

use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric};
use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements};
use crate::register_evaluation;
use crate::work_dir::WorkDir;
use async_trait::async_trait;
Expand All @@ -26,11 +26,11 @@ impl Evaluation for DeveloperCreateFile {
) -> anyhow::Result<Vec<(String, EvaluationMetric)>> {
let mut metrics = Vec::new();

// Send the prompt to list files
// Send the prompt to create and read file
let messages = agent.prompt("Create a new file called test.txt in the current directory with the content 'Hello, World!'. Then read the contents of the new file to confirm.".to_string()).await?;
// println!("asdhflkahjsdflkasdfl");

let valid_tool_call = messages.iter().any(|msg| {
// Check for write operation
let write_tool_call = messages.iter().any(|msg| {
// Check if it's an assistant message
msg.role == Role::Assistant &&
// Check if any content item is a tool request for creating a file
Expand Down Expand Up @@ -60,9 +60,47 @@ impl Evaluation for DeveloperCreateFile {
})
});

// Check for read operation: some assistant message must contain a
// `developer__text_editor` tool request whose command is "view" and whose
// path mentions test.txt.
let read_tool_call = messages.iter().any(|msg| {
    msg.role == Role::Assistant
        && msg.content.iter().any(|content| {
            // Only tool requests whose call was parsed successfully are considered.
            let MessageContent::ToolRequest(tool_req) = content else {
                return false;
            };
            let Ok(tool_call) = tool_req.tool_call.as_ref() else {
                return false;
            };
            if tool_call.name != "developer__text_editor" {
                return false;
            }

            // `arguments` is already a JSON value, so inspect it in place rather
            // than cloning it and deserializing it back into the same type.
            let args = &tool_call.arguments;

            // The command must equal "view"; the path only needs to contain
            // "test.txt" (substring match, so absolute and relative paths both pass).
            args.get("command").and_then(Value::as_str) == Some("view")
                && args
                    .get("path")
                    .and_then(Value::as_str)
                    .is_some_and(|path| path.contains("test.txt"))
        })
});

metrics.push((
"Create file".to_string(),
EvaluationMetric::Boolean(write_tool_call),
));
metrics.push((
"Read file".to_string(),
EvaluationMetric::Boolean(read_tool_call),
));
metrics.push((
"Create files".to_string(),
EvaluationMetric::Boolean(valid_tool_call),
"Complete create and read".to_string(),
EvaluationMetric::Boolean(write_tool_call && read_tool_call),
));
Ok(metrics)
}
Expand All @@ -71,8 +109,11 @@ impl Evaluation for DeveloperCreateFile {
"developer_create_read_file"
}

fn required_extensions(&self) -> Vec<String> {
vec!["developer".to_string()]
fn required_extensions(&self) -> ExtensionRequirements {
ExtensionRequirements {
builtin: vec!["developer".to_string()],
external: Vec::new(),
}
}
}

Expand Down
6 changes: 3 additions & 3 deletions crates/goose-bench/src/eval_suites/core/example.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric};
use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements};
use crate::register_evaluation;
use crate::work_dir::WorkDir;
use async_trait::async_trait;
Expand Down Expand Up @@ -36,8 +36,8 @@ impl Evaluation for ExampleEval {
"example_eval"
}

fn required_extensions(&self) -> Vec<String> {
Vec::new() // Example eval doesn't require any extensions
fn required_extensions(&self) -> ExtensionRequirements {
ExtensionRequirements::default() // Example eval doesn't require any extensions
}
}

Expand Down
9 changes: 6 additions & 3 deletions crates/goose-bench/src/eval_suites/core/image.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric};
use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements};
use crate::register_evaluation;
use crate::work_dir::WorkDir;
use async_trait::async_trait;
Expand Down Expand Up @@ -88,8 +88,11 @@ impl Evaluation for DeveloperImage {
"developer_image"
}

fn required_extensions(&self) -> Vec<String> {
vec!["developer".to_string()]
fn required_extensions(&self) -> ExtensionRequirements {
ExtensionRequirements {
builtin: vec!["developer".to_string()],
external: Vec::new(),
}
}
}

Expand Down
9 changes: 6 additions & 3 deletions crates/goose-bench/src/eval_suites/core/list_files.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric};
use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements};
use crate::register_evaluation;
use crate::work_dir::WorkDir;
use async_trait::async_trait;
Expand Down Expand Up @@ -72,8 +72,11 @@ impl Evaluation for DeveloperListFiles {
"developer_list_files"
}

fn required_extensions(&self) -> Vec<String> {
vec!["developer".to_string()]
fn required_extensions(&self) -> ExtensionRequirements {
ExtensionRequirements {
builtin: vec!["developer".to_string()],
external: Vec::new(),
}
}
}

Expand Down
9 changes: 6 additions & 3 deletions crates/goose-bench/src/eval_suites/core/save_fact.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
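// Create a new file called test.txt with the content 'Hello, World!'
// NOTE(review): this header looks copied from create_file.rs; this file implements MemoryRememberMemory — confirm and update the description.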

use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric};
use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements};
use crate::register_evaluation;
use crate::work_dir::WorkDir;
use async_trait::async_trait;
Expand Down Expand Up @@ -71,8 +71,11 @@ impl Evaluation for MemoryRememberMemory {
"memory_remember_memory"
}

fn required_extensions(&self) -> Vec<String> {
vec!["memory".to_string()]
fn required_extensions(&self) -> ExtensionRequirements {
ExtensionRequirements {
builtin: vec!["memory".to_string()],
external: Vec::new(),
}
}
}

Expand Down
9 changes: 6 additions & 3 deletions crates/goose-bench/src/eval_suites/core/script.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
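// Create a new file called test.txt with the content 'Hello, World!'
// NOTE(review): this header looks copied from create_file.rs; this file implements ComputerControllerScript — confirm and update the description.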

use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric};
use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements};
use crate::register_evaluation;
use crate::work_dir::WorkDir;
use async_trait::async_trait;
Expand Down Expand Up @@ -69,8 +69,11 @@ impl Evaluation for ComputerControllerScript {
"computercontroller_script"
}

fn required_extensions(&self) -> Vec<String> {
vec!["computercontroller".to_string()]
fn required_extensions(&self) -> ExtensionRequirements {
ExtensionRequirements {
builtin: vec!["computercontroller".to_string()],
external: Vec::new(),
}
}
}

Expand Down
9 changes: 6 additions & 3 deletions crates/goose-bench/src/eval_suites/core/search_replace.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric};
use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements};
use crate::register_evaluation;
use crate::work_dir::WorkDir;
use async_trait::async_trait;
Expand Down Expand Up @@ -102,8 +102,11 @@ impl Evaluation for DeveloperSearchReplace {
"developer_search_replace"
}

fn required_extensions(&self) -> Vec<String> {
vec!["developer".to_string()]
fn required_extensions(&self) -> ExtensionRequirements {
ExtensionRequirements {
builtin: vec!["developer".to_string()],
external: Vec::new(),
}
}
}

Expand Down
9 changes: 6 additions & 3 deletions crates/goose-bench/src/eval_suites/core/web_scrape.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
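// Create a new file called test.txt with the content 'Hello, World!'
// NOTE(review): this header looks copied from create_file.rs; this file implements ComputerControllerWebScrape — confirm and update the description.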

use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric};
use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements};
use crate::register_evaluation;
use crate::work_dir::WorkDir;
use async_trait::async_trait;
Expand Down Expand Up @@ -71,8 +71,11 @@ impl Evaluation for ComputerControllerWebScrape {
"computercontroller_web_scrape"
}

fn required_extensions(&self) -> Vec<String> {
vec!["computercontroller".to_string()]
fn required_extensions(&self) -> ExtensionRequirements {
ExtensionRequirements {
builtin: vec!["computercontroller".to_string()],
external: Vec::new(),
}
}
}

Expand Down
13 changes: 11 additions & 2 deletions crates/goose-bench/src/eval_suites/evaluation.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,12 @@ pub enum EvaluationMetric {
Boolean(bool),
}

/// Extensions an evaluation asks for, as returned by `Evaluation::required_extensions`.
///
/// The `Default` value has both lists empty, i.e. no extensions are required.
#[derive(Debug, Default, Clone, PartialEq, Eq)]
pub struct ExtensionRequirements {
    /// Names of built-in extensions, e.g. `"developer"`, `"memory"` or
    /// `"computercontroller"`.
    pub builtin: Vec<String>,
    /// External extensions. Every evaluation visible in this change leaves the
    /// list empty; NOTE(review): the expected string format is not shown here —
    /// confirm against `build_session`.
    pub external: Vec<String>,
}

#[async_trait]
pub trait BenchAgent: Send + Sync {
async fn prompt(&mut self, p: String) -> Result<Vec<Message>>;
Expand All @@ -41,7 +47,10 @@ pub trait Evaluation: Send + Sync {

fn name(&self) -> &str;

fn required_extensions(&self) -> Vec<String> {
Vec::new() // Default implementation returns empty vec
fn required_extensions(&self) -> ExtensionRequirements {
ExtensionRequirements {
builtin: Vec::new(),
external: Vec::new(),
}
}
}
5 changes: 3 additions & 2 deletions crates/goose-cli/src/commands/bench.rs
Original file line number Diff line number Diff line change
Expand Up @@ -82,10 +82,11 @@ async fn run_eval(
let mut result = EvaluationResult::new(evaluation.name().to_string());

if let Ok(work_dir) = work_dir.move_to(format!("./{}", &evaluation.name())) {
let required_extensions = evaluation.required_extensions();
let requirements = evaluation.required_extensions();

// Create session with error capture
let base_session = build_session(None, false, Vec::new(), required_extensions).await;
let base_session =
build_session(None, false, requirements.external, requirements.builtin).await;

let bench_session = Arc::new(Mutex::new(BenchSession::new(base_session)));
let bench_session_clone = bench_session.clone();
Expand Down
Loading