Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
41a9daa
ollama model + structured output shim
alicehau Feb 27, 2025
92d5443
tool shim generic
alicehau Feb 28, 2025
863a133
refactoring toolshim
alicehau Feb 28, 2025
20e3810
refactor
alicehau Feb 28, 2025
f93d5c5
ollama toolshim ready
alicehau Feb 28, 2025
78a3fa8
clean up
alicehau Feb 28, 2025
da64947
lint
alicehau Mar 4, 2025
24d8f69
format
alicehau Mar 4, 2025
0387207
add toolshim to openai provider
alicehau Mar 4, 2025
10966a1
update toolshim prompt:
Mar 5, 2025
b0984c9
prompt update
alicehau Mar 5, 2025
e81af6d
refactor
Mar 7, 2025
fced390
undo change to reference agent
Mar 7, 2025
80ebac7
undo ref agent changes
Mar 10, 2025
7025113
refactor config
Mar 10, 2025
a17a4da
refactor
Mar 10, 2025
152bbe2
check env vars
Mar 10, 2025
b01091b
fix tests
Mar 10, 2025
aaae84b
clean up
Mar 10, 2025
8b21c75
add small model evals
Mar 6, 2025
d4743f8
add initial evals
Mar 6, 2025
6899fe1
updated suite assignments
Mar 6, 2025
2ca7030
rebase
Mar 7, 2025
ed701ae
update external extensions str
Mar 7, 2025
1cbb48f
add time and tools metrics and write to file utils
alicehau Mar 10, 2025
cff52a8
add copying of session file into workdir
alicehau Mar 10, 2025
1fee63a
add metric to check tool use
alicehau Mar 10, 2025
b4c236b
rename suite
alicehau Mar 10, 2025
595beca
update name, account for non bool metrics
alicehau Mar 10, 2025
85d9bc3
format
alicehau Mar 10, 2025
90849cd
update benchmark script to allow toolshim
alicehau Mar 10, 2025
da3b4a4
remove changes from toolshim branch
Mar 10, 2025
53de867
add back token usage mods
Mar 10, 2025
a5b42b9
instruct flappy bird to use pyenv
Mar 10, 2025
59666fc
remove toolshim from this PR
Mar 10, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions crates/goose-bench/src/assets/squirrel-data.csv

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions crates/goose-bench/src/eval_suites/evaluation.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,9 @@ pub trait BenchAgent: Send + Sync {

// Make get_errors async
async fn get_errors(&self) -> Vec<BenchAgentError>;

// Get token usage information
async fn get_token_usage(&self) -> Option<i32>;
}

#[async_trait]
Expand Down
105 changes: 105 additions & 0 deletions crates/goose-bench/src/eval_suites/metrics.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
use crate::eval_suites::{BenchAgent, EvaluationMetric};
use goose::message::{Message, MessageContent};
use std::collections::HashMap;
use std::time::Instant;

/// Collect baseline metrics including execution time, tool usage, and token count
pub async fn collect_baseline_metrics(
agent: &mut Box<dyn BenchAgent>,
prompt: String,
) -> (Vec<Message>, HashMap<String, EvaluationMetric>) {
// Initialize metrics map
let mut metrics = HashMap::new();

// Start timer
let start_time = Instant::now();

// Execute prompt
let messages = match agent.prompt(prompt).await {
Ok(msgs) => msgs,
Err(e) => {
metrics.insert(
"prompt_error".to_string(),
EvaluationMetric::String(format!("Error: {}", e)),
);
Vec::new()
}
};

// Calculate execution time
let execution_time = start_time.elapsed();
metrics.insert(
"prompt_execution_time_seconds".to_string(),
EvaluationMetric::Float(execution_time.as_secs_f64()),
);

// Count tool calls
let (total_tool_calls, tool_calls_by_name) = count_tool_calls(&messages);
metrics.insert(
"total_tool_calls".to_string(),
EvaluationMetric::Integer(total_tool_calls),
);

// Add tool calls by name metrics
for (tool_name, count) in tool_calls_by_name {
metrics.insert(
format!("tool_calls_{}", tool_name),
EvaluationMetric::Integer(count),
);
}

// Get token usage information if available
if let Some(token_count) = agent.get_token_usage().await {
metrics.insert(
"total_tokens".to_string(),
EvaluationMetric::Integer(token_count as i64),
);
}

(messages, metrics)
}

/// Tally the tool requests found in `messages`.
///
/// Only `MessageContent::ToolRequest` items whose `tool_call` is `Ok` are
/// counted; requests carrying an error are ignored.
///
/// # Returns
/// The overall number of counted calls and a map from tool name to the
/// number of calls made to that tool.
fn count_tool_calls(messages: &[Message]) -> (i64, HashMap<String, i64>) {
    let mut per_tool: HashMap<String, i64> = HashMap::new();

    // Flatten every message's content and keep only well-formed tool calls.
    let valid_calls = messages
        .iter()
        .flat_map(|message| message.content.iter())
        .filter_map(|item| match item {
            MessageContent::ToolRequest(request) => request.tool_call.as_ref().ok(),
            _ => None,
        });

    for call in valid_calls {
        *per_tool.entry(call.name.clone()).or_insert(0) += 1;
    }

    // Every counted call lands in exactly one bucket, so the buckets sum to the total.
    let overall: i64 = per_tool.values().sum();

    (overall, per_tool)
}

/// Convert a map of metrics into a vector of `(name, metric)` pairs.
///
/// `HashMap` iteration order is arbitrary and can differ between runs, so
/// the pairs are sorted by metric name to make the output reproducible.
///
/// # Returns
/// All entries of `metrics`, ordered ascending by key.
pub fn metrics_hashmap_to_vec(
    metrics: HashMap<String, EvaluationMetric>,
) -> Vec<(String, EvaluationMetric)> {
    let mut entries: Vec<(String, EvaluationMetric)> = metrics.into_iter().collect();
    // Map keys are unique, so an unstable sort cannot reorder equal elements.
    entries.sort_unstable_by(|left, right| left.0.cmp(&right.0));
    entries
}

/// Report whether any message contains a successful request for a tool
/// whose name contains `tool_name`.
///
/// Matching is by substring (`str::contains`), not equality. Tool requests
/// whose `tool_call` is an error never match.
pub fn used_tool(messages: &[Message], tool_name: &str) -> bool {
    messages
        .iter()
        .flat_map(|message| message.content.iter())
        .filter_map(|item| match item {
            MessageContent::ToolRequest(request) => request.tool_call.as_ref().ok(),
            _ => None,
        })
        .any(|call| call.name.contains(tool_name))
}
5 changes: 5 additions & 0 deletions crates/goose-bench/src/eval_suites/mod.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
mod core;
mod evaluation;
mod factory;
mod metrics;
mod utils;
mod vibes;

pub use evaluation::*;
pub use factory::{register_evaluation, EvaluationSuiteFactory};
pub use metrics::*;
pub use utils::*;
69 changes: 69 additions & 0 deletions crates/goose-bench/src/eval_suites/utils.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
use crate::bench_work_dir::BenchmarkWorkDir;
use anyhow::{Context, Result};
use goose::message::Message;
use goose::session::storage;
use std::fs::{self, File};
use std::io::Write;
use std::path::PathBuf;

/// Write the text of the final message in `messages` to `filename`.
///
/// The text is whatever `as_concat_text()` yields for the last message.
/// `filename` is used as the path as given, so a relative name resolves
/// against the process's current directory. `_work_dir` is not used; it is
/// kept so existing callers keep compiling.
///
/// # Returns
/// The text that was written.
///
/// # Errors
/// Fails when `messages` is empty, or when the file cannot be created or
/// written.
pub fn write_response_to_file(
    messages: &[Message],
    _work_dir: &mut BenchmarkWorkDir, // Kept for API compatibility
    filename: &str,
) -> Result<String> {
    let final_message = match messages.last() {
        Some(message) => message,
        None => return Err(anyhow::anyhow!("No messages to write to file")),
    };
    let body = final_message.as_concat_text();

    let target = PathBuf::from(filename);

    // Creation and writing are kept separate so each failure carries its own context.
    let mut handle = File::create(&target)
        .with_context(|| format!("Failed to create file at {}", target.display()))?;
    handle
        .write_all(body.as_bytes())
        .with_context(|| format!("Failed to write content to {}", target.display()))?;

    Ok(body)
}

/// Copy the most recent session file into the current working directory.
///
/// The source is the path returned by `storage::get_most_recent_session()`.
/// The copy keeps the source's file name and is placed under `.`. On
/// success a confirmation line is printed to stdout.
///
/// # Returns
/// The path of the copied file (relative, starting with `.`).
///
/// # Errors
/// Fails when no recent session can be found, when the session path has no
/// file name component, or when the copy itself fails.
pub fn copy_session_to_cwd() -> Result<PathBuf> {
    let source = storage::get_most_recent_session()
        .context("Failed to find any recent session files")?;

    // Reuse the session's own file name for the copy.
    let session_name = match source.file_name() {
        Some(name) => name,
        None => return Err(anyhow::anyhow!("Invalid session filename")),
    };

    let mut destination = PathBuf::from(".");
    destination.push(session_name);

    fs::copy(&source, &destination).with_context(|| {
        format!(
            "Failed to copy from '{}' to '{}'",
            source.display(),
            destination.display()
        )
    })?;

    println!("Session file copied to: {}", destination.display());

    Ok(destination)
}
89 changes: 89 additions & 0 deletions crates/goose-bench/src/eval_suites/vibes/blog_summary.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
use crate::bench_work_dir::BenchmarkWorkDir;
use crate::eval_suites::{
collect_baseline_metrics, copy_session_to_cwd, metrics_hashmap_to_vec, write_response_to_file,
BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements,
};
use crate::register_evaluation;
use async_trait::async_trait;

/// Evaluation that asks the agent to summarise a blog post as a five-item
/// Markdown numbered list.
pub struct BlogSummary {}

impl BlogSummary {
    /// Create the evaluation; it carries no state.
    pub fn new() -> Self {
        BlogSummary {}
    }

    /// Check that `text` contains list items numbered 1 through 5.
    ///
    /// An item `N` counts only when some line, after skipping leading
    /// whitespace and common Markdown decoration (`#`, `*`, `_`, `>`, `-`,
    /// `+`), starts with `N.` and the character after the dot is not a
    /// digit. A plain substring search would also accept text such as
    /// `11.`, `2021.` or `1.5`, which are not list markers.
    fn check_markdown_numbered_list(&self, text: &str) -> bool {
        // Characters that may legitimately precede the number on a line,
        // e.g. "### 1. Title", "**2. Title**" or "- 3. item".
        const DECORATION: &[char] = &['#', '*', '_', '>', '-', '+'];

        (1..=5).all(|n| {
            let marker = format!("{}.", n);
            text.lines().any(|line| {
                let stripped = line
                    .trim_start_matches(|c: char| c.is_whitespace() || DECORATION.contains(&c));
                matches!(
                    stripped.strip_prefix(marker.as_str()),
                    // Reject decimals such as "1.5" that merely begin with "1.".
                    Some(rest) if !rest.starts_with(|c: char| c.is_ascii_digit())
                )
            })
        })
    }
}

#[async_trait]
impl Evaluation for BlogSummary {
    /// Prompt the agent for a five-point blog summary and score the result.
    ///
    /// On top of the baseline metrics, two booleans are reported:
    /// `valid_markdown_format` (the numbered-list check on the response text)
    /// and `used_fetch_tool` (whether a tool whose name contains "fetch" was
    /// requested). Failures to write the response file or to copy the
    /// session file are printed as warnings and do not fail the evaluation.
    async fn run(
        &self,
        mut agent: Box<dyn BenchAgent>,
        work_dir: &mut BenchmarkWorkDir,
    ) -> anyhow::Result<Vec<(String, EvaluationMetric)>> {
        println!("BlogSummary - run");

        // Run the prompt while gathering timing, token and tool-call metrics.
        let prompt = "What are the top 5 most counterintuitive insights from this blog post? Format your response in Markdown with 5 numbered points (1. 2. 3. 4. 5.) https://huyenchip.com/2025/01/07/agents.html".to_string();
        let (messages, baseline) = collect_baseline_metrics(&mut agent, prompt).await;

        // Persist the final message; if that fails, fall back to reading the
        // text straight from the message so scoring can still proceed.
        let summary_text =
            write_response_to_file(&messages, work_dir, "blog_summary_output.txt")
                .unwrap_or_else(|err| {
                    println!("Warning: Failed to write blog summary output: {}", err);
                    messages
                        .last()
                        .map(|msg| msg.as_concat_text())
                        .unwrap_or_default()
                });

        let mut metrics = metrics_hashmap_to_vec(baseline);

        // Did the response follow the requested numbered-list layout?
        let list_ok = self.check_markdown_numbered_list(&summary_text);
        metrics.push((
            "valid_markdown_format".to_string(),
            EvaluationMetric::Boolean(list_ok),
        ));

        // Did the agent request the fetch tool at any point?
        let fetched = crate::eval_suites::used_tool(&messages, "fetch");
        metrics.push((
            "used_fetch_tool".to_string(),
            EvaluationMetric::Boolean(fetched),
        ));

        // Keep a copy of the session log alongside the other outputs.
        match copy_session_to_cwd() {
            Ok(_) => println!("Successfully copied session file to current directory"),
            Err(err) => println!("Warning: Failed to copy session file: {}", err),
        }

        Ok(metrics)
    }

    /// Identifier of this evaluation.
    fn name(&self) -> &str {
        "blog_summary"
    }

    /// Extensions this evaluation declares: the builtin `developer`
    /// extension and the external `uvx mcp-server-fetch` command.
    fn required_extensions(&self) -> ExtensionRequirements {
        let builtin = vec!["developer".to_string()];
        let external = vec!["uvx mcp-server-fetch".to_string()];
        ExtensionRequirements { builtin, external }
    }
}

// Hand `BlogSummary` to `register_evaluation!` under the "vibes" label.
// NOTE(review): the macro is defined elsewhere in the crate; presumably it
// adds the evaluation to the suite registry — confirm against its definition.
register_evaluation!("vibes", BlogSummary);
Loading
Loading