block · ahau-square · Mar 10, 2025 · Feb 27, 2025 · Feb 28, 2025 · Feb 28, 2025
diff --git a/crates/goose-bench/src/assets/squirrel-data.csv b/crates/goose-bench/src/assets/squirrel-data.csv
diff --git a/crates/goose-bench/src/eval_suites/evaluation.rs b/crates/goose-bench/src/eval_suites/evaluation.rs
@@ -35,6 +35,9 @@ pub trait BenchAgent: Send + Sync {
 
     // Make get_errors async
     async fn get_errors(&self) -> Vec<BenchAgentError>;
+
+    // Get token usage information
+    async fn get_token_usage(&self) -> Option<i32>;
 }
 
 #[async_trait]

diff --git a/crates/goose-bench/src/eval_suites/metrics.rs b/crates/goose-bench/src/eval_suites/metrics.rs
@@ -0,0 +1,105 @@
+use crate::eval_suites::{BenchAgent, EvaluationMetric};
+use goose::message::{Message, MessageContent};
+use std::collections::HashMap;
+use std::time::Instant;
+
+/// Runs `prompt` against `agent` and gathers the baseline metrics for that run.
+///
+/// The returned map always contains `prompt_execution_time_seconds` and
+/// `total_tool_calls`, plus one `tool_calls_<name>` entry per tool that was called.
+/// `total_tokens` is added only when the agent reports token usage, and
+/// `prompt_error` only when the prompt call itself fails (in which case the
+/// returned message list is empty).
+pub async fn collect_baseline_metrics(
+    agent: &mut Box<dyn BenchAgent>,
+    prompt: String,
+) -> (Vec<Message>, HashMap<String, EvaluationMetric>) {
+    let mut metrics = HashMap::new();
+
+    // Time the prompt call; a failure is recorded as a metric rather than propagated.
+    let timer = Instant::now();
+    let messages = agent.prompt(prompt).await.unwrap_or_else(|e| {
+        metrics.insert(
+            "prompt_error".to_string(),
+            EvaluationMetric::String(format!("Error: {}", e)),
+        );
+        Vec::new()
+    });
+    let elapsed_secs = timer.elapsed().as_secs_f64();
+    metrics.insert(
+        "prompt_execution_time_seconds".to_string(),
+        EvaluationMetric::Float(elapsed_secs),
+    );
+
+    // Overall tool-call count followed by the per-tool breakdown.
+    let (call_total, calls_per_tool) = count_tool_calls(&messages);
+    metrics.insert(
+        "total_tool_calls".to_string(),
+        EvaluationMetric::Integer(call_total),
+    );
+    metrics.extend(calls_per_tool.into_iter().map(|(name, calls)| {
+        (
+            format!("tool_calls_{}", name),
+            EvaluationMetric::Integer(calls),
+        )
+    }));
+
+    // Token usage is optional; widen the agent's i32 losslessly to i64.
+    if let Some(tokens) = agent.get_token_usage().await {
+        metrics.insert(
+            "total_tokens".to_string(),
+            EvaluationMetric::Integer(i64::from(tokens)),
+        );
+    }
+
+    (messages, metrics)
+}
+
+/// Tallies the tool requests found in `messages`.
+///
+/// Returns the overall number of tool calls together with a per-tool-name
+/// breakdown. Tool requests whose `tool_call` is an `Err` are not counted.
+fn count_tool_calls(messages: &[Message]) -> (i64, HashMap<String, i64>) {
+    let mut per_tool: HashMap<String, i64> = HashMap::new();
+
+    // Flatten every message's content and keep only successfully parsed tool calls.
+    let called_names = messages
+        .iter()
+        .flat_map(|message| message.content.iter())
+        .filter_map(|content| match content {
+            MessageContent::ToolRequest(request) => request.tool_call.as_ref().ok(),
+            _ => None,
+        })
+        .map(|call| call.name.clone());
+
+    for name in called_names {
+        *per_tool.entry(name).or_default() += 1;
+    }
+
+    // Every counted call landed in exactly one bucket, so the total is their sum.
+    let total: i64 = per_tool.values().sum();
+    (total, per_tool)
+}
+
+/// Convert a map of metrics into a `Vec` of `(name, metric)` pairs.
+///
+/// `HashMap` iteration order is arbitrary and can differ between runs, so the
+/// pairs are sorted by metric name to keep benchmark output stable and diffable.
+pub fn metrics_hashmap_to_vec(
+    metrics: HashMap<String, EvaluationMetric>,
+) -> Vec<(String, EvaluationMetric)> {
+    let mut entries: Vec<(String, EvaluationMetric)> = metrics.into_iter().collect();
+    // Keys of a map are unique, so an unstable sort cannot reorder equal elements.
+    entries.sort_unstable_by(|a, b| a.0.cmp(&b.0));
+    entries
+}
+
+/// Reports whether any tool request in `messages` targets a tool whose name
+/// contains `tool_name`.
+///
+/// Matching is by substring, and tool requests whose `tool_call` is an `Err`
+/// never match.
+pub fn used_tool(messages: &[Message], tool_name: &str) -> bool {
+    messages
+        .iter()
+        .flat_map(|msg| msg.content.iter())
+        .any(|content| {
+            matches!(
+                content,
+                MessageContent::ToolRequest(request)
+                    if matches!(
+                        request.tool_call.as_ref(),
+                        Ok(call) if call.name.contains(tool_name)
+                    )
+            )
+        })
+}
diff --git a/crates/goose-bench/src/eval_suites/mod.rs b/crates/goose-bench/src/eval_suites/mod.rs
@@ -1,6 +1,11 @@
 mod core;
 mod evaluation;
 mod factory;
+mod metrics;
+mod utils;
+mod vibes;
 
 pub use evaluation::*;
 pub use factory::{register_evaluation, EvaluationSuiteFactory};
+pub use metrics::*;
+pub use utils::*;
diff --git a/crates/goose-bench/src/eval_suites/utils.rs b/crates/goose-bench/src/eval_suites/utils.rs
@@ -0,0 +1,69 @@
+use crate::bench_work_dir::BenchmarkWorkDir;
+use anyhow::{Context, Result};
+use goose::message::Message;
+use goose::session::storage;
+use std::fs::{self, File};
+use std::io::Write;
+use std::path::PathBuf;
+
+/// Writes the text of the last message in `messages` to `filename`.
+///
+/// `filename` is used as given, so a relative name resolves against the
+/// process's current directory. `_work_dir` is accepted but not used.
+///
+/// # Returns
+/// - Ok(text) with the text that was written
+/// - Err if `messages` is empty, or the file cannot be created or written
+pub fn write_response_to_file(
+    messages: &[Message],
+    _work_dir: &mut BenchmarkWorkDir, // Kept for API compatibility
+    filename: &str,
+) -> Result<String> {
+    let final_message = messages
+        .last()
+        .context("No messages to write to file")?;
+    let body = final_message.as_concat_text();
+
+    let target = PathBuf::from(filename);
+
+    // Create (or truncate) the file, then write the whole body in one call.
+    File::create(&target)
+        .with_context(|| format!("Failed to create file at {}", target.display()))?
+        .write_all(body.as_bytes())
+        .with_context(|| format!("Failed to write content to {}", target.display()))?;
+
+    Ok(body)
+}
+
+/// Copies the most recent session file into the current working directory.
+///
+/// The source is whatever `storage::get_most_recent_session` returns; the copy
+/// keeps the same file name and is placed under `.`. The destination path is
+/// also printed to stdout on success.
+///
+/// # Returns
+/// - Ok(path) with the path of the copy
+/// - Err if no session is found, the source path has no file name, or the copy fails
+pub fn copy_session_to_cwd() -> Result<PathBuf> {
+    let source = storage::get_most_recent_session()
+        .with_context(|| "Failed to find any recent session files")?;
+
+    // Reuse the source's file name for the copy placed in the current directory.
+    let destination = match source.file_name() {
+        Some(name) => PathBuf::from(".").join(name),
+        None => return Err(anyhow::anyhow!("Invalid session filename")),
+    };
+
+    fs::copy(&source, &destination).with_context(|| {
+        format!(
+            "Failed to copy from '{}' to '{}'",
+            source.display(),
+            destination.display()
+        )
+    })?;
+
+    println!("Session file copied to: {}", destination.display());
+
+    Ok(destination)
+}
diff --git a/crates/goose-bench/src/eval_suites/vibes/blog_summary.rs b/crates/goose-bench/src/eval_suites/vibes/blog_summary.rs
@@ -0,0 +1,89 @@
+use crate::bench_work_dir::BenchmarkWorkDir;
+use crate::eval_suites::{
+    collect_baseline_metrics, copy_session_to_cwd, metrics_hashmap_to_vec, write_response_to_file,
+    BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements,
+};
+use crate::register_evaluation;
+use async_trait::async_trait;
+
+/// Evaluation that asks the agent to summarise a blog post as five numbered points.
+pub struct BlogSummary {}
+
+impl BlogSummary {
+    /// Creates the evaluation; it carries no state.
+    pub fn new() -> Self {
+        BlogSummary {}
+    }
+
+    /// Returns `true` when `text` contains list items numbered 1 through 5.
+    ///
+    /// A marker only counts when it begins a line, optionally preceded by
+    /// whitespace or Markdown decoration (`#`, `*`, `_`, `>`), and is not
+    /// immediately followed by another digit. A plain substring search would
+    /// also accept text such as "2025." or "GPT-4.5", which are not list items.
+    fn check_markdown_numbered_list(&self, text: &str) -> bool {
+        (1..=5).all(|n| {
+            let marker = format!("{}.", n);
+            text.lines().any(|line| {
+                let item = line.trim_start_matches(|c: char| {
+                    c.is_whitespace() || matches!(c, '#' | '*' | '_' | '>')
+                });
+                // Reject decimals such as "1.5" that merely start with the marker.
+                matches!(
+                    item.strip_prefix(marker.as_str()),
+                    Some(rest) if !rest.starts_with(|c: char| c.is_ascii_digit())
+                )
+            })
+        })
+    }
+}
+
+#[async_trait]
+impl Evaluation for BlogSummary {
+    /// Prompts the agent for a five-point summary of the blog post and scores the reply.
+    ///
+    /// On top of the baseline metrics, reports `valid_markdown_format` and
+    /// `used_fetch_tool`. Failures to write the output file or to copy the session
+    /// file are printed as warnings and do not fail the evaluation.
+    async fn run(
+        &self,
+        mut agent: Box<dyn BenchAgent>,
+        work_dir: &mut BenchmarkWorkDir,
+    ) -> anyhow::Result<Vec<(String, EvaluationMetric)>> {
+        println!("BlogSummary - run");
+
+        // Run the prompt while gathering execution time, token usage and tool calls.
+        let prompt = "What are the top 5 most counterintuitive insights from this blog post? Format your response in Markdown with 5 numbered points (1. 2. 3. 4. 5.) https://huyenchip.com/2025/01/07/agents.html".to_string();
+        let (response, perf_metrics) = collect_baseline_metrics(&mut agent, prompt).await;
+
+        // Persist the reply; if that fails, fall back to the in-memory text so
+        // the remaining checks can still run.
+        let response_text =
+            write_response_to_file(&response, work_dir, "blog_summary_output.txt")
+                .unwrap_or_else(|e| {
+                    println!("Warning: Failed to write blog summary output: {}", e);
+                    response
+                        .last()
+                        .map(|msg| msg.as_concat_text())
+                        .unwrap_or_default()
+                });
+
+        let mut metrics = metrics_hashmap_to_vec(perf_metrics);
+
+        // Evaluation-specific checks: list formatting and use of the fetch tool.
+        let has_markdown_list = self.check_markdown_numbered_list(&response_text);
+        let used_fetch_tool = crate::eval_suites::used_tool(&response, "fetch");
+        metrics.extend([
+            (
+                "valid_markdown_format".to_string(),
+                EvaluationMetric::Boolean(has_markdown_list),
+            ),
+            (
+                "used_fetch_tool".to_string(),
+                EvaluationMetric::Boolean(used_fetch_tool),
+            ),
+        ]);
+
+        // Keep a copy of the session file next to the other outputs (best effort).
+        match copy_session_to_cwd() {
+            Ok(_) => println!("Successfully copied session file to current directory"),
+            Err(e) => println!("Warning: Failed to copy session file: {}", e),
+        }
+
+        Ok(metrics)
+    }
+
+    /// Name of this evaluation.
+    fn name(&self) -> &str {
+        "blog_summary"
+    }
+
+    /// Extensions requested for this evaluation: the builtin `developer`
+    /// extension and the external `uvx mcp-server-fetch` command.
+    fn required_extensions(&self) -> ExtensionRequirements {
+        let builtin = vec![String::from("developer")];
+        let external = vec![String::from("uvx mcp-server-fetch")];
+        ExtensionRequirements { builtin, external }
+    }
+}
+
+// Presumably registers `BlogSummary` under the "vibes" suite; the exact effect is
+// defined by the `register_evaluation!` macro, which is not shown here — confirm.
+register_evaluation!("vibes", BlogSummary);