Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions crates/goose-bench/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ tracing-subscriber = { version = "0.3", features = ["registry"] }
tokio = { version = "1.0", features = ["full"] }
include_dir = "0.7.4"
once_cell = "1.19"
regex = "1.11.1"

[target.'cfg(target_os = "windows")'.dependencies]
winapi = { version = "0.3", features = ["wincred"] }
20 changes: 1 addition & 19 deletions crates/goose-bench/src/bench_work_dir.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,6 @@ pub struct BenchmarkWorkDir {
run_dir: PathBuf,
cwd: PathBuf,
run_name: String,
suite: Option<String>,
eval: Option<String>,
}

impl Default for BenchmarkWorkDir {
Expand Down Expand Up @@ -59,8 +57,6 @@ impl BenchmarkWorkDir {
run_dir,
cwd: base_path.clone(),
run_name,
suite: None,
eval: None,
}
}
fn copy_auto_included_dirs(dest: &Path) {
Expand All @@ -77,24 +73,10 @@ impl BenchmarkWorkDir {
self.cwd = path;
Ok(self)
}
pub fn set_suite(&mut self, suite: &str) {
self.eval = None;
self.suite = Some(suite.to_string());

let mut suite_dir = self.base_path.clone();
suite_dir.push(self.run_name.clone());
suite_dir.push(suite);

self.cd(suite_dir.clone()).unwrap_or_else(|_| {
panic!("Failed to execute cd into {}", suite_dir.clone().display())
});
}
pub fn set_eval(&mut self, eval: &str) {
self.eval = Some(eval.to_string());

let eval = eval.replace(":", std::path::MAIN_SEPARATOR_STR);
let mut eval_dir = self.base_path.clone();
eval_dir.push(self.run_name.clone());
eval_dir.push(self.suite.clone().unwrap());
eval_dir.push(eval);

self.cd(eval_dir.clone())
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
// computer controller extension evals
mod script;
mod web_scrape;
Original file line number Diff line number Diff line change
Expand Up @@ -81,4 +81,4 @@ impl Evaluation for ComputerControllerScript {
}
}

register_evaluation!("computercontroller", ComputerControllerScript);
register_evaluation!(ComputerControllerScript);
Original file line number Diff line number Diff line change
Expand Up @@ -84,4 +84,4 @@ impl Evaluation for ComputerControllerWebScrape {
}
}

register_evaluation!("computercontroller", ComputerControllerWebScrape);
register_evaluation!(ComputerControllerWebScrape);
Original file line number Diff line number Diff line change
Expand Up @@ -124,4 +124,4 @@ impl Evaluation for DeveloperCreateFile {
}
}

register_evaluation!("developer", DeveloperCreateFile);
register_evaluation!(DeveloperCreateFile);
Original file line number Diff line number Diff line change
Expand Up @@ -85,4 +85,4 @@ impl Evaluation for DeveloperListFiles {
}
}

register_evaluation!("developer", DeveloperListFiles);
register_evaluation!(DeveloperListFiles);
3 changes: 3 additions & 0 deletions crates/goose-bench/src/eval_suites/core/developer/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
// developer extension evals
mod create_file;
mod list_files;
Original file line number Diff line number Diff line change
Expand Up @@ -102,4 +102,4 @@ impl Evaluation for DeveloperImage {
}
}

register_evaluation!("developer_image", DeveloperImage);
register_evaluation!(DeveloperImage);
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
mod image;
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
mod search_replace;
Original file line number Diff line number Diff line change
Expand Up @@ -106,4 +106,4 @@ impl Evaluation for DeveloperSearchReplace {
}
}

register_evaluation!("developer_search_replace", DeveloperSearchReplace);
register_evaluation!(DeveloperSearchReplace);
2 changes: 1 addition & 1 deletion crates/goose-bench/src/eval_suites/core/example.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,4 +43,4 @@ impl Evaluation for ExampleEval {
}
}

register_evaluation!("core", ExampleEval);
register_evaluation!(ExampleEval);
2 changes: 2 additions & 0 deletions crates/goose-bench/src/eval_suites/core/memory/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
// memory extension evals
mod save_fact;
Original file line number Diff line number Diff line change
Expand Up @@ -86,4 +86,4 @@ impl Evaluation for MemoryRememberMemory {
}
}

register_evaluation!("memory", MemoryRememberMemory);
register_evaluation!(MemoryRememberMemory);
15 changes: 5 additions & 10 deletions crates/goose-bench/src/eval_suites/core/mod.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,6 @@
mod computercontroller;
mod developer;
mod developer_image;
mod developer_search_replace;
mod example;
// developer extension evals
mod create_file;
mod image;
mod list_files;
mod search_replace;
// computer controller extension evals
mod script;
mod web_scrape;
// memory extension evals
mod save_fact;
mod memory;
115 changes: 90 additions & 25 deletions crates/goose-bench/src/eval_suites/factory.rs
Original file line number Diff line number Diff line change
@@ -1,62 +1,127 @@
pub use super::Evaluation;
use regex::Regex;
use std::borrow::Cow;
use std::collections::HashMap;
use std::sync::{OnceLock, RwLock};

type EvaluationConstructor = fn() -> Box<dyn Evaluation>;
type Registry = &'static RwLock<HashMap<&'static str, EvaluationConstructor>>;

// Use std::sync::RwLock for interior mutability
static EVALUATION_REGISTRY: OnceLock<RwLock<HashMap<&'static str, Vec<EvaluationConstructor>>>> =
static EVAL_REGISTRY: OnceLock<RwLock<HashMap<&'static str, EvaluationConstructor>>> =
OnceLock::new();

/// Initialize the registry if it hasn't been initialized
fn registry() -> &'static RwLock<HashMap<&'static str, Vec<EvaluationConstructor>>> {
EVALUATION_REGISTRY.get_or_init(|| RwLock::new(HashMap::new()))
fn eval_registry() -> Registry {
EVAL_REGISTRY.get_or_init(|| RwLock::new(HashMap::new()))
}

/// Register a new evaluation version
pub fn register_evaluation(suite_name: &'static str, constructor: fn() -> Box<dyn Evaluation>) {
let registry = registry();
pub fn register_eval(selector: &'static str, constructor: fn() -> Box<dyn Evaluation>) {
let registry = eval_registry();
if let Ok(mut map) = registry.write() {
map.entry(suite_name)
.or_insert_with(Vec::new)
.push(constructor);
map.insert(selector, constructor);
}
}

pub struct EvaluationSuiteFactory;
pub struct EvaluationSuite;

impl EvaluationSuiteFactory {
pub fn create(suite_name: &str) -> Option<Vec<Box<dyn Evaluation>>> {
let registry = registry();
impl EvaluationSuite {
pub fn from(selector: &str) -> Option<Box<dyn Evaluation>> {
let registry = eval_registry();
let map = registry
.read()
.expect("Failed to read the benchmark evaluation registry.");

let constructors = map.get(suite_name)?;
let instances = constructors
.iter()
.map(|&constructor| constructor())
.collect::<Vec<_>>();
let constructor = map.get(selector)?;
let instance = constructor();

Some(instances)
Some(instance)
}

pub fn available_evaluations() -> Vec<&'static str> {
registry()
pub fn registered_evals() -> Vec<&'static str> {
let registry = eval_registry();
let map = registry
.read()
.map(|map| map.keys().copied().collect())
.unwrap_or_default()
.expect("Failed to read the benchmark evaluation registry.");

let evals: Vec<_> = map.keys().copied().collect();
evals
}
/// Groups the registered evals matched by `selectors` under their suite.
///
/// The suite key is the eval's selector with its trailing `:name` segment removed.
/// An empty `selectors` list selects every registered eval.
pub fn select(selectors: Vec<String>) -> HashMap<String, Vec<&'static str>> {
    let trailing_name = Regex::new(r":\w+$").unwrap();
    let select_all = selectors.is_empty();

    let mut by_suite: HashMap<String, Vec<&'static str>> = HashMap::new();
    for eval in EvaluationSuite::registered_evals() {
        if !select_all && !matches_any_selectors(eval, &selectors) {
            continue;
        }
        // Strip the final `:name` to obtain the suite this eval belongs to.
        let suite_key: String = match trailing_name.replace(eval, "") {
            Cow::Owned(stripped) => stripped,
            Cow::Borrowed(unchanged) => unchanged.to_owned(),
        };
        by_suite.entry(suite_key).or_default().push(eval);
    }
    by_suite
}

/// Counts how many registered evals fall under each selector prefix.
///
/// A registered eval `a:b:c` contributes one to each of `a`, `a:b` and `a:b:c`.
pub fn available_selectors() -> HashMap<String, usize> {
    let mut tally: HashMap<String, usize> = HashMap::new();
    for eval in EvaluationSuite::registered_evals() {
        // Every `:` terminates one prefix; the end of the string terminates the last.
        let prefix_ends = eval
            .match_indices(':')
            .map(|(idx, _)| idx)
            .chain(std::iter::once(eval.len()));
        for end in prefix_ends {
            *tally.entry(eval[..end].to_string()).or_default() += 1;
        }
    }
    tally
}
}

/// Returns `true` when `eval` falls under at least one of `selectors`.
///
/// Selectors are `:`-separated paths such as `core:developer`. A selector matches when it
/// equals `eval` itself or one of its ancestors on a whole-segment boundary, so `core:dev`
/// does not match `core:developer:create_file`.
///
/// An empty `eval` or an empty `selectors` slice never matches.
fn matches_any_selectors(eval: &str, selectors: &[String]) -> bool {
    selectors.iter().any(|selector| {
        // Restart from the full `eval` for every selector and walk up one nesting level at a
        // time by dropping the trailing `:segment`, whatever characters that segment contains.
        std::iter::successors(Some(eval), |&level| {
            level.rsplit_once(':').map(|(parent, _)| parent)
        })
        // An empty level (empty `eval`, or a leading `:`) ends the walk without matching.
        .take_while(|level| !level.is_empty())
        .any(|level| level == selector.as_str())
    })
}

#[macro_export]
macro_rules! register_evaluation {
($suite_name:expr, $evaluation_type:ty) => {
($evaluation_type:ty) => {
paste::paste! {
#[ctor::ctor]
#[allow(non_snake_case)]
fn [<__register_evaluation_ $suite_name>]() {
$crate::eval_suites::factory::register_evaluation($suite_name, || {
fn [<__register_evaluation_ $evaluation_type>]() {
let mut path = std::path::PathBuf::from(file!());
path.set_extension("");
let eval_suites_dir = "eval_suites";
let eval_selector = {
let s = path.components()
.skip_while(|comp| comp.as_os_str() != eval_suites_dir)
.skip(1)
.map(|comp| comp.as_os_str().to_string_lossy().to_string())
.collect::<Vec<_>>()
.join(":");
Box::leak(s.into_boxed_str())
};

$crate::eval_suites::factory::register_eval(eval_selector, || {
Box::new(<$evaluation_type>::new())
});
}
Expand Down
2 changes: 1 addition & 1 deletion crates/goose-bench/src/eval_suites/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,6 @@ mod utils;
mod vibes;

pub use evaluation::*;
pub use factory::{register_evaluation, EvaluationSuiteFactory};
pub use factory::{register_eval, EvaluationSuite};
pub use metrics::*;
pub use utils::*;
2 changes: 1 addition & 1 deletion crates/goose-bench/src/eval_suites/vibes/blog_summary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -86,4 +86,4 @@ impl Evaluation for BlogSummary {
}
}

register_evaluation!("vibes", BlogSummary);
register_evaluation!(BlogSummary);
2 changes: 1 addition & 1 deletion crates/goose-bench/src/eval_suites/vibes/flappy_bird.rs
Original file line number Diff line number Diff line change
Expand Up @@ -118,4 +118,4 @@ impl Evaluation for FlappyBird {
}
}

register_evaluation!("vibes", FlappyBird);
register_evaluation!(FlappyBird);
2 changes: 1 addition & 1 deletion crates/goose-bench/src/eval_suites/vibes/goose_wiki.rs
Original file line number Diff line number Diff line change
Expand Up @@ -96,4 +96,4 @@ impl Evaluation for GooseWiki {
}
}

register_evaluation!("vibes", GooseWiki);
register_evaluation!(GooseWiki);
Original file line number Diff line number Diff line change
Expand Up @@ -106,4 +106,4 @@ Present the information in order of significance or quality. Focus specifically
}
}

register_evaluation!("vibes", RestaurantResearch);
register_evaluation!(RestaurantResearch);
Original file line number Diff line number Diff line change
Expand Up @@ -174,4 +174,4 @@ After writing the script, run it using python3 and show the results. Do not ask
}
}

register_evaluation!("vibes", SquirrelCensus);
register_evaluation!(SquirrelCensus);
Loading
Loading