diff --git a/crates/goose-bench/src/bench_work_dir.rs b/crates/goose-bench/src/bench_work_dir.rs index 4d6251ca2998..81d194d93b60 100644 --- a/crates/goose-bench/src/bench_work_dir.rs +++ b/crates/goose-bench/src/bench_work_dir.rs @@ -11,6 +11,7 @@ pub static BUILTIN_EVAL_ASSETS: Dir = include_dir!("$CARGO_MANIFEST_DIR/src/asse pub struct BenchmarkWorkDir { pub base_path: PathBuf, + run_dir: PathBuf, cwd: PathBuf, run_name: String, suite: Option, @@ -24,6 +25,7 @@ impl Default for BenchmarkWorkDir { } impl BenchmarkWorkDir { pub fn new(work_dir_name: String, include_dirs: Vec) -> Self { + let run_dir = std::env::current_dir().unwrap().canonicalize().unwrap(); let base_path = PathBuf::from(format!("./benchmark-{}", work_dir_name)); fs::create_dir_all(&base_path).unwrap(); @@ -54,6 +56,7 @@ impl BenchmarkWorkDir { BenchmarkWorkDir { base_path: base_path.clone(), + run_dir, cwd: base_path.clone(), run_name, suite: None, @@ -178,3 +181,9 @@ impl BenchmarkWorkDir { } } } + +impl Drop for BenchmarkWorkDir { + fn drop(&mut self) { + std::env::set_current_dir(&self.run_dir).unwrap(); + } +} diff --git a/crates/goose-bench/src/eval_suites/core/example.rs b/crates/goose-bench/src/eval_suites/core/example.rs index 68696b471d34..d05d23e469c1 100644 --- a/crates/goose-bench/src/eval_suites/core/example.rs +++ b/crates/goose-bench/src/eval_suites/core/example.rs @@ -20,15 +20,17 @@ impl Evaluation for ExampleEval { _work_dir: &mut BenchmarkWorkDir, ) -> anyhow::Result> { println!("ExampleEval - run"); - // let f = work_dir.fs_get(String::from("./arbitrary_dir/arbitrary_file.txt"))?; - // let _contents = fs::read_to_string(f)?; let mut metrics = Vec::new(); + let _ = agent.prompt("What can you do?".to_string()).await; + metrics.push(( "example_metric".to_string(), EvaluationMetric::Boolean(true), )); + metrics.push(("example_count".to_string(), EvaluationMetric::Integer(42))); + Ok(metrics) } diff --git a/crates/goose-cli/src/commands/bench.rs b/crates/goose-cli/src/commands/bench.rs index 4c96f74d5fe2..7bd569c2ab8c 100644 --- a/crates/goose-cli/src/commands/bench.rs +++ b/crates/goose-cli/src/commands/bench.rs @@ -134,13 +134,13 @@ async fn run_eval( async fn run_suite(suite: &str, work_dir: &mut BenchmarkWorkDir) -> anyhow::Result { let mut suite_result = SuiteResult::new(suite.to_string()); - let eval_lock = Mutex::new(()); + let eval_work_dir_guard = Mutex::new(work_dir); if let Some(evals) = EvaluationSuiteFactory::create(suite) { for eval in evals { - let _unused = eval_lock.lock().await; - work_dir.set_eval(eval.name()); - let eval_result = run_eval(eval, work_dir).await?; + let mut eval_work_dir = eval_work_dir_guard.lock().await; + eval_work_dir.set_eval(eval.name()); + let eval_result = run_eval(eval, &mut eval_work_dir).await?; suite_result.add_evaluation(eval_result); } } @@ -167,13 +167,13 @@ pub async fn run_benchmark( let mut results = BenchmarkResults::new(provider_name.clone()); - let mut work_dir = BenchmarkWorkDir::new( + let suite_work_dir = Mutex::new(BenchmarkWorkDir::new( format!("{}-{}", provider_name, goose_model), include_dirs.clone(), - ); - let suite_lock = Mutex::new(()); + )); + for suite in suites { - let _unused = suite_lock.lock().await; + let mut work_dir = suite_work_dir.lock().await; work_dir.set_suite(suite); let suite_result = run_suite(suite, &mut work_dir).await?; results.add_suite(suite_result);