1 change: 1 addition & 0 deletions crates/goose-cli/Cargo.toml
@@ -72,6 +72,7 @@ winapi = { version = "0.3", features = ["wincred"] }
[features]
default = ["code-mode"]
code-mode = ["goose/code-mode", "goose-acp/code-mode"]
cuda = ["goose/cuda"]
# disables the update command
disable-update = []

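The new cuda flag only forwards to the goose crate's cuda feature, so GPU support for local inference is toggled at build time. As a minimal sketch, assuming nothing beyond cargo's standard feature flag and an installed CUDA toolkit is required:

    # Build goose-cli with the CUDA-enabled local inference backend
    cargo build --release -p goose-cli --features cuda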
@@ -359,10 +359,19 @@ pub(super) fn generate_with_emulated_tools(
ctx: &mut GenerationContext<'_>,
code_mode_enabled: bool,
) -> Result<(), ProviderError> {
// Use oaicompat variant — its C++ wrapper catches exceptions that would
// otherwise abort the process when other native libs disturb the C++ ABI.
let prompt = ctx
.loaded
.model
.apply_chat_template(&ctx.loaded.template, ctx.chat_messages, true)
.apply_chat_template_with_tools_oaicompat(
&ctx.loaded.template,
ctx.chat_messages,
None, // no tools for emulated path
None, // no json_schema
true, // add_generation_prompt
)
.map(|r| r.prompt)
.map_err(|e| {
ProviderError::ExecutionError(format!("Failed to apply chat template: {}", e))
})?;
69 changes: 18 additions & 51 deletions crates/goose/tests/local_inference_integration.rs
@@ -1,20 +1,30 @@
//! Integration tests for LocalInferenceProvider.
//!
//! These tests require a downloaded GGUF model and are ignored by default.
//! Run with: cargo test -p goose --test local_inference_integration -- --ignored
//! Download a model first:
//! goose local-models download bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M
//!
//! Run with the default model:
//! cargo test -p goose --test local_inference_integration -- --ignored
//!
//! Run with a specific model:
//! TEST_MODEL="bartowski/Qwen_Qwen3-32B-GGUF:Q4_K_M" cargo test -p goose --test local_inference_integration -- --ignored

use futures::StreamExt;
use goose::conversation::message::Message;
use goose::model::ModelConfig;
use goose::providers::create;
use std::time::Instant;

const TEST_MODEL: &str = "llama-3.2-1b";
const DEFAULT_TEST_MODEL: &str = "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M";

fn test_model() -> String {
std::env::var("TEST_MODEL").unwrap_or_else(|_| DEFAULT_TEST_MODEL.to_string())
}

#[tokio::test]
#[ignore]
async fn test_local_inference_stream_produces_output() {
let model_config = ModelConfig::new(TEST_MODEL).expect("valid model config");
let model_config = ModelConfig::new(&test_model()).expect("valid model config");
let provider = create("local", model_config.clone(), Vec::new())
.await
.expect("provider creation should succeed");
@@ -53,55 +63,12 @@ async fn test_local_inference_stream_produces_output() {
assert!(got_usage, "stream should produce usage info");
}

#[tokio::test]
#[ignore]
async fn test_local_inference_cold_and_warm_performance() {
let model_config = ModelConfig::new(TEST_MODEL).expect("valid model config");
let provider = create("local", model_config.clone(), Vec::new())
.await
.expect("provider creation should succeed");

// Cold start (includes model loading)
let messages = vec![Message::user().with_text("what is the capital of Moldova?")];
let start = Instant::now();
let (response, _usage) = provider
.complete(&model_config, "test-session", "", &messages, &[])
.await
.expect("cold completion should succeed");
let cold_elapsed = start.elapsed();

let text = response.as_concat_text();
assert!(!text.is_empty(), "cold start should produce a response");
println!(
"Cold start: {cold_elapsed:.2?}, response length: {}",
text.len()
);

// Warm run (model already loaded)
let messages2 = vec![Message::user().with_text("what is the capital of France?")];
let start2 = Instant::now();
let (response2, _usage2) = provider
.complete(&model_config, "test-session", "", &messages2, &[])
.await
.expect("warm completion should succeed");
let warm_elapsed = start2.elapsed();

let text2 = response2.as_concat_text();
assert!(!text2.is_empty(), "warm run should produce a response");
println!(
"Warm run: {warm_elapsed:.2?}, response length: {}",
text2.len()
);
assert!(
warm_elapsed < cold_elapsed,
"warm run ({warm_elapsed:.2?}) should be faster than cold start ({cold_elapsed:.2?})"
);
}

#[tokio::test]
#[ignore]
async fn test_local_inference_large_prompt() {
let model_config = ModelConfig::new(TEST_MODEL).expect("valid model config");
let model_config = ModelConfig::new(&test_model())
.expect("valid model config")
.with_max_tokens(Some(20));
let provider = create("local", model_config.clone(), Vec::new())
.await
.expect("provider creation should succeed");
@@ -111,7 +78,7 @@ async fn test_local_inference_large_prompt() {
let prompt = format!("{padding}\nNow answer this: what is the capital of Moldova?");
let messages = vec![Message::user().with_text(&prompt)];

let start = Instant::now();
let start = std::time::Instant::now();
let (response, _usage) = provider
.complete(&model_config, "test-session", "", &messages, &[])
.await
66 changes: 66 additions & 0 deletions crates/goose/tests/local_inference_perf.rs
@@ -0,0 +1,66 @@
//! Performance benchmarks for LocalInferenceProvider.
//!
//! These tests require a downloaded GGUF model and are ignored by default.
//! Download a model first:
//! goose local-models download bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M
//!
//! Run with the default model:
//! cargo test -p goose --test local_inference_perf -- --ignored --nocapture
//!
//! Run with a specific model:
//! TEST_MODEL="bartowski/Qwen_Qwen3-32B-GGUF:Q4_K_M" cargo test -p goose --test local_inference_perf -- --ignored --nocapture

use goose::conversation::message::Message;
use goose::model::ModelConfig;
use goose::providers::create;
use std::time::Instant;

const DEFAULT_TEST_MODEL: &str = "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M";

fn test_model() -> String {
std::env::var("TEST_MODEL").unwrap_or_else(|_| DEFAULT_TEST_MODEL.to_string())
}

#[tokio::test]
#[ignore]
async fn test_local_inference_cold_vs_warm() {
let model_config = ModelConfig::new(&test_model())
.expect("valid model config")
.with_max_tokens(Some(20));
let provider = create("local", model_config.clone(), Vec::new())
.await
.expect("provider creation should succeed");

// Cold start — includes model loading from disk.
let messages = vec![Message::user().with_text("What is 2+2?")];
let start = Instant::now();
let (response, _) = provider
.complete(&model_config, "perf-session", "", &messages, &[])
.await
.expect("cold completion should succeed");
let cold_elapsed = start.elapsed();

let text = response.as_concat_text();
assert!(!text.is_empty(), "cold start should produce a response");
println!("Cold start: {cold_elapsed:.2?}, response: {}", text.len());

// Warm run — model already loaded, only inference.
let messages2 = vec![Message::user().with_text("What is 3+3?")];
let start2 = Instant::now();
let (response2, _) = provider
.complete(&model_config, "perf-session", "", &messages2, &[])
.await
.expect("warm completion should succeed");
let warm_elapsed = start2.elapsed();

let text2 = response2.as_concat_text();
assert!(!text2.is_empty(), "warm run should produce a response");
println!("Warm run: {warm_elapsed:.2?}, response: {}", text2.len());

if warm_elapsed < cold_elapsed {
let speedup = cold_elapsed.as_secs_f64() / warm_elapsed.as_secs_f64();
println!("Warm is {speedup:.1}x faster than cold");
} else {
println!("Warning: warm was not faster (model may have been pre-loaded by another test)");
}
}
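
Since goose-cli's new cuda feature forwards to a cuda feature on the goose crate itself, the same benchmark can presumably be re-run against a GPU build to compare with the CPU numbers; a hedged example invocation, assuming the feature needs no extra setup:

    # Benchmark with the CUDA feature enabled (assumed to work with the standard --features flag)
    cargo test -p goose --features cuda --test local_inference_perf -- --ignored --nocapture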