1 change: 1 addition & 0 deletions crates/goose-cli/Cargo.toml
@@ -72,6 +72,7 @@ winapi = { version = "0.3", features = ["wincred"] }
[features]
default = ["code-mode"]
code-mode = ["goose/code-mode", "goose-acp/code-mode"]
cuda = ["goose/cuda"]
# disables the update command
disable-update = []

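The new cuda flag only forwards to the goose crate's cuda feature, so GPU support for local inference is toggled at build time. As a minimal sketch, assuming nothing beyond cargo's standard feature flag and an installed CUDA toolkit is required:

    # Build goose-cli with the CUDA-enabled local inference backend
    cargo build --release -p goose-cli --features cuda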
@@ -359,10 +359,19 @@ pub(super) fn generate_with_emulated_tools(
ctx: &mut GenerationContext<'_>,
code_mode_enabled: bool,
) -> Result<(), ProviderError> {
// Use oaicompat variant — its C++ wrapper catches exceptions that would
// otherwise abort the process when other native libs disturb the C++ ABI.
let prompt = ctx
.loaded
.model
.apply_chat_template(&ctx.loaded.template, ctx.chat_messages, true)
.apply_chat_template_with_tools_oaicompat(
&ctx.loaded.template,
ctx.chat_messages,
None, // no tools for emulated path
None, // no json_schema
true, // add_generation_prompt
)
.map(|r| r.prompt)
.map_err(|e| {
ProviderError::ExecutionError(format!("Failed to apply chat template: {}", e))
})?;
69 changes: 18 additions & 51 deletions crates/goose/tests/local_inference_integration.rs
@@ -1,20 +1,30 @@
//! Integration tests for LocalInferenceProvider.
//!
//! These tests require a downloaded GGUF model and are ignored by default.
//! Run with: cargo test -p goose --test local_inference_integration -- --ignored
//! Download a model first:
//! goose local-models download bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M
//!
//! Run with the default model:
//! cargo test -p goose --test local_inference_integration -- --ignored
//!
//! Run with a specific model:
//! TEST_MODEL="bartowski/Qwen_Qwen3-32B-GGUF:Q4_K_M" cargo test -p goose --test local_inference_integration -- --ignored

use futures::StreamExt;
use goose::conversation::message::Message;
use goose::model::ModelConfig;
use goose::providers::create;
use std::time::Instant;

const TEST_MODEL: &str = "llama-3.2-1b";
const DEFAULT_TEST_MODEL: &str = "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M";

fn test_model() -> String {
std::env::var("TEST_MODEL").unwrap_or_else(|_| DEFAULT_TEST_MODEL.to_string())
}

#[tokio::test]
#[ignore]
async fn test_local_inference_stream_produces_output() {
let model_config = ModelConfig::new(TEST_MODEL).expect("valid model config");
let model_config = ModelConfig::new(&test_model()).expect("valid model config");
let provider = create("local", model_config.clone(), Vec::new())
.await
.expect("provider creation should succeed");
@@ -53,55 +63,12 @@ async fn test_local_inference_stream_produces_output() {
assert!(got_usage, "stream should produce usage info");
}

#[tokio::test]
#[ignore]
async fn test_local_inference_cold_and_warm_performance() {
let model_config = ModelConfig::new(TEST_MODEL).expect("valid model config");
let provider = create("local", model_config.clone(), Vec::new())
.await
.expect("provider creation should succeed");

// Cold start (includes model loading)
let messages = vec![Message::user().with_text("what is the capital of Moldova?")];
let start = Instant::now();
let (response, _usage) = provider
.complete(&model_config, "test-session", "", &messages, &[])
.await
.expect("cold completion should succeed");
let cold_elapsed = start.elapsed();

let text = response.as_concat_text();
assert!(!text.is_empty(), "cold start should produce a response");
println!(
"Cold start: {cold_elapsed:.2?}, response length: {}",
text.len()
);

// Warm run (model already loaded)
let messages2 = vec![Message::user().with_text("what is the capital of France?")];
let start2 = Instant::now();
let (response2, _usage2) = provider
.complete(&model_config, "test-session", "", &messages2, &[])
.await
.expect("warm completion should succeed");
let warm_elapsed = start2.elapsed();

let text2 = response2.as_concat_text();
assert!(!text2.is_empty(), "warm run should produce a response");
println!(
"Warm run: {warm_elapsed:.2?}, response length: {}",
text2.len()
);
assert!(
warm_elapsed < cold_elapsed,
"warm run ({warm_elapsed:.2?}) should be faster than cold start ({cold_elapsed:.2?})"
);
}

#[tokio::test]
#[ignore]
async fn test_local_inference_large_prompt() {
let model_config = ModelConfig::new(TEST_MODEL).expect("valid model config");
let model_config = ModelConfig::new(&test_model())
.expect("valid model config")
.with_max_tokens(Some(20));
let provider = create("local", model_config.clone(), Vec::new())
.await
.expect("provider creation should succeed");
@@ -111,7 +78,7 @@ async fn test_local_inference_large_prompt() {
let prompt = format!("{padding}\nNow answer this: what is the capital of Moldova?");
let messages = vec![Message::user().with_text(&prompt)];

let start = Instant::now();
let start = std::time::Instant::now();
let (response, _usage) = provider
.complete(&model_config, "test-session", "", &messages, &[])
.await
66 changes: 66 additions & 0 deletions crates/goose/tests/local_inference_perf.rs
@@ -0,0 +1,66 @@
//! Performance benchmarks for LocalInferenceProvider.
//!
//! These tests require a downloaded GGUF model and are ignored by default.
//! Download a model first:
//! goose local-models download bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M
//!
//! Run with the default model:
//! cargo test -p goose --test local_inference_perf -- --ignored --nocapture
//!
//! Run with a specific model:
//! TEST_MODEL="bartowski/Qwen_Qwen3-32B-GGUF:Q4_K_M" cargo test -p goose --test local_inference_perf -- --ignored --nocapture

use goose::conversation::message::Message;
use goose::model::ModelConfig;
use goose::providers::create;
use std::time::Instant;

const DEFAULT_TEST_MODEL: &str = "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M";

fn test_model() -> String {
std::env::var("TEST_MODEL").unwrap_or_else(|_| DEFAULT_TEST_MODEL.to_string())
}

#[tokio::test]
#[ignore]
async fn test_local_inference_cold_vs_warm() {
let model_config = ModelConfig::new(&test_model())
.expect("valid model config")
.with_max_tokens(Some(20));
let provider = create("local", model_config.clone(), Vec::new())
.await
.expect("provider creation should succeed");

// Cold start — includes model loading from disk.
let messages = vec![Message::user().with_text("What is 2+2?")];
let start = Instant::now();
let (response, _) = provider
.complete(&model_config, "perf-session", "", &messages, &[])
.await
.expect("cold completion should succeed");
let cold_elapsed = start.elapsed();

let text = response.as_concat_text();
assert!(!text.is_empty(), "cold start should produce a response");
println!("Cold start: {cold_elapsed:.2?}, response: {}", text.len());

// Warm run — model already loaded, only inference.
let messages2 = vec![Message::user().with_text("What is 3+3?")];
let start2 = Instant::now();
let (response2, _) = provider
.complete(&model_config, "perf-session", "", &messages2, &[])
.await
.expect("warm completion should succeed");
let warm_elapsed = start2.elapsed();

let text2 = response2.as_concat_text();
assert!(!text2.is_empty(), "warm run should produce a response");
println!("Warm run: {warm_elapsed:.2?}, response: {}", text2.len());

if warm_elapsed < cold_elapsed {
let speedup = cold_elapsed.as_secs_f64() / warm_elapsed.as_secs_f64();
println!("Warm is {speedup:.1}x faster than cold");
} else {
println!("Warning: warm was not faster (model may have been pre-loaded by another test)");
}
}
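
Since goose-cli's new cuda feature forwards to a cuda feature on the goose crate itself, the same benchmark can presumably be re-run against a GPU build to compare with the CPU numbers; a hedged example invocation, assuming the feature needs no extra setup:

    # Benchmark with the CUDA feature enabled (assumed to work with the standard --features flag)
    cargo test -p goose --features cuda --test local_inference_perf -- --ignored --nocapture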