From 563fbe2d9c43971099190f06b68a194805e12782 Mon Sep 17 00:00:00 2001
From: David Katz <dkatz@squareup.com>
Date: Fri, 18 Jul 2025 12:47:12 -0400
Subject: [PATCH 01/41] compiles

---
 crates/goose/src/context_mgmt/summarize.rs    | 107 +++++++++++++++++-
 crates/goose/src/prompts/summarize_oneshot.md |  18 +++
 2 files changed, 124 insertions(+), 1 deletion(-)
 create mode 100644 crates/goose/src/prompts/summarize_oneshot.md
diff --git a/crates/goose/src/context_mgmt/summarize.rs b/crates/goose/src/context_mgmt/summarize.rs
index 84ea104bb9bc..0fba542c9af4 100644
--- a/crates/goose/src/context_mgmt/summarize.rs
+++ b/crates/goose/src/context_mgmt/summarize.rs
@@ -1,14 +1,21 @@
 use super::common::{get_messages_token_counts, get_messages_token_counts_async};
 use crate::message::{Message, MessageContent};
 use crate::providers::base::Provider;
+use crate::prompt_template::render_global_file;
 use crate::token_counter::{AsyncTokenCounter, TokenCounter};
 use anyhow::Result;
 use mcp_core::Role;
+use serde::Serialize;
 use std::sync::Arc;
 
 // Constants for the summarization prompt and a follow-up user message.
 const SUMMARY_PROMPT: &str = "You are good at summarizing conversations";
 
+#[derive(Serialize)]
+struct SummarizeContext {
+    messages: String,
+}
+
 /// Summarize the combined messages from the accumulated summary and the current chunk.
 ///
 /// This method builds the summarization request, sends it to the provider, and returns the summarized response.
@@ -101,13 +108,68 @@ fn reintegrate_removed_messages(
     final_messages
 }
 
+// Summarization steps:
+//    Using a single tailored prompt, summarize the entire conversation history.
+pub async fn summarize_messages_oneshot(
+    provider: Arc<dyn Provider>,
+    messages: &[Message],
+    token_counter: &TokenCounter,
+    _context_limit: usize,
+) -> Result<(Vec<Message>, Vec<usize>), anyhow::Error> {
+    // Preprocess messages to handle tool response edge case.
+    let (preprocessed_messages, removed_messages) = preprocess_messages(messages);
+
+    if preprocessed_messages.is_empty() {
+        // If no messages to summarize, just return the removed messages
+        return Ok((
+            removed_messages.clone(),
+            get_messages_token_counts(token_counter, &removed_messages),
+        ));
+    }
+
+    // Format all messages as a single string for the summarization prompt
+    let messages_text = preprocessed_messages
+        .iter()
+        .map(|msg| format!("{:?}", msg))
+        .collect::<Vec<_>>()
+        .join("\n\n");
+
+    let context = SummarizeContext {
+        messages: messages_text,
+    };
+
+    // Render the one-shot summarization prompt
+    let system_prompt = render_global_file("summarize_oneshot.md", &context)?;
+    
+    // Create a simple user message requesting summarization
+    let user_message = Message::user().with_text("Please summarize the conversation history provided in the system prompt.");
+    let summarization_request = vec![user_message];
+
+    // Send the request to the provider and fetch the response.
+    let mut response = provider
+        .complete(&system_prompt, &summarization_request, &[])
+        .await?
+        .0;
+    
+    // Set role to user as it will be used in following conversation as user content.
+    response.role = Role::User;
+
+    // Add back removed messages.
+    let final_summary = reintegrate_removed_messages(&vec![response], &removed_messages);
+
+    Ok((
+        final_summary.clone(),
+        get_messages_token_counts(token_counter, &final_summary),
+    ))
+}
+
 // Summarization steps:
 // 1. Break down large text into smaller chunks (roughly 30% of the model’s context window).
 // 2. For each chunk:
 //    a. Combine it with the previous summary (or leave blank for the first iteration).
 //    b. Summarize the combined text, focusing on extracting only the information we need.
 // 3. Generate a final summary using a tailored prompt.
-pub async fn summarize_messages(
+pub async fn summarize_messages_chunked(
     provider: Arc<dyn Provider>,
     messages: &[Message],
     token_counter: &TokenCounter,
@@ -159,6 +221,49 @@ pub async fn summarize_messages(
     ))
 }
 
+/// Main summarization function that chooses the best algorithm based on context size.
+/// 
+/// This function will:
+/// 1. First try the one-shot summarization if there's enough context window available
+/// 2. Fall back to the chunked approach if the one-shot fails or if context is too limited
+/// 3. Choose the algorithm based on available context window space
+pub async fn summarize_messages(
+    provider: Arc<dyn Provider>,
+    messages: &[Message],
+    token_counter: &TokenCounter,
+    context_limit: usize,
+) -> Result<(Vec<Message>, Vec<usize>), anyhow::Error> {
+    // Calculate total tokens in messages
+    let total_tokens: usize = get_messages_token_counts(token_counter, messages)
+        .iter()
+        .sum();
+    
+    // Reserve space for the system prompt and response (rough estimate)
+    let system_prompt_overhead = 1000; // Conservative estimate for the summarization prompt
+    let response_overhead = 2000; // Conservative estimate for the response
+    let available_context = context_limit.saturating_sub(system_prompt_overhead + response_overhead);
+    
+    // If the total tokens fit comfortably in the available context (with some buffer),
+    // try the one-shot approach first
+    if total_tokens <= available_context * 3 / 4 { // Use 75% of available context as threshold
+        match summarize_messages_oneshot(
+            Arc::clone(&provider),
+            messages,
+            token_counter,
+            context_limit,
+        ).await {
+            Ok(result) => return Ok(result),
+            Err(e) => {
+                // Log the error but continue to fallback
+                tracing::warn!("One-shot summarization failed, falling back to chunked approach: {}", e);
+            }
+        }
+    }
+    
+    // Fall back to the chunked approach
+    summarize_messages_chunked(provider, messages, token_counter, context_limit).await
+}
+
 /// Async version using AsyncTokenCounter for better performance
 pub async fn summarize_messages_async(
     provider: Arc<dyn Provider>,
diff --git a/crates/goose/src/prompts/summarize_oneshot.md b/crates/goose/src/prompts/summarize_oneshot.md
new file mode 100644
index 000000000000..19214154264e
--- /dev/null
+++ b/crates/goose/src/prompts/summarize_oneshot.md
@@ -0,0 +1,18 @@
+You are an expert at summarizing conversation histories for AI assistants. Your task is to create a concise but comprehensive summary that preserves all the essential information needed for future conversations.
+
+## Guidelines:
+- Preserve key context, decisions, and outcomes
+- Maintain the chronological flow of important events
+- Include specific technical details, file names, code snippets, and configurations that were discussed
+- Retain user preferences and requirements that were established
+- Keep track of any ongoing tasks or unresolved issues
+- Summarize tool usage and their results when relevant
+- Maintain the conversational context and relationship dynamics
+
+## Format:
+Create a summary that reads naturally and can be used as context for continuing the conversation. The summary should be detailed enough that an AI assistant can pick up where the conversation left off without losing important context.
+
+Focus on being comprehensive rather than brief - it's better to include potentially relevant details than to lose important context.
+
+## Conversation to Summarize:
+{{ messages }}

From aa43f096392ca5dc9796358dec34f04318226fc4 Mon Sep 17 00:00:00 2001
From: David Katz <dkatz@squareup.com>
Date: Fri, 18 Jul 2025 12:53:54 -0400
Subject: [PATCH 02/41] tests

---
 crates/goose/src/context_mgmt/summarize.rs | 248 +++++++++++++++++++++
 1 file changed, 248 insertions(+)

diff --git a/crates/goose/src/context_mgmt/summarize.rs b/crates/goose/src/context_mgmt/summarize.rs
index 0fba542c9af4..c07acdb9ed7b 100644
--- a/crates/goose/src/context_mgmt/summarize.rs
+++ b/crates/goose/src/context_mgmt/summarize.rs
@@ -573,4 +573,252 @@ mod tests {
             "The final message list should include the summary and removed messages."
         );
     }
+
+    #[tokio::test]
+    async fn test_summarize_messages_uses_oneshot_for_small_context() {
+        let provider = create_mock_provider();
+        let token_counter = TokenCounter::new();
+        let context_limit = 100_000; // Large context limit
+        let messages = create_test_messages(); // Small message set
+
+        let result = summarize_messages(
+            Arc::clone(&provider),
+            &messages,
+            &token_counter,
+            context_limit,
+        )
+        .await;
+
+        assert!(result.is_ok(), "The function should return Ok.");
+        let (summarized_messages, _) = result.unwrap();
+
+        // Should use one-shot and return a single summarized message
+        assert_eq!(
+            summarized_messages.len(),
+            1,
+            "Should use one-shot summarization for small context."
+        );
+    }
+
+    #[tokio::test]
+    async fn test_summarize_messages_uses_chunked_for_large_context() {
+        let provider = create_mock_provider();
+        let token_counter = TokenCounter::new();
+        let context_limit = 100; // Small context limit but not too small to cause overflow
+        let messages = create_test_messages();
+
+        let result = summarize_messages(
+            Arc::clone(&provider),
+            &messages,
+            &token_counter,
+            context_limit,
+        )
+        .await;
+
+        assert!(result.is_ok(), "The function should return Ok.");
+        let (summarized_messages, _) = result.unwrap();
+
+        // Should fall back to chunked approach
+        assert_eq!(
+            summarized_messages.len(),
+            1,
+            "Should use chunked summarization for large context."
+        );
+    }
+
+    // Mock provider that fails on one-shot but succeeds on chunked
+    #[derive(Clone)]
+    struct FailingOneshotProvider {
+        model_config: ModelConfig,
+        call_count: Arc<std::sync::Mutex<usize>>,
+    }
+
+    #[async_trait::async_trait]
+    impl Provider for FailingOneshotProvider {
+        fn metadata() -> ProviderMetadata {
+            ProviderMetadata::empty()
+        }
+
+        fn get_model_config(&self) -> ModelConfig {
+            self.model_config.clone()
+        }
+
+        async fn complete(
+            &self,
+            system: &str,
+            _messages: &[Message],
+            _tools: &[Tool],
+        ) -> Result<(Message, ProviderUsage), ProviderError> {
+            let mut count = self.call_count.lock().unwrap();
+            *count += 1;
+            
+            // Fail if this looks like a one-shot request (contains the one-shot prompt content)
+            if system.contains("expert at summarizing conversation histories") {
+                return Err(ProviderError::RateLimitExceeded(
+                    "Simulated one-shot failure".to_string(),
+                ));
+            }
+            
+            // Succeed for chunked requests (uses the old SUMMARY_PROMPT)
+            Ok((
+                Message::new(
+                    Role::Assistant,
+                    Utc::now().timestamp(),
+                    vec![MessageContent::Text(TextContent {
+                        text: "Chunked summary".to_string(),
+                        annotations: None,
+                    })],
+                ),
+                ProviderUsage::new("mock".to_string(), Usage::default()),
+            ))
+        }
+    }
+
+    #[tokio::test]
+    async fn test_summarize_messages_fallback_on_oneshot_failure() {
+        let call_count = Arc::new(std::sync::Mutex::new(0));
+        let provider = Arc::new(FailingOneshotProvider {
+            model_config: ModelConfig::new("test-model".to_string()).with_context_limit(200_000.into()),
+            call_count: Arc::clone(&call_count),
+        });
+        let token_counter = TokenCounter::new();
+        let context_limit = 100_000; // Large enough to try one-shot first
+        let messages = create_test_messages();
+
+        let result = summarize_messages(
+            provider,
+            &messages,
+            &token_counter,
+            context_limit,
+        )
+        .await;
+
+        assert!(result.is_ok(), "The function should return Ok after fallback.");
+        let (summarized_messages, _) = result.unwrap();
+
+        // Should have fallen back to chunked approach
+        assert_eq!(
+            summarized_messages.len(),
+            1,
+            "Should successfully fall back to chunked approach."
+        );
+
+        // Verify the content comes from the chunked approach
+        if let MessageContent::Text(text_content) = &summarized_messages[0].content[0] {
+            assert_eq!(text_content.text, "Chunked summary");
+        } else {
+            panic!("Expected text content");
+        }
+
+        // Should have made multiple calls (one-shot attempt + chunked calls)
+        let final_count = *call_count.lock().unwrap();
+        assert!(final_count > 1, "Should have made multiple provider calls during fallback");
+    }
+
+    #[tokio::test]
+    async fn test_summarize_messages_oneshot_direct_call() {
+        let provider = create_mock_provider();
+        let token_counter = TokenCounter::new();
+        let context_limit = 100_000;
+        let messages = create_test_messages();
+
+        let result = summarize_messages_oneshot(
+            Arc::clone(&provider),
+            &messages,
+            &token_counter,
+            context_limit,
+        )
+        .await;
+
+        assert!(result.is_ok(), "One-shot summarization should work directly.");
+        let (summarized_messages, token_counts) = result.unwrap();
+
+        assert_eq!(
+            summarized_messages.len(),
+            1,
+            "One-shot should return a single summary message."
+        );
+        assert_eq!(
+            summarized_messages[0].role,
+            Role::User,
+            "Summary should be from user role for context."
+        );
+        assert_eq!(
+            token_counts.len(),
+            1,
+            "Should have token count for the summary."
+        );
+    }
+
+    #[tokio::test]
+    async fn test_summarize_messages_chunked_direct_call() {
+        let provider = create_mock_provider();
+        let token_counter = TokenCounter::new();
+        let context_limit = 30; // Small to force chunking
+        let messages = create_test_messages();
+
+        let result = summarize_messages_chunked(
+            Arc::clone(&provider),
+            &messages,
+            &token_counter,
+            context_limit,
+        )
+        .await;
+
+        assert!(result.is_ok(), "Chunked summarization should work directly.");
+        let (summarized_messages, token_counts) = result.unwrap();
+
+        assert_eq!(
+            summarized_messages.len(),
+            1,
+            "Chunked should return a single final summary."
+        );
+        assert_eq!(
+            summarized_messages[0].role,
+            Role::User,
+            "Summary should be from user role for context."
+        );
+        assert_eq!(
+            token_counts.len(),
+            1,
+            "Should have token count for the summary."
+        );
+    }
+
+    #[tokio::test]
+    async fn test_context_size_threshold_calculation() {
+        let provider = create_mock_provider();
+        let token_counter = TokenCounter::new();
+        
+        // Test with a context limit where the calculation matters
+        let context_limit = 10_000;
+        let system_prompt_overhead = 1000;
+        let response_overhead = 2000;
+        let available_context = context_limit - system_prompt_overhead - response_overhead;
+        let threshold = available_context * 3 / 4; // 75% of available context
+        
+        // Create messages that are just under the threshold
+        let mut large_messages = Vec::new();
+        let base_message = set_up_text_message("x".repeat(100).as_str(), Role::User);
+        
+        // Add enough messages to approach but not exceed the threshold
+        let message_tokens = token_counter.count_tokens(&format!("{:?}", base_message));
+        let num_messages = (threshold / message_tokens).saturating_sub(1);
+        
+        for i in 0..num_messages {
+            large_messages.push(set_up_text_message(&format!("Message {}", i), Role::User));
+        }
+
+        let result = summarize_messages(
+            Arc::clone(&provider),
+            &large_messages,
+            &token_counter,
+            context_limit,
+        )
+        .await;
+
+        assert!(result.is_ok(), "Should handle threshold calculation correctly.");
+        let (summarized_messages, _) = result.unwrap();
+        assert_eq!(summarized_messages.len(), 1, "Should produce a summary.");
+    }
 }

From 9ea702b8092af9b864b529bb82089a11eb898025 Mon Sep 17 00:00:00 2001
From: David Katz <dkatz@squareup.com>
Date: Fri, 18 Jul 2025 13:34:18 -0400
Subject: [PATCH 03/41] context usage fix

---
 crates/goose-cli/src/session/mod.rs | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/crates/goose-cli/src/session/mod.rs b/crates/goose-cli/src/session/mod.rs
index 5b51bd701125..77615e928afe 100644
--- a/crates/goose-cli/src/session/mod.rs
+++ b/crates/goose-cli/src/session/mod.rs
@@ -742,6 +742,9 @@ impl Session {
                             )
                             .green()
                         );
+                        
+                        // Update the context display immediately after summarization
+                        self.display_context_usage().await?;
                     } else {
                         println!("{}", console::style("Summarization cancelled.").yellow());
                     }

From 6bbba2008446d658387d143ed2bc95725ffc24a9 Mon Sep 17 00:00:00 2001
From: David Katz <dkatz@squareup.com>
Date: Fri, 18 Jul 2025 13:43:47 -0400
Subject: [PATCH 04/41] change overhead

---
 crates/goose-cli/src/session/mod.rs        |   2 +-
 crates/goose/src/context_mgmt/summarize.rs | 109 ++++++++++++---------
 2 files changed, 64 insertions(+), 47 deletions(-)

diff --git a/crates/goose-cli/src/session/mod.rs b/crates/goose-cli/src/session/mod.rs
index 77615e928afe..325629a21853 100644
--- a/crates/goose-cli/src/session/mod.rs
+++ b/crates/goose-cli/src/session/mod.rs
@@ -742,7 +742,7 @@ impl Session {
                             )
                             .green()
                         );
-                        
+
                         // Update the context display immediately after summarization
                         self.display_context_usage().await?;
                     } else {
diff --git a/crates/goose/src/context_mgmt/summarize.rs b/crates/goose/src/context_mgmt/summarize.rs
index c07acdb9ed7b..6d661a115165 100644
--- a/crates/goose/src/context_mgmt/summarize.rs
+++ b/crates/goose/src/context_mgmt/summarize.rs
@@ -1,7 +1,7 @@
 use super::common::{get_messages_token_counts, get_messages_token_counts_async};
 use crate::message::{Message, MessageContent};
-use crate::providers::base::Provider;
 use crate::prompt_template::render_global_file;
+use crate::providers::base::Provider;
 use crate::token_counter::{AsyncTokenCounter, TokenCounter};
 use anyhow::Result;
 use mcp_core::Role;
@@ -140,9 +140,10 @@ pub async fn summarize_messages_oneshot(
 
     // Render the one-shot summarization prompt
     let system_prompt = render_global_file("summarize_oneshot.md", &context)?;
-    
+
     // Create a simple user message requesting summarization
-    let user_message = Message::user().with_text("Please summarize the conversation history provided in the system prompt.");
+    let user_message = Message::user()
+        .with_text("Please summarize the conversation history provided in the system prompt.");
     let summarization_request = vec![user_message];
 
     // Send the request to the provider and fetch the response.
@@ -150,12 +151,12 @@ pub async fn summarize_messages_oneshot(
         .complete(&system_prompt, &summarization_request, &[])
         .await?
         .0;
-    
+
     // Set role to user as it will be used in following conversation as user content.
     response.role = Role::User;
 
     // Add back removed messages.
-    let final_summary = reintegrate_removed_messages(&vec![response], &removed_messages);
+    let final_summary = reintegrate_removed_messages(&[response], &removed_messages);
 
     Ok((
         final_summary.clone(),
@@ -222,11 +223,11 @@ pub async fn summarize_messages_chunked(
 }
 
 /// Main summarization function that chooses the best algorithm based on context size.
-/// 
+///
 /// This function will:
 /// 1. First try the one-shot summarization if there's enough context window available
 /// 2. Fall back to the chunked approach if the one-shot fails or if context is too limited
-/// 3. Choose the algorithm based on available context window space
+/// 3. Choose the algorithm based on absolute token requirements rather than percentages
 pub async fn summarize_messages(
     provider: Arc<dyn Provider>,
     messages: &[Message],
@@ -237,29 +238,34 @@ pub async fn summarize_messages(
     let total_tokens: usize = get_messages_token_counts(token_counter, messages)
         .iter()
         .sum();
-    
-    // Reserve space for the system prompt and response (rough estimate)
+
+    // Calculate absolute token requirements (future-proof for large context models)
     let system_prompt_overhead = 1000; // Conservative estimate for the summarization prompt
-    let response_overhead = 2000; // Conservative estimate for the response
-    let available_context = context_limit.saturating_sub(system_prompt_overhead + response_overhead);
-    
-    // If the total tokens fit comfortably in the available context (with some buffer),
-    // try the one-shot approach first
-    if total_tokens <= available_context * 3 / 4 { // Use 75% of available context as threshold
+    let response_overhead = 4000; // Generous buffer for response generation
+    let safety_buffer = 1000; // Small safety margin for tokenization variations
+    let total_required = total_tokens + system_prompt_overhead + response_overhead + safety_buffer;
+
+    // Use one-shot if we have enough absolute space (no percentage-based limits)
+    if total_required <= context_limit {
         match summarize_messages_oneshot(
             Arc::clone(&provider),
             messages,
             token_counter,
             context_limit,
-        ).await {
+        )
+        .await
+        {
             Ok(result) => return Ok(result),
             Err(e) => {
                 // Log the error but continue to fallback
-                tracing::warn!("One-shot summarization failed, falling back to chunked approach: {}", e);
+                tracing::warn!(
+                    "One-shot summarization failed, falling back to chunked approach: {}",
+                    e
+                );
             }
         }
     }
-    
+
     // Fall back to the chunked approach
     summarize_messages_chunked(provider, messages, token_counter, context_limit).await
 }
@@ -651,14 +657,14 @@ mod tests {
         ) -> Result<(Message, ProviderUsage), ProviderError> {
             let mut count = self.call_count.lock().unwrap();
             *count += 1;
-            
+
             // Fail if this looks like a one-shot request (contains the one-shot prompt content)
             if system.contains("expert at summarizing conversation histories") {
                 return Err(ProviderError::RateLimitExceeded(
                     "Simulated one-shot failure".to_string(),
                 ));
             }
-            
+
             // Succeed for chunked requests (uses the old SUMMARY_PROMPT)
             Ok((
                 Message::new(
@@ -678,22 +684,20 @@ mod tests {
     async fn test_summarize_messages_fallback_on_oneshot_failure() {
         let call_count = Arc::new(std::sync::Mutex::new(0));
         let provider = Arc::new(FailingOneshotProvider {
-            model_config: ModelConfig::new("test-model".to_string()).with_context_limit(200_000.into()),
+            model_config: ModelConfig::new("test-model".to_string())
+                .with_context_limit(200_000.into()),
             call_count: Arc::clone(&call_count),
         });
         let token_counter = TokenCounter::new();
         let context_limit = 100_000; // Large enough to try one-shot first
         let messages = create_test_messages();
 
-        let result = summarize_messages(
-            provider,
-            &messages,
-            &token_counter,
-            context_limit,
-        )
-        .await;
+        let result = summarize_messages(provider, &messages, &token_counter, context_limit).await;
 
-        assert!(result.is_ok(), "The function should return Ok after fallback.");
+        assert!(
+            result.is_ok(),
+            "The function should return Ok after fallback."
+        );
         let (summarized_messages, _) = result.unwrap();
 
         // Should have fallen back to chunked approach
@@ -712,7 +716,10 @@ mod tests {
 
         // Should have made multiple calls (one-shot attempt + chunked calls)
         let final_count = *call_count.lock().unwrap();
-        assert!(final_count > 1, "Should have made multiple provider calls during fallback");
+        assert!(
+            final_count > 1,
+            "Should have made multiple provider calls during fallback"
+        );
     }
 
     #[tokio::test]
@@ -730,7 +737,10 @@ mod tests {
         )
         .await;
 
-        assert!(result.is_ok(), "One-shot summarization should work directly.");
+        assert!(
+            result.is_ok(),
+            "One-shot summarization should work directly."
+        );
         let (summarized_messages, token_counts) = result.unwrap();
 
         assert_eq!(
@@ -765,7 +775,10 @@ mod tests {
         )
         .await;
 
-        assert!(result.is_ok(), "Chunked summarization should work directly.");
+        assert!(
+            result.is_ok(),
+            "Chunked summarization should work directly."
+        );
         let (summarized_messages, token_counts) = result.unwrap();
 
         assert_eq!(
@@ -786,25 +799,26 @@ mod tests {
     }
 
     #[tokio::test]
-    async fn test_context_size_threshold_calculation() {
+    async fn test_absolute_token_threshold_calculation() {
         let provider = create_mock_provider();
         let token_counter = TokenCounter::new();
-        
-        // Test with a context limit where the calculation matters
+
+        // Test with a context limit where absolute token calculation matters
         let context_limit = 10_000;
         let system_prompt_overhead = 1000;
-        let response_overhead = 2000;
-        let available_context = context_limit - system_prompt_overhead - response_overhead;
-        let threshold = available_context * 3 / 4; // 75% of available context
-        
-        // Create messages that are just under the threshold
+        let response_overhead = 4000;
+        let safety_buffer = 1000;
+        let max_message_tokens =
+            context_limit - system_prompt_overhead - response_overhead - safety_buffer; // 4000 tokens
+
+        // Create messages that are just under the absolute threshold
         let mut large_messages = Vec::new();
-        let base_message = set_up_text_message("x".repeat(100).as_str(), Role::User);
-        
-        // Add enough messages to approach but not exceed the threshold
+        let base_message = set_up_text_message("x".repeat(50).as_str(), Role::User);
+
+        // Add enough messages to approach but not exceed the absolute threshold
         let message_tokens = token_counter.count_tokens(&format!("{:?}", base_message));
-        let num_messages = (threshold / message_tokens).saturating_sub(1);
-        
+        let num_messages = (max_message_tokens / message_tokens).saturating_sub(1);
+
         for i in 0..num_messages {
             large_messages.push(set_up_text_message(&format!("Message {}", i), Role::User));
         }
@@ -817,7 +831,10 @@ mod tests {
         )
         .await;
 
-        assert!(result.is_ok(), "Should handle threshold calculation correctly.");
+        assert!(
+            result.is_ok(),
+            "Should handle absolute threshold calculation correctly."
+        );
         let (summarized_messages, _) = result.unwrap();
         assert_eq!(summarized_messages.len(), 1, "Should produce a summary.");
     }

From ff5c9a29507db5072aa916d469f75a72838a6093 Mon Sep 17 00:00:00 2001
From: David Katz <dkatz@squareup.com>
Date: Fri, 18 Jul 2025 15:00:18 -0400
Subject: [PATCH 05/41] shrink works

---
 crates/goose-cli/src/session/mod.rs | 50 ++++++++++++++++++++++-------
 1 file changed, 38 insertions(+), 12 deletions(-)

diff --git a/crates/goose-cli/src/session/mod.rs b/crates/goose-cli/src/session/mod.rs
index 325629a21853..fd83e21d7863 100644
--- a/crates/goose-cli/src/session/mod.rs
+++ b/crates/goose-cli/src/session/mod.rs
@@ -742,9 +742,6 @@ impl Session {
                             )
                             .green()
                         );
-
-                        // Update the context display immediately after summarization
-                        self.display_context_usage().await?;
                     } else {
                         println!("{}", console::style("Summarization cancelled.").yellow());
                     }
@@ -1415,13 +1412,15 @@ impl Session {
             );
         }
 
-        match self.get_metadata() {
-            Ok(metadata) => {
-                let total_tokens = metadata.total_tokens.unwrap_or(0) as usize;
-
-                output::display_context_usage(total_tokens, context_limit);
+        // Calculate actual current token count from messages
+        let current_token_count = self.calculate_current_token_count().await?;
+        
+        output::display_context_usage(current_token_count, context_limit);
 
-                if show_cost {
+        if show_cost {
+            // For cost display, try to use metadata if available, otherwise fall back to current count
+            match self.get_metadata() {
+                Ok(metadata) => {
                     let input_tokens = metadata.input_tokens.unwrap_or(0) as usize;
                     let output_tokens = metadata.output_tokens.unwrap_or(0) as usize;
                     output::display_cost_usage(
@@ -1432,15 +1431,42 @@ impl Session {
                     )
                     .await;
                 }
-            }
-            Err(_) => {
-                output::display_context_usage(0, context_limit);
+                Err(_) => {
+                    // If no metadata available, we can't show cost breakdown
+                }
             }
         }
 
         Ok(())
     }
 
+    /// Calculate the current token count of the messages in memory
+    async fn calculate_current_token_count(&self) -> Result<usize> {
+        use goose::token_counter::create_async_token_counter;
+        
+        let token_counter = create_async_token_counter().await
+            .map_err(|e| anyhow::anyhow!("Failed to create token counter: {}", e))?;
+        
+        // Get tools from agent
+        let tools = self.agent.list_tools(None).await;
+        
+        // For now, use a simplified system prompt calculation
+        // In a real implementation, we'd want to access the agent's prompt manager
+        // but since it's private, we'll use an empty system prompt as approximation
+        let system_prompt = ""; // This is a simplification
+        
+        let resources = vec![]; // No direct way to get resources currently
+        
+        let token_count = token_counter.count_everything(
+            system_prompt,
+            &self.messages,
+            &tools,
+            &resources,
+        );
+        
+        Ok(token_count)
+    }
+
     /// Handle prompt command execution
     async fn handle_prompt_command(&mut self, opts: input::PromptCommandOptions) -> Result<()> {
         // name is required

From 52099ce07d57425f3eaf47f513041b4bc967b0e3 Mon Sep 17 00:00:00 2001
From: David Katz <dkatz@squareup.com>
Date: Mon, 21 Jul 2025 12:15:42 -0400
Subject: [PATCH 06/41] new prompt

---
 crates/goose/src/prompts/summarize_oneshot.md | 77 +++++++++++++++----
 .../src/prompts/summarize_oneshot_old.md      | 18 +++++
 2 files changed, 81 insertions(+), 14 deletions(-)
 create mode 100644 crates/goose/src/prompts/summarize_oneshot_old.md

diff --git a/crates/goose/src/prompts/summarize_oneshot.md b/crates/goose/src/prompts/summarize_oneshot.md
index 19214154264e..aeedfa38a2b1 100644
--- a/crates/goose/src/prompts/summarize_oneshot.md
+++ b/crates/goose/src/prompts/summarize_oneshot.md
@@ -1,18 +1,67 @@
-You are an expert at summarizing conversation histories for AI assistants. Your task is to create a concise but comprehensive summary that preserves all the essential information needed for future conversations.
+## 📝 Summary Generation Instructions
 
-## Guidelines:
-- Preserve key context, decisions, and outcomes
-- Maintain the chronological flow of important events
-- Include specific technical details, file names, code snippets, and configurations that were discussed
-- Retain user preferences and requirements that were established
-- Keep track of any ongoing tasks or unresolved issues
-- Summarize tool usage and their results when relevant
-- Maintain the conversational context and relationship dynamics
+Your task is to generate a comprehensive summary of the conversation so far, with close attention to the user's explicit requests and your own prior actions.  
+This summary must fully capture all technical details, code structures, and architectural decisions critical to resuming development work without losing context.
 
-## Format:
-Create a summary that reads naturally and can be used as context for continuing the conversation. The summary should be detailed enough that an AI assistant can pick up where the conversation left off without losing important context.
+Before presenting the final summary, enclose your reasoning in `<analysis>` tags to organize your thought process and confirm that you've addressed all required components.  
+During your analysis, follow this approach:
 
-Focus on being comprehensive rather than brief - it's better to include potentially relevant details than to lose important context.
+### 🔍 Analysis Process
 
-## Conversation to Summarize:
-{{ messages }}
+- Review the conversation **chronologically**, section by section.  
+- For each part, clearly identify:
+  - ✅ The user’s **explicit requests** and stated **intentions**
+  - 🛠️ Your **approach** and method for addressing those requests
+  - 🧠 Major **technical decisions**, **concepts**, and **design choices**
+  - 🧩 Specific technical elements such as:
+    - `file names`
+    - `complete code snippets`
+    - `function signatures`
+    - `code modifications`
+    - `errors encountered` and how they were resolved
+
+- 🔁 Pay special attention to **direct user feedback** — especially any revisions or corrections.
+- 📋 Double-check for **technical completeness and accuracy**, ensuring that **every required element** has been thoroughly addressed.
+
+## 📄 Required Sections in Your Summary
+
+### 1. **Primary Request and Intent**  
+Capture all of the user’s **core goals** and **specific requests** throughout the conversation.
+
+### 2. **Key Technical Concepts**  
+List all major **technical concepts**, **technologies**, **tools**, or **frameworks** discussed.
+
+### 3. **Files and Code Sections**  
+Detail the specific **files and code regions** that were viewed, changed, or created.  
+Include **full code snippets** where relevant, and explain **why** each change or file mattered.
+
+### 4. **Errors and Fixes**  
+List all **errors**, bugs, or unexpected behavior you encountered — and how you resolved them.  
+Call out any **user feedback** that led you to change your solution or debugging approach.
+
+### 5. **Problem Solving**  
+Summarize all **problems solved**, including any **ongoing troubleshooting** efforts.
+
+### 6. **All User Messages**  
+Include **all user messages** (excluding tool output).  
+These are essential for tracking **user feedback** and **shifts in intent**.
+
+### 7. **Pending Tasks**  
+List any **outstanding tasks** that the user explicitly asked you to work on.
+
+### 8. **Current Work**  
+Describe **precisely** what was being worked on **immediately before** the summary request.  
+Include:
+- File names
+- Code snippets
+- Specifics from the latest conversation  
+Make sure this ties directly to the user’s **latest instructions**.
+
+### 9. **Optional Next Step**  
+Only include this if:
+- It is a **direct continuation** of your last task
+- It clearly aligns with the user’s **explicit request**
+
+> **⚠️ Do not introduce new directions** unless confirmed with the user.
+
+If appropriate, include **verbatim quotes** from the recent conversation to show **where you left off**.
diff --git a/crates/goose/src/prompts/summarize_oneshot_old.md b/crates/goose/src/prompts/summarize_oneshot_old.md
new file mode 100644
index 000000000000..19214154264e
--- /dev/null
+++ b/crates/goose/src/prompts/summarize_oneshot_old.md
@@ -0,0 +1,18 @@
+You are an expert at summarizing conversation histories for AI assistants. Your task is to create a concise but comprehensive summary that preserves all the essential information needed for future conversations.
+
+## Guidelines:
+- Preserve key context, decisions, and outcomes
+- Maintain the chronological flow of important events
+- Include specific technical details, file names, code snippets, and configurations that were discussed
+- Retain user preferences and requirements that were established
+- Keep track of any ongoing tasks or unresolved issues
+- Summarize tool usage and their results when relevant
+- Maintain the conversational context and relationship dynamics
+
+## Format:
+Create a summary that reads naturally and can be used as context for continuing the conversation. The summary should be detailed enough that an AI assistant can pick up where the conversation left off without losing important context.
+
+Focus on being comprehensive rather than brief - it's better to include potentially relevant details than to lose important context.
+
+## Conversation to Summarize:
+{{ messages }}

From f8b37d0de538504b4d51120873d6eb85a0e3bdcf Mon Sep 17 00:00:00 2001
From: David Katz <dkatz@squareup.com>
Date: Mon, 21 Jul 2025 18:59:25 -0400
Subject: [PATCH 07/41] move token counter

---
 crates/goose-cli/src/session/mod.rs | 28 +---------------------------
 crates/goose/src/agents/agent.rs    | 24 ++++++++++++++++++++++++
 2 files changed, 25 insertions(+), 27 deletions(-)

diff --git a/crates/goose-cli/src/session/mod.rs b/crates/goose-cli/src/session/mod.rs
index 57f0aab575a8..a0091d08ea79 100644
--- a/crates/goose-cli/src/session/mod.rs
+++ b/crates/goose-cli/src/session/mod.rs
@@ -1439,7 +1439,7 @@ impl Session {
         }
 
         // Calculate actual current token count from messages
-        let current_token_count = self.calculate_current_token_count().await?;
+        let current_token_count = self.agent.get_current_token_count(&self.messages).await?;
         
         output::display_context_usage(current_token_count, context_limit);
 
@@ -1466,32 +1466,6 @@ impl Session {
         Ok(())
     }
 
-    /// Calculate the current token count of the messages in memory
-    async fn calculate_current_token_count(&self) -> Result<usize> {
-        use goose::token_counter::create_async_token_counter;
-        
-        let token_counter = create_async_token_counter().await
-            .map_err(|e| anyhow::anyhow!("Failed to create token counter: {}", e))?;
-        
-        // Get tools from agent
-        let tools = self.agent.list_tools(None).await;
-        
-        // For now, use a simplified system prompt calculation
-        // In a real implementation, we'd want to access the agent's prompt manager
-        // but since it's private, we'll use an empty system prompt as approximation
-        let system_prompt = ""; // This is a simplification
-        
-        let resources = vec![]; // No direct way to get resources currently
-        
-        let token_count = token_counter.count_everything(
-            system_prompt,
-            &self.messages,
-            &tools,
-            &resources,
-        );
-        
-        Ok(token_count)
-    }
 
     /// Handle prompt command execution
     async fn handle_prompt_command(&mut self, opts: input::PromptCommandOptions) -> Result<()> {
diff --git a/crates/goose/src/agents/agent.rs b/crates/goose/src/agents/agent.rs
index f74823473c5e..2ab18535b6b6 100644
--- a/crates/goose/src/agents/agent.rs
+++ b/crates/goose/src/agents/agent.rs
@@ -39,6 +39,7 @@ use crate::agents::platform_tools::{
     PLATFORM_SEARCH_AVAILABLE_EXTENSIONS_TOOL_NAME,
 };
 use crate::agents::prompt_manager::PromptManager;
+use crate::token_counter::create_async_token_counter;
 use crate::agents::router_tool_selector::{
     create_tool_selector, RouterToolSelectionStrategy, RouterToolSelector,
 };
@@ -172,6 +173,29 @@ impl Agent {
         }
     }
 
+    /// Get the current token count for the given messages using proper agent context
+    pub async fn get_current_token_count(&self, messages: &[Message]) -> Result<usize, anyhow::Error> {
+        let provider = self.provider().await?;
+        let token_counter = create_async_token_counter()
+            .await
+            .map_err(|e| anyhow::anyhow!("Failed to create token counter: {}", e))?;
+        
+        // Build the proper system prompt with agent context
+        let prompt_manager = self.prompt_manager.lock().await;
+        let system_prompt = prompt_manager.build_system_prompt(
+            vec![], // Extensions info - could be populated if needed
+            self.frontend_instructions.lock().await.clone(),
+            serde_json::Value::Null, // No extension disable prompt
+            Some(&provider.get_model_config().model_name),
+            None, // No tool selection strategy
+        );
+        
+        let tools = self.list_tools(None).await;
+        let resources = vec![]; // Resources could be populated if available
+        
+        Ok(token_counter.count_everything(&system_prompt, messages, &tools, &resources))
+    }
+
     /// Set the scheduler service for this agent
     pub async fn set_scheduler(&self, scheduler: Arc<dyn SchedulerTrait>) {
         let mut scheduler_service = self.scheduler_service.lock().await;

From e83cadca01ab4e66e8ebbe70a1f96bacfa90ce94 Mon Sep 17 00:00:00 2001
From: David Katz <dkatz@squareup.com>
Date: Mon, 21 Jul 2025 19:17:53 -0400
Subject: [PATCH 08/41] reset changes to mod.rs and agent.rs to focus on
 summarization algorithm

---
 crates/goose-cli/src/session/mod.rs | 28 +++++++++++++++++++++++++++-
 crates/goose/src/agents/agent.rs    | 24 ------------------------
 2 files changed, 27 insertions(+), 25 deletions(-)

diff --git a/crates/goose-cli/src/session/mod.rs b/crates/goose-cli/src/session/mod.rs
index a0091d08ea79..57f0aab575a8 100644
--- a/crates/goose-cli/src/session/mod.rs
+++ b/crates/goose-cli/src/session/mod.rs
@@ -1439,7 +1439,7 @@ impl Session {
         }
 
         // Calculate actual current token count from messages
-        let current_token_count = self.agent.get_current_token_count(&self.messages).await?;
+        let current_token_count = self.calculate_current_token_count().await?;
         
         output::display_context_usage(current_token_count, context_limit);
 
@@ -1466,6 +1466,32 @@ impl Session {
         Ok(())
     }
 
+    /// Calculate the current token count of the messages in memory
+    async fn calculate_current_token_count(&self) -> Result<usize> {
+        use goose::token_counter::create_async_token_counter;
+        
+        let token_counter = create_async_token_counter().await
+            .map_err(|e| anyhow::anyhow!("Failed to create token counter: {}", e))?;
+        
+        // Get tools from agent
+        let tools = self.agent.list_tools(None).await;
+        
+        // For now, use a simplified system prompt calculation
+        // In a real implementation, we'd want to access the agent's prompt manager
+        // but since it's private, we'll use an empty system prompt as approximation
+        let system_prompt = ""; // This is a simplification
+        
+        let resources = vec![]; // No direct way to get resources currently
+        
+        let token_count = token_counter.count_everything(
+            system_prompt,
+            &self.messages,
+            &tools,
+            &resources,
+        );
+        
+        Ok(token_count)
+    }
 
     /// Handle prompt command execution
     async fn handle_prompt_command(&mut self, opts: input::PromptCommandOptions) -> Result<()> {
diff --git a/crates/goose/src/agents/agent.rs b/crates/goose/src/agents/agent.rs
index 2ab18535b6b6..f74823473c5e 100644
--- a/crates/goose/src/agents/agent.rs
+++ b/crates/goose/src/agents/agent.rs
@@ -39,7 +39,6 @@ use crate::agents::platform_tools::{
     PLATFORM_SEARCH_AVAILABLE_EXTENSIONS_TOOL_NAME,
 };
 use crate::agents::prompt_manager::PromptManager;
-use crate::token_counter::create_async_token_counter;
 use crate::agents::router_tool_selector::{
     create_tool_selector, RouterToolSelectionStrategy, RouterToolSelector,
 };
@@ -173,29 +172,6 @@ impl Agent {
         }
     }
 
-    /// Get the current token count for the given messages using proper agent context
-    pub async fn get_current_token_count(&self, messages: &[Message]) -> Result<usize, anyhow::Error> {
-        let provider = self.provider().await?;
-        let token_counter = create_async_token_counter()
-            .await
-            .map_err(|e| anyhow::anyhow!("Failed to create token counter: {}", e))?;
-        
-        // Build the proper system prompt with agent context
-        let prompt_manager = self.prompt_manager.lock().await;
-        let system_prompt = prompt_manager.build_system_prompt(
-            vec![], // Extensions info - could be populated if needed
-            self.frontend_instructions.lock().await.clone(),
-            serde_json::Value::Null, // No extension disable prompt
-            Some(&provider.get_model_config().model_name),
-            None, // No tool selection strategy
-        );
-        
-        let tools = self.list_tools(None).await;
-        let resources = vec![]; // Resources could be populated if available
-        
-        Ok(token_counter.count_everything(&system_prompt, messages, &tools, &resources))
-    }
-
     /// Set the scheduler service for this agent
     pub async fn set_scheduler(&self, scheduler: Arc<dyn SchedulerTrait>) {
         let mut scheduler_service = self.scheduler_service.lock().await;

From 324c51187c3cae37ffdcb99a245d68b47efe52ff Mon Sep 17 00:00:00 2001
From: David Katz <dkatz@squareup.com>
Date: Tue, 22 Jul 2025 15:02:14 -0400
Subject: [PATCH 09/41] rm old summarizer

---
 .../goose/src/prompts/summarize_oneshot_old.md | 18 ------------------
 1 file changed, 18 deletions(-)
 delete mode 100644 crates/goose/src/prompts/summarize_oneshot_old.md

diff --git a/crates/goose/src/prompts/summarize_oneshot_old.md b/crates/goose/src/prompts/summarize_oneshot_old.md
deleted file mode 100644
index 19214154264e..000000000000
--- a/crates/goose/src/prompts/summarize_oneshot_old.md
+++ /dev/null
@@ -1,18 +0,0 @@
-You are an expert at summarizing conversation histories for AI assistants. Your task is to create a concise but comprehensive summary that preserves all the essential information needed for future conversations.
-
-## Guidelines:
-- Preserve key context, decisions, and outcomes
-- Maintain the chronological flow of important events
-- Include specific technical details, file names, code snippets, and configurations that were discussed
-- Retain user preferences and requirements that were established
-- Keep track of any ongoing tasks or unresolved issues
-- Summarize tool usage and their results when relevant
-- Maintain the conversational context and relationship dynamics
-
-## Format:
-Create a summary that reads naturally and can be used as context for continuing the conversation. The summary should be detailed enough that an AI assistant can pick up where the conversation left off without losing important context.
-
-Focus on being comprehensive rather than brief - it's better to include potentially relevant details than to lose important context.
-
-## Conversation to Summarize:
-{{ messages }}

From 5546bef282a7d3c322f8dbc3c4f4cb31ac6c5d34 Mon Sep 17 00:00:00 2001
From: David Katz <dkatz@squareup.com>
Date: Tue, 22 Jul 2025 15:32:51 -0400
Subject: [PATCH 10/41] shrink token counting

---
 crates/goose-cli/src/session/mod.rs        | 38 ++++++++++------------
 crates/goose/src/context_mgmt/summarize.rs | 10 +++---
 2 files changed, 23 insertions(+), 25 deletions(-)

diff --git a/crates/goose-cli/src/session/mod.rs b/crates/goose-cli/src/session/mod.rs
index 57f0aab575a8..86964dd7078b 100644
--- a/crates/goose-cli/src/session/mod.rs
+++ b/crates/goose-cli/src/session/mod.rs
@@ -1438,17 +1438,19 @@ impl Session {
             );
         }
 
-        // Calculate actual current token count from messages
+        // Calculate actual current token count from messages - this is always up-to-date
         let current_token_count = self.calculate_current_token_count().await?;
         
         output::display_context_usage(current_token_count, context_limit);
 
         if show_cost {
-            // For cost display, try to use metadata if available, otherwise fall back to current count
+            // For cost display, also use metadata if available for accumulated totals
+            // but the current token count is always the most accurate for the current state
             match self.get_metadata() {
                 Ok(metadata) => {
-                    let input_tokens = metadata.input_tokens.unwrap_or(0) as usize;
-                    let output_tokens = metadata.output_tokens.unwrap_or(0) as usize;
+                    // Use accumulated totals from metadata if available, otherwise fall back to current count
+                    let input_tokens = metadata.accumulated_input_tokens.unwrap_or(0) as usize;
+                    let output_tokens = metadata.accumulated_output_tokens.unwrap_or(0) as usize;
                     output::display_cost_usage(
                         &provider_name,
                         &model_config.model_name,
@@ -1466,29 +1468,23 @@ impl Session {
         Ok(())
     }
 
-    /// Calculate the current token count of the messages in memory
+    /// Calculate the current token count of the session messages in memory
+    /// This only counts the actual conversation messages, not system prompts or tools
+    /// Returns 0 at the start of a session when there are no messages
     async fn calculate_current_token_count(&self) -> Result<usize> {
+        // If no messages, return 0 (clean start)
+        if self.messages.is_empty() {
+            return Ok(0);
+        }
+        
         use goose::token_counter::create_async_token_counter;
         
         let token_counter = create_async_token_counter().await
             .map_err(|e| anyhow::anyhow!("Failed to create token counter: {}", e))?;
         
-        // Get tools from agent
-        let tools = self.agent.list_tools(None).await;
-        
-        // For now, use a simplified system prompt calculation
-        // In a real implementation, we'd want to access the agent's prompt manager
-        // but since it's private, we'll use an empty system prompt as approximation
-        let system_prompt = ""; // This is a simplification
-        
-        let resources = vec![]; // No direct way to get resources currently
-        
-        let token_count = token_counter.count_everything(
-            system_prompt,
-            &self.messages,
-            &tools,
-            &resources,
-        );
+        // Only count the session messages without system prompt or tools
+        // This gives us a clean count of just the conversation content
+        let token_count = token_counter.count_chat_tokens("", &self.messages, &[]);
         
         Ok(token_count)
     }
diff --git a/crates/goose/src/context_mgmt/summarize.rs b/crates/goose/src/context_mgmt/summarize.rs
index 9fcdd024e6fa..b78f7468ad38 100644
--- a/crates/goose/src/context_mgmt/summarize.rs
+++ b/crates/goose/src/context_mgmt/summarize.rs
@@ -676,10 +676,12 @@ mod tests {
                 Message::new(
                     Role::Assistant,
                     Utc::now().timestamp(),
-                    vec![MessageContent::Text(TextContent {
-                        text: "Chunked summary".to_string(),
-                        annotations: None,
-                    })],
+                    vec![MessageContent::Text(
+                        RawTextContent {
+                            text: "Chunked summary".to_string(),
+                        }
+                        .no_annotation(),
+                    )],
                 ),
                 ProviderUsage::new("mock".to_string(), Usage::default()),
             ))

From dc668bb1375ab7d3483f41b2d6616a769f2b759e Mon Sep 17 00:00:00 2001
From: David Katz <dkatz@squareup.com>
Date: Tue, 22 Jul 2025 16:02:45 -0400
Subject: [PATCH 11/41] reset token counting

---
 crates/goose-cli/src/session/mod.rs | 49 +++++++----------------------
 1 file changed, 12 insertions(+), 37 deletions(-)

diff --git a/crates/goose-cli/src/session/mod.rs b/crates/goose-cli/src/session/mod.rs
index 86964dd7078b..a260e8f16774 100644
--- a/crates/goose-cli/src/session/mod.rs
+++ b/crates/goose-cli/src/session/mod.rs
@@ -1438,19 +1438,15 @@ impl Session {
             );
         }
 
-        // Calculate actual current token count from messages - this is always up-to-date
-        let current_token_count = self.calculate_current_token_count().await?;
-        
-        output::display_context_usage(current_token_count, context_limit);
-
-        if show_cost {
-            // For cost display, also use metadata if available for accumulated totals
-            // but the current token count is always the most accurate for the current state
-            match self.get_metadata() {
-                Ok(metadata) => {
-                    // Use accumulated totals from metadata if available, otherwise fall back to current count
-                    let input_tokens = metadata.accumulated_input_tokens.unwrap_or(0) as usize;
-                    let output_tokens = metadata.accumulated_output_tokens.unwrap_or(0) as usize;
+        match self.get_metadata() {
+            Ok(metadata) => {
+                let total_tokens = metadata.total_tokens.unwrap_or(0) as usize;
+
+                output::display_context_usage(total_tokens, context_limit);
+
+                if show_cost {
+                    let input_tokens = metadata.input_tokens.unwrap_or(0) as usize;
+                    let output_tokens = metadata.output_tokens.unwrap_or(0) as usize;
                     output::display_cost_usage(
                         &provider_name,
                         &model_config.model_name,
@@ -1459,36 +1455,15 @@ impl Session {
                     )
                     .await;
                 }
-                Err(_) => {
-                    // If no metadata available, we can't show cost breakdown
-                }
+            }
+            Err(_) => {
+                output::display_context_usage(0, context_limit);
             }
         }
 
         Ok(())
     }
 
-    /// Calculate the current token count of the session messages in memory
-    /// This only counts the actual conversation messages, not system prompts or tools
-    /// Returns 0 at the start of a session when there are no messages
-    async fn calculate_current_token_count(&self) -> Result<usize> {
-        // If no messages, return 0 (clean start)
-        if self.messages.is_empty() {
-            return Ok(0);
-        }
-        
-        use goose::token_counter::create_async_token_counter;
-        
-        let token_counter = create_async_token_counter().await
-            .map_err(|e| anyhow::anyhow!("Failed to create token counter: {}", e))?;
-        
-        // Only count the session messages without system prompt or tools
-        // This gives us a clean count of just the conversation content
-        let token_count = token_counter.count_chat_tokens("", &self.messages, &[]);
-        
-        Ok(token_count)
-    }
-
     /// Handle prompt command execution
     async fn handle_prompt_command(&mut self, opts: input::PromptCommandOptions) -> Result<()> {
         // name is required

From 7b6f17edd2f4d7edf5da95847757483707bf3ec7 Mon Sep 17 00:00:00 2001
From: David Katz <dkatz@squareup.com>
Date: Tue, 22 Jul 2025 16:11:35 -0400
Subject: [PATCH 12/41] fmt

---
 crates/goose/src/context_mgmt/summarize.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crates/goose/src/context_mgmt/summarize.rs b/crates/goose/src/context_mgmt/summarize.rs
index b78f7468ad38..ba78aa521ba0 100644
--- a/crates/goose/src/context_mgmt/summarize.rs
+++ b/crates/goose/src/context_mgmt/summarize.rs
@@ -4,8 +4,8 @@ use crate::prompt_template::render_global_file;
 use crate::providers::base::Provider;
 use crate::token_counter::{AsyncTokenCounter, TokenCounter};
 use anyhow::Result;
-use serde::Serialize;
 use rmcp::model::Role;
+use serde::Serialize;
 use std::sync::Arc;
 
 // Constants for the summarization prompt and a follow-up user message.

From f71fe311b10b2725b206668950446042ef3ca50e Mon Sep 17 00:00:00 2001
From: David Katz <dkatz@squareup.com>
Date: Tue, 22 Jul 2025 17:12:44 -0400
Subject: [PATCH 13/41] fix test

---
 crates/goose/src/context_mgmt/summarize.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crates/goose/src/context_mgmt/summarize.rs b/crates/goose/src/context_mgmt/summarize.rs
index ba78aa521ba0..2f24ca94542c 100644
--- a/crates/goose/src/context_mgmt/summarize.rs
+++ b/crates/goose/src/context_mgmt/summarize.rs
@@ -665,7 +665,7 @@ mod tests {
             *count += 1;
 
             // Fail if this looks like a one-shot request (contains the one-shot prompt content)
-            if system.contains("expert at summarizing conversation histories") {
+            if system.contains("Summary Generation Instructions") {
                 return Err(ProviderError::RateLimitExceeded(
                     "Simulated one-shot failure".to_string(),
                 ));

From 970b197bf38a478e90cfd471e861e717e7751676 Mon Sep 17 00:00:00 2001
From: Michael Neale <michael.neale@gmail.com>
Date: Fri, 25 Jul 2025 03:00:33 +1000
Subject: [PATCH 14/41] appending to pr: adds auto summarize to one shot (#3600)

Co-authored-by: Zane <75694352+zanesq@users.noreply.github.com>
Co-authored-by: dianed-square <73617011+dianed-square@users.noreply.github.com>
Co-authored-by: Rizel Scarlett <rizel@squareup.com>
Co-authored-by: Lifei Zhou <lifei@squareup.com>
Co-authored-by: David Katz <dkatz@squareup.com>
---
 crates/goose-cli/src/commands/web.rs          |  40 +++
 crates/goose-cli/src/session/mod.rs           |  36 ++
 crates/goose-server/src/routes/reply.rs       |  38 +-
 crates/goose/src/context_mgmt/auto_compact.rs | 336 ++++++++++++++++++
 crates/goose/src/context_mgmt/mod.rs          |   1 +
 crates/goose/src/context_mgmt/summarize.rs    |  58 ++-
 6 files changed, 471 insertions(+), 38 deletions(-)
 create mode 100644 crates/goose/src/context_mgmt/auto_compact.rs

diff --git a/crates/goose-cli/src/commands/web.rs b/crates/goose-cli/src/commands/web.rs
index ba5206b4ba2e..bac587e16b76 100644
--- a/crates/goose-cli/src/commands/web.rs
+++ b/crates/goose-cli/src/commands/web.rs
@@ -10,6 +10,7 @@ use axum::{
 };
 use futures::{sink::SinkExt, stream::StreamExt};
 use goose::agents::{Agent, AgentEvent};
+use goose::context_mgmt::auto_compact::check_and_compact_messages;
 use goose::message::Message as GooseMessage;
 use goose::session;
 use serde::{Deserialize, Serialize};
@@ -455,6 +456,45 @@ async fn process_message_streaming(
         session_msgs.clone()
     };
 
+    // Check and compact messages if needed before calling reply
+    let compact_result = check_and_compact_messages(agent, &messages, None).await?;
+    if compact_result.compacted {
+        messages = compact_result.messages.clone();
+
+        // Update session messages
+        {
+            let mut session_msgs = session_messages.lock().await;
+            *session_msgs = compact_result.messages;
+        }
+
+        // Notify client of compaction
+        let msg = if let (Some(before), Some(after)) =
+            (compact_result.tokens_before, compact_result.tokens_after)
+        {
+            format!(
+                "Auto-compacted context: {} → {} tokens ({:.0}% reduction)",
+                before,
+                after,
+                (1.0 - (after as f64 / before as f64)) * 100.0
+            )
+        } else {
+            "Auto-compacted context to prevent overflow".to_string()
+        };
+
+        let mut sender_lock = sender.lock().await;
+        let _ = sender_lock
+            .send(Message::Text(
+                serde_json::to_string(&WebSocketMessage::Response {
+                    content: msg,
+                    role: "system".to_string(),
+                    timestamp: chrono::Utc::now().timestamp_millis(),
+                })
+                .unwrap()
+                .into(),
+            ))
+            .await;
+    }
+
     // Persist messages to JSONL file with provider for automatic description generation
     let provider = agent.provider().await;
     if provider.is_err() {
diff --git a/crates/goose-cli/src/session/mod.rs b/crates/goose-cli/src/session/mod.rs
index 6ebc79547008..fb6f8a318290 100644
--- a/crates/goose-cli/src/session/mod.rs
+++ b/crates/goose-cli/src/session/mod.rs
@@ -16,6 +16,7 @@ pub use self::export::message_to_markdown;
 pub use builder::{build_session, SessionBuilderConfig, SessionSettings};
 use console::Color;
 use goose::agents::AgentEvent;
+use goose::context_mgmt::auto_compact::check_and_compact_messages;
 use goose::message::push_message;
 use goose::permission::permission_confirmation::PrincipalType;
 use goose::permission::Permission;
@@ -840,6 +841,41 @@ impl Session {
     }
 
     async fn process_agent_response(&mut self, interactive: bool) -> Result<()> {
+        // Check and compact messages if needed before calling reply
+        let compact_result = check_and_compact_messages(&self.agent, &self.messages, None).await?;
+        if compact_result.compacted {
+            self.messages = compact_result.messages;
+
+            // Notify user of compaction
+            let msg = if let (Some(before), Some(after)) =
+                (compact_result.tokens_before, compact_result.tokens_after)
+            {
+                format!(
+                    "Auto-compacted context: {} → {} tokens ({:.0}% reduction)",
+                    before,
+                    after,
+                    (1.0 - (after as f64 / before as f64)) * 100.0
+                )
+            } else {
+                "Auto-compacted context to prevent overflow".to_string()
+            };
+            output::render_text(&msg, Some(Color::Yellow), true);
+
+            // Persist the compacted messages
+            if let Some(session_file) = &self.session_file {
+                let working_dir = std::env::current_dir().ok();
+                let provider = self.agent.provider().await.ok();
+                session::persist_messages_with_schedule_id(
+                    session_file,
+                    &self.messages,
+                    provider,
+                    self.scheduled_job_id.clone(),
+                    working_dir,
+                )
+                .await?;
+            }
+        }
+
         let cancel_token = CancellationToken::new();
         let cancel_token_clone = cancel_token.clone();
 
diff --git a/crates/goose-server/src/routes/reply.rs b/crates/goose-server/src/routes/reply.rs
index 24a1f7eb104d..63140ac5656e 100644
--- a/crates/goose-server/src/routes/reply.rs
+++ b/crates/goose-server/src/routes/reply.rs
@@ -11,6 +11,7 @@ use bytes::Bytes;
 use futures::{stream::StreamExt, Stream};
 use goose::{
     agents::{AgentEvent, SessionConfig},
+    context_mgmt::auto_compact::check_and_compact_messages,
     message::{push_message, Message},
     permission::permission_confirmation::PrincipalType,
 };
@@ -159,8 +160,43 @@ async fn reply_handler(
             retry_config: None,
         };
 
+        // Check and compact messages if needed before calling reply
+        let mut messages_to_process = messages.clone();
+        let compact_result = check_and_compact_messages(&agent, &messages_to_process, None).await;
+        if let Ok(result) = compact_result {
+            if result.compacted {
+                messages_to_process = result.messages;
+
+                // Notify client of compaction
+                let msg = if let (Some(before), Some(after)) =
+                    (result.tokens_before, result.tokens_after)
+                {
+                    format!(
+                        "Auto-compacted context: {} → {} tokens ({:.0}% reduction)",
+                        before,
+                        after,
+                        (1.0 - (after as f64 / before as f64)) * 100.0
+                    )
+                } else {
+                    "Auto-compacted context to prevent overflow".to_string()
+                };
+
+                let _ = stream_event(
+                    MessageEvent::Message {
+                        message: Message::assistant().with_text(&msg),
+                    },
+                    &task_tx,
+                )
+                .await;
+            }
+        }
+
         let mut stream = match agent
-            .reply(&messages, Some(session_config), Some(task_cancel.clone()))
+            .reply(
+                &messages_to_process,
+                Some(session_config),
+                Some(task_cancel.clone()),
+            )
             .await
         {
             Ok(stream) => stream,
diff --git a/crates/goose/src/context_mgmt/auto_compact.rs b/crates/goose/src/context_mgmt/auto_compact.rs
new file mode 100644
index 000000000000..6c45aa9ac1c4
--- /dev/null
+++ b/crates/goose/src/context_mgmt/auto_compact.rs
@@ -0,0 +1,336 @@
+use crate::{
+    agents::Agent,
+    config::Config,
+    context_mgmt::{estimate_target_context_limit, get_messages_token_counts_async},
+    message::Message,
+    token_counter::create_async_token_counter,
+};
+use anyhow::Result;
+use tracing::{debug, info};
+
+/// Outcome of an auto-compaction check
+#[derive(Debug)]
+pub struct AutoCompactResult {
+    /// Whether compaction was performed
+    pub compacted: bool,
+    /// The message history to continue with: the summarized messages when `compacted` is true
+    pub messages: Vec<Message>,
+    /// Total token count measured before compaction (`None` when nothing was compacted)
+    pub tokens_before: Option<usize>,
+    /// Total token count of the compacted messages (`None` when nothing was compacted)
+    pub tokens_after: Option<usize>,
+}
+
+/// Check whether `messages` need compaction and summarize them if they do.
+///
+/// Token usage is compared against a threshold ratio of the estimated context limit.
+/// A threshold outside the open interval (0, 1), or NaN, disables auto-compaction.
+///
+/// # Arguments
+/// * `agent` - Supplies the provider and performs the summarization
+/// * `messages` - The current message history
+/// * `threshold_override` - Replaces the `GOOSE_AUTO_COMPACT_THRESHOLD` config value (default 0.3)
+///
+/// # Returns
+/// * `AutoCompactResult`, or an error if the provider, token counter or summarization fails
+pub async fn check_and_compact_messages(
+    agent: &Agent,
+    messages: &[Message],
+    threshold_override: Option<f64>,
+) -> Result<AutoCompactResult> {
+    // An explicit override wins; otherwise fall back to the config value
+    let config = Config::global();
+    let threshold = threshold_override.unwrap_or_else(|| {
+        config
+            .get_param::<f64>("GOOSE_AUTO_COMPACT_THRESHOLD")
+            .unwrap_or(0.3) // Default to 30%
+    });
+
+    // Anything outside (0, 1) disables auto-compaction; NaN would otherwise slip through
+    if threshold.is_nan() || threshold <= 0.0 || threshold >= 1.0 {
+        debug!("Auto-compaction disabled (threshold: {})", threshold);
+        return Ok(AutoCompactResult {
+            compacted: false,
+            messages: messages.to_vec(),
+            tokens_before: None,
+            tokens_after: None,
+        });
+    }
+
+    // Get provider and token counter
+    let provider = agent.provider().await?;
+    let token_counter = create_async_token_counter()
+        .await
+        .map_err(|e| anyhow::anyhow!("Failed to create token counter: {}", e))?;
+
+    // Calculate current token usage
+    let token_counts = get_messages_token_counts_async(&token_counter, messages);
+    let total_tokens: usize = token_counts.iter().sum();
+    let context_limit = estimate_target_context_limit(provider);
+
+    // Clamp the divisor so a zero limit cannot produce NaN (0/0) and bypass the check below
+    let usage_ratio = total_tokens as f64 / context_limit.max(1) as f64;
+
+    debug!(
+        "Context usage: {} / {} ({:.1}%)",
+        total_tokens,
+        context_limit,
+        usage_ratio * 100.0
+    );
+
+    // Check if compaction is needed
+    if usage_ratio <= threshold {
+        debug!(
+            "No compaction needed (usage: {:.1}% <= threshold: {:.1}%)",
+            usage_ratio * 100.0,
+            threshold * 100.0
+        );
+        return Ok(AutoCompactResult {
+            compacted: false,
+            messages: messages.to_vec(),
+            tokens_before: None,
+            tokens_after: None,
+        });
+    }
+
+    info!(
+        "Auto-compacting messages (usage: {:.1}% > threshold: {:.1}%)",
+        usage_ratio * 100.0,
+        threshold * 100.0
+    );
+
+    // Perform compaction (total_tokens > 0 here, so the reduction ratio below is well defined)
+    let (compacted_messages, compacted_token_counts) = agent.summarize_context(messages).await?;
+    let tokens_after: usize = compacted_token_counts.iter().sum();
+
+    info!(
+        "Compaction complete: {} tokens -> {} tokens ({:.1}% reduction)",
+        total_tokens,
+        tokens_after,
+        (1.0 - (tokens_after as f64 / total_tokens as f64)) * 100.0
+    );
+
+    Ok(AutoCompactResult {
+        compacted: true,
+        messages: compacted_messages,
+        tokens_before: Some(total_tokens),
+        tokens_after: Some(tokens_after),
+    })
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::{
+        agents::Agent,
+        message::{Message, MessageContent},
+        model::ModelConfig,
+        providers::base::{Provider, ProviderMetadata, ProviderUsage, Usage},
+        providers::errors::ProviderError,
+    };
+    use chrono::Utc;
+    use mcp_core::tool::Tool;
+    use rmcp::model::{AnnotateAble, RawTextContent, Role};
+    use std::sync::Arc;
+
+    #[derive(Clone)]
+    struct MockProvider {
+        model_config: ModelConfig,
+    }
+
+    #[async_trait::async_trait]
+    impl Provider for MockProvider {
+        fn metadata() -> ProviderMetadata {
+            ProviderMetadata::empty()
+        }
+
+        fn get_model_config(&self) -> ModelConfig {
+            self.model_config.clone()
+        }
+
+        async fn complete(
+            &self,
+            _system: &str,
+            _messages: &[Message],
+            _tools: &[Tool],
+        ) -> Result<(Message, ProviderUsage), ProviderError> {
+            // Always answer with the same short summary, whatever the input
+            Ok((
+                Message::new(
+                    Role::Assistant,
+                    Utc::now().timestamp(),
+                    vec![MessageContent::Text(
+                        RawTextContent {
+                            text: "Summary of conversation".to_string(),
+                        }
+                        .no_annotation(),
+                    )],
+                ),
+                ProviderUsage::new("mock".to_string(), Usage::default()),
+            ))
+        }
+    }
+
+    fn create_test_message(text: &str) -> Message {
+        Message::new(
+            Role::User,
+            Utc::now().timestamp(),
+            vec![MessageContent::text(text.to_string())],
+        )
+    }
+
+    #[tokio::test]
+    async fn test_auto_compact_disabled() {
+        let mock_provider = Arc::new(MockProvider {
+            model_config: ModelConfig::new("test-model".to_string())
+                .with_context_limit(10_000.into()),
+        });
+
+        let agent = Agent::new();
+        let _ = agent.update_provider(mock_provider).await;
+
+        let messages = vec![create_test_message("Hello"), create_test_message("World")];
+
+        // Test with threshold 0 (disabled)
+        let result = check_and_compact_messages(&agent, &messages, Some(0.0))
+            .await
+            .unwrap();
+
+        assert!(!result.compacted);
+        assert_eq!(result.messages.len(), messages.len());
+        assert!(result.tokens_before.is_none());
+        assert!(result.tokens_after.is_none());
+
+        // Test with threshold 1.0 (disabled)
+        let result = check_and_compact_messages(&agent, &messages, Some(1.0))
+            .await
+            .unwrap();
+
+        assert!(!result.compacted);
+    }
+
+    #[tokio::test]
+    async fn test_auto_compact_below_threshold() {
+        let mock_provider = Arc::new(MockProvider {
+            model_config: ModelConfig::new("test-model".to_string())
+                .with_context_limit(100_000.into()), // Increased to ensure overhead doesn't dominate
+        });
+
+        let agent = Agent::new();
+        let _ = agent.update_provider(mock_provider).await;
+
+        // Create small messages that won't trigger compaction
+        let messages = vec![create_test_message("Hello"), create_test_message("World")];
+
+        let result = check_and_compact_messages(&agent, &messages, Some(0.3))
+            .await
+            .unwrap();
+
+        assert!(!result.compacted);
+        assert_eq!(result.messages.len(), messages.len());
+    }
+
+    #[tokio::test]
+    async fn test_auto_compact_above_threshold() {
+        let mock_provider = Arc::new(MockProvider {
+            model_config: ModelConfig::new("test-model".to_string())
+                .with_context_limit(50_000.into()), // Realistic context limit that won't underflow
+        });
+
+        let agent = Agent::new();
+        let _ = agent.update_provider(mock_provider).await;
+
+        // Create messages that will exceed 30% of the context limit
+        // With 50k context limit, after overhead we have ~27k usable tokens
+        // 30% of that is ~8.1k tokens, so we need messages that exceed that
+        let mut messages = Vec::new();
+
+        // Create longer messages with more content to reach the threshold
+        for i in 0..200 {
+            messages.push(create_test_message(&format!(
+                "This is message number {} with significantly more content to increase token count. \
+                 We need to ensure that our total token usage exceeds 30% of the available context \
+                 limit after accounting for system prompt and tools overhead. This message contains \
+                 multiple sentences to increase the token count substantially.",
+                i
+            )));
+        }
+
+        let result = check_and_compact_messages(&agent, &messages, Some(0.3))
+            .await
+            .unwrap();
+
+        assert!(result.compacted);
+        assert!(result.tokens_before.is_some());
+        assert!(result.tokens_after.is_some());
+
+        // Should have fewer tokens after compaction
+        if let (Some(before), Some(after)) = (result.tokens_before, result.tokens_after) {
+            assert!(
+                after < before,
+                "Token count should decrease after compaction"
+            );
+        }
+
+        // Should not end up with more messages than we started with
+        assert!(result.messages.len() <= messages.len());
+    }
+
+    #[tokio::test]
+    async fn test_auto_compact_respects_config() {
+        let mock_provider = Arc::new(MockProvider {
+            model_config: ModelConfig::new("test-model".to_string())
+                .with_context_limit(50_000.into()), // Realistic context limit that won't underflow
+        });
+
+        let agent = Agent::new();
+        let _ = agent.update_provider(mock_provider).await;
+
+        // Create enough messages to trigger compaction with low threshold
+        let mut messages = Vec::new();
+        // Need to create more messages since we have a 27k usable token limit
+        // 10% of 27k = 2.7k tokens
+        for i in 0..150 {
+            messages.push(create_test_message(&format!(
+                "Message {} with enough content to ensure we exceed 10% of the context limit. Adding more content.",
+                i
+            )));
+        }
+
+        // Set config value
+        let config = Config::global();
+        config
+            .set_param("GOOSE_AUTO_COMPACT_THRESHOLD", serde_json::Value::from(0.1))
+            .unwrap();
+
+        // Should use config value when no override provided
+        let outcome = check_and_compact_messages(&agent, &messages, None).await;
+
+        // Reset the threshold to 0.3 before unwrapping or asserting, so that a failure
+        // below cannot leave the 10% value behind in the global config
+        config
+            .set_param("GOOSE_AUTO_COMPACT_THRESHOLD", serde_json::Value::from(0.3))
+            .unwrap();
+        let result = outcome.unwrap();
+
+        // Debug info if not compacted
+        if !result.compacted {
+            let provider = agent.provider().await.unwrap();
+            let token_counter = create_async_token_counter().await.unwrap();
+            let token_counts = get_messages_token_counts_async(&token_counter, &messages);
+            let total_tokens: usize = token_counts.iter().sum();
+            let context_limit = estimate_target_context_limit(provider);
+            let usage_ratio = total_tokens as f64 / context_limit as f64;
+
+            eprintln!(
+                "Config test not compacted - tokens: {} / {} ({:.1}%)",
+                total_tokens,
+                context_limit,
+                usage_ratio * 100.0
+            );
+        }
+
+        // With such a low threshold (10%), it should compact
+        assert!(result.compacted);
+    }
+}
diff --git a/crates/goose/src/context_mgmt/mod.rs b/crates/goose/src/context_mgmt/mod.rs
index 838e27fece54..00d11d6b871b 100644
--- a/crates/goose/src/context_mgmt/mod.rs
+++ b/crates/goose/src/context_mgmt/mod.rs
@@ -1,3 +1,4 @@
+pub mod auto_compact;
 mod common;
 pub mod summarize;
 pub mod truncate;
diff --git a/crates/goose/src/context_mgmt/summarize.rs b/crates/goose/src/context_mgmt/summarize.rs
index 2f24ca94542c..27f74fbbbd6f 100644
--- a/crates/goose/src/context_mgmt/summarize.rs
+++ b/crates/goose/src/context_mgmt/summarize.rs
@@ -116,19 +116,13 @@ pub async fn summarize_messages_oneshot(
     token_counter: &TokenCounter,
     _context_limit: usize,
 ) -> Result<(Vec<Message>, Vec<usize>), anyhow::Error> {
-    // Preprocess messages to handle tool response edge case.
-    let (preprocessed_messages, removed_messages) = preprocess_messages(messages);
-
-    if preprocessed_messages.is_empty() {
-        // If no messages to summarize, just return the removed messages
-        return Ok((
-            removed_messages.clone(),
-            get_messages_token_counts(token_counter, &removed_messages),
-        ));
+    if messages.is_empty() {
+        // If no messages to summarize, return empty
+        return Ok((vec![], vec![]));
     }
 
     // Format all messages as a single string for the summarization prompt
-    let messages_text = preprocessed_messages
+    let messages_text = messages
         .iter()
         .map(|msg| format!("{:?}", msg))
         .collect::<Vec<_>>()
@@ -155,8 +149,8 @@ pub async fn summarize_messages_oneshot(
     // Set role to user as it will be used in following conversation as user content.
     response.role = Role::User;
 
-    // Add back removed messages.
-    let final_summary = reintegrate_removed_messages(&[response], &removed_messages);
+    // Return just the summary without any tool response preservation
+    let final_summary = vec![response];
 
     Ok((
         final_summary.clone(),
@@ -180,17 +174,14 @@ pub async fn summarize_messages_chunked(
     let summary_prompt_tokens = token_counter.count_tokens(SUMMARY_PROMPT);
     let mut accumulated_summary = Vec::new();
 
-    // Preprocess messages to handle tool response edge case.
-    let (preprocessed_messages, removed_messages) = preprocess_messages(messages);
-
     // Get token counts for each message.
-    let token_counts = get_messages_token_counts(token_counter, &preprocessed_messages);
+    let token_counts = get_messages_token_counts(token_counter, messages);
 
     // Tokenize and break messages into chunks.
     let mut current_chunk: Vec<Message> = Vec::new();
     let mut current_chunk_tokens = 0;
 
-    for (message, message_tokens) in preprocessed_messages.iter().zip(token_counts.iter()) {
+    for (message, message_tokens) in messages.iter().zip(token_counts.iter()) {
         if current_chunk_tokens + message_tokens > chunk_size - summary_prompt_tokens {
             // Summarize the current chunk with the accumulated summary.
             accumulated_summary =
@@ -213,12 +204,10 @@ pub async fn summarize_messages_chunked(
             summarize_combined_messages(&provider, &accumulated_summary, &current_chunk).await?;
     }
 
-    // Add back removed messages.
-    let final_summary = reintegrate_removed_messages(&accumulated_summary, &removed_messages);
-
+    // Return just the summary without any tool response preservation
     Ok((
-        final_summary.clone(),
-        get_messages_token_counts(token_counter, &final_summary),
+        accumulated_summary.clone(),
+        get_messages_token_counts(token_counter, &accumulated_summary),
     ))
 }
 
@@ -281,17 +270,14 @@ pub async fn summarize_messages_async(
     let summary_prompt_tokens = token_counter.count_tokens(SUMMARY_PROMPT);
     let mut accumulated_summary = Vec::new();
 
-    // Preprocess messages to handle tool response edge case.
-    let (preprocessed_messages, removed_messages) = preprocess_messages(messages);
-
     // Get token counts for each message.
-    let token_counts = get_messages_token_counts_async(token_counter, &preprocessed_messages);
+    let token_counts = get_messages_token_counts_async(token_counter, messages);
 
     // Tokenize and break messages into chunks.
     let mut current_chunk: Vec<Message> = Vec::new();
     let mut current_chunk_tokens = 0;
 
-    for (message, message_tokens) in preprocessed_messages.iter().zip(token_counts.iter()) {
+    for (message, message_tokens) in messages.iter().zip(token_counts.iter()) {
         if current_chunk_tokens + message_tokens > chunk_size - summary_prompt_tokens {
             // Summarize the current chunk with the accumulated summary.
             accumulated_summary =
@@ -314,12 +300,10 @@ pub async fn summarize_messages_async(
             summarize_combined_messages(&provider, &accumulated_summary, &current_chunk).await?;
     }
 
-    // Add back removed messages.
-    let final_summary = reintegrate_removed_messages(&accumulated_summary, &removed_messages);
-
+    // Return just the summary without any tool response preservation
     Ok((
-        final_summary.clone(),
-        get_messages_token_counts_async(token_counter, &final_summary),
+        accumulated_summary.clone(),
+        get_messages_token_counts_async(token_counter, &accumulated_summary),
     ))
 }
 
@@ -418,7 +402,7 @@ mod tests {
     async fn test_summarize_messages_single_chunk() {
         let provider = create_mock_provider();
         let token_counter = TokenCounter::new();
-        let context_limit = 100; // Set a high enough limit to avoid chunking.
+        let context_limit = 10_000; // Higher limit to avoid underflow
         let messages = create_test_messages();
 
         let result = summarize_messages(
@@ -454,7 +438,7 @@ mod tests {
     async fn test_summarize_messages_multiple_chunks() {
         let provider = create_mock_provider();
         let token_counter = TokenCounter::new();
-        let context_limit = 30;
+        let context_limit = 10_000; // Higher limit to avoid underflow
         let messages = create_test_messages();
 
         let result = summarize_messages(
@@ -490,7 +474,7 @@ mod tests {
     async fn test_summarize_messages_empty_input() {
         let provider = create_mock_provider();
         let token_counter = TokenCounter::new();
-        let context_limit = 100;
+        let context_limit = 10_000; // Higher limit to avoid underflow
         let messages: Vec<Message> = Vec::new();
 
         let result = summarize_messages(
@@ -616,7 +600,7 @@ mod tests {
     async fn test_summarize_messages_uses_chunked_for_large_context() {
         let provider = create_mock_provider();
         let token_counter = TokenCounter::new();
-        let context_limit = 100; // Small context limit but not too small to cause overflow
+        let context_limit = 10_000; // Higher limit to avoid underflow
         let messages = create_test_messages();
 
         let result = summarize_messages(
@@ -772,7 +756,7 @@ mod tests {
     async fn test_summarize_messages_chunked_direct_call() {
         let provider = create_mock_provider();
         let token_counter = TokenCounter::new();
-        let context_limit = 30; // Small to force chunking
+        let context_limit = 10_000; // Higher limit to avoid underflow
         let messages = create_test_messages();
 
         let result = summarize_messages_chunked(

From 0f3762393ab064fe99966fc8b79a8c3b7334176a Mon Sep 17 00:00:00 2001
From: David Katz <dkatz@squareup.com>
Date: Thu, 24 Jul 2025 13:23:14 -0400
Subject: [PATCH 15/41] rm tool call removal fns

---
 auto-compact-bench-config.json             |  26 ++++
 crates/goose/src/context_mgmt/summarize.rs | 146 +--------------------
 full-bench-config.json                     |  91 +++++++++++++
 3 files changed, 118 insertions(+), 145 deletions(-)
 create mode 100644 auto-compact-bench-config.json
 create mode 100644 full-bench-config.json

diff --git a/auto-compact-bench-config.json b/auto-compact-bench-config.json
new file mode 100644
index 000000000000..71059b9be5ec
--- /dev/null
+++ b/auto-compact-bench-config.json
@@ -0,0 +1,26 @@
+{
+  "models": [
+    {
+      "provider": "databricks",
+      "name": "goose",
+      "parallel_safe": false,
+      "tool_shim": {
+        "use_tool_shim": false,
+        "tool_shim_model": null
+      }
+    }
+  ],
+  "evals": [
+    {
+      "selector": "core:developer:simple_repo_clone_test",
+      "post_process_cmd": null,
+      "parallel_safe": false
+    }
+  ],
+  "include_dirs": [],
+  "repeat": 1,
+  "run_id": "auto-compact-test",
+  "eval_result_filename": "eval-results.json",
+  "run_summary_filename": "run-results-summary.json",
+  "env_file": null
+}
diff --git a/crates/goose/src/context_mgmt/summarize.rs b/crates/goose/src/context_mgmt/summarize.rs
index 27f74fbbbd6f..9fbaead35c69 100644
--- a/crates/goose/src/context_mgmt/summarize.rs
+++ b/crates/goose/src/context_mgmt/summarize.rs
@@ -50,63 +50,7 @@ async fn summarize_combined_messages(
     Ok(vec![response])
 }
 
-/// Preprocesses the messages to handle edge cases involving tool responses.
-///
-/// This function separates messages into two groups:
-/// 1. Messages to be summarized (`preprocessed_messages`)
-/// 2. Messages to be temporarily removed (`removed_messages`), which include:
-///    - The last tool response message.
-///    - The corresponding tool request message that immediately precedes the last tool response message (if present).
-///
-/// The function only considers the last tool response message and its pair for removal.
-fn preprocess_messages(messages: &[Message]) -> (Vec<Message>, Vec<Message>) {
-    let mut preprocessed_messages = messages.to_owned();
-    let mut removed_messages = Vec::new();
-
-    if let Some((last_index, last_message)) = messages.iter().enumerate().rev().find(|(_, m)| {
-        m.content
-            .iter()
-            .any(|c| matches!(c, MessageContent::ToolResponse(_)))
-    }) {
-        // Check for the corresponding tool request message
-        if last_index > 0 {
-            if let Some(previous_message) = messages.get(last_index - 1) {
-                if previous_message
-                    .content
-                    .iter()
-                    .any(|c| matches!(c, MessageContent::ToolRequest(_)))
-                {
-                    // Add the tool request message to removed_messages
-                    removed_messages.push(previous_message.clone());
-                }
-            }
-        }
-        // Add the last tool response message to removed_messages
-        removed_messages.push(last_message.clone());
-
-        // Calculate the correct start index for removal
-        let start_index = last_index + 1 - removed_messages.len();
-
-        // Remove the tool response and its paired tool request from preprocessed_messages
-        preprocessed_messages.drain(start_index..=last_index);
-    }
-
-    (preprocessed_messages, removed_messages)
-}
 
-/// Reinserts removed messages into the summarized output.
-///
-/// This function appends messages that were temporarily removed during preprocessing
-/// back into the summarized message list. This ensures that important context,
-/// such as tool responses, is not lost.
-fn reintegrate_removed_messages(
-    summarized_messages: &[Message],
-    removed_messages: &[Message],
-) -> Vec<Message> {
-    let mut final_messages = summarized_messages.to_owned();
-    final_messages.extend_from_slice(removed_messages);
-    final_messages
-}
 
 // Summarization steps:
 //    Using a single tailored prompt, summarize the entire conversation history.
@@ -316,10 +260,8 @@ mod tests {
     use crate::providers::errors::ProviderError;
     use chrono::Utc;
     use mcp_core::tool::Tool;
-    use mcp_core::ToolCall;
     use rmcp::model::Role;
-    use rmcp::model::{AnnotateAble, Content, RawTextContent};
-    use serde_json::json;
+    use rmcp::model::{AnnotateAble, RawTextContent};
     use std::sync::Arc;
 
     #[derive(Clone)]
@@ -379,24 +321,7 @@ mod tests {
         Message::new(role, 0, vec![MessageContent::text(text.to_string())])
     }
 
-    fn set_up_tool_request_message(id: &str, tool_call: ToolCall) -> Message {
-        Message::new(
-            Role::Assistant,
-            0,
-            vec![MessageContent::tool_request(id.to_string(), Ok(tool_call))],
-        )
-    }
 
-    fn set_up_tool_response_message(id: &str, tool_response: Vec<Content>) -> Message {
-        Message::new(
-            Role::User,
-            0,
-            vec![MessageContent::tool_response(
-                id.to_string(),
-                Ok(tool_response),
-            )],
-        )
-    }
 
     #[tokio::test]
     async fn test_summarize_messages_single_chunk() {
@@ -499,76 +424,7 @@ mod tests {
         );
     }
 
-    #[tokio::test]
-    async fn test_preprocess_messages_without_tool_response() {
-        let messages = create_test_messages();
-        let (preprocessed_messages, removed_messages) = preprocess_messages(&messages);
-
-        assert_eq!(
-            preprocessed_messages.len(),
-            3,
-            "Only the user message should remain after preprocessing."
-        );
-        assert_eq!(
-            removed_messages.len(),
-            0,
-            "The tool request and tool response messages should be removed."
-        );
-    }
-
-    #[tokio::test]
-    async fn test_preprocess_messages_with_tool_response() {
-        let arguments = json!({
-            "param1": "value1"
-        });
-        let messages = vec![
-            set_up_text_message("Message 1", Role::User),
-            set_up_tool_request_message("id", ToolCall::new("tool_name", json!(arguments))),
-            set_up_tool_response_message("id", vec![Content::text("tool done")]),
-        ];
-
-        let (preprocessed_messages, removed_messages) = preprocess_messages(&messages);
 
-        assert_eq!(
-            preprocessed_messages.len(),
-            1,
-            "Only the user message should remain after preprocessing."
-        );
-        assert_eq!(
-            removed_messages.len(),
-            2,
-            "The tool request and tool response messages should be removed."
-        );
-    }
-
-    #[tokio::test]
-    async fn test_reintegrate_removed_messages() {
-        let summarized_messages = vec![Message::new(
-            Role::Assistant,
-            Utc::now().timestamp(),
-            vec![MessageContent::Text(
-                RawTextContent {
-                    text: "Summary".to_string(),
-                }
-                .no_annotation(),
-            )],
-        )];
-        let arguments = json!({
-            "param1": "value1"
-        });
-        let removed_messages = vec![
-            set_up_tool_request_message("id", ToolCall::new("tool_name", json!(arguments))),
-            set_up_tool_response_message("id", vec![Content::text("tool done")]),
-        ];
-
-        let final_messages = reintegrate_removed_messages(&summarized_messages, &removed_messages);
-
-        assert_eq!(
-            final_messages.len(),
-            3,
-            "The final message list should include the summary and removed messages."
-        );
-    }
 
     #[tokio::test]
     async fn test_summarize_messages_uses_oneshot_for_small_context() {
diff --git a/full-bench-config.json b/full-bench-config.json
new file mode 100644
index 000000000000..f46962688692
--- /dev/null
+++ b/full-bench-config.json
@@ -0,0 +1,91 @@
+{
+  "models": [
+    {
+      "provider": "databricks",
+      "name": "goose",
+      "parallel_safe": false,
+      "tool_shim": {
+        "use_tool_shim": false,
+        "tool_shim_model": null
+      }
+    }
+  ],
+  "evals": [
+    {
+      "selector": "core:developer:simple_repo_clone_test",
+      "post_process_cmd": null,
+      "parallel_safe": false
+    },
+    {
+      "selector": "core:developer:create_file",
+      "post_process_cmd": null,
+      "parallel_safe": false
+    },
+    {
+      "selector": "core:developer:list_files",
+      "post_process_cmd": null,
+      "parallel_safe": false
+    },
+    {
+      "selector": "core:computercontroller:script",
+      "post_process_cmd": null,
+      "parallel_safe": false
+    },
+    {
+      "selector": "core:computercontroller:web_scrape",
+      "post_process_cmd": null,
+      "parallel_safe": false
+    },
+    {
+      "selector": "core:developer_image:image",
+      "post_process_cmd": null,
+      "parallel_safe": false
+    },
+    {
+      "selector": "core:developer_search_replace:search_replace",
+      "post_process_cmd": null,
+      "parallel_safe": false
+    },
+    {
+      "selector": "core:example",
+      "post_process_cmd": null,
+      "parallel_safe": false
+    },
+    {
+      "selector": "core:memory:save_fact",
+      "post_process_cmd": null,
+      "parallel_safe": false
+    },
+    {
+      "selector": "vibes:blog_summary",
+      "post_process_cmd": null,
+      "parallel_safe": false
+    },
+    {
+      "selector": "vibes:flappy_bird",
+      "post_process_cmd": null,
+      "parallel_safe": false
+    },
+    {
+      "selector": "vibes:goose_wiki",
+      "post_process_cmd": null,
+      "parallel_safe": false
+    },
+    {
+      "selector": "vibes:restaurant_research",
+      "post_process_cmd": null,
+      "parallel_safe": false
+    },
+    {
+      "selector": "vibes:squirrel_census",
+      "post_process_cmd": null,
+      "parallel_safe": false
+    }
+  ],
+  "include_dirs": [],
+  "repeat": 1,
+  "run_id": "autocompact-full-bench",
+  "eval_result_filename": "eval-results.json",
+  "run_summary_filename": "run-results-summary.json",
+  "env_file": null
+}

From ef432b5d9d16c87dee7f2dd16ead59100f0398c8 Mon Sep 17 00:00:00 2001
From: David Katz <dkatz@squareup.com>
Date: Thu, 24 Jul 2025 13:24:51 -0400
Subject: [PATCH 16/41] one more unused symbol

---
 crates/goose/src/context_mgmt/summarize.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crates/goose/src/context_mgmt/summarize.rs b/crates/goose/src/context_mgmt/summarize.rs
index 9fbaead35c69..378ab4e50d74 100644
--- a/crates/goose/src/context_mgmt/summarize.rs
+++ b/crates/goose/src/context_mgmt/summarize.rs
@@ -1,5 +1,5 @@
 use super::common::{get_messages_token_counts, get_messages_token_counts_async};
-use crate::message::{Message, MessageContent};
+use crate::message::{Message};
 use crate::prompt_template::render_global_file;
 use crate::providers::base::Provider;
 use crate::token_counter::{AsyncTokenCounter, TokenCounter};

From 624e3ebef489cc36958f090e4847d09be4a7297e Mon Sep 17 00:00:00 2001
From: David Katz <dkatz@squareup.com>
Date: Thu, 24 Jul 2025 13:27:47 -0400
Subject: [PATCH 17/41] fmt

---
 crates/goose/src/context_mgmt/summarize.rs | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/crates/goose/src/context_mgmt/summarize.rs b/crates/goose/src/context_mgmt/summarize.rs
index 378ab4e50d74..3484f711b63e 100644
--- a/crates/goose/src/context_mgmt/summarize.rs
+++ b/crates/goose/src/context_mgmt/summarize.rs
@@ -1,5 +1,5 @@
 use super::common::{get_messages_token_counts, get_messages_token_counts_async};
-use crate::message::{Message};
+use crate::message::Message;
 use crate::prompt_template::render_global_file;
 use crate::providers::base::Provider;
 use crate::token_counter::{AsyncTokenCounter, TokenCounter};
@@ -50,8 +50,6 @@ async fn summarize_combined_messages(
     Ok(vec![response])
 }
 
-
-
 // Summarization steps:
 //    Using a single tailored prompt, summarize the entire conversation history.
 pub async fn summarize_messages_oneshot(
@@ -321,8 +319,6 @@ mod tests {
         Message::new(role, 0, vec![MessageContent::text(text.to_string())])
     }
 
-
-
     #[tokio::test]
     async fn test_summarize_messages_single_chunk() {
         let provider = create_mock_provider();
@@ -424,8 +420,6 @@ mod tests {
         );
     }
 
-
-
     #[tokio::test]
     async fn test_summarize_messages_uses_oneshot_for_small_context() {
         let provider = create_mock_provider();

From caf455702134a5d728306e040cde6f73adae666c Mon Sep 17 00:00:00 2001
From: David Katz <dkatz@squareup.com>
Date: Thu, 24 Jul 2025 14:51:11 -0400
Subject: [PATCH 18/41] split compaction into check fn

---
 crates/goose/src/context_mgmt/auto_compact.rs | 251 +++++++++++++++---
 crates/goose/src/context_mgmt/common.rs       |  10 +-
 full-bench-config.json                        |  91 -------
 3 files changed, 218 insertions(+), 134 deletions(-)
 delete mode 100644 full-bench-config.json

diff --git a/crates/goose/src/context_mgmt/auto_compact.rs b/crates/goose/src/context_mgmt/auto_compact.rs
index 6c45aa9ac1c4..0ac7b3551f37 100644
--- a/crates/goose/src/context_mgmt/auto_compact.rs
+++ b/crates/goose/src/context_mgmt/auto_compact.rs
@@ -21,10 +21,27 @@ pub struct AutoCompactResult {
     pub tokens_after: Option<usize>,
 }
 
-/// Check if messages need compaction and compact them if necessary
+/// Snapshot of context usage returned by `check_compaction_needed`
+#[derive(Debug)]
+pub struct CompactionCheckResult {
+    /// Whether usage exceeds the threshold (never true for thresholds <= 0.0 or >= 1.0)
+    pub needs_compaction: bool,
+    /// Sum of the per-message token counts
+    pub current_tokens: usize,
+    /// Target context limit estimated for the agent's provider
+    pub context_limit: usize,
+    /// `current_tokens / context_limit`; not clamped, so it can exceed 1.0
+    pub usage_ratio: f64,
+    /// Tokens left before reaching the threshold token count (saturates at 0)
+    pub remaining_tokens: usize,
+    /// Percentage points of the context limit left before the threshold (0.0 once reached)
+    pub percentage_until_compaction: f64,
+}
+
+/// Check if messages need compaction without performing the compaction
 ///
-/// This function checks the current token usage against a configurable threshold
-/// and automatically compacts the messages using the summarization algorithm if needed.
+/// This function analyzes the current token usage and returns detailed information
+/// about whether compaction is needed and how close we are to the threshold.
 ///
 /// # Arguments
 /// * `agent` - The agent to use for context management
@@ -32,12 +49,12 @@ pub struct AutoCompactResult {
 /// * `threshold_override` - Optional threshold override (defaults to GOOSE_AUTO_COMPACT_THRESHOLD config)
 ///
 /// # Returns
-/// * `AutoCompactResult` containing the potentially compacted messages and metadata
-pub async fn check_and_compact_messages(
+/// * `CompactionCheckResult` containing detailed information about compaction needs
+pub async fn check_compaction_needed(
     agent: &Agent,
     messages: &[Message],
     threshold_override: Option<f64>,
-) -> Result<AutoCompactResult> {
+) -> Result<CompactionCheckResult> {
     // Get threshold from config or use override
     let config = Config::global();
     let threshold = threshold_override.unwrap_or_else(|| {
@@ -46,17 +63,6 @@ pub async fn check_and_compact_messages(
             .unwrap_or(0.3) // Default to 30%
     });
 
-    // Check if auto-compaction is disabled
-    if threshold <= 0.0 || threshold >= 1.0 {
-        debug!("Auto-compaction disabled (threshold: {})", threshold);
-        return Ok(AutoCompactResult {
-            compacted: false,
-            messages: messages.to_vec(),
-            tokens_before: None,
-            tokens_after: None,
-        });
-    }
-
     // Get provider and token counter
     let provider = agent.provider().await?;
     let token_counter = create_async_token_counter()
@@ -65,25 +71,117 @@ pub async fn check_and_compact_messages(
 
     // Calculate current token usage
     let token_counts = get_messages_token_counts_async(&token_counter, messages);
-    let total_tokens: usize = token_counts.iter().sum();
+    let current_tokens: usize = token_counts.iter().sum();
     let context_limit = estimate_target_context_limit(provider);
 
     // Calculate usage ratio
-    let usage_ratio = total_tokens as f64 / context_limit as f64;
+    let usage_ratio = current_tokens as f64 / context_limit as f64;
+
+    // Calculate threshold token count and remaining tokens
+    let threshold_tokens = (context_limit as f64 * threshold) as usize;
+    let remaining_tokens = threshold_tokens.saturating_sub(current_tokens);
+
+    // Calculate percentage until compaction (how much more we can use before hitting threshold)
+    let percentage_until_compaction = if usage_ratio < threshold {
+        (threshold - usage_ratio) * 100.0
+    } else {
+        0.0
+    };
+
+    // Check if compaction is needed (disabled if threshold is invalid)
+    let needs_compaction = if threshold <= 0.0 || threshold >= 1.0 {
+        false
+    } else {
+        usage_ratio > threshold
+    };
 
     debug!(
-        "Context usage: {} / {} ({:.1}%)",
-        total_tokens,
+        "Compaction check: {} / {} tokens ({:.1}%), threshold: {:.1}%, needs compaction: {}",
+        current_tokens,
         context_limit,
-        usage_ratio * 100.0
+        usage_ratio * 100.0,
+        threshold * 100.0,
+        needs_compaction
     );
 
-    // Check if compaction is needed
-    if usage_ratio <= threshold {
+    Ok(CompactionCheckResult {
+        needs_compaction,
+        current_tokens,
+        context_limit,
+        usage_ratio,
+        remaining_tokens,
+        percentage_until_compaction,
+    })
+}
+
+/// Perform compaction on messages
+///
+/// Summarizes the history with `agent.summarize_context` unconditionally: no
+/// threshold is checked here, so callers should first confirm the need via
+/// `check_compaction_needed`. Token totals are logged via `info!`.
+///
+/// # Arguments
+/// * `agent` - The agent to use for context management
+/// * `messages` - The current message history to compact
+///
+/// # Returns
+/// * Tuple of (compacted_messages, tokens_before, tokens_after)
+pub async fn perform_compaction(
+    agent: &Agent,
+    messages: &[Message],
+) -> Result<(Vec<Message>, usize, usize)> {
+    // Token counter is only used for the pre-compaction total
+    let token_counter = create_async_token_counter()
+        .await
+        .map_err(|e| anyhow::anyhow!("Failed to create token counter: {}", e))?;
+
+    // Tokens before compaction; if this is 0 the reduction logged below is not a finite number
+    let token_counts_before = get_messages_token_counts_async(&token_counter, messages);
+    let tokens_before: usize = token_counts_before.iter().sum();
+
+    info!("Performing compaction on {} tokens", tokens_before);
+
+    // Summarize; tokens_after is the sum of the counts returned with the compacted messages
+    let (compacted_messages, compacted_token_counts) = agent.summarize_context(messages).await?;
+    let tokens_after: usize = compacted_token_counts.iter().sum();
+
+    info!(
+        "Compaction complete: {} tokens -> {} tokens ({:.1}% reduction)",
+        tokens_before,
+        tokens_after,
+        (1.0 - (tokens_after as f64 / tokens_before as f64)) * 100.0
+    );
+
+    Ok((compacted_messages, tokens_before, tokens_after))
+}
+
+/// Check if messages need compaction and compact them if necessary
+///
+/// This is a convenience wrapper function that combines checking and compaction.
+/// It uses the separate `check_compaction_needed` and `perform_compaction` functions
+/// internally to provide the same interface as before while allowing for better
+/// separation of concerns.
+///
+/// # Arguments
+/// * `agent` - The agent to use for context management
+/// * `messages` - The current message history
+/// * `threshold_override` - Optional threshold override (defaults to GOOSE_AUTO_COMPACT_THRESHOLD config)
+///
+/// # Returns
+/// * `AutoCompactResult` containing the potentially compacted messages and metadata
+pub async fn check_and_compact_messages(
+    agent: &Agent,
+    messages: &[Message],
+    threshold_override: Option<f64>,
+) -> Result<AutoCompactResult> {
+    // First check if compaction is needed
+    let check_result = check_compaction_needed(agent, messages, threshold_override).await?;
+
+    // If no compaction is needed, return early
+    if !check_result.needs_compaction {
         debug!(
-            "No compaction needed (usage: {:.1}% <= threshold: {:.1}%)",
-            usage_ratio * 100.0,
-            threshold * 100.0
+            "No compaction needed (usage: {:.1}% <= threshold)",
+            check_result.usage_ratio * 100.0
         );
         return Ok(AutoCompactResult {
             compacted: false,
@@ -94,26 +192,18 @@ pub async fn check_and_compact_messages(
     }
 
     info!(
-        "Auto-compacting messages (usage: {:.1}% > threshold: {:.1}%)",
-        usage_ratio * 100.0,
-        threshold * 100.0
+        "Auto-compacting messages (usage: {:.1}%)",
+        check_result.usage_ratio * 100.0
     );
 
-    // Perform compaction
-    let (compacted_messages, compacted_token_counts) = agent.summarize_context(messages).await?;
-    let tokens_after: usize = compacted_token_counts.iter().sum();
-
-    info!(
-        "Compaction complete: {} tokens -> {} tokens ({:.1}% reduction)",
-        total_tokens,
-        tokens_after,
-        (1.0 - (tokens_after as f64 / total_tokens as f64)) * 100.0
-    );
+    // Perform the compaction
+    let (compacted_messages, tokens_before, tokens_after) =
+        perform_compaction(agent, messages).await?;
 
     Ok(AutoCompactResult {
         compacted: true,
         messages: compacted_messages,
-        tokens_before: Some(total_tokens),
+        tokens_before: Some(tokens_before),
         tokens_after: Some(tokens_after),
     })
 }
@@ -179,6 +269,85 @@ mod tests {
         )
     }
 
+    #[tokio::test]
+    async fn test_check_compaction_needed() {
+        let mock_provider = Arc::new(MockProvider {
+            model_config: ModelConfig::new("test-model".to_string())
+                .with_context_limit(100_000.into()),
+        });
+
+        let agent = Agent::new();
+        let _ = agent.update_provider(mock_provider).await;
+
+        // Two short messages, expected to stay far below the 0.3 threshold used below
+        let messages = vec![create_test_message("Hello"), create_test_message("World")];
+
+        let result = check_compaction_needed(&agent, &messages, Some(0.3))
+            .await
+            .unwrap();
+
+        assert!(!result.needs_compaction);
+        assert!(result.current_tokens > 0);
+        assert!(result.context_limit > 0);
+        assert!(result.usage_ratio < 0.3);
+        assert!(result.remaining_tokens > 0);
+        assert!(result.percentage_until_compaction > 0.0);
+    }
+
+    #[tokio::test]
+    async fn test_check_compaction_needed_disabled() {
+        let mock_provider = Arc::new(MockProvider {
+            model_config: ModelConfig::new("test-model".to_string())
+                .with_context_limit(100_000.into()),
+        });
+
+        let agent = Agent::new();
+        let _ = agent.update_provider(mock_provider).await;
+
+        let messages = vec![create_test_message("Hello")];
+
+        // A threshold of 0.0 is treated as "disabled": needs_compaction must be false
+        let result = check_compaction_needed(&agent, &messages, Some(0.0))
+            .await
+            .unwrap();
+
+        assert!(!result.needs_compaction);
+
+        // A threshold of 1.0 is treated as "disabled" as well
+        let result = check_compaction_needed(&agent, &messages, Some(1.0))
+            .await
+            .unwrap();
+
+        assert!(!result.needs_compaction);
+    }
+
+    #[tokio::test]
+    async fn test_perform_compaction() {
+        let mock_provider = Arc::new(MockProvider {
+            model_config: ModelConfig::new("test-model".to_string())
+                .with_context_limit(50_000.into()),
+        });
+
+        let agent = Agent::new();
+        let _ = agent.update_provider(mock_provider).await;
+
+        // Three short messages are enough to exercise the compaction path
+        let messages = vec![
+            create_test_message("First message"),
+            create_test_message("Second message"),
+            create_test_message("Third message"),
+        ];
+
+        let (compacted_messages, tokens_before, tokens_after) =
+            perform_compaction(&agent, &messages).await.unwrap();
+
+        assert!(tokens_before > 0);
+        assert!(tokens_after > 0);
+        // Note: no `tokens_after < tokens_before` assertion, because the mock provider returns a
+        // fixed summary that might not be smaller; here we only verify that compaction runs
+        assert!(!compacted_messages.is_empty());
+    }
+
     #[tokio::test]
     async fn test_auto_compact_disabled() {
         let mock_provider = Arc::new(MockProvider {
diff --git a/crates/goose/src/context_mgmt/common.rs b/crates/goose/src/context_mgmt/common.rs
index cd12e09f96f9..9529a1ba06d3 100644
--- a/crates/goose/src/context_mgmt/common.rs
+++ b/crates/goose/src/context_mgmt/common.rs
@@ -19,8 +19,14 @@ pub fn estimate_target_context_limit(provider: Arc<dyn Provider>) -> usize {
     // Our token count is an estimate since model providers often don't provide the tokenizer (eg. Claude)
     let target_limit = (model_context_limit as f32 * ESTIMATE_FACTOR) as usize;
 
-    // subtract out overhead for system prompt and tools
-    target_limit - (SYSTEM_PROMPT_TOKEN_OVERHEAD + TOOLS_TOKEN_OVERHEAD)
+    // subtract out overhead for system prompt and tools, but ensure we don't go negative
+    let overhead = SYSTEM_PROMPT_TOKEN_OVERHEAD + TOOLS_TOKEN_OVERHEAD;
+    if target_limit > overhead {
+        target_limit - overhead
+    } else {
+        // If overhead is larger than target limit, return a minimal usable limit
+        std::cmp::max(target_limit / 2, 1000)
+    }
 }
 
 pub fn get_messages_token_counts(token_counter: &TokenCounter, messages: &[Message]) -> Vec<usize> {
diff --git a/full-bench-config.json b/full-bench-config.json
deleted file mode 100644
index f46962688692..000000000000
--- a/full-bench-config.json
+++ /dev/null
@@ -1,91 +0,0 @@
-{
-  "models": [
-    {
-      "provider": "databricks",
-      "name": "goose",
-      "parallel_safe": false,
-      "tool_shim": {
-        "use_tool_shim": false,
-        "tool_shim_model": null
-      }
-    }
-  ],
-  "evals": [
-    {
-      "selector": "core:developer:simple_repo_clone_test",
-      "post_process_cmd": null,
-      "parallel_safe": false
-    },
-    {
-      "selector": "core:developer:create_file",
-      "post_process_cmd": null,
-      "parallel_safe": false
-    },
-    {
-      "selector": "core:developer:list_files",
-      "post_process_cmd": null,
-      "parallel_safe": false
-    },
-    {
-      "selector": "core:computercontroller:script",
-      "post_process_cmd": null,
-      "parallel_safe": false
-    },
-    {
-      "selector": "core:computercontroller:web_scrape",
-      "post_process_cmd": null,
-      "parallel_safe": false
-    },
-    {
-      "selector": "core:developer_image:image",
-      "post_process_cmd": null,
-      "parallel_safe": false
-    },
-    {
-      "selector": "core:developer_search_replace:search_replace",
-      "post_process_cmd": null,
-      "parallel_safe": false
-    },
-    {
-      "selector": "core:example",
-      "post_process_cmd": null,
-      "parallel_safe": false
-    },
-    {
-      "selector": "core:memory:save_fact",
-      "post_process_cmd": null,
-      "parallel_safe": false
-    },
-    {
-      "selector": "vibes:blog_summary",
-      "post_process_cmd": null,
-      "parallel_safe": false
-    },
-    {
-      "selector": "vibes:flappy_bird",
-      "post_process_cmd": null,
-      "parallel_safe": false
-    },
-    {
-      "selector": "vibes:goose_wiki",
-      "post_process_cmd": null,
-      "parallel_safe": false
-    },
-    {
-      "selector": "vibes:restaurant_research",
-      "post_process_cmd": null,
-      "parallel_safe": false
-    },
-    {
-      "selector": "vibes:squirrel_census",
-      "post_process_cmd": null,
-      "parallel_safe": false
-    }
-  ],
-  "include_dirs": [],
-  "repeat": 1,
-  "run_id": "autocompact-full-bench",
-  "eval_result_filename": "eval-results.json",
-  "run_summary_filename": "run-results-summary.json",
-  "env_file": null
-}

From 1c27b65a439a5ecf028c22cf16abd572fc0587f3 Mon Sep 17 00:00:00 2001
From: David Katz <dkatz@squareup.com>
Date: Thu, 24 Jul 2025 16:46:49 -0400
Subject: [PATCH 19/41] refactor into agent reply

---
 crates/goose-cli/src/commands/web.rs    | 49 +++---------------------
 crates/goose-cli/src/session/mod.rs     | 37 +-----------------
 crates/goose-server/src/routes/reply.rs | 33 +---------------
 crates/goose/src/agents/agent.rs        | 51 ++++++++++++++++++++++++-
 crates/goose/src/session/info.rs        |  1 -
 5 files changed, 59 insertions(+), 112 deletions(-)

diff --git a/crates/goose-cli/src/commands/web.rs b/crates/goose-cli/src/commands/web.rs
index bac587e16b76..716940e0b1bb 100644
--- a/crates/goose-cli/src/commands/web.rs
+++ b/crates/goose-cli/src/commands/web.rs
@@ -10,7 +10,6 @@ use axum::{
 };
 use futures::{sink::SinkExt, stream::StreamExt};
 use goose::agents::{Agent, AgentEvent};
-use goose::context_mgmt::auto_compact::check_and_compact_messages;
 use goose::message::Message as GooseMessage;
 use goose::session;
 use serde::{Deserialize, Serialize};
@@ -449,52 +448,13 @@ async fn process_message_streaming(
     // Create a user message
     let user_message = GooseMessage::user().with_text(content.clone());
 
-    // Get existing messages from session and add the new user message
-    let mut messages = {
+    // Messages will be auto-compacted in agent.reply() if needed
+    let messages = {
         let mut session_msgs = session_messages.lock().await;
         session_msgs.push(user_message.clone());
         session_msgs.clone()
     };
 
-    // Check and compact messages if needed before calling reply
-    let compact_result = check_and_compact_messages(agent, &messages, None).await?;
-    if compact_result.compacted {
-        messages = compact_result.messages.clone();
-
-        // Update session messages
-        {
-            let mut session_msgs = session_messages.lock().await;
-            *session_msgs = compact_result.messages;
-        }
-
-        // Notify client of compaction
-        let msg = if let (Some(before), Some(after)) =
-            (compact_result.tokens_before, compact_result.tokens_after)
-        {
-            format!(
-                "Auto-compacted context: {} → {} tokens ({:.0}% reduction)",
-                before,
-                after,
-                (1.0 - (after as f64 / before as f64)) * 100.0
-            )
-        } else {
-            "Auto-compacted context to prevent overflow".to_string()
-        };
-
-        let mut sender_lock = sender.lock().await;
-        let _ = sender_lock
-            .send(Message::Text(
-                serde_json::to_string(&WebSocketMessage::Response {
-                    content: msg,
-                    role: "system".to_string(),
-                    timestamp: chrono::Utc::now().timestamp_millis(),
-                })
-                .unwrap()
-                .into(),
-            ))
-            .await;
-    }
-
     // Persist messages to JSONL file with provider for automatic description generation
     let provider = agent.provider().await;
     if provider.is_err() {
@@ -658,7 +618,10 @@ async fn process_message_streaming(
                                     // TODO: Implement proper UI for context handling
                                     let (summarized_messages, _) =
                                         agent.summarize_context(&messages).await?;
-                                    messages = summarized_messages;
+                                    {
+                                        let mut session_msgs = session_messages.lock().await;
+                                        *session_msgs = summarized_messages;
+                                    }
                                 }
                                 _ => {
                                     // Handle other message types as needed
diff --git a/crates/goose-cli/src/session/mod.rs b/crates/goose-cli/src/session/mod.rs
index fb6f8a318290..86113ce26d93 100644
--- a/crates/goose-cli/src/session/mod.rs
+++ b/crates/goose-cli/src/session/mod.rs
@@ -16,7 +16,6 @@ pub use self::export::message_to_markdown;
 pub use builder::{build_session, SessionBuilderConfig, SessionSettings};
 use console::Color;
 use goose::agents::AgentEvent;
-use goose::context_mgmt::auto_compact::check_and_compact_messages;
 use goose::message::push_message;
 use goose::permission::permission_confirmation::PrincipalType;
 use goose::permission::Permission;
@@ -841,41 +840,7 @@ impl Session {
     }
 
     async fn process_agent_response(&mut self, interactive: bool) -> Result<()> {
-        // Check and compact messages if needed before calling reply
-        let compact_result = check_and_compact_messages(&self.agent, &self.messages, None).await?;
-        if compact_result.compacted {
-            self.messages = compact_result.messages;
-
-            // Notify user of compaction
-            let msg = if let (Some(before), Some(after)) =
-                (compact_result.tokens_before, compact_result.tokens_after)
-            {
-                format!(
-                    "Auto-compacted context: {} → {} tokens ({:.0}% reduction)",
-                    before,
-                    after,
-                    (1.0 - (after as f64 / before as f64)) * 100.0
-                )
-            } else {
-                "Auto-compacted context to prevent overflow".to_string()
-            };
-            output::render_text(&msg, Some(Color::Yellow), true);
-
-            // Persist the compacted messages
-            if let Some(session_file) = &self.session_file {
-                let working_dir = std::env::current_dir().ok();
-                let provider = self.agent.provider().await.ok();
-                session::persist_messages_with_schedule_id(
-                    session_file,
-                    &self.messages,
-                    provider,
-                    self.scheduled_job_id.clone(),
-                    working_dir,
-                )
-                .await?;
-            }
-        }
-
+        // Messages will be auto-compacted in agent.reply() if needed
         let cancel_token = CancellationToken::new();
         let cancel_token_clone = cancel_token.clone();
 
diff --git a/crates/goose-server/src/routes/reply.rs b/crates/goose-server/src/routes/reply.rs
index 63140ac5656e..55a6c9b66acf 100644
--- a/crates/goose-server/src/routes/reply.rs
+++ b/crates/goose-server/src/routes/reply.rs
@@ -11,7 +11,6 @@ use bytes::Bytes;
 use futures::{stream::StreamExt, Stream};
 use goose::{
     agents::{AgentEvent, SessionConfig},
-    context_mgmt::auto_compact::check_and_compact_messages,
     message::{push_message, Message},
     permission::permission_confirmation::PrincipalType,
 };
@@ -160,36 +159,8 @@ async fn reply_handler(
             retry_config: None,
         };
 
-        // Check and compact messages if needed before calling reply
-        let mut messages_to_process = messages.clone();
-        let compact_result = check_and_compact_messages(&agent, &messages_to_process, None).await;
-        if let Ok(result) = compact_result {
-            if result.compacted {
-                messages_to_process = result.messages;
-
-                // Notify client of compaction
-                let msg = if let (Some(before), Some(after)) =
-                    (result.tokens_before, result.tokens_after)
-                {
-                    format!(
-                        "Auto-compacted context: {} → {} tokens ({:.0}% reduction)",
-                        before,
-                        after,
-                        (1.0 - (after as f64 / before as f64)) * 100.0
-                    )
-                } else {
-                    "Auto-compacted context to prevent overflow".to_string()
-                };
-
-                let _ = stream_event(
-                    MessageEvent::Message {
-                        message: Message::assistant().with_text(&msg),
-                    },
-                    &task_tx,
-                )
-                .await;
-            }
-        }
+        // Messages will be auto-compacted in agent.reply() if needed
+        let messages_to_process = messages.clone();
 
         let mut stream = match agent
             .reply(
diff --git a/crates/goose/src/agents/agent.rs b/crates/goose/src/agents/agent.rs
index 0742ba33fe1d..562015b1e982 100644
--- a/crates/goose/src/agents/agent.rs
+++ b/crates/goose/src/agents/agent.rs
@@ -34,6 +34,7 @@ use crate::agents::tool_vectordb::generate_table_id;
 use crate::agents::types::SessionConfig;
 use crate::agents::types::{FrontendTool, ToolResultReceiver};
 use crate::config::{Config, ExtensionConfigManager, PermissionManager};
+use crate::context_mgmt::auto_compact;
 use crate::message::{push_message, Message};
 use crate::permission::permission_judge::check_tool_permissions;
 use crate::permission::PermissionConfirmation;
@@ -719,11 +720,59 @@ impl Agent {
         messages: &[Message],
         session: Option<SessionConfig>,
         cancel_token: Option<CancellationToken>,
+    ) -> Result<BoxStream<'_, Result<AgentEvent>>> {
+        let mut messages = messages.to_vec();
+        let _initial_messages = messages.clone();
+        let _reply_span = tracing::Span::current();
+        self.reset_retry_attempts().await;
+        let _config = Config::global();
+
+        // Handle auto-compaction before processing
+        let compact_result =
+            auto_compact::check_and_compact_messages(self, &messages, None).await?;
+        if compact_result.compacted {
+            messages = compact_result.messages;
+
+            // Create compaction notification message
+            let compaction_msg = if let (Some(before), Some(after)) =
+                (compact_result.tokens_before, compact_result.tokens_after)
+            {
+                format!(
+                    "Auto-compacted context: {} → {} tokens ({:.0}% reduction)",
+                    before,
+                    after,
+                    (1.0 - (after as f64 / before as f64)) * 100.0
+                )
+            } else {
+                "Auto-compacted context to reduce token usage".to_string()
+            };
+
+            // Yield compaction notification as first event
+            return Ok(Box::pin(async_stream::try_stream! {
+                yield AgentEvent::Message(Message::assistant().with_text(compaction_msg));
+
+                // Continue with normal reply processing using compacted messages
+                let mut reply_stream = self.reply_internal(&messages, session, cancel_token).await?;
+                while let Some(event) = reply_stream.next().await {
+                    yield event?;
+                }
+            }));
+        }
+
+        // No compaction needed, proceed with normal processing
+        self.reply_internal(&messages, session, cancel_token).await
+    }
+
+    /// Internal reply method that handles the actual agent processing
+    async fn reply_internal(
+        &self,
+        messages: &[Message],
+        session: Option<SessionConfig>,
+        cancel_token: Option<CancellationToken>,
     ) -> Result<BoxStream<'_, Result<AgentEvent>>> {
         let mut messages = messages.to_vec();
         let initial_messages = messages.clone();
         let reply_span = tracing::Span::current();
-        self.reset_retry_attempts().await;
         let config = Config::global();
 
         let (mut tools, mut toolshim_tools, mut system_prompt) =
diff --git a/crates/goose/src/session/info.rs b/crates/goose/src/session/info.rs
index 6c60d3310dba..b5fc56bca149 100644
--- a/crates/goose/src/session/info.rs
+++ b/crates/goose/src/session/info.rs
@@ -100,7 +100,6 @@ pub fn get_valid_sorted_sessions(sort_order: SortOrder) -> Result<Vec<SessionInf
 
 #[cfg(test)]
 mod tests {
-    use super::*;
     use crate::session::SessionMetadata;
     use std::fs;
     use tempfile::tempdir;

From ad7ca3dfa94b74c35985be47131f7ae8f1182c36 Mon Sep 17 00:00:00 2001
From: David Katz <dkatz@squareup.com>
Date: Fri, 25 Jul 2025 00:01:39 -0400
Subject: [PATCH 20/41] add logging

---
 crates/goose/src/context_mgmt/auto_compact.rs | 21 +++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/crates/goose/src/context_mgmt/auto_compact.rs b/crates/goose/src/context_mgmt/auto_compact.rs
index 0ac7b3551f37..6dc48f867daa 100644
--- a/crates/goose/src/context_mgmt/auto_compact.rs
+++ b/crates/goose/src/context_mgmt/auto_compact.rs
@@ -104,6 +104,27 @@ pub async fn check_compaction_needed(
         needs_compaction
     );
 
+    // Add detailed logging to help debug the issue
+    eprintln!("DEBUG COMPACTION CHECK:");
+    eprintln!("  Messages count: {}", messages.len());
+    eprintln!("  Current tokens: {}", current_tokens);
+    eprintln!("  Context limit: {}", context_limit);
+    eprintln!("  Usage ratio: {:.1}%", usage_ratio * 100.0);
+    eprintln!("  Threshold: {:.1}%", threshold * 100.0);
+    eprintln!("  Needs compaction: {}", needs_compaction);
+    eprintln!("  Token counts per message: {:?}", token_counts);
+    
+    // Log first few characters of each message for debugging
+    // for (i, msg) in messages.iter().enumerate() {
+    //     let content_preview = msg.as_concat_text();
+    //     let preview = if content_preview.len() > 100 {
+    //         format!("{}...", &content_preview[..100])
+    //     } else {
+    //         content_preview
+    //     };
+    //     eprintln!("  Message {}: {} tokens, content: {:?}", i, token_counts[i], preview);
+    // }
+
     Ok(CompactionCheckResult {
         needs_compaction,
         current_tokens,

From 94dc7a0e2e6410f6697c483a0fd05f83a5d3ad23 Mon Sep 17 00:00:00 2001
From: David Katz <dkatz@squareup.com>
Date: Fri, 25 Jul 2025 01:20:05 -0400
Subject: [PATCH 21/41] fix token reduction

---
 crates/goose-cli/src/commands/web.rs          | 24 +++++++++++++++++++
 crates/goose-cli/src/session/mod.rs           | 19 +++++++++++++++
 crates/goose-ffi/src/lib.rs                   |  4 ++++
 crates/goose-server/src/routes/reply.rs       |  6 +++++
 crates/goose/src/agents/agent.rs              |  7 +++++-
 crates/goose/src/context_mgmt/auto_compact.rs | 14 +++++++----
 crates/goose/src/scheduler.rs                 |  4 ++++
 crates/goose/tests/agent.rs                   |  8 +++++++
 8 files changed, 80 insertions(+), 6 deletions(-)

diff --git a/crates/goose-cli/src/commands/web.rs b/crates/goose-cli/src/commands/web.rs
index 716940e0b1bb..5e85ec7956f6 100644
--- a/crates/goose-cli/src/commands/web.rs
+++ b/crates/goose-cli/src/commands/web.rs
@@ -629,6 +629,30 @@ async fn process_message_streaming(
                             }
                         }
                     }
+                    Ok(AgentEvent::HistoryReplaced(new_messages)) => {
+                        // Replace the session's message history with the compacted messages
+                        {
+                            let mut session_msgs = session_messages.lock().await;
+                            *session_msgs = new_messages;
+                        }
+
+                        // Persist the updated messages to the JSONL file
+                        let current_messages = {
+                            let session_msgs = session_messages.lock().await;
+                            session_msgs.clone()
+                        };
+
+                        if let Err(e) = session::persist_messages(
+                            &session_file,
+                            &current_messages,
+                            None, // No provider needed for persisting
+                            working_dir.clone(),
+                        )
+                        .await
+                        {
+                            error!("Failed to persist compacted messages: {}", e);
+                        }
+                    }
                     Ok(AgentEvent::McpNotification(_notification)) => {
                         // Handle MCP notifications if needed
                         // For now, we'll just log them
diff --git a/crates/goose-cli/src/session/mod.rs b/crates/goose-cli/src/session/mod.rs
index 86113ce26d93..752ec8b508af 100644
--- a/crates/goose-cli/src/session/mod.rs
+++ b/crates/goose-cli/src/session/mod.rs
@@ -1146,6 +1146,25 @@ impl Session {
                                 }
                             }
                         }
+                        Some(Ok(AgentEvent::HistoryReplaced(new_messages))) => {
+                            // Replace the session's message history with the compacted messages
+                            self.messages = new_messages;
+                            
+                            // Persist the updated messages to the session file
+                            if let Some(session_file) = &self.session_file {
+                                let provider = self.agent.provider().await.ok();
+                                let working_dir = std::env::current_dir().ok();
+                                if let Err(e) = session::persist_messages_with_schedule_id(
+                                    session_file,
+                                    &self.messages,
+                                    provider,
+                                    self.scheduled_job_id.clone(),
+                                    working_dir,
+                                ).await {
+                                    eprintln!("Failed to persist compacted messages: {}", e);
+                                }
+                            }
+                        }
                         Some(Ok(AgentEvent::ModelChange { model, mode })) => {
                             // Log model change if in debug mode
                             if self.debug {
diff --git a/crates/goose-ffi/src/lib.rs b/crates/goose-ffi/src/lib.rs
index 6bcf3abe1695..98ddf5382146 100644
--- a/crates/goose-ffi/src/lib.rs
+++ b/crates/goose-ffi/src/lib.rs
@@ -270,6 +270,10 @@ pub unsafe extern "C" fn goose_agent_send_message(
                     // Model change events are informational, just continue
                 }
 
+                Ok(AgentEvent::HistoryReplaced(_)) => {
+                    // Handle history replacement events if needed
+                    // For FFI, we don't need to do anything special
+                }
                 Err(e) => {
                     full_response.push_str(&format!("\nError in message stream: {}", e));
                 }
diff --git a/crates/goose-server/src/routes/reply.rs b/crates/goose-server/src/routes/reply.rs
index 55a6c9b66acf..db5184a7036b 100644
--- a/crates/goose-server/src/routes/reply.rs
+++ b/crates/goose-server/src/routes/reply.rs
@@ -222,6 +222,12 @@ async fn reply_handler(
                                             break;
                                         }
                                     }
+                                    Ok(Some(Ok(AgentEvent::HistoryReplaced(new_messages)))) => {
+                                        // Replace the message history with the compacted messages
+                                        all_messages = new_messages;
+                                        // Note: We don't send this as a stream event since it's an internal operation
+                                        // The client will see the compaction notification message that was sent before this event
+                                    }
                                     Ok(Some(Ok(AgentEvent::ModelChange { model, mode }))) => {
                                         if let Err(e) = stream_event(MessageEvent::ModelChange { model, mode }, &tx).await {
                                             tracing::error!("Error sending model change through channel: {}", e);
diff --git a/crates/goose/src/agents/agent.rs b/crates/goose/src/agents/agent.rs
index 562015b1e982..1d8d1fa584b6 100644
--- a/crates/goose/src/agents/agent.rs
+++ b/crates/goose/src/agents/agent.rs
@@ -86,6 +86,7 @@ pub enum AgentEvent {
     Message(Message),
     McpNotification((String, JsonRpcMessage)),
     ModelChange { model: String, mode: String },
+    HistoryReplaced(Vec<Message>),
 }
 
 impl Default for Agent {
@@ -747,9 +748,13 @@ impl Agent {
                 "Auto-compacted context to reduce token usage".to_string()
             };
 
-            // Yield compaction notification as first event
+            // Create a new AgentEvent type to signal message history replacement
             return Ok(Box::pin(async_stream::try_stream! {
                 yield AgentEvent::Message(Message::assistant().with_text(compaction_msg));
+                
+                // Yield a special event to indicate the session should replace its message history
+                // with the compacted messages
+                yield AgentEvent::HistoryReplaced(messages.clone());
 
                 // Continue with normal reply processing using compacted messages
                 let mut reply_stream = self.reply_internal(&messages, session, cancel_token).await?;
diff --git a/crates/goose/src/context_mgmt/auto_compact.rs b/crates/goose/src/context_mgmt/auto_compact.rs
index 6dc48f867daa..9ce3a7a60c6b 100644
--- a/crates/goose/src/context_mgmt/auto_compact.rs
+++ b/crates/goose/src/context_mgmt/auto_compact.rs
@@ -470,7 +470,7 @@ mod tests {
     async fn test_auto_compact_respects_config() {
         let mock_provider = Arc::new(MockProvider {
             model_config: ModelConfig::new("test-model".to_string())
-                .with_context_limit(50_000.into()), // Realistic context limit that won't underflow
+                .with_context_limit(30_000.into()), // Smaller context limit to make threshold easier to hit
         });
 
         let agent = Agent::new();
@@ -478,11 +478,15 @@ mod tests {
 
         // Create enough messages to trigger compaction with low threshold
         let mut messages = Vec::new();
-        // Need to create more messages since we have a 27k usable token limit
-        // 10% of 27k = 2.7k tokens
-        for i in 0..150 {
+        // With 30k context limit, after overhead we have ~27k usable tokens
+        // 10% of 27k = 2.7k tokens, so we need messages that exceed that
+        for i in 0..200 {
             messages.push(create_test_message(&format!(
-                "Message {} with enough content to ensure we exceed 10% of the context limit. Adding more content.",
+                "Message {} with enough content to ensure we exceed 10% of the context limit. \
+                 Adding more content to increase token count substantially. This message contains \
+                 multiple sentences to increase the token count. We need to ensure that our total \
+                 token usage exceeds 10% of the available context limit after accounting for \
+                 system prompt and tools overhead.",
                 i
             )));
         }
diff --git a/crates/goose/src/scheduler.rs b/crates/goose/src/scheduler.rs
index 50e06c24bbaa..dbdb146517de 100644
--- a/crates/goose/src/scheduler.rs
+++ b/crates/goose/src/scheduler.rs
@@ -1232,6 +1232,10 @@ async fn run_scheduled_job_internal(
                             // Model change events are informational, just continue
                         }
 
+                        Ok(AgentEvent::HistoryReplaced(_)) => {
+                            // Handle history replacement events if needed
+                            // For scheduled jobs, we don't need to do anything special
+                        }
                         Err(e) => {
                             tracing::error!(
                                 "[Job {}] Error receiving message from agent: {}",
diff --git a/crates/goose/tests/agent.rs b/crates/goose/tests/agent.rs
index 497ebcaab715..6c257b364466 100644
--- a/crates/goose/tests/agent.rs
+++ b/crates/goose/tests/agent.rs
@@ -143,6 +143,10 @@ async fn run_truncate_test(
                 // Model change events are informational, just continue
             }
 
+            Ok(AgentEvent::HistoryReplaced(_)) => {
+                // Handle history replacement events if needed
+                // For tests, we don't need to do anything special
+            }
             Err(e) => {
                 println!("Error: {:?}", e);
                 return Err(e);
@@ -1037,6 +1041,10 @@ mod max_turns_tests {
                 }
                 Ok(AgentEvent::McpNotification(_)) => {}
                 Ok(AgentEvent::ModelChange { .. }) => {}
+                Ok(AgentEvent::HistoryReplaced(_)) => {
+                    // Handle history replacement events if needed
+                    // For tests, we don't need to do anything special
+                }
                 Err(e) => {
                     return Err(e);
                 }

From 0aa079a3dcdc297202628ae5ac11b3c6fd41176c Mon Sep 17 00:00:00 2001
From: Michael Neale <michael.neale@gmail.com>
Date: Fri, 25 Jul 2025 17:36:32 +1000
Subject: [PATCH 22/41] fmt tidy up

---
 crates/goose-cli/src/session/mod.rs           | 2 +-
 crates/goose/src/agents/agent.rs              | 2 +-
 crates/goose/src/context_mgmt/auto_compact.rs | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/crates/goose-cli/src/session/mod.rs b/crates/goose-cli/src/session/mod.rs
index 752ec8b508af..336753d5598e 100644
--- a/crates/goose-cli/src/session/mod.rs
+++ b/crates/goose-cli/src/session/mod.rs
@@ -1149,7 +1149,7 @@ impl Session {
                         Some(Ok(AgentEvent::HistoryReplaced(new_messages))) => {
                             // Replace the session's message history with the compacted messages
                             self.messages = new_messages;
-                            
+
                             // Persist the updated messages to the session file
                             if let Some(session_file) = &self.session_file {
                                 let provider = self.agent.provider().await.ok();
diff --git a/crates/goose/src/agents/agent.rs b/crates/goose/src/agents/agent.rs
index 44ebdb88754f..0262a11ba3b1 100644
--- a/crates/goose/src/agents/agent.rs
+++ b/crates/goose/src/agents/agent.rs
@@ -745,7 +745,7 @@ impl Agent {
             // Create a new AgentEvent type to signal message history replacement
             return Ok(Box::pin(async_stream::try_stream! {
                 yield AgentEvent::Message(Message::assistant().with_text(compaction_msg));
-                
+
                 // Yield a special event to indicate the session should replace its message history
                 // with the compacted messages
                 yield AgentEvent::HistoryReplaced(messages.clone());
diff --git a/crates/goose/src/context_mgmt/auto_compact.rs b/crates/goose/src/context_mgmt/auto_compact.rs
index 9ce3a7a60c6b..0fd37f03471b 100644
--- a/crates/goose/src/context_mgmt/auto_compact.rs
+++ b/crates/goose/src/context_mgmt/auto_compact.rs
@@ -113,7 +113,7 @@ pub async fn check_compaction_needed(
     eprintln!("  Threshold: {:.1}%", threshold * 100.0);
     eprintln!("  Needs compaction: {}", needs_compaction);
     eprintln!("  Token counts per message: {:?}", token_counts);
-    
+
     // Log first few characters of each message for debugging
     // for (i, msg) in messages.iter().enumerate() {
     //     let content_preview = msg.as_concat_text();

From 23162e4a3058c821a0647cc84d51d7de8ef144a0 Mon Sep 17 00:00:00 2001
From: Michael Neale <michael.neale@gmail.com>
Date: Fri, 25 Jul 2025 19:22:56 +1000
Subject: [PATCH 23/41] import the right tool, don't rely on wildcard

---
 crates/goose/src/context_mgmt/auto_compact.rs | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/crates/goose/src/context_mgmt/auto_compact.rs b/crates/goose/src/context_mgmt/auto_compact.rs
index 0fd37f03471b..a49e7192738c 100644
--- a/crates/goose/src/context_mgmt/auto_compact.rs
+++ b/crates/goose/src/context_mgmt/auto_compact.rs
@@ -240,8 +240,7 @@ mod tests {
         providers::errors::ProviderError,
     };
     use chrono::Utc;
-    use mcp_core::tool::Tool;
-    use rmcp::model::{AnnotateAble, RawTextContent, Role};
+    use rmcp::model::{AnnotateAble, RawTextContent, Role, Tool};
     use std::sync::Arc;
 
     #[derive(Clone)]

From 8342d35dbfdc8565061de995b05538f18a500bcb Mon Sep 17 00:00:00 2001
From: David Katz <dkatz@squareup.com>
Date: Fri, 25 Jul 2025 14:29:33 -0400
Subject: [PATCH 24/41] rm debug logs

---
 crates/goose/src/context_mgmt/auto_compact.rs | 21 -------------------
 1 file changed, 21 deletions(-)

diff --git a/crates/goose/src/context_mgmt/auto_compact.rs b/crates/goose/src/context_mgmt/auto_compact.rs
index 9ce3a7a60c6b..1275a05a73de 100644
--- a/crates/goose/src/context_mgmt/auto_compact.rs
+++ b/crates/goose/src/context_mgmt/auto_compact.rs
@@ -104,27 +104,6 @@ pub async fn check_compaction_needed(
         needs_compaction
     );
 
-    // Add detailed logging to help debug the issue
-    eprintln!("DEBUG COMPACTION CHECK:");
-    eprintln!("  Messages count: {}", messages.len());
-    eprintln!("  Current tokens: {}", current_tokens);
-    eprintln!("  Context limit: {}", context_limit);
-    eprintln!("  Usage ratio: {:.1}%", usage_ratio * 100.0);
-    eprintln!("  Threshold: {:.1}%", threshold * 100.0);
-    eprintln!("  Needs compaction: {}", needs_compaction);
-    eprintln!("  Token counts per message: {:?}", token_counts);
-    
-    // Log first few characters of each message for debugging
-    // for (i, msg) in messages.iter().enumerate() {
-    //     let content_preview = msg.as_concat_text();
-    //     let preview = if content_preview.len() > 100 {
-    //         format!("{}...", &content_preview[..100])
-    //     } else {
-    //         content_preview
-    //     };
-    //     eprintln!("  Message {}: {} tokens, content: {:?}", i, token_counts[i], preview);
-    // }
-
     Ok(CompactionCheckResult {
         needs_compaction,
         current_tokens,

From 4b12c2b60f0b1d3be15d8304ac516764b47af7db Mon Sep 17 00:00:00 2001
From: David Katz <dkatz@squareup.com>
Date: Fri, 25 Jul 2025 14:46:44 -0400
Subject: [PATCH 25/41] rm extra file + clean comments

---
 auto-compact-bench-config.json | 26 --------------------------
 crates/goose-ffi/src/lib.rs    |  2 --
 crates/goose/src/scheduler.rs  |  2 --
 crates/goose/tests/agent.rs    |  7 +------
 4 files changed, 1 insertion(+), 36 deletions(-)
 delete mode 100644 auto-compact-bench-config.json

diff --git a/auto-compact-bench-config.json b/auto-compact-bench-config.json
deleted file mode 100644
index 71059b9be5ec..000000000000
--- a/auto-compact-bench-config.json
+++ /dev/null
@@ -1,26 +0,0 @@
-{
-  "models": [
-    {
-      "provider": "databricks",
-      "name": "goose",
-      "parallel_safe": false,
-      "tool_shim": {
-        "use_tool_shim": false,
-        "tool_shim_model": null
-      }
-    }
-  ],
-  "evals": [
-    {
-      "selector": "core:developer:simple_repo_clone_test",
-      "post_process_cmd": null,
-      "parallel_safe": false
-    }
-  ],
-  "include_dirs": [],
-  "repeat": 1,
-  "run_id": "auto-compact-test",
-  "eval_result_filename": "eval-results.json",
-  "run_summary_filename": "run-results-summary.json",
-  "env_file": null
-}
diff --git a/crates/goose-ffi/src/lib.rs b/crates/goose-ffi/src/lib.rs
index 98ddf5382146..367901c674d1 100644
--- a/crates/goose-ffi/src/lib.rs
+++ b/crates/goose-ffi/src/lib.rs
@@ -269,10 +269,8 @@ pub unsafe extern "C" fn goose_agent_send_message(
                 Ok(AgentEvent::ModelChange { .. }) => {
                     // Model change events are informational, just continue
                 }
-
                 Ok(AgentEvent::HistoryReplaced(_)) => {
                     // Handle history replacement events if needed
-                    // For FFI, we don't need to do anything special
                 }
                 Err(e) => {
                     full_response.push_str(&format!("\nError in message stream: {}", e));
diff --git a/crates/goose/src/scheduler.rs b/crates/goose/src/scheduler.rs
index 7673b5bd8b13..fe09a2fe4434 100644
--- a/crates/goose/src/scheduler.rs
+++ b/crates/goose/src/scheduler.rs
@@ -1231,10 +1231,8 @@ async fn run_scheduled_job_internal(
                         Ok(AgentEvent::ModelChange { .. }) => {
                             // Model change events are informational, just continue
                         }
-
                         Ok(AgentEvent::HistoryReplaced(_)) => {
                             // Handle history replacement events if needed
-                            // For scheduled jobs, we don't need to do anything special
                         }
                         Err(e) => {
                             tracing::error!(
diff --git a/crates/goose/tests/agent.rs b/crates/goose/tests/agent.rs
index 7e7342312810..214b84b0c654 100644
--- a/crates/goose/tests/agent.rs
+++ b/crates/goose/tests/agent.rs
@@ -142,10 +142,8 @@ async fn run_truncate_test(
             Ok(AgentEvent::ModelChange { .. }) => {
                 // Model change events are informational, just continue
             }
-
             Ok(AgentEvent::HistoryReplaced(_)) => {
                 // Handle history replacement events if needed
-                // For tests, we don't need to do anything special
             }
             Err(e) => {
                 println!("Error: {:?}", e);
@@ -1046,10 +1044,7 @@ mod max_turns_tests {
                 }
                 Ok(AgentEvent::McpNotification(_)) => {}
                 Ok(AgentEvent::ModelChange { .. }) => {}
-                Ok(AgentEvent::HistoryReplaced(_)) => {
-                    // Handle history replacement events if needed
-                    // For tests, we don't need to do anything special
-                }
+                Ok(AgentEvent::HistoryReplaced(_)) => {}
                 Err(e) => {
                     return Err(e);
                 }

From 97fa0f87e86c9dac2e99673f54c557e1ee751409 Mon Sep 17 00:00:00 2001
From: David Katz <dkatz@squareup.com>
Date: Mon, 28 Jul 2025 13:17:17 -0400
Subject: [PATCH 26/41] fmt

---
 crates/goose/src/agents/agent.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crates/goose/src/agents/agent.rs b/crates/goose/src/agents/agent.rs
index 5d1f390f8421..6c15e409ffe5 100644
--- a/crates/goose/src/agents/agent.rs
+++ b/crates/goose/src/agents/agent.rs
@@ -717,7 +717,7 @@ impl Agent {
         cancel_token: Option<CancellationToken>,
     ) -> Result<BoxStream<'_, Result<AgentEvent>>> {
         let (mut messages, issues) =
-        ConversationFixer::fix_conversation(Vec::from(unfixed_messages));
+            ConversationFixer::fix_conversation(Vec::from(unfixed_messages));
         if !issues.is_empty() {
             tracing::warn!(
                 "Conversation issue fixed: {}",

From b233c74625d1136d57a25a6099048e5b8a6a9ccc Mon Sep 17 00:00:00 2001
From: David Katz <dkatz@squareup.com>
Date: Mon, 28 Jul 2025 14:23:40 -0400
Subject: [PATCH 27/41] autocompact splice last message

---
 crates/goose/src/context_mgmt/auto_compact.rs | 38 ++++++++++++++-----
 1 file changed, 28 insertions(+), 10 deletions(-)

diff --git a/crates/goose/src/context_mgmt/auto_compact.rs b/crates/goose/src/context_mgmt/auto_compact.rs
index fd52fcc99f0f..d1725dcaf034 100644
--- a/crates/goose/src/context_mgmt/auto_compact.rs
+++ b/crates/goose/src/context_mgmt/auto_compact.rs
@@ -6,7 +6,7 @@ use crate::{
     token_counter::create_async_token_counter,
 };
 use anyhow::Result;
-use tracing::{debug, info};
+use tracing::{debug, warn, info};
 
 /// Result of auto-compaction check
 #[derive(Debug)]
@@ -60,7 +60,7 @@ pub async fn check_compaction_needed(
     let threshold = threshold_override.unwrap_or_else(|| {
         config
             .get_param::<f64>("GOOSE_AUTO_COMPACT_THRESHOLD")
-            .unwrap_or(0.3) // Default to 30%
+            .unwrap_or(0.00001) // Default to 30%
     });
 
     // Get provider and token counter
@@ -158,9 +158,8 @@ pub async fn perform_compaction(
 /// Check if messages need compaction and compact them if necessary
 ///
 /// This is a convenience wrapper function that combines checking and compaction.
-/// It uses the separate `check_compaction_needed` and `perform_compaction` functions
-/// internally to provide the same interface as before while allowing for better
-/// separation of concerns.
+/// If the most recent message is a user message, it will be preserved by removing it
+/// before compaction and adding it back afterwards.
 ///
 /// # Arguments
 /// * `agent` - The agent to use for context management
@@ -180,8 +179,9 @@ pub async fn check_and_compact_messages(
     // If no compaction is needed, return early
     if !check_result.needs_compaction {
         debug!(
-            "No compaction needed (usage: {:.1}% <= threshold)",
-            check_result.usage_ratio * 100.0
+            "No compaction needed (usage: {:.1}% <= {:.1}% threshold)",
+            check_result.usage_ratio * 100.0,
+            check_result.percentage_until_compaction
         );
         return Ok(AutoCompactResult {
             compacted: false,
@@ -196,9 +196,27 @@ pub async fn check_and_compact_messages(
         check_result.usage_ratio * 100.0
     );
 
-    // Perform the compaction
-    let (compacted_messages, tokens_before, tokens_after) =
-        perform_compaction(agent, messages).await?;
+    // Check if the most recent message is a user message
+    let (messages_to_compact, preserved_user_message) = 
+        if let Some(last_message) = messages.last() {
+            if matches!(last_message.role, rmcp::model::Role::User) {
+                // Remove the last user message before auto-compaction
+                (&messages[..messages.len() - 1], Some(last_message.clone()))
+            } else {
+                (messages, None)
+            }
+        } else {
+            (messages, None)
+        };
+
+    // Perform the compaction on messages excluding the preserved user message
+    let (mut compacted_messages, tokens_before, tokens_after) =
+        perform_compaction(agent, messages_to_compact).await?;
+
+    // Add back the preserved user message if it exists
+    if let Some(user_message) = preserved_user_message {
+        compacted_messages.push(user_message);
+    }
 
     Ok(AutoCompactResult {
         compacted: true,

From 1a1733e0e95d57a4c759098266cd8a5023ac6447 Mon Sep 17 00:00:00 2001
From: David Katz <dkatz@squareup.com>
Date: Mon, 28 Jul 2025 14:24:29 -0400
Subject: [PATCH 28/41] fix threshold

---
 crates/goose/src/context_mgmt/auto_compact.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crates/goose/src/context_mgmt/auto_compact.rs b/crates/goose/src/context_mgmt/auto_compact.rs
index d1725dcaf034..a3f083c9a030 100644
--- a/crates/goose/src/context_mgmt/auto_compact.rs
+++ b/crates/goose/src/context_mgmt/auto_compact.rs
@@ -60,7 +60,7 @@ pub async fn check_compaction_needed(
     let threshold = threshold_override.unwrap_or_else(|| {
         config
             .get_param::<f64>("GOOSE_AUTO_COMPACT_THRESHOLD")
-            .unwrap_or(0.00001) // Default to 30%
+            .unwrap_or(0.3) // Default to 30%
     });
 
     // Get provider and token counter

From 2966295211f0a1be4246c83c1368ac3f11d1bd51 Mon Sep 17 00:00:00 2001
From: David Katz <dkatz@squareup.com>
Date: Mon, 28 Jul 2025 14:26:54 -0400
Subject: [PATCH 29/41] fmt

---
 crates/goose/src/context_mgmt/auto_compact.rs | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/crates/goose/src/context_mgmt/auto_compact.rs b/crates/goose/src/context_mgmt/auto_compact.rs
index a3f083c9a030..b3c193d75487 100644
--- a/crates/goose/src/context_mgmt/auto_compact.rs
+++ b/crates/goose/src/context_mgmt/auto_compact.rs
@@ -6,7 +6,7 @@ use crate::{
     token_counter::create_async_token_counter,
 };
 use anyhow::Result;
-use tracing::{debug, warn, info};
+use tracing::{debug, info, warn};
 
 /// Result of auto-compaction check
 #[derive(Debug)]
@@ -197,17 +197,17 @@ pub async fn check_and_compact_messages(
     );
 
     // Check if the most recent message is a user message
-    let (messages_to_compact, preserved_user_message) = 
-        if let Some(last_message) = messages.last() {
-            if matches!(last_message.role, rmcp::model::Role::User) {
-                // Remove the last user message before auto-compaction
-                (&messages[..messages.len() - 1], Some(last_message.clone()))
-            } else {
-                (messages, None)
-            }
+    let (messages_to_compact, preserved_user_message) = if let Some(last_message) = messages.last()
+    {
+        if matches!(last_message.role, rmcp::model::Role::User) {
+            // Remove the last user message before auto-compaction
+            (&messages[..messages.len() - 1], Some(last_message.clone()))
         } else {
             (messages, None)
-        };
+        }
+    } else {
+        (messages, None)
+    };
 
     // Perform the compaction on messages excluding the preserved user message
     let (mut compacted_messages, tokens_before, tokens_after) =

From 8df19c506ba2bc24b4f2896b665842624a5d8769 Mon Sep 17 00:00:00 2001
From: David Katz <dkatz@squareup.com>
Date: Mon, 28 Jul 2025 14:27:10 -0400
Subject: [PATCH 30/41] unused

---
 crates/goose/src/context_mgmt/auto_compact.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crates/goose/src/context_mgmt/auto_compact.rs b/crates/goose/src/context_mgmt/auto_compact.rs
index b3c193d75487..0b6d414857e1 100644
--- a/crates/goose/src/context_mgmt/auto_compact.rs
+++ b/crates/goose/src/context_mgmt/auto_compact.rs
@@ -6,7 +6,7 @@ use crate::{
     token_counter::create_async_token_counter,
 };
 use anyhow::Result;
-use tracing::{debug, info, warn};
+use tracing::{debug, info};
 
 /// Result of auto-compaction check
 #[derive(Debug)]

From d8f07ae7fee14d1f48cecc7cad34ff5d7eb9b4f8 Mon Sep 17 00:00:00 2001
From: David Katz <dkatz@squareup.com>
Date: Mon, 28 Jul 2025 17:06:31 -0400
Subject: [PATCH 31/41] rm stray files

---
 godot-mcp-config.json    |  9 ---------
 godot-mcp-extension.json |  9 ---------
 setup-godot-mcp.sh       | 38 --------------------------------------
 3 files changed, 56 deletions(-)
 delete mode 100644 godot-mcp-config.json
 delete mode 100644 godot-mcp-extension.json
 delete mode 100755 setup-godot-mcp.sh

diff --git a/godot-mcp-config.json b/godot-mcp-config.json
deleted file mode 100644
index 5814e0e5aa02..000000000000
--- a/godot-mcp-config.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "type": "stdio",
-  "name": "godot-mcp",
-  "cmd": "node",
-  "args": ["/Users/dkatz/git/goose2/Godot-MCP/server/dist/index.js"],
-  "description": "Godot MCP server for interacting with Godot Engine projects",
-  "timeout": 300,
-  "bundled": false
-}
diff --git a/godot-mcp-extension.json b/godot-mcp-extension.json
deleted file mode 100644
index 5814e0e5aa02..000000000000
--- a/godot-mcp-extension.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "type": "stdio",
-  "name": "godot-mcp",
-  "cmd": "node",
-  "args": ["/Users/dkatz/git/goose2/Godot-MCP/server/dist/index.js"],
-  "description": "Godot MCP server for interacting with Godot Engine projects",
-  "timeout": 300,
-  "bundled": false
-}
diff --git a/setup-godot-mcp.sh b/setup-godot-mcp.sh
deleted file mode 100755
index 8c3799882b68..000000000000
--- a/setup-godot-mcp.sh
+++ /dev/null
@@ -1,38 +0,0 @@
-#!/bin/bash
-
-# Script to add Godot MCP extension to Goose configuration
-
-# Path to the Godot MCP server
-GODOT_MCP_PATH="/Users/dkatz/git/goose2/Godot-MCP/server/dist/index.js"
-
-# Check if the server file exists
-if [ ! -f "$GODOT_MCP_PATH" ]; then
-    echo "Error: Godot MCP server not found at $GODOT_MCP_PATH"
-    exit 1
-fi
-
-# Create the extension configuration
-cat > godot-mcp-extension.json << EOF
-{
-  "type": "stdio",
-  "name": "godot-mcp",
-  "cmd": "node",
-  "args": ["$GODOT_MCP_PATH"],
-  "description": "Godot MCP server for interacting with Godot Engine projects",
-  "timeout": 300,
-  "bundled": false
-}
-EOF
-
-echo "Created Godot MCP extension configuration at godot-mcp-extension.json"
-echo ""
-echo "To use this extension with Goose, you can:"
-echo "1. Use it directly in a session with: goose session --with-extension 'node $GODOT_MCP_PATH'"
-echo "2. Or add it to your Goose configuration programmatically"
-echo ""
-echo "The extension provides tools for:"
-echo "- Node management (create, modify, delete nodes)"
-echo "- Script management (read, write, analyze scripts)"
-echo "- Scene management (create, modify scenes)"
-echo "- Project management (access project settings and resources)"
-echo "- Editor control (run project, get editor state)"

From 985dfcdbd5d8375e1023cf63918d3325aeafd904 Mon Sep 17 00:00:00 2001
From: David Katz <dkatz@squareup.com>
Date: Tue, 29 Jul 2025 15:21:51 -0400
Subject: [PATCH 32/41] rm noise

---
 cat | 12 ------------
 dog | 12 ------------
 2 files changed, 24 deletions(-)
 delete mode 100644 cat
 delete mode 100644 dog

diff --git a/cat b/cat
deleted file mode 100644
index c1305f09c6ed..000000000000
--- a/cat
+++ /dev/null
@@ -1,12 +0,0 @@
-Fish are aquatic vertebrates that breathe through gills and have fins for swimming.
-Most fish are cold-blooded, meaning their body temperature changes with their environment.
-There are over 34,000 species of fish, making them the most diverse group of vertebrates.
-Fish can be found in nearly every aquatic environment, from deep ocean trenches to mountain streams.
-Some fish, like salmon, are anadromous, meaning they migrate from saltwater to freshwater to spawn.
-The largest fish is the whale shark, which can grow up to 40 feet long.
-The smallest fish is the Paedocypris progenetica, measuring only about 7.9 millimeters.
-Fish have a lateral line system that helps them detect movement and pressure changes in water.
-Many fish species can change color for camouflage or communication purposes.
-Fish play crucial roles in aquatic ecosystems as both predators and prey.
-Some fish, like the lungfish, can breathe air and survive out of water for extended periods.
-Fish have been on Earth for over 500 million years, predating dinosaurs by millions of years.
diff --git a/dog b/dog
deleted file mode 100644
index b27c52814877..000000000000
--- a/dog
+++ /dev/null
@@ -1,12 +0,0 @@
-Fish have been on Earth for over 500 million years, predating dinosaurs by millions of years.
-Some fish, like the lungfish, can breathe air and survive out of water for extended periods.
-Fish play crucial roles in aquatic ecosystems as both predators and prey.
-Many fish species can change color for camouflage or communication purposes.
-Fish have a lateral line system that helps them detect movement and pressure changes in water.
-The smallest fish is the Paedocypris progenetica, measuring only about 7.9 millimeters.
-The largest fish is the whale shark, which can grow up to 40 feet long.
-Some fish, like salmon, are anadromous, meaning they migrate from saltwater to freshwater to spawn.
-Fish can be found in nearly every aquatic environment, from deep ocean trenches to mountain streams.
-There are over 34,000 species of fish, making them the most diverse group of vertebrates.
-Most fish are cold-blooded, meaning their body temperature changes with their environment.
-Fish are aquatic vertebrates that breathe through gills and have fins for swimming.

From a7e68b6095286409f1d83a17d977307efb900861 Mon Sep 17 00:00:00 2001
From: David Katz <dkatz@squareup.com>
Date: Tue, 29 Jul 2025 16:38:49 -0400
Subject: [PATCH 33/41] replace with shorter summary

---
 crates/goose/src/context_mgmt/summarize.rs    |  2 +-
 .../src/prompts/summarize_oneshot_short.md    | 26 +++++++++++++++++++
 2 files changed, 27 insertions(+), 1 deletion(-)
 create mode 100644 crates/goose/src/prompts/summarize_oneshot_short.md

diff --git a/crates/goose/src/context_mgmt/summarize.rs b/crates/goose/src/context_mgmt/summarize.rs
index dcdf7afca0f1..ee42155c6fca 100644
--- a/crates/goose/src/context_mgmt/summarize.rs
+++ b/crates/goose/src/context_mgmt/summarize.rs
@@ -75,7 +75,7 @@ pub async fn summarize_messages_oneshot(
     };
 
     // Render the one-shot summarization prompt
-    let system_prompt = render_global_file("summarize_oneshot.md", &context)?;
+    let system_prompt = render_global_file("summarize_oneshot_short.md", &context)?;
 
     // Create a simple user message requesting summarization
     let user_message = Message::user()
diff --git a/crates/goose/src/prompts/summarize_oneshot_short.md b/crates/goose/src/prompts/summarize_oneshot_short.md
new file mode 100644
index 000000000000..38d1848acebe
--- /dev/null
+++ b/crates/goose/src/prompts/summarize_oneshot_short.md
@@ -0,0 +1,26 @@
+## Summary Task  
+Generate detailed summary of conversation to date.  
+Include user requests, your responses, and all technical content.  
+
+Wrap reasoning in `<analysis>` tags:  
+- Review conversation chronologically  
+- For each part, log:  
+  - User goals and requests  
+  - Your method and solution  
+  - Key decisions and designs  
+  - File names, code, signatures, errors, fixes  
+- Highlight user feedback and revisions  
+- Confirm completeness and accuracy  
+
+### Summary Must Include:  
+1. **User Intent** – All goals and requests  
+2. **Technical Concepts** – All discussed tools, methods  
+3. **Files + Code** – Viewed/edited files, full code, change justifications  
+4. **Errors + Fixes** – Bugs, resolutions, user-driven changes  
+5. **Problem Solving** – Issues solved or in progress  
+6. **User Messages** – All user messages, exclude tool output  
+7. **Pending Tasks** – All unresolved user requests  
+8. **Current Work** – Active work at summary request time: filenames, code, alignment to latest instruction  
+9. **Next Step** – *Include only if* directly continues user instruction  
+
+> ⚠️ No new ideas unless user confirmed

From 739eb0b3738e212063d672593a2f854a97fd18da Mon Sep 17 00:00:00 2001
From: David Katz <dkatz@squareup.com>
Date: Tue, 29 Jul 2025 17:43:35 -0400
Subject: [PATCH 34/41] fix test

---
 crates/goose/src/context_mgmt/summarize.rs | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/crates/goose/src/context_mgmt/summarize.rs b/crates/goose/src/context_mgmt/summarize.rs
index ee42155c6fca..262f61e466ed 100644
--- a/crates/goose/src/context_mgmt/summarize.rs
+++ b/crates/goose/src/context_mgmt/summarize.rs
@@ -257,11 +257,9 @@ mod tests {
     use crate::providers::base::{Provider, ProviderMetadata, ProviderUsage, Usage};
     use crate::providers::errors::ProviderError;
     use chrono::Utc;
-    use mcp_core::ToolCall;
     use rmcp::model::Role;
     use rmcp::model::Tool;
-    use rmcp::model::{AnnotateAble, Content, RawTextContent};
-    use serde_json::json;
+    use rmcp::model::{AnnotateAble, RawTextContent};
     use std::sync::Arc;
 
     #[derive(Clone)]
@@ -500,8 +498,8 @@ mod tests {
             let mut count = self.call_count.lock().unwrap();
             *count += 1;
 
-            // Fail if this looks like a one-shot request (contains the one-shot prompt content)
-            if system.contains("Summary Generation Instructions") {
+            // Fail if this looks like a one-shot request
+            if system.contains("reasoning in `<analysis>` tags") {
                 return Err(ProviderError::RateLimitExceeded(
                     "Simulated one-shot failure".to_string(),
                 ));

From 168f33264957efe53d621ccea454f5f188dd5375 Mon Sep 17 00:00:00 2001
From: David Katz <dkatz@squareup.com>
Date: Wed, 30 Jul 2025 15:11:31 -0400
Subject: [PATCH 35/41] merge fix pt 2

---
 crates/goose/src/agents/agent.rs              |  8 ++--
 crates/goose/src/context_mgmt/auto_compact.rs | 21 +++++---
 crates/goose/src/context_mgmt/summarize.rs    | 15 +++---
 crates/goose/src/model.rs                     | 48 ++++++++++++++-----
 crates/goose/src/providers/claude_code.rs     | 37 ++++++++++----
 5 files changed, 88 insertions(+), 41 deletions(-)

diff --git a/crates/goose/src/agents/agent.rs b/crates/goose/src/agents/agent.rs
index 181119ad263d..4ddbc22743a4 100644
--- a/crates/goose/src/agents/agent.rs
+++ b/crates/goose/src/agents/agent.rs
@@ -882,7 +882,7 @@ impl Agent {
                 yield AgentEvent::HistoryReplaced(messages.clone());
                 
                 // Continue with normal reply processing using compacted messages
-                let mut reply_stream = self.reply_internal(&messages, session, cancel_token).await?;
+                let mut reply_stream = self.reply_main(&messages, session, cancel_token).await?;
                 while let Some(event) = reply_stream.next().await {
                     yield event?;
                 }
@@ -890,11 +890,11 @@ impl Agent {
         }
 
         // No compaction needed, proceed with normal processing
-        self.reply_internal(&messages, session, cancel_token).await
+        self.reply_main(&messages, session, cancel_token).await
     }
 
-    /// Internal reply method that handles the actual agent processing
-    async fn reply_internal(
+    /// Main reply method that handles the actual agent processing
+    async fn reply_main(
         &self,
         messages: &[Message],
         session: Option<SessionConfig>,
diff --git a/crates/goose/src/context_mgmt/auto_compact.rs b/crates/goose/src/context_mgmt/auto_compact.rs
index 0b6d414857e1..45268f4e8db3 100644
--- a/crates/goose/src/context_mgmt/auto_compact.rs
+++ b/crates/goose/src/context_mgmt/auto_compact.rs
@@ -289,7 +289,8 @@ mod tests {
     #[tokio::test]
     async fn test_check_compaction_needed() {
         let mock_provider = Arc::new(MockProvider {
-            model_config: ModelConfig::new("test-model".to_string())
+            model_config: ModelConfig::new("test-model")
+                .unwrap()
                 .with_context_limit(100_000.into()),
         });
 
@@ -314,7 +315,8 @@ mod tests {
     #[tokio::test]
     async fn test_check_compaction_needed_disabled() {
         let mock_provider = Arc::new(MockProvider {
-            model_config: ModelConfig::new("test-model".to_string())
+            model_config: ModelConfig::new("test-model")
+                .unwrap()
                 .with_context_limit(100_000.into()),
         });
 
@@ -341,7 +343,8 @@ mod tests {
     #[tokio::test]
     async fn test_perform_compaction() {
         let mock_provider = Arc::new(MockProvider {
-            model_config: ModelConfig::new("test-model".to_string())
+            model_config: ModelConfig::new("test-model")
+                .unwrap()
                 .with_context_limit(50_000.into()),
         });
 
@@ -368,7 +371,8 @@ mod tests {
     #[tokio::test]
     async fn test_auto_compact_disabled() {
         let mock_provider = Arc::new(MockProvider {
-            model_config: ModelConfig::new("test-model".to_string())
+            model_config: ModelConfig::new("test-model")
+                .unwrap()
                 .with_context_limit(10_000.into()),
         });
 
@@ -398,7 +402,8 @@ mod tests {
     #[tokio::test]
     async fn test_auto_compact_below_threshold() {
         let mock_provider = Arc::new(MockProvider {
-            model_config: ModelConfig::new("test-model".to_string())
+            model_config: ModelConfig::new("test-model")
+                .unwrap()
                 .with_context_limit(100_000.into()), // Increased to ensure overhead doesn't dominate
         });
 
@@ -419,7 +424,8 @@ mod tests {
     #[tokio::test]
     async fn test_auto_compact_above_threshold() {
         let mock_provider = Arc::new(MockProvider {
-            model_config: ModelConfig::new("test-model".to_string())
+            model_config: ModelConfig::new("test-model")
+                .unwrap()
                 .with_context_limit(50_000.into()), // Realistic context limit that won't underflow
         });
 
@@ -465,7 +471,8 @@ mod tests {
     #[tokio::test]
     async fn test_auto_compact_respects_config() {
         let mock_provider = Arc::new(MockProvider {
-            model_config: ModelConfig::new("test-model".to_string())
+            model_config: ModelConfig::new("test-model")
+                .unwrap()
                 .with_context_limit(30_000.into()), // Smaller context limit to make threshold easier to hit
         });
 
diff --git a/crates/goose/src/context_mgmt/summarize.rs b/crates/goose/src/context_mgmt/summarize.rs
index fbba173d7448..d7830bc553c8 100644
--- a/crates/goose/src/context_mgmt/summarize.rs
+++ b/crates/goose/src/context_mgmt/summarize.rs
@@ -301,7 +301,7 @@ mod tests {
 
     fn create_mock_provider() -> Result<Arc<dyn Provider>> {
         let mock_model_config =
-            ModelConfig::new_or_fail("test-model").with_context_limit(200_000.into());
+            ModelConfig::new("test-model")?.with_context_limit(200_000.into());
 
         Ok(Arc::new(MockProvider {
             model_config: mock_model_config,
@@ -423,7 +423,7 @@ mod tests {
 
     #[tokio::test]
     async fn test_summarize_messages_uses_oneshot_for_small_context() {
-        let provider = create_mock_provider();
+        let provider = create_mock_provider().expect("failed to create mock provider");
         let token_counter = TokenCounter::new();
         let context_limit = 100_000; // Large context limit
         let messages = create_test_messages(); // Small message set
@@ -449,7 +449,7 @@ mod tests {
 
     #[tokio::test]
     async fn test_summarize_messages_uses_chunked_for_large_context() {
-        let provider = create_mock_provider();
+        let provider = create_mock_provider().expect("failed to create mock provider");
         let token_counter = TokenCounter::new();
         let context_limit = 10_000; // Higher limit to avoid underflow
         let messages = create_test_messages();
@@ -527,7 +527,8 @@ mod tests {
     async fn test_summarize_messages_fallback_on_oneshot_failure() {
         let call_count = Arc::new(std::sync::Mutex::new(0));
         let provider = Arc::new(FailingOneshotProvider {
-            model_config: ModelConfig::new("test-model".to_string())
+            model_config: ModelConfig::new("test-model")
+                .unwrap()
                 .with_context_limit(200_000.into()),
             call_count: Arc::clone(&call_count),
         });
@@ -567,7 +568,7 @@ mod tests {
 
     #[tokio::test]
     async fn test_summarize_messages_oneshot_direct_call() {
-        let provider = create_mock_provider();
+        let provider = create_mock_provider().expect("failed to create mock provider");
         let token_counter = TokenCounter::new();
         let context_limit = 100_000;
         let messages = create_test_messages();
@@ -605,7 +606,7 @@ mod tests {
 
     #[tokio::test]
     async fn test_summarize_messages_chunked_direct_call() {
-        let provider = create_mock_provider();
+        let provider = create_mock_provider().expect("failed to create mock provider");
         let token_counter = TokenCounter::new();
         let context_limit = 10_000; // Higher limit to avoid underflow
         let messages = create_test_messages();
@@ -643,7 +644,7 @@ mod tests {
 
     #[tokio::test]
     async fn test_absolute_token_threshold_calculation() {
-        let provider = create_mock_provider();
+        let provider = create_mock_provider().expect("failed to create mock provider");
         let token_counter = TokenCounter::new();
 
         // Test with a context limit where absolute token calculation matters
diff --git a/crates/goose/src/model.rs b/crates/goose/src/model.rs
index dad3f74c2bda..7e6a22c8f17a 100644
--- a/crates/goose/src/model.rs
+++ b/crates/goose/src/model.rs
@@ -247,19 +247,28 @@ mod tests {
 
     #[test]
     fn test_model_config_context_limits() {
-        let config = ModelConfig::new("claude-3-opus")
-            .unwrap()
-            .with_context_limit(Some(150_000));
-        assert_eq!(config.context_limit(), 150_000);
-
-        let config = ModelConfig::new("claude-3-opus").unwrap();
-        assert_eq!(config.context_limit(), 200_000);
-
-        let config = ModelConfig::new("gpt-4-turbo").unwrap();
-        assert_eq!(config.context_limit(), 128_000);
-
-        let config = ModelConfig::new("unknown-model").unwrap();
-        assert_eq!(config.context_limit(), DEFAULT_CONTEXT_LIMIT);
+        // Clear environment variables to ensure clean test
+        with_var("GOOSE_CONTEXT_LIMIT", None::<&str>, || {
+            with_var("GOOSE_TEMPERATURE", None::<&str>, || {
+                with_var("GOOSE_TOOLSHIM", None::<&str>, || {
+                    with_var("GOOSE_TOOLSHIM_OLLAMA_MODEL", None::<&str>, || {
+                        let config = ModelConfig::new("claude-3-opus")
+                            .unwrap()
+                            .with_context_limit(Some(150_000));
+                        assert_eq!(config.context_limit(), 150_000);
+
+                        let config = ModelConfig::new("claude-3-opus").unwrap();
+                        assert_eq!(config.context_limit(), 200_000);
+
+                        let config = ModelConfig::new("gpt-4-turbo").unwrap();
+                        assert_eq!(config.context_limit(), 128_000);
+
+                        let config = ModelConfig::new("unknown-model").unwrap();
+                        assert_eq!(config.context_limit(), DEFAULT_CONTEXT_LIMIT);
+                    });
+                });
+            });
+        });
     }
 
     #[test]
@@ -329,6 +338,7 @@ mod tests {
 
     #[test]
     fn test_valid_configurations() {
+        // Test with environment variables set
         with_var("GOOSE_CONTEXT_LIMIT", Some("50000"), || {
             with_var("GOOSE_TEMPERATURE", Some("0.7"), || {
                 with_var("GOOSE_TOOLSHIM", Some("true"), || {
@@ -342,5 +352,17 @@ mod tests {
                 });
             });
         });
+        
+        // Test without environment variables (should use model-specific limits)
+        with_var("GOOSE_CONTEXT_LIMIT", None::<&str>, || {
+            with_var("GOOSE_TEMPERATURE", None::<&str>, || {
+                with_var("GOOSE_TOOLSHIM", None::<&str>, || {
+                    with_var("GOOSE_TOOLSHIM_OLLAMA_MODEL", None::<&str>, || {
+                        let config = ModelConfig::new("claude-3-opus").unwrap();
+                        assert_eq!(config.context_limit(), 200_000);
+                    });
+                });
+            });
+        });
     }
 }
diff --git a/crates/goose/src/providers/claude_code.rs b/crates/goose/src/providers/claude_code.rs
index 833fd4547aa4..298731dbb16a 100644
--- a/crates/goose/src/providers/claude_code.rs
+++ b/crates/goose/src/providers/claude_code.rs
@@ -518,6 +518,7 @@ impl Provider for ClaudeCodeProvider {
 mod tests {
     use super::ModelConfig;
     use super::*;
+    use temp_env::with_var;
 
     #[test]
     fn test_claude_code_model_config() {
@@ -544,20 +545,36 @@ mod tests {
     #[test]
     fn test_claude_code_invalid_model_no_fallback() {
         // Test that an invalid model is kept as-is (no fallback)
-        let invalid_model = ModelConfig::new_or_fail("invalid-model");
-        let provider = ClaudeCodeProvider::from_env(invalid_model).unwrap();
-        let config = provider.get_model_config();
-
-        assert_eq!(config.model_name, "invalid-model");
+        with_var("GOOSE_CONTEXT_LIMIT", None::<&str>, || {
+            with_var("GOOSE_TEMPERATURE", None::<&str>, || {
+                with_var("GOOSE_TOOLSHIM", None::<&str>, || {
+                    with_var("GOOSE_TOOLSHIM_OLLAMA_MODEL", None::<&str>, || {
+                        let invalid_model = ModelConfig::new("invalid-model").unwrap();
+                        let provider = ClaudeCodeProvider::from_env(invalid_model).unwrap();
+                        let config = provider.get_model_config();
+
+                        assert_eq!(config.model_name, "invalid-model");
+                    });
+                });
+            });
+        });
     }
 
     #[test]
     fn test_claude_code_valid_model() {
         // Test that a valid model is preserved
-        let valid_model = ModelConfig::new_or_fail("sonnet");
-        let provider = ClaudeCodeProvider::from_env(valid_model).unwrap();
-        let config = provider.get_model_config();
-
-        assert_eq!(config.model_name, "sonnet");
+        with_var("GOOSE_CONTEXT_LIMIT", None::<&str>, || {
+            with_var("GOOSE_TEMPERATURE", None::<&str>, || {
+                with_var("GOOSE_TOOLSHIM", None::<&str>, || {
+                    with_var("GOOSE_TOOLSHIM_OLLAMA_MODEL", None::<&str>, || {
+                        let valid_model = ModelConfig::new("sonnet").unwrap();
+                        let provider = ClaudeCodeProvider::from_env(valid_model).unwrap();
+                        let config = provider.get_model_config();
+
+                        assert_eq!(config.model_name, "sonnet");
+                    });
+                });
+            });
+        });
     }
 }

From 585a26064c478f40b7250d49a882e6bf766b00f1 Mon Sep 17 00:00:00 2001
From: David Katz <dkatz@squareup.com>
Date: Wed, 30 Jul 2025 16:51:05 -0400
Subject: [PATCH 36/41] fmt

---
 crates/goose/src/agents/agent.rs           | 11 +++++------
 crates/goose/src/context_mgmt/summarize.rs |  3 +--
 crates/goose/src/model.rs                  |  2 +-
 3 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/crates/goose/src/agents/agent.rs b/crates/goose/src/agents/agent.rs
index 4ddbc22743a4..5f6859bca94a 100644
--- a/crates/goose/src/agents/agent.rs
+++ b/crates/goose/src/agents/agent.rs
@@ -833,7 +833,7 @@ impl Agent {
         messages: &[Message],
     ) -> Result<Option<(Vec<Message>, String)>> {
         let compact_result = auto_compact::check_and_compact_messages(self, messages, None).await?;
-        
+
         if compact_result.compacted {
             let compacted_messages = compact_result.messages;
 
@@ -865,7 +865,8 @@ impl Agent {
         cancel_token: Option<CancellationToken>,
     ) -> Result<BoxStream<'_, Result<AgentEvent>>> {
         // Handle auto-compaction before processing
-        let (messages, compaction_msg) = match self.handle_auto_compaction(unfixed_messages).await? {
+        let (messages, compaction_msg) = match self.handle_auto_compaction(unfixed_messages).await?
+        {
             Some((compacted_messages, msg)) => (compacted_messages, Some(msg)),
             None => {
                 let context = self
@@ -880,7 +881,7 @@ impl Agent {
             return Ok(Box::pin(async_stream::try_stream! {
                 yield AgentEvent::Message(Message::assistant().with_text(compaction_msg));
                 yield AgentEvent::HistoryReplaced(messages.clone());
-                
+
                 // Continue with normal reply processing using compacted messages
                 let mut reply_stream = self.reply_main(&messages, session, cancel_token).await?;
                 while let Some(event) = reply_stream.next().await {
@@ -900,9 +901,7 @@ impl Agent {
         session: Option<SessionConfig>,
         cancel_token: Option<CancellationToken>,
     ) -> Result<BoxStream<'_, Result<AgentEvent>>> {
-        let context = self
-            .prepare_reply_context(messages, &session)
-            .await?;
+        let context = self.prepare_reply_context(messages, &session).await?;
         let ReplyContext {
             mut messages,
             mut tools,
diff --git a/crates/goose/src/context_mgmt/summarize.rs b/crates/goose/src/context_mgmt/summarize.rs
index d7830bc553c8..5a735ddddb4f 100644
--- a/crates/goose/src/context_mgmt/summarize.rs
+++ b/crates/goose/src/context_mgmt/summarize.rs
@@ -300,8 +300,7 @@ mod tests {
     }
 
     fn create_mock_provider() -> Result<Arc<dyn Provider>> {
-        let mock_model_config =
-            ModelConfig::new("test-model")?.with_context_limit(200_000.into());
+        let mock_model_config = ModelConfig::new("test-model")?.with_context_limit(200_000.into());
 
         Ok(Arc::new(MockProvider {
             model_config: mock_model_config,
diff --git a/crates/goose/src/model.rs b/crates/goose/src/model.rs
index 7e6a22c8f17a..10850089551d 100644
--- a/crates/goose/src/model.rs
+++ b/crates/goose/src/model.rs
@@ -352,7 +352,7 @@ mod tests {
                 });
             });
         });
-        
+
         // Test without environment variables (should use model-specific limits)
         with_var("GOOSE_CONTEXT_LIMIT", None::<&str>, || {
             with_var("GOOSE_TEMPERATURE", None::<&str>, || {

From 2ed4fbfe8bd90d3a3f58653681b44b5bb785c1c2 Mon Sep 17 00:00:00 2001
From: David Katz <dkatz@squareup.com>
Date: Wed, 30 Jul 2025 17:22:22 -0400
Subject: [PATCH 37/41] rename

---
 crates/goose/src/agents/agent.rs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/crates/goose/src/agents/agent.rs b/crates/goose/src/agents/agent.rs
index 5f6859bca94a..5d53c2e83dbe 100644
--- a/crates/goose/src/agents/agent.rs
+++ b/crates/goose/src/agents/agent.rs
@@ -883,7 +883,7 @@ impl Agent {
                 yield AgentEvent::HistoryReplaced(messages.clone());
 
                 // Continue with normal reply processing using compacted messages
-                let mut reply_stream = self.reply_main(&messages, session, cancel_token).await?;
+                let mut reply_stream = self.reply_internal(&messages, session, cancel_token).await?;
                 while let Some(event) = reply_stream.next().await {
                     yield event?;
                 }
@@ -891,11 +891,11 @@ impl Agent {
         }
 
         // No compaction needed, proceed with normal processing
-        self.reply_main(&messages, session, cancel_token).await
+        self.reply_internal(&messages, session, cancel_token).await
     }
 
     /// Main reply method that handles the actual agent processing
-    async fn reply_main(
+    async fn reply_internal(
         &self,
         messages: &[Message],
         session: Option<SessionConfig>,

From da96097460b0401af4f92324948692e598d22ac7 Mon Sep 17 00:00:00 2001
From: David Katz <dkatz@squareup.com>
Date: Wed, 30 Jul 2025 18:04:11 -0400
Subject: [PATCH 38/41] prompt swap

---
 crates/goose/src/context_mgmt/summarize.rs    |  2 +-
 crates/goose/src/model.rs                     | 12 ---
 crates/goose/src/prompts/summarize_oneshot.md | 93 ++++++-------------
 .../src/prompts/summarize_oneshot_long.md     | 67 +++++++++++++
 .../src/prompts/summarize_oneshot_short.md    | 26 ------
 5 files changed, 94 insertions(+), 106 deletions(-)
 create mode 100644 crates/goose/src/prompts/summarize_oneshot_long.md
 delete mode 100644 crates/goose/src/prompts/summarize_oneshot_short.md

diff --git a/crates/goose/src/context_mgmt/summarize.rs b/crates/goose/src/context_mgmt/summarize.rs
index 5a735ddddb4f..c7a92fa2954a 100644
--- a/crates/goose/src/context_mgmt/summarize.rs
+++ b/crates/goose/src/context_mgmt/summarize.rs
@@ -75,7 +75,7 @@ pub async fn summarize_messages_oneshot(
     };
 
     // Render the one-shot summarization prompt
-    let system_prompt = render_global_file("summarize_oneshot_short.md", &context)?;
+    let system_prompt = render_global_file("summarize_oneshot.md", &context)?;
 
     // Create a simple user message requesting summarization
     let user_message = Message::user()
diff --git a/crates/goose/src/model.rs b/crates/goose/src/model.rs
index 10850089551d..1bf2cf6de5bf 100644
--- a/crates/goose/src/model.rs
+++ b/crates/goose/src/model.rs
@@ -352,17 +352,5 @@ mod tests {
                 });
             });
         });
-
-        // Test without environment variables (should use model-specific limits)
-        with_var("GOOSE_CONTEXT_LIMIT", None::<&str>, || {
-            with_var("GOOSE_TEMPERATURE", None::<&str>, || {
-                with_var("GOOSE_TOOLSHIM", None::<&str>, || {
-                    with_var("GOOSE_TOOLSHIM_OLLAMA_MODEL", None::<&str>, || {
-                        let config = ModelConfig::new("claude-3-opus").unwrap();
-                        assert_eq!(config.context_limit(), 200_000);
-                    });
-                });
-            });
-        });
     }
 }
diff --git a/crates/goose/src/prompts/summarize_oneshot.md b/crates/goose/src/prompts/summarize_oneshot.md
index aeedfa38a2b1..e854e9b1f3bb 100644
--- a/crates/goose/src/prompts/summarize_oneshot.md
+++ b/crates/goose/src/prompts/summarize_oneshot.md
@@ -1,67 +1,26 @@
-## 📝 Summary Generation Instructions
-
-Your task is to generate a comprehensive summary of the conversation so far, with close attention to the user's explicit requests and your own prior actions.  
-This summary must fully capture all technical details, code structures, and architectural decisions critical to resuming development work without losing context.
-
-Before presenting the final summary, enclose your reasoning in `<analysis>` tags to organize your thought process and confirm that you've addressed all required components.  
-During your analysis, follow this approach:
-
-### 🔍 Analysis Process
-
-- Review the conversation **chronologically**, section by section.  
-- For each part, clearly identify:
-  - ✅ The user’s **explicit requests** and stated **intentions**
-  - 🛠️ Your **approach** and method for addressing those requests
-  - 🧠 Major **technical decisions**, **concepts**, and **design choices**
-  - 🧩 Specific technical elements such as:
-    - `file names`
-    - `complete code snippets`
-    - `function signatures`
-    - `code modifications`
-    - `errors encountered` and how they were resolved
-
-- 🔁 Pay special attention to **direct user feedback** — especially any revisions or corrections.
-- 📋 Double-check for **technical completeness and accuracy**, ensuring that **every required element** has been thoroughly addressed.
-
-## 📄 Required Sections in Your Summary
-
-### 1. **Primary Request and Intent**  
-Capture all of the user’s **core goals** and **specific requests** throughout the conversation.
-
-### 2. **Key Technical Concepts**  
-List all major **technical concepts**, **technologies**, **tools**, or **frameworks** discussed.
-
-### 3. **Files and Code Sections**  
-Detail the specific **files and code regions** that were viewed, changed, or created.  
-Include **full code snippets** where relevant, and explain **why** each change or file mattered.
-
-### 4. **Errors and Fixes**  
-List all **errors**, bugs, or unexpected behavior you encountered — and how you resolved them.  
-Call out any **user feedback** that led you to change your solution or debugging approach.
-
-### 5. **Problem Solving**  
-Summarize all **problems solved**, including any **ongoing troubleshooting** efforts.
-
-### 6. **All User Messages**  
-Include **all user messages** (excluding tool output).  
-These are essential for tracking **user feedback** and **shifts in intent**.
-
-### 7. **Pending Tasks**  
-List any **outstanding tasks** that the user explicitly asked you to work on.
-
-### 8. **Current Work**  
-Describe **precisely** what was being worked on **immediately before** the summary request.  
-Include:
-- File names
-- Code snippets
-- Specifics from the latest conversation  
-Make sure this ties directly to the user’s **latest instructions**.
-
-### 9. **Optional Next Step**  
-Only include this if:
-- It is a **direct continuation** of your last task
-- It clearly aligns with the user’s **explicit request**
-
-> **⚠️ Do not introduce new directions** unless confirmed with the user.
-
-If appropriate, include **verbatim quotes** from the recent conversation to show **where you left off**.
+## Summary Task
+Generate detailed summary of conversation to date.  
+Include user requests, your responses, and all technical content.  
+
+Wrap reasoning in `<analysis>` tags:  
+- Review conversation chronologically  
+- For each part, log:  
+  - User goals and requests  
+  - Your method and solution  
+  - Key decisions and designs  
+  - File names, code, signatures, errors, fixes  
+- Highlight user feedback and revisions  
+- Confirm completeness and accuracy  
+
+### Summary Must Include:  
+1. **User Intent** – All goals and requests  
+2. **Technical Concepts** – All discussed tools, methods  
+3. **Files + Code** – Viewed/edited files, full code, change justifications  
+4. **Errors + Fixes** – Bugs, resolutions, user-driven changes  
+5. **Problem Solving** – Issues solved or in progress  
+6. **User Messages** – All user messages, exclude tool output  
+7. **Pending Tasks** – All unresolved user requests  
+8. **Current Work** – Active work at summary request time: filenames, code, alignment to latest instruction  
+9. **Next Step** – *Include only if* directly continues user instruction  
+
+> No new ideas unless user confirmed
diff --git a/crates/goose/src/prompts/summarize_oneshot_long.md b/crates/goose/src/prompts/summarize_oneshot_long.md
new file mode 100644
index 000000000000..aeedfa38a2b1
--- /dev/null
+++ b/crates/goose/src/prompts/summarize_oneshot_long.md
@@ -0,0 +1,67 @@
+## 📝 Summary Generation Instructions
+
+Your task is to generate a comprehensive summary of the conversation so far, with close attention to the user's explicit requests and your own prior actions.  
+This summary must fully capture all technical details, code structures, and architectural decisions critical to resuming development work without losing context.
+
+Before presenting the final summary, enclose your reasoning in `<analysis>` tags to organize your thought process and confirm that you've addressed all required components.  
+During your analysis, follow this approach:
+
+### 🔍 Analysis Process
+
+- Review the conversation **chronologically**, section by section.  
+- For each part, clearly identify:
+  - ✅ The user’s **explicit requests** and stated **intentions**
+  - 🛠️ Your **approach** and method for addressing those requests
+  - 🧠 Major **technical decisions**, **concepts**, and **design choices**
+  - 🧩 Specific technical elements such as:
+    - `file names`
+    - `complete code snippets`
+    - `function signatures`
+    - `code modifications`
+    - `errors encountered` and how they were resolved
+
+- 🔁 Pay special attention to **direct user feedback** — especially any revisions or corrections.
+- 📋 Double-check for **technical completeness and accuracy**, ensuring that **every required element** has been thoroughly addressed.
+
+## 📄 Required Sections in Your Summary
+
+### 1. **Primary Request and Intent**  
+Capture all of the user’s **core goals** and **specific requests** throughout the conversation.
+
+### 2. **Key Technical Concepts**  
+List all major **technical concepts**, **technologies**, **tools**, or **frameworks** discussed.
+
+### 3. **Files and Code Sections**  
+Detail the specific **files and code regions** that were viewed, changed, or created.  
+Include **full code snippets** where relevant, and explain **why** each change or file mattered.
+
+### 4. **Errors and Fixes**  
+List all **errors**, bugs, or unexpected behavior you encountered — and how you resolved them.  
+Call out any **user feedback** that led you to change your solution or debugging approach.
+
+### 5. **Problem Solving**  
+Summarize all **problems solved**, including any **ongoing troubleshooting** efforts.
+
+### 6. **All User Messages**  
+Include **all user messages** (excluding tool output).  
+These are essential for tracking **user feedback** and **shifts in intent**.
+
+### 7. **Pending Tasks**  
+List any **outstanding tasks** that the user explicitly asked you to work on.
+
+### 8. **Current Work**  
+Describe **precisely** what was being worked on **immediately before** the summary request.  
+Include:
+- File names
+- Code snippets
+- Specifics from the latest conversation  
+Make sure this ties directly to the user’s **latest instructions**.
+
+### 9. **Optional Next Step**  
+Only include this if:
+- It is a **direct continuation** of your last task
+- It clearly aligns with the user’s **explicit request**
+
+> **⚠️ Do not introduce new directions** unless confirmed with the user.
+
+If appropriate, include **verbatim quotes** from the recent conversation to show **where you left off**.
diff --git a/crates/goose/src/prompts/summarize_oneshot_short.md b/crates/goose/src/prompts/summarize_oneshot_short.md
deleted file mode 100644
index 38d1848acebe..000000000000
--- a/crates/goose/src/prompts/summarize_oneshot_short.md
+++ /dev/null
@@ -1,26 +0,0 @@
-## Summary Task  
-Generate detailed summary of conversation to date.  
-Include user requests, your responses, and all technical content.  
-
-Wrap reasoning in `<analysis>` tags:  
-- Review conversation chronologically  
-- For each part, log:  
-  - User goals and requests  
-  - Your method and solution  
-  - Key decisions and designs  
-  - File names, code, signatures, errors, fixes  
-- Highlight user feedback and revisions  
-- Confirm completeness and accuracy  
-
-### Summary Must Include:  
-1. **User Intent** – All goals and requests  
-2. **Technical Concepts** – All discussed tools, methods  
-3. **Files + Code** – Viewed/edited files, full code, change justifications  
-4. **Errors + Fixes** – Bugs, resolutions, user-driven changes  
-5. **Problem Solving** – Issues solved or in progress  
-6. **User Messages** – All user messages, exclude tool output  
-7. **Pending Tasks** – All unresolved user requests  
-8. **Current Work** – Active work at summary request time: filenames, code, alignment to latest instruction  
-9. **Next Step** – *Include only if* directly continues user instruction  
-
-> ⚠️ No new ideas unless user confirmed

From a18db4f609c0d9c6fa8a5a38fd2847dff552952c Mon Sep 17 00:00:00 2001
From: David Katz <dkatz@squareup.com>
Date: Wed, 30 Jul 2025 18:31:11 -0400
Subject: [PATCH 39/41] rm old summary prompt

---
 crates/goose/src/agents/agent.rs              |  4 +-
 crates/goose/src/prompts/summarize_oneshot.md |  2 +-
 .../src/prompts/summarize_oneshot_long.md     | 67 -------------------
 token_debug/Cargo.toml                        | 10 +++
 token_debug/src/main.rs                       | 51 ++++++++++++++
 5 files changed, 64 insertions(+), 70 deletions(-)
 delete mode 100644 crates/goose/src/prompts/summarize_oneshot_long.md
 create mode 100644 token_debug/Cargo.toml
 create mode 100644 token_debug/src/main.rs

diff --git a/crates/goose/src/agents/agent.rs b/crates/goose/src/agents/agent.rs
index 5d53c2e83dbe..30aa353d8c12 100644
--- a/crates/goose/src/agents/agent.rs
+++ b/crates/goose/src/agents/agent.rs
@@ -842,13 +842,13 @@ impl Agent {
                 (compact_result.tokens_before, compact_result.tokens_after)
             {
                 format!(
-                    "Auto-compacted context: {} → {} tokens ({:.0}% reduction)",
+                    "Auto-compacted context: {} → {} tokens ({:.0}% reduction)\n\n",
                     before,
                     after,
                     (1.0 - (after as f64 / before as f64)) * 100.0
                 )
             } else {
-                "Auto-compacted context to reduce token usage".to_string()
+                "Auto-compacted context to reduce token usage\n\n".to_string()
             };
 
             return Ok(Some((compacted_messages, compaction_msg)));
diff --git a/crates/goose/src/prompts/summarize_oneshot.md b/crates/goose/src/prompts/summarize_oneshot.md
index e854e9b1f3bb..8e621f2058aa 100644
--- a/crates/goose/src/prompts/summarize_oneshot.md
+++ b/crates/goose/src/prompts/summarize_oneshot.md
@@ -12,7 +12,7 @@ Wrap reasoning in `<analysis>` tags:
 - Highlight user feedback and revisions  
 - Confirm completeness and accuracy  
 
-### Summary Must Include:  
+### Summary Must Include the Following Sections:  
 1. **User Intent** – All goals and requests  
 2. **Technical Concepts** – All discussed tools, methods  
 3. **Files + Code** – Viewed/edited files, full code, change justifications  
diff --git a/crates/goose/src/prompts/summarize_oneshot_long.md b/crates/goose/src/prompts/summarize_oneshot_long.md
deleted file mode 100644
index aeedfa38a2b1..000000000000
--- a/crates/goose/src/prompts/summarize_oneshot_long.md
+++ /dev/null
@@ -1,67 +0,0 @@
-## 📝 Summary Generation Instructions
-
-Your task is to generate a comprehensive summary of the conversation so far, with close attention to the user's explicit requests and your own prior actions.  
-This summary must fully capture all technical details, code structures, and architectural decisions critical to resuming development work without losing context.
-
-Before presenting the final summary, enclose your reasoning in `<analysis>` tags to organize your thought process and confirm that you've addressed all required components.  
-During your analysis, follow this approach:
-
-### 🔍 Analysis Process
-
-- Review the conversation **chronologically**, section by section.  
-- For each part, clearly identify:
-  - ✅ The user’s **explicit requests** and stated **intentions**
-  - 🛠️ Your **approach** and method for addressing those requests
-  - 🧠 Major **technical decisions**, **concepts**, and **design choices**
-  - 🧩 Specific technical elements such as:
-    - `file names`
-    - `complete code snippets`
-    - `function signatures`
-    - `code modifications`
-    - `errors encountered` and how they were resolved
-
-- 🔁 Pay special attention to **direct user feedback** — especially any revisions or corrections.
-- 📋 Double-check for **technical completeness and accuracy**, ensuring that **every required element** has been thoroughly addressed.
-
-## 📄 Required Sections in Your Summary
-
-### 1. **Primary Request and Intent**  
-Capture all of the user’s **core goals** and **specific requests** throughout the conversation.
-
-### 2. **Key Technical Concepts**  
-List all major **technical concepts**, **technologies**, **tools**, or **frameworks** discussed.
-
-### 3. **Files and Code Sections**  
-Detail the specific **files and code regions** that were viewed, changed, or created.  
-Include **full code snippets** where relevant, and explain **why** each change or file mattered.
-
-### 4. **Errors and Fixes**  
-List all **errors**, bugs, or unexpected behavior you encountered — and how you resolved them.  
-Call out any **user feedback** that led you to change your solution or debugging approach.
-
-### 5. **Problem Solving**  
-Summarize all **problems solved**, including any **ongoing troubleshooting** efforts.
-
-### 6. **All User Messages**  
-Include **all user messages** (excluding tool output).  
-These are essential for tracking **user feedback** and **shifts in intent**.
-
-### 7. **Pending Tasks**  
-List any **outstanding tasks** that the user explicitly asked you to work on.
-
-### 8. **Current Work**  
-Describe **precisely** what was being worked on **immediately before** the summary request.  
-Include:
-- File names
-- Code snippets
-- Specifics from the latest conversation  
-Make sure this ties directly to the user’s **latest instructions**.
-
-### 9. **Optional Next Step**  
-Only include this if:
-- It is a **direct continuation** of your last task
-- It clearly aligns with the user’s **explicit request**
-
-> **⚠️ Do not introduce new directions** unless confirmed with the user.
-
-If appropriate, include **verbatim quotes** from the recent conversation to show **where you left off**.
diff --git a/token_debug/Cargo.toml b/token_debug/Cargo.toml
new file mode 100644
index 000000000000..0ac0d0487a2b
--- /dev/null
+++ b/token_debug/Cargo.toml
@@ -0,0 +1,10 @@
+[package]
+name = "token_debug"
+version = "0.1.0"
+edition = "2021"
+
+[workspace]
+
+[dependencies]
+goose = { path = "../crates/goose" }
+tokio = { version = "1.0", features = ["full"] }
diff --git a/token_debug/src/main.rs b/token_debug/src/main.rs
new file mode 100644
index 000000000000..992aaee03274
--- /dev/null
+++ b/token_debug/src/main.rs
@@ -0,0 +1,51 @@
+#[tokio::main]
+async fn main() {
+    use goose::token_counter::{TokenCounter, create_async_token_counter};
+    use goose::message::{Message, MessageContent};
+    // Test basic token counting
+    let sync_counter = TokenCounter::new();
+    let async_counter = create_async_token_counter().await.unwrap();
+    
+    let test_text = "Hello, how are you?";
+    println!("Text: '{}'", test_text);
+    println!("Sync tokens: {}", sync_counter.count_tokens(test_text));
+    println!("Async tokens: {}", async_counter.count_tokens(test_text));
+    
+    // Test with a longer text
+    let long_text = "This is a much longer piece of text that should have significantly more tokens than the short greeting. We want to see how the tokenizer handles different lengths of text and whether there are any discrepancies between the sync and async versions.";
+    println!("\nLong text: '{}'", long_text);
+    println!("Sync tokens: {}", sync_counter.count_tokens(long_text));
+    println!("Async tokens: {}", async_counter.count_tokens(long_text));
+    
+    // Test with message counting
+    let messages = vec![
+        Message::new(
+            Role::User,
+            0,
+            vec![MessageContent::text("What's the weather like?")],
+        ),
+        Message::new(
+            Role::Assistant,
+            1,
+            vec![MessageContent::text("I don't have access to current weather data.")],
+        ),
+    ];
+    
+    println!("\nMessage token counting:");
+    println!("Sync chat tokens: {}", sync_counter.count_chat_tokens("", &messages, &[]));
+    println!("Async chat tokens: {}", async_counter.count_chat_tokens("", &messages, &[]));
+    
+    // Test with system prompt
+    let system_prompt = "You are a helpful assistant.";
+    println!("\nWith system prompt:");
+    println!("Sync chat tokens: {}", sync_counter.count_chat_tokens(system_prompt, &messages, &[]));
+    println!("Async chat tokens: {}", async_counter.count_chat_tokens(system_prompt, &messages, &[]));
+    
+    // Test individual message token counts
+    println!("\nIndividual message tokens:");
+    for (i, msg) in messages.iter().enumerate() {
+        let sync_tokens = sync_counter.count_chat_tokens("", std::slice::from_ref(msg), &[]);
+        let async_tokens = async_counter.count_chat_tokens("", std::slice::from_ref(msg), &[]);
+        println!("Message {}: sync={}, async={}", i, sync_tokens, async_tokens);
+    }
+}

From fb7f2c31ba671ef2504928dc4e98e65b6f23bfba Mon Sep 17 00:00:00 2001
From: David Katz <dkatz@squareup.com>
Date: Thu, 31 Jul 2025 12:58:29 -0400
Subject: [PATCH 40/41] rm stray file

---
 token_debug/Cargo.toml  | 10 --------
 token_debug/src/main.rs | 51 -----------------------------------------
 2 files changed, 61 deletions(-)
 delete mode 100644 token_debug/Cargo.toml
 delete mode 100644 token_debug/src/main.rs

diff --git a/token_debug/Cargo.toml b/token_debug/Cargo.toml
deleted file mode 100644
index 0ac0d0487a2b..000000000000
--- a/token_debug/Cargo.toml
+++ /dev/null
@@ -1,10 +0,0 @@
-[package]
-name = "token_debug"
-version = "0.1.0"
-edition = "2021"
-
-[workspace]
-
-[dependencies]
-goose = { path = "../crates/goose" }
-tokio = { version = "1.0", features = ["full"] }
diff --git a/token_debug/src/main.rs b/token_debug/src/main.rs
deleted file mode 100644
index 992aaee03274..000000000000
--- a/token_debug/src/main.rs
+++ /dev/null
@@ -1,51 +0,0 @@
-#[tokio::main]
-async fn main() {
-    use goose::token_counter::{TokenCounter, create_async_token_counter};
-    use goose::message::{Message, MessageContent};
-    // Test basic token counting
-    let sync_counter = TokenCounter::new();
-    let async_counter = create_async_token_counter().await.unwrap();
-    
-    let test_text = "Hello, how are you?";
-    println!("Text: '{}'", test_text);
-    println!("Sync tokens: {}", sync_counter.count_tokens(test_text));
-    println!("Async tokens: {}", async_counter.count_tokens(test_text));
-    
-    // Test with a longer text
-    let long_text = "This is a much longer piece of text that should have significantly more tokens than the short greeting. We want to see how the tokenizer handles different lengths of text and whether there are any discrepancies between the sync and async versions.";
-    println!("\nLong text: '{}'", long_text);
-    println!("Sync tokens: {}", sync_counter.count_tokens(long_text));
-    println!("Async tokens: {}", async_counter.count_tokens(long_text));
-    
-    // Test with message counting
-    let messages = vec![
-        Message::new(
-            Role::User,
-            0,
-            vec![MessageContent::text("What's the weather like?")],
-        ),
-        Message::new(
-            Role::Assistant,
-            1,
-            vec![MessageContent::text("I don't have access to current weather data.")],
-        ),
-    ];
-    
-    println!("\nMessage token counting:");
-    println!("Sync chat tokens: {}", sync_counter.count_chat_tokens("", &messages, &[]));
-    println!("Async chat tokens: {}", async_counter.count_chat_tokens("", &messages, &[]));
-    
-    // Test with system prompt
-    let system_prompt = "You are a helpful assistant.";
-    println!("\nWith system prompt:");
-    println!("Sync chat tokens: {}", sync_counter.count_chat_tokens(system_prompt, &messages, &[]));
-    println!("Async chat tokens: {}", async_counter.count_chat_tokens(system_prompt, &messages, &[]));
-    
-    // Test individual message token counts
-    println!("\nIndividual message tokens:");
-    for (i, msg) in messages.iter().enumerate() {
-        let sync_tokens = sync_counter.count_chat_tokens("", std::slice::from_ref(msg), &[]);
-        let async_tokens = async_counter.count_chat_tokens("", std::slice::from_ref(msg), &[]);
-        println!("Message {}: sync={}, async={}", i, sync_tokens, async_tokens);
-    }
-}

From f1140241fe08951c56bd53bd08129997207394d8 Mon Sep 17 00:00:00 2001
From: David Katz <dkatz@squareup.com>
Date: Thu, 31 Jul 2025 13:03:32 -0400
Subject: [PATCH 41/41] more stray test changes

---
 crates/goose/src/model.rs                 | 35 ++++++++--------------
 crates/goose/src/providers/claude_code.rs | 36 +++++++----------------
 2 files changed, 23 insertions(+), 48 deletions(-)

diff --git a/crates/goose/src/model.rs b/crates/goose/src/model.rs
index 1bf2cf6de5bf..6799c01ad9c0 100644
--- a/crates/goose/src/model.rs
+++ b/crates/goose/src/model.rs
@@ -247,28 +247,19 @@ mod tests {
 
     #[test]
     fn test_model_config_context_limits() {
-        // Clear environment variables to ensure clean test
-        with_var("GOOSE_CONTEXT_LIMIT", None::<&str>, || {
-            with_var("GOOSE_TEMPERATURE", None::<&str>, || {
-                with_var("GOOSE_TOOLSHIM", None::<&str>, || {
-                    with_var("GOOSE_TOOLSHIM_OLLAMA_MODEL", None::<&str>, || {
-                        let config = ModelConfig::new("claude-3-opus")
-                            .unwrap()
-                            .with_context_limit(Some(150_000));
-                        assert_eq!(config.context_limit(), 150_000);
-
-                        let config = ModelConfig::new("claude-3-opus").unwrap();
-                        assert_eq!(config.context_limit(), 200_000);
-
-                        let config = ModelConfig::new("gpt-4-turbo").unwrap();
-                        assert_eq!(config.context_limit(), 128_000);
-
-                        let config = ModelConfig::new("unknown-model").unwrap();
-                        assert_eq!(config.context_limit(), DEFAULT_CONTEXT_LIMIT);
-                    });
-                });
-            });
-        });
+        let config = ModelConfig::new("claude-3-opus")
+            .unwrap()
+            .with_context_limit(Some(150_000));
+        assert_eq!(config.context_limit(), 150_000);
+
+        let config = ModelConfig::new("claude-3-opus").unwrap();
+        assert_eq!(config.context_limit(), 200_000);
+
+        let config = ModelConfig::new("gpt-4-turbo").unwrap();
+        assert_eq!(config.context_limit(), 128_000);
+
+        let config = ModelConfig::new("unknown-model").unwrap();
+        assert_eq!(config.context_limit(), DEFAULT_CONTEXT_LIMIT);
     }
 
     #[test]
diff --git a/crates/goose/src/providers/claude_code.rs b/crates/goose/src/providers/claude_code.rs
index 298731dbb16a..93a419beeeac 100644
--- a/crates/goose/src/providers/claude_code.rs
+++ b/crates/goose/src/providers/claude_code.rs
@@ -545,36 +545,20 @@ mod tests {
     #[test]
     fn test_claude_code_invalid_model_no_fallback() {
         // Test that an invalid model is kept as-is (no fallback)
-        with_var("GOOSE_CONTEXT_LIMIT", None::<&str>, || {
-            with_var("GOOSE_TEMPERATURE", None::<&str>, || {
-                with_var("GOOSE_TOOLSHIM", None::<&str>, || {
-                    with_var("GOOSE_TOOLSHIM_OLLAMA_MODEL", None::<&str>, || {
-                        let invalid_model = ModelConfig::new("invalid-model").unwrap();
-                        let provider = ClaudeCodeProvider::from_env(invalid_model).unwrap();
-                        let config = provider.get_model_config();
-
-                        assert_eq!(config.model_name, "invalid-model");
-                    });
-                });
-            });
-        });
+        let invalid_model = ModelConfig::new_or_fail("invalid-model");
+        let provider = ClaudeCodeProvider::from_env(invalid_model).unwrap();
+        let config = provider.get_model_config();
+
+        assert_eq!(config.model_name, "invalid-model");
     }
 
     #[test]
     fn test_claude_code_valid_model() {
         // Test that a valid model is preserved
-        with_var("GOOSE_CONTEXT_LIMIT", None::<&str>, || {
-            with_var("GOOSE_TEMPERATURE", None::<&str>, || {
-                with_var("GOOSE_TOOLSHIM", None::<&str>, || {
-                    with_var("GOOSE_TOOLSHIM_OLLAMA_MODEL", None::<&str>, || {
-                        let valid_model = ModelConfig::new("sonnet").unwrap();
-                        let provider = ClaudeCodeProvider::from_env(valid_model).unwrap();
-                        let config = provider.get_model_config();
-
-                        assert_eq!(config.model_name, "sonnet");
-                    });
-                });
-            });
-        });
+        let valid_model = ModelConfig::new_or_fail("sonnet");
+        let provider = ClaudeCodeProvider::from_env(valid_model).unwrap();
+        let config = provider.get_model_config();
+
+        assert_eq!(config.model_name, "sonnet");
     }
 }