diff --git a/Cargo.lock b/Cargo.lock index 03eb317111b4..53747e885b9f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2741,6 +2741,7 @@ dependencies = [ "serde", "serde_json", "serde_yaml", + "serial_test", "shlex", "tar", "temp-env", diff --git a/crates/goose-cli/Cargo.toml b/crates/goose-cli/Cargo.toml index fb16ced65e80..f68622b4b8c4 100644 --- a/crates/goose-cli/Cargo.toml +++ b/crates/goose-cli/Cargo.toml @@ -70,3 +70,4 @@ tempfile = "3" temp-env = { version = "0.3.6", features = ["async_closure"] } test-case = "3.3" tokio = { version = "1.43", features = ["rt", "macros"] } +serial_test = "3.2.0" diff --git a/crates/goose-cli/src/scenario_tests/scenarios.rs b/crates/goose-cli/src/scenario_tests/scenarios.rs index 6b6c0ef99922..16b1ff8126fc 100644 --- a/crates/goose-cli/src/scenario_tests/scenarios.rs +++ b/crates/goose-cli/src/scenario_tests/scenarios.rs @@ -8,10 +8,13 @@ mod tests { use crate::scenario_tests::scenario_runner::run_scenario; use anyhow::Result; use goose::conversation::message::Message; + use serial_test::serial; #[tokio::test] + #[serial] async fn test_what_is_your_name() -> Result<()> { - run_scenario( + std::env::set_var("GOOSE_MOIM_ENABLED", "false"); + let result = run_scenario( "what_is_your_name", text("what is your name"), None, @@ -25,13 +28,17 @@ mod tests { Ok(()) }, ) - .await + .await; + std::env::remove_var("GOOSE_MOIM_ENABLED"); + result } #[tokio::test] + #[serial] async fn test_weather_tool() -> Result<()> { + std::env::set_var("GOOSE_MOIM_ENABLED", "false"); // Google tells me it only knows about the weather in the US, so we skip it. - run_scenario( + let result = run_scenario( "weather_tool", text("tell me what the weather is in Berlin, Germany"), Some(&["Google"]), @@ -55,14 +62,18 @@ mod tests { Ok(()) }, ) - .await + .await; + std::env::remove_var("GOOSE_MOIM_ENABLED"); + result } #[tokio::test] + #[serial] async fn test_image_analysis() -> Result<()> { + std::env::set_var("GOOSE_MOIM_ENABLED", "false"); // Google says it doesn't know about images, the other providers complain about // the image format, so we only run this for OpenAI and Anthropic. - run_scenario( + let result = run_scenario( "image_analysis", image("What do you see in this image?", "test_image"), Some(&["Google", "azure_openai", "groq"]), @@ -73,12 +84,16 @@ mod tests { Ok(()) }, ) - .await + .await; + std::env::remove_var("GOOSE_MOIM_ENABLED"); + result } #[tokio::test] + #[serial] async fn test_context_length_exceeded_error() -> Result<()> { - run_scenario( + std::env::set_var("GOOSE_MOIM_ENABLED", "false"); + let result = run_scenario( "context_length_exceeded", Box::new(|provider| { let model_config = provider.get_model_config(); @@ -100,6 +115,8 @@ mod tests { Ok(()) }, ) - .await + .await; + std::env::remove_var("GOOSE_MOIM_ENABLED"); + result } } diff --git a/crates/goose/src/agents/agent.rs b/crates/goose/src/agents/agent.rs index 2ad6e54fa3d1..145b3d4e58dc 100644 --- a/crates/goose/src/agents/agent.rs +++ b/crates/goose/src/agents/agent.rs @@ -60,9 +60,7 @@ use super::model_selector::autopilot::AutoPilot; use super::platform_tools; use super::tool_execution::{ToolCallResult, CHAT_MODE_TOOL_SKIPPED_RESPONSE, DECLINED_RESPONSE}; use crate::agents::subagent_task_config::TaskConfig; -use crate::agents::todo_tools::{ - todo_read_tool, todo_write_tool, TODO_READ_TOOL_NAME, TODO_WRITE_TOOL_NAME, -}; +use crate::agents::todo_tools::{todo_write_tool, TODO_WRITE_TOOL_NAME}; use crate::conversation::message::{Message, ToolRequest}; use crate::session::extension_data::ExtensionState; use crate::session::{extension_data, SessionManager}; @@ -504,22 +502,6 @@ impl Agent { "Frontend tool execution required".to_string(), None, ))) - } else if tool_call.name == TODO_READ_TOOL_NAME { - // Handle task planner read tool - let todo_content = if let Some(session_config) = session { - SessionManager::get_session(&session_config.id, false) - .await - .ok() - .and_then(|metadata| { - extension_data::TodoState::from_extension_data(&metadata.extension_data) - .map(|state| state.content) - }) - .unwrap_or_default() - } else { - String::new() - }; - - ToolCallResult::from(Ok(vec![Content::text(todo_content)])) } else if tool_call.name == TODO_WRITE_TOOL_NAME { // Handle task planner write tool let content = tool_call @@ -816,8 +798,8 @@ impl Agent { platform_tools::manage_schedule_tool(), ]); - // Add task planner tools - prefixed_tools.extend([todo_read_tool(), todo_write_tool()]); + // Add task planner tool (write only, read happens via MOIM) + prefixed_tools.push(todo_write_tool()); // Dynamic task tool prefixed_tools.push(create_dynamic_task_tool()); @@ -1105,6 +1087,7 @@ impl Agent { conversation.messages(), &tools, &toolshim_tools, + &session, ).await?; let mut no_tools_called = true; @@ -1741,13 +1724,11 @@ mod tests { async fn test_todo_tools_integration() -> Result<()> { let agent = Agent::new(); - // Test that task planner tools are listed + // Test that task planner tool is listed (write only, read happens via MOIM) let tools = agent.list_tools(None).await; - let todo_read = tools.iter().find(|tool| tool.name == TODO_READ_TOOL_NAME); let todo_write = tools.iter().find(|tool| tool.name == TODO_WRITE_TOOL_NAME); - assert!(todo_read.is_some(), "TODO read tool should be present"); assert!(todo_write.is_some(), "TODO write tool should be present"); Ok(()) } diff --git a/crates/goose/src/agents/mod.rs b/crates/goose/src/agents/mod.rs index 7782d22d98a1..770ee30bd6a3 100644 --- a/crates/goose/src/agents/mod.rs +++ b/crates/goose/src/agents/mod.rs @@ -6,6 +6,7 @@ pub mod extension_manager; pub mod final_output_tool; mod large_response_handler; pub mod model_selector; +pub mod moim; pub mod platform_tools; pub mod prompt_manager; pub mod recipe_tools; diff --git a/crates/goose/src/agents/moim.rs b/crates/goose/src/agents/moim.rs new file mode 100644 index 000000000000..c4a09c768375 --- /dev/null +++ b/crates/goose/src/agents/moim.rs @@ -0,0 +1,94 @@ +use crate::agents::types::SessionConfig; +use crate::conversation::message::Message; +use crate::conversation::Conversation; +use crate::session::extension_data::{ExtensionState, TodoState}; +use crate::session::SessionManager; +use chrono::Local; + +async fn build_moim_content(session: &Option) -> Option { + let timestamp = Local::now().format("%Y-%m-%d %H:%M:%S").to_string(); + let mut content = format!("Current date and time: {}\n", timestamp); + + if let Some(todo_content) = get_todo_context(session).await { + content.push_str("\nCurrent tasks and notes:\n"); + content.push_str(&todo_content); + content.push('\n'); + } + + Some(content) +} + +/// Find a safe insertion point for MOIM. +/// +/// We want to insert as close to the end as possible, but we must avoid +/// breaking tool call/response pairs. We check if inserting at a position +/// would separate a tool call from its response. +pub fn find_safe_insertion_point(messages: &[Message]) -> usize { + if messages.is_empty() { + return 0; + } + + // Default to inserting before the last message + let last_pos = messages.len() - 1; + + // Check if inserting at last_pos would break a tool pair + if last_pos > 0 { + let prev_msg = &messages[last_pos - 1]; + let curr_msg = &messages[last_pos]; + + // If previous message has tool calls and current has matching responses, + // we can't insert between them + if prev_msg.is_tool_call() && curr_msg.is_tool_response() { + // Find the next best position (before the tool call) + return last_pos.saturating_sub(1); + } + } + + last_pos +} + +pub async fn inject_moim_if_enabled( + messages_for_provider: Conversation, + session: &Option, +) -> Conversation { + // Check if MOIM is enabled + let moim_enabled = crate::config::Config::global() + .get_param::("goose_moim_enabled") + .unwrap_or(true); + + if !moim_enabled { + return messages_for_provider; + } + + if let Some(moim_content) = build_moim_content(session).await { + let mut msgs = messages_for_provider.messages().to_vec(); + let moim_msg = Message::user().with_text(moim_content); + + if msgs.is_empty() { + // If conversation is empty, just add the MOIM + msgs.push(moim_msg); + } else { + // Find a safe position that won't break tool call/response pairs + let insert_pos = find_safe_insertion_point(&msgs); + msgs.insert(insert_pos, moim_msg); + } + + Conversation::new_unvalidated(msgs) + } else { + messages_for_provider + } +} + +async fn get_todo_context(session: &Option) -> Option { + let session_config = session.as_ref()?; + + match SessionManager::get_session(&session_config.id, false).await { + Ok(session_data) => TodoState::from_extension_data(&session_data.extension_data) + .map(|state| state.content) + .filter(|content| !content.trim().is_empty()), + Err(e) => { + tracing::debug!("Could not read session for MOIM: {}", e); + None + } + } +} diff --git a/crates/goose/src/agents/prompt_manager.rs b/crates/goose/src/agents/prompt_manager.rs index 87c3bbb93809..11381b7128f9 100644 --- a/crates/goose/src/agents/prompt_manager.rs +++ b/crates/goose/src/agents/prompt_manager.rs @@ -1,4 +1,3 @@ -use chrono::Utc; use serde_json::Value; use std::collections::HashMap; @@ -10,7 +9,6 @@ use crate::{config::Config, prompt_template, utils::sanitize_unicode_tags}; pub struct PromptManager { system_prompt_override: Option, system_prompt_extras: Vec, - current_date_timestamp: String, } impl Default for PromptManager { @@ -24,8 +22,6 @@ impl PromptManager { PromptManager { system_prompt_override: None, system_prompt_extras: Vec::new(), - // Use the fixed current date time so that prompt cache can be used. - current_date_timestamp: Utc::now().format("%Y-%m-%d %H:%M:%S").to_string(), } } @@ -102,11 +98,6 @@ impl PromptManager { ); } - context.insert( - "current_date_time", - Value::String(self.current_date_timestamp.clone()), - ); - // Add the suggestion about disabling extensions if flag is true context.insert( "suggest_disable", diff --git a/crates/goose/src/agents/reply_parts.rs b/crates/goose/src/agents/reply_parts.rs index 641e65a0341c..0aa5ff1f888e 100644 --- a/crates/goose/src/agents/reply_parts.rs +++ b/crates/goose/src/agents/reply_parts.rs @@ -134,6 +134,7 @@ impl Agent { messages: &[Message], tools: &[Tool], toolshim_tools: &[Tool], + session: &Option, ) -> Result { let config = provider.get_model_config(); @@ -144,6 +145,10 @@ impl Agent { Conversation::new_unvalidated(messages.to_vec()) }; + // Inject MOIM (timestamp + TODO content) if enabled + let messages_for_provider = + super::moim::inject_moim_if_enabled(messages_for_provider, session).await; + // Clone owned data to move into the async stream let system_prompt = system_prompt.to_owned(); let tools = tools.to_owned(); diff --git a/crates/goose/src/agents/todo_tools.rs b/crates/goose/src/agents/todo_tools.rs index 3f43369bce8d..75782c9ed9dd 100644 --- a/crates/goose/src/agents/todo_tools.rs +++ b/crates/goose/src/agents/todo_tools.rs @@ -2,46 +2,9 @@ use indoc::indoc; use rmcp::model::{Tool, ToolAnnotations}; use rmcp::object; -/// Tool name constant for reading task planner content -pub const TODO_READ_TOOL_NAME: &str = "todo__read"; - /// Tool name constant for writing task planner content pub const TODO_WRITE_TOOL_NAME: &str = "todo__write"; -/// Creates a tool for reading task planner content. -/// -/// This tool reads the entire task planner file content as a string. -/// It is marked as read-only and safe to use repeatedly. -/// -/// # Returns -/// A configured `Tool` instance for reading task planner content -pub fn todo_read_tool() -> Tool { - Tool::new( - TODO_READ_TOOL_NAME.to_string(), - indoc! {r#" - Read the entire TODO file content. - - This tool reads the complete TODO file and returns its content as a string. - Use this to view current tasks, notes, and any other information stored in the TODO file. - - The tool will return an error if the TODO file doesn't exist or cannot be read. - "#} - .to_string(), - object!({ - "type": "object", - "required": [], - "properties": {} - }), - ) - .annotate(ToolAnnotations { - title: Some("Read TODO content".to_string()), - read_only_hint: Some(true), - destructive_hint: Some(false), - idempotent_hint: Some(true), - open_world_hint: Some(false), - }) -} - /// Creates a tool for writing task planner content. /// /// This tool writes or overwrites the entire task planner file with new content. @@ -53,16 +16,9 @@ pub fn todo_write_tool() -> Tool { Tool::new( TODO_WRITE_TOOL_NAME.to_string(), indoc! {r#" - Write or overwrite the entire TODO file content. - - This tool replaces the complete TODO file content with the provided string. - Use this to update tasks, add new items, or reorganize the TODO file. - - WARNING: This operation completely replaces the file content. Make sure to include - all content you want to keep, not just the changes. - - The tool will create the TODO file if it doesn't exist, or overwrite it if it does. - Returns an error if the file cannot be written due to permissions or other I/O issues. + Update your task list, notes, or any information you want to track. + This content will be automatically available in your context for future turns. + This tool overwrites the entire existing todo list. "#} .to_string(), object!({ @@ -89,32 +45,6 @@ pub fn todo_write_tool() -> Tool { mod unit_tests { use super::*; - #[test] - fn test_todo_read_tool_creation() { - let tool = todo_read_tool(); - - // Verify tool name - assert_eq!(tool.name, TODO_READ_TOOL_NAME); - - // Verify description exists and is not empty - assert!(tool.description.is_some()); - let description = tool.description.as_ref().unwrap(); - assert!(!description.is_empty()); - - // Verify input schema - let schema = &tool.input_schema; - assert_eq!(schema["type"], "object"); - assert_eq!(schema["required"].as_array().unwrap().len(), 0); - - // Verify annotations - let annotations = tool.annotations.as_ref().unwrap(); - assert_eq!(annotations.title, Some("Read TODO content".to_string())); - assert_eq!(annotations.read_only_hint, Some(true)); - assert_eq!(annotations.destructive_hint, Some(false)); - assert_eq!(annotations.idempotent_hint, Some(true)); - assert_eq!(annotations.open_world_hint, Some(false)); - } - #[test] fn test_todo_write_tool_creation() { let tool = todo_write_tool(); @@ -151,10 +81,8 @@ mod unit_tests { #[test] fn test_tool_name_constants() { - // Verify the constants follow the naming pattern - assert!(TODO_READ_TOOL_NAME.starts_with("todo__")); + // Verify the constant follows the naming pattern assert!(TODO_WRITE_TOOL_NAME.starts_with("todo__")); - assert_eq!(TODO_READ_TOOL_NAME, "todo__read"); assert_eq!(TODO_WRITE_TOOL_NAME, "todo__write"); } } diff --git a/crates/goose/src/prompts/subagent_system.md b/crates/goose/src/prompts/subagent_system.md index 7bb9b8ef1580..35ab938e1be4 100644 --- a/crates/goose/src/prompts/subagent_system.md +++ b/crates/goose/src/prompts/subagent_system.md @@ -1,4 +1,4 @@ -You are a specialized subagent within the goose AI framework, created by Block. You were spawned by the main goose agent to handle a specific task efficiently. The current date is {{current_date_time}}. +You are a specialized subagent within the goose AI framework, created by Block. You were spawned by the main goose agent to handle a specific task efficiently. # Your Role You are an autonomous subagent with these characteristics: diff --git a/crates/goose/src/prompts/system.md b/crates/goose/src/prompts/system.md index 475ee1ada1ac..e27deaca76a9 100644 --- a/crates/goose/src/prompts/system.md +++ b/crates/goose/src/prompts/system.md @@ -1,7 +1,5 @@ You are a general-purpose AI agent called goose, created by Block, the parent company of Square, CashApp, and Tidal. goose is being developed as an open-source software project. -The current date is {{current_date_time}}. - goose uses LLM providers with tool calling capability. You can be used with different language models (gpt-4o, claude-sonnet-4, o1, llama-3.2, deepseek-r1, etc). These models have varying knowledge cut-off dates depending on when they were trained, but typically it's between 5-10 months prior to the current date. @@ -40,11 +38,12 @@ No extensions are defined. You should let the user know that they should add ext # Task Management -- Use `todo__read` and `todo__write` for tasks with 2+ steps, multiple files/components, or uncertain scope -- Workflow — Start: read → write checklist | During: read → update progress | End: verify all complete -- Warning — `todo__write` overwrites entirely; always `todo__read` first (skipping is an error) +- Use `todo__write` for tasks with 2+ steps, multiple files/components, or uncertain scope +- TODO content you've written is automatically available in your context +- Workflow — Start: write checklist | During: update progress after each step | End: verify all complete +- Warning — `todo__write` overwrites entirely; include all content you want to keep - Keep items short, specific, action-oriented -- Not using the todo tools for complex tasks is an error +- Not using the todo tool for complex tasks is an error Template: ```markdown diff --git a/crates/goose/src/prompts/system_gpt_4.1.md b/crates/goose/src/prompts/system_gpt_4.1.md index 1578efaf4dc7..df7edab33027 100644 --- a/crates/goose/src/prompts/system_gpt_4.1.md +++ b/crates/goose/src/prompts/system_gpt_4.1.md @@ -1,6 +1,6 @@ You are a general-purpose AI agent called goose, created by Block, the parent company of Square, CashApp, and Tidal. goose is being developed as an open-source software project. -IMPORTANT INSTRUCTIONS: +IMPORTANT INSTRUCTIONS: Please keep going until the user's query is completely resolved, before ending your turn and yielding back to the user. Only terminate your turn when you are sure that the problem is solved. @@ -11,16 +11,12 @@ CRITICAL: The str_replace command in the text_editor tool (when available) shoul The user may direct or imply that you are to take actions, in this case, it is important to note the following guidelines: * If you are directed to complete a task, you should see it through. -* Your thinking should be thorough and so it's fine if it's very long. You can think step by step before and after each action you decide to take. +* Your thinking should be thorough and so it's fine if it's very long. You can think step by step before and after each action you decide to take. * Only terminate your turn when you are sure that the problem is solved. Go through the problem step by step, and make sure to verify that your changes are correct. NEVER end your turn without having solved the problem, and when you say you are going to make a tool call, make sure you ACTUALLY make the tool call, instead of ending your turn. * You MUST plan extensively before each function call, and reflect extensively on the outcomes of the previous function calls. DO NOT do this entire process by making function calls only, as this can impair your ability to solve the problem and think insightfully. * Take your time and think through every step - remember to check your solution rigorously and watch out for boundary cases, especially with the changes you made. Your solution must be perfect. If not, continue working on it. When you are validating solutions with tools, it is important to iterate until you get success * Do not stop and ask the user for confirmation for actions you should be taking to achieve the outcomes directed and with tools available. - - -The current date is {{current_date_time}}. - goose uses LLM providers with tool calling capability. Your model may have varying knowledge cut-off dates depending on when they were trained, but typically it's between 5-10 months prior to the current date. diff --git a/crates/goose/tests/mcp_integration_test.rs b/crates/goose/tests/mcp_integration_test.rs index 7434f87c6ba2..821233346e8b 100644 --- a/crates/goose/tests/mcp_integration_test.rs +++ b/crates/goose/tests/mcp_integration_test.rs @@ -11,6 +11,7 @@ use goose::agents::extension::{Envs, ExtensionConfig}; use goose::agents::extension_manager::ExtensionManager; use mcp_core::ToolCall; +use serial_test::serial; use test_case::test_case; enum TestMode { @@ -78,11 +79,14 @@ enum TestMode { vec![] )] #[tokio::test] +#[serial] async fn test_replayed_session( command: Vec<&str>, tool_calls: Vec, required_envs: Vec<&str>, ) { + // Disable MOIM for replayed sessions to ensure consistency with recorded data + std::env::set_var("GOOSE_MOIM_ENABLED", "false"); let replay_file_name = command .iter() .map(|s| s.replace("/", "_")) @@ -200,4 +204,5 @@ async fn test_replayed_session( } panic!("Test failed: {:?}", err); } + std::env::remove_var("GOOSE_MOIM_ENABLED"); } diff --git a/crates/goose/tests/moim_tests.rs b/crates/goose/tests/moim_tests.rs new file mode 100644 index 000000000000..555d95d30e89 --- /dev/null +++ b/crates/goose/tests/moim_tests.rs @@ -0,0 +1,99 @@ +use goose::agents::moim; +use goose::conversation::message::Message; +use goose::conversation::Conversation; +use mcp_core::tool::ToolCall; +use rmcp::model::Content; +use serial_test::serial; + +// All async tests that call inject_moim_if_enabled need #[serial] because +// that function accesses Config::global() to check if MOIM is enabled +#[tokio::test] +#[serial] +async fn test_moim_basic_injection() { + let messages = vec![Message::user().with_text("Only message")]; + let conversation = Conversation::new_unvalidated(messages.clone()); + + let result = moim::inject_moim_if_enabled(conversation, &None).await; + + assert_eq!(result.len(), 2); + assert!(result.messages()[0] + .as_concat_text() + .contains("Current date and time:")); +} + +#[tokio::test] +#[serial] +async fn test_moim_disabled_no_injection() { + std::env::set_var("GOOSE_MOIM_ENABLED", "false"); + + let messages = vec![ + Message::user().with_text("Test message"), + Message::assistant().with_text("Test response"), + ]; + let conversation = Conversation::new_unvalidated(messages.clone()); + + let result = moim::inject_moim_if_enabled(conversation, &None).await; + + assert_eq!(result.len(), messages.len()); + assert_eq!(result.messages()[0].as_concat_text(), "Test message"); + assert_eq!(result.messages()[1].as_concat_text(), "Test response"); + + std::env::remove_var("GOOSE_MOIM_ENABLED"); +} + +#[tokio::test] +#[serial] +async fn test_moim_respects_tool_pairs() { + // Critical test: ensure MOIM doesn't break tool call/response pairs + let tool_call = Ok(ToolCall::new( + "test_tool", + serde_json::json!({"param": "value"}), + )); + let tool_result = Ok(vec![Content::text("Tool executed successfully")]); + + let messages = vec![ + Message::user().with_text("Please use the tool"), + Message::assistant() + .with_text("I'll use the tool for you") + .with_tool_request("tool1", tool_call), + Message::user().with_tool_response("tool1", tool_result), + Message::assistant().with_text("The tool has been executed"), + Message::user().with_text("Thank you"), + ]; + + let conversation = Conversation::new_unvalidated(messages.clone()); + + let result = moim::inject_moim_if_enabled(conversation, &None).await; + + assert_eq!(result.len(), messages.len() + 1); + + // Verify tool call and response are still adjacent + let msgs = result.messages(); + for i in 0..msgs.len() - 1 { + if msgs[i].is_tool_call() { + assert!( + msgs[i + 1].is_tool_response() + || !msgs[i + 1] + .as_concat_text() + .contains("Current date and time:"), + "MOIM should not be inserted between tool call and response" + ); + } + } +} + +#[test] +fn test_find_safe_insertion_point_ending_with_tool_response() { + // Critical test: when conversation ends with tool response, don't break the pair + let tool_call = Ok(ToolCall::new("test_tool", serde_json::json!({}))); + let tool_result = Ok(vec![Content::text("Result")]); + + let messages = vec![ + Message::user().with_text("Do something"), + Message::assistant().with_tool_request("tool1", tool_call), + Message::user().with_tool_response("tool1", tool_result), + ]; + + // Should insert before the tool call instead (index 1) + assert_eq!(goose::agents::moim::find_safe_insertion_point(&messages), 1); +}