diff --git a/.gitignore b/.gitignore index 9254becb26e2..6b1ee2ce31fe 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,7 @@ run_cli.sh tokenizer_files/ .DS_Store .idea +.vscode *.log tmp/ diff --git a/Cargo.lock b/Cargo.lock index 6bfd8fc04d3b..45a09f4d5aba 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -111,7 +111,22 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "43d5b281e737544384e969a5ccad3f1cdd24b48086a0fc1b2a5262a26b8f4f4a" dependencies = [ "anstyle", - "anstyle-parse", + "anstyle-parse 0.2.7", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstream" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "824a212faf96e9acacdbd09febd34438f8f711fb84e09a8916013cd7815ca28d" +dependencies = [ + "anstyle", + "anstyle-parse 1.0.0", "anstyle-query", "anstyle-wincon", "colorchoice", @@ -121,9 +136,9 @@ dependencies = [ [[package]] name = "anstyle" -version = "1.0.13" +version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5192cca8006f1fd4f7237516f40fa183bb07f8fbdfedaa0036de5ea9b0b45e78" +checksum = "940b3a0ca603d1eade50a4846a2afffd5ef57a9feac2c0e2ec2e14f9ead76000" [[package]] name = "anstyle-parse" @@ -134,6 +149,15 @@ dependencies = [ "utf8parse", ] +[[package]] +name = "anstyle-parse" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52ce7f38b242319f7cabaa6813055467063ecdc9d355bbb4ce0c68908cd8130e" +dependencies = [ + "utf8parse", +] + [[package]] name = "anstyle-query" version = "1.1.5" @@ -940,7 +964,7 @@ dependencies = [ "bincode", "bytesize", "clircle", - "console 0.16.2", + "console 0.16.3", "content_inspector", "encoding_rs", "flate2", @@ -964,7 +988,7 @@ dependencies = [ "syntect", "terminal-colorsaurus", "thiserror 2.0.18", - "toml", + "toml 0.9.12+spec-1.1.0", "unicode-segmentation", "unicode-width 0.2.2", "walkdir", @@ -1838,9 +1862,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.60" +version = "4.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2797f34da339ce31042b27d23607e051786132987f595b02ba4f6a6dffb7030a" +checksum = "b193af5b67834b676abd72466a96c1024e6a6ad978a1f484bd90b85c94041351" dependencies = [ "clap_builder", "clap_derive", @@ -1848,11 +1872,11 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.60" +version = "4.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24a241312cea5059b13574bb9b3861cabf758b879c15190b37b6d6fd63ab6876" +checksum = "714a53001bf66416adb0e2ef5ac857140e7dc3a0c48fb28b2f10762fc4b5069f" dependencies = [ - "anstream", + "anstream 1.0.0", "anstyle", "clap_lex", "strsim", @@ -1860,18 +1884,18 @@ dependencies = [ [[package]] name = "clap_complete" -version = "4.5.66" +version = "4.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c757a3b7e39161a4e56f9365141ada2a6c915a8622c408ab6bb4b5d047371031" +checksum = "19c9f1dde76b736e3681f28cec9d5a61299cbaae0fce80a68e43724ad56031eb" dependencies = [ "clap", ] [[package]] name = "clap_derive" -version = "4.5.55" +version = "4.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a92793da1a46a5f2a02a6f4c46c6496b28c43638adea8306fcb0caa1634f24e5" +checksum = "1110bd8a634a1ab8cb04345d8d878267d57c3cf1b38d91b71af6686408bbca6a" dependencies = [ "heck", "proc-macro2", @@ -1881,15 +1905,15 @@ dependencies = [ [[package]] name = "clap_lex" -version = "1.0.0" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3a822ea5bc7590f9d40f1ba12c0dc3c2760f3482c6984db1573ad11031420831" +checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9" [[package]] name = "clap_mangen" -version = "0.2.31" +version = "0.2.33" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "439ea63a92086df93893164221ad4f24142086d535b3a0957b9b9bea2dc86301" +checksum = "7e30ffc187e2e3aeafcd1c6e2aa416e29739454c0ccaa419226d5ecd181f2d78" dependencies = [ "clap", "roff", @@ -1945,9 +1969,9 @@ checksum = "3d7b894f5411737b7867f4827955924d7c254fc9f4d91a6aad6b097804b1018b" [[package]] name = "colorchoice" -version = "1.0.4" +version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" +checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570" [[package]] name = "combine" @@ -2029,9 +2053,9 @@ dependencies = [ [[package]] name = "config" -version = "0.15.19" +version = "0.15.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b30fa8254caad766fc03cb0ccae691e14bf3bd72bfff27f72802ce729551b3d6" +checksum = "4fe5feec195269515c4722937cd7ffcfe7b4205d18d2e6577b7223ecb159ab00" dependencies = [ "async-trait", "convert_case 0.6.0", @@ -2042,7 +2066,7 @@ dependencies = [ "serde-untagged", "serde_core", "serde_json", - "toml", + "toml 1.0.6+spec-1.1.0", "winnow", "yaml-rust2", ] @@ -2062,13 +2086,12 @@ dependencies = [ [[package]] name = "console" -version = "0.16.2" +version = "0.16.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03e45a4a8926227e4197636ba97a9fc9b00477e9f4bd711395687c5f0734bec4" +checksum = "d64e8af5551369d19cf50138de61f1c42074ab970f74e99be916646777f8fc87" dependencies = [ "encode_unicode", "libc", - "once_cell", "unicode-width 0.2.2", "windows-sys 0.61.2", ] @@ -4373,7 +4396,7 @@ dependencies = [ name = "goose-cli" version = "1.27.0" dependencies = [ - "anstream", + "anstream 0.6.21", "anyhow", "async-trait", "base64 0.22.1", @@ -4385,7 +4408,7 @@ dependencies = [ "clap_mangen", "cliclack", "comfy-table", - "console 0.16.2", + "console 0.16.3", "dotenvy", "etcetera 0.11.0", "futures", @@ -5224,7 +5247,7 @@ version = "0.18.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "25470f23803092da7d239834776d653104d551bc4d7eacaf31e6837854b8e9eb" dependencies = [ - "console 0.16.2", + "console 0.16.3", "portable-atomic", "unicode-width 0.2.2", "unit-prefix", @@ -7922,9 +7945,9 @@ dependencies = [ [[package]] name = "roff" -version = "0.2.2" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "88f8660c1ff60292143c98d08fc6e2f654d722db50410e3f3797d40baaf9d8f3" +checksum = "dbf2048e0e979efb2ca7b91c4f1a8d77c91853e9b987c94c555668a8994915ad" [[package]] name = "ron" @@ -10349,12 +10372,25 @@ dependencies = [ "indexmap 2.13.0", "serde_core", "serde_spanned", - "toml_datetime", + "toml_datetime 0.7.5+spec-1.1.0", "toml_parser", "toml_writer", "winnow", ] +[[package]] +name = "toml" +version = "1.0.6+spec-1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "399b1124a3c9e16766831c6bba21e50192572cdd98706ea114f9502509686ffc" +dependencies = [ + "serde_core", + "serde_spanned", + "toml_datetime 1.0.0+spec-1.1.0", + "toml_parser", + "winnow", +] + [[package]] name = "toml_datetime" version = "0.7.5+spec-1.1.0" @@ -10364,6 +10400,15 @@ dependencies = [ "serde_core", ] +[[package]] +name = "toml_datetime" +version = "1.0.0+spec-1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32c2555c699578a4f59f0cc68e5116c8d7cabbd45e1409b989d4be085b53f13e" +dependencies = [ + "serde_core", +] + [[package]] name = "toml_parser" version = "1.0.9+spec-1.1.0" @@ -10580,9 +10625,9 @@ dependencies = [ [[package]] name = "tracing-subscriber" -version = "0.3.22" +version = "0.3.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f30143827ddab0d256fd843b7a66d164e9f271cfa0dde49142c5ca0ca291f1e" +checksum = "cb7f578e5945fb242538965c2d0b04418d38ec25c79d160cd279bf0731c8d319" dependencies = [ "matchers", "nu-ansi-term", diff --git a/crates/goose-acp/tests/test_data/openai_builtin_execute.txt b/crates/goose-acp/tests/test_data/openai_builtin_execute.txt index bf6d35bd7935..9b2edd9485ce 100644 --- a/crates/goose-acp/tests/test_data/openai_builtin_execute.txt +++ b/crates/goose-acp/tests/test_data/openai_builtin_execute.txt @@ -1,4 +1,4 @@ -data: {"id":"chatcmpl-D64NZp69RkEyXdUoDaCBj7fSYll8J","object":"chat.completion.chunk","created":1770339173,"model":"gpt-5-nano-2025-08-07","service_tier":"default","system_fingerprint":null,"choices":[{"index":0,"delta":{"role":"assistant","content":null,"tool_calls":[{"index":0,"id":"call_HCUq7OYIqj233H77wpqAtSGP","type":"function","function":{"name":"code_execution__execute","arguments":""}}],"refusal":null},"finish_reason":null}],"usage":null,"obfuscation":"XbIx"} +data: {"id":"chatcmpl-D64NZp69RkEyXdUoDaCBj7fSYll8J","object":"chat.completion.chunk","created":1770339173,"model":"gpt-5-nano-2025-08-07","service_tier":"default","system_fingerprint":null,"choices":[{"index":0,"delta":{"role":"assistant","content":null,"tool_calls":[{"index":0,"id":"call_HCUq7OYIqj233H77wpqAtSGP","type":"function","function":{"name":"code_execution__execute_typescript","arguments":""}}],"refusal":null},"finish_reason":null}],"usage":null,"obfuscation":"XbIx"} data: {"id":"chatcmpl-D64NZp69RkEyXdUoDaCBj7fSYll8J","object":"chat.completion.chunk","created":1770339173,"model":"gpt-5-nano-2025-08-07","service_tier":"default","system_fingerprint":null,"choices":[{"index":0,"delta":{"tool_calls":[{"index":0,"function":{"arguments":"{\""}}]},"finish_reason":null}],"usage":null,"obfuscation":"0WAOew1EJA"} diff --git a/crates/goose-cli/src/session/output.rs b/crates/goose-cli/src/session/output.rs index 8472cc8a6af2..d7985df82184 100644 --- a/crates/goose-cli/src/session/output.rs +++ b/crates/goose-cli/src/session/output.rs @@ -480,7 +480,7 @@ fn render_tool_request(req: &ToolRequest, theme: Theme, debug: bool) { Ok(call) => match call.name.to_string().as_str() { name if is_shell_tool_name(name) => render_shell_request(call, debug), name if is_file_tool_name(name) => render_text_editor_request(call, debug), - "execute" | "execute_code" => render_execute_code_request(call, debug), + "execute_typescript" | "execute_code" => render_execute_code_request(call, debug), "delegate" => render_delegate_request(call, debug), "subagent" => render_delegate_request(call, debug), "todo__write" => render_todo_request(call, debug), @@ -822,7 +822,7 @@ pub fn render_subagent_tool_call( arguments: Option<&JsonObject>, debug: bool, ) { - if tool_name == "code_execution__execute_code" { + if tool_name == "code_execution__execute_typescript" { let tool_graph = arguments .and_then(|args| args.get("tool_graph")) .and_then(Value::as_array) @@ -851,7 +851,7 @@ fn render_subagent_tool_graph(subagent_id: &str, tool_graph: &[Value]) { " {} {} {} {} tool call{}", style("▸").dim(), style(format!("[subagent:{}]", short_id)).dim(), - style("execute_code").dim(), + style("execute_typescript").dim(), style(count).dim(), plural, ); diff --git a/crates/goose/src/agents/platform_extensions/code_execution.rs b/crates/goose/src/agents/platform_extensions/code_execution.rs index 22e0f43ae76c..26012035678b 100644 --- a/crates/goose/src/agents/platform_extensions/code_execution.rs +++ b/crates/goose/src/agents/platform_extensions/code_execution.rs @@ -4,11 +4,13 @@ use crate::agents::mcp_client::{Error, McpClientTrait}; use crate::agents::tool_execution::ToolCallContext; use anyhow::Result; use async_trait::async_trait; -use indoc::indoc; -use pctx_code_mode::config::ToolDisclosure; -use pctx_code_mode::model::{CallbackConfig, ExecuteInput, GetFunctionDetailsInput}; -use pctx_code_mode::registry::{CallbackFn, PctxRegistry}; -use pctx_code_mode::CodeMode; +use pctx_code_mode::{ + config::ToolDisclosure, + descriptions::{tools as tool_descriptions, workflow::get_workflow_description}, + model::{CallbackConfig, ExecuteBashInput, ExecuteInput, GetFunctionDetailsInput}, + registry::{CallbackFn, PctxRegistry}, + CodeMode, +}; use rmcp::model::{ CallToolRequestParams, CallToolResult, Content, Implementation, InitializeResult, JsonObject, ListToolsResult, RawContent, Role, ServerCapabilities, Tool as McpTool, ToolAnnotations, @@ -29,6 +31,7 @@ pub static EXTENSION_NAME: &str = "code_execution"; pub struct CodeExecutionClient { info: InitializeResult, context: PlatformExtensionContext, + disclosure: ToolDisclosure, state: RwLock>, } @@ -54,32 +57,18 @@ pub struct ExecuteWithToolGraph { } impl CodeExecutionClient { - pub fn new(context: PlatformExtensionContext) -> Result { + pub fn new(context: PlatformExtensionContext, disclosure: ToolDisclosure) -> Result { let info = InitializeResult::new(ServerCapabilities::builder().enable_tools().build()) .with_server_info( Implementation::new(EXTENSION_NAME.to_string(), "1.0.0".to_string()) .with_title("Code Mode"), ) - .with_instructions(indoc! {r#" - BATCH MULTIPLE TOOL CALLS INTO ONE execute CALL. - - This extension exists to reduce round-trips. When a task requires multiple tool calls: - - WRONG: Multiple execute calls, each with one tool - - RIGHT: One execute call with a script that calls all needed tools - - IMPORTANT: All tool calls are ASYNC. Use await for each call. - - Workflow: - 1. Use the list_functions and get_function_details tools to discover tools and signatures - 2. Write ONE script that calls ALL tools needed for the task, no need to import anything, - all the namespaces returned by list_functions and get_function_details will be available - 3. Chain results: use output from one tool as input to the next - 4. Only return and console.log data you need, tools could have very large responses. - "#}.to_string()); + .with_instructions(get_workflow_description(disclosure)); Ok(Self { info, context, + disclosure, state: RwLock::new(None), }) } @@ -95,19 +84,20 @@ impl CodeExecutionClient { .get_prefixed_tools_excluding(session_id, EXTENSION_NAME) .await .ok()?; + let mut cfgs = vec![]; for tool in tools { - let full_name = tool.name.to_string(); - let (namespace, name) = if let Some((server, tool_name)) = full_name.split_once("__") { - (server.to_string(), tool_name.to_string()) + let (name, namespace) = if let Some((prefix, tool_name)) = tool.name.split_once("__") { + (tool_name.to_string(), Some(prefix.to_string())) } else if let Some(owner) = get_tool_owner(&tool) { - (owner, full_name) + (tool.name.to_string(), Some(owner)) } else { - continue; + (tool.name.to_string(), None) }; + cfgs.push(CallbackConfig { name, - namespace: Some(namespace), + namespace, description: tool.description.as_ref().map(|d| d.to_string()), input_schema: Some(json!(tool.input_schema)), output_schema: tool.output_schema.as_ref().map(|s| json!(s)), @@ -146,10 +136,11 @@ impl CodeExecutionClient { let state = CodeModeState::new(cfgs)?; let code_mode = state.code_mode.clone(); *guard = Some(state); + Ok(code_mode) } - /// Build a CallbackRegistry with all tool callbacks registered + /// Build a PctxRegistry with all tool callbacks registered fn build_callback_registry( &self, session_id: &str, @@ -164,7 +155,14 @@ impl CodeExecutionClient { let registry = PctxRegistry::default(); for cfg in code_mode.callbacks() { - let full_name = format!("{}__{}", cfg.namespace.as_deref().unwrap_or(""), &cfg.name); + let full_name = format!( + "{}{}", + cfg.namespace + .clone() + .map(|n| format!("{n}__")) + .unwrap_or_default(), + &cfg.name + ); let callback = create_tool_callback(session_id.to_string(), full_name, manager.clone()); registry .add_callback(&cfg.id(), callback) @@ -200,8 +198,43 @@ impl CodeExecutionClient { Ok(vec![Content::text(output.code)]) } - /// Handle the execute tool call - async fn handle_execute( + /// Handle the execute bash tool call + async fn handle_execute_bash( + &self, + session_id: &str, + arguments: Option, + ) -> Result, String> { + let input: ExecuteBashInput = arguments + .map(|args| serde_json::from_value(Value::Object(args))) + .transpose() + .map_err(|e| format!("Failed to parse arguments: {e}"))? + .ok_or("Missing arguments for execute_bash")?; + let command = input.command; + let code_mode = self.get_code_mode(session_id).await?; + + // Deno runtime is not Send, so we need to run it in a blocking task + // with its own tokio runtime + let output = tokio::task::spawn_blocking(move || { + let rt = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .map_err(|e| format!("Failed to create runtime: {e}"))?; + + rt.block_on(async move { + code_mode + .execute_bash(&command) + .await + .map_err(|e| format!("Typescript execution error: {e}")) + }) + }) + .await + .map_err(|e| format!("Typescript execution task failed: {e}"))??; + + Ok(vec![Content::text(output.markdown())]) + } + + /// Handle the execute typescript tool call + async fn handle_execute_typescript( &self, session_id: &str, arguments: Option, @@ -210,11 +243,12 @@ impl CodeExecutionClient { .map(|args| serde_json::from_value(Value::Object(args))) .transpose() .map_err(|e| format!("Failed to parse arguments: {e}"))? - .ok_or("Missing arguments for execute")?; + .ok_or("Missing arguments for execute_typescript")?; let code_mode = self.get_code_mode(session_id).await?; let registry = self.build_callback_registry(session_id, &code_mode)?; let code = args.input.code.clone(); + let disclosure = self.disclosure; // Deno runtime is not Send, so we need to run it in a blocking task // with its own tokio runtime @@ -226,13 +260,13 @@ impl CodeExecutionClient { rt.block_on(async move { code_mode - .execute_typescript(&code, ToolDisclosure::default(), Some(registry)) + .execute_typescript(&code, disclosure, Some(registry)) .await - .map_err(|e| format!("Execution error: {e}")) + .map_err(|e| format!("Typescript execution error: {e}")) }) }) .await - .map_err(|e| format!("Execution task failed: {e}"))??; + .map_err(|e| format!("Typescript execution task failed: {e}"))??; Ok(vec![Content::text(output.markdown())]) } @@ -316,97 +350,79 @@ impl McpClientTrait for CodeExecutionClient { })) .expect("valid schema"); - Ok(ListToolsResult { - tools: vec![ - McpTool::new( - "list_functions".to_string(), - indoc! {r#" - List all available functions across all namespaces. - - This will not return function input and output types. - After determining which functions are needed use - get_function_details to get input and output type - information about specific functions. - "#} - .to_string(), - empty_schema, - ) - .annotate(ToolAnnotations::from_raw( - Some("List functions".to_string()), - Some(true), - Some(false), - Some(true), - Some(false), - )), - McpTool::new( - "get_function_details".to_string(), - indoc! {r#" - Get detailed type information for specific functions. - - Provide a list of function identifiers in the format "Namespace.functionName" - (e.g., "Developer.shell", "Github.createIssue"). - - Returns full TypeScript interface definitions with parameter types, - return types, and descriptions for the requested functions. - "#} - .to_string(), - schema::(), - ) - .annotate(ToolAnnotations::from_raw( - Some("Get function details".to_string()), - Some(true), - Some(false), - Some(true), - Some(false), - )), - McpTool::new( - "execute".to_string(), - indoc! {r#" - Execute TypeScript code that calls available functions. - - SYNTAX - TypeScript with async run() function: - ```typescript - async function run() { - // Access functions via Namespace.functionName({ params }) — always camelCase - const files = await Developer.shell({ command: "ls -la" }); - const readme = await Developer.shell({ command: "cat ./README.md" }); - return { files, readme }; - } - ``` - - TOOL_GRAPH: Always provide tool_graph to describe the execution flow for the UI. - Each node has: tool (Namespace.functionName), description (what it does), depends_on (indices of dependencies). - Example for chained operations: - [ - {"tool": "Developer.shell", "description": "list files", "depends_on": []}, - {"tool": "Developer.shell", "description": "read README.md", "depends_on": []}, - {"tool": "Developer.write", "description": "write output.txt", "depends_on": [0, 1]} - ] - - KEY RULES: - - Code MUST define an async function named `run()` - - All function calls are async - use `await` - - Function names are always camelCase (e.g., Developer.shell, Github.listIssues, Github.createIssue) - - Return value from `run()` is the result, all `console.log()` output will be returned as well. - - Only functions from `list_functions()` and `console` methods are available — no `fetch()`, `fs`, or other Node/Deno APIs - - Variables don't persist between `execute()` calls - return or log anything you need later - - Code runs in an isolated sandbox with restricted network access - - HANDLING RETURN VALUES: - - If a function returns `any`, do NOT assume its shape - log it first: `console.log(JSON.stringify(result))` - - Many functions return wrapper objects, not raw arrays - check the response structure before calling .filter(), .map(), etc. - - Always inspect unfamiliar return values with console.log() before processing them - - TOKEN USAGE WARNING: This tool could return LARGE responses if your code returns big objects. - To minimize tokens: - - Filter/map/reduce data IN YOUR CODE before returning - - Only return specific fields you need (e.g., return {id: result.id, count: items.length}) - - Use console.log() for intermediate results instead of returning everything - - Avoid returning full API responses - extract just what you need - - BEFORE CALLING: Use list_functions or get_function_details to check available functions and their parameters. - "#} - .to_string(), + let tools = match self.disclosure { + ToolDisclosure::Catalog => { + vec![ + McpTool::new( + "list_functions".to_string(), + tool_descriptions::LIST_FUNCTIONS.to_string(), + empty_schema, + ) + .annotate(ToolAnnotations::from_raw( + Some("List functions".to_string()), + Some(true), + Some(false), + Some(true), + Some(false), + )), + McpTool::new( + "get_function_details".to_string(), + tool_descriptions::GET_FUNCTION_DETAILS.to_string(), + schema::(), + ) + .annotate(ToolAnnotations::from_raw( + Some("Get function details".to_string()), + Some(true), + Some(false), + Some(true), + Some(false), + )), + McpTool::new( + "execute_typescript".to_string(), + tool_descriptions::EXECUTE_TYPESCRIPT_CATALOG.to_string(), + schema::(), + ) + .annotate(ToolAnnotations::from_raw( + Some("Execute TypeScript".to_string()), + Some(false), + Some(true), + Some(false), + Some(true), + )), + ] + } + ToolDisclosure::Filesystem => { + vec![ + McpTool::new( + "execute_bash".to_string(), + tool_descriptions::EXECUTE_BASH.to_string(), + schema::(), + ) + .annotate(ToolAnnotations::from_raw( + Some("Get function details".to_string()), + Some(true), + Some(false), + Some(true), + Some(false), + )), + McpTool::new( + "execute_typescript".to_string(), + tool_descriptions::EXECUTE_TYPESCRIPT_FILESYSTEM.to_string(), + schema::(), + ) + .annotate(ToolAnnotations::from_raw( + Some("Execute TypeScript".to_string()), + Some(false), + Some(true), + Some(false), + Some(true), + )), + ] + } + ToolDisclosure::Sidecar => { + vec![McpTool::new( + "execute_typescript".to_string(), + tool_descriptions::EXECUTE_TYPESCRIPT_SIDECAR.to_string(), schema::(), ) .annotate(ToolAnnotations::from_raw( @@ -415,10 +431,14 @@ impl McpClientTrait for CodeExecutionClient { Some(true), Some(false), Some(true), - )), - ], - next_cursor: None, + ))] + } + }; + + Ok(ListToolsResult { meta: None, + next_cursor: None, + tools, }) } @@ -436,7 +456,8 @@ impl McpClientTrait for CodeExecutionClient { self.handle_get_function_details(session_id, arguments) .await } - "execute" => self.handle_execute(session_id, arguments).await, + "execute_bash" => self.handle_execute_bash(session_id, arguments).await, + "execute_typescript" => self.handle_execute_typescript(session_id, arguments).await, _ => Err(format!("Unknown tool: {name}")), }; @@ -454,28 +475,48 @@ impl McpClientTrait for CodeExecutionClient { async fn get_moim(&self, session_id: &str) -> Option { let code_mode = self.get_code_mode(session_id).await.ok()?; - let available: Vec<_> = code_mode - .list_functions() - .functions - .iter() - .map(|f| format!("{}.{}", &f.namespace, &f.name)) - .collect(); + + let disclosure_style_moim = match self.disclosure { + ToolDisclosure::Catalog => { + let available_fns: Vec<_> = code_mode + .list_functions() + .functions + .iter() + .map(|f| format!("{}.{}", &f.namespace, &f.name)) + .collect(); + format!("Available functions: {} + + Use the list_functions & get_function_details tools to see tool signatures and input/output types before calling execute_typescript.", available_fns.join(", ")) + } + ToolDisclosure::Filesystem => { + let available_filepaths: Vec<_> = code_mode + .virtual_fs().keys().map(String::from).collect(); + format!("Use execute_bash to search and read the tool signatures and input/output types before calling execute_typescript. The available files are: {}", available_filepaths.join(", ")) + }, + ToolDisclosure::Sidecar => "Prioritize calling tools with the execute_typescript tool, especially when multiple tools can be called in one script.".into(), + }; Some(format!( indoc::indoc! {r#" - ALWAYS batch multiple tool operations into ONE execute call. - - WRONG: Separate execute calls for read file, then write file - - RIGHT: One execute with an async run() function that reads AND writes - - Available namespaces: {} + ALWAYS batch multiple tool operations into ONE execute_typescript call. + - WRONG: Separate execute_typescript calls for read file, then write file + - RIGHT: One execute_typescript with an async run() function that reads AND writes AND logs/returns as little information as needed for the next step. - Use the list_functions & get_function_details tools to see tool signatures and input/output types before calling unfamiliar tools. + {} "#}, - available.join(", ") + disclosure_style_moim )) } } +pub fn get_tool_disclosure() -> ToolDisclosure { + let config = crate::config::Config::global(); + let tool_disclosure_str: String = config + .get_param("CODE_MODE_TOOL_DISCLOSURE") + .unwrap_or_else(|_| "catalog".to_string()); + serde_json::from_value(serde_json::json!(tool_disclosure_str)).unwrap_or_default() +} + struct CodeModeState { code_mode: CodeMode, hash: u64, diff --git a/crates/goose/src/agents/platform_extensions/mod.rs b/crates/goose/src/agents/platform_extensions/mod.rs index cd577d71e8d2..04eecb2cf25b 100644 --- a/crates/goose/src/agents/platform_extensions/mod.rs +++ b/crates/goose/src/agents/platform_extensions/mod.rs @@ -128,7 +128,13 @@ pub static PLATFORM_EXTENSIONS: Lazy default_enabled: false, unprefixed_tools: true, client_factory: |ctx| { - Box::new(code_execution::CodeExecutionClient::new(ctx).unwrap()) + Box::new( + code_execution::CodeExecutionClient::new( + ctx, + code_execution::get_tool_disclosure(), + ) + .unwrap(), + ) }, }, ); diff --git a/crates/goose/src/agents/reply_parts.rs b/crates/goose/src/agents/reply_parts.rs index 0fc3cf6c86d9..8f60e3111473 100644 --- a/crates/goose/src/agents/reply_parts.rs +++ b/crates/goose/src/agents/reply_parts.rs @@ -155,13 +155,50 @@ impl Agent { #[cfg(not(feature = "code-mode"))] let code_execution_active = false; if code_execution_active { - tools.retain(|tool| { - if let Some(owner) = crate::agents::extension_manager::get_tool_owner(tool) { - crate::agents::extension_manager::is_first_class_extension(&owner) - } else { - false - } - }); + let disclosure_style = + crate::agents::platform_extensions::code_execution::get_tool_disclosure(); + + tools = tools + .into_iter() + .filter_map(|mut t| match disclosure_style { + pctx_code_mode::config::ToolDisclosure::Catalog + | pctx_code_mode::config::ToolDisclosure::Filesystem => { + // in catalog & filesystem styles, progressive search is handled + // by pctx, so we want to omit all non-first-class extensions + // from the standard tool list + if crate::agents::extension_manager::get_tool_owner(&t).is_some_and(|o| { + crate::agents::extension_manager::is_first_class_extension(&o) + }) { + Some(t) + } else { + None + } + } + pctx_code_mode::config::ToolDisclosure::Sidecar => { + // in sidecar style there is no progressive search, just a way to chain tools + // together with typescript + // add output schema to description since many model providers drop the + // output schema when presenting tools to the model + let output_schema = t + .output_schema + .as_ref() + .map(|s| serde_json::json!(s).to_string()) + .unwrap_or("unknown".to_string()); + let description_extension = format!( + "The successful return schema of this tool is:\n{output_schema}" + ); + + t.description = Some( + t.description + .map(|t| format!("{t}\n{description_extension}")) + .unwrap_or(description_extension) + .into(), + ); + + Some(t) + } + }) + .collect(); } // Stable tool ordering is important for multi session prompt caching. diff --git a/crates/goose/src/agents/snapshots/goose__agents__prompt_manager__tests__all_platform_extensions.snap b/crates/goose/src/agents/snapshots/goose__agents__prompt_manager__tests__all_platform_extensions.snap index 1b00f5374ed4..124cc17058c0 100644 --- a/crates/goose/src/agents/snapshots/goose__agents__prompt_manager__tests__all_platform_extensions.snap +++ b/crates/goose/src/agents/snapshots/goose__agents__prompt_manager__tests__all_platform_extensions.snap @@ -64,21 +64,16 @@ Two modes: ## code_execution ### Instructions -BATCH MULTIPLE TOOL CALLS INTO ONE execute CALL. - -This extension exists to reduce round-trips. When a task requires multiple tool calls: -- WRONG: Multiple execute calls, each with one tool -- RIGHT: One execute call with a script that calls all needed tools - -IMPORTANT: All tool calls are ASYNC. Use await for each call. - -Workflow: - 1. Use the list_functions and get_function_details tools to discover tools and signatures - 2. Write ONE script that calls ALL tools needed for the task, no need to import anything, - all the namespaces returned by list_functions and get_function_details will be available - 3. Chain results: use output from one tool as input to the next - 4. Only return and console.log data you need, tools could have very large responses. - +General: + - BATCH MULTIPLE TOOL CALLS INTO ONE `execute_typescript` CALL. + - These tools exists to reduce round-trips. When a task requires multiple tool calls: + - WRONG: Multiple `execute_typescript` calls, each with one tool + - RIGHT: One `execute_typescript` call with a script that calls all needed tools + - Only `return` and `console.log` data you need, tools could have very large responses. + - IMPORTANT: All tool calls are ASYNC. Use await for each call. +WORKFLOW: + 1. Use the `list_functions` and `get_function_details` tools to discover tools signatures and input/output types. + 2. Write ONE script that calls ALL tools needed for the task and execute that script with `execute_typescript`, no need to import anything, all the namespaces returned by `list_functions` and `get_function_details` will be available globally. ## developer ### Instructions diff --git a/crates/goose/src/providers/local_inference.rs b/crates/goose/src/providers/local_inference.rs index 2020699a85b4..04bac00f1f0c 100644 --- a/crates/goose/src/providers/local_inference.rs +++ b/crates/goose/src/providers/local_inference.rs @@ -39,7 +39,7 @@ use tokio::sync::Mutex; use uuid::Uuid; const SHELL_TOOL: &str = "developer__shell"; -const CODE_EXECUTION_TOOL: &str = "code_execution__execute"; +const CODE_EXECUTION_TOOL: &str = "code_execution__execute_typescript"; type ModelSlot = Arc>>; @@ -210,10 +210,10 @@ fn build_openai_messages_json(system: &str, messages: &[Message]) -> String { /// the model: /// /// - `ToolRequest` with a `"command"` argument → `$ command` -/// - `ToolRequest` with a `"code"` argument → `` ```execute\n…\n``` `` +/// - `ToolRequest` with a `"code"` argument → `` ```execute_typescript\n…\n``` `` /// - `ToolResponse` → `Command output:\n…` /// -/// Only `developer__shell` and `code_execution__execute` style tool calls are +/// Only `developer__shell` and `code_execution__execute_typescript` style tool calls are /// recognized (by argument shape, not tool name). Tool calls from other extensions /// (e.g. custom MCP tools made by a native-tool-calling model earlier in the /// conversation) are silently dropped, since the emulator path has no syntax to @@ -241,7 +241,7 @@ fn extract_text_content(msg: &Message) -> String { .and_then(|a| a.get("code")) .and_then(|v| v.as_str()) { - parts.push(format!("```execute\n{}\n```", code)); + parts.push(format!("```execute_typescript\n{}\n```", code)); } } } diff --git a/crates/goose/src/providers/local_inference/inference_emulated_tools.rs b/crates/goose/src/providers/local_inference/inference_emulated_tools.rs index 902bdec27345..9e04852a4aa6 100644 --- a/crates/goose/src/providers/local_inference/inference_emulated_tools.rs +++ b/crates/goose/src/providers/local_inference/inference_emulated_tools.rs @@ -34,7 +34,7 @@ use super::inference_engine::{ }; use super::{finalize_usage, StreamSender, CODE_EXECUTION_TOOL, SHELL_TOOL}; -const HOLD_BACK_CODE_MODE: usize = " ```execute\n".len(); +const HOLD_BACK_CODE_MODE: usize = " ```execute_typescript\n".len(); const HOLD_BACK_SHELL_ONLY: usize = "\n$".len(); pub(super) fn load_tiny_model_prompt() -> String { @@ -79,7 +79,7 @@ pub(super) fn build_emulator_tool_description(tools: &[Tool], code_mode_enabled: The code runs immediately — do not explain it, just run it.\n\n", ); tool_desc.push_str("Example — counting files in /tmp:\n\n"); - tool_desc.push_str("```execute\nasync function run() {\n"); + tool_desc.push_str("```execute_typescript\nasync function run() {\n"); tool_desc.push_str( " const result = await Developer.shell({ command: \"ls -1 /tmp | wc -l\" });\n", ); @@ -206,7 +206,9 @@ impl StreamingEmulatorParser { ParserState::Normal => { // Check for ```execute block (code mode) if self.code_mode_enabled { - if let Some((before, after)) = self.buffer.split_once("```execute\n") { + if let Some((before, after)) = + self.buffer.split_once("```execute_typescript\n") + { if !before.trim().is_empty() { results.push(EmulatorAction::Text(before.to_string())); } @@ -215,8 +217,8 @@ impl StreamingEmulatorParser { continue; } // Also handle without newline after tag (accumulating) - if self.buffer.ends_with("```execute") { - let before = self.buffer.trim_end_matches("```execute"); + if self.buffer.ends_with("```execute_typescript") { + let before = self.buffer.trim_end_matches("```execute_typescript"); if !before.trim().is_empty() { results.push(EmulatorAction::Text(before.to_string())); } @@ -561,7 +563,7 @@ mod tests { #[test] fn execute_block() { - let input = "Here's the code:\n```execute\nconsole.log('hi');\n```\n"; + let input = "Here's the code:\n```execute_typescript\nconsole.log('hi');\n```\n"; let actions = parse_all(input, true); assert!(actions.len() >= 2); assert_text(&actions[0], "Here's the code:"); @@ -570,7 +572,7 @@ mod tests { #[test] fn execute_block_not_detected_without_code_mode() { - let input = "```execute\nconsole.log('hi');\n```\n"; + let input = "```execute_typescript\nconsole.log('hi');\n```\n"; let actions = parse_all(input, false); // Should be treated as plain text for action in &actions { @@ -592,7 +594,10 @@ mod tests { #[test] fn execute_fence_split_across_chunks() { - let actions = parse_chunks(&["Here:\n```ex", "ecute\nlet x = 1;\n", "```\n"], true); + let actions = parse_chunks( + &["Here:\n```ex", "ecute_typescript\nlet x = 1;\n", "```\n"], + true, + ); let executes: Vec<_> = actions .iter() .filter(|a| matches!(a, EmulatorAction::ExecuteCode(_))) @@ -655,7 +660,7 @@ mod tests { #[test] fn execute_block_with_multiline_code() { - let input = "```execute\nasync function run() {\n const r = await Developer.shell({ command: \"ls\" });\n return r;\n}\n```\n"; + let input = "```execute_typescript\nasync function run() {\n const r = await Developer.shell({ command: \"ls\" });\n return r;\n}\n```\n"; let actions = parse_all(input, true); let executes: Vec<_> = actions .iter() @@ -674,7 +679,7 @@ mod tests { #[test] fn unclosed_execute_block_flushed() { // Model stops generating mid-block - let input = "```execute\nlet x = 1;"; + let input = "```execute_typescript\nlet x = 1;"; let actions = parse_all(input, true); let executes: Vec<_> = actions .iter() diff --git a/crates/goose/src/providers/local_inference/tool_parsing.rs b/crates/goose/src/providers/local_inference/tool_parsing.rs index b03a6792622c..d2f57688b280 100644 --- a/crates/goose/src/providers/local_inference/tool_parsing.rs +++ b/crates/goose/src/providers/local_inference/tool_parsing.rs @@ -436,13 +436,13 @@ mod tests { #[test] fn test_parse_glm_style_tool_call_multiple_args() { - let text = "Let me check.\nexecutecodeasync function run() { return 1; }tool_graph[{\"tool\": \"shell\"}]"; + let text = "Let me check.\nexecute_typescriptcodeasync function run() { return 1; }tool_graph[{\"tool\": \"shell\"}]"; let result = split_content_and_xml_tool_calls(text); assert!(result.is_some()); let (content, calls) = result.unwrap(); assert_eq!(content, "Let me check."); assert_eq!(calls.len(), 1); - assert_eq!(calls[0].0, "execute"); + assert_eq!(calls[0].0, "execute_typescript"); assert_eq!( calls[0].1.get("code").unwrap(), "async function run() { return 1; }" diff --git a/documentation/docs/guides/managing-tools/code-mode.md b/documentation/docs/guides/managing-tools/code-mode.md index 49883b3ff291..8d8a373aac6c 100644 --- a/documentation/docs/guides/managing-tools/code-mode.md +++ b/documentation/docs/guides/managing-tools/code-mode.md @@ -15,13 +15,15 @@ This functionality requires the built-in [Code Mode extension](/docs/mcp/code-mo ::: Code Mode controls how tools are discovered and called: + - Tools from enabled extensions are discovered on-demand and loaded into context as needed - Multiple tool calls are batched in one execution - Intermediate results are chained (output from one tool as input to the next) ## How Code Mode Works -The [Code Mode extension](/docs/mcp/code-mode-mcp) is an MCP server that uses the MCP protocol to expose three foundational meta-tools. When Code Mode is enabled, goose switches to Code Mode. For every request, the LLM writes JavaScript code that goose executes using [pctx (Port of Context)](https://github.com/AdrianCole/pctx), a custom Deno-based runtime, to: +The [Code Mode extension](/docs/mcp/code-mode-mcp) is an MCP server that uses the MCP protocol to expose three foundational meta-tools. When Code Mode is enabled, goose switches to Code Mode. For every request, the LLM writes JavaScript code that goose executes using [pctx (Port of Context)](https://portofcontext.com/) ([GitHub](https://github.com/portofcontext/pctx)), a custom Deno-based runtime, to: + - Discover available tools from your enabled extensions (if needed) - Learn how to work with the tools it needs for the current task - Call those tools programmatically to complete the task @@ -32,7 +34,7 @@ Traditional MCP tool calling and Code Mode are two different approaches to the s | Aspect | Traditional | Code Mode | |--------|------------------|-----------| -| **Tool Discovery** | All tools from enabled extensions, for example:
• `developer.shell`
• `developer.text_editor`
• `github.list_issues`
• `github.get_pull_request`
• `slack.send_message`
• ... *potentially many more* | Code Mode extension's meta-tools:
• `list_functions`
• `get_function_details`
• `execute`

The LLM uses these tools to discover tools from other enabled extensions as needed | +| **Tool Discovery** | All tools from enabled extensions, for example:
• `developer.shell`
• `developer.text_editor`
• `github.list_issues`
• `github.get_pull_request`
• `slack.send_message`
• ... *potentially many more* | Code Mode extension's meta-tools:
• `list_functions`
• `get_function_details`
• `execute_typescript`

The LLM uses these tools to discover tools from other enabled extensions as needed | | **Tool Calling** | • Sequential tool calls
• Each result sent to the LLM before the next call | • May require tool discovery calls
• Multiple tool calls batched in one execution
• Intermediate results are chained and processed locally | | **Context Window** | Every LLM call includes all tool definitions from enabled extensions | Every LLM call includes the 3 meta-tool definitions, plus any tool definitions previously discovered in the session | | **Best For** | • 1-3 enabled extensions
• Simple tasks using 1-2 tools | • 5+ extensions
• Well-defined multi-step workflows | @@ -68,4 +70,4 @@ import notMcpReplacement from '@site/blog/2025-12-21-code-mode-doesnt-replace-mc duration: '8 min read' } ]} -/> +/> \ No newline at end of file diff --git a/scripts/test_providers_code_exec.sh b/scripts/test_providers_code_exec.sh index 19243d6622fc..c9d720d202a0 100755 --- a/scripts/test_providers_code_exec.sh +++ b/scripts/test_providers_code_exec.sh @@ -27,10 +27,11 @@ run_test() { cd "$testdir" && "$GOOSE_BIN" run --text "$prompt" --with-builtin "$BUILTINS" 2>&1 ) > "$output_file" 2>&1 - # Matches: "execute | code_execution", "get_function_details | code_execution", + # Matches: "execute_typescript | code_execution", "get_function_details | code_execution", # "tool call | execute", "tool calls | execute" (old format) # "▸ execute N tool call" (new format with tool_graph) - if grep -qE "(execute \| code_execution)|(get_function_details \| code_execution)|(tool calls? \| execute)|(▸.*execute.*tool call)" "$output_file"; then + # "▸ execute_typescript" (plain tool name in output) + if grep -qE "(execute_typescript \| code_execution)|(get_function_details \| code_execution)|(tool calls? \| execute)|(▸.*execute.*tool call)|(▸ execute_typescript)" "$output_file"; then echo "success|code_execution tool called" > "$result_file" else echo "failure|no code_execution tool calls found" > "$result_file" diff --git a/ui/desktop/src/components/ToolCallWithResponse.tsx b/ui/desktop/src/components/ToolCallWithResponse.tsx index cbca5a4b7d16..f3c3df5d3c06 100644 --- a/ui/desktop/src/components/ToolCallWithResponse.tsx +++ b/ui/desktop/src/components/ToolCallWithResponse.tsx @@ -354,9 +354,9 @@ const formatSubagentToolCall = (data: SubagentToolRequestData): string => { const extensionName = parts.slice(1).reverse().join('__') || ''; const toolGraph = toolCall.arguments?.tool_graph; - if (toolName === 'execute_code' && toolGraph && toolGraph.length > 0) { + if (toolName === 'execute_typescript' && toolGraph && toolGraph.length > 0) { const plural = toolGraph.length === 1 ? '' : 's'; - const header = `[subagent:${shortId}] ${toolGraph.length} tool call${plural} | execute_code`; + const header = `[subagent:${shortId}] ${toolGraph.length} tool call${plural} | execute_typescript`; const lines = toolGraph.map((node, idx) => { const deps = node.depends_on && node.depends_on.length > 0 @@ -638,7 +638,7 @@ function ToolCallView({ case 'computer_control': return `poking around...`; - case 'execute': { + case 'execute_typescript': { const toolGraph = args.tool_graph as unknown as ToolGraphNode[] | undefined; if (toolGraph && Array.isArray(toolGraph) && toolGraph.length > 0) { if (toolGraph.length === 1) { @@ -736,7 +736,7 @@ function ToolCallView({ const toolGraph = toolCall.arguments?.tool_graph as unknown as ToolGraphNode[] | undefined; if ( - toolCall.name === 'code_execution__execute' && + toolCall.name === 'code_execution__execute_typescript' && (typeof code === 'string' || Array.isArray(toolGraph)) ) { return (