diff --git a/common/console.cpp b/common/console.cpp index 857e9484519..d03aa5ff90f 100644 --- a/common/console.cpp +++ b/common/console.cpp @@ -83,6 +83,10 @@ namespace console { static bool bracket_paste_mode = false; // true when inside ESC[200~ ... ESC[201~ + // Thread-safety mutex for console operations + // Protects: current_display, out (FILE*), and all console output operations + static std::mutex g_console_mutex; + // // Init and cleanup // @@ -179,7 +183,8 @@ namespace console { // // Keep track of current display and only emit ANSI code if it changes - void set_display(display_type display) { + // Internal version without lock (for use when lock is already held) + static void set_display_unlocked(display_type display) { if (advanced_display && current_display != display) { common_log_flush(common_log_main()); switch(display) { @@ -210,6 +215,11 @@ namespace console { } } + void set_display(display_type display) { + std::lock_guard lock(g_console_mutex); + set_display_unlocked(display); + } + static char32_t getchar32() { #if defined(_WIN32) HANDLE hConsole = GetStdHandle(STD_INPUT_HANDLE); @@ -1198,6 +1208,7 @@ namespace console { } void log(const char * fmt, ...) { + std::lock_guard lock(g_console_mutex); va_list args; va_start(args, fmt); vfprintf(out, fmt, args); @@ -1205,16 +1216,45 @@ namespace console { } void error(const char * fmt, ...) { + std::lock_guard lock(g_console_mutex); va_list args; va_start(args, fmt); display_type cur = current_display; - set_display(DISPLAY_TYPE_ERROR); + set_display_unlocked(DISPLAY_TYPE_ERROR); vfprintf(out, fmt, args); - set_display(cur); // restore previous color + set_display_unlocked(cur); // restore previous color va_end(args); } void flush() { + std::lock_guard lock(g_console_mutex); + fflush(out); + } + + // + // output_guard implementation + // + + output_guard::output_guard() { + g_console_mutex.lock(); + } + + output_guard::~output_guard() { + g_console_mutex.unlock(); + } + + void output_guard::write(const char * fmt, ...) { + va_list args; + va_start(args, fmt); + vfprintf(out, fmt, args); + va_end(args); + } + + void output_guard::set_display(display_type type) { + set_display_unlocked(type); + } + + void output_guard::flush() { fflush(out); } } diff --git a/common/console.h b/common/console.h index 6d0db6420cc..24f90bed7ac 100644 --- a/common/console.h +++ b/common/console.h @@ -39,4 +39,27 @@ namespace console { void error(const char * fmt, ...); void flush(); + + // RAII guard for atomic multi-part console output + // Holds the console mutex for the lifetime of the object + // Use this when you need to output multiple lines/parts atomically + class output_guard { + public: + output_guard(); + ~output_guard(); + + // Non-copyable, non-movable + output_guard(const output_guard &) = delete; + output_guard & operator=(const output_guard &) = delete; + + // Write to console (mutex already held) + LLAMA_COMMON_ATTRIBUTE_FORMAT(2, 3) + void write(const char * fmt, ...); + + // Change display type (mutex already held) + void set_display(display_type type); + + // Flush output (mutex already held) + void flush(); + }; } diff --git a/tools/agent/CMakeLists.txt b/tools/agent/CMakeLists.txt index 6d8c36369fd..673c20f0c2c 100644 --- a/tools/agent/CMakeLists.txt +++ b/tools/agent/CMakeLists.txt @@ -10,6 +10,7 @@ set(AGENT_SOURCES subagent/subagent-types.cpp subagent/subagent-display.cpp subagent/subagent-runner.cpp + subagent/subagent-output.cpp tools/tool-bash.cpp tools/tool-read.cpp tools/tool-write.cpp diff --git a/tools/agent/README.md b/tools/agent/README.md index c136fccd2bd..be649915762 100644 --- a/tools/agent/README.md +++ b/tools/agent/README.md @@ -54,6 +54,104 @@ llama-agent -hf unsloth/Nemotron-3-Nano-30B-A3B-GGUF:Q5_K_M | `write` | Create or overwrite files | | `edit` | Search and replace in files | | `glob` | Find files matching a pattern | +| `task` | Spawn a subagent for complex tasks | + +## Subagents + +Subagents are specialized child agents that handle complex tasks independently, keeping the main conversation context clean and efficient. + +### Why Subagents? + +Without subagents, every file read and search pollutes your main context: + +``` +Main context after exploring codebase: +├── glob **/*.cpp → 50 files (800 tokens) +├── read src/main.cpp → full file (1,500 tokens) +├── read src/utils.cpp → full file (2,200 tokens) +├── grep "TODO" → 100 matches (1,200 tokens) +└── Total: ~5,700 tokens consumed for ONE exploration +``` + +With subagents, only the summary enters main context: + +``` +Main context: +└── task(explore) → "Found 3 TODO items in src/main.cpp:42,87,156" (50 tokens) + +Subagent context (discarded after): +├── All the detailed exploration (~5,700 tokens) +└── Summarized to parent +``` + +### Subagent Types + +| Type | Purpose | Tools Available | +|------|---------|-----------------| +| `explore` | Search and understand code (read-only) | `glob`, `read`, `bash` (read-only commands only) | +| `bash` | Execute shell commands | `bash` | +| `plan` | Design implementation approaches | `glob`, `read`, `bash` | +| `general` | General-purpose tasks | All tools | + +### How It Works + +``` +┌─────────────────┐ +│ Main Agent │ "Find where errors are handled" +└────────┬────────┘ + │ task(explore, "find error handling") + ▼ +┌─────────────────┐ +│ Subagent │ Does detailed exploration: +│ (explore) │ - glob **/*.cpp +│ │ - read 5 files +│ │ - grep patterns +└────────┬────────┘ + │ Returns summary only + ▼ +┌─────────────────┐ +│ Main Agent │ Receives: "Errors handled in src/error.cpp:45 +│ │ via ErrorHandler class..." +└─────────────────┘ +``` + +### Memory Efficiency + +Subagents share the model - no additional VRAM is used: + +| Resource | Main Agent | Subagent | Total | +|----------|------------|----------|-------| +| Model weights | ✓ | Shared | 1x | +| KV cache | ✓ | Shared via slots | 1x | +| Context window | Own | Own (discarded after) | Efficient | + +### Parallel Execution + +Multiple subagents can run in the background simultaneously: + +``` +> Run tests and check for lint errors at the same time + +[task-a1b2] ┌── ⚡ run-tests (bash) +[task-c3d4] ┌── ⚡ check-lint (bash) +[task-a1b2] │ ├─› bash npm test (2.1s) +[task-c3d4] │ ├─› bash npm run lint (1.8s) +[task-c3d4] │ └── done (1.8s) +[task-a1b2] │ └── done (2.1s) +``` + +### KV Cache Prefix Sharing + +Subagent prompts share a common prefix with the main agent, enabling automatic KV cache reuse: + +``` +Main agent prompt: "You are llama-agent... [base] + [main agent instructions]" +Subagent prompt: "You are llama-agent... [base] + # Subagent Mode: explore..." + ↑─────── shared prefix ──────↑ + Cached tokens reused, not re-processed +``` + +This reduces subagent startup latency and saves compute. ## Usage Examples diff --git a/tools/agent/agent-loop.cpp b/tools/agent/agent-loop.cpp index 04fc2d7008f..b03192441d4 100644 --- a/tools/agent/agent-loop.cpp +++ b/tools/agent/agent-loop.cpp @@ -70,12 +70,34 @@ agent_loop::agent_loop(server_context & server_ctx, tool_ctx_.server_ctx_ptr = &server_ctx_; tool_ctx_.agent_config_ptr = const_cast(&config_); tool_ctx_.common_params_ptr = const_cast(¶ms); + tool_ctx_.session_stats_ptr = &stats_; tool_ctx_.subagent_depth = 0; // Set up permission manager permission_mgr_.set_project_root(tool_ctx_.working_dir); permission_mgr_.set_yolo_mode(config.yolo_mode); + // Base prompt shared with subagents for KV cache prefix sharing + // Subagent prompts start with this exact text to maximize cache hits + static const char * BASE_PROMPT_PREFIX = R"(You are llama-agent, a powerful local AI coding assistant running on llama.cpp. + +You help users with software engineering tasks by reading files, writing code, running commands, and navigating codebases. You run entirely on the user's machine - no data leaves their system. + +# Tools + +You have access to the following tools: + +- **bash**: Execute shell commands. Use for git, build commands, running tests, etc. +- **read**: Read file contents with line numbers. Always read files before editing them. +- **write**: Create new files or overwrite existing ones. +- **edit**: Make targeted edits using search/replace. The old_string must match exactly. Use replace_all=true to replace all occurrences of a word or phrase. +- **glob**: Find files matching a pattern. Use to explore project structure. + +)"; + + // Store base prompt for subagents to inherit (enables KV cache prefix sharing) + tool_ctx_.base_system_prompt = BASE_PROMPT_PREFIX; + // Add system prompt for tool usage std::string system_prompt = R"(You are llama-agent, a powerful local AI coding assistant running on llama.cpp. @@ -265,6 +287,7 @@ agent_loop::agent_loop(server_context & server_ctx, tool_ctx_.server_ctx_ptr = &server_ctx_; tool_ctx_.agent_config_ptr = const_cast(&config_); tool_ctx_.common_params_ptr = const_cast(¶ms); + tool_ctx_.session_stats_ptr = &stats_; tool_ctx_.subagent_depth = subagent_depth; // Set up permission manager @@ -286,6 +309,9 @@ void agent_loop::clear() { messages_.push_back(system_msg); } permission_mgr_.clear_session(); + + // Reset stats when conversation is cleared + stats_ = session_stats{}; } common_chat_msg agent_loop::generate_completion(result_timings & out_timings) { diff --git a/tools/agent/agent-loop.h b/tools/agent/agent-loop.h index efb2990cf9c..01abe15df1a 100644 --- a/tools/agent/agent-loop.h +++ b/tools/agent/agent-loop.h @@ -61,6 +61,12 @@ struct session_stats { int32_t total_cached = 0; // Total tokens served from KV cache double total_prompt_ms = 0; // Total prompt evaluation time double total_predicted_ms = 0; // Total generation time + + // Subagent-specific stats (subset of totals above) + int32_t subagent_input = 0; // Prompt tokens from subagents + int32_t subagent_output = 0; // Output tokens from subagents + int32_t subagent_cached = 0; // Cached tokens from subagents + int32_t subagent_count = 0; // Number of subagent runs }; // The main agent loop class diff --git a/tools/agent/agent.cpp b/tools/agent/agent.cpp index dcc147157d4..e4dd6138a7b 100644 --- a/tools/agent/agent.cpp +++ b/tools/agent/agent.cpp @@ -452,6 +452,27 @@ int main(int argc, char ** argv) { console::log(" Cached tokens: %d\n", stats.total_cached); } console::log(" Total tokens: %d\n", stats.total_input + stats.total_output); + + // Show subagent breakdown if any subagents were used + if (stats.subagent_count > 0) { + console::log("\n Subagent breakdown (%d run%s):\n", + stats.subagent_count, stats.subagent_count == 1 ? "" : "s"); + console::log(" Prompt tokens: %d\n", stats.subagent_input); + console::log(" Output tokens: %d\n", stats.subagent_output); + if (stats.subagent_cached > 0) { + console::log(" Cached tokens: %d\n", stats.subagent_cached); + } + console::log(" Total tokens: %d\n", stats.subagent_input + stats.subagent_output); + + // Show main agent stats (total minus subagent) + int32_t main_input = stats.total_input - stats.subagent_input; + int32_t main_output = stats.total_output - stats.subagent_output; + console::log("\n Main agent:\n"); + console::log(" Prompt tokens: %d\n", main_input); + console::log(" Output tokens: %d\n", main_output); + console::log(" Total tokens: %d\n", main_input + main_output); + } + if (stats.total_prompt_ms > 0) { console::log(" Prompt time: %.2fs\n", stats.total_prompt_ms / 1000.0); } diff --git a/tools/agent/subagent/subagent-display.cpp b/tools/agent/subagent/subagent-display.cpp index e447e16422e..e9e7021d5c0 100644 --- a/tools/agent/subagent/subagent-display.cpp +++ b/tools/agent/subagent/subagent-display.cpp @@ -1,4 +1,5 @@ #include "subagent-display.h" +#include "subagent-output.h" #include "console.h" #include @@ -31,89 +32,162 @@ subagent_display & subagent_display::instance() { void subagent_display::print_header(int depth, const std::string & icon, const std::string & name, const std::string & type_name, - const std::string & description) { + const std::string & description, + subagent_output_buffer * buffer) { std::string prefix = subagent_indent_prefix(depth); - // Print: ┌── ⚡ name (type) - console::log("\n%s%s%s%s ", prefix.c_str(), TREE_CORNER_TOP, TREE_HORIZONTAL, TREE_HORIZONTAL); - console::log("%s ", icon.c_str()); - console::set_display(DISPLAY_TYPE_SUBAGENT); - console::log("%s", name.c_str()); - console::set_display(DISPLAY_TYPE_RESET); - console::set_display(DISPLAY_TYPE_REASONING); - console::log(" (%s)\n", type_name.c_str()); - console::set_display(DISPLAY_TYPE_RESET); - - // Print description if provided - if (!description.empty()) { - console::log("%s%s ", prefix.c_str(), TREE_VERTICAL); - console::set_display(DISPLAY_TYPE_REASONING); - console::log("%s\n", description.c_str()); - console::set_display(DISPLAY_TYPE_RESET); + if (buffer) { + // Buffered mode: write to buffer + buffer->write(DISPLAY_TYPE_RESET, "\n%s%s%s%s ", prefix.c_str(), TREE_CORNER_TOP, TREE_HORIZONTAL, TREE_HORIZONTAL); + buffer->write(DISPLAY_TYPE_RESET, "%s ", icon.c_str()); + buffer->write(DISPLAY_TYPE_SUBAGENT, "%s", name.c_str()); + buffer->write(DISPLAY_TYPE_REASONING, " (%s)\n", type_name.c_str()); + + if (!description.empty()) { + buffer->write(DISPLAY_TYPE_RESET, "%s%s ", prefix.c_str(), TREE_VERTICAL); + buffer->write(DISPLAY_TYPE_REASONING, "%s\n", description.c_str()); + } + } else { + // Direct mode: use output_guard for atomic output + console::output_guard guard; + + guard.write("\n%s%s%s%s ", prefix.c_str(), TREE_CORNER_TOP, TREE_HORIZONTAL, TREE_HORIZONTAL); + guard.write("%s ", icon.c_str()); + guard.set_display(DISPLAY_TYPE_SUBAGENT); + guard.write("%s", name.c_str()); + guard.set_display(DISPLAY_TYPE_RESET); + guard.set_display(DISPLAY_TYPE_REASONING); + guard.write(" (%s)\n", type_name.c_str()); + guard.set_display(DISPLAY_TYPE_RESET); + + if (!description.empty()) { + guard.write("%s%s ", prefix.c_str(), TREE_VERTICAL); + guard.set_display(DISPLAY_TYPE_REASONING); + guard.write("%s\n", description.c_str()); + guard.set_display(DISPLAY_TYPE_RESET); + } } } void subagent_display::print_tool_call(int depth, const std::string & tool_name, const std::string & args_summary, - int elapsed_ms) { + int elapsed_ms, + subagent_output_buffer * buffer) { std::string prefix = subagent_indent_prefix(depth); - // Print: │ ├─› tool_name args (timing) - console::log("%s%s %s%s%s ", prefix.c_str(), TREE_VERTICAL, TREE_TEE, TREE_HORIZONTAL, ARROW_RIGHT); - console::set_display(DISPLAY_TYPE_INFO); - console::log("%s", tool_name.c_str()); - console::set_display(DISPLAY_TYPE_RESET); + if (buffer) { + // Buffered mode + buffer->write(DISPLAY_TYPE_RESET, "%s%s %s%s%s ", prefix.c_str(), TREE_VERTICAL, TREE_TEE, TREE_HORIZONTAL, ARROW_RIGHT); + buffer->write(DISPLAY_TYPE_INFO, "%s", tool_name.c_str()); - if (!args_summary.empty()) { - console::log(" %s", args_summary.c_str()); - } + if (!args_summary.empty()) { + buffer->write(DISPLAY_TYPE_RESET, " %s", args_summary.c_str()); + } - console::log(" "); - console::set_display(DISPLAY_TYPE_REASONING); - if (elapsed_ms < 1000) { - console::log("(%dms)", elapsed_ms); + buffer->write(DISPLAY_TYPE_RESET, " "); + if (elapsed_ms < 1000) { + buffer->write(DISPLAY_TYPE_REASONING, "(%dms)", elapsed_ms); + } else { + buffer->write(DISPLAY_TYPE_REASONING, "(%.1fs)", elapsed_ms / 1000.0); + } + buffer->write(DISPLAY_TYPE_RESET, "\n"); } else { - console::log("(%.1fs)", elapsed_ms / 1000.0); - } - console::set_display(DISPLAY_TYPE_RESET); - console::log("\n"); -} + // Direct mode + console::output_guard guard; -void subagent_display::print_done(int depth, int elapsed_ms) { - std::string prefix = subagent_indent_prefix(depth); + guard.write("%s%s %s%s%s ", prefix.c_str(), TREE_VERTICAL, TREE_TEE, TREE_HORIZONTAL, ARROW_RIGHT); + guard.set_display(DISPLAY_TYPE_INFO); + guard.write("%s", tool_name.c_str()); + guard.set_display(DISPLAY_TYPE_RESET); - // Print: │ └── done (timing) - console::log("%s%s %s%s%s ", prefix.c_str(), TREE_VERTICAL, TREE_CORNER_BOTTOM, TREE_HORIZONTAL, TREE_HORIZONTAL); - console::set_display(DISPLAY_TYPE_INFO); - console::log("done"); - console::set_display(DISPLAY_TYPE_RESET); + if (!args_summary.empty()) { + guard.write(" %s", args_summary.c_str()); + } - if (elapsed_ms > 0) { - console::log(" "); - console::set_display(DISPLAY_TYPE_REASONING); + guard.write(" "); + guard.set_display(DISPLAY_TYPE_REASONING); if (elapsed_ms < 1000) { - console::log("(%dms)", elapsed_ms); + guard.write("(%dms)", elapsed_ms); } else { - console::log("(%.1fs)", elapsed_ms / 1000.0); + guard.write("(%.1fs)", elapsed_ms / 1000.0); + } + guard.set_display(DISPLAY_TYPE_RESET); + guard.write("\n"); + } +} + +void subagent_display::print_done(int depth, int elapsed_ms, + subagent_output_buffer * buffer) { + std::string prefix = subagent_indent_prefix(depth); + + if (buffer) { + // Buffered mode + buffer->write(DISPLAY_TYPE_RESET, "%s%s %s%s%s ", prefix.c_str(), TREE_VERTICAL, TREE_CORNER_BOTTOM, TREE_HORIZONTAL, TREE_HORIZONTAL); + buffer->write(DISPLAY_TYPE_INFO, "done"); + + if (elapsed_ms > 0) { + buffer->write(DISPLAY_TYPE_RESET, " "); + if (elapsed_ms < 1000) { + buffer->write(DISPLAY_TYPE_REASONING, "(%dms)", elapsed_ms); + } else { + buffer->write(DISPLAY_TYPE_REASONING, "(%.1fs)", elapsed_ms / 1000.0); + } + } + buffer->write(DISPLAY_TYPE_RESET, "\n"); + } else { + // Direct mode + console::output_guard guard; + + guard.write("%s%s %s%s%s ", prefix.c_str(), TREE_VERTICAL, TREE_CORNER_BOTTOM, TREE_HORIZONTAL, TREE_HORIZONTAL); + guard.set_display(DISPLAY_TYPE_INFO); + guard.write("done"); + guard.set_display(DISPLAY_TYPE_RESET); + + if (elapsed_ms > 0) { + guard.write(" "); + guard.set_display(DISPLAY_TYPE_REASONING); + if (elapsed_ms < 1000) { + guard.write("(%dms)", elapsed_ms); + } else { + guard.write("(%.1fs)", elapsed_ms / 1000.0); + } + guard.set_display(DISPLAY_TYPE_RESET); } - console::set_display(DISPLAY_TYPE_RESET); + guard.write("\n"); } - console::log("\n"); } // scope implementation +// Direct mode constructor (synchronous tasks) subagent_display::scope::scope(subagent_display & display, const std::string & name, subagent_type type, const std::string & description) : display_(display) + , buffer_(nullptr) +{ + std::lock_guard lock(display_.mtx_); + depth_ = display_.depth_.fetch_add(1); + + const auto & config = get_subagent_config(type); + display_.print_header(depth_, config.icon, name, config.name, description, buffer_); +} + +// Buffered mode constructor (background tasks) +subagent_display::scope::scope(subagent_display & display, + const std::string & name, + subagent_type type, + const std::string & description, + subagent_output_buffer * buffer) + : display_(display) + , buffer_(buffer) { std::lock_guard lock(display_.mtx_); depth_ = display_.depth_.fetch_add(1); const auto & config = get_subagent_config(type); - display_.print_header(depth_, config.icon, name, config.name, description); + display_.print_header(depth_, config.icon, name, config.name, description, buffer_); } subagent_display::scope::~scope() { @@ -121,7 +195,7 @@ subagent_display::scope::~scope() { display_.depth_.fetch_sub(1); if (!done_reported_) { - display_.print_done(depth_, 0); + display_.print_done(depth_, 0, buffer_); } } @@ -129,11 +203,11 @@ void subagent_display::scope::report_tool_call(const std::string & tool_name, const std::string & args_summary, int elapsed_ms) { std::lock_guard lock(display_.mtx_); - display_.print_tool_call(depth_, tool_name, args_summary, elapsed_ms); + display_.print_tool_call(depth_, tool_name, args_summary, elapsed_ms, buffer_); } void subagent_display::scope::report_done(int elapsed_ms) { std::lock_guard lock(display_.mtx_); - display_.print_done(depth_, elapsed_ms); + display_.print_done(depth_, elapsed_ms, buffer_); done_reported_ = true; } diff --git a/tools/agent/subagent/subagent-display.h b/tools/agent/subagent/subagent-display.h index 709914a7ffa..da592cf1117 100644 --- a/tools/agent/subagent/subagent-display.h +++ b/tools/agent/subagent/subagent-display.h @@ -6,16 +6,28 @@ #include #include +// Forward declaration +class subagent_output_buffer; + // Manages nested visual output for subagent execution class subagent_display { public: // RAII class for managing a subagent display scope class scope { public: + // Direct mode (synchronous tasks) - outputs immediately to console scope(subagent_display & display, const std::string & name, subagent_type type, const std::string & description); + + // Buffered mode (background tasks) - collects output in buffer + scope(subagent_display & display, + const std::string & name, + subagent_type type, + const std::string & description, + subagent_output_buffer * buffer); + ~scope(); // Prevent copying @@ -32,6 +44,7 @@ class subagent_display { private: subagent_display & display_; + subagent_output_buffer * buffer_ = nullptr; // nullptr = direct mode int depth_; bool done_reported_ = false; }; @@ -59,11 +72,15 @@ class subagent_display { int max_depth_ = 1; // Print tree characters for current depth + // If buffer is provided, output goes to buffer; otherwise to console void print_header(int depth, const std::string & icon, const std::string & name, - const std::string & type_name, const std::string & description); + const std::string & type_name, const std::string & description, + subagent_output_buffer * buffer = nullptr); void print_tool_call(int depth, const std::string & tool_name, - const std::string & args_summary, int elapsed_ms); - void print_done(int depth, int elapsed_ms); + const std::string & args_summary, int elapsed_ms, + subagent_output_buffer * buffer = nullptr); + void print_done(int depth, int elapsed_ms, + subagent_output_buffer * buffer = nullptr); friend class scope; }; diff --git a/tools/agent/subagent/subagent-output.cpp b/tools/agent/subagent/subagent-output.cpp new file mode 100644 index 00000000000..f8e98ef6e40 --- /dev/null +++ b/tools/agent/subagent/subagent-output.cpp @@ -0,0 +1,165 @@ +#include "subagent-output.h" + +#include +#include +#include + +// +// subagent_output_buffer implementation +// + +subagent_output_buffer::subagent_output_buffer(const std::string & task_id) + : task_id_(task_id) +{ +} + +void subagent_output_buffer::write(display_type type, const char * fmt, ...) { + va_list args; + va_start(args, fmt); + + // Format the string + va_list args_copy; + va_copy(args_copy, args); + int size = vsnprintf(nullptr, 0, fmt, args_copy); + va_end(args_copy); + + std::string content(size + 1, '\0'); + vsnprintf(&content[0], size + 1, fmt, args); + content.resize(size); // Remove trailing null + + va_end(args); + + // Add to buffer + std::lock_guard lock(buffer_mutex_); + segments_.push_back({type, std::move(content)}); +} + +void subagent_output_buffer::write(const char * fmt, ...) { + va_list args; + va_start(args, fmt); + + // Format the string + va_list args_copy; + va_copy(args_copy, args); + int size = vsnprintf(nullptr, 0, fmt, args_copy); + va_end(args_copy); + + std::string content(size + 1, '\0'); + vsnprintf(&content[0], size + 1, fmt, args); + content.resize(size); + + va_end(args); + + // Add to buffer with default display type + std::lock_guard lock(buffer_mutex_); + segments_.push_back({DISPLAY_TYPE_RESET, std::move(content)}); +} + +void subagent_output_buffer::flush(bool with_task_prefix) { + std::lock_guard lock(buffer_mutex_); + + if (segments_.empty()) { + return; + } + + // Use output_guard to hold the console mutex for atomic output + console::output_guard guard; + + // Build prefix string + std::string prefix; + if (with_task_prefix && !task_id_.empty()) { + // Shorten task ID for display (task-abc12345 -> abc1) + std::string short_id = task_id_; + if (short_id.substr(0, 5) == "task-" && short_id.length() > 9) { + short_id = short_id.substr(5, 4); + } + prefix = "[" + short_id + "] "; + } + + // Track if we're at start of a line (for prefixing) + bool at_line_start = true; + + for (const auto & seg : segments_) { + guard.set_display(seg.type); + + // Process content character by character to handle newlines + for (size_t i = 0; i < seg.content.size(); ++i) { + char c = seg.content[i]; + + if (at_line_start && !prefix.empty()) { + guard.set_display(DISPLAY_TYPE_REASONING); // Dim prefix + guard.write("%s", prefix.c_str()); + guard.set_display(seg.type); // Restore segment color + at_line_start = false; + } + + guard.write("%c", c); + + if (c == '\n') { + at_line_start = true; + } + } + } + + guard.set_display(DISPLAY_TYPE_RESET); + guard.flush(); + + // Clear buffer after flush + segments_.clear(); +} + +void subagent_output_buffer::clear() { + std::lock_guard lock(buffer_mutex_); + segments_.clear(); +} + +bool subagent_output_buffer::empty() const { + std::lock_guard lock(buffer_mutex_); + return segments_.empty(); +} + +// +// subagent_output_manager implementation +// + +subagent_output_manager & subagent_output_manager::instance() { + static subagent_output_manager instance; + return instance; +} + +subagent_output_buffer * subagent_output_manager::create_buffer(const std::string & task_id) { + std::lock_guard lock(buffers_mutex_); + + auto buffer = std::make_unique(task_id); + auto * ptr = buffer.get(); + buffers_[task_id] = std::move(buffer); + return ptr; +} + +subagent_output_buffer * subagent_output_manager::get_buffer(const std::string & task_id) { + std::lock_guard lock(buffers_mutex_); + + auto it = buffers_.find(task_id); + if (it == buffers_.end()) { + return nullptr; + } + return it->second.get(); +} + +void subagent_output_manager::remove_buffer(const std::string & task_id) { + std::lock_guard lock(buffers_mutex_); + buffers_.erase(task_id); +} + +void subagent_output_manager::flush_all() { + std::lock_guard lock(buffers_mutex_); + + for (auto & [id, buffer] : buffers_) { + buffer->flush(true); + } +} + +size_t subagent_output_manager::active_count() const { + std::lock_guard lock(buffers_mutex_); + return buffers_.size(); +} diff --git a/tools/agent/subagent/subagent-output.h b/tools/agent/subagent/subagent-output.h new file mode 100644 index 00000000000..e7fc5b4f77a --- /dev/null +++ b/tools/agent/subagent/subagent-output.h @@ -0,0 +1,74 @@ +#pragma once + +#include "console.h" + +#include +#include +#include +#include +#include + +// Represents a single output segment with display type +struct output_segment { + display_type type = DISPLAY_TYPE_RESET; + std::string content; +}; + +// Buffered output for a single subagent task +// Collects output segments and flushes them atomically to console +class subagent_output_buffer { +public: + explicit subagent_output_buffer(const std::string & task_id); + + // Buffer text with a display type + void write(display_type type, const char * fmt, ...); + + // Buffer text without changing display type (uses DISPLAY_TYPE_RESET) + void write(const char * fmt, ...); + + // Flush all buffered content atomically to console + // Optionally prefix each line with task ID + void flush(bool with_task_prefix = true); + + // Clear buffer without flushing + void clear(); + + // Check if buffer has content + bool empty() const; + + // Get task ID + const std::string & task_id() const { return task_id_; } + +private: + std::string task_id_; + mutable std::mutex buffer_mutex_; + std::vector segments_; +}; + +// Manager for all active subagent output buffers +// Thread-safe singleton +class subagent_output_manager { +public: + static subagent_output_manager & instance(); + + // Create buffer for a new task (returns raw pointer, manager owns the buffer) + subagent_output_buffer * create_buffer(const std::string & task_id); + + // Get buffer for an existing task (returns nullptr if not found) + subagent_output_buffer * get_buffer(const std::string & task_id); + + // Remove and destroy buffer for a task + void remove_buffer(const std::string & task_id); + + // Flush all buffers (for status display or shutdown) + void flush_all(); + + // Get count of active buffers + size_t active_count() const; + +private: + subagent_output_manager() = default; + + mutable std::mutex buffers_mutex_; + std::map> buffers_; +}; diff --git a/tools/agent/subagent/subagent-runner.cpp b/tools/agent/subagent/subagent-runner.cpp index 575a7efa15c..7d4a8c49670 100644 --- a/tools/agent/subagent/subagent-runner.cpp +++ b/tools/agent/subagent/subagent-runner.cpp @@ -1,5 +1,6 @@ #include "subagent-runner.h" #include "subagent-display.h" +#include "subagent-output.h" #include "../agent-loop.h" #include "common.h" @@ -23,11 +24,21 @@ std::string subagent_runner::build_system_prompt(subagent_type type) const { const auto & config = get_subagent_config(type); std::ostringstream prompt; - prompt << "You are a specialized " << config.name << " subagent.\n\n"; + + // Start with parent's base prompt to enable KV cache prefix sharing + // The server's prompt cache will detect the common prefix and reuse cached tokens + if (!parent_tool_ctx_.base_system_prompt.empty()) { + prompt << parent_tool_ctx_.base_system_prompt; + prompt << "# Subagent Mode: " << config.name << "\n\n"; + } else { + // Fallback if no base prompt (shouldn't happen in normal operation) + prompt << "You are a specialized " << config.name << " subagent.\n\n"; + } + prompt << config.description << "\n\n"; - // Add tool restrictions - prompt << "# Available Tools\n\n"; + // Add tool restrictions (overrides base prompt's tool list) + prompt << "## Tools Available in This Mode\n\n"; prompt << "You have access to: "; bool first = true; for (const auto & tool : config.allowed_tools) { @@ -115,13 +126,29 @@ std::string subagent_runner::generate_task_id() { } subagent_result subagent_runner::run(const subagent_params & params) { + // Synchronous run uses direct console output (no buffer) + return run_internal(params, nullptr); +} + +subagent_result subagent_runner::run_internal(const subagent_params & params, + subagent_output_buffer * buffer) { subagent_result result; const auto & type_config = get_subagent_config(params.type); // Start display scope + // If buffer is provided (background mode), output goes to buffer + // Otherwise (synchronous mode), output goes directly to console auto & display = subagent_display::instance(); - subagent_display::scope display_scope(display, params.description.empty() ? "subagent" : params.description, - params.type, params.prompt.substr(0, 60) + (params.prompt.length() > 60 ? "..." : "")); + std::string desc = params.description.empty() ? "subagent" : params.description; + std::string prompt_preview = params.prompt.substr(0, 60) + (params.prompt.length() > 60 ? "..." : ""); + + // Use the appropriate constructor based on whether we have a buffer + std::unique_ptr display_scope; + if (buffer) { + display_scope = std::make_unique(display, desc, params.type, prompt_preview, buffer); + } else { + display_scope = std::make_unique(display, desc, params.type, prompt_preview); + } auto start_time = std::chrono::steady_clock::now(); @@ -144,7 +171,7 @@ subagent_result subagent_runner::run(const subagent_params & params) { auto tool_callback = [&display_scope, &result](const std::string & tool_name, const std::string & args_summary, int elapsed_ms) { - display_scope.report_tool_call(tool_name, args_summary, elapsed_ms); + display_scope->report_tool_call(tool_name, args_summary, elapsed_ms); // Also track in result std::ostringstream summary; summary << tool_name << " (" << elapsed_ms << "ms)"; @@ -166,11 +193,17 @@ subagent_result subagent_runner::run(const subagent_params & params) { auto elapsed_ms = std::chrono::duration_cast(end_time - start_time).count(); // Report completion - display_scope.report_done(static_cast(elapsed_ms)); + display_scope->report_done(static_cast(elapsed_ms)); // Convert result result.iterations = loop_result.iterations; + // Collect token stats from the subagent + const auto & subagent_stats = subagent.get_stats(); + result.input_tokens = subagent_stats.total_input; + result.output_tokens = subagent_stats.total_output; + result.cached_tokens = subagent_stats.total_cached; + switch (loop_result.stop_reason) { case agent_stop_reason::COMPLETED: result.success = true; @@ -200,6 +233,10 @@ subagent_result subagent_runner::run(const subagent_params & params) { std::string subagent_runner::start_background(const subagent_params & params) { std::string task_id = generate_task_id(); + // Create output buffer for this background task + auto & output_mgr = subagent_output_manager::instance(); + subagent_output_buffer * buffer = output_mgr.create_buffer(task_id); + auto task = std::make_unique(); task->id = task_id; task->params = params; @@ -207,17 +244,24 @@ std::string subagent_runner::start_background(const subagent_params & params) { // Capture what we need for the thread auto * task_ptr = task.get(); - // Start background thread - task->thread = std::thread([this, task_ptr, params]() { + // Start background thread with buffer for output + task->thread = std::thread([this, task_ptr, params, buffer, task_id]() { subagent_result result; try { - result = this->run(params); + result = this->run_internal(params, buffer); } catch (const std::exception & e) { result.success = false; result.error = std::string("Exception: ") + e.what(); } + + // Flush buffered output before completing + buffer->flush(true); + task_ptr->promise.set_value(std::move(result)); task_ptr->complete.store(true); + + // Cleanup buffer + subagent_output_manager::instance().remove_buffer(task_id); }); // Store task diff --git a/tools/agent/subagent/subagent-runner.h b/tools/agent/subagent/subagent-runner.h index b7b6a60dbbb..1912f42c74f 100644 --- a/tools/agent/subagent/subagent-runner.h +++ b/tools/agent/subagent/subagent-runner.h @@ -16,6 +16,7 @@ struct server_context; struct agent_config; struct common_params; +class subagent_output_buffer; // Parameters for running a subagent struct subagent_params { @@ -31,6 +32,11 @@ struct subagent_result { std::string error; int iterations = 0; std::vector tool_calls_summary; // List of tools called with timing + + // Token statistics from the subagent run + int32_t input_tokens = 0; + int32_t output_tokens = 0; + int32_t cached_tokens = 0; }; // Background task state @@ -98,4 +104,8 @@ class subagent_runner { // Generate unique task ID static std::string generate_task_id(); + + // Internal run method with optional buffer for background tasks + subagent_result run_internal(const subagent_params & params, + subagent_output_buffer * buffer); }; diff --git a/tools/agent/tool-registry.h b/tools/agent/tool-registry.h index 3f2b5a5207c..6e70d1db2af 100644 --- a/tools/agent/tool-registry.h +++ b/tools/agent/tool-registry.h @@ -22,7 +22,12 @@ struct tool_context { void * server_ctx_ptr = nullptr; // Pointer to server_context void * agent_config_ptr = nullptr; // Pointer to agent_config void * common_params_ptr = nullptr; // Pointer to common_params (for model inference params) + void * session_stats_ptr = nullptr; // Pointer to session_stats (for tracking subagent tokens) int subagent_depth = 0; // Current nesting depth (0 = main agent) + + // Prefix caching: base system prompt shared between parent and subagents + // Subagent prompts start with this prefix to maximize KV cache reuse + std::string base_system_prompt; }; // Result returned from tool execution diff --git a/tools/agent/tools/tool-task.cpp b/tools/agent/tools/tool-task.cpp index 922f3575cb9..bc6453f5f4d 100644 --- a/tools/agent/tools/tool-task.cpp +++ b/tools/agent/tools/tool-task.cpp @@ -40,6 +40,26 @@ static subagent_runner & get_runner(const tool_context & ctx) { return *g_runners[key]; } +// Update parent's session stats with subagent token usage +static void update_parent_stats(const tool_context & ctx, const subagent_result & result) { + if (!ctx.session_stats_ptr) { + return; + } + + auto * stats = static_cast(ctx.session_stats_ptr); + + // Add to subagent-specific counters (for breakdown display) + stats->subagent_input += result.input_tokens; + stats->subagent_output += result.output_tokens; + stats->subagent_cached += result.cached_tokens; + stats->subagent_count++; + + // Also add to totals so "Main = Total - Subagent" works correctly + stats->total_input += result.input_tokens; + stats->total_output += result.output_tokens; + stats->total_cached += result.cached_tokens; +} + static tool_result task_execute(const json & args, const tool_context & ctx) { // Check depth limit auto & display = subagent_display::instance(); @@ -69,6 +89,9 @@ static tool_result task_execute(const json & args, const tool_context & ctx) { if (runner.is_complete(resume_id)) { subagent_result result = runner.get_result(resume_id); + // Update parent stats with subagent token usage + update_parent_stats(ctx, result); + // Format output std::ostringstream output; output << "Background task " << resume_id << " completed"; @@ -143,6 +166,9 @@ static tool_result task_execute(const json & args, const tool_context & ctx) { // Run synchronously subagent_result result = runner.run(task_params); + // Update parent stats with subagent token usage + update_parent_stats(ctx, result); + // Format output std::ostringstream output; output << "Subagent (" << subagent_type_name(type) << ") ";