diff --git a/common/console.cpp b/common/console.cpp
index 857e9484519..d03aa5ff90f 100644
--- a/common/console.cpp
+++ b/common/console.cpp
@@ -83,6 +83,10 @@ namespace console {
 
     static bool         bracket_paste_mode = false;  // true when inside ESC[200~ ... ESC[201~
 
+    // Thread-safety mutex for console operations (std::mutex is non-recursive: never lock it twice on one thread)
+    // Protects: current_display, out (FILE*), and all console output operations
+    static std::mutex   g_console_mutex;
+
     //
     // Init and cleanup
     //
@@ -179,7 +183,8 @@ namespace console {
     //
 
     // Keep track of current display and only emit ANSI code if it changes
-    void set_display(display_type display) {
+    // Internal version without lock (for use when lock is already held)
+    static void set_display_unlocked(display_type display) {
         if (advanced_display && current_display != display) {
             common_log_flush(common_log_main());
             switch(display) {
@@ -210,6 +215,11 @@ namespace console {
         }
     }
 
+    void set_display(display_type display) {
+        std::lock_guard<std::mutex> lock(g_console_mutex);  // public entry point takes the console lock itself
+        set_display_unlocked(display);                       // shared worker, expects the lock to be held already
+    }
+
     static char32_t getchar32() {
 #if defined(_WIN32)
         HANDLE hConsole = GetStdHandle(STD_INPUT_HANDLE);
@@ -1198,6 +1208,7 @@ namespace console {
     }
 
     void log(const char * fmt, ...) {
+        std::lock_guard<std::mutex> lock(g_console_mutex);
         va_list args;
         va_start(args, fmt);
         vfprintf(out, fmt, args);
@@ -1205,16 +1216,45 @@ namespace console {
     }
 
     void error(const char * fmt, ...) {
+        std::lock_guard<std::mutex> lock(g_console_mutex);
         va_list args;
         va_start(args, fmt);
         display_type cur = current_display;
-        set_display(DISPLAY_TYPE_ERROR);
+        set_display_unlocked(DISPLAY_TYPE_ERROR);
         vfprintf(out, fmt, args);
-        set_display(cur); // restore previous color
+        set_display_unlocked(cur); // restore previous color
         va_end(args);
     }
 
     void flush() {
+        std::lock_guard<std::mutex> lock(g_console_mutex);
+        fflush(out);
+    }
+
+    //
+    // output_guard implementation
+    //
+
+    output_guard::output_guard() {
+        g_console_mutex.lock();  // stays locked until the destructor runs
+    }
+
+    output_guard::~output_guard() {
+        g_console_mutex.unlock();  // releases the lock taken by the constructor
+    }
+
+    void output_guard::write(const char * fmt, ...) {
+        va_list args;
+        va_start(args, fmt);
+        vfprintf(out, fmt, args);  // no locking here: the constructor already locked g_console_mutex
+        va_end(args);
+    }
+
+    void output_guard::set_display(display_type type) {
+        set_display_unlocked(type);  // unlocked variant: console::set_display() would lock g_console_mutex again
+    }
+
+    void output_guard::flush() {
         fflush(out);
     }
 }
diff --git a/common/console.h b/common/console.h
index 6d0db6420cc..24f90bed7ac 100644
--- a/common/console.h
+++ b/common/console.h
@@ -39,4 +39,27 @@ namespace console {
     void error(const char * fmt, ...);
 
     void flush();
+
+    // RAII guard for atomic multi-part console output: holds the console mutex for the
+    // lifetime of the object. The mutex is a plain std::mutex, so while a guard is alive the
+    // owning thread must use the guard's members, not console::log/error/set_display/flush
+    class output_guard {
+    public:
+        output_guard();   // locks the console mutex
+        ~output_guard();  // unlocks the console mutex
+
+        // Non-copyable, non-movable
+        output_guard(const output_guard &) = delete;
+        output_guard & operator=(const output_guard &) = delete;
+
+        // printf-style write to console (mutex already held)
+        LLAMA_COMMON_ATTRIBUTE_FORMAT(2, 3)
+        void write(const char * fmt, ...);
+
+        // Change display type (mutex already held)
+        void set_display(display_type type);
+
+        // Flush output (mutex already held)
+        void flush();
+    };
 }
diff --git a/tools/agent/CMakeLists.txt b/tools/agent/CMakeLists.txt
index 6d8c36369fd..673c20f0c2c 100644
--- a/tools/agent/CMakeLists.txt
+++ b/tools/agent/CMakeLists.txt
@@ -10,6 +10,7 @@ set(AGENT_SOURCES
     subagent/subagent-types.cpp
     subagent/subagent-display.cpp
     subagent/subagent-runner.cpp
+    subagent/subagent-output.cpp
     tools/tool-bash.cpp
     tools/tool-read.cpp
     tools/tool-write.cpp
diff --git a/tools/agent/README.md b/tools/agent/README.md
index c136fccd2bd..be649915762 100644
--- a/tools/agent/README.md
+++ b/tools/agent/README.md
@@ -54,6 +54,104 @@ llama-agent -hf unsloth/Nemotron-3-Nano-30B-A3B-GGUF:Q5_K_M
 | `write` | Create or overwrite files |
 | `edit` | Search and replace in files |
 | `glob` | Find files matching a pattern |
+| `task` | Spawn a subagent for complex tasks |
+
+## Subagents
+
+Subagents are specialized child agents that handle complex tasks independently, keeping the main conversation context clean and efficient.
+
+### Why Subagents?
+
+Without subagents, every file read and search pollutes your main context:
+
+```
+Main context after exploring codebase:
+├── glob **/*.cpp → 50 files (800 tokens)
+├── read src/main.cpp → full file (1,500 tokens)
+├── read src/utils.cpp → full file (2,200 tokens)
+├── grep "TODO" → 100 matches (1,200 tokens)
+└── Total: ~5,700 tokens consumed for ONE exploration
+```
+
+With subagents, only the summary enters main context:
+
+```
+Main context:
+└── task(explore) → "Found 3 TODO items in src/main.cpp:42,87,156" (50 tokens)
+
+Subagent context (discarded after):
+├── All the detailed exploration (~5,700 tokens)
+└── Summarized to parent
+```
+
+### Subagent Types
+
+| Type | Purpose | Tools Available |
+|------|---------|-----------------|
+| `explore` | Search and understand code (read-only) | `glob`, `read`, `bash` (read-only commands only) |
+| `bash` | Execute shell commands | `bash` |
+| `plan` | Design implementation approaches | `glob`, `read`, `bash` |
+| `general` | General-purpose tasks | All tools |
+
+### How It Works
+
+```
+┌─────────────────┐
+│   Main Agent    │  "Find where errors are handled"
+└────────┬────────┘
+         │ task(explore, "find error handling")
+         ▼
+┌─────────────────┐
+│    Subagent     │  Does detailed exploration:
+│    (explore)    │  - glob **/*.cpp
+│                 │  - read 5 files
+│                 │  - grep patterns
+└────────┬────────┘
+         │ Returns summary only
+         ▼
+┌─────────────────┐
+│   Main Agent    │  Receives: "Errors handled in src/error.cpp:45
+│                 │  via ErrorHandler class..."
+└─────────────────┘
+```
+
+### Memory Efficiency
+
+Subagents share the model - no additional VRAM is used:
+
+| Resource | Main Agent | Subagent | Total |
+|----------|------------|----------|-------|
+| Model weights | ✓ | Shared | 1x |
+| KV cache | ✓ | Shared via slots | 1x |
+| Context window | Own | Own (discarded after) | Efficient |
+
+### Parallel Execution
+
+Multiple subagents can run in the background simultaneously:
+
+```
+> Run tests and check for lint errors at the same time
+
+[task-a1b2] ┌── ⚡ run-tests (bash)
+[task-c3d4] ┌── ⚡ check-lint (bash)
+[task-a1b2] │   ├─› bash npm test (2.1s)
+[task-c3d4] │   ├─› bash npm run lint (1.8s)
+[task-c3d4] │   └── done (1.8s)
+[task-a1b2] │   └── done (2.1s)
+```
+
+### KV Cache Prefix Sharing
+
+Subagent prompts share a common prefix with the main agent, enabling automatic KV cache reuse:
+
+```
+Main agent prompt:    "You are llama-agent... [base] + [main agent instructions]"
+Subagent prompt:      "You are llama-agent... [base] + # Subagent Mode: explore..."
+                       ↑─────── shared prefix ──────↑
+                       Cached tokens reused, not re-processed
+```
+
+This reduces subagent startup latency and saves compute.
 
 ## Usage Examples
 
diff --git a/tools/agent/agent-loop.cpp b/tools/agent/agent-loop.cpp
index 04fc2d7008f..b03192441d4 100644
--- a/tools/agent/agent-loop.cpp
+++ b/tools/agent/agent-loop.cpp
@@ -70,12 +70,34 @@ agent_loop::agent_loop(server_context & server_ctx,
     tool_ctx_.server_ctx_ptr = &server_ctx_;
     tool_ctx_.agent_config_ptr = const_cast<agent_config *>(&config_);
     tool_ctx_.common_params_ptr = const_cast<common_params *>(&params);
+    tool_ctx_.session_stats_ptr = &stats_;
     tool_ctx_.subagent_depth = 0;
 
     // Set up permission manager
     permission_mgr_.set_project_root(tool_ctx_.working_dir);
     permission_mgr_.set_yolo_mode(config.yolo_mode);
 
+    // Base prompt shared with subagents for KV cache prefix sharing; subagent prompts start with
+    // this exact text, so keep it identical to the beginning of system_prompt below
+    static const char * BASE_PROMPT_PREFIX = R"(You are llama-agent, a powerful local AI coding assistant running on llama.cpp.
+
+You help users with software engineering tasks by reading files, writing code, running commands, and navigating codebases. You run entirely on the user's machine - no data leaves their system.
+
+# Tools
+
+You have access to the following tools:
+
+- **bash**: Execute shell commands. Use for git, build commands, running tests, etc.
+- **read**: Read file contents with line numbers. Always read files before editing them.
+- **write**: Create new files or overwrite existing ones.
+- **edit**: Make targeted edits using search/replace. The old_string must match exactly. Use replace_all=true to replace all occurrences of a word or phrase.
+- **glob**: Find files matching a pattern. Use to explore project structure.
+
+)";
+
+    // Store base prompt for subagents to inherit (enables KV cache prefix sharing)
+    tool_ctx_.base_system_prompt = BASE_PROMPT_PREFIX;
+
     // Add system prompt for tool usage
     std::string system_prompt = R"(You are llama-agent, a powerful local AI coding assistant running on llama.cpp.
 
@@ -265,6 +287,7 @@ agent_loop::agent_loop(server_context & server_ctx,
     tool_ctx_.server_ctx_ptr = &server_ctx_;
     tool_ctx_.agent_config_ptr = const_cast<agent_config *>(&config_);
     tool_ctx_.common_params_ptr = const_cast<common_params *>(&params);
+    tool_ctx_.session_stats_ptr = &stats_;
     tool_ctx_.subagent_depth = subagent_depth;
 
     // Set up permission manager
@@ -286,6 +309,9 @@ void agent_loop::clear() {
         messages_.push_back(system_msg);
     }
     permission_mgr_.clear_session();
+
+    // Reset stats when conversation is cleared
+    stats_ = session_stats{};
 }
 
 common_chat_msg agent_loop::generate_completion(result_timings & out_timings) {
diff --git a/tools/agent/agent-loop.h b/tools/agent/agent-loop.h
index efb2990cf9c..01abe15df1a 100644
--- a/tools/agent/agent-loop.h
+++ b/tools/agent/agent-loop.h
@@ -61,6 +61,12 @@ struct session_stats {
     int32_t total_cached = 0;      // Total tokens served from KV cache
     double total_prompt_ms = 0;    // Total prompt evaluation time
     double total_predicted_ms = 0; // Total generation time
+
+    // Subagent-specific stats (subset of totals above)
+    int32_t subagent_input = 0;    // Prompt tokens from subagents
+    int32_t subagent_output = 0;   // Output tokens from subagents
+    int32_t subagent_cached = 0;   // Cached tokens from subagents
+    int32_t subagent_count = 0;    // Number of subagent runs
 };
 
 // The main agent loop class
diff --git a/tools/agent/agent.cpp b/tools/agent/agent.cpp
index dcc147157d4..e4dd6138a7b 100644
--- a/tools/agent/agent.cpp
+++ b/tools/agent/agent.cpp
@@ -452,6 +452,27 @@ int main(int argc, char ** argv) {
                     console::log("  Cached tokens:  %d\n", stats.total_cached);
                 }
                 console::log("  Total tokens:   %d\n", stats.total_input + stats.total_output);
+
+                // Show subagent breakdown if any subagents were used
+                if (stats.subagent_count > 0) {
+                    console::log("\n  Subagent breakdown (%d run%s):\n",
+                        stats.subagent_count, stats.subagent_count == 1 ? "" : "s");
+                    console::log("    Prompt tokens:  %d\n", stats.subagent_input);
+                    console::log("    Output tokens:  %d\n", stats.subagent_output);
+                    if (stats.subagent_cached > 0) {
+                        console::log("    Cached tokens:  %d\n", stats.subagent_cached);
+                    }
+                    console::log("    Total tokens:   %d\n", stats.subagent_input + stats.subagent_output);
+
+                    // Show main agent stats (total minus subagent)
+                    int32_t main_input = stats.total_input - stats.subagent_input;
+                    int32_t main_output = stats.total_output - stats.subagent_output;
+                    console::log("\n  Main agent:\n");
+                    console::log("    Prompt tokens:  %d\n", main_input);
+                    console::log("    Output tokens:  %d\n", main_output);
+                    console::log("    Total tokens:   %d\n", main_input + main_output);
+                }
+
                 if (stats.total_prompt_ms > 0) {
                     console::log("  Prompt time:    %.2fs\n", stats.total_prompt_ms / 1000.0);
                 }
diff --git a/tools/agent/subagent/subagent-display.cpp b/tools/agent/subagent/subagent-display.cpp
index e447e16422e..e9e7021d5c0 100644
--- a/tools/agent/subagent/subagent-display.cpp
+++ b/tools/agent/subagent/subagent-display.cpp
@@ -1,4 +1,5 @@
 #include "subagent-display.h"
+#include "subagent-output.h"
 #include "console.h"
 
 #include <sstream>
@@ -31,89 +32,162 @@ subagent_display & subagent_display::instance() {
 void subagent_display::print_header(int depth, const std::string & icon,
                                      const std::string & name,
                                      const std::string & type_name,
-                                     const std::string & description) {
+                                     const std::string & description,
+                                     subagent_output_buffer * buffer) {
     std::string prefix = subagent_indent_prefix(depth);
 
-    // Print: ┌── ⚡ name (type)
-    console::log("\n%s%s%s%s ", prefix.c_str(), TREE_CORNER_TOP, TREE_HORIZONTAL, TREE_HORIZONTAL);
-    console::log("%s ", icon.c_str());
-    console::set_display(DISPLAY_TYPE_SUBAGENT);
-    console::log("%s", name.c_str());
-    console::set_display(DISPLAY_TYPE_RESET);
-    console::set_display(DISPLAY_TYPE_REASONING);
-    console::log(" (%s)\n", type_name.c_str());
-    console::set_display(DISPLAY_TYPE_RESET);
-
-    // Print description if provided
-    if (!description.empty()) {
-        console::log("%s%s   ", prefix.c_str(), TREE_VERTICAL);
-        console::set_display(DISPLAY_TYPE_REASONING);
-        console::log("%s\n", description.c_str());
-        console::set_display(DISPLAY_TYPE_RESET);
+    if (buffer) {
+        // Buffered mode: write to buffer
+        buffer->write(DISPLAY_TYPE_RESET, "\n%s%s%s%s ", prefix.c_str(), TREE_CORNER_TOP, TREE_HORIZONTAL, TREE_HORIZONTAL);
+        buffer->write(DISPLAY_TYPE_RESET, "%s ", icon.c_str());
+        buffer->write(DISPLAY_TYPE_SUBAGENT, "%s", name.c_str());
+        buffer->write(DISPLAY_TYPE_REASONING, " (%s)\n", type_name.c_str());
+
+        if (!description.empty()) {
+            buffer->write(DISPLAY_TYPE_RESET, "%s%s   ", prefix.c_str(), TREE_VERTICAL);
+            buffer->write(DISPLAY_TYPE_REASONING, "%s\n", description.c_str());
+        }
+    } else {
+        // Direct mode: use output_guard for atomic output
+        console::output_guard guard;
+
+        guard.write("\n%s%s%s%s ", prefix.c_str(), TREE_CORNER_TOP, TREE_HORIZONTAL, TREE_HORIZONTAL);
+        guard.write("%s ", icon.c_str());
+        guard.set_display(DISPLAY_TYPE_SUBAGENT);
+        guard.write("%s", name.c_str());
+        guard.set_display(DISPLAY_TYPE_RESET);
+        guard.set_display(DISPLAY_TYPE_REASONING);
+        guard.write(" (%s)\n", type_name.c_str());
+        guard.set_display(DISPLAY_TYPE_RESET);
+
+        if (!description.empty()) {
+            guard.write("%s%s   ", prefix.c_str(), TREE_VERTICAL);
+            guard.set_display(DISPLAY_TYPE_REASONING);
+            guard.write("%s\n", description.c_str());
+            guard.set_display(DISPLAY_TYPE_RESET);
+        }
     }
 }
 
 void subagent_display::print_tool_call(int depth, const std::string & tool_name,
                                         const std::string & args_summary,
-                                        int elapsed_ms) {
+                                        int elapsed_ms,
+                                        subagent_output_buffer * buffer) {
     std::string prefix = subagent_indent_prefix(depth);
 
-    // Print: │   ├─› tool_name args (timing)
-    console::log("%s%s   %s%s%s ", prefix.c_str(), TREE_VERTICAL, TREE_TEE, TREE_HORIZONTAL, ARROW_RIGHT);
-    console::set_display(DISPLAY_TYPE_INFO);
-    console::log("%s", tool_name.c_str());
-    console::set_display(DISPLAY_TYPE_RESET);
+    if (buffer) {
+        // Buffered mode
+        buffer->write(DISPLAY_TYPE_RESET, "%s%s   %s%s%s ", prefix.c_str(), TREE_VERTICAL, TREE_TEE, TREE_HORIZONTAL, ARROW_RIGHT);
+        buffer->write(DISPLAY_TYPE_INFO, "%s", tool_name.c_str());
 
-    if (!args_summary.empty()) {
-        console::log(" %s", args_summary.c_str());
-    }
+        if (!args_summary.empty()) {
+            buffer->write(DISPLAY_TYPE_RESET, " %s", args_summary.c_str());
+        }
 
-    console::log(" ");
-    console::set_display(DISPLAY_TYPE_REASONING);
-    if (elapsed_ms < 1000) {
-        console::log("(%dms)", elapsed_ms);
+        buffer->write(DISPLAY_TYPE_RESET, " ");
+        if (elapsed_ms < 1000) {
+            buffer->write(DISPLAY_TYPE_REASONING, "(%dms)", elapsed_ms);
+        } else {
+            buffer->write(DISPLAY_TYPE_REASONING, "(%.1fs)", elapsed_ms / 1000.0);
+        }
+        buffer->write(DISPLAY_TYPE_RESET, "\n");
     } else {
-        console::log("(%.1fs)", elapsed_ms / 1000.0);
-    }
-    console::set_display(DISPLAY_TYPE_RESET);
-    console::log("\n");
-}
+        // Direct mode
+        console::output_guard guard;
 
-void subagent_display::print_done(int depth, int elapsed_ms) {
-    std::string prefix = subagent_indent_prefix(depth);
+        guard.write("%s%s   %s%s%s ", prefix.c_str(), TREE_VERTICAL, TREE_TEE, TREE_HORIZONTAL, ARROW_RIGHT);
+        guard.set_display(DISPLAY_TYPE_INFO);
+        guard.write("%s", tool_name.c_str());
+        guard.set_display(DISPLAY_TYPE_RESET);
 
-    // Print: │   └── done (timing)
-    console::log("%s%s   %s%s%s ", prefix.c_str(), TREE_VERTICAL, TREE_CORNER_BOTTOM, TREE_HORIZONTAL, TREE_HORIZONTAL);
-    console::set_display(DISPLAY_TYPE_INFO);
-    console::log("done");
-    console::set_display(DISPLAY_TYPE_RESET);
+        if (!args_summary.empty()) {
+            guard.write(" %s", args_summary.c_str());
+        }
 
-    if (elapsed_ms > 0) {
-        console::log(" ");
-        console::set_display(DISPLAY_TYPE_REASONING);
+        guard.write(" ");
+        guard.set_display(DISPLAY_TYPE_REASONING);
         if (elapsed_ms < 1000) {
-            console::log("(%dms)", elapsed_ms);
+            guard.write("(%dms)", elapsed_ms);
         } else {
-            console::log("(%.1fs)", elapsed_ms / 1000.0);
+            guard.write("(%.1fs)", elapsed_ms / 1000.0);
+        }
+        guard.set_display(DISPLAY_TYPE_RESET);
+        guard.write("\n");
+    }
+}
+
+void subagent_display::print_done(int depth, int elapsed_ms,
+                                   subagent_output_buffer * buffer) {
+    std::string prefix = subagent_indent_prefix(depth);
+
+    if (buffer) {
+        // Buffered mode
+        buffer->write(DISPLAY_TYPE_RESET, "%s%s   %s%s%s ", prefix.c_str(), TREE_VERTICAL, TREE_CORNER_BOTTOM, TREE_HORIZONTAL, TREE_HORIZONTAL);
+        buffer->write(DISPLAY_TYPE_INFO, "done");
+
+        if (elapsed_ms > 0) {
+            buffer->write(DISPLAY_TYPE_RESET, " ");
+            if (elapsed_ms < 1000) {
+                buffer->write(DISPLAY_TYPE_REASONING, "(%dms)", elapsed_ms);
+            } else {
+                buffer->write(DISPLAY_TYPE_REASONING, "(%.1fs)", elapsed_ms / 1000.0);
+            }
+        }
+        buffer->write(DISPLAY_TYPE_RESET, "\n");
+    } else {
+        // Direct mode
+        console::output_guard guard;
+
+        guard.write("%s%s   %s%s%s ", prefix.c_str(), TREE_VERTICAL, TREE_CORNER_BOTTOM, TREE_HORIZONTAL, TREE_HORIZONTAL);
+        guard.set_display(DISPLAY_TYPE_INFO);
+        guard.write("done");
+        guard.set_display(DISPLAY_TYPE_RESET);
+
+        if (elapsed_ms > 0) {
+            guard.write(" ");
+            guard.set_display(DISPLAY_TYPE_REASONING);
+            if (elapsed_ms < 1000) {
+                guard.write("(%dms)", elapsed_ms);
+            } else {
+                guard.write("(%.1fs)", elapsed_ms / 1000.0);
+            }
+            guard.set_display(DISPLAY_TYPE_RESET);
         }
-        console::set_display(DISPLAY_TYPE_RESET);
+        guard.write("\n");
     }
-    console::log("\n");
 }
 
 // scope implementation
 
+// Direct mode constructor (synchronous tasks)
 subagent_display::scope::scope(subagent_display & display,
                                 const std::string & name,
                                 subagent_type type,
                                 const std::string & description)
     : display_(display)
+    , buffer_(nullptr)
+{
+    std::lock_guard<std::mutex> lock(display_.mtx_);
+    depth_ = display_.depth_.fetch_add(1);
+
+    const auto & config = get_subagent_config(type);
+    display_.print_header(depth_, config.icon, name, config.name, description, buffer_);
+}
+
+// Buffered mode constructor (background tasks)
+subagent_display::scope::scope(subagent_display & display,
+                                const std::string & name,
+                                subagent_type type,
+                                const std::string & description,
+                                subagent_output_buffer * buffer)
+    : display_(display)
+    , buffer_(buffer)
 {
     std::lock_guard<std::mutex> lock(display_.mtx_);
     depth_ = display_.depth_.fetch_add(1);
 
     const auto & config = get_subagent_config(type);
-    display_.print_header(depth_, config.icon, name, config.name, description);
+    display_.print_header(depth_, config.icon, name, config.name, description, buffer_);
 }
 
 subagent_display::scope::~scope() {
@@ -121,7 +195,7 @@ subagent_display::scope::~scope() {
     display_.depth_.fetch_sub(1);
 
     if (!done_reported_) {
-        display_.print_done(depth_, 0);
+        display_.print_done(depth_, 0, buffer_);
     }
 }
 
@@ -129,11 +203,11 @@ void subagent_display::scope::report_tool_call(const std::string & tool_name,
                                                 const std::string & args_summary,
                                                 int elapsed_ms) {
     std::lock_guard<std::mutex> lock(display_.mtx_);
-    display_.print_tool_call(depth_, tool_name, args_summary, elapsed_ms);
+    display_.print_tool_call(depth_, tool_name, args_summary, elapsed_ms, buffer_);
 }
 
 void subagent_display::scope::report_done(int elapsed_ms) {
     std::lock_guard<std::mutex> lock(display_.mtx_);
-    display_.print_done(depth_, elapsed_ms);
+    display_.print_done(depth_, elapsed_ms, buffer_);
     done_reported_ = true;
 }
diff --git a/tools/agent/subagent/subagent-display.h b/tools/agent/subagent/subagent-display.h
index 709914a7ffa..da592cf1117 100644
--- a/tools/agent/subagent/subagent-display.h
+++ b/tools/agent/subagent/subagent-display.h
@@ -6,16 +6,28 @@
 #include <mutex>
 #include <string>
 
+// Forward declaration
+class subagent_output_buffer;
+
 // Manages nested visual output for subagent execution
 class subagent_display {
 public:
     // RAII class for managing a subagent display scope
     class scope {
     public:
+        // Direct mode (synchronous tasks) - outputs immediately to console
         scope(subagent_display & display,
               const std::string & name,
               subagent_type type,
               const std::string & description);
+
+        // Buffered mode (background tasks) - collects output in buffer
+        scope(subagent_display & display,
+              const std::string & name,
+              subagent_type type,
+              const std::string & description,
+              subagent_output_buffer * buffer);
+
         ~scope();
 
         // Prevent copying
@@ -32,6 +44,7 @@ class subagent_display {
 
     private:
         subagent_display & display_;
+        subagent_output_buffer * buffer_ = nullptr;  // nullptr = direct mode
         int depth_;
         bool done_reported_ = false;
     };
@@ -59,11 +72,15 @@ class subagent_display {
     int max_depth_ = 1;
 
     // Print tree characters for current depth
+    // If buffer is provided, output goes to buffer; otherwise to console
     void print_header(int depth, const std::string & icon, const std::string & name,
-                      const std::string & type_name, const std::string & description);
+                      const std::string & type_name, const std::string & description,
+                      subagent_output_buffer * buffer = nullptr);
     void print_tool_call(int depth, const std::string & tool_name,
-                         const std::string & args_summary, int elapsed_ms);
-    void print_done(int depth, int elapsed_ms);
+                         const std::string & args_summary, int elapsed_ms,
+                         subagent_output_buffer * buffer = nullptr);
+    void print_done(int depth, int elapsed_ms,
+                    subagent_output_buffer * buffer = nullptr);
 
     friend class scope;
 };
diff --git a/tools/agent/subagent/subagent-output.cpp b/tools/agent/subagent/subagent-output.cpp
new file mode 100644
index 00000000000..f8e98ef6e40
--- /dev/null
+++ b/tools/agent/subagent/subagent-output.cpp
@@ -0,0 +1,165 @@
+#include "subagent-output.h"
+
+#include <cstdarg>
+#include <cstdio>
+#include <sstream>
+
+//
+// subagent_output_buffer implementation
+//
+
+subagent_output_buffer::subagent_output_buffer(const std::string & task_id)
+    : task_id_(task_id)  // kept for the "[id] " line prefix that flush() builds
+{
+}
+
+// Format printf-style text and append it to the buffer as one segment tagged with `type`.
+// If formatting fails (vsnprintf returns a negative value) nothing is buffered.
+void subagent_output_buffer::write(display_type type, const char * fmt, ...) {
+    va_list args;
+    va_start(args, fmt);
+    // Measure first; vsnprintf consumes its va_list, so the probe runs on a copy
+    va_list args_copy;
+    va_copy(args_copy, args);
+    const int size = vsnprintf(nullptr, 0, fmt, args_copy);
+    va_end(args_copy);
+
+    if (size < 0) { va_end(args); return; }  // resize(-1) below would become a huge size_t and throw
+    std::string content(static_cast<size_t>(size) + 1, '\0');
+    vsnprintf(&content[0], content.size(), fmt, args);
+    content.resize(static_cast<size_t>(size));  // Remove trailing null
+    va_end(args);
+
+    std::lock_guard<std::mutex> lock(buffer_mutex_);  // segments_ is shared with flush()/clear()
+    segments_.push_back({type, std::move(content)});
+}
+
+// Format printf-style text and append it as one segment with the default display type
+// (DISPLAY_TYPE_RESET). If formatting fails (negative vsnprintf result) nothing is buffered.
+void subagent_output_buffer::write(const char * fmt, ...) {
+    va_list args;
+    va_start(args, fmt);
+    // Measure first; vsnprintf consumes its va_list, so the probe runs on a copy
+    va_list args_copy;
+    va_copy(args_copy, args);
+    const int size = vsnprintf(nullptr, 0, fmt, args_copy);
+    va_end(args_copy);
+
+    if (size < 0) { va_end(args); return; }  // resize(-1) below would become a huge size_t and throw
+    std::string content(static_cast<size_t>(size) + 1, '\0');
+    vsnprintf(&content[0], content.size(), fmt, args);
+    content.resize(static_cast<size_t>(size));  // Remove trailing null
+    va_end(args);
+
+    std::lock_guard<std::mutex> lock(buffer_mutex_);  // segments_ is shared with flush()/clear()
+    segments_.push_back({DISPLAY_TYPE_RESET, std::move(content)});
+}
+
+// Print every buffered segment to the console as one uninterrupted block, then empty the buffer.
+// When with_task_prefix is set and a task ID exists, each output line starts with "[id] ".
+void subagent_output_buffer::flush(bool with_task_prefix) {
+    std::lock_guard<std::mutex> lock(buffer_mutex_);
+
+    if (segments_.empty()) {
+        return;
+    }
+
+    // Use output_guard to hold the console mutex for atomic output
+    console::output_guard guard;
+
+    // Build prefix string
+    std::string prefix;
+    if (with_task_prefix && !task_id_.empty()) {
+        // Shorten task ID for display (task-abc12345 -> abc1)
+        std::string short_id = task_id_;
+        if (short_id.substr(0, 5) == "task-" && short_id.length() > 9) {
+            short_id = short_id.substr(5, 4);
+        }
+        prefix = "[" + short_id + "] ";
+    }
+
+    // Track if we're at start of a line (for prefixing); carries over between segments
+    bool at_line_start = true;
+
+    for (const auto & seg : segments_) {
+        guard.set_display(seg.type);
+
+        // Emit the segment one line at a time instead of one formatted write per byte:
+        // fewer vfprintf calls, and multi-byte UTF-8 glyphs are never split across writes
+        // (segments are printf-formatted text and assumed to contain no embedded NUL)
+        size_t pos = 0;
+        while (pos < seg.content.size()) {
+            if (at_line_start && !prefix.empty()) {
+                guard.set_display(DISPLAY_TYPE_REASONING);  // Dim prefix
+                guard.write("%s", prefix.c_str());
+                guard.set_display(seg.type);  // Restore segment color
+            }
+
+            const size_t nl  = seg.content.find('\n', pos);
+            const size_t end = (nl == std::string::npos) ? seg.content.size() : nl + 1;
+            guard.write("%.*s", static_cast<int>(end - pos), seg.content.c_str() + pos);
+            at_line_start = (nl != std::string::npos);  // a chunk ending in '\n' starts a new line
+            pos = end;
+        }
+    }
+
+    guard.set_display(DISPLAY_TYPE_RESET);
+    guard.flush();
+    segments_.clear();  // Clear buffer after flush
+}
+
+void subagent_output_buffer::clear() {
+    std::lock_guard<std::mutex> lock(buffer_mutex_);
+    segments_.clear();  // drops all pending segments; nothing is printed
+}
+
+bool subagent_output_buffer::empty() const {
+    std::lock_guard<std::mutex> lock(buffer_mutex_);  // buffer_mutex_ is declared mutable, so a const method may lock it
+    return segments_.empty();  // true when no segment is waiting to be flushed
+}
+
+//
+// subagent_output_manager implementation
+//
+
+subagent_output_manager & subagent_output_manager::instance() {
+    static subagent_output_manager instance;  // function-local static: constructed on first call
+    return instance;
+}
+
+subagent_output_buffer * subagent_output_manager::create_buffer(const std::string & task_id) {
+    std::lock_guard<std::mutex> lock(buffers_mutex_);
+
+    auto buffer = std::make_unique<subagent_output_buffer>(task_id);
+    auto * ptr = buffer.get();  // grab the raw pointer before ownership moves into the map
+    buffers_[task_id] = std::move(buffer);  // replaces (and destroys) any buffer already registered under task_id
+    return ptr;  // non-owning: the map entry keeps the buffer alive
+}
+
+// Look up the buffer registered for task_id; the manager keeps ownership.
+subagent_output_buffer * subagent_output_manager::get_buffer(const std::string & task_id) {
+    std::lock_guard<std::mutex> lock(buffers_mutex_);
+
+    // Unknown IDs yield nullptr rather than inserting a new entry
+    const auto found = buffers_.find(task_id);
+    const bool known = (found != buffers_.end());
+    return known ? found->second.get() : nullptr;
+}
+
+void subagent_output_manager::remove_buffer(const std::string & task_id) {
+    std::lock_guard<std::mutex> lock(buffers_mutex_);
+    buffers_.erase(task_id);  // destroys the buffer: pointers handed out earlier dangle, unflushed text is dropped
+}
+
+// Flush every registered buffer to the console, each line prefixed with its task ID.
+void subagent_output_manager::flush_all() {
+    std::lock_guard<std::mutex> lock(buffers_mutex_);  // registry cannot change while iterating
+    for (const auto & entry : buffers_) {
+        entry.second->flush(true);
+    }
+}
+
+size_t subagent_output_manager::active_count() const {
+    std::lock_guard<std::mutex> lock(buffers_mutex_);  // buffers_mutex_ is declared mutable, so a const method may lock it
+    return buffers_.size();  // number of registered buffers, whether or not they hold text
+}
diff --git a/tools/agent/subagent/subagent-output.h b/tools/agent/subagent/subagent-output.h
new file mode 100644
index 00000000000..e7fc5b4f77a
--- /dev/null
+++ b/tools/agent/subagent/subagent-output.h
@@ -0,0 +1,74 @@
+#pragma once
+
+#include "console.h"
+
+#include <map>
+#include <memory>
+#include <mutex>
+#include <string>
+#include <vector>
+
+// Represents a single output segment with display type
+struct output_segment {
+    display_type type = DISPLAY_TYPE_RESET;  // display type applied before the text is printed
+    std::string content;                     // already-formatted text
+};
+
+// Buffered output for a single subagent task
+// Collects output segments and flushes them atomically to console
+class subagent_output_buffer {
+public:
+    explicit subagent_output_buffer(const std::string & task_id);
+
+    // Buffer printf-style text tagged with a display type (format args are compiler-checked)
+    LLAMA_COMMON_ATTRIBUTE_FORMAT(3, 4)
+    void write(display_type type, const char * fmt, ...);
+
+    // Buffer printf-style text with the default display type (DISPLAY_TYPE_RESET)
+    LLAMA_COMMON_ATTRIBUTE_FORMAT(2, 3)
+    void write(const char * fmt, ...);
+
+    // Flush all buffered content atomically to console, then empty the buffer
+    // Optionally prefix each line with task ID
+    void flush(bool with_task_prefix = true);
+
+    // Clear buffer without flushing
+    void clear();
+    // Returns true when nothing is buffered
+    bool empty() const;
+    // Get task ID
+    const std::string & task_id() const { return task_id_; }
+
+private:
+    std::string task_id_;
+    mutable std::mutex buffer_mutex_;  // guards segments_
+    std::vector<output_segment> segments_;
+};
+
+// Manager for all active subagent output buffers (singleton, registry guarded by a mutex)
+// Returned raw pointers are NOT kept alive by that mutex - see remove_buffer()
+class subagent_output_manager {
+public:
+    static subagent_output_manager & instance();
+
+    // Create buffer for a new task (returns raw pointer, manager owns the buffer)
+    subagent_output_buffer * create_buffer(const std::string & task_id);
+
+    // Get buffer for an existing task (returns nullptr if not found)
+    subagent_output_buffer * get_buffer(const std::string & task_id);
+
+    // Remove and destroy buffer for a task; pointers returned for it earlier become invalid
+    void remove_buffer(const std::string & task_id);
+
+    // Flush all buffers (for status display or shutdown)
+    void flush_all();
+
+    // Get count of active buffers
+    size_t active_count() const;
+
+private:
+    subagent_output_manager() = default;  // private: obtain the object through instance()
+
+    mutable std::mutex buffers_mutex_;  // guards buffers_
+    std::map<std::string, std::unique_ptr<subagent_output_buffer>> buffers_;  // task ID -> owned buffer
+};
diff --git a/tools/agent/subagent/subagent-runner.cpp b/tools/agent/subagent/subagent-runner.cpp
index 575a7efa15c..7d4a8c49670 100644
--- a/tools/agent/subagent/subagent-runner.cpp
+++ b/tools/agent/subagent/subagent-runner.cpp
@@ -1,5 +1,6 @@
 #include "subagent-runner.h"
 #include "subagent-display.h"
+#include "subagent-output.h"
 #include "../agent-loop.h"
 
 #include "common.h"
@@ -23,11 +24,21 @@ std::string subagent_runner::build_system_prompt(subagent_type type) const {
     const auto & config = get_subagent_config(type);
 
     std::ostringstream prompt;
-    prompt << "You are a specialized " << config.name << " subagent.\n\n";
+
+    // Start with parent's base prompt to enable KV cache prefix sharing
+    // The server's prompt cache will detect the common prefix and reuse cached tokens
+    if (!parent_tool_ctx_.base_system_prompt.empty()) {
+        prompt << parent_tool_ctx_.base_system_prompt;
+        prompt << "# Subagent Mode: " << config.name << "\n\n";
+    } else {
+        // Fallback if no base prompt (shouldn't happen in normal operation)
+        prompt << "You are a specialized " << config.name << " subagent.\n\n";
+    }
+
     prompt << config.description << "\n\n";
 
-    // Add tool restrictions
-    prompt << "# Available Tools\n\n";
+    // Add tool restrictions (overrides base prompt's tool list)
+    prompt << "## Tools Available in This Mode\n\n";
     prompt << "You have access to: ";
     bool first = true;
     for (const auto & tool : config.allowed_tools) {
@@ -115,13 +126,29 @@ std::string subagent_runner::generate_task_id() {
 }
 
 subagent_result subagent_runner::run(const subagent_params & params) {
+    // Synchronous run: pass a null buffer so run_internal writes directly to the console
+    return run_internal(params, nullptr);
+}
+
+subagent_result subagent_runner::run_internal(const subagent_params & params,
+                                               subagent_output_buffer * buffer) {
     subagent_result result;
     const auto & type_config = get_subagent_config(params.type);
 
     // Start display scope
+    // If buffer is provided (background mode), output goes to buffer
+    // Otherwise (synchronous mode), output goes directly to console
     auto & display = subagent_display::instance();
-    subagent_display::scope display_scope(display, params.description.empty() ? "subagent" : params.description,
-                                           params.type, params.prompt.substr(0, 60) + (params.prompt.length() > 60 ? "..." : ""));
+    std::string desc = params.description.empty() ? "subagent" : params.description;
+    std::string prompt_preview = params.prompt.substr(0, 60) + (params.prompt.length() > 60 ? "..." : "");
+
+    // Use the appropriate constructor based on whether we have a buffer
+    std::unique_ptr<subagent_display::scope> display_scope;
+    if (buffer) {
+        display_scope = std::make_unique<subagent_display::scope>(display, desc, params.type, prompt_preview, buffer);
+    } else {
+        display_scope = std::make_unique<subagent_display::scope>(display, desc, params.type, prompt_preview);
+    }
 
     auto start_time = std::chrono::steady_clock::now();
 
@@ -144,7 +171,7 @@ subagent_result subagent_runner::run(const subagent_params & params) {
     auto tool_callback = [&display_scope, &result](const std::string & tool_name,
                                                     const std::string & args_summary,
                                                     int elapsed_ms) {
-        display_scope.report_tool_call(tool_name, args_summary, elapsed_ms);
+        display_scope->report_tool_call(tool_name, args_summary, elapsed_ms);
         // Also track in result
         std::ostringstream summary;
         summary << tool_name << " (" << elapsed_ms << "ms)";
@@ -166,11 +193,17 @@ subagent_result subagent_runner::run(const subagent_params & params) {
     auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time).count();
 
     // Report completion
-    display_scope.report_done(static_cast<int>(elapsed_ms));
+    display_scope->report_done(static_cast<int>(elapsed_ms));
 
     // Convert result
     result.iterations = loop_result.iterations;
 
+    // Collect token stats from the subagent
+    const auto & subagent_stats = subagent.get_stats();
+    result.input_tokens = subagent_stats.total_input;
+    result.output_tokens = subagent_stats.total_output;
+    result.cached_tokens = subagent_stats.total_cached;
+
     switch (loop_result.stop_reason) {
         case agent_stop_reason::COMPLETED:
             result.success = true;
@@ -200,6 +233,10 @@ subagent_result subagent_runner::run(const subagent_params & params) {
 std::string subagent_runner::start_background(const subagent_params & params) {
     std::string task_id = generate_task_id();
 
+    // Create output buffer for this background task
+    auto & output_mgr = subagent_output_manager::instance();
+    subagent_output_buffer * buffer = output_mgr.create_buffer(task_id);
+
     auto task = std::make_unique<subagent_task>();
     task->id = task_id;
     task->params = params;
@@ -207,17 +244,24 @@ std::string subagent_runner::start_background(const subagent_params & params) {
     // Capture what we need for the thread
     auto * task_ptr = task.get();
 
-    // Start background thread
-    task->thread = std::thread([this, task_ptr, params]() {
+    // Start background thread with buffer for output
+    task->thread = std::thread([this, task_ptr, params, buffer, task_id]() {
         subagent_result result;
         try {
-            result = this->run(params);
+            result = this->run_internal(params, buffer);
         } catch (const std::exception & e) {
             result.success = false;
             result.error = std::string("Exception: ") + e.what();
         }
+
+        // Flush buffered output before completing
+        buffer->flush(true);
+
         task_ptr->promise.set_value(std::move(result));
         task_ptr->complete.store(true);
+
+        // Cleanup buffer
+        subagent_output_manager::instance().remove_buffer(task_id);
     });
 
     // Store task
diff --git a/tools/agent/subagent/subagent-runner.h b/tools/agent/subagent/subagent-runner.h
index b7b6a60dbbb..1912f42c74f 100644
--- a/tools/agent/subagent/subagent-runner.h
+++ b/tools/agent/subagent/subagent-runner.h
@@ -16,6 +16,7 @@
 struct server_context;
 struct agent_config;
 struct common_params;
+class subagent_output_buffer;
 
 // Parameters for running a subagent
 struct subagent_params {
@@ -31,6 +32,11 @@ struct subagent_result {
     std::string error;
     int iterations = 0;
     std::vector<std::string> tool_calls_summary;  // List of tools called with timing
+
+    // Token statistics from the subagent run
+    int32_t input_tokens = 0;
+    int32_t output_tokens = 0;
+    int32_t cached_tokens = 0;
 };
 
 // Background task state
@@ -98,4 +104,8 @@ class subagent_runner {
 
     // Generate unique task ID
     static std::string generate_task_id();
+
+    // Internal run method with optional buffer for background tasks
+    subagent_result run_internal(const subagent_params & params,
+                                  subagent_output_buffer * buffer);
 };
diff --git a/tools/agent/tool-registry.h b/tools/agent/tool-registry.h
index 3f2b5a5207c..6e70d1db2af 100644
--- a/tools/agent/tool-registry.h
+++ b/tools/agent/tool-registry.h
@@ -22,7 +22,12 @@ struct tool_context {
     void * server_ctx_ptr = nullptr;       // Pointer to server_context
     void * agent_config_ptr = nullptr;     // Pointer to agent_config
     void * common_params_ptr = nullptr;    // Pointer to common_params (for model inference params)
+    void * session_stats_ptr = nullptr;    // Pointer to session_stats (for tracking subagent tokens)
     int subagent_depth = 0;                // Current nesting depth (0 = main agent)
+
+    // Prefix caching: base system prompt shared between parent and subagents
+    // Subagent prompts start with this prefix to maximize KV cache reuse
+    std::string base_system_prompt;
 };
 
 // Result returned from tool execution
diff --git a/tools/agent/tools/tool-task.cpp b/tools/agent/tools/tool-task.cpp
index 922f3575cb9..bc6453f5f4d 100644
--- a/tools/agent/tools/tool-task.cpp
+++ b/tools/agent/tools/tool-task.cpp
@@ -40,6 +40,26 @@ static subagent_runner & get_runner(const tool_context & ctx) {
     return *g_runners[key];
 }
 
+// Add one subagent result's token usage to the parent's session stats (no-op when ctx.session_stats_ptr is null)
+static void update_parent_stats(const tool_context & ctx, const subagent_result & result) {
+    if (!ctx.session_stats_ptr) {
+        return;
+    }
+
+    auto * stats = static_cast<session_stats *>(ctx.session_stats_ptr);  // stored as void *; assumed to point at a session_stats (TODO confirm where it is set)
+
+    // Add to subagent-specific counters (for breakdown display)
+    stats->subagent_input += result.input_tokens;
+    stats->subagent_output += result.output_tokens;
+    stats->subagent_cached += result.cached_tokens;
+    stats->subagent_count++;  // incremented once per result passed to this function
+
+    // Also add to the running totals so "Main = Total - Subagent" works correctly
+    stats->total_input += result.input_tokens;
+    stats->total_output += result.output_tokens;
+    stats->total_cached += result.cached_tokens;
+}
+
 static tool_result task_execute(const json & args, const tool_context & ctx) {
     // Check depth limit
     auto & display = subagent_display::instance();
@@ -69,6 +89,9 @@ static tool_result task_execute(const json & args, const tool_context & ctx) {
         if (runner.is_complete(resume_id)) {
             subagent_result result = runner.get_result(resume_id);
 
+            // Update parent stats with subagent token usage
+            update_parent_stats(ctx, result);
+
             // Format output
             std::ostringstream output;
             output << "Background task " << resume_id << " completed";
@@ -143,6 +166,9 @@ static tool_result task_execute(const json & args, const tool_context & ctx) {
         // Run synchronously
         subagent_result result = runner.run(task_params);
 
+        // Update parent stats with subagent token usage
+        update_parent_stats(ctx, result);
+
         // Format output
         std::ostringstream output;
         output << "Subagent (" << subagent_type_name(type) << ") ";