Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 43 additions & 3 deletions common/console.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,10 @@ namespace console {

static bool bracket_paste_mode = false; // true when inside ESC[200~ ... ESC[201~

// Thread-safety mutex for console operations
// Protects: current_display, out (FILE*), and all console output operations
static std::mutex g_console_mutex;

//
// Init and cleanup
//
Expand Down Expand Up @@ -179,7 +183,8 @@ namespace console {
//

// Keep track of current display and only emit ANSI code if it changes
void set_display(display_type display) {
// Internal version without lock (for use when lock is already held)
static void set_display_unlocked(display_type display) {
if (advanced_display && current_display != display) {
common_log_flush(common_log_main());
switch(display) {
Expand Down Expand Up @@ -210,6 +215,11 @@ namespace console {
}
}

// Public entry point for display changes: serializes against all other
// console output via the console mutex, then delegates to the internal
// helper that assumes the lock is already held.
void set_display(display_type display) {
    const std::lock_guard<std::mutex> guard(g_console_mutex);
    set_display_unlocked(display);
}

static char32_t getchar32() {
#if defined(_WIN32)
HANDLE hConsole = GetStdHandle(STD_INPUT_HANDLE);
Expand Down Expand Up @@ -1198,23 +1208,53 @@ namespace console {
}

// printf-style logging to the console output stream.
// Holds the console mutex for the duration of the write so that
// concurrent callers cannot interleave their output.
void log(const char * fmt, ...) {
    const std::lock_guard<std::mutex> guard(g_console_mutex);
    va_list ap;
    va_start(ap, fmt);
    vfprintf(out, fmt, ap);
    va_end(ap);
}

void error(const char * fmt, ...) {
std::lock_guard<std::mutex> lock(g_console_mutex);
va_list args;
va_start(args, fmt);
display_type cur = current_display;
set_display(DISPLAY_TYPE_ERROR);
set_display_unlocked(DISPLAY_TYPE_ERROR);
vfprintf(out, fmt, args);
set_display(cur); // restore previous color
set_display_unlocked(cur); // restore previous color
va_end(args);
}

// Flush any buffered console output, serialized by the console mutex so a
// flush cannot land in the middle of another thread's write.
void flush() {
    const std::lock_guard<std::mutex> guard(g_console_mutex);
    fflush(out);
}

//
// output_guard implementation
//

// Acquire the global console mutex for the lifetime of this guard, so
// that every write made while the guard exists appears as one atomic
// unit on the console. Released in the destructor (RAII).
output_guard::output_guard() {
g_console_mutex.lock();
}

// Release the console mutex acquired in the constructor.
output_guard::~output_guard() {
g_console_mutex.unlock();
}

// Formatted write to the console stream. No locking here: the guard's
// constructor already holds the console mutex for us.
void output_guard::write(const char * fmt, ...) {
    va_list ap;
    va_start(ap, fmt);
    vfprintf(out, fmt, ap);
    va_end(ap);
}

// Switch the display type while the guard is active. Must call the
// unlocked helper: this guard already holds g_console_mutex (a plain,
// non-recursive std::mutex), so the locking console::set_display()
// would deadlock here.
void output_guard::set_display(display_type type) {
set_display_unlocked(type);
}

// Flush the console stream. No extra locking needed: the guard's
// constructor already holds the console mutex.
void output_guard::flush() {
fflush(out);
}
}
23 changes: 23 additions & 0 deletions common/console.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,4 +39,27 @@ namespace console {
void error(const char * fmt, ...);

void flush();

// RAII guard for atomic multi-part console output
// Holds the console mutex for the lifetime of the object
// Use this when you need to output multiple lines/parts atomically
// RAII guard for atomic multi-part console output.
// Acquires the console mutex on construction and releases it on
// destruction; use it when several lines/parts must be emitted as one
// uninterruptible unit.
class output_guard {
public:
output_guard();
~output_guard();

// Non-copyable, non-movable: the guard owns the mutex lock for exactly
// its own scope. Move operations are deleted explicitly so the stated
// contract is visible in code, not just implied by the deleted copies.
output_guard(const output_guard &) = delete;
output_guard & operator=(const output_guard &) = delete;
output_guard(output_guard &&) = delete;
output_guard & operator=(output_guard &&) = delete;

// printf-style write to the console (mutex already held by this guard).
LLAMA_COMMON_ATTRIBUTE_FORMAT(2, 3)
void write(const char * fmt, ...);

// Change the display type (mutex already held by this guard).
void set_display(display_type type);

// Flush the underlying output stream (mutex already held by this guard).
void flush();
};
}
1 change: 1 addition & 0 deletions tools/agent/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ set(AGENT_SOURCES
subagent/subagent-types.cpp
subagent/subagent-display.cpp
subagent/subagent-runner.cpp
subagent/subagent-output.cpp
tools/tool-bash.cpp
tools/tool-read.cpp
tools/tool-write.cpp
Expand Down
98 changes: 98 additions & 0 deletions tools/agent/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,104 @@ llama-agent -hf unsloth/Nemotron-3-Nano-30B-A3B-GGUF:Q5_K_M
| `write` | Create or overwrite files |
| `edit` | Search and replace in files |
| `glob` | Find files matching a pattern |
| `task` | Spawn a subagent for complex tasks |

## Subagents

Subagents are specialized child agents that handle complex tasks independently, keeping the main conversation context clean and efficient.

### Why Subagents?

Without subagents, every file read and search pollutes your main context:

```
Main context after exploring codebase:
├── glob **/*.cpp → 50 files (800 tokens)
├── read src/main.cpp → full file (1,500 tokens)
├── read src/utils.cpp → full file (2,200 tokens)
├── grep "TODO" → 100 matches (1,200 tokens)
└── Total: ~5,700 tokens consumed for ONE exploration
```

With subagents, only the summary enters main context:

```
Main context:
└── task(explore) → "Found 3 TODO items in src/main.cpp:42,87,156" (50 tokens)

Subagent context (discarded after):
├── All the detailed exploration (~5,700 tokens)
└── Summarized to parent
```

### Subagent Types

| Type | Purpose | Tools Available |
|------|---------|-----------------|
| `explore` | Search and understand code (read-only) | `glob`, `read`, `bash` (read-only commands only) |
| `bash` | Execute shell commands | `bash` |
| `plan` | Design implementation approaches | `glob`, `read`, `bash` |
| `general` | General-purpose tasks | All tools |

### How It Works

```
┌─────────────────┐
│ Main Agent │ "Find where errors are handled"
└────────┬────────┘
│ task(explore, "find error handling")
┌─────────────────┐
│ Subagent │ Does detailed exploration:
│ (explore) │ - glob **/*.cpp
│ │ - read 5 files
│ │ - grep patterns
└────────┬────────┘
│ Returns summary only
┌─────────────────┐
│ Main Agent │ Receives: "Errors handled in src/error.cpp:45
│ │ via ErrorHandler class..."
└─────────────────┘
```

### Memory Efficiency

Subagents share the model weights with the main agent, so no additional VRAM is required:

| Resource | Main Agent | Subagent | Total |
|----------|------------|----------|-------|
| Model weights | ✓ | Shared | 1x |
| KV cache | ✓ | Shared via slots | 1x |
| Context window | Own | Own (discarded after) | Efficient |

### Parallel Execution

Multiple subagents can run in the background simultaneously:

```
> Run tests and check for lint errors at the same time

[task-a1b2] ┌── ⚡ run-tests (bash)
[task-c3d4] ┌── ⚡ check-lint (bash)
[task-a1b2] │ ├─› bash npm test (2.1s)
[task-c3d4] │ ├─› bash npm run lint (1.8s)
[task-c3d4] │ └── done (1.8s)
[task-a1b2] │ └── done (2.1s)
```

### KV Cache Prefix Sharing

Subagent prompts share a common prefix with the main agent, enabling automatic KV cache reuse:

```
Main agent prompt: "You are llama-agent... [base] + [main agent instructions]"
Subagent prompt: "You are llama-agent... [base] + # Subagent Mode: explore..."
↑─────── shared prefix ──────↑
Cached tokens reused, not re-processed
```

This reduces subagent startup latency and saves compute.

## Usage Examples

Expand Down
26 changes: 26 additions & 0 deletions tools/agent/agent-loop.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,12 +70,34 @@ agent_loop::agent_loop(server_context & server_ctx,
tool_ctx_.server_ctx_ptr = &server_ctx_;
tool_ctx_.agent_config_ptr = const_cast<agent_config *>(&config_);
tool_ctx_.common_params_ptr = const_cast<common_params *>(&params);
tool_ctx_.session_stats_ptr = &stats_;
tool_ctx_.subagent_depth = 0;

// Set up permission manager
permission_mgr_.set_project_root(tool_ctx_.working_dir);
permission_mgr_.set_yolo_mode(config.yolo_mode);

// Base prompt shared with subagents for KV cache prefix sharing
// Subagent prompts start with this exact text to maximize cache hits
static const char * BASE_PROMPT_PREFIX = R"(You are llama-agent, a powerful local AI coding assistant running on llama.cpp.

You help users with software engineering tasks by reading files, writing code, running commands, and navigating codebases. You run entirely on the user's machine - no data leaves their system.

# Tools

You have access to the following tools:

- **bash**: Execute shell commands. Use for git, build commands, running tests, etc.
- **read**: Read file contents with line numbers. Always read files before editing them.
- **write**: Create new files or overwrite existing ones.
- **edit**: Make targeted edits using search/replace. The old_string must match exactly. Use replace_all=true to replace all occurrences of a word or phrase.
- **glob**: Find files matching a pattern. Use to explore project structure.

)";

// Store base prompt for subagents to inherit (enables KV cache prefix sharing)
tool_ctx_.base_system_prompt = BASE_PROMPT_PREFIX;

// Add system prompt for tool usage
std::string system_prompt = R"(You are llama-agent, a powerful local AI coding assistant running on llama.cpp.

Expand Down Expand Up @@ -265,6 +287,7 @@ agent_loop::agent_loop(server_context & server_ctx,
tool_ctx_.server_ctx_ptr = &server_ctx_;
tool_ctx_.agent_config_ptr = const_cast<agent_config *>(&config_);
tool_ctx_.common_params_ptr = const_cast<common_params *>(&params);
tool_ctx_.session_stats_ptr = &stats_;
tool_ctx_.subagent_depth = subagent_depth;

// Set up permission manager
Expand All @@ -286,6 +309,9 @@ void agent_loop::clear() {
messages_.push_back(system_msg);
}
permission_mgr_.clear_session();

// Reset stats when conversation is cleared
stats_ = session_stats{};
}

common_chat_msg agent_loop::generate_completion(result_timings & out_timings) {
Expand Down
6 changes: 6 additions & 0 deletions tools/agent/agent-loop.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,12 @@ struct session_stats {
int32_t total_cached = 0; // Total tokens served from KV cache
double total_prompt_ms = 0; // Total prompt evaluation time
double total_predicted_ms = 0; // Total generation time

// Subagent-specific stats (subset of totals above)
int32_t subagent_input = 0; // Prompt tokens from subagents
int32_t subagent_output = 0; // Output tokens from subagents
int32_t subagent_cached = 0; // Cached tokens from subagents
int32_t subagent_count = 0; // Number of subagent runs
};

// The main agent loop class
Expand Down
21 changes: 21 additions & 0 deletions tools/agent/agent.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -452,6 +452,27 @@ int main(int argc, char ** argv) {
console::log(" Cached tokens: %d\n", stats.total_cached);
}
console::log(" Total tokens: %d\n", stats.total_input + stats.total_output);

// Show subagent breakdown if any subagents were used
if (stats.subagent_count > 0) {
console::log("\n Subagent breakdown (%d run%s):\n",
stats.subagent_count, stats.subagent_count == 1 ? "" : "s");
console::log(" Prompt tokens: %d\n", stats.subagent_input);
console::log(" Output tokens: %d\n", stats.subagent_output);
if (stats.subagent_cached > 0) {
console::log(" Cached tokens: %d\n", stats.subagent_cached);
}
console::log(" Total tokens: %d\n", stats.subagent_input + stats.subagent_output);

// Show main agent stats (total minus subagent)
int32_t main_input = stats.total_input - stats.subagent_input;
int32_t main_output = stats.total_output - stats.subagent_output;
console::log("\n Main agent:\n");
console::log(" Prompt tokens: %d\n", main_input);
console::log(" Output tokens: %d\n", main_output);
console::log(" Total tokens: %d\n", main_input + main_output);
}

if (stats.total_prompt_ms > 0) {
console::log(" Prompt time: %.2fs\n", stats.total_prompt_ms / 1000.0);
}
Expand Down
Loading