Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 43 additions & 3 deletions common/console.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,10 @@ namespace console {

static bool bracket_paste_mode = false; // true when inside ESC[200~ ... ESC[201~

// Thread-safety mutex for console operations
// Protects: current_display, out (FILE*), and all console output operations
static std::mutex g_console_mutex;

//
// Init and cleanup
//
Expand Down Expand Up @@ -179,7 +183,8 @@ namespace console {
//

// Keep track of current display and only emit ANSI code if it changes
void set_display(display_type display) {
// Internal version without lock (for use when lock is already held)
static void set_display_unlocked(display_type display) {
if (advanced_display && current_display != display) {
common_log_flush(common_log_main());
switch(display) {
Expand Down Expand Up @@ -210,6 +215,11 @@ namespace console {
}
}

// Public entry point for display changes: serializes against all other
// console output via the console mutex, then delegates to the internal
// helper that assumes the lock is already held.
void set_display(display_type display) {
    const std::lock_guard<std::mutex> guard(g_console_mutex);
    set_display_unlocked(display);
}

static char32_t getchar32() {
#if defined(_WIN32)
HANDLE hConsole = GetStdHandle(STD_INPUT_HANDLE);
Expand Down Expand Up @@ -1198,23 +1208,53 @@ namespace console {
}

// printf-style logging to the console output stream.
// Holds the console mutex for the duration of the write so that
// concurrent callers cannot interleave their output.
void log(const char * fmt, ...) {
    const std::lock_guard<std::mutex> guard(g_console_mutex);
    va_list ap;
    va_start(ap, fmt);
    vfprintf(out, fmt, ap);
    va_end(ap);
}

void error(const char * fmt, ...) {
std::lock_guard<std::mutex> lock(g_console_mutex);
va_list args;
va_start(args, fmt);
display_type cur = current_display;
set_display(DISPLAY_TYPE_ERROR);
set_display_unlocked(DISPLAY_TYPE_ERROR);
vfprintf(out, fmt, args);
set_display(cur); // restore previous color
set_display_unlocked(cur); // restore previous color
va_end(args);
}

// Flush any buffered console output, serialized by the console mutex so a
// flush cannot land in the middle of another thread's write.
void flush() {
    const std::lock_guard<std::mutex> guard(g_console_mutex);
    fflush(out);
}

//
// output_guard implementation
//

// Acquire the global console mutex for the lifetime of this guard, so
// that every write made while the guard exists appears as one atomic
// unit on the console. Released in the destructor (RAII).
output_guard::output_guard() {
g_console_mutex.lock();
}

// Release the console mutex acquired in the constructor.
output_guard::~output_guard() {
g_console_mutex.unlock();
}

// Formatted write to the console stream. No locking here: the guard's
// constructor already holds the console mutex for us.
void output_guard::write(const char * fmt, ...) {
    va_list ap;
    va_start(ap, fmt);
    vfprintf(out, fmt, ap);
    va_end(ap);
}

// Switch the display type while the guard is active. Must call the
// unlocked helper: this guard already holds g_console_mutex (a plain,
// non-recursive std::mutex), so the locking console::set_display()
// would deadlock here.
void output_guard::set_display(display_type type) {
set_display_unlocked(type);
}

// Flush the console stream. No extra locking needed: the guard's
// constructor already holds the console mutex.
void output_guard::flush() {
fflush(out);
}
}
23 changes: 23 additions & 0 deletions common/console.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,4 +39,27 @@ namespace console {
void error(const char * fmt, ...);

void flush();

// RAII guard for atomic multi-part console output
// Holds the console mutex for the lifetime of the object
// Use this when you need to output multiple lines/parts atomically
// RAII guard for atomic multi-part console output.
// Acquires the console mutex on construction and releases it on
// destruction; use it when several lines/parts must be emitted as one
// uninterruptible unit.
class output_guard {
public:
output_guard();
~output_guard();

// Non-copyable, non-movable: the guard owns the mutex lock for exactly
// its own scope. Move operations are deleted explicitly so the stated
// contract is visible in code, not just implied by the deleted copies.
output_guard(const output_guard &) = delete;
output_guard & operator=(const output_guard &) = delete;
output_guard(output_guard &&) = delete;
output_guard & operator=(output_guard &&) = delete;

// printf-style write to the console (mutex already held by this guard).
LLAMA_COMMON_ATTRIBUTE_FORMAT(2, 3)
void write(const char * fmt, ...);

// Change the display type (mutex already held by this guard).
void set_display(display_type type);

// Flush the underlying output stream (mutex already held by this guard).
void flush();
};
}
1 change: 1 addition & 0 deletions tools/agent/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ set(AGENT_SOURCES
subagent/subagent-types.cpp
subagent/subagent-display.cpp
subagent/subagent-runner.cpp
subagent/subagent-output.cpp
tools/tool-bash.cpp
tools/tool-read.cpp
tools/tool-write.cpp
Expand Down
98 changes: 98 additions & 0 deletions tools/agent/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,104 @@ llama-agent -hf unsloth/Nemotron-3-Nano-30B-A3B-GGUF:Q5_K_M
| `write` | Create or overwrite files |
| `edit` | Search and replace in files |
| `glob` | Find files matching a pattern |
| `task` | Spawn a subagent for complex tasks |

## Subagents

Subagents are specialized child agents that handle complex tasks independently, keeping the main conversation context clean and efficient.

### Why Subagents?

Without subagents, every file read and search pollutes your main context:

```
Main context after exploring codebase:
├── glob **/*.cpp → 50 files (800 tokens)
├── read src/main.cpp → full file (1,500 tokens)
├── read src/utils.cpp → full file (2,200 tokens)
├── grep "TODO" → 100 matches (1,200 tokens)
└── Total: ~5,700 tokens consumed for ONE exploration
```

With subagents, only the summary enters main context:

```
Main context:
└── task(explore) → "Found 3 TODO items in src/main.cpp:42,87,156" (50 tokens)

Subagent context (discarded after):
├── All the detailed exploration (~5,700 tokens)
└── Summarized to parent
```

### Subagent Types

| Type | Purpose | Tools Available |
|------|---------|-----------------|
| `explore` | Search and understand code (read-only) | `glob`, `read`, `bash` (read-only commands only) |
| `bash` | Execute shell commands | `bash` |
| `plan` | Design implementation approaches | `glob`, `read`, `bash` |
| `general` | General-purpose tasks | All tools |

### How It Works

```
┌─────────────────┐
│ Main Agent │ "Find where errors are handled"
└────────┬────────┘
│ task(explore, "find error handling")
┌─────────────────┐
│ Subagent │ Does detailed exploration:
│ (explore) │ - glob **/*.cpp
│ │ - read 5 files
│ │ - grep patterns
└────────┬────────┘
│ Returns summary only
┌─────────────────┐
│ Main Agent │ Receives: "Errors handled in src/error.cpp:45
│ │ via ErrorHandler class..."
└─────────────────┘
```

### Memory Efficiency

Subagents share the model weights with the main agent, so no additional VRAM is required:

| Resource | Main Agent | Subagent | Total |
|----------|------------|----------|-------|
| Model weights | ✓ | Shared | 1x |
| KV cache | ✓ | Shared via slots | 1x |
| Context window | Own | Own (discarded after) | Efficient |

### Parallel Execution

Multiple subagents can run in the background simultaneously:

```
> Run tests and check for lint errors at the same time

[task-a1b2] ┌── ⚡ run-tests (bash)
[task-c3d4] ┌── ⚡ check-lint (bash)
[task-a1b2] │ ├─› bash npm test (2.1s)
[task-c3d4] │ ├─› bash npm run lint (1.8s)
[task-c3d4] │ └── done (1.8s)
[task-a1b2] │ └── done (2.1s)
```

### KV Cache Prefix Sharing

Subagent prompts share a common prefix with the main agent, enabling automatic KV cache reuse:

```
Main agent prompt: "You are llama-agent... [base] + [main agent instructions]"
Subagent prompt: "You are llama-agent... [base] + # Subagent Mode: explore..."
↑─────── shared prefix ──────↑
Cached tokens reused, not re-processed
```

This reduces subagent startup latency and saves compute.

## Usage Examples

Expand Down
26 changes: 26 additions & 0 deletions tools/agent/agent-loop.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,12 +70,34 @@ agent_loop::agent_loop(server_context & server_ctx,
tool_ctx_.server_ctx_ptr = &server_ctx_;
tool_ctx_.agent_config_ptr = const_cast<agent_config *>(&config_);
tool_ctx_.common_params_ptr = const_cast<common_params *>(&params);
tool_ctx_.session_stats_ptr = &stats_;
tool_ctx_.subagent_depth = 0;

// Set up permission manager
permission_mgr_.set_project_root(tool_ctx_.working_dir);
permission_mgr_.set_yolo_mode(config.yolo_mode);

// Base prompt shared with subagents for KV cache prefix sharing
// Subagent prompts start with this exact text to maximize cache hits
static const char * BASE_PROMPT_PREFIX = R"(You are llama-agent, a powerful local AI coding assistant running on llama.cpp.

You help users with software engineering tasks by reading files, writing code, running commands, and navigating codebases. You run entirely on the user's machine - no data leaves their system.

# Tools

You have access to the following tools:

- **bash**: Execute shell commands. Use for git, build commands, running tests, etc.
- **read**: Read file contents with line numbers. Always read files before editing them.
- **write**: Create new files or overwrite existing ones.
- **edit**: Make targeted edits using search/replace. The old_string must match exactly. Use replace_all=true to replace all occurrences of a word or phrase.
- **glob**: Find files matching a pattern. Use to explore project structure.

)";

// Store base prompt for subagents to inherit (enables KV cache prefix sharing)
tool_ctx_.base_system_prompt = BASE_PROMPT_PREFIX;

// Add system prompt for tool usage
std::string system_prompt = R"(You are llama-agent, a powerful local AI coding assistant running on llama.cpp.

Expand Down Expand Up @@ -265,6 +287,7 @@ agent_loop::agent_loop(server_context & server_ctx,
tool_ctx_.server_ctx_ptr = &server_ctx_;
tool_ctx_.agent_config_ptr = const_cast<agent_config *>(&config_);
tool_ctx_.common_params_ptr = const_cast<common_params *>(&params);
tool_ctx_.session_stats_ptr = &stats_;
tool_ctx_.subagent_depth = subagent_depth;

// Set up permission manager
Expand All @@ -286,6 +309,9 @@ void agent_loop::clear() {
messages_.push_back(system_msg);
}
permission_mgr_.clear_session();

// Reset stats when conversation is cleared
stats_ = session_stats{};
}

common_chat_msg agent_loop::generate_completion(result_timings & out_timings) {
Expand Down
6 changes: 6 additions & 0 deletions tools/agent/agent-loop.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,12 @@ struct session_stats {
int32_t total_cached = 0; // Total tokens served from KV cache
double total_prompt_ms = 0; // Total prompt evaluation time
double total_predicted_ms = 0; // Total generation time

// Subagent-specific stats (subset of totals above)
int32_t subagent_input = 0; // Prompt tokens from subagents
int32_t subagent_output = 0; // Output tokens from subagents
int32_t subagent_cached = 0; // Cached tokens from subagents
int32_t subagent_count = 0; // Number of subagent runs
};

// The main agent loop class
Expand Down
21 changes: 21 additions & 0 deletions tools/agent/agent.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -452,6 +452,27 @@ int main(int argc, char ** argv) {
console::log(" Cached tokens: %d\n", stats.total_cached);
}
console::log(" Total tokens: %d\n", stats.total_input + stats.total_output);

// Show subagent breakdown if any subagents were used
if (stats.subagent_count > 0) {
console::log("\n Subagent breakdown (%d run%s):\n",
stats.subagent_count, stats.subagent_count == 1 ? "" : "s");
console::log(" Prompt tokens: %d\n", stats.subagent_input);
console::log(" Output tokens: %d\n", stats.subagent_output);
if (stats.subagent_cached > 0) {
console::log(" Cached tokens: %d\n", stats.subagent_cached);
}
console::log(" Total tokens: %d\n", stats.subagent_input + stats.subagent_output);

// Show main agent stats (total minus subagent)
int32_t main_input = stats.total_input - stats.subagent_input;
int32_t main_output = stats.total_output - stats.subagent_output;
console::log("\n Main agent:\n");
console::log(" Prompt tokens: %d\n", main_input);
console::log(" Output tokens: %d\n", main_output);
console::log(" Total tokens: %d\n", main_input + main_output);
}

if (stats.total_prompt_ms > 0) {
console::log(" Prompt time: %.2fs\n", stats.total_prompt_ms / 1000.0);
}
Expand Down
Loading