diff --git a/benchmarks/Netclaw.Benchmarks/CaptureBenchmarks.cs b/benchmarks/Netclaw.Benchmarks/CaptureBenchmarks.cs new file mode 100644 index 000000000..b4e2a44ae --- /dev/null +++ b/benchmarks/Netclaw.Benchmarks/CaptureBenchmarks.cs @@ -0,0 +1,36 @@ +// ----------------------------------------------------------------------- +// +// Copyright (C) 2026 - 2026 Petabridge, LLC +// +// ----------------------------------------------------------------------- + +using BenchmarkDotNet.Attributes; +using Netclaw.Actors.Tools; + +namespace Netclaw.Benchmarks; + +/// +/// Exercises the production capture path used by shell/background_job/file_read: +/// drain a stream to the capture ceiling, then derive the small inline window +/// (what ToolOutputSpill does before redaction). The point is to confirm +/// allocation is O(capture ceiling), not O(total output) — peak allocation stays +/// flat whether the source emits 1K chars or 50M. +/// +[MemoryDiagnoser] +public class CaptureBenchmarks +{ + private const int CaptureMax = 256_000; // ToolConfig.MaxOutputChars default + private const int InlineBudget = 2_000; // SessionTuning.MaxInlineToolResultChars default + + [Params(1_000, 256_000, 50_000_000)] + public int TotalChars; + + [Benchmark] + public async Task Capture_then_inline_window() + { + var reader = new SyntheticCharReader(TotalChars); + var (captured, _) = await BoundedOutputReader.DrainToWindowAsync(reader, CaptureMax, CancellationToken.None); + var inline = BoundedOutputReader.Window(captured, InlineBudget); + return inline.Length; + } +} diff --git a/benchmarks/Netclaw.Benchmarks/ShellDrainBenchmarks.cs b/benchmarks/Netclaw.Benchmarks/ShellDrainBenchmarks.cs index 4df1a3143..a8ec55f05 100644 --- a/benchmarks/Netclaw.Benchmarks/ShellDrainBenchmarks.cs +++ b/benchmarks/Netclaw.Benchmarks/ShellDrainBenchmarks.cs @@ -15,8 +15,8 @@ namespace Netclaw.Benchmarks; /// turning a child's stdout stream into a bounded, redaction-ready string. /// /// The comparison is the regression fix for #1293: -/// (head+tail ring buffer, capped at -/// read time) versus the old path — +/// (head+tail ring buffer, +/// capped at read time) versus the old path — /// followed by , which materialised the /// entire output before applying the cap. The story we want the numbers to tell /// is allocation shape: the old path is O(total output) and lands on the LOH for @@ -53,7 +53,7 @@ public async Task ReadToEnd_ThenTruncate() public async Task BoundedDrain() { var reader = new SyntheticCharReader(TotalChars); - var (text, _) = await ShellTool.BoundedDrainAsync(reader, Cap); + var (text, _) = await BoundedOutputReader.DrainToWindowAsync(reader, Cap, CancellationToken.None); return text.Length; } } diff --git a/evals/README.md b/evals/README.md index 77693a3d3..cbb60aee7 100644 --- a/evals/README.md +++ b/evals/README.md @@ -68,7 +68,7 @@ log patterns** (skill loading, memory recall, checkpoint formation). | Grounding & Alignment | 3 | Uses tools to verify facts, admits uncertainty | | Autonomy & Execution | 2 | Executes tasks rather than describing them | | Subagents | 1 | Delegates through `spawn_agent` and verifies headless subagents complete ambiguous work without clarification loops | -| Complex Task Execution | 3 | Multi-step tool chains complete successfully | +| Complex Task Execution | 5 | Multi-step tool chains complete successfully, incl. bounded tool output — given only the goal (no handling hints), the agent retrieves a deep line from oversized shell output and from a large file, which is only possible by coping with the bound the way AGENTS.md/skills/steer text direct | | Multi-Turn Conversation | 7 | Session resume and speaker attribution recall | Each case defines multiple natural phrasings of the same intent. Each diff --git a/evals/run-evals.sh b/evals/run-evals.sh index 5095af05a..ab10c2409 100755 --- a/evals/run-evals.sh +++ b/evals/run-evals.sh @@ -358,6 +358,19 @@ start_eval_daemon() { cp -r "$REPO_ROOT/evals/fixtures/agents/." "$EVAL_HOME/data/agents/" fi + # Pre-seed a large (>256 KB) text file in the workspaces read-root for the + # bounded-tool-output file_read eval (complex_large_file_read_ranged). It must + # be too big for one inline read AND have model-unguessable content so the only + # way to answer "what's on line 5000" is to page it with file_read StartLine/Limit + # — the behavior the bounded-output steer is meant to elicit. A deterministic + # Lehmer PRNG (pure integer modular arithmetic, identical across awk impls) + # makes line 5000 reproducible so the eval can assert the exact value. Lives + # under workspaces (a global read-root) rather than identity/skills so it is + # not pulled into the system prompt or scanned as a skill. 30000 lines ≈ 314 KB. + mkdir -p "$EVAL_HOME/data/workspaces" + awk 'BEGIN{x=1;for(i=1;i<=30000;i++){x=(x*48271)%2147483647;print x}}' \ + > "$EVAL_HOME/data/workspaces/netclaw-eval-largefile.txt" + # The eval container runs as the non-root `netclaw` user and needs write # access to the bind-mounted identity, logs, skills, and data trees. chmod -R ugo+rwX "$EVAL_HOME/identity" "$EVAL_HOME/logs" "$EVAL_HOME/data" "$EVAL_HOME/skills" @@ -1056,6 +1069,45 @@ assert_complex_diagnose_self() { stdout_contains '\[tool:call\] shell_execute' && stdout_contains 'netclaw.*doctor' } +# bounded-tool-output coverage (bound-tool-output-with-file-spill change). +# These two cases assert on OUTCOME, not mechanism: the prompts state only the +# goal and give the agent NO instructions about spilling, redirecting, re-running, +# file_read, StartLine/Limit, or grep. How the agent handles oversized output must +# come entirely from AGENTS.md, the netclaw-operations skill, and the steer text +# in the tool result — coaching it in the prompt would be testing instruction- +# following, not whether the real guidance surfaces work. +# +# The data is a deterministic Lehmer PRNG (pure integer modular arithmetic, +# identical across awk implementations and the host that computed the expected +# values), so the value at a deep line is reproducible AND un-fabricatable by the +# model. Because the tool bounds any single read to ~N=2000 inline chars, the +# deep-line value is unreachable from one read — so a correct answer can ONLY +# come from the agent paging/reading the oversized output the way the steer asks. +# Outcome therefore implies correct handling; no mechanism assertion is needed. + +# Large SHELL output: ~210 KB on stdout exceeds N, so the daemon spills it and +# steers. Line 200 (value 872671849) sits past the inline window; reporting it +# proves the agent retrieved it from the bounded/spilled output unaided. +assert_complex_large_shell_output_spill() { + stdout_contains '\[tool:call\] shell_execute' && \ + stdout_response_contains '872671849' +} + +# Large FILE: a pre-seeded ~314 KB file (>256 KB, so file_read returns a bounded +# sample + steer). The prompt asks for a small line WINDOW around 5000 rather than +# exactly line 5000: the model pages correctly with file_read StartLine/Limit (the +# behavior under test, every run) but can misindex the line by ±1 (the original +# run treated the param's former name "Offset" as a 0-based skip-count — the bug +# that motivated renaming it to the 1-based StartLine). A window makes line 5000 +# (value 1629331733) fall inside the returned slice regardless of any ±1 indexing, +# so the case measures bounded-output paging, not exact index arithmetic. Line 5000 +# is ~52 KB in, well past the inline window, so reporting its value still proves +# the agent paged rather than dumped. +assert_complex_large_file_read_ranged() { + stdout_contains '\[tool:call\] file_read' && \ + stdout_response_contains '1629331733' +} + # Category 8: Multi-Turn Conversation (tests session-resume + KV cache behavior) # All assertions run against the concatenated stdout of every turn in the case. @@ -1466,6 +1518,25 @@ run_all() { run_case complex_diagnose_self "shell_execute with netclaw doctor" \ "Run netclaw doctor and summarize any problems" + # bounded-tool-output: oversized SHELL output. The prompt states only the + # goal — run a command and report a deep line of its output. How to cope with + # the output being too large to return inline (read the spill the steer hands + # back, rather than re-running) must come from the agent's own guidance, not + # this prompt. The number is a deterministic-but-opaque Lehmer PRNG value; the + # assertion checks the agent reports the correct line-200 value (872671849). + run_case complex_large_shell_output_spill "retrieves a deep line from oversized shell output unaided" \ + "Run this command with shell_execute and tell me the number it prints on line 200: awk 'BEGIN{x=1;for(i=1;i<=20000;i++){x=(x*48271)%2147483647;print x}}'" \ + "Using shell_execute, run: awk 'BEGIN{x=1;for(i=1;i<=20000;i++){x=(x*48271)%2147483647;print x}}' — then tell me which number is printed on the 200th line of its output." + + # bounded-tool-output: oversized FILE. The prompt states only the goal — read + # a deep line of a named large file. How to cope with it being too large for + # one read (page it with file_read StartLine/Limit, per the steer) must come from + # the agent's own guidance. The file is pre-seeded in start_eval_daemon; the + # assertion checks the agent reports the correct line-5000 value (1629331733). + run_case complex_large_file_read_ranged "retrieves a deep line from a large file unaided" \ + "List the numbers on lines 4997 through 5003 of the file /home/netclaw/.netclaw/workspaces/netclaw-eval-largefile.txt" \ + "Read lines 4997 to 5003 of /home/netclaw/.netclaw/workspaces/netclaw-eval-largefile.txt and list the numbers on those lines." + end_category # ── Category 8: Multi-Turn Conversation ── diff --git a/feeds/skills/.system/files/netclaw-operations/SKILL.md b/feeds/skills/.system/files/netclaw-operations/SKILL.md index 17148ce3e..c4dc56ead 100644 --- a/feeds/skills/.system/files/netclaw-operations/SKILL.md +++ b/feeds/skills/.system/files/netclaw-operations/SKILL.md @@ -3,7 +3,7 @@ name: netclaw-operations description: "REQUIRED when the user asks about scheduling, reminders, cron jobs, timers, background jobs, diagnostics, troubleshooting, MCP tools, daemon health, identity updates, or Netclaw capabilities and self-maintenance." metadata: author: netclaw - version: "2.8.9" + version: "2.9.0" --- # Netclaw Operations @@ -218,7 +218,8 @@ Rules: - The user must approve the command before it starts running in the background. - Maximum 5 concurrent background jobs; overflow queues FIFO. - Job definitions persist to `~/.netclaw/jobs/{id}.json`. -- Output captured to `~/.netclaw/jobs/{id}/output.log`. +- Output is captured (bounded; head+tail for very large output) to + `~/.netclaw/jobs/{id}/output.log` — `file_read`/`grep` it for detail beyond the tail. Monitoring tools: @@ -235,6 +236,26 @@ results proactively when the job completes. Active background jobs appear in the `[active-background-jobs]` section of the session context on every turn. +## Large tool output + +Tool output is bounded to a small inline budget +(`Session.Tuning.MaxInlineToolResultChars`, default 2000 chars) so it never floods +the context window. When a tool's output exceeds that budget you get a head+tail +view inline plus a pointer to the full output — not the whole thing: + +- **`shell_execute`** spills the full (redacted) output to + `{session}/tool-calls/{toolCallId}.log` and gives you the path. Read a slice with + `file_read` (`StartLine`/`Limit`) or `grep` it — do NOT re-run the command to see more. +- **`file_read`** on a large file returns the head and steers you to read a + specific range with `StartLine`/`Limit` or `grep` (`StartLine` is a 1-based line + number — line 1 is the first line). Don't `cat` a huge file through + `shell_execute` to get around it — that just spills again. +- **`background_job`** output goes to `~/.netclaw/jobs/{id}/output.log` (bounded); + `check_background_job` returns a tail, and you can `file_read`/`grep` the log for the rest. + +Reading a targeted range or grepping is always cheaper than re-running a command or +re-reading a whole file. Secret-bearing values are redacted from all tool output. + ## Tool Discovery MCP tools are not loaded by default. Use `search_tools` to discover them: diff --git a/openspec/changes/bound-tool-output-with-file-spill/.openspec.yaml b/openspec/changes/bound-tool-output-with-file-spill/.openspec.yaml new file mode 100644 index 000000000..0ba725fbf --- /dev/null +++ b/openspec/changes/bound-tool-output-with-file-spill/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-06-03 diff --git a/openspec/changes/bound-tool-output-with-file-spill/design.md b/openspec/changes/bound-tool-output-with-file-spill/design.md new file mode 100644 index 000000000..5bc714de2 --- /dev/null +++ b/openspec/changes/bound-tool-output-with-file-spill/design.md @@ -0,0 +1,176 @@ +## Context + +Three tool paths capture external output and feed it to the model: +`shell_execute` (process pipes), `background_job` (process pipes, long-running), +and `file_read` (a file on disk). Originally they bounded output inconsistently: + +- `shell_execute` drained its pipes into a head+tail window capped at + `ToolConfig.MaxOutputChars` (32000) via `BoundedDrainAsync` (#1293), in the tool. +- The session pipeline then re-clamped **every** tool result to + `SessionTuning.MaxInlineToolResultChars` (12000), **head-only**, in + `SessionToolExecutionPipeline.ClampToolResult`. +- `background_job` and the default `file_read` path did not bound memory at all — + they materialized the full output/file as a managed string before trimming + (#1300, #1301). + +Two truncation stages that disagreed on budget (32000 vs 12000) and policy +(head+tail vs head-only), plus two unbounded OOM paths. Both Claude Code and +opencode ship the same "bound inline, spill the rest to a file, steer the model +to read ranges / grep" pattern; this change adopts it. + +**The load-bearing discovery (from the implementation review):** every tool +result already funnels through **one chokepoint** — `DispatchingToolExecutor`, +which both the main session pipeline and sub-agents call to run a tool, and which +**already redacts every result centrally** (`SecretOutputRedactor.Redact`, before +`ClampToolResult` ever runs). That makes the dispatcher — not the tool, not the +pipeline clamp — the correct home for bounding + spill: it sees every result, for +every audience, already redacted. Putting it anywhere else (per-tool, or in the +main-session-only `ClampToolResult`) either duplicates the logic or misses the +sub-agent path. + +## Goals / Non-Goals + +**Goals:** + +- One place that bounds tool output to an inline budget and, on overflow, spills + the full (redacted) output to a session file with a steer — uniform for the + main session and sub-agents. +- A budget that fits the tool: small for *verbose* tools (shell), generous for + *content* tools (file_read, web_fetch, MCP, memory) the model fetched to read. +- Bounded memory on every capture path — no path materializes arbitrarily large + output as a managed string (closes #1300, #1301 OOM). +- A single shared bounded-output reader so the ring/window logic is reviewed and + fixed once, not copy-pasted (the #1293 review's altitude finding). + +**Non-Goals:** + +- Spill-file retention / cleanup (owned by the session-log-cleanup issue). +- Streaming redaction / byte-complete spill of multi-GB floods (the capture + ceiling bounds the spill; see D7 risks). +- A new redaction abstraction — the central `SecretOutputRedactor` already covers + every result. +- Inferring from a shell command's AST that its output equals an existing file + (rejected: output ≠ a referenced file once piped/filtered; false positives + mislead). + +## Decisions + +### D1 — Bounding + spill live in `DispatchingToolExecutor`, the one chokepoint + +Immediately after the dispatcher's existing central redaction, bound the result +to the resolved budget and, when it overflows, window it head+tail and spill the +full redacted result to a session file with a steer. Because every tool result +(main + sub-agent) passes through the dispatcher already-redacted, this is a +single uniform stage and the spilled file is redacted for free. + +- *Alternative — spill in the tool:* rejected. It duplicates logic across tools, + pushes the spill into N places, and the spill body would have to be re-redacted + per tool. +- *Alternative — spill in `ClampToolResult` (pipeline):* rejected. That path is + main-session only; sub-agents call the dispatcher but not the pipeline clamp, so + this would miss them (the exact gap the review found). + +### D2 — `ClampToolResult` is retired (single truncation stage) + +With the dispatcher owning the bound, a second pipeline clamp would re-window an +already-windowed+steered result (slicing its tail, stacking a second marker). The +`ClampToolResult` calls and method are removed; the dispatcher is the only stage. + +### D3 — Per-tool inline budgets: one content default, opt-in verbose override + +The budget is a property of the tool, not a policy branch in the dispatcher. +`INetclawTool.InlineOutputBudgetChars` defaults to `0` = "use the session content +budget" (`SessionTuning.MaxInlineToolResultChars`, default **12000**). Verbose +tools override it: `shell_execute` returns **2000**, because shell output is noise +the model skims. Content tools (file_read, web_fetch, web_search, memory, MCP) +take the 12000 content budget because the model fetched them to read in full — +truncating those to a tiny window would force a wasteful extra `file_read` +round-trip. The dispatcher resolves `tool.InlineOutputBudgetChars` ?? content +default. Not a `shell` special-case, not per-tool busywork. + +- *Alternative — one global `N`:* rejected. A single small `N` starves content + tools; a single large `N` bloats shell context. Verbose-vs-content is genuinely + a per-tool property. + +### D4 — Spill file `{SessionDirectory}/tool-calls/{toolCallId}.log` + +The dispatcher has `toolCall.CallId` (a method parameter) and +`context.SessionDirectory`, so it names the spill per call with no extra plumbing +on `ToolExecutionContext`. The call id is sanitized against path traversal before +use as a filename. No session dir / call id → degrade to inline-only window. + +### D5 — Redaction stays central; tools and the spill helper do not redact + +`DispatchingToolExecutor` already redacts every result with `SecretOutputRedactor` +before bounding, so the inline result and the spilled file are both redacted from +one pass — no new abstraction, no per-tool or per-chunk redaction. `file_read`'s +own redact-on-read was therefore redundant and is removed (#1301's "no redaction" +half was a false alarm — the dispatcher covered it; only its OOM half was real). + +### D6 — Tools shrink to "bound capture (OOM) + return raw" + +Tools no longer window, redact, or spill — they only bound their own *capture* for +memory safety and return the raw bounded string; the dispatcher does the rest. +`shell_execute` drains both pipes to `MaxOutputChars` and returns the combined +output. `file_read` reads a bounded head (`ReadBoundedHeadAsync`, up to +`MaxOutputChars` — never `ReadAllTextAsync`) and returns it; the dispatcher then +applies the content budget + spill uniformly. The offset/limit path +(`ReadLinesAsync`) is already bounded and unchanged. + +### D7 — Combined (not per-stream) bound for `shell_execute` + +`shell_execute` drains stdout and stderr each to `MaxOutputChars`, assembles the +combined output, and re-windows the **combined** back to `MaxOutputChars` so the +captured/spilled body is bounded by the ceiling (not 2×). This also resolves the +per-stream doubling the review flagged. + +### D8 — `MaxOutputChars` is the capture ceiling + +`ToolConfig.MaxOutputChars` (raised 32000 → **256000**) is the memory/disk bound +on what a tool captures — the body that becomes the spill. It is independent of +the inline budget `N` (which the dispatcher applies). The pipe still drains past +it to avoid child deadlock; the discarded middle leaves a `...` gap in the spill. + +### D9 — `background_job` keeps its own log path + +`background_job` is not an inline tool result (it delivers a tail + an on-disk log +via an actor message), so it does **not** go through the dispatcher spill. It uses +the shared `BoundedOutputReader` to drain its pipes in bounded memory (closing +#1300) and marks output that exceeded its capture ceiling. + +## Risks / Trade-offs + +- **Capture ceiling clips the spill for a true flood** → e.g. `cat` of a 2 GB + file yields a ~512 KB head+tail spill with a `...` gap, not a duplicate. The + steer says "output saved to {path}" (not "full"), honest in both cases; the + skill steers the agent to `file_read` the source for files. +- **Verbose budget (2000) is small** → intended; shell output is skimmable and + the full output is one `file_read`/`grep` away. +- **Lowering then restoring the default** → `MaxInlineToolResultChars` stays + 12000 (content); only `shell_execute` is aggressive. Configs that set it + explicitly are unaffected. +- **Sub-agent spills land in the parent session dir** → acceptable; the parent + owns the directory and its access scope. + +## Migration Plan + +1. Land `BoundedOutputReader` (shared reader) — no behavior change. +2. Add the per-tool budget rail (`INetclawTool.InlineOutputBudgetChars`). +3. Move bound+spill into `DispatchingToolExecutor`; retire `ClampToolResult`. +4. Shrink the tools (shell/file_read) to bound-capture-and-return; remove their + redaction. +5. `background_job` bounded drain + ceiling marker. +6. Update the `netclaw-operations` skill + eval cases. + +Rollback: re-instate `ClampToolResult` and remove the dispatcher bound+spill; the +in-tool capture bounds (the OOM fixes) remain. + +## Open Questions + +- **`background_job` byte-complete log:** capped at `MaxCapturedOutputChars` + (256000); a long job's log is a head+tail view. Revisit if a user needs full + job logs (would need a larger ceiling or streaming redaction). +- **Spill for content tools with a natural backing (`file_read`):** today the + dispatcher may spill a bounded copy of file content. The copy is redacted and + capped, so it is harmless but slightly redundant with the original file; left + as-is rather than special-casing `file_read` out of the uniform path. diff --git a/openspec/changes/bound-tool-output-with-file-spill/proposal.md b/openspec/changes/bound-tool-output-with-file-spill/proposal.md new file mode 100644 index 000000000..7d833d0a3 --- /dev/null +++ b/openspec/changes/bound-tool-output-with-file-spill/proposal.md @@ -0,0 +1,108 @@ +## Why + +Large tool output is bounded inconsistently across the codebase, and the two +truncation stages that do exist disagree. `shell_execute` now bounds its pipe +reads to a head+tail window (`BoundedDrainAsync`, #1293), but the session +pipeline then re-clamps every tool result to `MaxInlineToolResultChars` +**head-only** — so the tail that `BoundedDrainAsync` worked to preserve (errors, +exit status) is discarded before the model sees it, and `stderr` can be dropped +entirely. Meanwhile two sibling paths still materialize unbounded output in +memory and can OOM a memory-limited daemon: `background_job` output capture +(#1300) and the default `file_read` path (#1301); `file_read` also returns file +contents with no secret redaction at all. + +The fix is to make output bounding a single, shared, deliberate mechanism: +one budget, one policy (head + tail), and — when output exceeds the budget — +spill the full output to a session-scoped file and hand the model the path with +a hint to read ranges or `grep` instead of re-running. This is the pattern both +Claude Code and opencode ship, and it is the pattern `background-job-execution` +already half-specifies (tail + output-file path). + +No dedicated PRD exists; this originates from the #1293 production OOM incident +and its review (#1300, #1301). It should be linked from a PRD if one is opened. + +## What Changes + +- Bound tool output and spill the overflow in **one place** — + `DispatchingToolExecutor`, the chokepoint every tool result already passes + through (main session and sub-agents) and where the central + `SecretOutputRedactor.Redact` already runs. Right after redaction it windows the + result to the tool's inline budget and, on overflow, spills the full redacted + result to `sessionDir/tool-calls/{toolCallId}.log` with a steer to `file_read` + (offset/limit) or `grep`. The spilled file is redacted for free. +- **Per-tool inline budgets.** Add `INetclawTool.InlineOutputBudgetChars` (default + `0` = use the session content budget). `shell_execute` overrides to **2000** + (verbose output the model skims); content tools (`file_read`, `web_fetch`, + `web_search`, memory, MCP) use the **12000** content budget because the model + fetched them to read in full. +- **Retire `ClampToolResult`.** The dispatcher is the single truncation stage; + the head-only pipeline clamp is removed (no double-clamp). +- `SessionTuning.MaxInlineToolResultChars` stays **12000** (the content default); + only verbose tools opt down. `ToolConfig.MaxOutputChars` becomes the **capture + ceiling** (32000 → **256000**) — the memory/disk bound on what a tool captures. +- Tools shrink to "bound capture (OOM) + return raw": `shell_execute` drains both + pipes to the ceiling and returns the combined output (no per-tool window / + redact / spill); `file_read` reads a bounded head (no `ReadAllTextAsync`) and + returns it (closes #1301 OOM). The central dispatcher redaction already covered + `file_read`, so its redundant redact-on-read is removed (#1301's redaction half + was a false alarm). +- `background_job`: bounded pipe drain via the shared reader + a capture-ceiling + marker (closes #1300 OOM). It keeps its own on-disk log, not the dispatcher + spill. + +### Out of scope (this change) + +- Spill-file lifecycle/cleanup (tracked separately; the session-log cleanup + issue owns retention/sweep). +- A byte-complete (unbounded) spill: capture is bounded by `MaxOutputChars`, so a + multi-hundred-MB flood is captured head+tail, not in full. See design D7/D8. +- Inferring from a shell command's AST that its output equals an existing file + (rejected — output ≠ a referenced file once piped/filtered). +- **Media egress (#1296)** — image/AV bytes sent *to* the model. It shares the + "don't `ReadAllBytes` a huge thing" lesson but needs a different fix + (downscale/streamed-encode/provider file APIs), so it stays a separate change; + this one closes only the **text** tool-output paths (#1300, #1301). + +## Capabilities + +### New Capabilities + +- `bounded-tool-output`: the cross-cutting contract — `DispatchingToolExecutor` + bounds every (already-redacted) tool result to the tool's inline budget and + spills the overflow to a session file with a steer; per-tool budgets + (`InlineOutputBudgetChars`, content default vs verbose override); a shared + bounded-output reader; the capture ceiling. Bounded memory on every path. + +### Modified Capabilities + +- `netclaw-tools`: `shell_execute` and `file_read` shrink to bounded-capture-and- + return; `shell_execute` declares the small verbose budget and bounds combined + stdout+stderr; `file_read` reads a bounded head (no full materialization). The + truncation requirement moves from a per-tool indicator to the central + dispatcher's window + spill + steer. +- `netclaw-session`: tool-result inlining no longer clamps in the pipeline + (`ClampToolResult` removed); the dispatcher is the single bounding+spill stage. +- `background-job-execution`: output capture SHALL bound memory (shared reader + + ceiling marker) rather than buffering the full output before trimming. + +## Impact + +- **Code:** `DispatchingToolExecutor` (bound+spill), `ToolOutputSpill` (the + bound+spill helper), `INetclawTool`/`NetclawTool` (`InlineOutputBudgetChars` + rail), `ShellTool` (declares 2000; bound combined; drop redact/spill), + `FileReadTool` (bounded head; drop redundant redaction), `BoundedOutputReader` + (shared reader), `BackgroundJobExecutionActor` (bounded drain + marker), + `SessionToolExecutionPipeline` (`ClampToolResult` removed), `ToolExecutionContext` + (`MaxInlineToolResultChars` content budget; `ToolCallId` removed — the + dispatcher uses `toolCall.CallId`). +- **Config:** `SessionTuning.MaxInlineToolResultChars` stays 12000 (content); + `ToolConfig.MaxOutputChars` 32000 → 256000 (capture ceiling; schema default + updated). +- **Security:** redaction unchanged in placement (central dispatcher) — the spill + file is redacted because redaction runs before it; spill files live under the + session directory and inherit its access scope. +- **Operational:** large tool outputs produce a session-dir `.log` the agent + reads on demand; the model is steered toward ranged reads/`grep`. Closes #1300 + and #1301; resolves the #1293 review's two-stage-truncation tension. +- **Evals/skills:** tool-output behavior changes — update `netclaw-operations` + (done) and any eval cases asserting on tool-result truncation/format. diff --git a/openspec/changes/bound-tool-output-with-file-spill/specs/background-job-execution/spec.md b/openspec/changes/bound-tool-output-with-file-spill/specs/background-job-execution/spec.md new file mode 100644 index 000000000..a7a7efe3b --- /dev/null +++ b/openspec/changes/bound-tool-output-with-file-spill/specs/background-job-execution/spec.md @@ -0,0 +1,76 @@ +## MODIFIED Requirements + +### Requirement: Background job execution + +The system SHALL spawn a `BackgroundJobExecutionActor` child of the manager +for each background job. The execution actor SHALL start the process, capture +stdout/stderr to `~/.netclaw/jobs/{id}/output.log`, and monitor for process +exit. Output capture SHALL bound peak managed memory per the `bounded-tool-output` +capability — the execution actor SHALL stream output to the log and retain only a +bounded tail in memory for the completion message, and SHALL NOT materialize the +full output as a single in-memory string before trimming. On process exit, the +actor SHALL deliver the result to the originating session via +`DeliverTrustedSessionTurn`. This trusted delivery SHALL be an intentional parity +decision with normal synchronous shell tool results, not a trust escalation +beyond the original tool execution. On timeout, the actor SHALL kill the entire +process tree. + +#### Scenario: Process started and monitored + +- **GIVEN** a `StartBackgroundJob` command is accepted +- **WHEN** the execution actor starts +- **THEN** the process is spawned with stdin closed +- **AND** stdout/stderr are captured to the output log file +- **AND** the manager returns a `BackgroundJobStarted` response with the job ID + +#### Scenario: Output capture bounds memory + +- **GIVEN** a background job process emits output far larger than the capture + ceiling +- **WHEN** the execution actor captures it +- **THEN** peak managed memory stays on the order of the capture ceiling +- **AND** the full output is not held as a single in-memory string +- **AND** the daemon is not OOM-killed by the capture + +#### Scenario: Process completes successfully + +- **GIVEN** a background job process exits with code 0 +- **WHEN** the execution actor detects the exit +- **THEN** the result (exit code, bounded output tail, output file path, original + rationale) is delivered to the originating session via + `DeliverTrustedSessionTurn` +- **AND** the job definition is updated to completed status + +#### Scenario: Process fails + +- **GIVEN** a background job process exits with non-zero code +- **WHEN** the execution actor detects the exit +- **THEN** the failure result (exit code, error output, original rationale) is + delivered to the originating session via `DeliverTrustedSessionTurn` +- **AND** the job definition is updated to failed status + +#### Scenario: Process timeout + +- **GIVEN** a background job exceeds its timeout +- **WHEN** the timeout fires +- **THEN** the entire process tree is killed +- **AND** a timeout error is delivered to the originating session +- **AND** the job definition is updated to timed-out status + +#### Scenario: Job result delivery to passivated session + +- **GIVEN** a background job completes +- **AND** the originating session has been passivated (idle timeout) +- **WHEN** the result is delivered via `DeliverTrustedSessionTurn` +- **THEN** the session rehydrates from the Akka Persistence journal +- **AND** processes the result turn + +#### Scenario: Trusted delivery preserves synchronous shell parity + +- **GIVEN** a background job was started from an approved `shell_execute` call +- **WHEN** the job completes +- **THEN** the completion is delivered as a trusted turn +- **AND** that trust matches the trust level of the same shell result if it had + completed synchronously +- **AND** no broader trust is granted than the persisted originating + audience/boundary diff --git a/openspec/changes/bound-tool-output-with-file-spill/specs/bounded-tool-output/spec.md b/openspec/changes/bound-tool-output-with-file-spill/specs/bounded-tool-output/spec.md new file mode 100644 index 000000000..a05ccd6f4 --- /dev/null +++ b/openspec/changes/bound-tool-output-with-file-spill/specs/bounded-tool-output/spec.md @@ -0,0 +1,92 @@ +## ADDED Requirements + +### Requirement: Central bound + spill for every tool result + +`DispatchingToolExecutor` SHALL bound every tool result to an inline budget and, +when the result exceeds that budget, return a head+tail window of the budget plus +a steer pointing the model at the full output. This applies uniformly to every +tool, for the main session and for sub-agents (both run tools through the +dispatcher). The bound SHALL be applied after the dispatcher's central secret +redaction, so the inline result and any spilled file are redacted from one pass. +Individual tools SHALL NOT window, redact, or spill their results; they only bound +their own capture for memory safety and return the raw bounded result. + +#### Scenario: Result under budget returned unchanged + +- **WHEN** a tool returns a result at or below its inline budget +- **THEN** the dispatcher returns it unchanged +- **AND** no spill file is created + +#### Scenario: Result over budget windowed to head and tail + +- **WHEN** a tool returns a result larger than its inline budget +- **THEN** the inline result contains the head and the tail of the result within + the budget, with the discarded middle marked + +#### Scenario: Same bounding for sub-agent tool calls + +- **WHEN** a sub-agent runs a tool that returns an oversized result +- **THEN** the same dispatcher bound + spill applies (sub-agents are not exempt) + +### Requirement: Per-tool inline budget + +The inline budget SHALL be a property of the tool. `INetclawTool.InlineOutputBudgetChars` +SHALL default to `0`, meaning "use the session content budget" +(`SessionTuning.MaxInlineToolResultChars`). A tool MAY override it: verbose tools +(e.g. `shell_execute`) override to a small value because their output is skimmed; +content tools keep the larger content budget because the model fetched them to +read in full. The dispatcher SHALL resolve the budget as the tool's override when +positive, else the session content budget. + +#### Scenario: Verbose tool bounded small + +- **GIVEN** a tool declares a small `InlineOutputBudgetChars` (e.g. shell) +- **WHEN** its output exceeds that small budget +- **THEN** the dispatcher windows it to the small budget and spills the rest + +#### Scenario: Content tool bounded by the session content budget + +- **GIVEN** a tool with no override (e.g. file_read, web_fetch, MCP) +- **WHEN** its output is at or below the content budget +- **THEN** the full result is returned inline with no spill + +### Requirement: Spill to a session-scoped file with a steer + +When a result exceeds its inline budget, the dispatcher SHALL write the full +(already-redacted) result to `{SessionDirectory}/tool-calls/{toolCallId}.log`, +using the call's identifier (sanitized so it cannot escape the directory), and +SHALL append a steer with the path directing the model to read a slice (`file_read` +with offset/limit) or `grep` rather than re-running the tool. When no session +directory or call id is available, the dispatcher SHALL return the inline window +without spilling. + +#### Scenario: Spill file written and path returned + +- **WHEN** a result over budget is produced in a session with a directory +- **THEN** the full redacted result is written to + `{SessionDirectory}/tool-calls/{toolCallId}.log` +- **AND** the inline result includes that path and a steer to `file_read`/`grep` + +#### Scenario: Spilled file is redacted + +- **WHEN** a result containing a secret is spilled +- **THEN** the on-disk spill file has the secret redacted (redaction runs before + the spill write) + +#### Scenario: Call id cannot escape the spill directory + +- **WHEN** the call id contains path-traversal characters +- **THEN** the spill file is written inside the tool-calls directory, never outside it + +### Requirement: Bounded-memory capture + +Tools SHALL bound their own capture so peak managed memory is on the order of the +capture ceiling (`ToolConfig.MaxOutputChars`), independent of total output size. +No tool SHALL materialize the entire output of an arbitrarily large source as a +single in-memory string before bounding it. + +#### Scenario: Large source does not scale memory + +- **WHEN** a tool reads from a source far larger than the capture ceiling +- **THEN** peak managed allocation stays on the order of the capture ceiling +- **AND** the process is not OOM-killed by the capture diff --git a/openspec/changes/bound-tool-output-with-file-spill/specs/netclaw-session/spec.md b/openspec/changes/bound-tool-output-with-file-spill/specs/netclaw-session/spec.md new file mode 100644 index 000000000..c0c6cb8a7 --- /dev/null +++ b/openspec/changes/bound-tool-output-with-file-spill/specs/netclaw-session/spec.md @@ -0,0 +1,33 @@ +## MODIFIED Requirements + +### Requirement: Tool execution encapsulation + +Tool execution SHALL be encapsulated in a `SessionToolExecutionPipeline` static +utility class. The pipeline SHALL execute tool calls in parallel, track sub-agent +activity, and send `ToolExecutionCompleted` or `ToolExecutionFailed` back to the +actor. The pipeline SHALL NOT itself bound or clamp tool-result size: bounding the +result to the inline budget and spilling the overflow is done once, centrally, by +`DispatchingToolExecutor` (per the `bounded-tool-output` capability), so the +pipeline stores the result the dispatcher already bounded. `SessionTuning.MaxInlineToolResultChars` +is the session **content** budget the dispatcher uses for tools without a smaller +per-tool override. + +#### Scenario: Parallel tool execution + +- **GIVEN** an LLM response contains 3 tool calls +- **WHEN** `SessionToolExecutionPipeline.ExecuteToolsAsync()` runs +- **THEN** all 3 tool calls execute in parallel +- **AND** results are collected and sent as a single `ToolExecutionCompleted` + +#### Scenario: Tool execution timeout + +- **GIVEN** tool execution is in progress +- **WHEN** the configured `ToolExecutionTimeout` elapses +- **THEN** the pipeline sends `ToolExecutionFailed` with a `TimeoutException` + +#### Scenario: Oversized result already bounded by the dispatcher + +- **GIVEN** a tool returns an oversized result +- **WHEN** it reaches the pipeline +- **THEN** the pipeline stores it as-is (already windowed + spilled by + `DispatchingToolExecutor`) without re-clamping diff --git a/openspec/changes/bound-tool-output-with-file-spill/specs/netclaw-tools/spec.md b/openspec/changes/bound-tool-output-with-file-spill/specs/netclaw-tools/spec.md new file mode 100644 index 000000000..78ff1f0b9 --- /dev/null +++ b/openspec/changes/bound-tool-output-with-file-spill/specs/netclaw-tools/spec.md @@ -0,0 +1,86 @@ +## MODIFIED Requirements + +### Requirement: Shell execution tool + +The system SHALL provide a shell execution tool that runs commands as the +Netclaw process user context. Stdin SHALL be closed (no interactive commands). +Execution SHALL enforce a configurable timeout (default: 60 seconds). The tool +SHALL drain stdout and stderr in bounded memory (each to the capture ceiling +`ToolConfig.MaxOutputChars`) and return the combined output bounded to the +ceiling — it does NOT itself window, redact, or spill (the central +`bounded-tool-output` mechanism does, after redaction). `shell_execute` SHALL +declare a small verbose inline budget (`InlineOutputBudgetChars`) so its skimmable +output is bounded aggressively. Before execution, the shell tool SHALL check the +hard deny list via `ShellCommandPolicy`; hard-denied commands SHALL be rejected +before `ToolPathPolicy` path checks. + +#### Scenario: Execute command and return output + +- **GIVEN** the `shell` grant is available for the session +- **WHEN** the agent invokes the shell tool with a command +- **THEN** the command is executed as the Netclaw process user +- **AND** stdout and stderr are captured +- **AND** the combined output is returned to the LLM + +#### Scenario: Hard-denied command rejected before execution + +- **GIVEN** the agent invokes `shell_execute` with `netclaw daemon stop` +- **WHEN** `ShellCommandPolicy` evaluates the command +- **THEN** the command is rejected with "Command blocked by hard deny policy" +- **AND** the shell process is never started + +#### Scenario: Execution timeout enforced + +- **GIVEN** a shell command is running +- **WHEN** the command exceeds the configured timeout (default: 60 seconds) +- **THEN** the process is terminated +- **AND** the tool returns a timeout error message to the LLM + +#### Scenario: Combined output bounded by the capture ceiling + +- **GIVEN** a shell command writes large output to both stdout and stderr +- **WHEN** the output is captured +- **THEN** the returned combined output is bounded by `MaxOutputChars` (one shared + ceiling, not a per-stream cap) +- **AND** the dispatcher applies the inline budget + spill + steer on top + (per `bounded-tool-output`) + +#### Scenario: Stdin closed prevents interactive commands + +- **GIVEN** the agent invokes the shell tool with a command +- **WHEN** the process is created +- **THEN** stdin is closed immediately +- **AND** commands that require interactive input fail promptly + +#### Scenario: Working directory set to project path + +- **GIVEN** the session is associated with a registered project +- **WHEN** the shell tool executes a command +- **THEN** the working directory is set to the project's registered path + +## ADDED Requirements + +### Requirement: File read tool bounds its read for memory safety + +The `file_read` tool's default (no `offset`/`limit`) path SHALL read a bounded +head of the file (up to `ToolConfig.MaxOutputChars`) and stop — it SHALL NOT read +the entire file into memory before truncating. The existing line-range +(`offset`/`limit`) path SHALL remain bounded. `file_read` SHALL NOT redact its +result itself; the central `DispatchingToolExecutor` redaction covers it. The +inline bound + spill (if any) is applied centrally per `bounded-tool-output`; +`file_read` is a content tool and uses the session content budget. + +#### Scenario: Large file is read in bounded memory + +- **WHEN** the agent reads a file larger than the capture ceiling with no + `offset`/`limit` +- **THEN** the tool reads only a bounded head and does not materialize the whole + file in memory +- **AND** it appends a steer to read a specific range (`offset`/`limit`) or `grep` + +#### Scenario: Secrets in a read file are redacted by the dispatcher + +- **GIVEN** a file contains a secret-bearing value (e.g. an API key) +- **WHEN** the agent reads the file +- **THEN** the result returned to the model has the secret redacted (by the + central dispatcher redaction) diff --git a/openspec/changes/bound-tool-output-with-file-spill/tasks.md b/openspec/changes/bound-tool-output-with-file-spill/tasks.md new file mode 100644 index 000000000..4050a14fa --- /dev/null +++ b/openspec/changes/bound-tool-output-with-file-spill/tasks.md @@ -0,0 +1,159 @@ +## 1. Shared bounded-output reader (foundation, no behavior change) + +- [x] 1.1 Extract `BoundedDrainAsync` from `ShellTool` into a reusable + `BoundedOutputReader` (Netclaw.Actors/Tools), keeping the #1293 ring core + (pooled buffer, `ValueTask` reads, block-copy tail ring) +- [x] 1.2 Expose `DrainToWindowAsync(TextReader, int budget, ct) → (string Text, + bool Truncated)` (head+tail, in-memory only) +- [x] 1.3 Expose `DrainCaptureAsync(TextReader, int captureMax, int inlineBudget, + ct) → (string Captured, string Inline, bool CeilingExceeded)` over a shared + core, plus a pure `Window(string, budget)` helper; drain past `captureMax` +- [x] 1.4 Move/adapt the `BoundedDrainAsync` unit tests onto `BoundedOutputReader` + (`DrainToWindow_*`), add `Window` + `DrainCapture` coverage +- [x] 1.5 Repoint `ShellTool` and the benchmark at + `BoundedOutputReader.DrainToWindowAsync` with no behavior change; 40 reader + + shell tests green + +## 2. Tool-call id and inline budget plumbed to the capture layer + +- [x] 2.1 Add `ToolCallId` (typed value object, nullable) to `ToolExecutionContext` +- [x] 2.2 Set `ToolCallId` in `BuildToolExecutionContext` (per-call) from `tc.CallId` +- [x] 2.3 Add `MaxInlineToolResultChars` to `ToolExecutionContext` and thread `N` + through `BuildToolExecutionContext` so tools bound to the same `N` the + pipeline enforces +- [x] 2.4 Carry/distinctness is compiler-enforced (per-call context) + verified + end-to-end by the task-4 spill test ({callId}.log); skip a trivial + assignment test per the testing guidelines. (Sub-agent/direct-construction + contexts default to null id + 0 budget — fallback handled in task 3/4.) + +## 3. Spill writer + steering message + +- [x] 3.1 Add `ToolOutputSpill.RenderAsync`: redact the bounded capture once, + window the inline from the redacted text, and write + `{sessionDirectory}/tool-calls/{toolCallId}.log` (call id sanitized against + path traversal). Removed the redundant `DrainCaptureAsync` — the real flow + is DrainToWindow → redact → Window → spill (inline must come from the + *redacted* capture), so DrainToWindow + Window + the spill helper supersede it. +- [x] 3.2 Steering message (path + "read a slice with file_read offset/limit or + grep instead of re-running") + a "capture ceiling exceeded" note +- [x] 3.3 Tests: under-budget verbatim; over-budget spill written + redacted-on-disk + + steer; ceiling note; no-session degrade; path-traversal call id contained. + 15 reader+spill tests pass; slopwatch clean + +## 4. shell_execute adopts capture + spill + +- [x] 4.1 Switch `ShellTool` to combined-capture (one shared budget across + stdout+stderr via `DrainToWindowAsync` per stream → assembled) then + `ToolOutputSpill.RenderAsync`, which redacts once, windows to `N`, and + spills+steers. Removed ShellTool's own per-stream redaction and markers. +- [x] 4.2 Replaced `Output_truncation_applies` with `Large_output_spills_to_file_and_steers` + (asserts inline head+tail + spill file + steer); kept the Windows-deterministic + `echo`. Redaction/echo/stderr tests still pass (redaction now in RenderAsync). +- [x] 4.3 Drains-past-ceiling behavior is `DrainToWindowAsync`'s (proven by the + reader tests); the existing cancellation/kill test covers the no-deadlock path. + +## 5. background_job bounded capture (closes #1300) + +- [x] 5.1 `BackgroundJobExecutionActor` now drains each stream via + `BoundedOutputReader.DrainToWindowAsync` (capture ceiling + `MaxCapturedOutputChars = 256000`) instead of `ReadToEndAsync` — bounded + memory; the log is head+tail for floods larger than the ceiling. Closes #1300. +- [x] 5.2 Redact-on-write unchanged (the existing `SecretOutputRedactor.Redact` + now runs over the bounded combined output before the log write) +- [x] 5.3 Bounding is `DrainToWindowAsync`'s (unit-tested); the existing + BackgroundJob integration tests exercise the new drain path end-to-end. + +## 6. file_read bounded reads + redaction (closes #1301) + +- [x] 6.1 Default `file_read` path now uses `ReadBoundedHeadAsync` (reads up to the + limit and stops — bounds memory AND I/O) instead of `ReadAllTextAsync` + + `TruncateFileOutput`; no spill (the file is its own backing). Closes #1301. + (Head-only, not head+tail: a file is read top-down via Offset/Limit, and + head+tail would require reading the whole file to reach the tail.) +- [x] 6.2 Over-budget steer: "read a specific range with Offset and Limit, or grep" +- [x] 6.3 Redact-on-read via `SecretOutputRedactor` on both the default and the + `ReadLinesAsync` (offset/limit) return paths; offset/limit path stays bounded +- [x] 6.4 Tests: large file returns a bounded head (first N only, not all 500 + chars) + steer; secret in a read file is redacted. 2195 actor tests pass + +## 7. Config + pipeline unification + +- [x] 7.1 Lowered `SessionTuning.MaxInlineToolResultChars` default 12000 → 2000 +- [x] 7.2 Repurposed `ToolConfig.MaxOutputChars` as the capture ceiling, default + 32000 → 256000 (docs updated to reflect the new role) +- [x] 7.3 `ClampToolResult` now head+tail (reuses `BoundedOutputReader.Window`); + safety net for non-shared-reader results (MCP, in-process) +- [x] 7.4 Updated `netclaw-config.v1.schema.json` MaxOutputChars default → 256000 + (MaxInlineToolResultChars schema has only `minimum:100`, which 2000 satisfies). + 363 config tests + 2194 actor tests pass; slopwatch clean + +## 8. Quality gates, docs, eval, and OpenSpec close-out + +- [x] 8.1 `dotnet slopwatch analyze` clean (run per group); headers on all new .cs +- [x] 8.2 Updated `netclaw-operations` skill (new "Large tool output" section: + spill to session file, steer to ranged reads/grep, bounded job log) and + bumped `metadata.version` 2.8.9 → 2.9.0 +- [x] 8.3 Eval cases flagged + the two coverage gaps ADDED + suite run against + spark2 (Qwen3.6-35B, openai-compatible): + - **At-risk-but-robust (verified still green, 5/5 each):** + `complex_diagnose_self`, `complex_gh_issues`, `complex_write_and_run`, + `multi_turn_tool_repeat`, `multi_turn_tool_carryover`. Their asserts check + `[tool:call] shell_execute` + a keyword, not exact tool-result content — + so truncating output to N=2000 + spill doesn't break them. + - **New coverage cases added** (Category 7, `evals/run-evals.sh`), both 5/5. + They assert on OUTCOME, not mechanism: the prompts state only the goal and + give ZERO handling hints (no mention of spill/redirect/re-run/file_read/ + Offset/grep) — coping with oversized output must come from AGENTS.md, the + netclaw-operations skill, and the tool-result steer text, or the eval is + just testing instruction-following. Data is a deterministic-but-opaque + Lehmer PRNG (`x=(x*48271)%2147483647`, identical across awk impls), so a + deep-line value is reproducible AND un-fabricatable; since one read is + bounded to ~N inline, retrieving a deep line is only possible by paging — + so a correct value *implies* correct handling (no mechanism assertion). + 1. `complex_large_shell_output_spill` — "run `awk '…'` and tell me the + number on line 200" (a bare safe-verb generator, no path tokens → + auto-approves headless; ~210 KB stdout → spills). Asserts the answer + contains line 200's value (872671849). Whether the agent reads the + spill or self-redirects+reads, the outcome assertion is satisfied. + 2. `complex_large_file_read_ranged` — a ~314 KB file pre-seeded by + `start_eval_daemon` under the workspaces read-root; "list lines + 4997–5003 of ". Asserts the answer contains line 5000's value + (1629331733). Line 5000 is ~52 KB in (past the inline window), so a + correct answer proves the agent paged with `file_read` Offset/Limit. + - **Finding (separate from this change):** the model reliably pages with + `file_read` Offset/Limit but treats `Offset` as **0-based** despite its + "(1-based)" param description (e.g. `Offset:4999` for "line 5000"). The + window prompt tolerates this so the case measures bounded-output paging, + not index arithmetic; the off-by-one itself likely warrants a clearer + `file_read.Offset` description / operations-skill note as a follow-up. + - Run: NETCLAW_EVAL_PROVIDER_TYPE=openai-compatible + NETCLAW_EVAL_PROVIDER_ENDPOINT=https://spark2.testlab.petabridge.net/ + NETCLAW_EVAL_MODEL_ID=Qwen/Qwen3.6-35B-A3B-FP8 + NETCLAW_EVAL_CATEGORY="Complex Task Execution" ./evals/run-evals.sh + → Complex Task Execution 5/5 (100%), both new cases 5/5 uncoached. +- [x] 8.4 Added `CaptureBenchmarks` confirming O(ceiling): allocation flat at + ~1255 KB for 256K and 50M chars (vs unbounded before) +- [ ] 8.5 `openspec validate` passes; sync/archive on PR merge (not yet merged) + +## 9. Architecture revision (post-implementation-review) + +A max-effort review of the per-tool spill design surfaced a HIGH-severity gap +(sub-agent tool output unbounded) and that the spill was at the wrong altitude. +The change pivoted; the artifacts above describe the original per-tool approach, +the proposal/design now describe the as-built. Net deltas: + +- [x] 9.1 Move bound+spill into `DispatchingToolExecutor` (the one chokepoint both + main session and sub-agents use, already the central redaction point) — + retire `ClampToolResult`; tools shrink to bound-capture-and-return. +- [x] 9.2 Per-tool inline budgets (`INetclawTool.InlineOutputBudgetChars`): content + default 12000 (restore `MaxInlineToolResultChars`), `shell_execute` opts to + 2000. Content tools (file_read/web_fetch/MCP/memory) no longer truncated to + a tiny window. +- [x] 9.3 Remove `file_read` redundant redaction (central dispatcher covers it); + drop `ToolExecutionContext.ToolCallId` (dispatcher uses `toolCall.CallId`). +- [x] 9.4 Steer wording: "output saved to {path}" (not "full"), honest for a + capture-ceiling-clipped flood. Rejected shell-AST file inference (output ≠ a + referenced file once piped/filtered). +- [x] 9.5 Tests retargeted to the dispatcher (end-to-end verbose-spill, redact- + before-spill, content no-spill); `FakeToolExecutor` mirrors the dispatcher + post-processing. 2196 actor tests pass; slopwatch clean. diff --git a/src/Netclaw.Actors.Tests/Sessions/ToolExecutionIntegrationTests.cs b/src/Netclaw.Actors.Tests/Sessions/ToolExecutionIntegrationTests.cs index f65d84441..b23f6a560 100644 --- a/src/Netclaw.Actors.Tests/Sessions/ToolExecutionIntegrationTests.cs +++ b/src/Netclaw.Actors.Tests/Sessions/ToolExecutionIntegrationTests.cs @@ -249,8 +249,8 @@ await sessionManager.Ask(new SendUserMessage var result = toolMessage!.Contents.OfType().Single().Result?.ToString(); Assert.NotNull(result); - Assert.Contains("tool result truncated", result, StringComparison.OrdinalIgnoreCase); - Assert.True(result!.Length < 300); + Assert.Contains("truncated", result, StringComparison.OrdinalIgnoreCase); + Assert.True(result!.Length < 500); // windowed to the 120-char budget + steer (vs the raw 800) } [Fact] @@ -360,7 +360,7 @@ public Task AuthorizeAsync(FunctionCallContent toolCall, Netclaw.Tools.ToolExecu return Task.CompletedTask; } - public Task ExecuteAsync(FunctionCallContent toolCall, Netclaw.Tools.ToolExecutionContext? context = null, CancellationToken ct = default) + public async Task ExecuteAsync(FunctionCallContent toolCall, Netclaw.Tools.ToolExecutionContext? context = null, CancellationToken ct = default) { Interlocked.Increment(ref _callCount); @@ -370,7 +370,16 @@ public Task ExecuteAsync(FunctionCallContent toolCall, Netclaw.Tools.Too } var result = Results.GetValueOrDefault(toolCall.Name, $"[fake result for {toolCall.Name}]"); - return Task.FromResult(result); + // Mirror DispatchingToolExecutor's post-processing so integration tests see + // the same redact + inline-bound + spill the real executor applies. The fake + // has no tool instance, so it uses the session content budget (per-tool + // verbose overrides are exercised in DispatchingToolExecutorTests). + result = Netclaw.Security.SecretOutputRedactor.Redact(result); + var budget = context?.MaxInlineToolResultChars is > 0 and var b + ? b + : Netclaw.Actors.Tools.ToolOutputSpill.DefaultContentBudget; + return await Netclaw.Actors.Tools.ToolOutputSpill.BoundAndSpillAsync( + result, toolCall.CallId, budget, context, ct); } } diff --git a/src/Netclaw.Actors.Tests/Tools/BoundedOutputReaderTests.cs b/src/Netclaw.Actors.Tests/Tools/BoundedOutputReaderTests.cs new file mode 100644 index 000000000..dd72b9228 --- /dev/null +++ b/src/Netclaw.Actors.Tests/Tools/BoundedOutputReaderTests.cs @@ -0,0 +1,133 @@ +// ----------------------------------------------------------------------- +// +// Copyright (C) 2026 - 2026 Petabridge, LLC +// +// ----------------------------------------------------------------------- +using Netclaw.Actors.Tools; +using Xunit; + +namespace Netclaw.Actors.Tests.Tools; + +public class BoundedOutputReaderTests +{ + // ── DrainToWindowAsync ── + + [Fact] + public async Task DrainToWindow_short_output_returned_verbatim() + { + var input = "hello world"; + var (text, truncated) = await BoundedOutputReader.DrainToWindowAsync(new StringReader(input), 100, CancellationToken.None); + Assert.Equal(input, text); + Assert.False(truncated); + } + + [Fact] + public async Task DrainToWindow_empty_input_returns_empty() + { + var (text, truncated) = await BoundedOutputReader.DrainToWindowAsync(new StringReader(""), 100, CancellationToken.None); + Assert.Equal("", text); + Assert.False(truncated); + } + + [Fact] + public async Task DrainToWindow_output_exactly_at_cap_not_truncated() + { + var input = new string('a', 100); + var (text, truncated) = await BoundedOutputReader.DrainToWindowAsync(new StringReader(input), 100, CancellationToken.None); + Assert.Equal(input, text); + Assert.False(truncated); + } + + [Fact] + public async Task DrainToWindow_long_output_truncated_with_head_and_tail() + { + // 100-char head marker + separator + 100-char tail marker, with filler in the middle + var head = new string('H', 100); + var middle = new string('M', 5000); + var tail = new string('T', 100); + var input = head + middle + tail; + + var (text, truncated) = await BoundedOutputReader.DrainToWindowAsync(new StringReader(input), 200, CancellationToken.None); + + Assert.True(truncated); + Assert.StartsWith(new string('H', 100), text); // head preserved + Assert.EndsWith(new string('T', 100), text); // tail preserved + Assert.Contains("...", text); // separator present + Assert.DoesNotContain("M", text); // middle discarded + } + + [Fact] + public async Task DrainToWindow_head_and_tail_split_evenly() + { + // budget=10 → headCap=5, tailCap=5 + var input = "AAAAAXXXXXXBBBBB"; // 16 chars: 5 head, 6 overflow discard, 5 tail + var (text, truncated) = await BoundedOutputReader.DrainToWindowAsync(new StringReader(input), 10, CancellationToken.None); + + Assert.True(truncated); + Assert.StartsWith("AAAAA", text); + Assert.EndsWith("BBBBB", text); + } + + [Fact] + public async Task DrainToWindow_disabled_cap_returns_full_output() + { + var input = new string('x', 10_000); + var (text, truncated) = await BoundedOutputReader.DrainToWindowAsync(new StringReader(input), 0, CancellationToken.None); + Assert.Equal(input, text); + Assert.False(truncated); + } + + [Fact] + public async Task DrainToWindow_tail_ring_wraps_across_small_chunks() + { + // Drives the ring's wraparound + start-advance path that the StringReader + // tests skip: each read delivers a chunk smaller than tailCap, so the tail + // window is rebuilt incrementally and must wrap rather than reset wholesale. + // budget=10 → headCap=5 ("ABCDE"), tailCap=5; last 5 of "FGHIJKLMNO" = "KLMNO". + var reader = new ChunkedReader("ABCDEFGHIJKLMNO", chunkSize: 3); + + var (text, truncated) = await BoundedOutputReader.DrainToWindowAsync(reader, 10, CancellationToken.None); + + Assert.True(truncated); + Assert.Equal($"ABCDE{Environment.NewLine}...{Environment.NewLine}KLMNO", text); + } + + // ── Window (pure string head+tail) ── + + [Fact] + public void Window_under_budget_returned_unchanged() + { + Assert.Equal("short", BoundedOutputReader.Window("short", 100)); + } + + [Fact] + public void Window_over_budget_keeps_head_and_tail() + { + var input = new string('H', 50) + new string('M', 500) + new string('T', 50); + var result = BoundedOutputReader.Window(input, 100); + + Assert.StartsWith(new string('H', 50), result); + Assert.EndsWith(new string('T', 50), result); + Assert.DoesNotContain("M", result); + } + + // Hands out at most chunkSize chars per read so tests can exercise the tail + // ring's incremental wrap path — real pipe reads arrive in arbitrary slices, + // not the single 4KB gulp a StringReader gives. + private sealed class ChunkedReader(string data, int chunkSize) : TextReader + { + private int _pos; + + public override ValueTask ReadAsync(Memory buffer, CancellationToken cancellationToken = default) + { + var remaining = data.Length - _pos; + if (remaining <= 0) + return ValueTask.FromResult(0); + + var n = Math.Min(Math.Min(chunkSize, buffer.Length), remaining); + data.AsSpan(_pos, n).CopyTo(buffer.Span); + _pos += n; + return ValueTask.FromResult(n); + } + } +} diff --git a/src/Netclaw.Actors.Tests/Tools/DispatchingToolExecutorTests.cs b/src/Netclaw.Actors.Tests/Tools/DispatchingToolExecutorTests.cs index 243e57b69..309d950d5 100644 --- a/src/Netclaw.Actors.Tests/Tools/DispatchingToolExecutorTests.cs +++ b/src/Netclaw.Actors.Tests/Tools/DispatchingToolExecutorTests.cs @@ -67,6 +67,112 @@ public DispatchingToolExecutorTests() UsedStrictFallback: false))); } + [Fact] + public async Task Verbose_tool_output_over_budget_is_windowed_and_spilled() + { + var sessionDir = Path.Combine(Path.GetTempPath(), "nc-disp-" + Guid.NewGuid().ToString("N")); + Directory.CreateDirectory(sessionDir); + try + { + // shell_execute declares the small verbose budget (2000); echo > 2000 chars. + var toolCall = new FunctionCallContent("call-spill", "shell_execute", + ToolInput.Create("Command", $"echo {new string('x', 3000)}")); + var context = new Netclaw.Tools.ToolExecutionContext("slack/thread-1", sessionDir) + { + Audience = TrustAudience.Personal, + }; + + var result = await _executor.ExecuteAsync(toolCall, context, CancellationToken.None); + + Assert.True(result.Length < 3000); // windowed inline, not the full 3000 + Assert.Contains("output saved to", result); + Assert.Contains("file_read", result); + var spill = Path.Combine(sessionDir, "tool-calls", "call-spill.log"); + Assert.True(File.Exists(spill)); + Assert.Contains(new string('x', 100), await File.ReadAllTextAsync(spill, CancellationToken.None)); + } + finally + { + Directory.Delete(sessionDir, recursive: true); + } + } + + [Fact] + public async Task Spilled_output_is_redacted_before_write() + { + var sessionDir = Path.Combine(Path.GetTempPath(), "nc-disp-" + Guid.NewGuid().ToString("N")); + Directory.CreateDirectory(sessionDir); + try + { + // Secret + padding so it both redacts and exceeds the shell budget → spills. + var toolCall = new FunctionCallContent("call-redact", "shell_execute", + ToolInput.Create("Command", $"echo API_KEY=supersecret123 {new string('x', 3000)}")); + var context = new Netclaw.Tools.ToolExecutionContext("slack/thread-1", sessionDir) + { + Audience = TrustAudience.Personal, + }; + + var result = await _executor.ExecuteAsync(toolCall, context, CancellationToken.None); + var onDisk = await File.ReadAllTextAsync( + Path.Combine(sessionDir, "tool-calls", "call-redact.log"), CancellationToken.None); + + Assert.DoesNotContain("supersecret123", result); + Assert.DoesNotContain("supersecret123", onDisk); // redacted before the spill write + } + finally + { + Directory.Delete(sessionDir, recursive: true); + } + } + + [Fact] + public async Task Small_output_is_redacted_without_spilling() + { + // Redaction happens centrally for every result, spill or not. + var toolCall = new FunctionCallContent("call-r", "shell_execute", + ToolInput.Create("Command", "echo API_KEY=secret123")); + var context = new Netclaw.Tools.ToolExecutionContext("signalr/thread-1", null) + { + Audience = TrustAudience.Personal, + }; + + var result = await _executor.ExecuteAsync(toolCall, context, CancellationToken.None); + + Assert.Contains("API_KEY=***REDACTED***", result); + Assert.DoesNotContain("secret123", result); + Assert.DoesNotContain("saved to", result); // small → no spill + } + + [Fact] + public async Task Content_tool_under_default_budget_not_spilled() + { + var sessionDir = Path.Combine(Path.GetTempPath(), "nc-disp-" + Guid.NewGuid().ToString("N")); + Directory.CreateDirectory(sessionDir); + try + { + // file_read has no verbose override → the 12000-char content budget; a + // small file is returned whole with no spill. + var file = Path.Combine(sessionDir, "note.txt"); + await File.WriteAllTextAsync(file, "hello content", CancellationToken.None); + var toolCall = new FunctionCallContent("call-content", "file_read", + ToolInput.Create("Path", file)); + var context = new Netclaw.Tools.ToolExecutionContext("slack/thread-1", sessionDir) + { + Audience = TrustAudience.Personal, + }; + + var result = await _executor.ExecuteAsync(toolCall, context, CancellationToken.None); + + Assert.Contains("hello content", result); + Assert.DoesNotContain("saved to", result); + Assert.False(Directory.Exists(Path.Combine(sessionDir, "tool-calls"))); + } + finally + { + Directory.Delete(sessionDir, recursive: true); + } + } + [Fact] public async Task Routes_shell_execute() { diff --git a/src/Netclaw.Actors.Tests/Tools/FileReadToolTests.cs b/src/Netclaw.Actors.Tests/Tools/FileReadToolTests.cs index 39a482219..7f96f253f 100644 --- a/src/Netclaw.Actors.Tests/Tools/FileReadToolTests.cs +++ b/src/Netclaw.Actors.Tests/Tools/FileReadToolTests.cs @@ -208,7 +208,7 @@ public async Task Read_with_offset_and_limit() var lines = Enumerable.Range(1, 10).Select(i => $"Line {i}"); await File.WriteAllLinesAsync(filePath, lines, TestContext.Current.CancellationToken); - var args = ToolInput.Create("Path", filePath, "Offset", 3, "Limit", 2); + var args = ToolInput.Create("Path", filePath, "StartLine", 3, "Limit", 2); var result = await _tool.ExecuteAsync(args, CreatePersonalContext(), CancellationToken.None); @@ -229,9 +229,13 @@ public async Task Large_file_is_truncated_with_continuation_hint() var result = await tool.ExecuteAsync(args, CreatePersonalContext(), CancellationToken.None); Assert.Contains("output truncated", result); - Assert.Contains("Offset=", result); + Assert.Contains("StartLine", result); + Assert.Contains("grep", result); + // Bounded read: only the first 100 chars are materialized, not all 500. + Assert.StartsWith(new string('x', 100), result); } + [Fact] public async Task Paginated_read_truncated_by_char_limit_includes_continuation_hint() { @@ -240,11 +244,11 @@ public async Task Paginated_read_truncated_by_char_limit_includes_continuation_h var lines = Enumerable.Range(1, 20).Select(i => $"Line {i:D2} content here"); await File.WriteAllLinesAsync(filePath, lines, TestContext.Current.CancellationToken); - var args = ToolInput.Create("Path", filePath, "Offset", 1, "Limit", 20); + var args = ToolInput.Create("Path", filePath, "StartLine", 1, "Limit", 20); var result = await tool.ExecuteAsync(args, CreatePersonalContext(), CancellationToken.None); Assert.Contains("output truncated", result); - Assert.Contains("Offset=", result); + Assert.Contains("StartLine=", result); } [Fact] diff --git a/src/Netclaw.Actors.Tests/Tools/ShellToolTests.cs b/src/Netclaw.Actors.Tests/Tools/ShellToolTests.cs index 6be5ddee5..06f384156 100644 --- a/src/Netclaw.Actors.Tests/Tools/ShellToolTests.cs +++ b/src/Netclaw.Actors.Tests/Tools/ShellToolTests.cs @@ -100,19 +100,20 @@ public async Task Caller_cancellation_kills_child_process_tree_and_returns_grace } [Fact] - public async Task Output_truncation_applies() + public async Task ShellTool_returns_raw_combined_output_without_spilling() { - var tool = new ShellTool(new ToolConfig { MaxOutputChars = 50 }); - // Write >50 chars to stdout with a command that needs no interpreter. - // `echo` is a builtin on both bash and cmd.exe; a long literal is - // deterministic. (python on the Windows runner resolves to the Store - // stub, which writes to stderr — so the truncation would land on stderr - // and the marker would read "[stderr truncated", not "[stdout truncated".) + // ShellTool only returns its (bounded) raw output now — redaction and the + // inline-budget bound + spill happen centrally in DispatchingToolExecutor + // (covered by DispatchingToolExecutorTests). `echo` is a builtin on both + // bash and cmd.exe; a long literal is deterministic on stdout. + var tool = new ShellTool(new ToolConfig()); var args = ToolInput.Create("Command", $"echo {new string('x', 200)}"); var result = await tool.ExecuteAsync(args, CancellationToken.None); - Assert.Contains("[stdout truncated", result); + Assert.Contains("Exit code: 0", result); + Assert.Contains(new string('x', 200), result); // full output, not yet windowed/spilled + Assert.DoesNotContain("saved to", result); // ShellTool itself does not spill } [Fact] @@ -367,111 +368,6 @@ public void TruncateOutput_truncates_and_appends_indicator() Assert.EndsWith("[output truncated]", result); } - // ── BoundedDrainAsync ── - - [Fact] - public async Task BoundedDrain_short_output_returned_verbatim() - { - var input = "hello world"; - var reader = new StringReader(input); - var (text, truncated) = await ShellTool.BoundedDrainAsync(reader, 100); - Assert.Equal(input, text); - Assert.False(truncated); - } - - [Fact] - public async Task BoundedDrain_empty_input_returns_empty() - { - var reader = new StringReader(""); - var (text, truncated) = await ShellTool.BoundedDrainAsync(reader, 100); - Assert.Equal("", text); - Assert.False(truncated); - } - - [Fact] - public async Task BoundedDrain_output_exactly_at_cap_not_truncated() - { - var input = new string('a', 100); - var reader = new StringReader(input); - var (text, truncated) = await ShellTool.BoundedDrainAsync(reader, 100); - Assert.Equal(input, text); - Assert.False(truncated); - } - - [Fact] - public async Task BoundedDrain_long_output_truncated_with_head_and_tail() - { - // 100-char head marker + separator + 100-char tail marker, with filler in the middle - var head = new string('H', 100); - var middle = new string('M', 5000); - var tail = new string('T', 100); - var input = head + middle + tail; - - var (text, truncated) = await ShellTool.BoundedDrainAsync(new StringReader(input), 200); - - Assert.True(truncated); - Assert.StartsWith(new string('H', 100), text); // head preserved - Assert.EndsWith(new string('T', 100), text); // tail preserved - Assert.Contains("...", text); // separator present - Assert.DoesNotContain("M", text); // middle discarded - } - - [Fact] - public async Task BoundedDrain_head_and_tail_split_evenly() - { - // maxChars=10 → headCap=5, tailCap=5 - var input = "AAAAAXXXXXXBBBBB"; // 16 chars: 5 head, 6 overflow discard, 5 tail - var (text, truncated) = await ShellTool.BoundedDrainAsync(new StringReader(input), 10); - - Assert.True(truncated); - Assert.StartsWith("AAAAA", text); - Assert.EndsWith("BBBBB", text); - } - - [Fact] - public async Task BoundedDrain_disabled_cap_returns_full_output() - { - var input = new string('x', 10_000); - var (text, truncated) = await ShellTool.BoundedDrainAsync(new StringReader(input), 0); - Assert.Equal(input, text); - Assert.False(truncated); - } - - [Fact] - public async Task BoundedDrain_tail_ring_wraps_across_small_chunks() - { - // Drives the ring's wraparound + start-advance path that the StringReader - // tests skip: each read delivers a chunk smaller than tailCap, so the tail - // window is rebuilt incrementally and must wrap rather than reset wholesale. - // maxChars=10 → headCap=5 ("ABCDE"), tailCap=5; last 5 of "FGHIJKLMNO" = "KLMNO". - var reader = new ChunkedReader("ABCDEFGHIJKLMNO", chunkSize: 3); - - var (text, truncated) = await ShellTool.BoundedDrainAsync(reader, 10); - - Assert.True(truncated); - Assert.Equal("ABCDE\n...\nKLMNO", text); - } - - // Hands out at most chunkSize chars per read so tests can exercise the tail - // ring's incremental wrap path — real pipe reads arrive in arbitrary slices, - // not the single 4KB gulp a StringReader gives. - private sealed class ChunkedReader(string data, int chunkSize) : TextReader - { - private int _pos; - - public override ValueTask ReadAsync(Memory buffer, CancellationToken cancellationToken = default) - { - var remaining = data.Length - _pos; - if (remaining <= 0) - return ValueTask.FromResult(0); - - var n = Math.Min(Math.Min(chunkSize, buffer.Length), remaining); - data.AsSpan(_pos, n).CopyTo(buffer.Span); - _pos += n; - return ValueTask.FromResult(n); - } - } - [Fact] public async Task Command_referencing_denied_path_returns_access_denied() { @@ -487,16 +383,6 @@ public async Task Command_referencing_denied_path_returns_access_denied() Assert.Contains("Access denied", result); } - [Fact] - public async Task Execute_redacts_secret_like_output() - { - var args = ToolInput.Create("Command", "echo API_KEY=secret123"); - - var result = await _tool.ExecuteAsync(args, CancellationToken.None); - - Assert.Contains("API_KEY=***REDACTED***", result); - Assert.DoesNotContain("secret123", result); - } [Fact] public async Task High_risk_glob_on_netclaw_config_is_blocked() diff --git a/src/Netclaw.Actors.Tests/Tools/ToolOutputSpillTests.cs b/src/Netclaw.Actors.Tests/Tools/ToolOutputSpillTests.cs new file mode 100644 index 000000000..f1e6cc185 --- /dev/null +++ b/src/Netclaw.Actors.Tests/Tools/ToolOutputSpillTests.cs @@ -0,0 +1,101 @@ +// ----------------------------------------------------------------------- +// +// Copyright (C) 2026 - 2026 Petabridge, LLC +// +// ----------------------------------------------------------------------- +using Netclaw.Actors.Tools; +using Netclaw.Configuration; +using Netclaw.Tools; +using Xunit; + +namespace Netclaw.Actors.Tests.Tools; + +public sealed class ToolOutputSpillTests : IDisposable +{ + private readonly string _sessionDir = + Path.Combine(Path.GetTempPath(), "nc-spill-" + Guid.NewGuid().ToString("N")); + + public ToolOutputSpillTests() => Directory.CreateDirectory(_sessionDir); + + public void Dispose() + { + if (Directory.Exists(_sessionDir)) + Directory.Delete(_sessionDir, recursive: true); + } + + private ToolExecutionContext Context() => + new("session/thread", _sessionDir) { Audience = TrustAudience.Personal }; + + private string ToolCallsDir => Path.Combine(_sessionDir, "tool-calls"); + + // BoundAndSpillAsync receives an ALREADY-redacted result (the dispatcher redacts + // first), so these tests pass content verbatim and assert the bound/spill shape. + + [Fact] + public async Task Under_budget_returned_unchanged() + { + var input = new string('a', 50); + var result = await ToolOutputSpill.BoundAndSpillAsync( + input, "call_1", budget: 100, Context(), CancellationToken.None); + + Assert.Equal(input, result); + Assert.False(Directory.Exists(ToolCallsDir)); // nothing spilled + } + + [Fact] + public async Task Over_budget_spills_full_output_and_steers() + { + var input = new string('H', 200) + new string('T', 200); // 400 > 100 + var result = await ToolOutputSpill.BoundAndSpillAsync( + input, "call_2", budget: 100, Context(), CancellationToken.None); + + var spillPath = Path.Combine(ToolCallsDir, "call_2.log"); + Assert.True(File.Exists(spillPath)); + Assert.Equal(input, await File.ReadAllTextAsync(spillPath, CancellationToken.None)); // full output on disk + Assert.StartsWith(new string('H', 50), result); // inline head + Assert.Contains("output saved to", result); + Assert.Contains(spillPath, result); + Assert.Contains("file_read", result); + Assert.Contains("grep", result); + } + + [Fact] + public async Task Budget_zero_falls_back_to_content_default() + { + // budget 0 → DefaultContentBudget (12000); a 5000-char input fits, returned whole. + var input = new string('x', 5000); + var result = await ToolOutputSpill.BoundAndSpillAsync( + input, "call_3", budget: 0, Context(), CancellationToken.None); + + Assert.Equal(input, result); + Assert.False(Directory.Exists(ToolCallsDir)); + } + + [Fact] + public async Task No_session_directory_degrades_to_inline_only() + { + var ctx = new ToolExecutionContext("session/thread", sessionDirectory: null) + { + Audience = TrustAudience.Personal, + }; + var input = new string('H', 200) + new string('T', 200); + + var result = await ToolOutputSpill.BoundAndSpillAsync( + input, "call_5", budget: 100, ctx, CancellationToken.None); + + Assert.StartsWith(new string('H', 50), result); // inline still produced + Assert.DoesNotContain("saved to", result); // but no spill path + } + + [Fact] + public async Task Unsafe_call_id_cannot_escape_tool_calls_directory() + { + var input = new string('H', 200) + new string('T', 200); + await ToolOutputSpill.BoundAndSpillAsync( + input, "../../evil", budget: 100, Context(), CancellationToken.None); + + var written = Directory.GetFiles(ToolCallsDir); + Assert.Single(written); + Assert.StartsWith(ToolCallsDir, Path.GetFullPath(written[0])); + } +} diff --git a/src/Netclaw.Actors/Jobs/BackgroundJobExecutionActor.cs b/src/Netclaw.Actors/Jobs/BackgroundJobExecutionActor.cs index 411ab0cec..8da749d6c 100644 --- a/src/Netclaw.Actors/Jobs/BackgroundJobExecutionActor.cs +++ b/src/Netclaw.Actors/Jobs/BackgroundJobExecutionActor.cs @@ -7,6 +7,7 @@ using System.Text; using Akka.Actor; using Akka.Event; +using Netclaw.Actors.Tools; using Netclaw.Security; namespace Netclaw.Actors.Jobs; @@ -136,11 +137,18 @@ private void SpawnProcess() var outputBuilder = new StringBuilder(); try { - var stdoutTask = process.StandardOutput.ReadToEndAsync(); - var stderrTask = process.StandardError.ReadToEndAsync(); - - var stdout = await stdoutTask; - var stderr = await stderrTask; + // Bounded capture (not ReadToEndAsync): a chatty long-running job — + // exactly what background jobs exist for — must not buffer its full + // output in memory. Drain each stream to the capture ceiling (head+tail + // for floods larger than it); the source keeps draining past the cap so + // the child never deadlocks on a full pipe. Closes #1300. + var stdoutTask = BoundedOutputReader.DrainToWindowAsync( + process.StandardOutput, BackgroundJobManagerActor.MaxCapturedOutputChars, CancellationToken.None); + var stderrTask = BoundedOutputReader.DrainToWindowAsync( + process.StandardError, BackgroundJobManagerActor.MaxCapturedOutputChars, CancellationToken.None); + + var (stdout, stdoutTruncated) = await stdoutTask; + var (stderr, stderrTruncated) = await stderrTask; await process.WaitForExitAsync(); outputBuilder.Append(stdout); @@ -151,6 +159,12 @@ private void SpawnProcess() outputBuilder.Append(stderr); } + // Mark a flood that exceeded the capture ceiling so the log isn't a + // silent head+tail splice presented as complete. + if (stdoutTruncated || stderrTruncated) + outputBuilder.Append( + $"\n[output exceeded the {BackgroundJobManagerActor.MaxCapturedOutputChars}-char capture ceiling — head and tail shown]"); + var fullOutput = SecretOutputRedactor.Redact(outputBuilder.ToString()); try diff --git a/src/Netclaw.Actors/Jobs/BackgroundJobManagerActor.cs b/src/Netclaw.Actors/Jobs/BackgroundJobManagerActor.cs index bddadbcc2..35f3457fd 100644 --- a/src/Netclaw.Actors/Jobs/BackgroundJobManagerActor.cs +++ b/src/Netclaw.Actors/Jobs/BackgroundJobManagerActor.cs @@ -22,6 +22,13 @@ public sealed class BackgroundJobManagerActor : ReceiveActor { internal const int MaxConcurrentJobs = 5; internal const int MaxOutputTailChars = 2000; + + // Capture ceiling for a job's output log: the execution actor drains each + // stream to this bound (head+tail) so a chatty long-running job can't buffer + // its full output in memory and OOM the daemon. The log holds a head+tail view + // for floods larger than the ceiling; the message still carries the last + // MaxOutputTailChars. + internal const int MaxCapturedOutputChars = 256_000; internal const string JobDeliveryKeyPrefix = "bg-job:"; internal const string SystemSenderId = "background-job-system"; internal const string SourceKind = "background-job"; diff --git a/src/Netclaw.Actors/Sessions/Pipelines/SessionToolExecutionPipeline.cs b/src/Netclaw.Actors/Sessions/Pipelines/SessionToolExecutionPipeline.cs index 10ca565db..487726865 100644 --- a/src/Netclaw.Actors/Sessions/Pipelines/SessionToolExecutionPipeline.cs +++ b/src/Netclaw.Actors/Sessions/Pipelines/SessionToolExecutionPipeline.cs @@ -225,7 +225,8 @@ public static async Task ExecuteSingleToolAsync( spawnChildActor, projectDirectory, turnContext, - modelInputModalities); + modelInputModalities, + maxInlineToolResultChars); context.RequestedTimeoutSeconds = (int)timeout.TotalSeconds; // Re-drive of an ApprovedOnce approval: the user already clicked @@ -343,7 +344,7 @@ public static async Task ExecuteSingleToolAsync( var deniedMessage = new SerializableChatMessage { Role = Protocol.ChatRole.Tool, - Content = ClampToolResult(resultText, maxInlineToolResultChars), + Content = resultText, ToolCallId = new ToolCallId(tc.CallId), Name = tc.Name }; @@ -409,7 +410,7 @@ public static async Task ExecuteSingleToolAsync( return new ToolCallResult(new SerializableChatMessage { Role = Protocol.ChatRole.Tool, - Content = ClampToolResult(resultText, maxInlineToolResultChars), + Content = resultText, ToolCallId = new ToolCallId(tc.CallId), Name = tc.Name }, [], context.FileAttachments, completedRuns, acceptedFindings); @@ -563,7 +564,9 @@ or ApprovalDecision.ApprovedAlways modelInputBudget ??= new ModelInputBatchBudget(MaxModelInputBatchBytes); var modelInputMaterialization = MaterializeModelInputFiles(context, sessionDir, logger, modelInputBudget); - resultText = ClampToolResult(resultText, maxInlineToolResultChars); + // No inline clamp here: DispatchingToolExecutor already bounds every tool + // result to the inline budget N (and spills the overflow). Clamping again + // would re-window the already-windowed+steered result. if (modelInputMaterialization.RequestedCount > modelInputMaterialization.MediaReferences.Count) resultText = AppendModelInputHandoffWarning( resultText, @@ -928,7 +931,8 @@ private static ToolExecutionContext BuildToolExecutionContext( Func> spawnChildActor, string? projectDirectory, TurnContext? turnContext, - ModelModality modelInputModalities) + ModelModality modelInputModalities, + int maxInlineToolResultChars) { // A turn with no authority context carries no trust context — fall closed // to the most-restrictive audience. The default is resolved once, here, @@ -936,6 +940,9 @@ private static ToolExecutionContext BuildToolExecutionContext( var context = new ToolExecutionContext(sessionId.Value, sessionDir) { Audience = turnContext?.Audience ?? source?.Audience ?? TrustAudience.Public, + // The session content budget; DispatchingToolExecutor uses it (or a + // tool's own override) to bound results and spill the overflow. + MaxInlineToolResultChars = maxInlineToolResultChars, }; context.Boundary = turnContext?.Boundary ?? source?.Boundary; context.ChannelType = turnContext?.ChannelType?.ToWireValue() @@ -973,19 +980,6 @@ private static ToolAuditEntry BuildAuditEntry( TimeoutHintSeconds = meta?.TimeoutHintSeconds }; - /// - /// Truncates a tool result to fit within the configured inline character limit. - /// - public static string ClampToolResult(string resultText, int maxInlineToolResultChars) - { - if (maxInlineToolResultChars <= 0 || resultText.Length <= maxInlineToolResultChars) - return resultText; - - var omittedChars = resultText.Length - maxInlineToolResultChars; - return resultText[..maxInlineToolResultChars] - + $"\n[tool result truncated: omitted {omittedChars} chars to protect context window]"; - } - /// /// Returns a one-line agent-facing hint pointing at set_working_directory /// when a shell call was denied specifically because its cwd is outside diff --git a/src/Netclaw.Actors/Tools/BoundedOutputReader.cs b/src/Netclaw.Actors/Tools/BoundedOutputReader.cs new file mode 100644 index 000000000..6cd498bd5 --- /dev/null +++ b/src/Netclaw.Actors/Tools/BoundedOutputReader.cs @@ -0,0 +1,188 @@ +// ----------------------------------------------------------------------- +// +// Copyright (C) 2026 - 2026 Petabridge, LLC +// +// ----------------------------------------------------------------------- +using System.Buffers; +using System.Text; + +namespace Netclaw.Actors.Tools; + +/// +/// Bounds external tool output (process pipes, files) into a head+tail window of +/// a fixed character budget, in bounded memory regardless of total output size. +/// Extracted from ShellTool.BoundedDrainAsync (#1293) so the ring/window +/// logic is reviewed and fixed once and reused by shell_execute, +/// background_job, and file_read. +/// +/// +/// The reader is a pure leaf: it does no redaction and no file IO — callers +/// redact (SecretOutputRedactor) and spill. Allocation is +/// O(budget), not O(total output): the scratch read buffer is pooled, the tail +/// ring is allocated only once the head fills, and reads go through the +/// overload so a pipe that already has data buffered +/// completes synchronously without a per-chunk allocation. +/// +internal static class BoundedOutputReader +{ + /// + /// Drains into a head+tail window bounded by + /// chars. Chars beyond the budget are discarded but + /// the source continues to be read so a still-running child never deadlocks on + /// a full pipe buffer. A non-positive disables the + /// cap (reads the whole stream). Returns the captured text and whether it was + /// truncated. + /// + public static async Task<(string Text, bool Truncated)> DrainToWindowAsync( + TextReader reader, int budget, CancellationToken ct) + { + if (budget <= 0) + { + // Explicit opt-out: a non-positive budget disables truncation and + // reads the whole stream. Callers that bound via config never pass + // this; it exists for deliberate full-capture callers. + var all = await reader.ReadToEndAsync(ct); + return (all, false); + } + + // Split the budget: first half for the head, second half for the tail. + // Odd budget gives the extra char to the head. Computed without (budget + + // 1) so a near-int.MaxValue budget can't overflow to a negative headCap. + var headCap = budget / 2 + budget % 2; + var tailCap = budget / 2; + + var head = new StringBuilder(Math.Min(headCap, 4096)); + + // Tail ring buffer, allocated lazily on first overflow past the head. The + // common case — output under the cap — never fills the head, so it never + // pays for the tail window at all. + char[]? tailBuf = null; + var tailStart = 0; // index of the oldest retained char in the ring + var tailLen = 0; // chars currently retained (<= tailCap) + + long totalChars = 0; // total chars seen across all reads; long so a multi-GB + // flood can't overflow the truncation check to a false negative + + // Transient scratch buffer for the read loop: pooled so a long drain + // doesn't allocate it per call and it never lands on the LOH. + var buf = ArrayPool.Shared.Rent(4096); + try + { + int read; + // ReadAsync(Memory) returns a non-allocating ValueTask when the + // read completes synchronously (data already buffered) — unlike the + // Task char[] overload, which allocates once per chunk. + while ((read = await reader.ReadAsync(buf.AsMemory(), ct)) > 0) + { + totalChars += read; + var span = buf.AsSpan(0, read); + + if (head.Length < headCap) + { + var headChunk = Math.Min(headCap - head.Length, span.Length); + head.Append(span[..headChunk]); + span = span[headChunk..]; + } + + if (span.IsEmpty || tailCap == 0) + continue; + + tailBuf ??= new char[tailCap]; + AppendToTailRing(tailBuf, span, ref tailStart, ref tailLen); + } + } + finally + { + // clearArray: the scratch buffer held raw output (possibly secrets); + // wipe it before returning to the shared pool. + ArrayPool.Shared.Return(buf, clearArray: true); + } + + // Truncation only when total chars exceeded the full budget (head+tail), + // meaning some middle chars were discarded. + var truncated = totalChars > budget; + + // Reconstruct in place on `head`: when truncated, the discarded middle is + // marked with a separator; otherwise head + tail is the full output. + if (truncated) + head.Append(Separator); + if (tailBuf is not null && tailLen > 0) + AppendRing(head, tailBuf, tailStart, tailLen); + return (head.ToString(), truncated); + } + + /// + /// Head+tail window of an already-in-memory string to + /// chars. Returns the string unchanged when it already fits (or the budget is + /// non-positive). Used to derive the inline window from a larger (redacted) + /// capture — see ToolOutputSpill. + /// + /// + /// bounds the retained content, not the returned + /// string length: a truncated result is budget + Separator.Length chars + /// (the head, the "…" separator, and the tail). Callers enforcing a hard + /// character ceiling must account for the separator. + /// + public static string Window(string text, int budget) + { + if (budget <= 0 || text.Length <= budget) + return text; + + var headCap = budget / 2 + budget % 2; + var tailCap = budget / 2; + return string.Concat(text.AsSpan(0, headCap), Separator, text.AsSpan(text.Length - tailCap)); + } + + // Separator marking the discarded middle of a truncated head+tail window. + // Uses the platform newline so it matches the line endings the rest of the + // captured output is assembled with (e.g. StringBuilder.AppendLine). + private static readonly string Separator = $"{Environment.NewLine}...{Environment.NewLine}"; + + /// + /// Writes into a ring buffer that retains only the + /// most recent ring.Length chars. Uses block copies (at most two per + /// call) rather than a per-char loop, so draining a very chatty child stays + /// cheap regardless of how much it prints. + /// + private static void AppendToTailRing(char[] ring, ReadOnlySpan span, ref int start, ref int len) + { + var cap = ring.Length; + + if (span.Length >= cap) + { + // This span alone fills (or overfills) the window: only its last `cap` + // chars can survive. One contiguous copy, ring reset. + span[^cap..].CopyTo(ring); + start = 0; + len = cap; + return; + } + + var writePos = (start + len) % cap; + var first = Math.Min(span.Length, cap - writePos); + span[..first].CopyTo(ring.AsSpan(writePos)); + if (first < span.Length) + span[first..].CopyTo(ring); // remainder wraps to the front + + var newLen = len + span.Length; + if (newLen > cap) + { + // Overwrote the oldest chars: advance start past them. + start = (start + (newLen - cap)) % cap; + len = cap; + } + else + { + len = newLen; + } + } + + /// Appends a ring buffer's retained chars, oldest-first, to . + private static void AppendRing(StringBuilder sb, char[] ring, int start, int len) + { + var first = Math.Min(len, ring.Length - start); + sb.Append(ring, start, first); + if (first < len) + sb.Append(ring, 0, len - first); + } +} diff --git a/src/Netclaw.Actors/Tools/DispatchingToolExecutor.cs b/src/Netclaw.Actors/Tools/DispatchingToolExecutor.cs index 11e91f22a..b56c883b3 100644 --- a/src/Netclaw.Actors/Tools/DispatchingToolExecutor.cs +++ b/src/Netclaw.Actors/Tools/DispatchingToolExecutor.cs @@ -68,6 +68,13 @@ public async Task ExecuteAsync(FunctionCallContent toolCall, ToolExecuti : await tool.ExecuteAsync(toolCall.Arguments, ct); result = SecretOutputRedactor.Redact(result); + // Single, uniform bounding+spill point for every tool (main session and + // sub-agents both funnel through here): cap the inline result to the + // tool's budget and, when it overflows, spill the full redacted result + // to a session file and steer the model to read a slice. Tools only + // bound their own capture for memory safety; they do not window or spill. + result = await ToolOutputSpill.BoundAndSpillAsync( + result, toolCall.CallId, ResolveInlineBudget(tool, context), context, ct); sw.Stop(); _logger.LogInformation( @@ -91,6 +98,15 @@ public async Task AuthorizeAsync(FunctionCallContent toolCall, ToolExecutionCont _ = await AuthorizeCoreAsync(toolCall, context, ct); } + // The tool's own override (verbose tools like shell opt down) wins; otherwise + // the session content budget; otherwise the built-in content default. + private static int ResolveInlineBudget(INetclawTool tool, ToolExecutionContext? context) + => tool.InlineOutputBudgetChars is > 0 and var toolBudget + ? toolBudget + : context?.MaxInlineToolResultChars is > 0 and var contentBudget + ? contentBudget + : ToolOutputSpill.DefaultContentBudget; + public async IAsyncEnumerable ExecuteStreamAsync( FunctionCallContent toolCall, ToolExecutionContext? context = null, @@ -117,6 +133,8 @@ public async IAsyncEnumerable ExecuteStreamAsync( case ToolCompletedUpdate completed: sw.Stop(); var redacted = SecretOutputRedactor.Redact(completed.Result); + redacted = await ToolOutputSpill.BoundAndSpillAsync( + redacted, toolCall.CallId, ResolveInlineBudget(tool, context), context, ct); _logger.LogInformation( "Tool executed: {ToolName} ({Duration}ms, {ResultLength} chars)", toolCall.Name, sw.ElapsedMilliseconds, redacted.Length); diff --git a/src/Netclaw.Actors/Tools/FileReadTool.cs b/src/Netclaw.Actors/Tools/FileReadTool.cs index 7767b7081..c897db6aa 100644 --- a/src/Netclaw.Actors/Tools/FileReadTool.cs +++ b/src/Netclaw.Actors/Tools/FileReadTool.cs @@ -20,7 +20,7 @@ namespace Netclaw.Actors.Tools; /// Reads text files and inspects non-text files without returning raw bytes. /// [NetclawTool(ToolName, - "Read text files or inspect non-text files. Images can be loaded for visual inspection when the active model supports image input; PDFs/media/archives return metadata and guidance. For large text files, use Offset and Limit to read sections.", + "Read text files or inspect non-text files. Images can be loaded for visual inspection when the active model supports image input; PDFs/media/archives return metadata and guidance. For large text files, use StartLine and Limit to read sections.", Grant = "file")] public sealed partial class FileReadTool : NetclawTool { @@ -42,8 +42,8 @@ public sealed partial class FileReadTool : NetclawTool public record Params( [property: Description("Absolute path to the file to read")] string Path, - [property: Description("Line number to start reading from (1-based). Use with Limit to read sections of large files and avoid context window truncation.")] int? Offset = null, - [property: Description("Maximum number of lines to read. Use with Offset to paginate through large files instead of reading the whole file.")] int? Limit = null); + [property: Description("Line number to start reading at, 1-based: the first line is line 1, matching the line numbers shown in this tool's output and in editors/grep/sed. To read line N, pass StartLine=N. Use with Limit to read sections of large files and avoid context window truncation.")] int? StartLine = null, + [property: Description("Maximum number of lines to read. Use with StartLine to paginate through large files instead of reading the whole file.")] int? Limit = null); public FileReadTool( ToolConfig config, @@ -79,7 +79,7 @@ protected override async Task ExecuteAsync(Params args, ToolExecutionCon return $"Error: File not found: {authorizedPath}"; // Treat 0 or negative as "not specified" - int? offset = args.Offset > 0 ? args.Offset : null; + int? startLine = args.StartLine > 0 ? args.StartLine : null; int? limit = args.Limit > 0 ? args.Limit : null; try @@ -89,16 +89,23 @@ protected override async Task ExecuteAsync(Params args, ToolExecutionCon return HandleNonTextFile(authorizedPath, inspection, context); var encoding = inspection.TextEncoding ?? StrictUtf8; - if (offset.HasValue || limit.HasValue) + if (startLine.HasValue || limit.HasValue) { - var lines = await ReadLinesAsync(authorizedPath, encoding, offset ?? 1, limit, _config.MaxOutputChars, ct); + var lines = await ReadLinesAsync(authorizedPath, encoding, startLine ?? 1, limit, _config.MaxOutputChars, ct); RecordSkillReadIfApplicable(authorizedPath); - return lines; + return lines; // redaction + inline bound + spill happen centrally in the dispatcher } - var content = await File.ReadAllTextAsync(authorizedPath, encoding, ct); + // Bounded head read (not ReadAllTextAsync): a multi-hundred-MB file must + // not be materialized just to truncate it — read up to the limit and stop. + // Closes #1301. Redaction and the inline-budget bound + spill happen + // centrally in DispatchingToolExecutor; file_read only bounds its read for + // memory safety and steers past the cap. + var (content, truncated) = await ReadBoundedHeadAsync(authorizedPath, encoding, _config.MaxOutputChars, ct); RecordSkillReadIfApplicable(authorizedPath); - return TruncateFileOutput(content, _config.MaxOutputChars); + return truncated + ? content + $"\n[output truncated at {_config.MaxOutputChars} chars — read a specific range with StartLine and Limit, or grep the file, for the rest]" + : content; } catch (DecoderFallbackException) { @@ -422,21 +429,28 @@ private static bool IsUtf16OrUtf32(Encoding? encoding) || ReferenceEquals(encoding, StrictUtf32Be); } - private static string TruncateFileOutput(string content, int maxChars) + /// + /// Reads up to chars from the head of the file and + /// stops — bounding both memory and I/O so a huge file is never fully + /// materialized. Returns the content and whether more remained past the cap. + /// + private static async Task<(string Content, bool Truncated)> ReadBoundedHeadAsync( + string path, Encoding encoding, int maxChars, CancellationToken ct) { - if (content.Length <= maxChars) - return content; - int newlinesBefore = 0, totalNewlines = 0; - for (var i = 0; i < content.Length; i++) + using var reader = new StreamReader(path, encoding, detectEncodingFromByteOrderMarks: true); + var sb = new StringBuilder(); + var buf = new char[4096]; + while (sb.Length < maxChars) { - if (content[i] != '\n') continue; - totalNewlines++; - if (i < maxChars) newlinesBefore++; + var want = Math.Min(buf.Length, maxChars - sb.Length); + var read = await reader.ReadAsync(buf.AsMemory(0, want), ct); + if (read == 0) + return (sb.ToString(), false); // EOF before the cap — not truncated + sb.Append(buf, 0, read); } - var nextLine = newlinesBefore + 1; - var totalLines = totalNewlines + 1; - return string.Concat(content.AsSpan(0, maxChars), - $"\n[output truncated at line {nextLine} of {totalLines} — use Offset={nextLine} with Limit to continue reading]"); + + // Hit the cap: truncated iff at least one more char remains on disk. + return (sb.ToString(), reader.Peek() >= 0); } private static async Task ReadLinesAsync( @@ -467,7 +481,7 @@ private static async Task ReadLinesAsync( linesRead++; if (sb.Length >= maxChars) - return sb.ToString(0, maxChars) + $"\n[output truncated at line {lineNumber} — use Offset={lineNumber} with Limit to continue reading]"; + return sb.ToString(0, maxChars) + $"\n[output truncated at line {lineNumber} — use StartLine={lineNumber} with Limit to continue reading]"; } return sb.ToString(); diff --git a/src/Netclaw.Actors/Tools/ShellTool.cs b/src/Netclaw.Actors/Tools/ShellTool.cs index 355d49023..c09a92060 100644 --- a/src/Netclaw.Actors/Tools/ShellTool.cs +++ b/src/Netclaw.Actors/Tools/ShellTool.cs @@ -3,7 +3,6 @@ // Copyright (C) 2026 - 2026 Petabridge, LLC // // ----------------------------------------------------------------------- -using System.Buffers; using System.ComponentModel; using System.Diagnostics; using System.Text; @@ -24,6 +23,12 @@ public sealed partial class ShellTool : NetclawTool { public const string ToolName = "shell_execute"; + // Shell output is mostly verbose noise the model skims, so bound it + // aggressively: small inline head+tail, full output spilled to a session file + // to grep. Content tools (file_read, web_fetch, MCP) keep the larger session + // content budget because the model fetched them to read in full. + public override int InlineOutputBudgetChars => 2000; + private readonly ToolConfig _config; private readonly ToolPathPolicy? _pathPolicy; private readonly ShellCommandPolicy? _commandPolicy; @@ -159,11 +164,11 @@ or UnauthorizedAccessException // pipe write-ends open, so a blocked pipe read cannot be interrupted // by a token; killing the process is what closes the pipes. // - // BoundedDrainAsync reads into a head+tail window bounded by + // BoundedOutputReader reads into a head+tail window bounded by // MaxOutputChars but continues draining after the cap is reached so // the pipe never fills up and deadlocks a still-running child. - var stdoutTask = BoundedDrainAsync(process.StandardOutput, _config.MaxOutputChars); - var stderrTask = BoundedDrainAsync(process.StandardError, _config.MaxOutputChars); + var stdoutTask = BoundedOutputReader.DrainToWindowAsync(process.StandardOutput, _config.MaxOutputChars, CancellationToken.None); + var stderrTask = BoundedOutputReader.DrainToWindowAsync(process.StandardError, _config.MaxOutputChars, CancellationToken.None); try { @@ -190,7 +195,7 @@ or UnauthorizedAccessException { // If the OS refuses the kill, the child may keep stdout/stderr // open forever. Close our read ends so cancellation still - // returns promptly instead of hanging in BoundedDrainAsync. + // returns promptly instead of hanging in the pipe drain. killClosedPipes = false; Debug.WriteLine($"shell_execute: process kill skipped — {ex.Message}"); process.StandardOutput.Dispose(); @@ -211,178 +216,33 @@ or UnauthorizedAccessException : "Error: Command cancelled."; } - var (stdoutText, stdoutTruncated) = await stdoutTask; - var (stderrText, stderrTruncated) = await stderrTask; - - // Redact secrets on the already-bounded strings, then assemble. - var stdout = SecretOutputRedactor.Redact(stdoutText); - var stderr = SecretOutputRedactor.Redact(stderrText); - - var result = new StringBuilder(); - if (stdout.Length > 0) + var (stdoutText, _) = await stdoutTask; + var (stderrText, _) = await stderrTask; + + // Assemble the raw combined output (stdout then stderr). Each stream was + // drained to MaxOutputChars, so the concatenation can be up to 2x — re-window + // the COMBINED back to the capture ceiling so the spill body stays bounded by + // MaxOutputChars. Redaction and the inline-budget bound + spill+steer happen + // centrally in DispatchingToolExecutor; the tool only returns its bounded + // capture. + var combined = new StringBuilder(); + if (stdoutText.Length > 0) + combined.Append(stdoutText); + if (stderrText.Length > 0) { - result.Append(stdout); - if (stdoutTruncated) - result.Append($"\n[stdout truncated — output exceeded {_config.MaxOutputChars} chars; head and tail shown]"); + if (combined.Length > 0) + combined.AppendLine(); + combined.Append(stderrText); } - if (stderr.Length > 0) - { - if (result.Length > 0) - result.AppendLine(); - result.Append(stderr); - if (stderrTruncated) - result.Append($"\n[stderr truncated — output exceeded {_config.MaxOutputChars} chars; head and tail shown]"); - } - - return $"Exit code: {process.ExitCode}{Environment.NewLine}{result}"; - } - } - /// - /// Drains into a head+tail window bounded by - /// . Chars beyond the cap are discarded but the - /// pipe continues to be read so a still-running child never deadlocks on a - /// full pipe buffer. Returns the captured text and whether it was truncated. - /// - /// - /// Allocation is O(), not O(total output): the - /// scratch read buffer is pooled, the tail ring is allocated only if the head - /// fills, and reads go through the overload so a - /// pipe that already has data buffered completes synchronously without a - /// per-chunk allocation. A child that prints hundreds of MB - /// is drained with a handful of KB of managed allocations — the property that - /// keeps a memory-limited daemon from being OOM-killed (see #1293). - /// - internal static async Task<(string Text, bool Truncated)> BoundedDrainAsync( - TextReader reader, int maxChars) - { - if (maxChars <= 0) - { - // Explicit opt-out: a non-positive cap disables truncation and reads - // the whole stream. The config schema enforces MaxOutputChars >= 1, so - // production never reaches this — it exists only for programmatic - // callers that deliberately pass 0/negative to capture full output. - var all = await reader.ReadToEndAsync(CancellationToken.None); - return (all, false); + var captured = BoundedOutputReader.Window(combined.ToString(), _config.MaxOutputChars); + return $"Exit code: {process.ExitCode}{Environment.NewLine}{captured}"; } - - // Split the budget: first half for the head, second half for the tail. - // Odd maxChars gives the extra char to the head. Computed without (maxChars - // + 1) so a near-int.MaxValue cap can't overflow to a negative headCap. - var headCap = maxChars / 2 + maxChars % 2; - var tailCap = maxChars / 2; - - var head = new StringBuilder(Math.Min(headCap, 4096)); - - // Tail ring buffer, allocated lazily on first overflow past the head. The - // common case — output under the cap — never fills the head, so it never - // pays for the tail window at all. - char[]? tailBuf = null; - var tailStart = 0; // index of the oldest retained char in the ring - var tailLen = 0; // chars currently retained (<= tailCap) - - long totalChars = 0; // total chars seen across all reads; long so a multi-GB - // flood can't overflow the truncation check to a false negative - - // Transient scratch buffer for the read loop: pooled so a long drain - // doesn't allocate it per call and it never lands on the LOH. - var buf = ArrayPool.Shared.Rent(4096); - try - { - int read; - // ReadAsync(Memory) returns a non-allocating ValueTask when the - // read completes synchronously (data already buffered) — unlike the - // Task char[] overload, which allocates once per chunk. - while ((read = await reader.ReadAsync(buf.AsMemory(), CancellationToken.None)) > 0) - { - totalChars += read; - var span = buf.AsSpan(0, read); - - if (head.Length < headCap) - { - var headChunk = Math.Min(headCap - head.Length, span.Length); - head.Append(span[..headChunk]); - span = span[headChunk..]; - } - - if (span.IsEmpty || tailCap == 0) - continue; - - tailBuf ??= new char[tailCap]; - AppendToTailRing(tailBuf, span, ref tailStart, ref tailLen); - } - } - finally - { - // clearArray: the scratch buffer held raw stdout/stderr (possibly - // secrets); wipe it before returning to the shared pool. - ArrayPool.Shared.Return(buf, clearArray: true); - } - - // Truncation only when total chars exceeded the full budget (head+tail), - // meaning some middle chars were discarded. - var truncated = totalChars > maxChars; - - // Reconstruct in place on `head`: when truncated, the discarded middle is - // marked with a separator; otherwise head + tail is the full output. Reusing - // `head` avoids a second StringBuilder and re-copying the head. - if (truncated) - head.Append("\n...\n"); - if (tailBuf is not null && tailLen > 0) - AppendRing(head, tailBuf, tailStart, tailLen); - return (head.ToString(), truncated); - } - - /// - /// Writes into a ring buffer that retains only the - /// most recent ring.Length chars. Uses block copies (at most two per - /// call) rather than a per-char loop, so draining a very chatty child stays - /// cheap regardless of how much it prints. - /// - private static void AppendToTailRing(char[] ring, ReadOnlySpan span, ref int start, ref int len) - { - var cap = ring.Length; - - if (span.Length >= cap) - { - // This span alone fills (or overfills) the window: only its last `cap` - // chars can survive. One contiguous copy, ring reset. - span[^cap..].CopyTo(ring); - start = 0; - len = cap; - return; - } - - var writePos = (start + len) % cap; - var first = Math.Min(span.Length, cap - writePos); - span[..first].CopyTo(ring.AsSpan(writePos)); - if (first < span.Length) - span[first..].CopyTo(ring); // remainder wraps to the front - - var newLen = len + span.Length; - if (newLen > cap) - { - // Overwrote the oldest chars: advance start past them. - start = (start + (newLen - cap)) % cap; - len = cap; - } - else - { - len = newLen; - } - } - - /// Appends a ring buffer's retained chars, oldest-first, to . - private static void AppendRing(StringBuilder sb, char[] ring, int start, int len) - { - var first = Math.Min(len, ring.Length - start); - sb.Append(ring, start, first); - if (first < len) - sb.Append(ring, 0, len - first); } - // Retained for compatibility with tests that call it directly; the main - // execution path no longer uses this — BoundedDrainAsync caps at read time. + // Retained for compatibility with tests/benchmark that call it directly; the + // main execution path no longer uses this — output is bounded at read time by + // BoundedOutputReader. internal static string TruncateOutput(string output, int maxChars) { if (output.Length <= maxChars) diff --git a/src/Netclaw.Actors/Tools/ToolOutputSpill.cs b/src/Netclaw.Actors/Tools/ToolOutputSpill.cs new file mode 100644 index 000000000..b03df0141 --- /dev/null +++ b/src/Netclaw.Actors/Tools/ToolOutputSpill.cs @@ -0,0 +1,116 @@ +// ----------------------------------------------------------------------- +// +// Copyright (C) 2026 - 2026 Petabridge, LLC +// +// ----------------------------------------------------------------------- +using System.Diagnostics; +using System.Text; +using Netclaw.Tools; + +namespace Netclaw.Actors.Tools; + +/// +/// Bounds an already-redacted tool result to the inline budget N +/// () and, when it +/// exceeds N, spills the full result to +/// {SessionDirectory}/tool-calls/{toolCallId}.log and steers the model to +/// read a slice (file_read offset/limit) or grep it instead of re-running. +/// +/// +/// Called from DispatchingToolExecutor for every tool, right after +/// the central redaction — so bounding + spill happen once, uniformly, for the +/// main session and sub-agents alike (both funnel through the dispatcher), and +/// the spilled file is redacted for free. Tools only bound their own capture +/// for memory safety; they do not window or spill. The input here is already +/// redacted, so this method does NOT redact again. file_read keeps its +/// result at or under N and steers to the original file, so it never spills +/// here (its content is its own backing store). +/// +internal static class ToolOutputSpill +{ + private const string ToolCallsSubdirectory = "tool-calls"; + + // Content budget used when neither the tool nor the context supplies one + // (sub-agent / Empty / direct construction). Matches + // SessionTuning.MaxInlineToolResultChars's default so un-plumbed paths bound the + // same as the main session. + internal const int DefaultContentBudget = 12_000; + + /// + /// Returns unchanged if it fits + /// ; otherwise returns a -char + /// head+tail window plus a steer, having spilled the full result to a session + /// file (when a session directory and call id are available). The dispatcher + /// resolves from the tool's per-tool override or the + /// session content budget. + /// + public static async Task BoundAndSpillAsync( + string redactedResult, string? toolCallId, int budget, ToolExecutionContext? context, CancellationToken ct) + { + if (budget <= 0) + budget = DefaultContentBudget; + + if (redactedResult.Length <= budget) + return redactedResult; + + var inline = BoundedOutputReader.Window(redactedResult, budget); + var spillPath = await TryWriteSpillAsync(redactedResult, toolCallId, context, ct); + return Compose(inline, spillPath, redactedResult.Length, budget); + } + + private static async Task TryWriteSpillAsync( + string redacted, string? toolCallId, ToolExecutionContext? context, CancellationToken ct) + { + // A spill needs both a place (session dir) and a name (call id). Without + // either, degrade to inline-only. + if (context is null + || string.IsNullOrWhiteSpace(context.SessionDirectory) + || string.IsNullOrWhiteSpace(toolCallId)) + return null; + + // Best-effort: the inline head+tail is always returned, and a failed (or + // cancelled) on-disk copy must not fail the tool call — so the write is + // decoupled from the request's CancellationToken (the body is bounded by + // the capture ceiling, so the write is small and fast). The `ct` is kept + // in the signature for symmetry / future use. + _ = ct; + try + { + var dir = Path.Combine(context.SessionDirectory, ToolCallsSubdirectory); + Directory.CreateDirectory(dir); + // Sanitize the (provider-supplied) call id so a spill can never escape + // the tool-calls directory. + var path = Path.Combine(dir, SafeFileName(toolCallId!) + ".log"); + await File.WriteAllTextAsync(path, redacted, CancellationToken.None); + return path; + } + catch (Exception ex) when (ex is IOException + or UnauthorizedAccessException + or NotSupportedException + or System.Security.SecurityException) + { + Debug.WriteLine($"tool-output spill write failed: {ex.Message}"); + return null; + } + } + + private static string Compose(string inline, string? spillPath, int fullLength, int budget) + { + var sb = new StringBuilder(inline); + sb.Append($"\n\n[output truncated to {budget} chars of {fullLength}"); + if (spillPath is not null) + sb.Append($"; output saved to {spillPath} — read a slice with file_read (offset/limit) or grep it instead of re-running"); + sb.Append(']'); + return sb.ToString(); + } + + private static string SafeFileName(string id) + { + var invalid = Path.GetInvalidFileNameChars(); + Span buffer = id.Length is > 0 and <= 256 ? stackalloc char[id.Length] : new char[id.Length]; + for (var i = 0; i < id.Length; i++) + buffer[i] = invalid.Contains(id[i]) ? '_' : id[i]; + var safe = new string(buffer); + return string.IsNullOrWhiteSpace(safe) ? "tool-call" : safe; + } +} diff --git a/src/Netclaw.Configuration/Schemas/netclaw-config.v1.schema.json b/src/Netclaw.Configuration/Schemas/netclaw-config.v1.schema.json index 775f1bcfa..cbafd32a9 100644 --- a/src/Netclaw.Configuration/Schemas/netclaw-config.v1.schema.json +++ b/src/Netclaw.Configuration/Schemas/netclaw-config.v1.schema.json @@ -295,7 +295,8 @@ "MaxOutputChars": { "type": "integer", "minimum": 1, - "default": 32000 + "default": 256000, + "description": "Capture ceiling for tool output: the maximum characters captured in bounded memory to become the full-output spill file written under the session directory when a result exceeds the inline budget. This is NOT what the model sees inline — that is Session.Tuning.MaxInlineToolResultChars (plus any per-tool override). Output beyond this ceiling is discarded while the source keeps draining (so a live child process never deadlocks), and the spill is a head+tail view." }, "AudienceProfiles": { "$ref": "#/$defs/ToolAudienceProfiles" diff --git a/src/Netclaw.Configuration/SessionTuning.cs b/src/Netclaw.Configuration/SessionTuning.cs index c7aa5c78f..25d842fcb 100644 --- a/src/Netclaw.Configuration/SessionTuning.cs +++ b/src/Netclaw.Configuration/SessionTuning.cs @@ -32,10 +32,13 @@ public sealed record SessionTuning public int KeepRecentToolResults { get; init; } = 3; /// - /// Maximum number of characters from a single tool result that may be - /// inlined into conversation history. Oversized results are truncated to - /// protect the context window from verbose tool payloads (DOM dumps, - /// large JSON blobs, etc.). + /// The session content inline budget: the default maximum characters of + /// a tool result inlined into conversation history, for tools whose output the + /// model needs to read in full (file_read, web_fetch, memory recall, MCP). + /// Oversized results are truncated to a head+tail window by the dispatcher and + /// the full (redacted) output is spilled to a session file with a steer to read + /// a slice / grep. Verbose tools (shell) override this with a much smaller + /// per-tool budget (). /// public int MaxInlineToolResultChars { get; init; } = 12_000; diff --git a/src/Netclaw.Configuration/ToolConfig.cs b/src/Netclaw.Configuration/ToolConfig.cs index 638d5b60c..fd70b6214 100644 --- a/src/Netclaw.Configuration/ToolConfig.cs +++ b/src/Netclaw.Configuration/ToolConfig.cs @@ -12,7 +12,17 @@ public sealed class ToolConfig { public ShellExecutionMode? ShellMode { get; set; } public int ShellTimeoutSeconds { get; set; } = 60; - public int MaxOutputChars { get; set; } = 32_000; + + /// + /// The capture ceiling: the maximum characters of tool output captured (in + /// bounded memory) to become the spill body written to a session file. It is + /// NOT the inline budget — SessionTuning.MaxInlineToolResultChars (N) + /// owns what the model sees inline. Output beyond this ceiling is drained-and- + /// discarded (the source keeps draining so a live child never deadlocks) and the + /// spill is a head+tail view. Sized so the spill is useful while staying + /// redactable in a single in-memory pass. + /// + public int MaxOutputChars { get; set; } = 256_000; /// /// Maximum per-call timeout in seconds that the LLM can request via _timeout_seconds. diff --git a/src/Netclaw.Tools.Abstractions/INetclawTool.cs b/src/Netclaw.Tools.Abstractions/INetclawTool.cs index 2e2fbf177..fabe12872 100644 --- a/src/Netclaw.Tools.Abstractions/INetclawTool.cs +++ b/src/Netclaw.Tools.Abstractions/INetclawTool.cs @@ -44,6 +44,16 @@ public interface INetclawTool /// ACL grant category for policy filtering. string GrantCategory { get; } + /// + /// Inline character budget for this tool's result before the dispatcher + /// windows it (head+tail) and spills the full output to a session file with a + /// steer. 0 means "use the session content budget" + /// (SessionTuning.MaxInlineToolResultChars) — the default for content + /// tools whose output the model needs to read. Verbose tools (e.g. shell) + /// override this to a small value so their noisy output is bounded aggressively. + /// + int InlineOutputBudgetChars => 0; + /// JSON Schema describing the tool's parameters. JsonElement ParameterSchema { get; } diff --git a/src/Netclaw.Tools.Abstractions/NetclawTool.cs b/src/Netclaw.Tools.Abstractions/NetclawTool.cs index 1229d8683..243569039 100644 --- a/src/Netclaw.Tools.Abstractions/NetclawTool.cs +++ b/src/Netclaw.Tools.Abstractions/NetclawTool.cs @@ -123,6 +123,12 @@ protected bool TryParse( public abstract string GrantCategory { get; } public abstract JsonElement ParameterSchema { get; } + /// + /// Inline output budget; 0 = use the session content budget. First-party + /// tools override this to opt into a smaller (verbose) budget. + /// + public virtual int InlineOutputBudgetChars => 0; + /// /// Deserialize raw LLM arguments into the typed params record. /// Implemented by the source generator to handle JsonElement and native CLR types. diff --git a/src/Netclaw.Tools.Abstractions/ToolExecutionContext.cs b/src/Netclaw.Tools.Abstractions/ToolExecutionContext.cs index 486d2361d..ed049694e 100644 --- a/src/Netclaw.Tools.Abstractions/ToolExecutionContext.cs +++ b/src/Netclaw.Tools.Abstractions/ToolExecutionContext.cs @@ -165,6 +165,17 @@ public IReadOnlySet OneTimeApprovedPatterns /// public string? SessionDirectory { get; } + /// + /// The session content inline budget + /// (SessionTuning.MaxInlineToolResultChars), surfaced here so + /// DispatchingToolExecutor can bound a tool result and spill the + /// overflow to {SessionDirectory}/tool-calls/{callId}.log. The dispatcher + /// uses a tool's own InlineOutputBudgetChars override when set (verbose + /// tools), else this content budget. Zero when unset (the dispatcher falls back + /// to its built-in content default). + /// + public int MaxInlineToolResultChars { get; init; } + /// /// Resolved absolute working directory for the in-flight tool call. Set /// by the session pipeline from the candidate tool arguments,