diff --git a/benchmarks/Netclaw.Benchmarks/CaptureBenchmarks.cs b/benchmarks/Netclaw.Benchmarks/CaptureBenchmarks.cs
new file mode 100644
index 000000000..b4e2a44ae
--- /dev/null
+++ b/benchmarks/Netclaw.Benchmarks/CaptureBenchmarks.cs
@@ -0,0 +1,36 @@
+// -----------------------------------------------------------------------
+//
+// Copyright (C) 2026 - 2026 Petabridge, LLC
+//
+// -----------------------------------------------------------------------
+
+using BenchmarkDotNet.Attributes;
+using Netclaw.Actors.Tools;
+
+namespace Netclaw.Benchmarks;
+
+///
+/// Exercises the production capture path used by shell/background_job/file_read:
+/// drain a stream to the capture ceiling, then derive the small inline window
+/// (what ToolOutputSpill does before redaction). The point is to confirm
+/// allocation is O(capture ceiling), not O(total output) — peak allocation stays
+/// flat whether the source emits 1K chars or 50M.
+///
+[MemoryDiagnoser]
+public class CaptureBenchmarks
+{
+ private const int CaptureMax = 256_000; // ToolConfig.MaxOutputChars default
+ private const int InlineBudget = 2_000; // SessionTuning.MaxInlineToolResultChars default
+
+ [Params(1_000, 256_000, 50_000_000)]
+ public int TotalChars;
+
+ [Benchmark]
+ public async Task Capture_then_inline_window()
+ {
+ var reader = new SyntheticCharReader(TotalChars);
+ var (captured, _) = await BoundedOutputReader.DrainToWindowAsync(reader, CaptureMax, CancellationToken.None);
+ var inline = BoundedOutputReader.Window(captured, InlineBudget);
+ return inline.Length;
+ }
+}
diff --git a/benchmarks/Netclaw.Benchmarks/ShellDrainBenchmarks.cs b/benchmarks/Netclaw.Benchmarks/ShellDrainBenchmarks.cs
index 4df1a3143..a8ec55f05 100644
--- a/benchmarks/Netclaw.Benchmarks/ShellDrainBenchmarks.cs
+++ b/benchmarks/Netclaw.Benchmarks/ShellDrainBenchmarks.cs
@@ -15,8 +15,8 @@ namespace Netclaw.Benchmarks;
/// turning a child's stdout stream into a bounded, redaction-ready string.
///
/// The comparison is the regression fix for #1293:
-/// (head+tail ring buffer, capped at
-/// read time) versus the old path —
+/// (head+tail ring buffer,
+/// capped at read time) versus the old path —
/// followed by , which materialised the
/// entire output before applying the cap. The story we want the numbers to tell
/// is allocation shape: the old path is O(total output) and lands on the LOH for
@@ -53,7 +53,7 @@ public async Task ReadToEnd_ThenTruncate()
public async Task BoundedDrain()
{
var reader = new SyntheticCharReader(TotalChars);
- var (text, _) = await ShellTool.BoundedDrainAsync(reader, Cap);
+ var (text, _) = await BoundedOutputReader.DrainToWindowAsync(reader, Cap, CancellationToken.None);
return text.Length;
}
}
diff --git a/evals/README.md b/evals/README.md
index 77693a3d3..cbb60aee7 100644
--- a/evals/README.md
+++ b/evals/README.md
@@ -68,7 +68,7 @@ log patterns** (skill loading, memory recall, checkpoint formation).
| Grounding & Alignment | 3 | Uses tools to verify facts, admits uncertainty |
| Autonomy & Execution | 2 | Executes tasks rather than describing them |
| Subagents | 1 | Delegates through `spawn_agent` and verifies headless subagents complete ambiguous work without clarification loops |
-| Complex Task Execution | 3 | Multi-step tool chains complete successfully |
+| Complex Task Execution | 5 | Multi-step tool chains complete successfully, incl. bounded tool output — given only the goal (no handling hints), the agent retrieves a deep line from oversized shell output and from a large file, which is only possible by coping with the bound the way AGENTS.md/skills/steer text direct |
| Multi-Turn Conversation | 7 | Session resume and speaker attribution recall |
Each case defines multiple natural phrasings of the same intent. Each
diff --git a/evals/run-evals.sh b/evals/run-evals.sh
index 5095af05a..ab10c2409 100755
--- a/evals/run-evals.sh
+++ b/evals/run-evals.sh
@@ -358,6 +358,19 @@ start_eval_daemon() {
cp -r "$REPO_ROOT/evals/fixtures/agents/." "$EVAL_HOME/data/agents/"
fi
+ # Pre-seed a large (>256 KB) text file in the workspaces read-root for the
+ # bounded-tool-output file_read eval (complex_large_file_read_ranged). It must
+ # be too big for one inline read AND have model-unguessable content so the only
+ # way to answer "what's on line 5000" is to page it with file_read StartLine/Limit
+ # — the behavior the bounded-output steer is meant to elicit. A deterministic
+ # Lehmer PRNG (pure integer modular arithmetic, identical across awk impls)
+ # makes line 5000 reproducible so the eval can assert the exact value. Lives
+ # under workspaces (a global read-root) rather than identity/skills so it is
+ # not pulled into the system prompt or scanned as a skill. 30000 lines ≈ 314 KB.
+ mkdir -p "$EVAL_HOME/data/workspaces"
+ awk 'BEGIN{x=1;for(i=1;i<=30000;i++){x=(x*48271)%2147483647;print x}}' \
+ > "$EVAL_HOME/data/workspaces/netclaw-eval-largefile.txt"
+
# The eval container runs as the non-root `netclaw` user and needs write
# access to the bind-mounted identity, logs, skills, and data trees.
chmod -R ugo+rwX "$EVAL_HOME/identity" "$EVAL_HOME/logs" "$EVAL_HOME/data" "$EVAL_HOME/skills"
@@ -1056,6 +1069,45 @@ assert_complex_diagnose_self() {
stdout_contains '\[tool:call\] shell_execute' && stdout_contains 'netclaw.*doctor'
}
+# bounded-tool-output coverage (bound-tool-output-with-file-spill change).
+# These two cases assert on OUTCOME, not mechanism: the prompts state only the
+# goal and give the agent NO instructions about spilling, redirecting, re-running,
+# file_read, StartLine/Limit, or grep. How the agent handles oversized output must
+# come entirely from AGENTS.md, the netclaw-operations skill, and the steer text
+# in the tool result — coaching it in the prompt would be testing instruction-
+# following, not whether the real guidance surfaces work.
+#
+# The data is a deterministic Lehmer PRNG (pure integer modular arithmetic,
+# identical across awk implementations and the host that computed the expected
+# values), so the value at a deep line is reproducible AND un-fabricatable by the
+# model. Because the tool bounds any single read to ~N=2000 inline chars, the
+# deep-line value is unreachable from one read — so a correct answer can ONLY
+# come from the agent paging/reading the oversized output the way the steer asks.
+# Outcome therefore implies correct handling; no mechanism assertion is needed.
+
+# Large SHELL output: ~210 KB on stdout exceeds N, so the daemon spills it and
+# steers. Line 200 (value 872671849) sits past the inline window; reporting it
+# proves the agent retrieved it from the bounded/spilled output unaided.
+assert_complex_large_shell_output_spill() {
+ stdout_contains '\[tool:call\] shell_execute' && \
+ stdout_response_contains '872671849'
+}
+
+# Large FILE: a pre-seeded ~314 KB file (>256 KB, so file_read returns a bounded
+# sample + steer). The prompt asks for a small line WINDOW around 5000 rather than
+# exactly line 5000: the model pages correctly with file_read StartLine/Limit (the
+# behavior under test, every run) but can misindex the line by ±1 (the original
+# run treated the param's former name "Offset" as a 0-based skip-count — the bug
+# that motivated renaming it to the 1-based StartLine). A window makes line 5000
+# (value 1629331733) fall inside the returned slice regardless of any ±1 indexing,
+# so the case measures bounded-output paging, not exact index arithmetic. Line 5000
+# is ~52 KB in, well past the inline window, so reporting its value still proves
+# the agent paged rather than dumped.
+assert_complex_large_file_read_ranged() {
+ stdout_contains '\[tool:call\] file_read' && \
+ stdout_response_contains '1629331733'
+}
+
# Category 8: Multi-Turn Conversation (tests session-resume + KV cache behavior)
# All assertions run against the concatenated stdout of every turn in the case.
@@ -1466,6 +1518,25 @@ run_all() {
run_case complex_diagnose_self "shell_execute with netclaw doctor" \
"Run netclaw doctor and summarize any problems"
+ # bounded-tool-output: oversized SHELL output. The prompt states only the
+ # goal — run a command and report a deep line of its output. How to cope with
+ # the output being too large to return inline (read the spill the steer hands
+ # back, rather than re-running) must come from the agent's own guidance, not
+ # this prompt. The number is a deterministic-but-opaque Lehmer PRNG value; the
+ # assertion checks the agent reports the correct line-200 value (872671849).
+ run_case complex_large_shell_output_spill "retrieves a deep line from oversized shell output unaided" \
+ "Run this command with shell_execute and tell me the number it prints on line 200: awk 'BEGIN{x=1;for(i=1;i<=20000;i++){x=(x*48271)%2147483647;print x}}'" \
+ "Using shell_execute, run: awk 'BEGIN{x=1;for(i=1;i<=20000;i++){x=(x*48271)%2147483647;print x}}' — then tell me which number is printed on the 200th line of its output."
+
+ # bounded-tool-output: oversized FILE. The prompt states only the goal — read
+ # a deep line of a named large file. How to cope with it being too large for
+ # one read (page it with file_read StartLine/Limit, per the steer) must come from
+ # the agent's own guidance. The file is pre-seeded in start_eval_daemon; the
+ # assertion checks the agent reports the correct line-5000 value (1629331733).
+ run_case complex_large_file_read_ranged "retrieves a deep line from a large file unaided" \
+ "List the numbers on lines 4997 through 5003 of the file /home/netclaw/.netclaw/workspaces/netclaw-eval-largefile.txt" \
+ "Read lines 4997 to 5003 of /home/netclaw/.netclaw/workspaces/netclaw-eval-largefile.txt and list the numbers on those lines."
+
end_category
# ── Category 8: Multi-Turn Conversation ──
diff --git a/feeds/skills/.system/files/netclaw-operations/SKILL.md b/feeds/skills/.system/files/netclaw-operations/SKILL.md
index 17148ce3e..c4dc56ead 100644
--- a/feeds/skills/.system/files/netclaw-operations/SKILL.md
+++ b/feeds/skills/.system/files/netclaw-operations/SKILL.md
@@ -3,7 +3,7 @@ name: netclaw-operations
description: "REQUIRED when the user asks about scheduling, reminders, cron jobs, timers, background jobs, diagnostics, troubleshooting, MCP tools, daemon health, identity updates, or Netclaw capabilities and self-maintenance."
metadata:
author: netclaw
- version: "2.8.9"
+ version: "2.9.0"
---
# Netclaw Operations
@@ -218,7 +218,8 @@ Rules:
- The user must approve the command before it starts running in the background.
- Maximum 5 concurrent background jobs; overflow queues FIFO.
- Job definitions persist to `~/.netclaw/jobs/{id}.json`.
-- Output captured to `~/.netclaw/jobs/{id}/output.log`.
+- Output is captured (bounded; head+tail for very large output) to
+ `~/.netclaw/jobs/{id}/output.log` — `file_read`/`grep` it for detail beyond the tail.
Monitoring tools:
@@ -235,6 +236,26 @@ results proactively when the job completes.
Active background jobs appear in the `[active-background-jobs]` section of the
session context on every turn.
+## Large tool output
+
+Tool output is bounded to a small inline budget
+(`Session.Tuning.MaxInlineToolResultChars`, default 2000 chars) so it never floods
+the context window. When a tool's output exceeds that budget you get a head+tail
+view inline plus a pointer to the full output — not the whole thing:
+
+- **`shell_execute`** spills the full (redacted) output to
+ `{session}/tool-calls/{toolCallId}.log` and gives you the path. Read a slice with
+ `file_read` (`StartLine`/`Limit`) or `grep` it — do NOT re-run the command to see more.
+- **`file_read`** on a large file returns the head and steers you to read a
+ specific range with `StartLine`/`Limit` or `grep` (`StartLine` is a 1-based line
+ number — line 1 is the first line). Don't `cat` a huge file through
+ `shell_execute` to get around it — that just spills again.
+- **`background_job`** output goes to `~/.netclaw/jobs/{id}/output.log` (bounded);
+ `check_background_job` returns a tail, and you can `file_read`/`grep` the log for the rest.
+
+Reading a targeted range or grepping is always cheaper than re-running a command or
+re-reading a whole file. Secret-bearing values are redacted from all tool output.
+
## Tool Discovery
MCP tools are not loaded by default. Use `search_tools` to discover them:
diff --git a/openspec/changes/bound-tool-output-with-file-spill/.openspec.yaml b/openspec/changes/bound-tool-output-with-file-spill/.openspec.yaml
new file mode 100644
index 000000000..0ba725fbf
--- /dev/null
+++ b/openspec/changes/bound-tool-output-with-file-spill/.openspec.yaml
@@ -0,0 +1,2 @@
+schema: spec-driven
+created: 2026-06-03
diff --git a/openspec/changes/bound-tool-output-with-file-spill/design.md b/openspec/changes/bound-tool-output-with-file-spill/design.md
new file mode 100644
index 000000000..5bc714de2
--- /dev/null
+++ b/openspec/changes/bound-tool-output-with-file-spill/design.md
@@ -0,0 +1,176 @@
+## Context
+
+Three tool paths capture external output and feed it to the model:
+`shell_execute` (process pipes), `background_job` (process pipes, long-running),
+and `file_read` (a file on disk). Originally they bounded output inconsistently:
+
+- `shell_execute` drained its pipes into a head+tail window capped at
+ `ToolConfig.MaxOutputChars` (32000) via `BoundedDrainAsync` (#1293), in the tool.
+- The session pipeline then re-clamped **every** tool result to
+ `SessionTuning.MaxInlineToolResultChars` (12000), **head-only**, in
+ `SessionToolExecutionPipeline.ClampToolResult`.
+- `background_job` and the default `file_read` path did not bound memory at all —
+ they materialized the full output/file as a managed string before trimming
+ (#1300, #1301).
+
+Two truncation stages that disagreed on budget (32000 vs 12000) and policy
+(head+tail vs head-only), plus two unbounded OOM paths. Both Claude Code and
+opencode ship the same "bound inline, spill the rest to a file, steer the model
+to read ranges / grep" pattern; this change adopts it.
+
+**The load-bearing discovery (from the implementation review):** every tool
+result already funnels through **one chokepoint** — `DispatchingToolExecutor`,
+which both the main session pipeline and sub-agents call to run a tool, and which
+**already redacts every result centrally** (`SecretOutputRedactor.Redact`, before
+`ClampToolResult` ever runs). That makes the dispatcher — not the tool, not the
+pipeline clamp — the correct home for bounding + spill: it sees every result, for
+every audience, already redacted. Putting it anywhere else (per-tool, or in the
+main-session-only `ClampToolResult`) either duplicates the logic or misses the
+sub-agent path.
+
+## Goals / Non-Goals
+
+**Goals:**
+
+- One place that bounds tool output to an inline budget and, on overflow, spills
+ the full (redacted) output to a session file with a steer — uniform for the
+ main session and sub-agents.
+- A budget that fits the tool: small for *verbose* tools (shell), generous for
+ *content* tools (file_read, web_fetch, MCP, memory) the model fetched to read.
+- Bounded memory on every capture path — no path materializes arbitrarily large
+ output as a managed string (closes #1300, #1301 OOM).
+- A single shared bounded-output reader so the ring/window logic is reviewed and
+ fixed once, not copy-pasted (the #1293 review's altitude finding).
+
+**Non-Goals:**
+
+- Spill-file retention / cleanup (owned by the session-log-cleanup issue).
+- Streaming redaction / byte-complete spill of multi-GB floods (the capture
+ ceiling bounds the spill; see D7 risks).
+- A new redaction abstraction — the central `SecretOutputRedactor` already covers
+ every result.
+- Inferring from a shell command's AST that its output equals an existing file
+ (rejected: output ≠ a referenced file once piped/filtered; false positives
+ mislead).
+
+## Decisions
+
+### D1 — Bounding + spill live in `DispatchingToolExecutor`, the one chokepoint
+
+Immediately after the dispatcher's existing central redaction, bound the result
+to the resolved budget and, when it overflows, window it head+tail and spill the
+full redacted result to a session file with a steer. Because every tool result
+(main + sub-agent) passes through the dispatcher already-redacted, this is a
+single uniform stage and the spilled file is redacted for free.
+
+- *Alternative — spill in the tool:* rejected. It duplicates logic across tools,
+ pushes the spill into N places, and the spill body would have to be re-redacted
+ per tool.
+- *Alternative — spill in `ClampToolResult` (pipeline):* rejected. That path is
+ main-session only; sub-agents call the dispatcher but not the pipeline clamp, so
+ this would miss them (the exact gap the review found).
+
+### D2 — `ClampToolResult` is retired (single truncation stage)
+
+With the dispatcher owning the bound, a second pipeline clamp would re-window an
+already-windowed+steered result (slicing its tail, stacking a second marker). The
+`ClampToolResult` calls and method are removed; the dispatcher is the only stage.
+
+### D3 — Per-tool inline budgets: one content default, opt-in verbose override
+
+The budget is a property of the tool, not a policy branch in the dispatcher.
+`INetclawTool.InlineOutputBudgetChars` defaults to `0` = "use the session content
+budget" (`SessionTuning.MaxInlineToolResultChars`, default **12000**). Verbose
+tools override it: `shell_execute` returns **2000**, because shell output is noise
+the model skims. Content tools (file_read, web_fetch, web_search, memory, MCP)
+take the 12000 content budget because the model fetched them to read in full —
+truncating those to a tiny window would force a wasteful extra `file_read`
+round-trip. The dispatcher resolves `tool.InlineOutputBudgetChars` ?? content
+default. Not a `shell` special-case, not per-tool busywork.
+
+- *Alternative — one global `N`:* rejected. A single small `N` starves content
+ tools; a single large `N` bloats shell context. Verbose-vs-content is genuinely
+ a per-tool property.
+
+### D4 — Spill file `{SessionDirectory}/tool-calls/{toolCallId}.log`
+
+The dispatcher has `toolCall.CallId` (a method parameter) and
+`context.SessionDirectory`, so it names the spill per call with no extra plumbing
+on `ToolExecutionContext`. The call id is sanitized against path traversal before
+use as a filename. No session dir / call id → degrade to inline-only window.
+
+### D5 — Redaction stays central; tools and the spill helper do not redact
+
+`DispatchingToolExecutor` already redacts every result with `SecretOutputRedactor`
+before bounding, so the inline result and the spilled file are both redacted from
+one pass — no new abstraction, no per-tool or per-chunk redaction. `file_read`'s
+own redact-on-read was therefore redundant and is removed (#1301's "no redaction"
+half was a false alarm — the dispatcher covered it; only its OOM half was real).
+
+### D6 — Tools shrink to "bound capture (OOM) + return raw"
+
+Tools no longer window, redact, or spill — they only bound their own *capture* for
+memory safety and return the raw bounded string; the dispatcher does the rest.
+`shell_execute` drains both pipes to `MaxOutputChars` and returns the combined
+output. `file_read` reads a bounded head (`ReadBoundedHeadAsync`, up to
+`MaxOutputChars` — never `ReadAllTextAsync`) and returns it; the dispatcher then
+applies the content budget + spill uniformly. The offset/limit path
+(`ReadLinesAsync`) is already bounded and unchanged.
+
+### D7 — Combined (not per-stream) bound for `shell_execute`
+
+`shell_execute` drains stdout and stderr each to `MaxOutputChars`, assembles the
+combined output, and re-windows the **combined** back to `MaxOutputChars` so the
+captured/spilled body is bounded by the ceiling (not 2×). This also resolves the
+per-stream doubling the review flagged.
+
+### D8 — `MaxOutputChars` is the capture ceiling
+
+`ToolConfig.MaxOutputChars` (raised 32000 → **256000**) is the memory/disk bound
+on what a tool captures — the body that becomes the spill. It is independent of
+the inline budget `N` (which the dispatcher applies). The pipe still drains past
+it to avoid child deadlock; the discarded middle leaves a `...` gap in the spill.
+
+### D9 — `background_job` keeps its own log path
+
+`background_job` is not an inline tool result (it delivers a tail + an on-disk log
+via an actor message), so it does **not** go through the dispatcher spill. It uses
+the shared `BoundedOutputReader` to drain its pipes in bounded memory (closing
+#1300) and marks output that exceeded its capture ceiling.
+
+## Risks / Trade-offs
+
+- **Capture ceiling clips the spill for a true flood** → e.g. `cat` of a 2 GB
+ file yields a ~512 KB head+tail spill with a `...` gap, not a duplicate. The
+ steer says "output saved to {path}" (not "full"), honest in both cases; the
+ skill steers the agent to `file_read` the source for files.
+- **Verbose budget (2000) is small** → intended; shell output is skimmable and
+ the full output is one `file_read`/`grep` away.
+- **Lowering then restoring the default** → `MaxInlineToolResultChars` stays
+ 12000 (content); only `shell_execute` is aggressive. Configs that set it
+ explicitly are unaffected.
+- **Sub-agent spills land in the parent session dir** → acceptable; the parent
+ owns the directory and its access scope.
+
+## Migration Plan
+
+1. Land `BoundedOutputReader` (shared reader) — no behavior change.
+2. Add the per-tool budget rail (`INetclawTool.InlineOutputBudgetChars`).
+3. Move bound+spill into `DispatchingToolExecutor`; retire `ClampToolResult`.
+4. Shrink the tools (shell/file_read) to bound-capture-and-return; remove their
+ redaction.
+5. `background_job` bounded drain + ceiling marker.
+6. Update the `netclaw-operations` skill + eval cases.
+
+Rollback: re-instate `ClampToolResult` and remove the dispatcher bound+spill; the
+in-tool capture bounds (the OOM fixes) remain.
+
+## Open Questions
+
+- **`background_job` byte-complete log:** capped at `MaxCapturedOutputChars`
+ (256000); a long job's log is a head+tail view. Revisit if a user needs full
+ job logs (would need a larger ceiling or streaming redaction).
+- **Spill for content tools with a natural backing (`file_read`):** today the
+ dispatcher may spill a bounded copy of file content. The copy is redacted and
+ capped, so it is harmless but slightly redundant with the original file; left
+ as-is rather than special-casing `file_read` out of the uniform path.
diff --git a/openspec/changes/bound-tool-output-with-file-spill/proposal.md b/openspec/changes/bound-tool-output-with-file-spill/proposal.md
new file mode 100644
index 000000000..7d833d0a3
--- /dev/null
+++ b/openspec/changes/bound-tool-output-with-file-spill/proposal.md
@@ -0,0 +1,108 @@
+## Why
+
+Large tool output is bounded inconsistently across the codebase, and the two
+truncation stages that do exist disagree. `shell_execute` now bounds its pipe
+reads to a head+tail window (`BoundedDrainAsync`, #1293), but the session
+pipeline then re-clamps every tool result to `MaxInlineToolResultChars`
+**head-only** — so the tail that `BoundedDrainAsync` worked to preserve (errors,
+exit status) is discarded before the model sees it, and `stderr` can be dropped
+entirely. Meanwhile two sibling paths still materialize unbounded output in
+memory and can OOM a memory-limited daemon: `background_job` output capture
+(#1300) and the default `file_read` path (#1301); `file_read` also returns file
+contents with no secret redaction at all.
+
+The fix is to make output bounding a single, shared, deliberate mechanism:
+one budget, one policy (head + tail), and — when output exceeds the budget —
+spill the full output to a session-scoped file and hand the model the path with
+a hint to read ranges or `grep` instead of re-running. This is the pattern both
+Claude Code and opencode ship, and it is the pattern `background-job-execution`
+already half-specifies (tail + output-file path).
+
+No dedicated PRD exists; this originates from the #1293 production OOM incident
+and its review (#1300, #1301). It should be linked from a PRD if one is opened.
+
+## What Changes
+
+- Bound tool output and spill the overflow in **one place** —
+ `DispatchingToolExecutor`, the chokepoint every tool result already passes
+ through (main session and sub-agents) and where the central
+ `SecretOutputRedactor.Redact` already runs. Right after redaction it windows the
+ result to the tool's inline budget and, on overflow, spills the full redacted
+ result to `sessionDir/tool-calls/{toolCallId}.log` with a steer to `file_read`
+ (offset/limit) or `grep`. The spilled file is redacted for free.
+- **Per-tool inline budgets.** Add `INetclawTool.InlineOutputBudgetChars` (default
+ `0` = use the session content budget). `shell_execute` overrides to **2000**
+ (verbose output the model skims); content tools (`file_read`, `web_fetch`,
+ `web_search`, memory, MCP) use the **12000** content budget because the model
+ fetched them to read in full.
+- **Retire `ClampToolResult`.** The dispatcher is the single truncation stage;
+ the head-only pipeline clamp is removed (no double-clamp).
+- `SessionTuning.MaxInlineToolResultChars` stays **12000** (the content default);
+ only verbose tools opt down. `ToolConfig.MaxOutputChars` becomes the **capture
+ ceiling** (32000 → **256000**) — the memory/disk bound on what a tool captures.
+- Tools shrink to "bound capture (OOM) + return raw": `shell_execute` drains both
+ pipes to the ceiling and returns the combined output (no per-tool window /
+ redact / spill); `file_read` reads a bounded head (no `ReadAllTextAsync`) and
+ returns it (closes #1301 OOM). The central dispatcher redaction already covered
+ `file_read`, so its redundant redact-on-read is removed (#1301's redaction half
+ was a false alarm).
+- `background_job`: bounded pipe drain via the shared reader + a capture-ceiling
+ marker (closes #1300 OOM). It keeps its own on-disk log, not the dispatcher
+ spill.
+
+### Out of scope (this change)
+
+- Spill-file lifecycle/cleanup (tracked separately; the session-log cleanup
+ issue owns retention/sweep).
+- A byte-complete (unbounded) spill: capture is bounded by `MaxOutputChars`, so a
+ multi-hundred-MB flood is captured head+tail, not in full. See design D7/D8.
+- Inferring from a shell command's AST that its output equals an existing file
+ (rejected — output ≠ a referenced file once piped/filtered).
+- **Media egress (#1296)** — image/AV bytes sent *to* the model. It shares the
+ "don't `ReadAllBytes` a huge thing" lesson but needs a different fix
+ (downscale/streamed-encode/provider file APIs), so it stays a separate change;
+ this one closes only the **text** tool-output paths (#1300, #1301).
+
+## Capabilities
+
+### New Capabilities
+
+- `bounded-tool-output`: the cross-cutting contract — `DispatchingToolExecutor`
+ bounds every (already-redacted) tool result to the tool's inline budget and
+ spills the overflow to a session file with a steer; per-tool budgets
+ (`InlineOutputBudgetChars`, content default vs verbose override); a shared
+ bounded-output reader; the capture ceiling. Bounded memory on every path.
+
+### Modified Capabilities
+
+- `netclaw-tools`: `shell_execute` and `file_read` shrink to bounded-capture-and-
+ return; `shell_execute` declares the small verbose budget and bounds combined
+ stdout+stderr; `file_read` reads a bounded head (no full materialization). The
+ truncation requirement moves from a per-tool indicator to the central
+ dispatcher's window + spill + steer.
+- `netclaw-session`: tool-result inlining no longer clamps in the pipeline
+ (`ClampToolResult` removed); the dispatcher is the single bounding+spill stage.
+- `background-job-execution`: output capture SHALL bound memory (shared reader +
+ ceiling marker) rather than buffering the full output before trimming.
+
+## Impact
+
+- **Code:** `DispatchingToolExecutor` (bound+spill), `ToolOutputSpill` (the
+ bound+spill helper), `INetclawTool`/`NetclawTool` (`InlineOutputBudgetChars`
+ rail), `ShellTool` (declares 2000; bound combined; drop redact/spill),
+ `FileReadTool` (bounded head; drop redundant redaction), `BoundedOutputReader`
+ (shared reader), `BackgroundJobExecutionActor` (bounded drain + marker),
+ `SessionToolExecutionPipeline` (`ClampToolResult` removed), `ToolExecutionContext`
+ (`MaxInlineToolResultChars` content budget; `ToolCallId` removed — the
+ dispatcher uses `toolCall.CallId`).
+- **Config:** `SessionTuning.MaxInlineToolResultChars` stays 12000 (content);
+ `ToolConfig.MaxOutputChars` 32000 → 256000 (capture ceiling; schema default
+ updated).
+- **Security:** redaction unchanged in placement (central dispatcher) — the spill
+ file is redacted because redaction runs before it; spill files live under the
+ session directory and inherit its access scope.
+- **Operational:** large tool outputs produce a session-dir `.log` the agent
+ reads on demand; the model is steered toward ranged reads/`grep`. Closes #1300
+ and #1301; resolves the #1293 review's two-stage-truncation tension.
+- **Evals/skills:** tool-output behavior changes — update `netclaw-operations`
+ (done) and any eval cases asserting on tool-result truncation/format.
diff --git a/openspec/changes/bound-tool-output-with-file-spill/specs/background-job-execution/spec.md b/openspec/changes/bound-tool-output-with-file-spill/specs/background-job-execution/spec.md
new file mode 100644
index 000000000..a7a7efe3b
--- /dev/null
+++ b/openspec/changes/bound-tool-output-with-file-spill/specs/background-job-execution/spec.md
@@ -0,0 +1,76 @@
+## MODIFIED Requirements
+
+### Requirement: Background job execution
+
+The system SHALL spawn a `BackgroundJobExecutionActor` child of the manager
+for each background job. The execution actor SHALL start the process, capture
+stdout/stderr to `~/.netclaw/jobs/{id}/output.log`, and monitor for process
+exit. Output capture SHALL bound peak managed memory per the `bounded-tool-output`
+capability — the execution actor SHALL stream output to the log and retain only a
+bounded tail in memory for the completion message, and SHALL NOT materialize the
+full output as a single in-memory string before trimming. On process exit, the
+actor SHALL deliver the result to the originating session via
+`DeliverTrustedSessionTurn`. This trusted delivery SHALL be an intentional parity
+decision with normal synchronous shell tool results, not a trust escalation
+beyond the original tool execution. On timeout, the actor SHALL kill the entire
+process tree.
+
+#### Scenario: Process started and monitored
+
+- **GIVEN** a `StartBackgroundJob` command is accepted
+- **WHEN** the execution actor starts
+- **THEN** the process is spawned with stdin closed
+- **AND** stdout/stderr are captured to the output log file
+- **AND** the manager returns a `BackgroundJobStarted` response with the job ID
+
+#### Scenario: Output capture bounds memory
+
+- **GIVEN** a background job process emits output far larger than the capture
+ ceiling
+- **WHEN** the execution actor captures it
+- **THEN** peak managed memory stays on the order of the capture ceiling
+- **AND** the full output is not held as a single in-memory string
+- **AND** the daemon is not OOM-killed by the capture
+
+#### Scenario: Process completes successfully
+
+- **GIVEN** a background job process exits with code 0
+- **WHEN** the execution actor detects the exit
+- **THEN** the result (exit code, bounded output tail, output file path, original
+ rationale) is delivered to the originating session via
+ `DeliverTrustedSessionTurn`
+- **AND** the job definition is updated to completed status
+
+#### Scenario: Process fails
+
+- **GIVEN** a background job process exits with non-zero code
+- **WHEN** the execution actor detects the exit
+- **THEN** the failure result (exit code, error output, original rationale) is
+ delivered to the originating session via `DeliverTrustedSessionTurn`
+- **AND** the job definition is updated to failed status
+
+#### Scenario: Process timeout
+
+- **GIVEN** a background job exceeds its timeout
+- **WHEN** the timeout fires
+- **THEN** the entire process tree is killed
+- **AND** a timeout error is delivered to the originating session
+- **AND** the job definition is updated to timed-out status
+
+#### Scenario: Job result delivery to passivated session
+
+- **GIVEN** a background job completes
+- **AND** the originating session has been passivated (idle timeout)
+- **WHEN** the result is delivered via `DeliverTrustedSessionTurn`
+- **THEN** the session rehydrates from the Akka Persistence journal
+- **AND** processes the result turn
+
+#### Scenario: Trusted delivery preserves synchronous shell parity
+
+- **GIVEN** a background job was started from an approved `shell_execute` call
+- **WHEN** the job completes
+- **THEN** the completion is delivered as a trusted turn
+- **AND** that trust matches the trust level of the same shell result if it had
+ completed synchronously
+- **AND** no broader trust is granted than the persisted originating
+ audience/boundary
diff --git a/openspec/changes/bound-tool-output-with-file-spill/specs/bounded-tool-output/spec.md b/openspec/changes/bound-tool-output-with-file-spill/specs/bounded-tool-output/spec.md
new file mode 100644
index 000000000..a05ccd6f4
--- /dev/null
+++ b/openspec/changes/bound-tool-output-with-file-spill/specs/bounded-tool-output/spec.md
@@ -0,0 +1,92 @@
+## ADDED Requirements
+
+### Requirement: Central bound + spill for every tool result
+
+`DispatchingToolExecutor` SHALL bound every tool result to an inline budget and,
+when the result exceeds that budget, return a head+tail window of the budget plus
+a steer pointing the model at the full output. This applies uniformly to every
+tool, for the main session and for sub-agents (both run tools through the
+dispatcher). The bound SHALL be applied after the dispatcher's central secret
+redaction, so the inline result and any spilled file are redacted from one pass.
+Individual tools SHALL NOT window, redact, or spill their results; they only bound
+their own capture for memory safety and return the raw bounded result.
+
+#### Scenario: Result under budget returned unchanged
+
+- **WHEN** a tool returns a result at or below its inline budget
+- **THEN** the dispatcher returns it unchanged
+- **AND** no spill file is created
+
+#### Scenario: Result over budget windowed to head and tail
+
+- **WHEN** a tool returns a result larger than its inline budget
+- **THEN** the inline result contains the head and the tail of the result within
+ the budget, with the discarded middle marked
+
+#### Scenario: Same bounding for sub-agent tool calls
+
+- **WHEN** a sub-agent runs a tool that returns an oversized result
+- **THEN** the same dispatcher bound + spill applies (sub-agents are not exempt)
+
+### Requirement: Per-tool inline budget
+
+The inline budget SHALL be a property of the tool. `INetclawTool.InlineOutputBudgetChars`
+SHALL default to `0`, meaning "use the session content budget"
+(`SessionTuning.MaxInlineToolResultChars`). A tool MAY override it: verbose tools
+(e.g. `shell_execute`) override to a small value because their output is skimmed;
+content tools keep the larger content budget because the model fetched them to
+read in full. The dispatcher SHALL resolve the budget as the tool's override when
+positive, else the session content budget.
+
+#### Scenario: Verbose tool bounded small
+
+- **GIVEN** a tool declares a small `InlineOutputBudgetChars` (e.g. shell)
+- **WHEN** its output exceeds that small budget
+- **THEN** the dispatcher windows it to the small budget and spills the rest
+
+#### Scenario: Content tool bounded by the session content budget
+
+- **GIVEN** a tool with no override (e.g. file_read, web_fetch, MCP)
+- **WHEN** its output is at or below the content budget
+- **THEN** the full result is returned inline with no spill
+
+### Requirement: Spill to a session-scoped file with a steer
+
+When a result exceeds its inline budget, the dispatcher SHALL write the full
+(already-redacted) result to `{SessionDirectory}/tool-calls/{toolCallId}.log`,
+using the call's identifier (sanitized so it cannot escape the directory), and
+SHALL append a steer with the path directing the model to read a slice (`file_read`
+with offset/limit) or `grep` rather than re-running the tool. When no session
+directory or call id is available, the dispatcher SHALL return the inline window
+without spilling.
+
+#### Scenario: Spill file written and path returned
+
+- **WHEN** a result over budget is produced in a session with a directory
+- **THEN** the full redacted result is written to
+ `{SessionDirectory}/tool-calls/{toolCallId}.log`
+- **AND** the inline result includes that path and a steer to `file_read`/`grep`
+
+#### Scenario: Spilled file is redacted
+
+- **WHEN** a result containing a secret is spilled
+- **THEN** the on-disk spill file has the secret redacted (redaction runs before
+ the spill write)
+
+#### Scenario: Call id cannot escape the spill directory
+
+- **WHEN** the call id contains path-traversal characters
+- **THEN** the spill file is written inside the tool-calls directory, never outside it
+
+### Requirement: Bounded-memory capture
+
+Tools SHALL bound their own capture so peak managed memory is on the order of the
+capture ceiling (`ToolConfig.MaxOutputChars`), independent of total output size.
+No tool SHALL materialize the entire output of an arbitrarily large source as a
+single in-memory string before bounding it.
+
+#### Scenario: Large source does not scale memory
+
+- **WHEN** a tool reads from a source far larger than the capture ceiling
+- **THEN** peak managed allocation stays on the order of the capture ceiling
+- **AND** the process is not OOM-killed by the capture
diff --git a/openspec/changes/bound-tool-output-with-file-spill/specs/netclaw-session/spec.md b/openspec/changes/bound-tool-output-with-file-spill/specs/netclaw-session/spec.md
new file mode 100644
index 000000000..c0c6cb8a7
--- /dev/null
+++ b/openspec/changes/bound-tool-output-with-file-spill/specs/netclaw-session/spec.md
@@ -0,0 +1,33 @@
+## MODIFIED Requirements
+
+### Requirement: Tool execution encapsulation
+
+Tool execution SHALL be encapsulated in a `SessionToolExecutionPipeline` static
+utility class. The pipeline SHALL execute tool calls in parallel, track sub-agent
+activity, and send `ToolExecutionCompleted` or `ToolExecutionFailed` back to the
+actor. The pipeline SHALL NOT itself bound or clamp tool-result size: bounding the
+result to the inline budget and spilling the overflow is done once, centrally, by
+`DispatchingToolExecutor` (per the `bounded-tool-output` capability), so the
+pipeline stores the result the dispatcher already bounded. `SessionTuning.MaxInlineToolResultChars`
+is the session **content** budget the dispatcher uses for tools without a smaller
+per-tool override.
+
+#### Scenario: Parallel tool execution
+
+- **GIVEN** an LLM response contains 3 tool calls
+- **WHEN** `SessionToolExecutionPipeline.ExecuteToolsAsync()` runs
+- **THEN** all 3 tool calls execute in parallel
+- **AND** results are collected and sent as a single `ToolExecutionCompleted`
+
+#### Scenario: Tool execution timeout
+
+- **GIVEN** tool execution is in progress
+- **WHEN** the configured `ToolExecutionTimeout` elapses
+- **THEN** the pipeline sends `ToolExecutionFailed` with a `TimeoutException`
+
+#### Scenario: Oversized result already bounded by the dispatcher
+
+- **GIVEN** a tool returns an oversized result
+- **WHEN** it reaches the pipeline
+- **THEN** the pipeline stores it as-is (already windowed + spilled by
+ `DispatchingToolExecutor`) without re-clamping
diff --git a/openspec/changes/bound-tool-output-with-file-spill/specs/netclaw-tools/spec.md b/openspec/changes/bound-tool-output-with-file-spill/specs/netclaw-tools/spec.md
new file mode 100644
index 000000000..78ff1f0b9
--- /dev/null
+++ b/openspec/changes/bound-tool-output-with-file-spill/specs/netclaw-tools/spec.md
@@ -0,0 +1,86 @@
+## MODIFIED Requirements
+
+### Requirement: Shell execution tool
+
+The system SHALL provide a shell execution tool that runs commands as the
+Netclaw process user context. Stdin SHALL be closed (no interactive commands).
+Execution SHALL enforce a configurable timeout (default: 60 seconds). The tool
+SHALL drain stdout and stderr in bounded memory (each to the capture ceiling
+`ToolConfig.MaxOutputChars`) and return the combined output bounded to the
+ceiling — it does NOT itself window, redact, or spill (the central
+`bounded-tool-output` mechanism does, after redaction). `shell_execute` SHALL
+declare a small verbose inline budget (`InlineOutputBudgetChars`) so its skimmable
+output is bounded aggressively. Before execution, the shell tool SHALL check the
+hard deny list via `ShellCommandPolicy`; hard-denied commands SHALL be rejected
+before `ToolPathPolicy` path checks.
+
+#### Scenario: Execute command and return output
+
+- **GIVEN** the `shell` grant is available for the session
+- **WHEN** the agent invokes the shell tool with a command
+- **THEN** the command is executed as the Netclaw process user
+- **AND** stdout and stderr are captured
+- **AND** the combined output is returned to the LLM
+
+#### Scenario: Hard-denied command rejected before execution
+
+- **GIVEN** the agent invokes `shell_execute` with `netclaw daemon stop`
+- **WHEN** `ShellCommandPolicy` evaluates the command
+- **THEN** the command is rejected with "Command blocked by hard deny policy"
+- **AND** the shell process is never started
+
+#### Scenario: Execution timeout enforced
+
+- **GIVEN** a shell command is running
+- **WHEN** the command exceeds the configured timeout (default: 60 seconds)
+- **THEN** the process is terminated
+- **AND** the tool returns a timeout error message to the LLM
+
+#### Scenario: Combined output bounded by the capture ceiling
+
+- **GIVEN** a shell command writes large output to both stdout and stderr
+- **WHEN** the output is captured
+- **THEN** the returned combined output is bounded by `MaxOutputChars` (one shared
+ ceiling, not a per-stream cap)
+- **AND** the dispatcher applies the inline budget + spill + steer on top
+ (per `bounded-tool-output`)
+
+#### Scenario: Stdin closed prevents interactive commands
+
+- **GIVEN** the agent invokes the shell tool with a command
+- **WHEN** the process is created
+- **THEN** stdin is closed immediately
+- **AND** commands that require interactive input fail promptly
+
+#### Scenario: Working directory set to project path
+
+- **GIVEN** the session is associated with a registered project
+- **WHEN** the shell tool executes a command
+- **THEN** the working directory is set to the project's registered path
+
+## ADDED Requirements
+
+### Requirement: File read tool bounds its read for memory safety
+
+The `file_read` tool's default (no `offset`/`limit`) path SHALL read a bounded
+head of the file (up to `ToolConfig.MaxOutputChars`) and stop — it SHALL NOT read
+the entire file into memory before truncating. The existing line-range
+(`offset`/`limit`) path SHALL remain bounded. `file_read` SHALL NOT redact its
+result itself; the central `DispatchingToolExecutor` redaction covers it. The
+inline bound + spill (if any) is applied centrally per `bounded-tool-output`;
+`file_read` is a content tool and uses the session content budget.
+
+#### Scenario: Large file is read in bounded memory
+
+- **WHEN** the agent reads a file larger than the capture ceiling with no
+ `offset`/`limit`
+- **THEN** the tool reads only a bounded head and does not materialize the whole
+ file in memory
+- **AND** it appends a steer to read a specific range (`offset`/`limit`) or `grep`
+
+#### Scenario: Secrets in a read file are redacted by the dispatcher
+
+- **GIVEN** a file contains a secret-bearing value (e.g. an API key)
+- **WHEN** the agent reads the file
+- **THEN** the result returned to the model has the secret redacted (by the
+ central dispatcher redaction)
diff --git a/openspec/changes/bound-tool-output-with-file-spill/tasks.md b/openspec/changes/bound-tool-output-with-file-spill/tasks.md
new file mode 100644
index 000000000..4050a14fa
--- /dev/null
+++ b/openspec/changes/bound-tool-output-with-file-spill/tasks.md
@@ -0,0 +1,159 @@
+## 1. Shared bounded-output reader (foundation, no behavior change)
+
+- [x] 1.1 Extract `BoundedDrainAsync` from `ShellTool` into a reusable
+ `BoundedOutputReader` (Netclaw.Actors/Tools), keeping the #1293 ring core
+ (pooled buffer, `ValueTask` reads, block-copy tail ring)
+- [x] 1.2 Expose `DrainToWindowAsync(TextReader, int budget, ct) → (string Text,
+ bool Truncated)` (head+tail, in-memory only)
+- [x] 1.3 Expose `DrainCaptureAsync(TextReader, int captureMax, int inlineBudget,
+ ct) → (string Captured, string Inline, bool CeilingExceeded)` over a shared
+ core, plus a pure `Window(string, budget)` helper; drain past `captureMax`
+- [x] 1.4 Move/adapt the `BoundedDrainAsync` unit tests onto `BoundedOutputReader`
+ (`DrainToWindow_*`), add `Window` + `DrainCapture` coverage
+- [x] 1.5 Repoint `ShellTool` and the benchmark at
+ `BoundedOutputReader.DrainToWindowAsync` with no behavior change; 40 reader +
+ shell tests green
+
+## 2. Tool-call id and inline budget plumbed to the capture layer
+
+- [x] 2.1 Add `ToolCallId` (typed value object, nullable) to `ToolExecutionContext`
+- [x] 2.2 Set `ToolCallId` in `BuildToolExecutionContext` (per-call) from `tc.CallId`
+- [x] 2.3 Add `MaxInlineToolResultChars` to `ToolExecutionContext` and thread `N`
+ through `BuildToolExecutionContext` so tools bound to the same `N` the
+ pipeline enforces
+- [x] 2.4 Carry/distinctness is compiler-enforced (per-call context) + verified
+ end-to-end by the task-4 spill test ({callId}.log); skip a trivial
+ assignment test per the testing guidelines. (Sub-agent/direct-construction
+ contexts default to null id + 0 budget — fallback handled in task 3/4.)
+
+## 3. Spill writer + steering message
+
+- [x] 3.1 Add `ToolOutputSpill.RenderAsync`: redact the bounded capture once,
+ window the inline from the redacted text, and write
+ `{sessionDirectory}/tool-calls/{toolCallId}.log` (call id sanitized against
+ path traversal). Removed the redundant `DrainCaptureAsync` — the real flow
+ is DrainToWindow → redact → Window → spill (inline must come from the
+ *redacted* capture), so DrainToWindow + Window + the spill helper supersede it.
+- [x] 3.2 Steering message (path + "read a slice with file_read offset/limit or
+ grep instead of re-running") + a "capture ceiling exceeded" note
+- [x] 3.3 Tests: under-budget verbatim; over-budget spill written + redacted-on-disk
+ + steer; ceiling note; no-session degrade; path-traversal call id contained.
+ 15 reader+spill tests pass; slopwatch clean
+
+## 4. shell_execute adopts capture + spill
+
+- [x] 4.1 Switch `ShellTool` to combined-capture (one shared budget across
+ stdout+stderr via `DrainToWindowAsync` per stream → assembled) then
+ `ToolOutputSpill.RenderAsync`, which redacts once, windows to `N`, and
+ spills+steers. Removed ShellTool's own per-stream redaction and markers.
+- [x] 4.2 Replaced `Output_truncation_applies` with `Large_output_spills_to_file_and_steers`
+ (asserts inline head+tail + spill file + steer); kept the Windows-deterministic
+ `echo`. Redaction/echo/stderr tests still pass (redaction now in RenderAsync).
+- [x] 4.3 Drains-past-ceiling behavior is `DrainToWindowAsync`'s (proven by the
+ reader tests); the existing cancellation/kill test covers the no-deadlock path.
+
+## 5. background_job bounded capture (closes #1300)
+
+- [x] 5.1 `BackgroundJobExecutionActor` now drains each stream via
+ `BoundedOutputReader.DrainToWindowAsync` (capture ceiling
+ `MaxCapturedOutputChars = 256000`) instead of `ReadToEndAsync` — bounded
+ memory; the log is head+tail for floods larger than the ceiling. Closes #1300.
+- [x] 5.2 Redact-on-write unchanged (the existing `SecretOutputRedactor.Redact`
+ now runs over the bounded combined output before the log write)
+- [x] 5.3 Bounding is `DrainToWindowAsync`'s (unit-tested); the existing
+ BackgroundJob integration tests exercise the new drain path end-to-end.
+
+## 6. file_read bounded reads + redaction (closes #1301)
+
+- [x] 6.1 Default `file_read` path now uses `ReadBoundedHeadAsync` (reads up to the
+ limit and stops — bounds memory AND I/O) instead of `ReadAllTextAsync` +
+ `TruncateFileOutput`; no spill (the file is its own backing). Closes #1301.
+ (Head-only, not head+tail: a file is read top-down via Offset/Limit, and
+ head+tail would require reading the whole file to reach the tail.)
+- [x] 6.2 Over-budget steer: "read a specific range with Offset and Limit, or grep"
+- [x] 6.3 Redact-on-read via `SecretOutputRedactor` on both the default and the
+ `ReadLinesAsync` (offset/limit) return paths; offset/limit path stays bounded
+- [x] 6.4 Tests: large file returns a bounded head (first N only, not all 500
+ chars) + steer; secret in a read file is redacted. 2195 actor tests pass
+
+## 7. Config + pipeline unification
+
+- [x] 7.1 Lowered `SessionTuning.MaxInlineToolResultChars` default 12000 → 2000
+- [x] 7.2 Repurposed `ToolConfig.MaxOutputChars` as the capture ceiling, default
+ 32000 → 256000 (docs updated to reflect the new role)
+- [x] 7.3 `ClampToolResult` now head+tail (reuses `BoundedOutputReader.Window`);
+ safety net for non-shared-reader results (MCP, in-process)
+- [x] 7.4 Updated `netclaw-config.v1.schema.json` MaxOutputChars default → 256000
+ (MaxInlineToolResultChars schema has only `minimum:100`, which 2000 satisfies).
+ 363 config tests + 2194 actor tests pass; slopwatch clean
+
+## 8. Quality gates, docs, eval, and OpenSpec close-out
+
+- [x] 8.1 `dotnet slopwatch analyze` clean (run per group); headers on all new .cs
+- [x] 8.2 Updated `netclaw-operations` skill (new "Large tool output" section:
+ spill to session file, steer to ranged reads/grep, bounded job log) and
+ bumped `metadata.version` 2.8.9 → 2.9.0
+- [x] 8.3 Eval cases flagged + the two coverage gaps ADDED + suite run against
+ spark2 (Qwen3.6-35B, openai-compatible):
+ - **At-risk-but-robust (verified still green, 5/5 each):**
+ `complex_diagnose_self`, `complex_gh_issues`, `complex_write_and_run`,
+ `multi_turn_tool_repeat`, `multi_turn_tool_carryover`. Their asserts check
+ `[tool:call] shell_execute` + a keyword, not exact tool-result content —
+ so truncating output to N=2000 + spill doesn't break them.
+ - **New coverage cases added** (Category 7, `evals/run-evals.sh`), both 5/5.
+ They assert on OUTCOME, not mechanism: the prompts state only the goal and
+ give ZERO handling hints (no mention of spill/redirect/re-run/file_read/
+ Offset/grep) — coping with oversized output must come from AGENTS.md, the
+ netclaw-operations skill, and the tool-result steer text, or the eval is
+ just testing instruction-following. Data is a deterministic-but-opaque
+ Lehmer PRNG (`x=(x*48271)%2147483647`, identical across awk impls), so a
+ deep-line value is reproducible AND un-fabricatable; since one read is
+ bounded to ~N inline, retrieving a deep line is only possible by paging —
+ so a correct value *implies* correct handling (no mechanism assertion).
+ 1. `complex_large_shell_output_spill` — "run `awk '…'` and tell me the
+ number on line 200" (a bare safe-verb generator, no path tokens →
+ auto-approves headless; ~210 KB stdout → spills). Asserts the answer
+ contains line 200's value (872671849). Whether the agent reads the
+ spill or self-redirects+reads, the outcome assertion is satisfied.
+ 2. `complex_large_file_read_ranged` — a ~314 KB file pre-seeded by
+ `start_eval_daemon` under the workspaces read-root; "list lines
+ 4997–5003 of ". Asserts the answer contains line 5000's value
+ (1629331733). Line 5000 is ~52 KB in (past the inline window), so a
+ correct answer proves the agent paged with `file_read` Offset/Limit.
+ - **Finding (separate from this change):** the model reliably pages with
+ `file_read` Offset/Limit but treats `Offset` as **0-based** despite its
+ "(1-based)" param description (e.g. `Offset:4999` for "line 5000"). The
+ window prompt tolerates this so the case measures bounded-output paging,
+ not index arithmetic; the off-by-one itself likely warrants a clearer
+ `file_read.Offset` description / operations-skill note as a follow-up.
+ - Run: NETCLAW_EVAL_PROVIDER_TYPE=openai-compatible
+ NETCLAW_EVAL_PROVIDER_ENDPOINT=https://spark2.testlab.petabridge.net/
+ NETCLAW_EVAL_MODEL_ID=Qwen/Qwen3.6-35B-A3B-FP8
+ NETCLAW_EVAL_CATEGORY="Complex Task Execution" ./evals/run-evals.sh
+ → Complex Task Execution 5/5 (100%), both new cases 5/5 uncoached.
+- [x] 8.4 Added `CaptureBenchmarks` confirming O(ceiling): allocation flat at
+ ~1255 KB for 256K and 50M chars (vs unbounded before)
+- [ ] 8.5 `openspec validate` passes; sync/archive on PR merge (not yet merged)
+
+## 9. Architecture revision (post-implementation-review)
+
+A max-effort review of the per-tool spill design surfaced a HIGH-severity gap
+(sub-agent tool output unbounded) and that the spill was at the wrong altitude.
+The change pivoted; the artifacts above describe the original per-tool approach,
+the proposal/design now describe the as-built. Net deltas:
+
+- [x] 9.1 Move bound+spill into `DispatchingToolExecutor` (the one chokepoint both
+ main session and sub-agents use, already the central redaction point) —
+ retire `ClampToolResult`; tools shrink to bound-capture-and-return.
+- [x] 9.2 Per-tool inline budgets (`INetclawTool.InlineOutputBudgetChars`): content
+ default 12000 (restore `MaxInlineToolResultChars`), `shell_execute` opts to
+ 2000. Content tools (file_read/web_fetch/MCP/memory) no longer truncated to
+ a tiny window.
+- [x] 9.3 Remove `file_read` redundant redaction (central dispatcher covers it);
+ drop `ToolExecutionContext.ToolCallId` (dispatcher uses `toolCall.CallId`).
+- [x] 9.4 Steer wording: "output saved to {path}" (not "full"), honest for a
+ capture-ceiling-clipped flood. Rejected shell-AST file inference (output ≠ a
+ referenced file once piped/filtered).
+- [x] 9.5 Tests retargeted to the dispatcher (end-to-end verbose-spill, redact-
+ before-spill, content no-spill); `FakeToolExecutor` mirrors the dispatcher
+ post-processing. 2196 actor tests pass; slopwatch clean.
diff --git a/src/Netclaw.Actors.Tests/Sessions/ToolExecutionIntegrationTests.cs b/src/Netclaw.Actors.Tests/Sessions/ToolExecutionIntegrationTests.cs
index f65d84441..b23f6a560 100644
--- a/src/Netclaw.Actors.Tests/Sessions/ToolExecutionIntegrationTests.cs
+++ b/src/Netclaw.Actors.Tests/Sessions/ToolExecutionIntegrationTests.cs
@@ -249,8 +249,8 @@ await sessionManager.Ask(new SendUserMessage
var result = toolMessage!.Contents.OfType().Single().Result?.ToString();
Assert.NotNull(result);
- Assert.Contains("tool result truncated", result, StringComparison.OrdinalIgnoreCase);
- Assert.True(result!.Length < 300);
+ Assert.Contains("truncated", result, StringComparison.OrdinalIgnoreCase);
+ Assert.True(result!.Length < 500); // windowed to the 120-char budget + steer (vs the raw 800)
}
[Fact]
@@ -360,7 +360,7 @@ public Task AuthorizeAsync(FunctionCallContent toolCall, Netclaw.Tools.ToolExecu
return Task.CompletedTask;
}
- public Task ExecuteAsync(FunctionCallContent toolCall, Netclaw.Tools.ToolExecutionContext? context = null, CancellationToken ct = default)
+ public async Task ExecuteAsync(FunctionCallContent toolCall, Netclaw.Tools.ToolExecutionContext? context = null, CancellationToken ct = default)
{
Interlocked.Increment(ref _callCount);
@@ -370,7 +370,16 @@ public Task ExecuteAsync(FunctionCallContent toolCall, Netclaw.Tools.Too
}
var result = Results.GetValueOrDefault(toolCall.Name, $"[fake result for {toolCall.Name}]");
- return Task.FromResult(result);
+ // Mirror DispatchingToolExecutor's post-processing so integration tests see
+ // the same redact + inline-bound + spill the real executor applies. The fake
+ // has no tool instance, so it uses the session content budget (per-tool
+ // verbose overrides are exercised in DispatchingToolExecutorTests).
+ result = Netclaw.Security.SecretOutputRedactor.Redact(result);
+ var budget = context?.MaxInlineToolResultChars is > 0 and var b
+ ? b
+ : Netclaw.Actors.Tools.ToolOutputSpill.DefaultContentBudget;
+ return await Netclaw.Actors.Tools.ToolOutputSpill.BoundAndSpillAsync(
+ result, toolCall.CallId, budget, context, ct);
}
}
diff --git a/src/Netclaw.Actors.Tests/Tools/BoundedOutputReaderTests.cs b/src/Netclaw.Actors.Tests/Tools/BoundedOutputReaderTests.cs
new file mode 100644
index 000000000..dd72b9228
--- /dev/null
+++ b/src/Netclaw.Actors.Tests/Tools/BoundedOutputReaderTests.cs
@@ -0,0 +1,133 @@
+// -----------------------------------------------------------------------
+//
+// Copyright (C) 2026 - 2026 Petabridge, LLC
+//
+// -----------------------------------------------------------------------
+using Netclaw.Actors.Tools;
+using Xunit;
+
+namespace Netclaw.Actors.Tests.Tools;
+
+public class BoundedOutputReaderTests
+{
+ // ── DrainToWindowAsync ──
+
+ [Fact]
+ public async Task DrainToWindow_short_output_returned_verbatim()
+ {
+ var input = "hello world";
+ var (text, truncated) = await BoundedOutputReader.DrainToWindowAsync(new StringReader(input), 100, CancellationToken.None);
+ Assert.Equal(input, text);
+ Assert.False(truncated);
+ }
+
+ [Fact]
+ public async Task DrainToWindow_empty_input_returns_empty()
+ {
+ var (text, truncated) = await BoundedOutputReader.DrainToWindowAsync(new StringReader(""), 100, CancellationToken.None);
+ Assert.Equal("", text);
+ Assert.False(truncated);
+ }
+
+ [Fact]
+ public async Task DrainToWindow_output_exactly_at_cap_not_truncated()
+ {
+ var input = new string('a', 100);
+ var (text, truncated) = await BoundedOutputReader.DrainToWindowAsync(new StringReader(input), 100, CancellationToken.None);
+ Assert.Equal(input, text);
+ Assert.False(truncated);
+ }
+
+ [Fact]
+ public async Task DrainToWindow_long_output_truncated_with_head_and_tail()
+ {
+ // 100-char head marker + separator + 100-char tail marker, with filler in the middle
+ var head = new string('H', 100);
+ var middle = new string('M', 5000);
+ var tail = new string('T', 100);
+ var input = head + middle + tail;
+
+ var (text, truncated) = await BoundedOutputReader.DrainToWindowAsync(new StringReader(input), 200, CancellationToken.None);
+
+ Assert.True(truncated);
+ Assert.StartsWith(new string('H', 100), text); // head preserved
+ Assert.EndsWith(new string('T', 100), text); // tail preserved
+ Assert.Contains("...", text); // separator present
+ Assert.DoesNotContain("M", text); // middle discarded
+ }
+
+ [Fact]
+ public async Task DrainToWindow_head_and_tail_split_evenly()
+ {
+ // budget=10 → headCap=5, tailCap=5
+ var input = "AAAAAXXXXXXBBBBB"; // 16 chars: 5 head, 6 overflow discard, 5 tail
+ var (text, truncated) = await BoundedOutputReader.DrainToWindowAsync(new StringReader(input), 10, CancellationToken.None);
+
+ Assert.True(truncated);
+ Assert.StartsWith("AAAAA", text);
+ Assert.EndsWith("BBBBB", text);
+ }
+
+ [Fact]
+ public async Task DrainToWindow_disabled_cap_returns_full_output()
+ {
+ var input = new string('x', 10_000);
+ var (text, truncated) = await BoundedOutputReader.DrainToWindowAsync(new StringReader(input), 0, CancellationToken.None);
+ Assert.Equal(input, text);
+ Assert.False(truncated);
+ }
+
+ [Fact]
+ public async Task DrainToWindow_tail_ring_wraps_across_small_chunks()
+ {
+ // Drives the ring's wraparound + start-advance path that the StringReader
+ // tests skip: each read delivers a chunk smaller than tailCap, so the tail
+ // window is rebuilt incrementally and must wrap rather than reset wholesale.
+ // budget=10 → headCap=5 ("ABCDE"), tailCap=5; last 5 of "FGHIJKLMNO" = "KLMNO".
+ var reader = new ChunkedReader("ABCDEFGHIJKLMNO", chunkSize: 3);
+
+ var (text, truncated) = await BoundedOutputReader.DrainToWindowAsync(reader, 10, CancellationToken.None);
+
+ Assert.True(truncated);
+ Assert.Equal($"ABCDE{Environment.NewLine}...{Environment.NewLine}KLMNO", text);
+ }
+
+ // ── Window (pure string head+tail) ──
+
+ [Fact]
+ public void Window_under_budget_returned_unchanged()
+ {
+ Assert.Equal("short", BoundedOutputReader.Window("short", 100));
+ }
+
+ [Fact]
+ public void Window_over_budget_keeps_head_and_tail()
+ {
+ var input = new string('H', 50) + new string('M', 500) + new string('T', 50);
+ var result = BoundedOutputReader.Window(input, 100);
+
+ Assert.StartsWith(new string('H', 50), result);
+ Assert.EndsWith(new string('T', 50), result);
+ Assert.DoesNotContain("M", result);
+ }
+
+ // Hands out at most chunkSize chars per read so tests can exercise the tail
+ // ring's incremental wrap path — real pipe reads arrive in arbitrary slices,
+ // not the single 4KB gulp a StringReader gives.
+ private sealed class ChunkedReader(string data, int chunkSize) : TextReader
+ {
+ private int _pos;
+
+ public override ValueTask ReadAsync(Memory buffer, CancellationToken cancellationToken = default)
+ {
+ var remaining = data.Length - _pos;
+ if (remaining <= 0)
+ return ValueTask.FromResult(0);
+
+ var n = Math.Min(Math.Min(chunkSize, buffer.Length), remaining);
+ data.AsSpan(_pos, n).CopyTo(buffer.Span);
+ _pos += n;
+ return ValueTask.FromResult(n);
+ }
+ }
+}
diff --git a/src/Netclaw.Actors.Tests/Tools/DispatchingToolExecutorTests.cs b/src/Netclaw.Actors.Tests/Tools/DispatchingToolExecutorTests.cs
index 243e57b69..309d950d5 100644
--- a/src/Netclaw.Actors.Tests/Tools/DispatchingToolExecutorTests.cs
+++ b/src/Netclaw.Actors.Tests/Tools/DispatchingToolExecutorTests.cs
@@ -67,6 +67,112 @@ public DispatchingToolExecutorTests()
UsedStrictFallback: false)));
}
+ [Fact]
+ public async Task Verbose_tool_output_over_budget_is_windowed_and_spilled()
+ {
+ var sessionDir = Path.Combine(Path.GetTempPath(), "nc-disp-" + Guid.NewGuid().ToString("N"));
+ Directory.CreateDirectory(sessionDir);
+ try
+ {
+ // shell_execute declares the small verbose budget (2000); echo > 2000 chars.
+ var toolCall = new FunctionCallContent("call-spill", "shell_execute",
+ ToolInput.Create("Command", $"echo {new string('x', 3000)}"));
+ var context = new Netclaw.Tools.ToolExecutionContext("slack/thread-1", sessionDir)
+ {
+ Audience = TrustAudience.Personal,
+ };
+
+ var result = await _executor.ExecuteAsync(toolCall, context, CancellationToken.None);
+
+ Assert.True(result.Length < 3000); // windowed inline, not the full 3000
+ Assert.Contains("output saved to", result);
+ Assert.Contains("file_read", result);
+ var spill = Path.Combine(sessionDir, "tool-calls", "call-spill.log");
+ Assert.True(File.Exists(spill));
+ Assert.Contains(new string('x', 100), await File.ReadAllTextAsync(spill, CancellationToken.None));
+ }
+ finally
+ {
+ Directory.Delete(sessionDir, recursive: true);
+ }
+ }
+
+ [Fact]
+ public async Task Spilled_output_is_redacted_before_write()
+ {
+ var sessionDir = Path.Combine(Path.GetTempPath(), "nc-disp-" + Guid.NewGuid().ToString("N"));
+ Directory.CreateDirectory(sessionDir);
+ try
+ {
+ // Secret + padding so it both redacts and exceeds the shell budget → spills.
+ var toolCall = new FunctionCallContent("call-redact", "shell_execute",
+ ToolInput.Create("Command", $"echo API_KEY=supersecret123 {new string('x', 3000)}"));
+ var context = new Netclaw.Tools.ToolExecutionContext("slack/thread-1", sessionDir)
+ {
+ Audience = TrustAudience.Personal,
+ };
+
+ var result = await _executor.ExecuteAsync(toolCall, context, CancellationToken.None);
+ var onDisk = await File.ReadAllTextAsync(
+ Path.Combine(sessionDir, "tool-calls", "call-redact.log"), CancellationToken.None);
+
+ Assert.DoesNotContain("supersecret123", result);
+ Assert.DoesNotContain("supersecret123", onDisk); // redacted before the spill write
+ }
+ finally
+ {
+ Directory.Delete(sessionDir, recursive: true);
+ }
+ }
+
+ [Fact]
+ public async Task Small_output_is_redacted_without_spilling()
+ {
+ // Redaction happens centrally for every result, spill or not.
+ var toolCall = new FunctionCallContent("call-r", "shell_execute",
+ ToolInput.Create("Command", "echo API_KEY=secret123"));
+ var context = new Netclaw.Tools.ToolExecutionContext("signalr/thread-1", null)
+ {
+ Audience = TrustAudience.Personal,
+ };
+
+ var result = await _executor.ExecuteAsync(toolCall, context, CancellationToken.None);
+
+ Assert.Contains("API_KEY=***REDACTED***", result);
+ Assert.DoesNotContain("secret123", result);
+ Assert.DoesNotContain("saved to", result); // small → no spill
+ }
+
+ [Fact]
+ public async Task Content_tool_under_default_budget_not_spilled()
+ {
+ var sessionDir = Path.Combine(Path.GetTempPath(), "nc-disp-" + Guid.NewGuid().ToString("N"));
+ Directory.CreateDirectory(sessionDir);
+ try
+ {
+ // file_read has no verbose override → the 12000-char content budget; a
+ // small file is returned whole with no spill.
+ var file = Path.Combine(sessionDir, "note.txt");
+ await File.WriteAllTextAsync(file, "hello content", CancellationToken.None);
+ var toolCall = new FunctionCallContent("call-content", "file_read",
+ ToolInput.Create("Path", file));
+ var context = new Netclaw.Tools.ToolExecutionContext("slack/thread-1", sessionDir)
+ {
+ Audience = TrustAudience.Personal,
+ };
+
+ var result = await _executor.ExecuteAsync(toolCall, context, CancellationToken.None);
+
+ Assert.Contains("hello content", result);
+ Assert.DoesNotContain("saved to", result);
+ Assert.False(Directory.Exists(Path.Combine(sessionDir, "tool-calls")));
+ }
+ finally
+ {
+ Directory.Delete(sessionDir, recursive: true);
+ }
+ }
+
[Fact]
public async Task Routes_shell_execute()
{
diff --git a/src/Netclaw.Actors.Tests/Tools/FileReadToolTests.cs b/src/Netclaw.Actors.Tests/Tools/FileReadToolTests.cs
index 39a482219..7f96f253f 100644
--- a/src/Netclaw.Actors.Tests/Tools/FileReadToolTests.cs
+++ b/src/Netclaw.Actors.Tests/Tools/FileReadToolTests.cs
@@ -208,7 +208,7 @@ public async Task Read_with_offset_and_limit()
var lines = Enumerable.Range(1, 10).Select(i => $"Line {i}");
await File.WriteAllLinesAsync(filePath, lines, TestContext.Current.CancellationToken);
- var args = ToolInput.Create("Path", filePath, "Offset", 3, "Limit", 2);
+ var args = ToolInput.Create("Path", filePath, "StartLine", 3, "Limit", 2);
var result = await _tool.ExecuteAsync(args, CreatePersonalContext(), CancellationToken.None);
@@ -229,9 +229,13 @@ public async Task Large_file_is_truncated_with_continuation_hint()
var result = await tool.ExecuteAsync(args, CreatePersonalContext(), CancellationToken.None);
Assert.Contains("output truncated", result);
- Assert.Contains("Offset=", result);
+ Assert.Contains("StartLine", result);
+ Assert.Contains("grep", result);
+ // Bounded read: only the first 100 chars are materialized, not all 500.
+ Assert.StartsWith(new string('x', 100), result);
}
+
[Fact]
public async Task Paginated_read_truncated_by_char_limit_includes_continuation_hint()
{
@@ -240,11 +244,11 @@ public async Task Paginated_read_truncated_by_char_limit_includes_continuation_h
var lines = Enumerable.Range(1, 20).Select(i => $"Line {i:D2} content here");
await File.WriteAllLinesAsync(filePath, lines, TestContext.Current.CancellationToken);
- var args = ToolInput.Create("Path", filePath, "Offset", 1, "Limit", 20);
+ var args = ToolInput.Create("Path", filePath, "StartLine", 1, "Limit", 20);
var result = await tool.ExecuteAsync(args, CreatePersonalContext(), CancellationToken.None);
Assert.Contains("output truncated", result);
- Assert.Contains("Offset=", result);
+ Assert.Contains("StartLine=", result);
}
[Fact]
diff --git a/src/Netclaw.Actors.Tests/Tools/ShellToolTests.cs b/src/Netclaw.Actors.Tests/Tools/ShellToolTests.cs
index 6be5ddee5..06f384156 100644
--- a/src/Netclaw.Actors.Tests/Tools/ShellToolTests.cs
+++ b/src/Netclaw.Actors.Tests/Tools/ShellToolTests.cs
@@ -100,19 +100,20 @@ public async Task Caller_cancellation_kills_child_process_tree_and_returns_grace
}
[Fact]
- public async Task Output_truncation_applies()
+ public async Task ShellTool_returns_raw_combined_output_without_spilling()
{
- var tool = new ShellTool(new ToolConfig { MaxOutputChars = 50 });
- // Write >50 chars to stdout with a command that needs no interpreter.
- // `echo` is a builtin on both bash and cmd.exe; a long literal is
- // deterministic. (python on the Windows runner resolves to the Store
- // stub, which writes to stderr — so the truncation would land on stderr
- // and the marker would read "[stderr truncated", not "[stdout truncated".)
+ // ShellTool only returns its (bounded) raw output now — redaction and the
+ // inline-budget bound + spill happen centrally in DispatchingToolExecutor
+ // (covered by DispatchingToolExecutorTests). `echo` is a builtin on both
+ // bash and cmd.exe; a long literal is deterministic on stdout.
+ var tool = new ShellTool(new ToolConfig());
var args = ToolInput.Create("Command", $"echo {new string('x', 200)}");
var result = await tool.ExecuteAsync(args, CancellationToken.None);
- Assert.Contains("[stdout truncated", result);
+ Assert.Contains("Exit code: 0", result);
+ Assert.Contains(new string('x', 200), result); // full output, not yet windowed/spilled
+ Assert.DoesNotContain("saved to", result); // ShellTool itself does not spill
}
[Fact]
@@ -367,111 +368,6 @@ public void TruncateOutput_truncates_and_appends_indicator()
Assert.EndsWith("[output truncated]", result);
}
- // ── BoundedDrainAsync ──
-
- [Fact]
- public async Task BoundedDrain_short_output_returned_verbatim()
- {
- var input = "hello world";
- var reader = new StringReader(input);
- var (text, truncated) = await ShellTool.BoundedDrainAsync(reader, 100);
- Assert.Equal(input, text);
- Assert.False(truncated);
- }
-
- [Fact]
- public async Task BoundedDrain_empty_input_returns_empty()
- {
- var reader = new StringReader("");
- var (text, truncated) = await ShellTool.BoundedDrainAsync(reader, 100);
- Assert.Equal("", text);
- Assert.False(truncated);
- }
-
- [Fact]
- public async Task BoundedDrain_output_exactly_at_cap_not_truncated()
- {
- var input = new string('a', 100);
- var reader = new StringReader(input);
- var (text, truncated) = await ShellTool.BoundedDrainAsync(reader, 100);
- Assert.Equal(input, text);
- Assert.False(truncated);
- }
-
- [Fact]
- public async Task BoundedDrain_long_output_truncated_with_head_and_tail()
- {
- // 100-char head marker + separator + 100-char tail marker, with filler in the middle
- var head = new string('H', 100);
- var middle = new string('M', 5000);
- var tail = new string('T', 100);
- var input = head + middle + tail;
-
- var (text, truncated) = await ShellTool.BoundedDrainAsync(new StringReader(input), 200);
-
- Assert.True(truncated);
- Assert.StartsWith(new string('H', 100), text); // head preserved
- Assert.EndsWith(new string('T', 100), text); // tail preserved
- Assert.Contains("...", text); // separator present
- Assert.DoesNotContain("M", text); // middle discarded
- }
-
- [Fact]
- public async Task BoundedDrain_head_and_tail_split_evenly()
- {
- // maxChars=10 → headCap=5, tailCap=5
- var input = "AAAAAXXXXXXBBBBB"; // 16 chars: 5 head, 6 overflow discard, 5 tail
- var (text, truncated) = await ShellTool.BoundedDrainAsync(new StringReader(input), 10);
-
- Assert.True(truncated);
- Assert.StartsWith("AAAAA", text);
- Assert.EndsWith("BBBBB", text);
- }
-
- [Fact]
- public async Task BoundedDrain_disabled_cap_returns_full_output()
- {
- var input = new string('x', 10_000);
- var (text, truncated) = await ShellTool.BoundedDrainAsync(new StringReader(input), 0);
- Assert.Equal(input, text);
- Assert.False(truncated);
- }
-
- [Fact]
- public async Task BoundedDrain_tail_ring_wraps_across_small_chunks()
- {
- // Drives the ring's wraparound + start-advance path that the StringReader
- // tests skip: each read delivers a chunk smaller than tailCap, so the tail
- // window is rebuilt incrementally and must wrap rather than reset wholesale.
- // maxChars=10 → headCap=5 ("ABCDE"), tailCap=5; last 5 of "FGHIJKLMNO" = "KLMNO".
- var reader = new ChunkedReader("ABCDEFGHIJKLMNO", chunkSize: 3);
-
- var (text, truncated) = await ShellTool.BoundedDrainAsync(reader, 10);
-
- Assert.True(truncated);
- Assert.Equal("ABCDE\n...\nKLMNO", text);
- }
-
- // Hands out at most chunkSize chars per read so tests can exercise the tail
- // ring's incremental wrap path — real pipe reads arrive in arbitrary slices,
- // not the single 4KB gulp a StringReader gives.
- private sealed class ChunkedReader(string data, int chunkSize) : TextReader
- {
- private int _pos;
-
- public override ValueTask ReadAsync(Memory buffer, CancellationToken cancellationToken = default)
- {
- var remaining = data.Length - _pos;
- if (remaining <= 0)
- return ValueTask.FromResult(0);
-
- var n = Math.Min(Math.Min(chunkSize, buffer.Length), remaining);
- data.AsSpan(_pos, n).CopyTo(buffer.Span);
- _pos += n;
- return ValueTask.FromResult(n);
- }
- }
-
[Fact]
public async Task Command_referencing_denied_path_returns_access_denied()
{
@@ -487,16 +383,6 @@ public async Task Command_referencing_denied_path_returns_access_denied()
Assert.Contains("Access denied", result);
}
- [Fact]
- public async Task Execute_redacts_secret_like_output()
- {
- var args = ToolInput.Create("Command", "echo API_KEY=secret123");
-
- var result = await _tool.ExecuteAsync(args, CancellationToken.None);
-
- Assert.Contains("API_KEY=***REDACTED***", result);
- Assert.DoesNotContain("secret123", result);
- }
[Fact]
public async Task High_risk_glob_on_netclaw_config_is_blocked()
diff --git a/src/Netclaw.Actors.Tests/Tools/ToolOutputSpillTests.cs b/src/Netclaw.Actors.Tests/Tools/ToolOutputSpillTests.cs
new file mode 100644
index 000000000..f1e6cc185
--- /dev/null
+++ b/src/Netclaw.Actors.Tests/Tools/ToolOutputSpillTests.cs
@@ -0,0 +1,101 @@
+// -----------------------------------------------------------------------
+//
+// Copyright (C) 2026 - 2026 Petabridge, LLC
+//
+// -----------------------------------------------------------------------
+using Netclaw.Actors.Tools;
+using Netclaw.Configuration;
+using Netclaw.Tools;
+using Xunit;
+
+namespace Netclaw.Actors.Tests.Tools;
+
+public sealed class ToolOutputSpillTests : IDisposable
+{
+ private readonly string _sessionDir =
+ Path.Combine(Path.GetTempPath(), "nc-spill-" + Guid.NewGuid().ToString("N"));
+
+ public ToolOutputSpillTests() => Directory.CreateDirectory(_sessionDir);
+
+ public void Dispose()
+ {
+ if (Directory.Exists(_sessionDir))
+ Directory.Delete(_sessionDir, recursive: true);
+ }
+
+ private ToolExecutionContext Context() =>
+ new("session/thread", _sessionDir) { Audience = TrustAudience.Personal };
+
+ private string ToolCallsDir => Path.Combine(_sessionDir, "tool-calls");
+
+ // BoundAndSpillAsync receives an ALREADY-redacted result (the dispatcher redacts
+ // first), so these tests pass content verbatim and assert the bound/spill shape.
+
+ [Fact]
+ public async Task Under_budget_returned_unchanged()
+ {
+ var input = new string('a', 50);
+ var result = await ToolOutputSpill.BoundAndSpillAsync(
+ input, "call_1", budget: 100, Context(), CancellationToken.None);
+
+ Assert.Equal(input, result);
+ Assert.False(Directory.Exists(ToolCallsDir)); // nothing spilled
+ }
+
+ [Fact]
+ public async Task Over_budget_spills_full_output_and_steers()
+ {
+ var input = new string('H', 200) + new string('T', 200); // 400 > 100
+ var result = await ToolOutputSpill.BoundAndSpillAsync(
+ input, "call_2", budget: 100, Context(), CancellationToken.None);
+
+ var spillPath = Path.Combine(ToolCallsDir, "call_2.log");
+ Assert.True(File.Exists(spillPath));
+ Assert.Equal(input, await File.ReadAllTextAsync(spillPath, CancellationToken.None)); // full output on disk
+ Assert.StartsWith(new string('H', 50), result); // inline head
+ Assert.Contains("output saved to", result);
+ Assert.Contains(spillPath, result);
+ Assert.Contains("file_read", result);
+ Assert.Contains("grep", result);
+ }
+
+ [Fact]
+ public async Task Budget_zero_falls_back_to_content_default()
+ {
+ // budget 0 → DefaultContentBudget (12000); a 5000-char input fits, returned whole.
+ var input = new string('x', 5000);
+ var result = await ToolOutputSpill.BoundAndSpillAsync(
+ input, "call_3", budget: 0, Context(), CancellationToken.None);
+
+ Assert.Equal(input, result);
+ Assert.False(Directory.Exists(ToolCallsDir));
+ }
+
+ [Fact]
+ public async Task No_session_directory_degrades_to_inline_only()
+ {
+ var ctx = new ToolExecutionContext("session/thread", sessionDirectory: null)
+ {
+ Audience = TrustAudience.Personal,
+ };
+ var input = new string('H', 200) + new string('T', 200);
+
+ var result = await ToolOutputSpill.BoundAndSpillAsync(
+ input, "call_5", budget: 100, ctx, CancellationToken.None);
+
+ Assert.StartsWith(new string('H', 50), result); // inline still produced
+ Assert.DoesNotContain("saved to", result); // but no spill path
+ }
+
+ [Fact]
+ public async Task Unsafe_call_id_cannot_escape_tool_calls_directory()
+ {
+ var input = new string('H', 200) + new string('T', 200);
+ await ToolOutputSpill.BoundAndSpillAsync(
+ input, "../../evil", budget: 100, Context(), CancellationToken.None);
+
+ var written = Directory.GetFiles(ToolCallsDir);
+ Assert.Single(written);
+ Assert.StartsWith(ToolCallsDir, Path.GetFullPath(written[0]));
+ }
+}
diff --git a/src/Netclaw.Actors/Jobs/BackgroundJobExecutionActor.cs b/src/Netclaw.Actors/Jobs/BackgroundJobExecutionActor.cs
index 411ab0cec..8da749d6c 100644
--- a/src/Netclaw.Actors/Jobs/BackgroundJobExecutionActor.cs
+++ b/src/Netclaw.Actors/Jobs/BackgroundJobExecutionActor.cs
@@ -7,6 +7,7 @@
using System.Text;
using Akka.Actor;
using Akka.Event;
+using Netclaw.Actors.Tools;
using Netclaw.Security;
namespace Netclaw.Actors.Jobs;
@@ -136,11 +137,18 @@ private void SpawnProcess()
var outputBuilder = new StringBuilder();
try
{
- var stdoutTask = process.StandardOutput.ReadToEndAsync();
- var stderrTask = process.StandardError.ReadToEndAsync();
-
- var stdout = await stdoutTask;
- var stderr = await stderrTask;
+ // Bounded capture (not ReadToEndAsync): a chatty long-running job —
+ // exactly what background jobs exist for — must not buffer its full
+ // output in memory. Drain each stream to the capture ceiling (head+tail
+ // for floods larger than it); the source keeps draining past the cap so
+ // the child never deadlocks on a full pipe. Closes #1300.
+ var stdoutTask = BoundedOutputReader.DrainToWindowAsync(
+ process.StandardOutput, BackgroundJobManagerActor.MaxCapturedOutputChars, CancellationToken.None);
+ var stderrTask = BoundedOutputReader.DrainToWindowAsync(
+ process.StandardError, BackgroundJobManagerActor.MaxCapturedOutputChars, CancellationToken.None);
+
+ var (stdout, stdoutTruncated) = await stdoutTask;
+ var (stderr, stderrTruncated) = await stderrTask;
await process.WaitForExitAsync();
outputBuilder.Append(stdout);
@@ -151,6 +159,12 @@ private void SpawnProcess()
outputBuilder.Append(stderr);
}
+ // Mark a flood that exceeded the capture ceiling so the log isn't a
+ // silent head+tail splice presented as complete.
+ if (stdoutTruncated || stderrTruncated)
+ outputBuilder.Append(
+ $"\n[output exceeded the {BackgroundJobManagerActor.MaxCapturedOutputChars}-char capture ceiling — head and tail shown]");
+
var fullOutput = SecretOutputRedactor.Redact(outputBuilder.ToString());
try
diff --git a/src/Netclaw.Actors/Jobs/BackgroundJobManagerActor.cs b/src/Netclaw.Actors/Jobs/BackgroundJobManagerActor.cs
index bddadbcc2..35f3457fd 100644
--- a/src/Netclaw.Actors/Jobs/BackgroundJobManagerActor.cs
+++ b/src/Netclaw.Actors/Jobs/BackgroundJobManagerActor.cs
@@ -22,6 +22,13 @@ public sealed class BackgroundJobManagerActor : ReceiveActor
{
internal const int MaxConcurrentJobs = 5;
internal const int MaxOutputTailChars = 2000;
+
+ // Capture ceiling for a job's output log: the execution actor drains each
+ // stream to this bound (head+tail) so a chatty long-running job can't buffer
+ // its full output in memory and OOM the daemon. The log holds a head+tail view
+ // for floods larger than the ceiling; the message still carries the last
+ // MaxOutputTailChars.
+ internal const int MaxCapturedOutputChars = 256_000;
internal const string JobDeliveryKeyPrefix = "bg-job:";
internal const string SystemSenderId = "background-job-system";
internal const string SourceKind = "background-job";
diff --git a/src/Netclaw.Actors/Sessions/Pipelines/SessionToolExecutionPipeline.cs b/src/Netclaw.Actors/Sessions/Pipelines/SessionToolExecutionPipeline.cs
index 10ca565db..487726865 100644
--- a/src/Netclaw.Actors/Sessions/Pipelines/SessionToolExecutionPipeline.cs
+++ b/src/Netclaw.Actors/Sessions/Pipelines/SessionToolExecutionPipeline.cs
@@ -225,7 +225,8 @@ public static async Task ExecuteSingleToolAsync(
spawnChildActor,
projectDirectory,
turnContext,
- modelInputModalities);
+ modelInputModalities,
+ maxInlineToolResultChars);
context.RequestedTimeoutSeconds = (int)timeout.TotalSeconds;
// Re-drive of an ApprovedOnce approval: the user already clicked
@@ -343,7 +344,7 @@ public static async Task ExecuteSingleToolAsync(
var deniedMessage = new SerializableChatMessage
{
Role = Protocol.ChatRole.Tool,
- Content = ClampToolResult(resultText, maxInlineToolResultChars),
+ Content = resultText,
ToolCallId = new ToolCallId(tc.CallId),
Name = tc.Name
};
@@ -409,7 +410,7 @@ public static async Task ExecuteSingleToolAsync(
return new ToolCallResult(new SerializableChatMessage
{
Role = Protocol.ChatRole.Tool,
- Content = ClampToolResult(resultText, maxInlineToolResultChars),
+ Content = resultText,
ToolCallId = new ToolCallId(tc.CallId),
Name = tc.Name
}, [], context.FileAttachments, completedRuns, acceptedFindings);
@@ -563,7 +564,9 @@ or ApprovalDecision.ApprovedAlways
modelInputBudget ??= new ModelInputBatchBudget(MaxModelInputBatchBytes);
var modelInputMaterialization = MaterializeModelInputFiles(context, sessionDir, logger, modelInputBudget);
- resultText = ClampToolResult(resultText, maxInlineToolResultChars);
+ // No inline clamp here: DispatchingToolExecutor already bounds every tool
+ // result to the inline budget N (and spills the overflow). Clamping again
+ // would re-window the already-windowed+steered result.
if (modelInputMaterialization.RequestedCount > modelInputMaterialization.MediaReferences.Count)
resultText = AppendModelInputHandoffWarning(
resultText,
@@ -928,7 +931,8 @@ private static ToolExecutionContext BuildToolExecutionContext(
Func