netclaw-dev · Aaronontheweb · Jun 3, 2026 · Jun 3, 2026 · Jun 3, 2026 · Jun 3, 2026
diff --git a/benchmarks/Netclaw.Benchmarks/CaptureBenchmarks.cs b/benchmarks/Netclaw.Benchmarks/CaptureBenchmarks.cs
@@ -0,0 +1,36 @@
+// -----------------------------------------------------------------------
+// <copyright file="CaptureBenchmarks.cs" company="Petabridge, LLC">
+//      Copyright (C) 2026 - 2026 Petabridge, LLC <https://petabridge.com>
+// </copyright>
+// -----------------------------------------------------------------------
+
+using BenchmarkDotNet.Attributes;
+using Netclaw.Actors.Tools;
+
+namespace Netclaw.Benchmarks;
+
+/// <summary>
+/// Benchmarks the two-stage capture path shared by shell/background_job/file_read:
+/// first drain a reader up to the capture ceiling, then cut the small inline
+/// window from the captured text (the step <c>ToolOutputSpill</c> performs before
+/// redaction). The goal is to show allocation scales with the capture ceiling,
+/// not with total output, whether the source emits 1K chars or 50M.
+/// </summary>
+[MemoryDiagnoser]
+public class CaptureBenchmarks
+{
+    private const int CaptureMax = 256_000;  // capture ceiling (ToolConfig.MaxOutputChars default)
+    private const int InlineBudget = 2_000;  // inline window (SessionTuning.MaxInlineToolResultChars default)
+
+    [Params(1_000, 256_000, 50_000_000)]
+    public int TotalChars;
+
+    [Benchmark]
+    public async Task<int> Capture_then_inline_window()
+    {
+        // Stage 1: drain the synthetic source, bounded by CaptureMax.
+        var (bounded, _) = await BoundedOutputReader.DrainToWindowAsync(new SyntheticCharReader(TotalChars), CaptureMax, CancellationToken.None);
+        // Stage 2: derive the inline window; its length is the benchmark's return value.
+        return BoundedOutputReader.Window(bounded, InlineBudget).Length;
+    }
+}
diff --git a/benchmarks/Netclaw.Benchmarks/ShellDrainBenchmarks.cs b/benchmarks/Netclaw.Benchmarks/ShellDrainBenchmarks.cs
@@ -15,8 +15,8 @@ namespace Netclaw.Benchmarks;
 /// turning a child's stdout stream into a bounded, redaction-ready string.
 ///
 /// The comparison is the regression fix for #1293:
-///   <see cref="ShellTool.BoundedDrainAsync"/> (head+tail ring buffer, capped at
-///   read time) versus the old path — <see cref="System.IO.TextReader.ReadToEndAsync()"/>
+///   <see cref="BoundedOutputReader.DrainToWindowAsync"/> (head+tail ring buffer,
+///   capped at read time) versus the old path — <see cref="System.IO.TextReader.ReadToEndAsync()"/>
 ///   followed by <see cref="ShellTool.TruncateOutput"/>, which materialised the
 ///   entire output before applying the cap. The story we want the numbers to tell
 ///   is allocation shape: the old path is O(total output) and lands on the LOH for
@@ -53,7 +53,7 @@ public async Task<int> ReadToEnd_ThenTruncate()
     public async Task<int> BoundedDrain()
     {
         var reader = new SyntheticCharReader(TotalChars);
-        var (text, _) = await ShellTool.BoundedDrainAsync(reader, Cap);
+        var (text, _) = await BoundedOutputReader.DrainToWindowAsync(reader, Cap, CancellationToken.None);
         return text.Length;
     }
 }
diff --git a/evals/README.md b/evals/README.md
@@ -68,7 +68,7 @@ log patterns** (skill loading, memory recall, checkpoint formation).
 | Grounding & Alignment | 3 | Uses tools to verify facts, admits uncertainty |
 | Autonomy & Execution | 2 | Executes tasks rather than describing them |
 | Subagents | 1 | Delegates through `spawn_agent` and verifies headless subagents complete ambiguous work without clarification loops |
-| Complex Task Execution | 3 | Multi-step tool chains complete successfully |
+| Complex Task Execution | 5 | Multi-step tool chains complete successfully, including bounded tool output: given only the goal (no handling hints), the agent must retrieve a deep line from oversized shell output and from a large file, which is only possible by handling the bound the way AGENTS.md, the skills, and the steer text direct |
 | Multi-Turn Conversation | 7 | Session resume and speaker attribution recall |
 
 Each case defines multiple natural phrasings of the same intent. Each

diff --git a/evals/run-evals.sh b/evals/run-evals.sh
@@ -358,6 +358,19 @@ start_eval_daemon() {
         cp -r "$REPO_ROOT/evals/fixtures/agents/." "$EVAL_HOME/data/agents/"
     fi
 
+    # Pre-seed a large (>256 KB) text file in the workspaces read-root for the
+    # bounded-tool-output file_read eval (complex_large_file_read_ranged). It must
+    # be too big for one inline read AND have model-unguessable content so the only
+    # way to answer "what's on line 5000" is to page it with file_read StartLine/Limit
+    # — the behavior the bounded-output steer is meant to elicit. A deterministic
+    # Lehmer PRNG (pure integer modular arithmetic, identical across awk impls)
+    # makes line 5000 reproducible so the eval can assert the exact value. Lives
+    # under workspaces (a global read-root) rather than identity/skills so it is
+    # not pulled into the system prompt or scanned as a skill. 30000 lines ≈ 314 KB.
+    mkdir -p "$EVAL_HOME/data/workspaces"
+    awk 'BEGIN{x=1;for(i=1;i<=30000;i++){x=(x*48271)%2147483647;print x}}' \
+        > "$EVAL_HOME/data/workspaces/netclaw-eval-largefile.txt"
+
     # The eval container runs as the non-root `netclaw` user and needs write
     # access to the bind-mounted identity, logs, skills, and data trees.
     chmod -R ugo+rwX "$EVAL_HOME/identity" "$EVAL_HOME/logs" "$EVAL_HOME/data" "$EVAL_HOME/skills"
@@ -1056,6 +1069,45 @@ assert_complex_diagnose_self() {
     stdout_contains '\[tool:call\] shell_execute' && stdout_contains 'netclaw.*doctor'
 }
 
+# bounded-tool-output coverage (bound-tool-output-with-file-spill change).
+# These two cases assert on OUTCOME, not mechanism: the prompts state only the
+# goal and give the agent NO instructions about spilling, redirecting, re-running,
+# file_read, StartLine/Limit, or grep. How the agent handles oversized output must
+# come entirely from AGENTS.md, the netclaw-operations skill, and the steer text
+# in the tool result — coaching it in the prompt would be testing instruction-
+# following, not whether the real guidance surfaces work.
+#
+# The data is a deterministic Lehmer PRNG (pure integer modular arithmetic,
+# identical across awk implementations and the host that computed the expected
+# values), so the value at a deep line is reproducible AND un-fabricatable by the
+# model. Because the tool bounds any single read to N (~2000) inline chars, the
+# deep-line value is unreachable from one read — so a correct answer can ONLY
+# come from the agent paging/reading the oversized output the way the steer asks.
+# Outcome therefore implies correct handling; no mechanism assertion is needed.
+
+# Large SHELL output: ~210 KB on stdout exceeds N, so the daemon spills it and
+# steers. Line 200 (value 872671849) sits past the inline window; reporting it
+# proves the agent retrieved it from the bounded/spilled output unaided.
+assert_complex_large_shell_output_spill() {
+    local line_200_value='872671849'  # expected Lehmer value on line 200
+    stdout_contains '\[tool:call\] shell_execute' && stdout_response_contains "$line_200_value"
+}
+
+# Large FILE: a pre-seeded ~314 KB file (>256 KB, so file_read returns a bounded
+# sample + steer). The prompt asks for a small line WINDOW around 5000 rather than
+# exactly line 5000: the model pages correctly with file_read StartLine/Limit (the
+# behavior under test, every run) but can misindex the line by ±1 (the original
+# run treated the param's former name "Offset" as a 0-based skip-count — the bug
+# that motivated renaming it to the 1-based StartLine). A window makes line 5000
+# (value 1629331733) fall inside the returned slice regardless of any ±1 indexing,
+# so the case measures bounded-output paging, not exact index arithmetic. Line 5000
+# is ~52 KB in, well past the inline window, so reporting its value still proves
+# the agent paged rather than dumped.
+assert_complex_large_file_read_ranged() {
+    local line_5000_value='1629331733'  # expected Lehmer value on line 5000
+    stdout_contains '\[tool:call\] file_read' && stdout_response_contains "$line_5000_value"
+}
+
 # Category 8: Multi-Turn Conversation (tests session-resume + KV cache behavior)
 # All assertions run against the concatenated stdout of every turn in the case.
 
@@ -1466,6 +1518,25 @@ run_all() {
     run_case complex_diagnose_self "shell_execute with netclaw doctor" \
         "Run netclaw doctor and summarize any problems"
 
+    # bounded-tool-output: oversized SHELL output. The prompt states only the
+    # goal — run a command and report a deep line of its output. How to cope with
+    # the output being too large to return inline (read the spill the steer hands
+    # back, rather than re-running) must come from the agent's own guidance, not
+    # this prompt. The number is a deterministic-but-opaque Lehmer PRNG value; the
+    # assertion checks the agent reports the correct line-200 value (872671849).
+    run_case complex_large_shell_output_spill "retrieves a deep line from oversized shell output unaided" \
+        "Run this command with shell_execute and tell me the number it prints on line 200: awk 'BEGIN{x=1;for(i=1;i<=20000;i++){x=(x*48271)%2147483647;print x}}'" \
+        "Using shell_execute, run: awk 'BEGIN{x=1;for(i=1;i<=20000;i++){x=(x*48271)%2147483647;print x}}' — then tell me which number is printed on the 200th line of its output."
+
+    # bounded-tool-output: oversized FILE. The prompt states only the goal — list
+    # a small window of lines (4997-5003) deep in a named large file. How to cope
+    # with it being too large for one read (page it with file_read StartLine/Limit,
+    # per the steer) must come from the agent's own guidance. The file is pre-seeded
+    # in start_eval_daemon; the assertion checks for the line-5000 value (1629331733).
+    run_case complex_large_file_read_ranged "retrieves a deep line from a large file unaided" \
+        "List the numbers on lines 4997 through 5003 of the file /home/netclaw/.netclaw/workspaces/netclaw-eval-largefile.txt" \
+        "Read lines 4997 to 5003 of /home/netclaw/.netclaw/workspaces/netclaw-eval-largefile.txt and list the numbers on those lines."
+
     end_category
 
     # ── Category 8: Multi-Turn Conversation ──

diff --git a/feeds/skills/.system/files/netclaw-operations/SKILL.md b/feeds/skills/.system/files/netclaw-operations/SKILL.md
@@ -3,7 +3,7 @@ name: netclaw-operations
 description: "REQUIRED when the user asks about scheduling, reminders, cron jobs, timers, background jobs, diagnostics, troubleshooting, MCP tools, daemon health, identity updates, or Netclaw capabilities and self-maintenance."
 metadata:
   author: netclaw
-  version: "2.8.9"
+  version: "2.9.0"
 ---
 
 # Netclaw Operations
@@ -218,7 +218,8 @@ Rules:
 - The user must approve the command before it starts running in the background.
 - Maximum 5 concurrent background jobs; overflow queues FIFO.
 - Job definitions persist to `~/.netclaw/jobs/{id}.json`.
-- Output captured to `~/.netclaw/jobs/{id}/output.log`.
+- Output is captured (bounded; head+tail for very large output) to
+  `~/.netclaw/jobs/{id}/output.log` — `file_read`/`grep` it for detail beyond the tail.
 
 Monitoring tools:
 
@@ -235,6 +236,26 @@ results proactively when the job completes.
 Active background jobs appear in the `[active-background-jobs]` section of the
 session context on every turn.
 
+## Large tool output
+
+Tool output is bounded to a small inline budget
+(`Session.Tuning.MaxInlineToolResultChars`, default 2000 chars) so it never floods
+the context window. When a tool's output exceeds that budget you get a head+tail
+view inline plus a pointer to the full output — not the whole thing:
+
+- **`shell_execute`** spills the full (redacted) output to
+  `{session}/tool-calls/{toolCallId}.log` and gives you the path. Read a slice with
+  `file_read` (`StartLine`/`Limit`) or `grep` it — do NOT re-run the command to see more.
+- **`file_read`** on a large file returns the head and steers you to read a
+  specific range with `StartLine`/`Limit` or `grep` (`StartLine` is a 1-based line
+  number — line 1 is the first line). Don't `cat` a huge file through
+  `shell_execute` to get around it — that just spills again.
+- **`background_job`** output goes to `~/.netclaw/jobs/{id}/output.log` (bounded);
+  `check_background_job` returns a tail, and you can `file_read`/`grep` the log for the rest.
+
+Reading a targeted range or grepping is always cheaper than re-running a command or
+re-reading a whole file. Secret-bearing values are redacted from all tool output.
+
 ## Tool Discovery
 
 MCP tools are not loaded by default. Use `search_tools` to discover them:

diff --git a/openspec/changes/bound-tool-output-with-file-spill/.openspec.yaml b/openspec/changes/bound-tool-output-with-file-spill/.openspec.yaml
@@ -0,0 +1,2 @@
+schema: spec-driven
+created: 2026-06-03