lobu-ai · buremba · May 19, 2026 · May 19, 2026 · May 19, 2026 · May 19, 2026
diff --git a/examples/personal-finance/agents/personal-finance/evals/promptfooconfig.yaml b/examples/personal-finance/agents/personal-finance/evals/promptfooconfig.yaml
@@ -75,3 +75,35 @@ tests:
           Two dates five days apart fall in different UK fiscal years. The agent
           should not claim both belong to the same year.
         weight: 0.7
+
+  # ─── retrieval — agent grounds answer in saved memory ───────────────────
+  # Exercises the `tool_use` SSE event surfaced by the gateway: the provider
+  # populates `metadata.toolCalls` (every tool call observed in the turn) and
+  # `metadata.retrievedContext` (joined snippet text from search_memory). For
+  # personal-finance, a prompt that references the workspace's saved facts
+  # should trigger at least one `search_memory` call.
+  #
+  # The metadata is on the provider response (returned from `LobuProvider`).
+  # promptfoo surfaces it to JS assertions via `context.providerResponse`.
+
+  - description: retrieval — recent activity question triggers search_memory
+    vars:
+      query: "What dividends have I logged in this tax year so far?"
+    assert:
+      # The agent should consult its memory; even an empty workspace should
+      # produce a search_memory call before answering.
+      - type: javascript
+        value: |
+          const meta = context.providerResponse?.metadata ?? {};
+          const calls = Array.isArray(meta.toolCalls) ? meta.toolCalls : [];
+          return calls.some((c) =>
+            c && (c.name === 'search_memory' || c.name === 'lobu_search_memory')
+          );
+        weight: 0.5
+      - type: llm-rubric
+        value: |
+          Response either lists dividends grounded in workspace data, or
+          (if the workspace is empty) cleanly tells the user no dividends are
+          logged yet and offers to record one. Does not invent dividend
+          amounts or sources.
+        weight: 0.5
diff --git a/packages/agent-worker/src/__tests__/tool-use-events.test.ts b/packages/agent-worker/src/__tests__/tool-use-events.test.ts
@@ -0,0 +1,108 @@
+import { describe, expect, test } from "bun:test";
+import { buildToolUseEventPayload } from "../openclaw/tool-use-events";
+
+// pi-agent's `tool_execution_end` event omits `args` (those live on the
+// matching `tool_execution_start`); worker.ts re-attaches them from a Map
+// before calling buildToolUseEventPayload. Tests mirror that — pass args
+// explicitly to validate the merged shape.
+const baseEvent = {
+  toolCallId: "call_1",
+  toolName: "search_memory",
+  args: { query: "rent due dates" },
+  result: {
+    matches: [{ id: 1, name: "Acme Ltd" }],
+    content: [
+      {
+        id: 42,
+        title: "Rent",
+        text_content: "Tenant Acme pays £1,200 on the 1st of each month",
+        author_name: null,
+        source_url: null,
+        platform: "manual",
+        occurred_at: null,
+        entity_ids: [1],
+      },
+      {
+        id: 43,
+        title: "Lease renewal",
+        text_content: "Lease renewed through Dec 2027",
+        author_name: null,
+        source_url: null,
+        platform: "manual",
+        occurred_at: null,
+        entity_ids: [1],
+      },
+    ],
+    metadata: { total_matches: 1, page_size: 5 },
+  },
+  isError: false,
+};
+
+describe("buildToolUseEventPayload", () => {
+  test("includes name, input, toolCallId for any tool", () => {
+    const payload = buildToolUseEventPayload({
+      toolCallId: "abc",
+      toolName: "bash",
+      args: { command: "ls" },
+      result: "ok",
+      isError: false,
+    });
+    expect(payload.name).toBe("bash");
+    expect(payload.toolCallId).toBe("abc");
+    expect(payload.input).toEqual({ command: "ls" });
+    expect(payload.isError).toBe(false);
+    // Non-retrieval tools get no result_summary unless they error.
+    expect(payload.result_summary).toBeUndefined();
+  });
+
+  test("extracts event_ids + snippets for search_memory", () => {
+    const payload = buildToolUseEventPayload(baseEvent);
+    expect(payload.result_summary?.event_ids).toEqual([42, 43]);
+    expect(payload.result_summary?.snippets).toEqual([
+      {
+        id: 42,
+        text: "Tenant Acme pays £1,200 on the 1st of each month",
+      },
+      { id: 43, text: "Lease renewed through Dec 2027" },
+    ]);
+  });
+
+  test("treats lobu_search_memory the same as search_memory", () => {
+    const payload = buildToolUseEventPayload({
+      ...baseEvent,
+      toolName: "lobu_search_memory",
+    });
+    expect(payload.result_summary?.event_ids).toEqual([42, 43]);
+  });
+
+  test("handles MCP CallToolResult wrapping", () => {
+    const payload = buildToolUseEventPayload({
+      ...baseEvent,
+      result: {
+        content: [{ type: "text", text: JSON.stringify(baseEvent.result) }],
+        isError: false,
+      },
+    });
+    expect(payload.result_summary?.event_ids).toEqual([42, 43]);
+  });
+
+  test("returns no result_summary when search_memory has no content", () => {
+    const payload = buildToolUseEventPayload({
+      ...baseEvent,
+      result: { matches: [], metadata: { total_matches: 0, page_size: 0 } },
+    });
+    expect(payload.result_summary).toBeUndefined();
+  });
+
+  test("propagates error message when isError", () => {
+    const payload = buildToolUseEventPayload({
+      toolCallId: "x",
+      toolName: "search_memory",
+      args: {},
+      result: { message: "authentication required" },
+      isError: true,
+    });
+    expect(payload.isError).toBe(true);
+    expect(payload.result_summary?.error).toBe("authentication required");
+  });
+});
diff --git a/packages/agent-worker/src/openclaw/tool-use-events.ts b/packages/agent-worker/src/openclaw/tool-use-events.ts
@@ -0,0 +1,146 @@
+/**
+ * Helpers for surfacing tool-call traces from worker → gateway → SSE clients.
+ *
+ * Workers emit a `tool_use` custom event per `tool_execution_end` so any SSE
+ * subscriber (the @lobu/promptfoo-provider, the Mac menubar, the CLI eval) can
+ * inspect tool calls without having to scrape worker logs. The shape mirrors
+ * Anthropic's tool-use block (name + input) plus a `result_summary` field for
+ * structured, tool-specific metadata that's small enough to ship over SSE.
+ *
+ * For retrieval tools (`search_memory` / `lobu_search_memory`) we extract the
+ * matched event IDs and the snippet text content so promptfoo RAG assertions
+ * (`context-recall`, `context-faithfulness`, custom `javascript`) can join the
+ * agent's answer back to the retrieved evidence.
+ */
+
+export interface ToolUseEventPayload {
+  toolCallId: string;
+  name: string;
+  input: unknown;
+  isError: boolean;
+  result_summary?: ToolUseResultSummary;
+}
+
+export interface ToolUseResultSummary {
+  /** Event IDs the tool returned (search_memory etc). */
+  event_ids?: number[];
+  /** Inline snippet text keyed by event id — populated by retrieval tools so
+   * provider clients can compute `retrievedContext` without a round-trip. */
+  snippets?: Array<{ id: number; text: string }>;
+  /** Tools may also include a short error string. */
+  error?: string;
+}
+
+const SEARCH_MEMORY_TOOL_NAMES = new Set([
+  "search_memory",
+  "lobu_search_memory",
+]);
+
+/**
+ * Build the SSE payload for a `tool_execution_end` pi-agent event.
+ *
+ * Always returns a record — even on parse failure — so the SSE event reliably
+ * fires for every tool call. `result_summary` is best-effort: tool-specific
+ * shape parsing is wrapped in try/catch and a parse failure just leaves the
+ * summary off, never throws.
+ */
+export function buildToolUseEventPayload(event: {
+  toolCallId: string;
+  toolName: string;
+  args?: unknown;
+  result: unknown;
+  isError: boolean;
+}): ToolUseEventPayload {
+  const payload: ToolUseEventPayload = {
+    toolCallId: event.toolCallId,
+    name: event.toolName,
+    input: event.args ?? null,
+    isError: event.isError === true,
+  };
+
+  if (event.isError) {
+    const message = extractErrorMessage(event.result);
+    if (message) {
+      payload.result_summary = { error: message };
+    }
+    return payload;
+  }
+
+  if (SEARCH_MEMORY_TOOL_NAMES.has(event.toolName)) {
+    const summary = summarizeSearchMemoryResult(event.result);
+    if (summary) {
+      payload.result_summary = summary;
+    }
+  }
+
+  return payload;
+}
+
+function summarizeSearchMemoryResult(
+  raw: unknown
+): ToolUseResultSummary | null {
+  const result = extractSearchMemoryBody(raw);
+  if (!result || typeof result !== "object") return null;
+
+  const summary: ToolUseResultSummary = {};
+  const contentArr = (result as { content?: unknown }).content;
+  if (Array.isArray(contentArr) && contentArr.length > 0) {
+    const ids: number[] = [];
+    const snippets: Array<{ id: number; text: string }> = [];
+    for (const snippet of contentArr) {
+      if (!snippet || typeof snippet !== "object") continue;
+      const idVal = (snippet as { id?: unknown }).id;
+      const textVal = (snippet as { text_content?: unknown }).text_content;
+      if (typeof idVal !== "number") continue;
+      ids.push(idVal);
+      if (typeof textVal === "string" && textVal.length > 0) {
+        snippets.push({ id: idVal, text: textVal });
+      }
+    }
+    if (ids.length > 0) summary.event_ids = ids;
+    if (snippets.length > 0) summary.snippets = snippets;
+  }
+
+  return Object.keys(summary).length > 0 ? summary : null;
+}
+
+/**
+ * MCP tool results from the gateway proxy land here as
+ *   { content: [{ type: 'text', text: '<json>' }], isError: false }
+ * but in-process tools sometimes pass the raw object straight through. Handle
+ * both shapes.
+ */
+function extractSearchMemoryBody(raw: unknown): unknown {
+  if (!raw || typeof raw !== "object") return null;
+
+  // MCP CallToolResult shape — text content holds a JSON-stringified payload.
+  const mcpContent = (raw as { content?: unknown }).content;
+  if (Array.isArray(mcpContent)) {
+    for (const part of mcpContent) {
+      if (!part || typeof part !== "object") continue;
+      const type = (part as { type?: unknown }).type;
+      const text = (part as { text?: unknown }).text;
+      if (type === "text" && typeof text === "string") {
+        try {
+          return JSON.parse(text);
+        } catch {
+          // Plain text result — nothing to summarise.
+          return null;
+        }
+      }
+    }
+  }
+
+  // Already the search_memory body.
+  return raw;
+}
+
+function extractErrorMessage(result: unknown): string | null {
+  if (typeof result === "string") return result;
+  if (!result || typeof result !== "object") return null;
+  const msg = (result as { message?: unknown }).message;
+  if (typeof msg === "string") return msg;
+  const err = (result as { error?: unknown }).error;
+  if (typeof err === "string") return err;
+  return null;
+}
diff --git a/packages/agent-worker/src/openclaw/worker.ts b/packages/agent-worker/src/openclaw/worker.ts
@@ -68,6 +68,7 @@ import {
 } from "./plugin-loader";
 import { OpenClawProgressProcessor } from "./processor";
 import { getOpenClawSessionContext } from "./session-context";
+import { buildToolUseEventPayload } from "./tool-use-events";
 import {
   buildToolPolicy,
   enforceBashCommandPolicy,
@@ -1442,6 +1443,15 @@ Use it when the user references past discussions or you need context.`);
         }
       };
 
+      // Track tool-call input args across tool_execution_start → _end. pi-agent
+      // only includes `args` on the start event; the end event carries
+      // `toolCallId`, `toolName`, `result`, `isError`. The worker emits one
+      // SSE `tool_use` per finished call, so it needs to remember the input.
+      const pendingToolArgs = new Map<string, unknown>();
+      // Tool-use SSE emits are awaited at agent_end so the `complete` event
+      // can't race ahead of late tool_use events on slow networks.
+      const inFlightToolUse: Set<Promise<void>> = new Set();
+
       session.subscribe((event) => {
         if (suppressProgressOutput) {
           if (event.type === "agent_end") {
@@ -1459,9 +1469,54 @@ Use it when the user references past discussions or you need context.`);
           }
         }
 
+        // Capture the input args at tool start so we can attach them when the
+        // matching end event fires.
+        if (event.type === "tool_execution_start") {
+          pendingToolArgs.set(event.toolCallId, event.args);
+        }
+
+        // Surface tool-use traces to SSE clients (promptfoo provider, CLI eval,
+        // any client subscribed via `event: tool_use`). Worker emits one record
+        // per tool call at `tool_execution_end` so the result is included.
+        if (event.type === "tool_execution_end") {
+          const args = pendingToolArgs.get(event.toolCallId);
+          pendingToolArgs.delete(event.toolCallId);
+          const payload = buildToolUseEventPayload({
+            toolCallId: event.toolCallId,
+            toolName: event.toolName,
+            args,
+            result: event.result,
+            isError: event.isError,
+          });
+          const promise = onProgress({
+            type: "custom_event",
+            data: {
+              name: "tool_use",
+              payload: payload as unknown as Record<string, unknown>,
+            },
+            timestamp: Date.now(),
+          }).catch((err) => {
+            logger.warn(
+              `Failed to emit tool_use custom event for ${event.toolName}:`,
+              err
+            );
+          });
+          inFlightToolUse.add(promise);
+          promise.finally(() => inFlightToolUse.delete(promise));
+        }
+
         if (event.type === "agent_end") {
           flushDelta()
-            .then(() => resolveTurnDone?.())
+            .then(async () => {
+              // Wait for any pending tool_use emits so clients don't see
+              // `complete` arrive before all tool_use records (the provider
+              // returns on `complete`, and a slow tool_use POST mid-flight
+              // would otherwise be lost).
+              if (inFlightToolUse.size > 0) {
+                await Promise.allSettled(Array.from(inFlightToolUse));
+              }
+              resolveTurnDone?.();
+            })
             .catch((err) => {
               logger.error("Failed to flush final delta:", err);
               resolveTurnDone?.();