Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -75,3 +75,35 @@ tests:
Two dates five days apart fall in different UK fiscal years. The agent
should not claim both belong to the same year.
weight: 0.7

# ─── retrieval — agent grounds answer in saved memory ───────────────────
# Exercises the `tool_use` SSE event surfaced by the gateway: the provider
# populates `metadata.toolCalls` (every tool call observed in the turn) and
# `metadata.retrievedContext` (joined snippet text from search_memory). For
# personal-finance, a prompt that references the workspace's saved facts
# should trigger at least one `search_memory` call.
#
# The metadata is on the provider response (returned from `LobuProvider`).
# promptfoo surfaces it to JS assertions via `context.providerResponse`.

- description: retrieval — recent activity question triggers search_memory
vars:
query: "What dividends have I logged in this tax year so far?"
assert:
# The agent should consult its memory; even an empty workspace should
# produce a search_memory call before answering.
- type: javascript
value: |
const meta = context.providerResponse?.metadata ?? {};
const calls = Array.isArray(meta.toolCalls) ? meta.toolCalls : [];
return calls.some((c) =>
c && (c.name === 'search_memory' || c.name === 'lobu_search_memory')
);
weight: 0.5
- type: llm-rubric
value: |
Response either lists dividends grounded in workspace data, or
(if the workspace is empty) cleanly tells the user no dividends are
logged yet and offers to record one. Does not invent dividend
amounts or sources.
weight: 0.5
108 changes: 108 additions & 0 deletions packages/agent-worker/src/__tests__/tool-use-events.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
import { describe, expect, test } from "bun:test";
import { buildToolUseEventPayload } from "../openclaw/tool-use-events";

// pi-agent's `tool_execution_end` event omits `args` (those live on the
// matching `tool_execution_start`); worker.ts re-attaches them from a Map
// before calling buildToolUseEventPayload. Tests mirror that — pass args
// explicitly to validate the merged shape.
const baseEvent = {
toolCallId: "call_1",
toolName: "search_memory",
args: { query: "rent due dates" },
result: {
matches: [{ id: 1, name: "Acme Ltd" }],
content: [
{
id: 42,
title: "Rent",
text_content: "Tenant Acme pays £1,200 on the 1st of each month",
author_name: null,
source_url: null,
platform: "manual",
occurred_at: null,
entity_ids: [1],
},
{
id: 43,
title: "Lease renewal",
text_content: "Lease renewed through Dec 2027",
author_name: null,
source_url: null,
platform: "manual",
occurred_at: null,
entity_ids: [1],
},
],
metadata: { total_matches: 1, page_size: 5 },
},
isError: false,
};

describe("buildToolUseEventPayload", () => {
test("includes name, input, toolCallId for any tool", () => {
const payload = buildToolUseEventPayload({
toolCallId: "abc",
toolName: "bash",
args: { command: "ls" },
result: "ok",
isError: false,
});
expect(payload.name).toBe("bash");
expect(payload.toolCallId).toBe("abc");
expect(payload.input).toEqual({ command: "ls" });
expect(payload.isError).toBe(false);
// Non-retrieval tools get no result_summary unless they error.
expect(payload.result_summary).toBeUndefined();
});

test("extracts event_ids + snippets for search_memory", () => {
const payload = buildToolUseEventPayload(baseEvent);
expect(payload.result_summary?.event_ids).toEqual([42, 43]);
expect(payload.result_summary?.snippets).toEqual([
{
id: 42,
text: "Tenant Acme pays £1,200 on the 1st of each month",
},
{ id: 43, text: "Lease renewed through Dec 2027" },
]);
});

test("treats lobu_search_memory the same as search_memory", () => {
const payload = buildToolUseEventPayload({
...baseEvent,
toolName: "lobu_search_memory",
});
expect(payload.result_summary?.event_ids).toEqual([42, 43]);
});

test("handles MCP CallToolResult wrapping", () => {
const payload = buildToolUseEventPayload({
...baseEvent,
result: {
content: [{ type: "text", text: JSON.stringify(baseEvent.result) }],
isError: false,
},
});
expect(payload.result_summary?.event_ids).toEqual([42, 43]);
});

test("returns no result_summary when search_memory has no content", () => {
const payload = buildToolUseEventPayload({
...baseEvent,
result: { matches: [], metadata: { total_matches: 0, page_size: 0 } },
});
expect(payload.result_summary).toBeUndefined();
});

test("propagates error message when isError", () => {
const payload = buildToolUseEventPayload({
toolCallId: "x",
toolName: "search_memory",
args: {},
result: { message: "authentication required" },
isError: true,
});
expect(payload.isError).toBe(true);
expect(payload.result_summary?.error).toBe("authentication required");
});
});
146 changes: 146 additions & 0 deletions packages/agent-worker/src/openclaw/tool-use-events.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
/**
* Helpers for surfacing tool-call traces from worker → gateway → SSE clients.
*
* Workers emit a `tool_use` custom event per `tool_execution_end` so any SSE
* subscriber (the @lobu/promptfoo-provider, the Mac menubar, the CLI eval) can
* inspect tool calls without having to scrape worker logs. The shape mirrors
* Anthropic's tool-use block (name + input) plus a `result_summary` field for
* structured, tool-specific metadata that's small enough to ship over SSE.
*
* For retrieval tools (`search_memory` / `lobu_search_memory`) we extract the
* matched event IDs and the snippet text content so promptfoo RAG assertions
* (`context-recall`, `context-faithfulness`, custom `javascript`) can join the
* agent's answer back to the retrieved evidence.
*/

export interface ToolUseEventPayload {
toolCallId: string;
name: string;
input: unknown;
isError: boolean;
result_summary?: ToolUseResultSummary;
}

export interface ToolUseResultSummary {
/** Event IDs the tool returned (search_memory etc). */
event_ids?: number[];
/** Inline snippet text keyed by event id — populated by retrieval tools so
* provider clients can compute `retrievedContext` without a round-trip. */
snippets?: Array<{ id: number; text: string }>;
/** Tools may also include a short error string. */
error?: string;
}

const SEARCH_MEMORY_TOOL_NAMES = new Set([
"search_memory",
"lobu_search_memory",
]);

/**
* Build the SSE payload for a `tool_execution_end` pi-agent event.
*
* Always returns a record — even on parse failure — so the SSE event reliably
* fires for every tool call. `result_summary` is best-effort: tool-specific
* shape parsing is wrapped in try/catch and a parse failure just leaves the
* summary off, never throws.
*/
export function buildToolUseEventPayload(event: {
toolCallId: string;
toolName: string;
args?: unknown;
result: unknown;
isError: boolean;
}): ToolUseEventPayload {
const payload: ToolUseEventPayload = {
toolCallId: event.toolCallId,
name: event.toolName,
input: event.args ?? null,
isError: event.isError === true,
};

if (event.isError) {
const message = extractErrorMessage(event.result);
if (message) {
payload.result_summary = { error: message };
}
return payload;
}

if (SEARCH_MEMORY_TOOL_NAMES.has(event.toolName)) {
const summary = summarizeSearchMemoryResult(event.result);
if (summary) {
payload.result_summary = summary;
}
}

return payload;
}

function summarizeSearchMemoryResult(
raw: unknown
): ToolUseResultSummary | null {
const result = extractSearchMemoryBody(raw);
if (!result || typeof result !== "object") return null;

const summary: ToolUseResultSummary = {};
const contentArr = (result as { content?: unknown }).content;
if (Array.isArray(contentArr) && contentArr.length > 0) {
const ids: number[] = [];
const snippets: Array<{ id: number; text: string }> = [];
for (const snippet of contentArr) {
if (!snippet || typeof snippet !== "object") continue;
const idVal = (snippet as { id?: unknown }).id;
const textVal = (snippet as { text_content?: unknown }).text_content;
if (typeof idVal !== "number") continue;
ids.push(idVal);
if (typeof textVal === "string" && textVal.length > 0) {
snippets.push({ id: idVal, text: textVal });
}
}
if (ids.length > 0) summary.event_ids = ids;
if (snippets.length > 0) summary.snippets = snippets;
}

return Object.keys(summary).length > 0 ? summary : null;
}

/**
* MCP tool results from the gateway proxy land here as
* { content: [{ type: 'text', text: '<json>' }], isError: false }
* but in-process tools sometimes pass the raw object straight through. Handle
* both shapes.
*/
function extractSearchMemoryBody(raw: unknown): unknown {
if (!raw || typeof raw !== "object") return null;

// MCP CallToolResult shape — text content holds a JSON-stringified payload.
const mcpContent = (raw as { content?: unknown }).content;
if (Array.isArray(mcpContent)) {
for (const part of mcpContent) {
if (!part || typeof part !== "object") continue;
const type = (part as { type?: unknown }).type;
const text = (part as { text?: unknown }).text;
if (type === "text" && typeof text === "string") {
try {
return JSON.parse(text);
} catch {
// Plain text result — nothing to summarise.
return null;
}
Comment on lines +123 to +129
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor | ⚡ Quick win

Don’t stop scanning MCP content after the first unparsable text block.

At Lines 126-128, returning null inside catch exits early and can miss a later valid JSON content part, which drops result_summary unexpectedly.

Proposed fix
 function extractSearchMemoryBody(raw: unknown): unknown {
   if (!raw || typeof raw !== "object") return null;
@@
   const mcpContent = (raw as { content?: unknown }).content;
   if (Array.isArray(mcpContent)) {
     for (const part of mcpContent) {
       if (!part || typeof part !== "object") continue;
       const type = (part as { type?: unknown }).type;
       const text = (part as { text?: unknown }).text;
       if (type === "text" && typeof text === "string") {
         try {
           return JSON.parse(text);
         } catch {
-          // Plain text result — nothing to summarise.
-          return null;
+          // Not JSON; keep scanning other parts.
+          continue;
         }
       }
     }
+    return null;
   }
 
   // Already the search_memory body.
   return raw;
 }
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@packages/agent-worker/src/openclaw/tool-use-events.ts` around lines 123 -
129, The current handler in openclaw/tool-use-events.ts returns null inside the
catch when JSON.parse(text) fails, which aborts scanning and can miss later
valid JSON content (dropping result_summary); change the logic in the block that
checks if (type === "text" && typeof text === "string") so that on JSON.parse
failure you do not return null but instead skip this text entry and continue
scanning subsequent parts (e.g., use continue or otherwise ignore this entry),
allowing later valid JSON content to be parsed and result_summary to be set;
ensure any eventual fallback still returns null only after all parts are
examined.

}
}
}

// Already the search_memory body.
return raw;
}

function extractErrorMessage(result: unknown): string | null {
if (typeof result === "string") return result;
if (!result || typeof result !== "object") return null;
const msg = (result as { message?: unknown }).message;
if (typeof msg === "string") return msg;
const err = (result as { error?: unknown }).error;
if (typeof err === "string") return err;
return null;
}
57 changes: 56 additions & 1 deletion packages/agent-worker/src/openclaw/worker.ts
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ import {
} from "./plugin-loader";
import { OpenClawProgressProcessor } from "./processor";
import { getOpenClawSessionContext } from "./session-context";
import { buildToolUseEventPayload } from "./tool-use-events";
import {
buildToolPolicy,
enforceBashCommandPolicy,
Expand Down Expand Up @@ -1442,6 +1443,15 @@ Use it when the user references past discussions or you need context.`);
}
};

// Track tool-call input args across tool_execution_start → _end. pi-agent
// only includes `args` on the start event; the end event carries
// `toolCallId`, `toolName`, `result`, `isError`. The worker emits one
// SSE `tool_use` per finished call, so it needs to remember the input.
const pendingToolArgs = new Map<string, unknown>();
// Tool-use SSE emits are awaited at agent_end so the `complete` event
// can't race ahead of late tool_use events on slow networks.
const inFlightToolUse: Set<Promise<void>> = new Set();

session.subscribe((event) => {
if (suppressProgressOutput) {
if (event.type === "agent_end") {
Expand All @@ -1459,9 +1469,54 @@ Use it when the user references past discussions or you need context.`);
}
}

// Capture the input args at tool start so we can attach them when the
// matching end event fires.
if (event.type === "tool_execution_start") {
pendingToolArgs.set(event.toolCallId, event.args);
}

// Surface tool-use traces to SSE clients (promptfoo provider, CLI eval,
// any client subscribed via `event: tool_use`). Worker emits one record
// per tool call at `tool_execution_end` so the result is included.
if (event.type === "tool_execution_end") {
const args = pendingToolArgs.get(event.toolCallId);
pendingToolArgs.delete(event.toolCallId);
const payload = buildToolUseEventPayload({
toolCallId: event.toolCallId,
toolName: event.toolName,
args,
result: event.result,
isError: event.isError,
});
const promise = onProgress({
type: "custom_event",
data: {
name: "tool_use",
payload: payload as unknown as Record<string, unknown>,
},
timestamp: Date.now(),
}).catch((err) => {
logger.warn(
`Failed to emit tool_use custom event for ${event.toolName}:`,
err
);
});
inFlightToolUse.add(promise);
promise.finally(() => inFlightToolUse.delete(promise));
}

if (event.type === "agent_end") {
flushDelta()
.then(() => resolveTurnDone?.())
.then(async () => {
// Wait for any pending tool_use emits so clients don't see
// `complete` arrive before all tool_use records (the provider
// returns on `complete`, and a slow tool_use POST mid-flight
// would otherwise be lost).
if (inFlightToolUse.size > 0) {
await Promise.allSettled(Array.from(inFlightToolUse));
}
resolveTurnDone?.();
})
.catch((err) => {
logger.error("Failed to flush final delta:", err);
resolveTurnDone?.();
Expand Down
Loading
Loading