diff --git a/assistant/src/__tests__/app-control-tool-schemas.test.ts b/assistant/src/__tests__/app-control-tool-schemas.test.ts
new file mode 100644
index 00000000000..8440ea916e3
--- /dev/null
+++ b/assistant/src/__tests__/app-control-tool-schemas.test.ts
@@ -0,0 +1,506 @@
+import { describe, expect, test } from "bun:test";
+
+import { RiskLevel } from "../permissions/types.js";
+import {
+  appControlClickTool,
+  appControlComboTool,
+  appControlDragTool,
+  appControlObserveTool,
+  appControlPressTool,
+  appControlStartTool,
+  appControlStopTool,
+  appControlTools,
+  appControlTypeTool,
+} from "../tools/app-control/definitions.js";
+import { forwardAppControlProxyTool } from "../tools/app-control/skill-proxy-bridge.js";
+import type { Tool, ToolContext } from "../tools/types.js";
+
+// ---------------------------------------------------------------------------
+// Helpers
+// ---------------------------------------------------------------------------
+
+interface JsonSchemaProp {
+  type?: string;
+  enum?: string[];
+  items?: { type?: string };
+}
+
+interface JsonSchema {
+  type?: string;
+  required?: string[];
+  properties?: Record<string, JsonSchemaProp>;
+}
+
+/** Reads a tool's declared input schema, viewed through the minimal JsonSchema shape above. */
+function schema(tool: Tool): JsonSchema {
+  return tool.getDefinition().input_schema as JsonSchema;
+}
+
+/**
+ * Lightweight, schema-driven validator covering the cases this PR exercises:
+ *   - all `required` keys must be present
+ *   - typed properties (`string` / `integer` / `number` / `boolean`) must match
+ *   - `enum`-constrained string properties must be in the allowed set
+ *   - `array`-typed properties must be arrays (and items must satisfy
+ *     declared item types when present)
+ *
+ * This mirrors what a JSON-Schema validator like ajv would do for these
+ * simple shapes, without pulling ajv in as a direct dependency.
+ */
+function validate(
+  s: JsonSchema,
+  input: Record<string, unknown>,
+): { ok: boolean; error?: string } {
+  for (const key of s.required ?? []) {
+    if (!(key in input)) {
+      return { ok: false, error: `missing required property: ${key}` };
+    }
+  }
+  for (const [key, propSchema] of Object.entries(s.properties ?? {})) {
+    if (!(key in input)) continue;
+    const value = input[key];
+    if (!propSchema.type) continue;
+    switch (propSchema.type) {
+      case "string":
+        if (typeof value !== "string") {
+          return { ok: false, error: `${key} must be string` };
+        }
+        if (propSchema.enum && !propSchema.enum.includes(value)) {
+          return {
+            ok: false,
+            error: `${key} must be one of ${propSchema.enum.join(", ")}`,
+          };
+        }
+        break;
+      case "integer":
+        if (typeof value !== "number" || !Number.isInteger(value)) {
+          return { ok: false, error: `${key} must be integer` };
+        }
+        break;
+      case "number":
+        if (typeof value !== "number") {
+          return { ok: false, error: `${key} must be number` };
+        }
+        break;
+      case "boolean":
+        if (typeof value !== "boolean") {
+          return { ok: false, error: `${key} must be boolean` };
+        }
+        break;
+      case "array":
+        if (!Array.isArray(value)) {
+          return { ok: false, error: `${key} must be array` };
+        }
+        if (propSchema.items?.type) {
+          for (const item of value) {
+            if (
+              propSchema.items.type === "string" &&
+              typeof item !== "string"
+            ) {
+              return { ok: false, error: `${key} items must be string` };
+            }
+          }
+        }
+        break;
+    }
+  }
+  return { ok: true };
+}
+
+const ctx: ToolContext = {
+  workingDir: "/tmp",
+  conversationId: "test-conversation",
+  trustClass: "guardian",
+};
+
+// ---------------------------------------------------------------------------
+// Aggregate invariants
+// ---------------------------------------------------------------------------
+
+describe("app-control tool definitions (aggregate)", () => {
+  test("appControlTools contains exactly 8 tools", () => {
+    expect(appControlTools.length).toBe(8);
+  });
+
+  test("all tools have proxy execution mode", () => {
+    for (const tool of appControlTools) {
+      expect(tool.executionMode).toBe("proxy");
+    }
+  });
+
+  test("all tools belong to the app-control category", () => {
+    for (const tool of appControlTools) {
+      expect(tool.category).toBe("app-control");
+    }
+  });
+
+  test("all tools have unique names", () => {
+    const names = appControlTools.map((t) => t.name);
+    expect(new Set(names).size).toBe(names.length);
+  });
+
+  test("all tool names use the app_control_ prefix", () => {
+    for (const tool of appControlTools) {
+      expect(tool.name.startsWith("app_control_")).toBe(true);
+    }
+  });
+
+  test("all tools have non-empty descriptions", () => {
+    for (const tool of appControlTools) {
+      expect(tool.description.length).toBeGreaterThan(0);
+    }
+  });
+
+  test("stub execute() throws for every tool", () => {
+    for (const tool of appControlTools) {
+      expect(() => tool.execute({}, ctx)).toThrow(
+        "app-control tool must be forwarded to the connected client",
+      );
+    }
+  });
+});
+
+// ---------------------------------------------------------------------------
+// Per-tool schema cases
+// ---------------------------------------------------------------------------
+
+describe("app_control_start", () => {
+  const s = schema(appControlStartTool);
+
+  test("well-formed input passes (with args)", () => {
+    expect(
+      validate(s, {
+        app: "com.apple.Safari",
+        args: ["--new-window"],
+        reasoning: "open Safari fresh",
+      }).ok,
+    ).toBe(true);
+  });
+
+  test("well-formed input passes (without optional args)", () => {
+    expect(
+      validate(s, { app: "com.apple.Safari", reasoning: "focus" }).ok,
+    ).toBe(true);
+  });
+
+  test("missing required app rejects", () => {
+    const result = validate(s, { reasoning: "focus" });
+    expect(result.ok).toBe(false);
+    expect(result.error).toContain("app");
+  });
+
+  test("default risk level is Medium", () => {
+    expect(appControlStartTool.defaultRiskLevel).toBe(RiskLevel.Medium);
+  });
+});
+
+describe("app_control_observe", () => {
+  const s = schema(appControlObserveTool);
+
+  test("well-formed input passes", () => {
+    expect(
+      validate(s, { app: "com.apple.Safari", activity: "check state" }).ok,
+    ).toBe(true);
+  });
+
+  test("missing required app rejects", () => {
+    const result = validate(s, { activity: "check state" });
+    expect(result.ok).toBe(false);
+    expect(result.error).toContain("app");
+  });
+
+  test("default risk level is Low", () => {
+    expect(appControlObserveTool.defaultRiskLevel).toBe(RiskLevel.Low);
+  });
+});
+
+describe("app_control_press", () => {
+  const s = schema(appControlPressTool);
+
+  test("well-formed input passes (with optional fields)", () => {
+    expect(
+      validate(s, {
+        app: "com.apple.Safari",
+        key: "return",
+        modifiers: ["cmd"],
+        duration_ms: 50,
+        reasoning: "submit form",
+      }).ok,
+    ).toBe(true);
+  });
+
+  test("well-formed input passes (minimal)", () => {
+    expect(
+      validate(s, {
+        app: "com.apple.Safari",
+        key: "a",
+        reasoning: "type a",
+      }).ok,
+    ).toBe(true);
+  });
+
+  test("missing required app rejects", () => {
+    const result = validate(s, { key: "a", reasoning: "type a" });
+    expect(result.ok).toBe(false);
+    expect(result.error).toContain("app");
+  });
+
+  test("missing required key rejects", () => {
+    const result = validate(s, {
+      app: "com.apple.Safari",
+      reasoning: "press something",
+    });
+    expect(result.ok).toBe(false);
+    expect(result.error).toContain("key");
+  });
+});
+
+describe("app_control_combo", () => {
+  const s = schema(appControlComboTool);
+
+  test("well-formed input passes", () => {
+    expect(
+      validate(s, {
+        app: "com.apple.Safari",
+        keys: ["cmd", "shift", "4"],
+        reasoning: "screenshot region",
+      }).ok,
+    ).toBe(true);
+  });
+
+  test("missing required app rejects", () => {
+    const result = validate(s, {
+      keys: ["cmd", "a"],
+      reasoning: "select all",
+    });
+    expect(result.ok).toBe(false);
+    expect(result.error).toContain("app");
+  });
+
+  test("non-array keys rejects", () => {
+    const result = validate(s, {
+      app: "com.apple.Safari",
+      keys: "cmd+a",
+      reasoning: "select all",
+    });
+    expect(result.ok).toBe(false);
+    expect(result.error).toContain("keys");
+  });
+});
+
+describe("app_control_type", () => {
+  const s = schema(appControlTypeTool);
+
+  test("well-formed input passes", () => {
+    expect(
+      validate(s, {
+        app: "com.apple.Safari",
+        text: "hello",
+        reasoning: "search",
+      }).ok,
+    ).toBe(true);
+  });
+
+  test("missing required app rejects", () => {
+    const result = validate(s, { text: "hello", reasoning: "search" });
+    expect(result.ok).toBe(false);
+    expect(result.error).toContain("app");
+  });
+
+  test("missing required text rejects", () => {
+    const result = validate(s, {
+      app: "com.apple.Safari",
+      reasoning: "search",
+    });
+    expect(result.ok).toBe(false);
+    expect(result.error).toContain("text");
+  });
+});
+
+describe("app_control_click", () => {
+  const s = schema(appControlClickTool);
+
+  test("well-formed input passes (defaults)", () => {
+    expect(
+      validate(s, {
+        app: "com.apple.Safari",
+        x: 100,
+        y: 200,
+        reasoning: "tap link",
+      }).ok,
+    ).toBe(true);
+  });
+
+  test("well-formed input passes (right button + double)", () => {
+    expect(
+      validate(s, {
+        app: "com.apple.Safari",
+        x: 100,
+        y: 200,
+        button: "right",
+        double: true,
+        reasoning: "context menu",
+      }).ok,
+    ).toBe(true);
+  });
+
+  test("missing required app rejects", () => {
+    const result = validate(s, { x: 100, y: 200, reasoning: "click" });
+    expect(result.ok).toBe(false);
+    expect(result.error).toContain("app");
+  });
+
+  test("missing required coordinate rejects", () => {
+    const result = validate(s, {
+      app: "com.apple.Safari",
+      x: 100,
+      reasoning: "click",
+    });
+    expect(result.ok).toBe(false);
+    expect(result.error).toContain("y");
+  });
+
+  test("invalid button enum value rejects", () => {
+    const result = validate(s, {
+      app: "com.apple.Safari",
+      x: 100,
+      y: 200,
+      button: "scroll",
+      reasoning: "click",
+    });
+    expect(result.ok).toBe(false);
+    expect(result.error).toContain("button");
+  });
+
+  test("button enum is left/right/middle", () => {
+    const props = s.properties as Record<string, JsonSchemaProp>;
+    expect(props.button.enum).toEqual(["left", "right", "middle"]);
+  });
+});
+
+describe("app_control_drag", () => {
+  const s = schema(appControlDragTool);
+
+  test("well-formed input passes", () => {
+    expect(
+      validate(s, {
+        app: "com.apple.Safari",
+        from_x: 10,
+        from_y: 20,
+        to_x: 100,
+        to_y: 200,
+        reasoning: "drag handle",
+      }).ok,
+    ).toBe(true);
+  });
+
+  test("missing required app rejects", () => {
+    const result = validate(s, {
+      from_x: 10,
+      from_y: 20,
+      to_x: 100,
+      to_y: 200,
+      reasoning: "drag",
+    });
+    expect(result.ok).toBe(false);
+    expect(result.error).toContain("app");
+  });
+
+  test("missing required destination rejects", () => {
+    const result = validate(s, {
+      app: "com.apple.Safari",
+      from_x: 10,
+      from_y: 20,
+      reasoning: "drag",
+    });
+    expect(result.ok).toBe(false);
+    expect(result.error).toContain("to_");
+  });
+
+  test("invalid button enum value rejects", () => {
+    const result = validate(s, {
+      app: "com.apple.Safari",
+      from_x: 10,
+      from_y: 20,
+      to_x: 100,
+      to_y: 200,
+      button: "scroll",
+      reasoning: "drag",
+    });
+    expect(result.ok).toBe(false);
+    expect(result.error).toContain("button");
+  });
+});
+
+describe("app_control_stop", () => {
+  const s = schema(appControlStopTool);
+
+  test("well-formed input passes (no app — terminal)", () => {
+    expect(validate(s, { activity: "wrap up" }).ok).toBe(true);
+  });
+
+  test("well-formed input passes (with app + reason)", () => {
+    expect(
+      validate(s, {
+        app: "com.apple.Safari",
+        reason: "task complete",
+        activity: "wrap up",
+      }).ok,
+    ).toBe(true);
+  });
+
+  test("missing activity rejects", () => {
+    const result = validate(s, { reason: "task complete" });
+    expect(result.ok).toBe(false);
+    expect(result.error).toContain("activity");
+  });
+
+  test("app is optional (terminal tool may omit it)", () => {
+    expect(s.required ?? []).not.toContain("app");
+  });
+});
+
+// ---------------------------------------------------------------------------
+// skill-proxy-bridge
+// ---------------------------------------------------------------------------
+
+describe("forwardAppControlProxyTool", () => {
+  test("returns error when no proxy resolver available", async () => {
+    const result = await forwardAppControlProxyTool(
+      "app_control_click",
+      { app: "com.apple.Safari", x: 1, y: 2 },
+      ctx,
+    );
+    expect(result.isError).toBe(true);
+    expect(result.content).toContain("no proxy resolver available");
+    expect(result.content).toContain("app_control_click");
+  });
+
+  test("delegates to proxy resolver when available", async () => {
+    let capturedName = "";
+    let capturedInput: Record<string, unknown> = {};
+    const ctxWithProxy: ToolContext = {
+      ...ctx,
+      proxyToolResolver: async (name, input) => {
+        capturedName = name;
+        capturedInput = input;
+        return { content: `Forwarded ${name}`, isError: false };
+      },
+    };
+
+    const result = await forwardAppControlProxyTool(
+      "app_control_press",
+      { app: "com.apple.Safari", key: "return", reasoning: "submit" },
+      ctxWithProxy,
+    );
+
+    expect(result.isError).toBe(false);
+    expect(result.content).toBe("Forwarded app_control_press");
+    expect(capturedName).toBe("app_control_press");
+    expect(capturedInput).toEqual({
+      app: "com.apple.Safari",
+      key: "return",
+      reasoning: "submit",
+    });
+  });
+});
diff --git a/assistant/src/tools/app-control/definitions.ts b/assistant/src/tools/app-control/definitions.ts
new file mode 100644
index 00000000000..b7a98a51de3
--- /dev/null
+++ b/assistant/src/tools/app-control/definitions.ts
@@ -0,0 +1,408 @@
+/**
+ * App-control tool definitions.
+ *
+ * These tools target a specific application (by bundle ID or process name) on
+ * the desktop client (host machine). Each tool is a proxy: execution is
+ * forwarded to the connected client and never handled locally by the daemon.
+ *
+ * The eight tools mirror the input wire types declared in
+ * `daemon/message-types/host-app-control.ts`:
+ *   start | observe | press | combo | type | click | drag | stop
+ *
+ * Distinct from the system-wide `computer_use_*` proxy tools — app-control
+ * scopes input/observation to a single targeted app window.
+ */
+
+import { RiskLevel } from "../../permissions/types.js";
+import type { ToolDefinition } from "../../providers/types.js";
+import type { Tool, ToolExecutionResult } from "../types.js";
+
+// ---------------------------------------------------------------------------
+// Helpers
+// ---------------------------------------------------------------------------
+
+/**
+ * Shared `execute` stub assigned to every tool in this file.
+ *
+ * It is not `async`, so the error is thrown synchronously at the call site
+ * rather than surfacing as a rejected promise.
+ *
+ * @throws Error always.
+ */
+function proxyExecute(): Promise<ToolExecutionResult> {
+  throw new Error("app-control tool must be forwarded to the connected client");
+}
+
+const activityProperty = {
+  type: "string" as const,
+  description:
+    "Brief non-technical explanation of why this tool is being called",
+};
+
+const appProperty = {
+  type: "string" as const,
+  description:
+    "Bundle ID (preferred, e.g. 'com.apple.Safari') or process name of the target application",
+};
+
+const buttonEnum = ["left", "right", "middle"] as const;
+
+// ---------------------------------------------------------------------------
+// start — launch (or focus) the target app, optionally with CLI args
+// ---------------------------------------------------------------------------
+
+export const appControlStartTool: Tool = {
+  name: "app_control_start",
+  description:
+    "Start (launch or focus) the target application. Optionally pass command-line arguments. Begins an app-control session targeting this app.",
+  category: "app-control",
+  defaultRiskLevel: RiskLevel.Medium,
+  executionMode: "proxy",
+
+  getDefinition(): ToolDefinition {
+    return {
+      name: this.name,
+      description: this.description,
+      input_schema: {
+        type: "object",
+        properties: {
+          app: appProperty,
+          args: {
+            type: "array",
+            items: { type: "string" },
+            description:
+              "Optional command-line arguments to launch the app with",
+          },
+          reasoning: {
+            type: "string",
+            description: "Explanation of why you are starting this app",
+          },
+          activity: activityProperty,
+        },
+        required: ["app", "reasoning"],
+      },
+    };
+  },
+
+  execute: proxyExecute,
+};
+
+// ---------------------------------------------------------------------------
+// observe — capture window state of the target app
+// ---------------------------------------------------------------------------
+
+export const appControlObserveTool: Tool = {
+  name: "app_control_observe",
+  description:
+    "Capture the current window state of the target application — returns lifecycle state (running/missing/minimized/occluded), an optional screenshot, and window bounds. Use this before issuing input actions, or to check progress without acting.",
+  category: "app-control",
+  defaultRiskLevel: RiskLevel.Low,
+  executionMode: "proxy",
+
+  getDefinition(): ToolDefinition {
+    return {
+      name: this.name,
+      description: this.description,
+      input_schema: {
+        type: "object",
+        properties: {
+          app: appProperty,
+          activity: activityProperty,
+        },
+        required: ["app", "activity"],
+      },
+    };
+  },
+
+  execute: proxyExecute,
+};
+
+// ---------------------------------------------------------------------------
+// press — single key with optional modifiers and hold duration
+// ---------------------------------------------------------------------------
+
+export const appControlPressTool: Tool = {
+  name: "app_control_press",
+  description:
+    "Press a single key in the target application, with optional modifiers (cmd/shift/option/ctrl) and a hold duration in milliseconds.",
+  category: "app-control",
+  defaultRiskLevel: RiskLevel.Low,
+  executionMode: "proxy",
+
+  getDefinition(): ToolDefinition {
+    return {
+      name: this.name,
+      description: this.description,
+      input_schema: {
+        type: "object",
+        properties: {
+          app: appProperty,
+          key: {
+            type: "string",
+            description: 'Key identifier, e.g. "return", "a", "f12"',
+          },
+          modifiers: {
+            type: "array",
+            items: { type: "string" },
+            description:
+              'Modifier list, e.g. ["cmd", "shift"]. Omit for no modifiers.',
+          },
+          duration_ms: {
+            type: "integer",
+            description: "Hold duration in milliseconds",
+          },
+          reasoning: {
+            type: "string",
+            description: "Explanation of why you are pressing this key",
+          },
+          activity: activityProperty,
+        },
+        required: ["app", "key", "reasoning"],
+      },
+    };
+  },
+
+  execute: proxyExecute,
+};
+
+// ---------------------------------------------------------------------------
+// combo — multiple keys pressed simultaneously
+// ---------------------------------------------------------------------------
+
+export const appControlComboTool: Tool = {
+  name: "app_control_combo",
+  description:
+    "Press multiple keys simultaneously in the target application (e.g. cmd+shift+4). Use for keyboard shortcuts where every key is held at once.",
+  category: "app-control",
+  defaultRiskLevel: RiskLevel.Low,
+  executionMode: "proxy",
+
+  getDefinition(): ToolDefinition {
+    return {
+      name: this.name,
+      description: this.description,
+      input_schema: {
+        type: "object",
+        properties: {
+          app: appProperty,
+          keys: {
+            type: "array",
+            items: { type: "string" },
+            description:
+              'Sequence of keys pressed simultaneously, e.g. ["cmd", "shift", "4"]',
+          },
+          duration_ms: {
+            type: "integer",
+            description: "Hold duration in milliseconds",
+          },
+          reasoning: {
+            type: "string",
+            description: "Explanation of why you are pressing this combo",
+          },
+          activity: activityProperty,
+        },
+        required: ["app", "keys", "reasoning"],
+      },
+    };
+  },
+
+  execute: proxyExecute,
+};
+
+// ---------------------------------------------------------------------------
+// type — type literal text into the target app
+// ---------------------------------------------------------------------------
+
+export const appControlTypeTool: Tool = {
+  name: "app_control_type",
+  description:
+    "Type literal text into the target application at the current focus. Ensure the intended field is focused before calling.",
+  category: "app-control",
+  defaultRiskLevel: RiskLevel.Low,
+  executionMode: "proxy",
+
+  getDefinition(): ToolDefinition {
+    return {
+      name: this.name,
+      description: this.description,
+      input_schema: {
+        type: "object",
+        properties: {
+          app: appProperty,
+          text: {
+            type: "string",
+            description: "The text to type",
+          },
+          reasoning: {
+            type: "string",
+            description: "Explanation of what you are typing and why",
+          },
+          activity: activityProperty,
+        },
+        required: ["app", "text", "reasoning"],
+      },
+    };
+  },
+
+  execute: proxyExecute,
+};
+
+// ---------------------------------------------------------------------------
+// click — click at window-relative (x, y) coordinates
+// ---------------------------------------------------------------------------
+
+export const appControlClickTool: Tool = {
+  name: "app_control_click",
+  description:
+    "Click at the given (x, y) coordinates inside the target application's window. Defaults to a single left-click; pass `button` and/or `double` to vary.",
+  category: "app-control",
+  defaultRiskLevel: RiskLevel.Low,
+  executionMode: "proxy",
+
+  getDefinition(): ToolDefinition {
+    return {
+      name: this.name,
+      description: this.description,
+      input_schema: {
+        type: "object",
+        properties: {
+          app: appProperty,
+          x: {
+            type: "integer",
+            description: "X coordinate (window-relative)",
+          },
+          y: {
+            type: "integer",
+            description: "Y coordinate (window-relative)",
+          },
+          button: {
+            type: "string",
+            enum: [...buttonEnum],
+            description: 'Mouse button (default: "left")',
+          },
+          double: {
+            type: "boolean",
+            description: "When true, performs a double-click",
+          },
+          reasoning: {
+            type: "string",
+            description:
+              "Explanation of what you see and why you are clicking here",
+          },
+          activity: activityProperty,
+        },
+        required: ["app", "x", "y", "reasoning"],
+      },
+    };
+  },
+
+  execute: proxyExecute,
+};
+
+// ---------------------------------------------------------------------------
+// drag — drag from one coord to another inside the target app
+// ---------------------------------------------------------------------------
+
+export const appControlDragTool: Tool = {
+  name: "app_control_drag",
+  description:
+    "Drag from (from_x, from_y) to (to_x, to_y) inside the target application's window. Defaults to left button.",
+  category: "app-control",
+  defaultRiskLevel: RiskLevel.Low,
+  executionMode: "proxy",
+
+  getDefinition(): ToolDefinition {
+    return {
+      name: this.name,
+      description: this.description,
+      input_schema: {
+        type: "object",
+        properties: {
+          app: appProperty,
+          from_x: {
+            type: "integer",
+            description: "Source X coordinate (window-relative)",
+          },
+          from_y: {
+            type: "integer",
+            description: "Source Y coordinate (window-relative)",
+          },
+          to_x: {
+            type: "integer",
+            description: "Destination X coordinate (window-relative)",
+          },
+          to_y: {
+            type: "integer",
+            description: "Destination Y coordinate (window-relative)",
+          },
+          button: {
+            type: "string",
+            enum: [...buttonEnum],
+            description: 'Mouse button (default: "left")',
+          },
+          reasoning: {
+            type: "string",
+            description: "Explanation of what you are dragging and why",
+          },
+          activity: activityProperty,
+        },
+        required: ["app", "from_x", "from_y", "to_x", "to_y", "reasoning"],
+      },
+    };
+  },
+
+  execute: proxyExecute,
+};
+
+// ---------------------------------------------------------------------------
+// stop — terminal: end the app-control session
+// ---------------------------------------------------------------------------
+
+export const appControlStopTool: Tool = {
+  name: "app_control_stop",
+  description:
+    "Stop the current app-control session. When `app` is omitted, stops whichever app currently holds the session. This is the terminal action for an app-control flow.",
+  category: "app-control",
+  defaultRiskLevel: RiskLevel.Low,
+  executionMode: "proxy",
+
+  getDefinition(): ToolDefinition {
+    return {
+      name: this.name,
+      description: this.description,
+      input_schema: {
+        type: "object",
+        properties: {
+          app: {
+            type: "string",
+            description:
+              "Optional bundle ID or process name. When omitted, stops whichever app currently holds the session.",
+          },
+          reason: {
+            type: "string",
+            description: "Free-form reason, surfaced for logging",
+          },
+          activity: activityProperty,
+        },
+        required: ["activity"],
+      },
+    };
+  },
+
+  execute: proxyExecute,
+};
+
+// ---------------------------------------------------------------------------
+// All tools exported as array for convenience
+// ---------------------------------------------------------------------------
+
+export const appControlTools: Tool[] = [
+  appControlStartTool,
+  appControlObserveTool,
+  appControlPressTool,
+  appControlComboTool,
+  appControlTypeTool,
+  appControlClickTool,
+  appControlDragTool,
+  appControlStopTool,
+];
diff --git a/assistant/src/tools/app-control/skill-proxy-bridge.ts b/assistant/src/tools/app-control/skill-proxy-bridge.ts
new file mode 100644
index 00000000000..18203cba189
--- /dev/null
+++ b/assistant/src/tools/app-control/skill-proxy-bridge.ts
@@ -0,0 +1,28 @@
+/**
+ * Shared helper for app-control skill wrapper scripts.
+ *
+ * Each wrapper calls forwardAppControlProxyTool() to delegate execution to
+ * the proxy resolver, which forwards the call to the connected client.
+ */
+
+import type { ToolContext, ToolExecutionResult } from "../types.js";
+
+/**
+ * Forward an app-control proxy tool call through the context's proxyToolResolver.
+ *
+ * Returns a clear error result if the resolver is missing (e.g. when the tool
+ * is invoked outside a session with a connected client).
+ */
+export function forwardAppControlProxyTool(
+  toolName: string,
+  input: Record<string, unknown>,
+  context: ToolContext,
+): Promise<ToolExecutionResult> {
+  if (!context.proxyToolResolver) {
+    return Promise.resolve({
+      content: `Cannot execute ${toolName}: no proxy resolver available. This tool requires a connected client.`,
+      isError: true,
+    });
+  }
+  return context.proxyToolResolver(toolName, input);
+}