diff --git a/assistant/src/config/bundled-skills/app-control/SKILL.md b/assistant/src/config/bundled-skills/app-control/SKILL.md new file mode 100644 index 00000000000..5e5b7f1f8ec --- /dev/null +++ b/assistant/src/config/bundled-skills/app-control/SKILL.md @@ -0,0 +1,62 @@ +--- +name: app-control +description: Drive a specific named macOS app via raw input bypassing the Accessibility tree +compatibility: "Designed for Vellum personal assistants" +metadata: + emoji: "🎯" + vellum: + display-name: "App Control" + feature-flag: "app-control" + activation-hints: + - "User explicitly directs the assistant to drive a specific named app via raw input (emulator, game, OpenGL canvas, custom-rendered Electron app)" + - "User says the macOS Accessibility tree is unhelpful or empty for the target app" + avoid-when: + - "Task can be done via the computer-use skill (general macOS UI navigation)" + - "Task can be done via a CLI / API alternative" +--- + +This skill exposes the `app_control_*` proxy tools for driving a single +named macOS application via raw input — keyboard, mouse, screenshot — that +bypasses the system Accessibility tree. Use it only when explicitly directed +to a specific app where the AX tree is unhelpful (emulators, games, OpenGL +canvases, custom-rendered Electron apps). For general macOS UI navigation +prefer the `computer-use` skill. + +Tools in this skill are proxy tools — execution is forwarded to the connected +macOS client, never handled locally by the assistant. + +## Cadence + +Take 2-3 actions per turn, then yield with a short narration so the user can +interject. Do not chain long sequences without surfacing what you are doing. + +## Always observe before acting + +Call `app_control_observe` before your first input action whenever the screen +state matters (e.g. you need to know what is on screen, where a UI element is, +or whether the app is even running). Re-observe after actions that may have +moved the window or changed visibility. 
+ +## Input choice + +- Prefer `app_control_combo` over rapid sequential `app_control_press` for + simultaneous inputs (e.g. cmd+shift+4). `combo` holds every key at once; + sequential presses interleave key-down and key-up events. +- Use `app_control_type` for literal text into a focused field. + +## Coordinate caveat + +`app_control_click` and `app_control_drag` use **window-relative** coordinates. +The window may move or resize between observation and click — if you are +uncertain whether the window has shifted, re-observe first. + +## App targeting + +Use bundle IDs (e.g. `com.example.app`) when possible — they are the most +reliable identifier. Fall back to localized process names if a bundle ID is +unavailable. + +## Ending the session + +Call `app_control_stop` when you are done. Do **not** auto-quit the controlled +app — `stop` only ends the app-control session, leaving the app running. diff --git a/assistant/src/config/bundled-skills/app-control/TOOLS.json b/assistant/src/config/bundled-skills/app-control/TOOLS.json new file mode 100644 index 00000000000..a59e0a8ea50 --- /dev/null +++ b/assistant/src/config/bundled-skills/app-control/TOOLS.json @@ -0,0 +1,278 @@ +{ + "version": 1, + "tools": [ + { + "name": "app_control_start", + "description": "Start (launch or focus) the target application. Optionally pass command-line arguments. Begins an app-control session targeting this app.", + "category": "app-control", + "risk": "medium", + "input_schema": { + "type": "object", + "properties": { + "app": { + "type": "string", + "description": "Bundle ID (preferred, e.g. 
'com.apple.Safari') or process name of the target application" + }, + "args": { + "type": "array", + "items": { "type": "string" }, + "description": "Optional command-line arguments to launch the app with" + }, + "reasoning": { + "type": "string", + "description": "Explanation of why you are starting this app" + }, + "activity": { + "type": "string", + "description": "Brief non-technical explanation of why this tool is being called" + } + }, + "required": ["app", "reasoning"] + }, + "executor": "tools/app-control-start.ts", + "execution_target": "host" + }, + { + "name": "app_control_observe", + "description": "Capture the current window state of the target application — returns lifecycle state (running/missing/minimized/occluded), an optional screenshot, and window bounds. Use this before issuing input actions, or to check progress without acting.", + "category": "app-control", + "risk": "low", + "input_schema": { + "type": "object", + "properties": { + "app": { + "type": "string", + "description": "Bundle ID (preferred, e.g. 'com.apple.Safari') or process name of the target application" + }, + "activity": { + "type": "string", + "description": "Brief non-technical explanation of why this tool is being called" + } + }, + "required": ["app", "activity"] + }, + "executor": "tools/app-control-observe.ts", + "execution_target": "host" + }, + { + "name": "app_control_press", + "description": "Press a single key in the target application, with optional modifiers (cmd/shift/option/ctrl) and a hold duration in milliseconds.", + "category": "app-control", + "risk": "low", + "input_schema": { + "type": "object", + "properties": { + "app": { + "type": "string", + "description": "Bundle ID (preferred, e.g. 'com.apple.Safari') or process name of the target application" + }, + "key": { + "type": "string", + "description": "Key identifier, e.g. 
\"return\", \"a\", \"f12\"" + }, + "modifiers": { + "type": "array", + "items": { "type": "string" }, + "description": "Modifier list, e.g. [\"cmd\", \"shift\"]. Omit for no modifiers." + }, + "duration_ms": { + "type": "integer", + "description": "Hold duration in milliseconds" + }, + "reasoning": { + "type": "string", + "description": "Explanation of why you are pressing this key" + }, + "activity": { + "type": "string", + "description": "Brief non-technical explanation of why this tool is being called" + } + }, + "required": ["app", "key", "reasoning"] + }, + "executor": "tools/app-control-press.ts", + "execution_target": "host" + }, + { + "name": "app_control_combo", + "description": "Press multiple keys simultaneously in the target application (e.g. cmd+shift+4). Use for keyboard shortcuts where every key is held at once.", + "category": "app-control", + "risk": "low", + "input_schema": { + "type": "object", + "properties": { + "app": { + "type": "string", + "description": "Bundle ID (preferred, e.g. 'com.apple.Safari') or process name of the target application" + }, + "keys": { + "type": "array", + "items": { "type": "string" }, + "description": "Sequence of keys pressed simultaneously, e.g. [\"cmd\", \"shift\", \"4\"]" + }, + "duration_ms": { + "type": "integer", + "description": "Hold duration in milliseconds" + }, + "reasoning": { + "type": "string", + "description": "Explanation of why you are pressing this combo" + }, + "activity": { + "type": "string", + "description": "Brief non-technical explanation of why this tool is being called" + } + }, + "required": ["app", "keys", "reasoning"] + }, + "executor": "tools/app-control-combo.ts", + "execution_target": "host" + }, + { + "name": "app_control_type", + "description": "Type literal text into the target application at the current focus. 
Ensure the intended field is focused before calling.", + "category": "app-control", + "risk": "low", + "input_schema": { + "type": "object", + "properties": { + "app": { + "type": "string", + "description": "Bundle ID (preferred, e.g. 'com.apple.Safari') or process name of the target application" + }, + "text": { + "type": "string", + "description": "The text to type" + }, + "reasoning": { + "type": "string", + "description": "Explanation of what you are typing and why" + }, + "activity": { + "type": "string", + "description": "Brief non-technical explanation of why this tool is being called" + } + }, + "required": ["app", "text", "reasoning"] + }, + "executor": "tools/app-control-type.ts", + "execution_target": "host" + }, + { + "name": "app_control_click", + "description": "Click at the given (x, y) coordinates inside the target application's window. Defaults to a single left-click; pass `button` and/or `double` to vary.", + "category": "app-control", + "risk": "low", + "input_schema": { + "type": "object", + "properties": { + "app": { + "type": "string", + "description": "Bundle ID (preferred, e.g. 
'com.apple.Safari') or process name of the target application" + }, + "x": { + "type": "integer", + "description": "X coordinate (window-relative)" + }, + "y": { + "type": "integer", + "description": "Y coordinate (window-relative)" + }, + "button": { + "type": "string", + "enum": ["left", "right", "middle"], + "description": "Mouse button (default: \"left\")" + }, + "double": { + "type": "boolean", + "description": "When true, performs a double-click" + }, + "reasoning": { + "type": "string", + "description": "Explanation of what you see and why you are clicking here" + }, + "activity": { + "type": "string", + "description": "Brief non-technical explanation of why this tool is being called" + } + }, + "required": ["app", "x", "y", "reasoning"] + }, + "executor": "tools/app-control-click.ts", + "execution_target": "host" + }, + { + "name": "app_control_drag", + "description": "Drag from (from_x, from_y) to (to_x, to_y) inside the target application's window. Defaults to left button.", + "category": "app-control", + "risk": "low", + "input_schema": { + "type": "object", + "properties": { + "app": { + "type": "string", + "description": "Bundle ID (preferred, e.g. 
'com.apple.Safari') or process name of the target application" + }, + "from_x": { + "type": "integer", + "description": "Source X coordinate (window-relative)" + }, + "from_y": { + "type": "integer", + "description": "Source Y coordinate (window-relative)" + }, + "to_x": { + "type": "integer", + "description": "Destination X coordinate (window-relative)" + }, + "to_y": { + "type": "integer", + "description": "Destination Y coordinate (window-relative)" + }, + "button": { + "type": "string", + "enum": ["left", "right", "middle"], + "description": "Mouse button (default: \"left\")" + }, + "reasoning": { + "type": "string", + "description": "Explanation of what you are dragging and why" + }, + "activity": { + "type": "string", + "description": "Brief non-technical explanation of why this tool is being called" + } + }, + "required": ["app", "from_x", "from_y", "to_x", "to_y", "reasoning"] + }, + "executor": "tools/app-control-drag.ts", + "execution_target": "host" + }, + { + "name": "app_control_stop", + "description": "Stop the current app-control session; the controlled app itself is left running. When `app` is omitted, ends the session for whichever app currently holds it. This is the terminal action for an app-control flow.", + "category": "app-control", + "risk": "low", + "input_schema": { + "type": "object", + "properties": { + "app": { + "type": "string", + "description": "Optional bundle ID or process name. When omitted, ends the session for whichever app currently holds it."
+ }, + "reason": { + "type": "string", + "description": "Free-form reason, surfaced for logging" + }, + "activity": { + "type": "string", + "description": "Brief non-technical explanation of why this tool is being called" + } + }, + "required": ["activity"] + }, + "executor": "tools/app-control-stop.ts", + "execution_target": "host" + } + ] +} diff --git a/assistant/src/config/bundled-skills/app-control/tools/app-control-click.ts b/assistant/src/config/bundled-skills/app-control/tools/app-control-click.ts new file mode 100644 index 00000000000..6fb9eb6fa63 --- /dev/null +++ b/assistant/src/config/bundled-skills/app-control/tools/app-control-click.ts @@ -0,0 +1,12 @@ +import { forwardAppControlProxyTool } from "../../../../tools/app-control/skill-proxy-bridge.js"; +import type { + ToolContext, + ToolExecutionResult, +} from "../../../../tools/types.js"; + +export async function run( + input: Record, + context: ToolContext, +): Promise { + return forwardAppControlProxyTool("app_control_click", input, context); +} diff --git a/assistant/src/config/bundled-skills/app-control/tools/app-control-combo.ts b/assistant/src/config/bundled-skills/app-control/tools/app-control-combo.ts new file mode 100644 index 00000000000..2210d1a942b --- /dev/null +++ b/assistant/src/config/bundled-skills/app-control/tools/app-control-combo.ts @@ -0,0 +1,12 @@ +import { forwardAppControlProxyTool } from "../../../../tools/app-control/skill-proxy-bridge.js"; +import type { + ToolContext, + ToolExecutionResult, +} from "../../../../tools/types.js"; + +export async function run( + input: Record, + context: ToolContext, +): Promise { + return forwardAppControlProxyTool("app_control_combo", input, context); +} diff --git a/assistant/src/config/bundled-skills/app-control/tools/app-control-drag.ts b/assistant/src/config/bundled-skills/app-control/tools/app-control-drag.ts new file mode 100644 index 00000000000..cdfed3b290b --- /dev/null +++ 
b/assistant/src/config/bundled-skills/app-control/tools/app-control-drag.ts @@ -0,0 +1,12 @@ +import { forwardAppControlProxyTool } from "../../../../tools/app-control/skill-proxy-bridge.js"; +import type { + ToolContext, + ToolExecutionResult, +} from "../../../../tools/types.js"; + +export async function run( + input: Record, + context: ToolContext, +): Promise { + return forwardAppControlProxyTool("app_control_drag", input, context); +} diff --git a/assistant/src/config/bundled-skills/app-control/tools/app-control-observe.ts b/assistant/src/config/bundled-skills/app-control/tools/app-control-observe.ts new file mode 100644 index 00000000000..c6e51c8beaa --- /dev/null +++ b/assistant/src/config/bundled-skills/app-control/tools/app-control-observe.ts @@ -0,0 +1,12 @@ +import { forwardAppControlProxyTool } from "../../../../tools/app-control/skill-proxy-bridge.js"; +import type { + ToolContext, + ToolExecutionResult, +} from "../../../../tools/types.js"; + +export async function run( + input: Record, + context: ToolContext, +): Promise { + return forwardAppControlProxyTool("app_control_observe", input, context); +} diff --git a/assistant/src/config/bundled-skills/app-control/tools/app-control-press.ts b/assistant/src/config/bundled-skills/app-control/tools/app-control-press.ts new file mode 100644 index 00000000000..2247a56ba66 --- /dev/null +++ b/assistant/src/config/bundled-skills/app-control/tools/app-control-press.ts @@ -0,0 +1,12 @@ +import { forwardAppControlProxyTool } from "../../../../tools/app-control/skill-proxy-bridge.js"; +import type { + ToolContext, + ToolExecutionResult, +} from "../../../../tools/types.js"; + +export async function run( + input: Record, + context: ToolContext, +): Promise { + return forwardAppControlProxyTool("app_control_press", input, context); +} diff --git a/assistant/src/config/bundled-skills/app-control/tools/app-control-start.ts b/assistant/src/config/bundled-skills/app-control/tools/app-control-start.ts new file mode 100644 
index 00000000000..1c4be643ecf --- /dev/null +++ b/assistant/src/config/bundled-skills/app-control/tools/app-control-start.ts @@ -0,0 +1,12 @@ +import { forwardAppControlProxyTool } from "../../../../tools/app-control/skill-proxy-bridge.js"; +import type { + ToolContext, + ToolExecutionResult, +} from "../../../../tools/types.js"; + +export async function run( + input: Record, + context: ToolContext, +): Promise { + return forwardAppControlProxyTool("app_control_start", input, context); +} diff --git a/assistant/src/config/bundled-skills/app-control/tools/app-control-stop.ts b/assistant/src/config/bundled-skills/app-control/tools/app-control-stop.ts new file mode 100644 index 00000000000..0e1e1e2fbdc --- /dev/null +++ b/assistant/src/config/bundled-skills/app-control/tools/app-control-stop.ts @@ -0,0 +1,12 @@ +import { forwardAppControlProxyTool } from "../../../../tools/app-control/skill-proxy-bridge.js"; +import type { + ToolContext, + ToolExecutionResult, +} from "../../../../tools/types.js"; + +export async function run( + input: Record, + context: ToolContext, +): Promise { + return forwardAppControlProxyTool("app_control_stop", input, context); +} diff --git a/assistant/src/config/bundled-skills/app-control/tools/app-control-type.ts b/assistant/src/config/bundled-skills/app-control/tools/app-control-type.ts new file mode 100644 index 00000000000..d5024bc051e --- /dev/null +++ b/assistant/src/config/bundled-skills/app-control/tools/app-control-type.ts @@ -0,0 +1,12 @@ +import { forwardAppControlProxyTool } from "../../../../tools/app-control/skill-proxy-bridge.js"; +import type { + ToolContext, + ToolExecutionResult, +} from "../../../../tools/types.js"; + +export async function run( + input: Record, + context: ToolContext, +): Promise { + return forwardAppControlProxyTool("app_control_type", input, context); +} diff --git a/assistant/src/config/bundled-tool-registry.ts b/assistant/src/config/bundled-tool-registry.ts index 8a520474ce9..b7b66294bb9 100644 --- 
a/assistant/src/config/bundled-tool-registry.ts +++ b/assistant/src/config/bundled-tool-registry.ts @@ -24,6 +24,15 @@ import * as appCreate from "./bundled-skills/app-builder/tools/app-create.js"; import * as appDelete from "./bundled-skills/app-builder/tools/app-delete.js"; import * as appGenerateIcon from "./bundled-skills/app-builder/tools/app-generate-icon.js"; import * as appRefresh from "./bundled-skills/app-builder/tools/app-refresh.js"; +// ── app-control ──────────────────────────────────────────────────────────────── +import * as appControlClick from "./bundled-skills/app-control/tools/app-control-click.js"; +import * as appControlCombo from "./bundled-skills/app-control/tools/app-control-combo.js"; +import * as appControlDrag from "./bundled-skills/app-control/tools/app-control-drag.js"; +import * as appControlObserve from "./bundled-skills/app-control/tools/app-control-observe.js"; +import * as appControlPress from "./bundled-skills/app-control/tools/app-control-press.js"; +import * as appControlStart from "./bundled-skills/app-control/tools/app-control-start.js"; +import * as appControlStop from "./bundled-skills/app-control/tools/app-control-stop.js"; +import * as appControlType from "./bundled-skills/app-control/tools/app-control-type.js"; // ── computer-use ─────────────────────────────────────────────────────────────── import * as computerUseClick from "./bundled-skills/computer-use/tools/computer-use-click.js"; import * as computerUseDone from "./bundled-skills/computer-use/tools/computer-use-done.js"; @@ -125,6 +134,16 @@ export const bundledToolRegistry = new Map([ ["app-builder:tools/app-refresh.ts", appRefresh], ["app-builder:tools/app-generate-icon.ts", appGenerateIcon], + // app-control + ["app-control:tools/app-control-start.ts", appControlStart], + ["app-control:tools/app-control-observe.ts", appControlObserve], + ["app-control:tools/app-control-press.ts", appControlPress], + ["app-control:tools/app-control-combo.ts", appControlCombo], + 
["app-control:tools/app-control-type.ts", appControlType], + ["app-control:tools/app-control-click.ts", appControlClick], + ["app-control:tools/app-control-drag.ts", appControlDrag], + ["app-control:tools/app-control-stop.ts", appControlStop], + // computer-use ["computer-use:tools/computer-use-observe.ts", computerUseObserve], ["computer-use:tools/computer-use-click.ts", computerUseClick], diff --git a/meta/feature-flags/feature-flag-registry.json b/meta/feature-flags/feature-flag-registry.json index f722bff56f9..d5cc90cf90d 100644 --- a/meta/feature-flags/feature-flag-registry.json +++ b/meta/feature-flags/feature-flag-registry.json @@ -256,6 +256,14 @@ "label": "Account Deletion", "description": "Surfaces the user-initiated account deletion flow in client settings.", "defaultEnabled": false + }, + { + "id": "app-control", + "scope": "assistant", + "key": "app-control", + "label": "App Control", + "description": "Enable the app-control skill (per-app screenshot + raw input bypassing AX tree)", + "defaultEnabled": false } ] }