Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 26 additions & 3 deletions clients/macos/vellum-assistant/AppControl/AppControlExecutor.swift
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,10 @@ enum AppControlExecutor {
pngBase64: capture.pngBase64,
windowBounds: capture.bounds,
executionResult: "observed: \(resolved.name) (pid=\(resolved.pid))",
executionError: nil
// Surface ScreenCaptureKit failures (commonly missing Screen
// Recording permission) so the daemon/LLM doesn't see a "successful"
// observe with no image and no signal to the user.
executionError: capture.captureError
Comment on lines 148 to +152
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🚩 performObserve now returns both executionResult and executionError simultaneously — new pattern

Previously, all action handlers followed a convention where executionResult and executionError were mutually exclusive (success paths set executionResult with executionError: nil; failure paths set executionError with executionResult defaulting to nil). The updated performObserve at line 148-152 now sets both when the window is found but capture fails: executionResult: "observed: ..." alongside executionError: capture.captureError. This represents a partial-success semantic (window found, capture failed) which is reasonable, but it's a new pattern. If the daemon-side TypeScript code treats executionError != nil as a hard failure signal, the coexisting executionResult might be ignored or cause unexpected behavior. Worth verifying on the daemon side.

Open in Devin Review

Was this helpful? React with 👍 or 👎 to provide feedback.

)
}

Expand Down Expand Up @@ -283,10 +286,12 @@ enum AppControlExecutor {
state: capture.state,
pngBase64: capture.pngBase64,
windowBounds: capture.bounds,
executionError: "Window not visible (state=\(capture.state.rawValue))"
executionError: boundsMissingExecutionError(capture)
)
}

// Bounds came through, so a missing PNG is non-fatal: the click can
// proceed without a screenshot. Ignore `capture.captureError` here.
do {
try AppMouse.click(
pid: resolved.pid,
Expand Down Expand Up @@ -341,10 +346,12 @@ enum AppControlExecutor {
state: capture.state,
pngBase64: capture.pngBase64,
windowBounds: capture.bounds,
executionError: "Window not visible (state=\(capture.state.rawValue))"
executionError: boundsMissingExecutionError(capture)
)
}

// Bounds came through, so a missing PNG is non-fatal: the drag can
// proceed without a screenshot. Ignore `capture.captureError` here.
do {
try AppMouse.drag(
pid: resolved.pid,
Expand Down Expand Up @@ -387,6 +394,22 @@ enum AppControlExecutor {
)
}

// MARK: - capture error mapping

/// Pick an `executionError` value for the bounds-missing branch of click
/// and drag. Bounds are required by those tools to translate the
/// caller-supplied coordinates into screen space — so when bounds are
/// missing we always return a non-nil error.
///
/// We prefer `capture.captureError` when present (it tells the user *why*
/// we couldn't get bounds — commonly missing Screen Recording permission)
/// over a bare state-classification message. Marked `internal` for unit
/// testing; not part of the public executor surface.
static func boundsMissingExecutionError(_ capture: AppWindowCapture.CaptureResult) -> String {
return capture.captureError
?? "Window not visible (state=\(capture.state.rawValue))"
}

// MARK: - PID resolution

/// Resolves a user-supplied app identifier to a running PID and a display
Expand Down
57 changes: 46 additions & 11 deletions clients/macos/vellum-assistant/AppControl/AppWindowCapture.swift
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,12 @@ private let log = Logger(subsystem: Bundle.appBundleIdentifier, category: "AppWi
/// (e.g. minimized to Dock or app is hidden).
/// - `.missing` — no running NSRunningApplication has the requested PID.
///
/// PNG bytes (base64) and `WindowBounds` are populated when a normal window was
/// found and ScreenCaptureKit was able to capture it (`state == .running`).
/// `WindowBounds` is populated whenever a layer-0 window matched
/// (`state == .running`). `pngBase64` is populated only when ScreenCaptureKit
/// also succeeded; on capture failure it stays `nil` and `captureError`
/// carries the underlying reason (most commonly: Screen Recording permission
/// is not granted). Callers must not assume "running implies image" — `state`
/// describes the window, `captureError` describes the screenshot.
///
/// Window filtering uses `CGWindowListCopyWindowInfo` (which remains available in
/// macOS 15) to identify on-screen layer-0 windows owned by the target PID. The
Expand All @@ -30,6 +34,19 @@ enum AppWindowCapture {
let state: HostAppControlState
let pngBase64: String?
let bounds: WindowBounds?
/// Set when ScreenCaptureKit failed even though the window exists. Most
/// commonly indicates missing Screen Recording permission. The window
/// `state` remains correctly classified (`.running`/`.minimized`/
/// `.missing`); this field is an orthogonal signal that *capture* failed.
let captureError: String?
}

/// Inner result for the ScreenCaptureKit path. Distinguishes "no PNG, here's
/// why" from "no PNG, no error" so callers can surface the failure reason
/// without conflating it with the window-state classification.
private struct PNGCaptureResult {
let pngBase64: String?
let error: String?
}

/// Capture the frontmost normal window owned by `pid`. See type docs for state semantics.
Expand All @@ -56,13 +73,19 @@ enum AppWindowCapture {
return CaptureResult(
state: processIsAlive ? .minimized : .missing,
pngBase64: nil,
bounds: nil
bounds: nil,
captureError: nil
)
}

let bounds = parseBounds(entry[kCGWindowBounds])
let pngBase64 = await captureWindowPNG(windowID: windowNumber)
return CaptureResult(state: .running, pngBase64: pngBase64, bounds: bounds)
let png = await captureWindowPNG(windowID: windowNumber)
return CaptureResult(
state: .running,
pngBase64: png.pngBase64,
bounds: bounds,
captureError: png.error
)
}

// MARK: - Private helpers
Expand All @@ -83,12 +106,20 @@ enum AppWindowCapture {
)
}

private static func captureWindowPNG(windowID: CGWindowID) async -> String? {
/// Capture a single PNG of `windowID` via ScreenCaptureKit. Returns the
/// base64 PNG on success, or a human-readable error message describing why
/// capture failed. Empirically, missing Screen Recording permission may
/// either *throw* (most common, observed in `ScreenCapture.swift`) or
/// silently return an empty `SCShareableContent.windows` list on some
/// macOS versions — we surface a permission hint in both branches so the
/// daemon and the LLM can suggest the right fix.
private static func captureWindowPNG(windowID: CGWindowID) async -> PNGCaptureResult {
do {
let shareable = try await SCShareableContent.current
guard let scWindow = shareable.windows.first(where: { $0.windowID == windowID }) else {
log.warning("AppWindowCapture: SCShareableContent missing windowID \(windowID)")
return nil
let message = "ScreenCaptureKit could not find window \(windowID) — Screen Recording permission may be required (System Settings > Privacy & Security > Screen & System Audio Recording)"
log.warning("AppWindowCapture: \(message, privacy: .public)")
return PNGCaptureResult(pngBase64: nil, error: message)
}

let filter = SCContentFilter(desktopIndependentWindow: scWindow)
Expand All @@ -102,10 +133,14 @@ enum AppWindowCapture {
contentFilter: filter,
configuration: config
)
return encodePNGBase64(cgImage: cgImage)
guard let png = encodePNGBase64(cgImage: cgImage) else {
return PNGCaptureResult(pngBase64: nil, error: "Failed to encode captured window as PNG")
}
return PNGCaptureResult(pngBase64: png, error: nil)
} catch {
log.warning("AppWindowCapture: ScreenCaptureKit capture failed: \(error.localizedDescription, privacy: .public)")
return nil
let message = "Screen capture failed: \(error.localizedDescription) — Screen Recording permission may be required (System Settings > Privacy & Security > Screen & System Audio Recording)"
log.warning("AppWindowCapture: \(message, privacy: .public)")
return PNGCaptureResult(pngBase64: nil, error: message)
}
}

Expand Down
48 changes: 48 additions & 0 deletions clients/macos/vellum-assistantTests/AppControlExecutorTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -82,5 +82,53 @@ final class AppControlExecutorTests: XCTestCase {
XCTAssertNil(result.windowBounds)
XCTAssertNil(result.pngBase64)
}

// MARK: - bounds-missing executionError selection

/// When ScreenCaptureKit fails (commonly: Screen Recording permission
/// missing), the capture surfaces a `captureError` even though the window
/// state may be `.running` or `.minimized`. The bounds-missing branch of
/// click/drag must propagate that message verbatim — that's the new signal
/// users need so they know to grant the permission.
func test_boundsMissingExecutionError_prefersCaptureError() {
let capture = AppWindowCapture.CaptureResult(
state: .running,
pngBase64: nil,
bounds: nil,
captureError: "Screen capture failed: permission denied — Screen Recording permission may be required"
)
let message = AppControlExecutor.boundsMissingExecutionError(capture)
XCTAssertEqual(
message,
"Screen capture failed: permission denied — Screen Recording permission may be required"
)
}

/// Falls back to a state-classification message when capture itself
/// succeeded (no captureError) but bounds are still unavailable — e.g.
/// the app is minimized to the Dock.
func test_boundsMissingExecutionError_fallsBackToStateMessage() {
let capture = AppWindowCapture.CaptureResult(
state: .minimized,
pngBase64: nil,
bounds: nil,
captureError: nil
)
let message = AppControlExecutor.boundsMissingExecutionError(capture)
XCTAssertEqual(message, "Window not visible (state=minimized)")
}

/// Even for `.missing` (process gone), the helper still returns a
/// non-empty fallback so click/drag never hand the daemon an empty error.
func test_boundsMissingExecutionError_missingState_hasFallback() {
let capture = AppWindowCapture.CaptureResult(
state: .missing,
pngBase64: nil,
bounds: nil,
captureError: nil
)
let message = AppControlExecutor.boundsMissingExecutionError(capture)
XCTAssertEqual(message, "Window not visible (state=missing)")
}
}
#endif
24 changes: 22 additions & 2 deletions clients/macos/vellum-assistantTests/AppWindowCaptureTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -34,24 +34,44 @@ final class AppWindowCaptureTests: XCTestCase {
XCTAssertNotNil(result.bounds, "Expected non-nil bounds for a running Finder window")

// pngBase64 may be nil if Screen Recording permission is not granted in this
// test environment. When it is present, validate the PNG magic header.
// test environment. When it is present, validate the PNG magic header
// and assert no captureError was reported. When it is absent, captureError
// should explain why (typically a Screen Recording permission hint).
if let pngBase64 = result.pngBase64 {
XCTAssertNil(
result.captureError,
"Expected no captureError when pngBase64 is present; got: \(result.captureError ?? "")"
)
let pngData = try XCTUnwrap(Data(base64Encoded: pngBase64))
XCTAssertGreaterThanOrEqual(pngData.count, 8, "PNG payload too small to contain magic bytes")

// PNG magic header: 0x89 0x50 0x4E 0x47.
let magic: [UInt8] = [0x89, 0x50, 0x4E, 0x47]
let prefix = Array(pngData.prefix(4))
XCTAssertEqual(prefix, magic, "PNG bytes do not begin with the PNG magic header")
} else {
// If we got no PNG, captureError must explain why so the daemon and
// LLM can surface that to the user (commonly: Screen Recording
// permission missing).
let error = result.captureError ?? ""
XCTAssertFalse(
error.isEmpty,
"Expected non-empty captureError when pngBase64 is nil for a running window"
)
}
}

func test_capture_unknownPid_returnsMissing() async {
/// A bogus PID is a "no window" failure — the result is `.missing` and
/// `captureError` stays `nil`. The error field is reserved for *capture*
/// failures (ScreenCaptureKit returned no image even though the window
/// existed), not window-state classification failures.
func test_capture_unknownPid_returnsMissingWithNoCaptureError() async {
let unknownPid: pid_t = 999_999
let result = await AppWindowCapture.capture(forPid: unknownPid)
XCTAssertEqual(result.state, .missing)
XCTAssertNil(result.pngBase64)
XCTAssertNil(result.bounds)
XCTAssertNil(result.captureError)
}
}
#endif