diff --git a/clients/macos/vellum-assistant/AppControl/AppControlExecutor.swift b/clients/macos/vellum-assistant/AppControl/AppControlExecutor.swift index c4fce88adfb..3283d0bcdb8 100644 --- a/clients/macos/vellum-assistant/AppControl/AppControlExecutor.swift +++ b/clients/macos/vellum-assistant/AppControl/AppControlExecutor.swift @@ -146,7 +146,10 @@ enum AppControlExecutor { pngBase64: capture.pngBase64, windowBounds: capture.bounds, executionResult: "observed: \(resolved.name) (pid=\(resolved.pid))", - executionError: nil + // Surface ScreenCaptureKit failures (commonly missing Screen + // Recording permission) so the daemon/LLM doesn't see a "successful" + // observe with no image and no signal to the user. + executionError: capture.captureError ) } @@ -283,10 +286,12 @@ enum AppControlExecutor { state: capture.state, pngBase64: capture.pngBase64, windowBounds: capture.bounds, - executionError: "Window not visible (state=\(capture.state.rawValue))" + executionError: boundsMissingExecutionError(capture) ) } + // Bounds came through, so a missing PNG is non-fatal: the click can + // proceed without a screenshot. Ignore `capture.captureError` here. do { try AppMouse.click( pid: resolved.pid, @@ -341,10 +346,12 @@ enum AppControlExecutor { state: capture.state, pngBase64: capture.pngBase64, windowBounds: capture.bounds, - executionError: "Window not visible (state=\(capture.state.rawValue))" + executionError: boundsMissingExecutionError(capture) ) } + // Bounds came through, so a missing PNG is non-fatal: the drag can + // proceed without a screenshot. Ignore `capture.captureError` here. do { try AppMouse.drag( pid: resolved.pid, @@ -387,6 +394,22 @@ enum AppControlExecutor { ) } + // MARK: - capture error mapping + + /// Pick an `executionError` value for the bounds-missing branch of click + /// and drag. Bounds are required by those tools to translate the + /// caller-supplied coordinates into screen space — so when bounds are + /// missing we always return a non-nil error. + /// + /// We prefer `capture.captureError` when present (it tells the user *why* + /// we couldn't get bounds — commonly missing Screen Recording permission) + /// over a bare state-classification message. Marked `internal` for unit + /// testing; not part of the public executor surface. + static func boundsMissingExecutionError(_ capture: AppWindowCapture.CaptureResult) -> String { + return capture.captureError + ?? "Window not visible (state=\(capture.state.rawValue))" + } + // MARK: - PID resolution /// Resolves a user-supplied app identifier to a running PID and a display diff --git a/clients/macos/vellum-assistant/AppControl/AppWindowCapture.swift b/clients/macos/vellum-assistant/AppControl/AppWindowCapture.swift index 2fc064e96a1..dafc8d1436d 100644 --- a/clients/macos/vellum-assistant/AppControl/AppWindowCapture.swift +++ b/clients/macos/vellum-assistant/AppControl/AppWindowCapture.swift @@ -17,8 +17,12 @@ private let log = Logger(subsystem: Bundle.appBundleIdentifier, category: "AppWi /// (e.g. minimized to Dock or app is hidden). /// - `.missing` — no running NSRunningApplication has the requested PID. /// -/// PNG bytes (base64) and `WindowBounds` are populated when a normal window was -/// found and ScreenCaptureKit was able to capture it (`state == .running`). +/// `WindowBounds` is populated whenever a layer-0 window matched +/// (`state == .running`). `pngBase64` is populated only when ScreenCaptureKit +/// also succeeded; on capture failure it stays `nil` and `captureError` +/// carries the underlying reason (most commonly: Screen Recording permission +/// is not granted). Callers must not assume "running implies image" — `state` +/// describes the window, `captureError` describes the screenshot. /// /// Window filtering uses `CGWindowListCopyWindowInfo` (which remains available in /// macOS 15) to identify on-screen layer-0 windows owned by the target PID. The @@ -30,6 +34,19 @@ enum AppWindowCapture { let state: HostAppControlState let pngBase64: String? let bounds: WindowBounds? + /// Set when ScreenCaptureKit failed even though the window exists. Most + /// commonly indicates missing Screen Recording permission. The window + /// `state` remains correctly classified (`.running`/`.minimized`/ + /// `.missing`); this field is an orthogonal signal that *capture* failed. + let captureError: String? + } + + /// Inner result for the ScreenCaptureKit path. Distinguishes "no PNG, here's + /// why" from "no PNG, no error" so callers can surface the failure reason + /// without conflating it with the window-state classification. + private struct PNGCaptureResult { + let pngBase64: String? + let error: String? } /// Capture the frontmost normal window owned by `pid`. See type docs for state semantics. @@ -56,13 +73,19 @@ enum AppWindowCapture { return CaptureResult( state: processIsAlive ? .minimized : .missing, pngBase64: nil, - bounds: nil + bounds: nil, + captureError: nil ) } let bounds = parseBounds(entry[kCGWindowBounds]) - let pngBase64 = await captureWindowPNG(windowID: windowNumber) - return CaptureResult(state: .running, pngBase64: pngBase64, bounds: bounds) + let png = await captureWindowPNG(windowID: windowNumber) + return CaptureResult( + state: .running, + pngBase64: png.pngBase64, + bounds: bounds, + captureError: png.error + ) } // MARK: - Private helpers @@ -83,12 +106,20 @@ enum AppWindowCapture { ) } - private static func captureWindowPNG(windowID: CGWindowID) async -> String? { + /// Capture a single PNG of `windowID` via ScreenCaptureKit. Returns the + /// base64 PNG on success, or a human-readable error message describing why + /// capture failed. Empirically, missing Screen Recording permission may + /// either *throw* (most common, observed in `ScreenCapture.swift`) or + /// silently return an empty `SCShareableContent.windows` list on some + /// macOS versions — we surface a permission hint in both branches so the + /// daemon and the LLM can suggest the right fix. + private static func captureWindowPNG(windowID: CGWindowID) async -> PNGCaptureResult { do { let shareable = try await SCShareableContent.current guard let scWindow = shareable.windows.first(where: { $0.windowID == windowID }) else { - log.warning("AppWindowCapture: SCShareableContent missing windowID \(windowID)") - return nil + let message = "ScreenCaptureKit could not find window \(windowID) — Screen Recording permission may be required (System Settings > Privacy & Security > Screen & System Audio Recording)" + log.warning("AppWindowCapture: \(message, privacy: .public)") + return PNGCaptureResult(pngBase64: nil, error: message) } let filter = SCContentFilter(desktopIndependentWindow: scWindow) @@ -102,10 +133,14 @@ enum AppWindowCapture { contentFilter: filter, configuration: config ) - return encodePNGBase64(cgImage: cgImage) + guard let png = encodePNGBase64(cgImage: cgImage) else { + return PNGCaptureResult(pngBase64: nil, error: "Failed to encode captured window as PNG") + } + return PNGCaptureResult(pngBase64: png, error: nil) } catch { - log.warning("AppWindowCapture: ScreenCaptureKit capture failed: \(error.localizedDescription, privacy: .public)") - return nil + let message = "Screen capture failed: \(error.localizedDescription) — Screen Recording permission may be required (System Settings > Privacy & Security > Screen & System Audio Recording)" + log.warning("AppWindowCapture: \(message, privacy: .public)") + return PNGCaptureResult(pngBase64: nil, error: message) } } diff --git a/clients/macos/vellum-assistantTests/AppControlExecutorTests.swift b/clients/macos/vellum-assistantTests/AppControlExecutorTests.swift index 8282acf09e0..3297c1e9863 100644 --- a/clients/macos/vellum-assistantTests/AppControlExecutorTests.swift +++ b/clients/macos/vellum-assistantTests/AppControlExecutorTests.swift @@ -82,5 +82,53 @@ final class AppControlExecutorTests: XCTestCase { XCTAssertNil(result.windowBounds) XCTAssertNil(result.pngBase64) } + + // MARK: - bounds-missing executionError selection + + /// When ScreenCaptureKit fails (commonly: Screen Recording permission + /// missing), the capture surfaces a `captureError` even though the window + /// state may be `.running` or `.minimized`. The bounds-missing branch of + /// click/drag must propagate that message verbatim — that's the new signal + /// users need so they know to grant the permission. + func test_boundsMissingExecutionError_prefersCaptureError() { + let capture = AppWindowCapture.CaptureResult( + state: .running, + pngBase64: nil, + bounds: nil, + captureError: "Screen capture failed: permission denied — Screen Recording permission may be required" + ) + let message = AppControlExecutor.boundsMissingExecutionError(capture) + XCTAssertEqual( + message, + "Screen capture failed: permission denied — Screen Recording permission may be required" + ) + } + + /// Falls back to a state-classification message when capture itself + /// succeeded (no captureError) but bounds are still unavailable — e.g. + /// the app is minimized to the Dock. + func test_boundsMissingExecutionError_fallsBackToStateMessage() { + let capture = AppWindowCapture.CaptureResult( + state: .minimized, + pngBase64: nil, + bounds: nil, + captureError: nil + ) + let message = AppControlExecutor.boundsMissingExecutionError(capture) + XCTAssertEqual(message, "Window not visible (state=minimized)") + } + + /// Even for `.missing` (process gone), the helper still returns a + /// non-empty fallback so click/drag never hand the daemon an empty error. + func test_boundsMissingExecutionError_missingState_hasFallback() { + let capture = AppWindowCapture.CaptureResult( + state: .missing, + pngBase64: nil, + bounds: nil, + captureError: nil + ) + let message = AppControlExecutor.boundsMissingExecutionError(capture) + XCTAssertEqual(message, "Window not visible (state=missing)") + } } #endif diff --git a/clients/macos/vellum-assistantTests/AppWindowCaptureTests.swift b/clients/macos/vellum-assistantTests/AppWindowCaptureTests.swift index 86a3c0a08ef..25e14ee76c0 100644 --- a/clients/macos/vellum-assistantTests/AppWindowCaptureTests.swift +++ b/clients/macos/vellum-assistantTests/AppWindowCaptureTests.swift @@ -34,8 +34,14 @@ final class AppWindowCaptureTests: XCTestCase { XCTAssertNotNil(result.bounds, "Expected non-nil bounds for a running Finder window") // pngBase64 may be nil if Screen Recording permission is not granted in this - // test environment. When it is present, validate the PNG magic header. + // test environment. When it is present, validate the PNG magic header + // and assert no captureError was reported. When it is absent, captureError + // should explain why (typically a Screen Recording permission hint). if let pngBase64 = result.pngBase64 { + XCTAssertNil( + result.captureError, + "Expected no captureError when pngBase64 is present; got: \(result.captureError ?? "")" + ) let pngData = try XCTUnwrap(Data(base64Encoded: pngBase64)) XCTAssertGreaterThanOrEqual(pngData.count, 8, "PNG payload too small to contain magic bytes") @@ -43,15 +49,29 @@ final class AppWindowCaptureTests: XCTestCase { let magic: [UInt8] = [0x89, 0x50, 0x4E, 0x47] let prefix = Array(pngData.prefix(4)) XCTAssertEqual(prefix, magic, "PNG bytes do not begin with the PNG magic header") + } else { + // If we got no PNG, captureError must explain why so the daemon and + // LLM can surface that to the user (commonly: Screen Recording + // permission missing). + let error = result.captureError ?? "" + XCTAssertFalse( + error.isEmpty, + "Expected non-empty captureError when pngBase64 is nil for a running window" + ) } } - func test_capture_unknownPid_returnsMissing() async { + /// A bogus PID is a "no window" failure — the result is `.missing` and + /// `captureError` stays `nil`. The error field is reserved for *capture* + /// failures (ScreenCaptureKit returned no image even though the window + /// existed), not window-state classification failures. + func test_capture_unknownPid_returnsMissingWithNoCaptureError() async { let unknownPid: pid_t = 999_999 let result = await AppWindowCapture.capture(forPid: unknownPid) XCTAssertEqual(result.state, .missing) XCTAssertNil(result.pngBase64) XCTAssertNil(result.bounds) + XCTAssertNil(result.captureError) } } #endif