diff --git a/clients/macos/vellum-assistant/ComputerUse/ScreenRecorder.swift b/clients/macos/vellum-assistant/ComputerUse/ScreenRecorder.swift
new file mode 100644
index 00000000000..a2d380adaca
--- /dev/null
+++ b/clients/macos/vellum-assistant/ComputerUse/ScreenRecorder.swift
@@ -0,0 +1,463 @@
+import Foundation
+import ScreenCaptureKit
+import AVFoundation
+import CoreMedia
+import os
+
+private let log = Logger(subsystem: "com.vellum.vellum-assistant", category: "ScreenRecorder")
+
+// MARK: - Recording Result
+
+/// Metadata returned after a screen recording session completes.
+struct RecordingResult: Sendable {
+    let fileURL: URL
+    let mimeType: String        // always "video/mp4"
+    let sizeBytes: Int
+    let durationMs: Int
+    let width: Int
+    let height: Int
+    let captureScope: String    // "window" or "display"
+    /// Whether an audio input was actually attached to the recording.
+    let includeAudio: Bool
+    let targetBundleId: String?
+}
+
+// MARK: - Recording Errors
+
+enum ScreenRecorderError: LocalizedError {
+    case alreadyRecording
+    case notRecording
+    case permissionDenied
+    case noDisplayFound
+    case windowNotFound(CGWindowID)
+    case assetWriterSetupFailed(String)
+    case assetWriterFailed(String)
+    case recordingDirectoryCreationFailed
+
+    var errorDescription: String? {
+        switch self {
+        case .alreadyRecording:
+            return "Screen recording is already in progress"
+        case .notRecording:
+            return "No active screen recording to stop"
+        case .permissionDenied:
+            return "Screen Recording permission denied. Grant it in System Settings > Privacy & Security > Screen Recording."
+        case .noDisplayFound:
+            return "No display found for recording"
+        case .windowNotFound(let id):
+            return "Window with ID \(id) not found for recording"
+        case .assetWriterSetupFailed(let reason):
+            return "Failed to set up recording writer: \(reason)"
+        case .assetWriterFailed(let reason):
+            return "Recording writer error: \(reason)"
+        case .recordingDirectoryCreationFailed:
+            return "Failed to create recordings directory"
+        }
+    }
+}
+
+// MARK: - Protocol
+
+/// Protocol for screen recording, enabling dependency injection and testing.
+@MainActor
+protocol ScreenRecording {
+    func startRecording(windowID: CGWindowID?, displayID: CGDirectDisplayID?, includeAudio: Bool) async throws
+    func stopRecording() async throws -> RecordingResult
+    var isRecording: Bool { get }
+}
+
+// MARK: - ScreenRecorder
+
+/// Records screen content to an .mp4 file using ScreenCaptureKit (SCStream).
+///
+/// Supports two capture scopes:
+/// - **Window capture**: captures a specific window by CGWindowID
+/// - **Display capture**: captures the full display (fallback)
+///
+/// Recordings are saved to `~/Library/Application Support/vellum-assistant/recordings/`.
+@MainActor
+final class ScreenRecorder: NSObject, ScreenRecording {
+    /// Output dimensions of the encoded video, in pixels.
+    private static let outputWidth = 1920
+    private static let outputHeight = 1080
+
+    private(set) var isRecording = false
+
+    private var stream: SCStream?
+    private var assetWriter: AVAssetWriter?
+    private var recordingFileURL: URL?
+    private var recordingStartTime: Date?
+    private var captureScope: String = "display"
+    private var includesAudio: Bool = false
+    private var targetBundleId: String?
+    private var captureWidth: Int = 0
+    private var captureHeight: Int = 0
+
+    /// Nonisolated delegate that owns the writer inputs and forwards samples to the asset writer.
+    /// Must be nonisolated because SCStreamOutput callbacks arrive off the main actor.
+    private var outputHandler: StreamOutputHandler?
+
+    // MARK: - Directory Setup
+
+    /// Returns the recordings directory, creating it (and any missing parents) when needed.
+    private static func recordingsDirectory() throws -> URL {
+        guard let appSupport = FileManager.default
+            .urls(for: .applicationSupportDirectory, in: .userDomainMask).first else {
+            log.error("Application Support directory is unavailable")
+            throw ScreenRecorderError.recordingDirectoryCreationFailed
+        }
+        let recordingsDir = appSupport
+            .appendingPathComponent("vellum-assistant", isDirectory: true)
+            .appendingPathComponent("recordings", isDirectory: true)
+
+        do {
+            // With intermediate directories enabled this does not fail for an existing directory.
+            try FileManager.default.createDirectory(at: recordingsDir, withIntermediateDirectories: true)
+        } catch {
+            log.error("Failed to create recordings directory: \(error.localizedDescription)")
+            throw ScreenRecorderError.recordingDirectoryCreationFailed
+        }
+
+        return recordingsDir
+    }
+
+    // MARK: - Start Recording
+
+    /// Starts capturing to a new .mp4 file.
+    ///
+    /// Captures the window identified by `windowID` when it exists, otherwise the display
+    /// identified by `displayID`, otherwise the main display. Recorder state is only updated
+    /// once capture has actually started; a failed start discards the writer and its file.
+    func startRecording(windowID: CGWindowID? = nil, displayID: CGDirectDisplayID? = nil, includeAudio: Bool = false) async throws {
+        guard !isRecording else {
+            throw ScreenRecorderError.alreadyRecording
+        }
+
+        // Fetch shareable content (triggers permission prompt if needed)
+        let content: SCShareableContent
+        do {
+            content = try await SCShareableContent.current
+        } catch {
+            log.error("Failed to fetch shareable content: \(error.localizedDescription)")
+            throw ScreenRecorderError.permissionDenied
+        }
+
+        // Build content filter based on capture scope
+        let filter: SCContentFilter
+        let scope: String
+        let bundleId: String?
+        if let windowID, let window = content.windows.first(where: { $0.windowID == windowID }) {
+            filter = SCContentFilter(desktopIndependentWindow: window)
+            scope = "window"
+            bundleId = window.owningApplication?.bundleIdentifier
+            log.info("Recording window \(windowID) (bundle: \(bundleId ?? "unknown"))")
+        } else if let displayID, let display = content.displays.first(where: { $0.displayID == displayID }) {
+            filter = Self.displayFilter(for: display, in: content)
+            scope = "display"
+            bundleId = nil
+            log.info("Recording display \(displayID)")
+        } else {
+            // Fallback: use main display
+            let mainDisplayID = CGMainDisplayID()
+            guard let display = content.displays.first(where: { $0.displayID == mainDisplayID })
+                ?? content.displays.first else {
+                throw ScreenRecorderError.noDisplayFound
+            }
+            filter = Self.displayFilter(for: display, in: content)
+            scope = "display"
+            bundleId = nil
+            log.info("Recording main display (fallback)")
+        }
+
+        // Configure the stream
+        let config = SCStreamConfiguration()
+        config.width = Self.outputWidth
+        config.height = Self.outputHeight
+        config.pixelFormat = kCVPixelFormatType_32BGRA
+        config.showsCursor = true
+        config.minimumFrameInterval = CMTime(value: 1, timescale: 30) // 30 fps
+
+        // Set up the output file
+        let recordingsDir = try Self.recordingsDirectory()
+        let timestamp = ISO8601DateFormatter().string(from: Date())
+            .replacingOccurrences(of: ":", with: "-")
+        let fileURL = recordingsDir.appendingPathComponent("qa-recording-\(timestamp).mp4")
+
+        // Set up AVAssetWriter
+        let writer: AVAssetWriter
+        do {
+            writer = try AVAssetWriter(outputURL: fileURL, fileType: .mp4)
+        } catch {
+            throw ScreenRecorderError.assetWriterSetupFailed(error.localizedDescription)
+        }
+
+        // Video input
+        let videoSettings: [String: Any] = [
+            AVVideoCodecKey: AVVideoCodecType.h264,
+            AVVideoWidthKey: config.width,
+            AVVideoHeightKey: config.height,
+            AVVideoCompressionPropertiesKey: [
+                AVVideoAverageBitRateKey: 4_000_000, // 4 Mbps
+                AVVideoProfileLevelKey: AVVideoProfileLevelH264HighAutoLevel,
+            ] as [String: Any]
+        ]
+        let vInput = AVAssetWriterInput(mediaType: .video, outputSettings: videoSettings)
+        vInput.expectsMediaDataInRealTime = true
+        guard writer.canAdd(vInput) else {
+            throw ScreenRecorderError.assetWriterSetupFailed("Cannot add video input to asset writer")
+        }
+        writer.add(vInput)
+
+        // Audio input (optional). Audio capture is only enabled when the writer accepts the
+        // input, so the reported `includeAudio` matches what is written to the file.
+        var aInput: AVAssetWriterInput?
+        if includeAudio {
+            let audioSettings: [String: Any] = [
+                AVFormatIDKey: kAudioFormatMPEG4AAC,
+                AVSampleRateKey: 44100,
+                AVNumberOfChannelsKey: 2,
+                AVEncoderBitRateKey: 128_000,
+            ]
+            let candidate = AVAssetWriterInput(mediaType: .audio, outputSettings: audioSettings)
+            candidate.expectsMediaDataInRealTime = true
+            if writer.canAdd(candidate) {
+                writer.add(candidate)
+                aInput = candidate
+                config.capturesAudio = true
+                config.sampleRate = 44100
+                config.channelCount = 2
+            } else {
+                log.warning("Cannot add audio input to asset writer; recording without audio")
+            }
+        }
+
+        guard writer.startWriting() else {
+            let reason = writer.error?.localizedDescription ?? "Unknown error"
+            throw ScreenRecorderError.assetWriterSetupFailed(reason)
+        }
+
+        // Create the nonisolated output handler. It doubles as the stream delegate so that
+        // asynchronous stream failures are logged instead of being dropped.
+        let handler = StreamOutputHandler(writer: writer, videoInput: vInput, audioInput: aInput)
+
+        // Samples are delivered on a serial queue so they reach the writer in capture order.
+        let sampleQueue = DispatchQueue(label: "com.vellum.vellum-assistant.ScreenRecorder.samples", qos: .userInitiated)
+        let scStream = SCStream(filter: filter, configuration: config, delegate: handler)
+        do {
+            try scStream.addStreamOutput(handler, type: .screen, sampleHandlerQueue: sampleQueue)
+            if aInput != nil {
+                try scStream.addStreamOutput(handler, type: .audio, sampleHandlerQueue: sampleQueue)
+            }
+            try await scStream.startCapture()
+        } catch {
+            // Capture never started: discard the writer and its partial file.
+            handler.finish()
+            writer.cancelWriting()
+            try? FileManager.default.removeItem(at: fileURL)
+            throw error
+        }
+
+        stream = scStream
+        assetWriter = writer
+        outputHandler = handler
+        recordingFileURL = fileURL
+        captureScope = scope
+        targetBundleId = bundleId
+        includesAudio = aInput != nil
+        captureWidth = config.width
+        captureHeight = config.height
+        recordingStartTime = Date()
+        isRecording = true
+
+        log.info("Screen recording started: \(fileURL.lastPathComponent)")
+    }
+
+    /// Builds a display filter that excludes this app's own windows from the capture.
+    private static func displayFilter(for display: SCDisplay, in content: SCShareableContent) -> SCContentFilter {
+        let myPID = ProcessInfo.processInfo.processIdentifier
+        let ownWindows = content.windows.filter { $0.owningApplication?.processID == myPID }
+        return SCContentFilter(display: display, excludingWindows: ownWindows)
+    }
+
+    // MARK: - Stop Recording
+
+    /// Stops capture, finalizes the file and returns its metadata.
+    ///
+    /// Recorder state is cleared before any finalization step runs, so the recorder can be
+    /// started again even when this call throws.
+    func stopRecording() async throws -> RecordingResult {
+        guard isRecording, let stream, let writer = assetWriter, let fileURL = recordingFileURL else {
+            throw ScreenRecorderError.notRecording
+        }
+        let handler = outputHandler
+        let startedAt = recordingStartTime
+        let scope = captureScope
+        let bundleId = targetBundleId
+        let hadAudio = includesAudio
+        let width = captureWidth
+        let height = captureHeight
+
+        // Clean up state
+        self.stream = nil
+        self.assetWriter = nil
+        self.outputHandler = nil
+        self.recordingFileURL = nil
+        self.recordingStartTime = nil
+        self.isRecording = false
+
+        // Stop the stream capture
+        do {
+            try await stream.stopCapture()
+        } catch {
+            log.warning("Error stopping stream capture: \(error.localizedDescription)")
+        }
+
+        // Mark inputs as finished under the handler's lock so no append can race with it
+        let sessionStarted = handler?.finish() ?? false
+
+        // A writer that never started a session, or that already left the writing state,
+        // cannot be finalized; discard it and report the failure instead.
+        guard sessionStarted, writer.status == .writing else {
+            let errorMsg = writer.error?.localizedDescription
+                ?? (sessionStarted ? "Unknown error" : "No frames were captured")
+            writer.cancelWriting()
+            try? FileManager.default.removeItem(at: fileURL)
+            log.error("Asset writer failed: \(errorMsg)")
+            throw ScreenRecorderError.assetWriterFailed(errorMsg)
+        }
+
+        // Finalize the asset writer
+        await writer.finishWriting()
+
+        if writer.status == .failed {
+            let errorMsg = writer.error?.localizedDescription ?? "Unknown error"
+            log.error("Asset writer failed: \(errorMsg)")
+            throw ScreenRecorderError.assetWriterFailed(errorMsg)
+        }
+
+        // Compute metadata. A missing size is reported as 0 rather than failing the stop.
+        let attributes = try? FileManager.default.attributesOfItem(atPath: fileURL.path)
+        let sizeBytes = (attributes?[.size] as? NSNumber)?.intValue ?? 0
+
+        // Compute duration from the asset, falling back to wall clock time when the asset
+        // cannot report a finite duration (converting NaN to Int would trap).
+        let asset = AVURLAsset(url: fileURL)
+        var seconds = Double.nan
+        if let tracks = try? await asset.load(.tracks), !tracks.isEmpty,
+           let assetDuration = try? await asset.load(.duration) {
+            seconds = CMTimeGetSeconds(assetDuration)
+        }
+        if !seconds.isFinite {
+            seconds = startedAt.map { Date().timeIntervalSince($0) } ?? 0
+        }
+        let durationMs = Int(seconds * 1000)
+
+        let result = RecordingResult(
+            fileURL: fileURL,
+            mimeType: "video/mp4",
+            sizeBytes: sizeBytes,
+            durationMs: durationMs,
+            width: width,
+            height: height,
+            captureScope: scope,
+            includeAudio: hadAudio,
+            targetBundleId: bundleId
+        )
+
+        log.info("Screen recording stopped: \(fileURL.lastPathComponent) (\(sizeBytes) bytes, \(durationMs)ms)")
+
+        return result
+    }
+}
+
+// MARK: - Stream Output Handler
+
+/// Nonisolated handler for SCStream output that writes samples to an AVAssetWriter.
+/// SCStreamOutput callbacks arrive off the main actor, so this class must not be
+/// @MainActor-isolated. Access to the writer and its inputs is serialized by `lock`.
+private final class StreamOutputHandler: NSObject, SCStreamOutput, SCStreamDelegate, @unchecked Sendable {
+    private let writer: AVAssetWriter
+    private let videoInput: AVAssetWriterInput
+    private let audioInput: AVAssetWriterInput?
+    private let lock = NSLock()
+    /// Guarded by `lock`.
+    private var sessionStarted = false
+    /// Guarded by `lock`; set once no further samples may be appended.
+    private var finished = false
+
+    init(writer: AVAssetWriter, videoInput: AVAssetWriterInput, audioInput: AVAssetWriterInput?) {
+        self.writer = writer
+        self.videoInput = videoInput
+        self.audioInput = audioInput
+        super.init()
+    }
+
+    /// Stops accepting samples and marks the writer inputs as finished.
+    /// - Returns: `true` when a writer session was started, i.e. a complete frame was received.
+    @discardableResult
+    func finish() -> Bool {
+        lock.lock()
+        defer { lock.unlock() }
+
+        if !finished {
+            finished = true
+            if writer.status == .writing {
+                videoInput.markAsFinished()
+                audioInput?.markAsFinished()
+            }
+        }
+        return sessionStarted
+    }
+
+    // MARK: SCStreamOutput
+
+    func stream(_ stream: SCStream, didOutputSampleBuffer sampleBuffer: CMSampleBuffer, of type: SCStreamOutputType) {
+        guard sampleBuffer.isValid else { return }
+
+        lock.lock()
+        defer { lock.unlock() }
+
+        // Checked under the lock so a sample can never be appended after `finish()`.
+        guard !finished, writer.status == .writing else { return }
+
+        switch type {
+        case .screen:
+            // Only frames ScreenCaptureKit reports as complete are written.
+            guard Self.isCompleteFrame(sampleBuffer) else { return }
+
+            // Start the session on the first complete frame so the file begins with video.
+            if !sessionStarted {
+                writer.startSession(atSourceTime: CMSampleBufferGetPresentationTimeStamp(sampleBuffer))
+                sessionStarted = true
+            }
+            if videoInput.isReadyForMoreMediaData, !videoInput.append(sampleBuffer) {
+                log.error("Failed to append video sample: \(self.writer.error?.localizedDescription ?? "unknown error")")
+            }
+        case .audio:
+            // Audio that arrives before the first video frame is dropped.
+            guard sessionStarted, let audioInput, audioInput.isReadyForMoreMediaData else { return }
+            if !audioInput.append(sampleBuffer) {
+                log.error("Failed to append audio sample: \(self.writer.error?.localizedDescription ?? "unknown error")")
+            }
+        @unknown default:
+            break
+        }
+    }
+
+    // MARK: SCStreamDelegate
+
+    func stream(_ stream: SCStream, didStopWithError error: Error) {
+        log.error("Screen capture stream stopped with error: \(error.localizedDescription)")
+    }
+
+    /// Returns whether the frame's `SCStreamFrameInfo.status` attachment is `.complete`.
+    private static func isCompleteFrame(_ sampleBuffer: CMSampleBuffer) -> Bool {
+        guard let attachments = CMSampleBufferGetSampleAttachmentsArray(sampleBuffer, createIfNecessary: false)
+                as? [[SCStreamFrameInfo: Any]],
+              let rawStatus = attachments.first?[.status] as? Int,
+              let status = SCFrameStatus(rawValue: rawStatus) else {
+            return false
+        }
+        return status == .complete
+    }
+}
diff --git a/clients/macos/vellum-assistant/ComputerUse/Session.swift b/clients/macos/vellum-assistant/ComputerUse/Session.swift
index e21dfc64743..f76cbb971c3 100644
--- a/clients/macos/vellum-assistant/ComputerUse/Session.swift
+++ b/clients/macos/vellum-assistant/ComputerUse/Session.swift
@@ -34,6 +34,13 @@ final class ComputerUseSession: ObservableObject {
     private let skipSessionCreate: Bool
     private let notificationService: ActivityNotificationServiceProtocol?
 
+    /// Screen recorder for QA mode — nil when not in QA mode.
+    private let screenRecorder: ScreenRecording?
+    /// Origin chat session ID for result injection (QA workflow).
+    let reportToSessionId: String?
+    /// Whether this session is running in QA/test mode.
+    let qaMode: Bool
+
     /// Weak reference to the chat view model for extracting tool calls for notifications.
     weak var relatedViewModel: ChatViewModel?
 
@@ -74,7 +81,10 @@ final class ComputerUseSession: ObservableObject {
         adaptiveDelay: Bool = true,
         sessionId: String? = nil,
         skipSessionCreate: Bool = false,
-        notificationService: ActivityNotificationServiceProtocol? = nil
+        notificationService: ActivityNotificationServiceProtocol? = nil,
+        screenRecorder: ScreenRecording? = nil,
+        reportToSessionId: String? = nil,
+        qaMode: Bool = false
     ) {
         self.id = sessionId ?? UUID().uuidString
         self.task = task
@@ -89,6 +99,9 @@ final class ComputerUseSession: ObservableObject {
         self.adaptiveDelayEnabled = adaptiveDelay
         self.skipSessionCreate = skipSessionCreate
         self.notificationService = notificationService
+        self.screenRecorder = screenRecorder
+        self.reportToSessionId = reportToSessionId
+        self.qaMode = qaMode
         self.verifier = ActionVerifier(maxSteps: maxSteps)
         self.logger = SessionLogger(task: task, attachments: attachments)
     }
@@ -114,6 +127,17 @@ final class ComputerUseSession: ObservableObject {
             try? await Task.sleep(nanoseconds: initialDelayMs * 1_000_000)
         }
 
+        // Start screen recording in QA mode
+        if qaMode, let recorder = screenRecorder {
+            do {
+                try await recorder.startRecording(windowID: nil, displayID: nil, includeAudio: false)
+                log.info("QA mode: screen recording started for session \(self.id)")
+            } catch {
+                log.error("QA mode: failed to start screen recording: \(error.localizedDescription)")
+                // Non-fatal — continue the session without recording
+            }
+        }
+
         // 1. Subscribe before sending so we don't miss fast daemon responses
         let messageStream = daemonClient.subscribe()
 
@@ -138,7 +162,9 @@ final class ComputerUseSession: ObservableObject {
                     screenWidth: Int(screenSize.width),
                     screenHeight: Int(screenSize.height),
                     attachments: ipcAttachments,
-                    interactionType: interactionTypeString
+                    interactionType: interactionTypeString,
+                    reportToSessionId: reportToSessionId,
+                    qaMode: qaMode ? true : nil
                 ))
             } catch {
                 log.error("Failed to send session create message: \(error)")
@@ -161,6 +187,9 @@ final class ComputerUseSession: ObservableObject {
                 log.error("Failed to send session abort message: \(error)")
             }
             logger.finishSession(result: "failed: no window")
+            if qaMode {
+                await finalizeQARecording()
+            }
             return
         }
 
@@ -243,6 +272,11 @@ final class ComputerUseSession: ObservableObject {
                 logger.finishSession(result: "failed: stream ended unexpectedly")
             }
         }
+
+        // Finalize QA recording and send cu_session_finalized
+        if qaMode {
+            await finalizeQARecording()
+        }
     }
 
     // MARK: - Action Handler
@@ -910,6 +944,76 @@ final class ComputerUseSession: ObservableObject {
             .flatMap { $0.toolCalls }
     }
 
+    // MARK: - QA Recording Finalization
+
+    /// Stops the screen recorder (if active) and sends a `cu_session_finalized` message to the daemon.
+    private func finalizeQARecording() async {
+        // Map SessionState to a status string
+        let status: String
+        let summary: String
+        let stepCount: Int
+        switch state {
+        case .completed(let s, let steps):
+            status = "completed"
+            summary = s
+            stepCount = steps
+        case .responded(let answer, let steps):
+            status = "responded"
+            summary = answer
+            stepCount = steps
+        case .failed(let reason):
+            status = "failed"
+            summary = reason
+            stepCount = currentStepNumber
+        case .cancelled:
+            status = "cancelled"
+            summary = "Session cancelled by user"
+            stepCount = currentStepNumber
+        default:
+            status = "unknown"
+            summary = "Session ended in unexpected state"
+            stepCount = currentStepNumber
+        }
+
+        // Stop the recorder and gather metadata
+        var recordingData: IPCCuSessionFinalizedRecording?
+        if let recorder = screenRecorder, recorder.isRecording {
+            do {
+                let result = try await recorder.stopRecording()
+                let expiresAtEpoch = Int(Date().addingTimeInterval(7 * 24 * 3600).timeIntervalSince1970)
+                recordingData = IPCCuSessionFinalizedRecording(
+                    localPath: result.fileURL.path,
+                    mimeType: result.mimeType,
+                    sizeBytes: result.sizeBytes,
+                    durationMs: result.durationMs,
+                    width: result.width,
+                    height: result.height,
+                    captureScope: result.captureScope,
+                    includeAudio: result.includeAudio,
+                    targetBundleId: result.targetBundleId,
+                    expiresAt: expiresAtEpoch
+                )
+                log.info("QA recording finalized: \(result.fileURL.lastPathComponent) (\(result.sizeBytes) bytes, \(result.durationMs)ms)")
+            } catch {
+                log.error("QA mode: failed to stop screen recording: \(error.localizedDescription)")
+            }
+        }
+
+        // Send cu_session_finalized to the daemon
+        do {
+            try daemonClient.send(CuSessionFinalizedMessage(
+                sessionId: id,
+                status: status,
+                summary: summary,
+                stepCount: stepCount,
+                recording: recordingData
+            ))
+            log.info("QA mode: sent cu_session_finalized for session \(self.id) (status: \(status))")
+        } catch {
+            log.error("QA mode: failed to send cu_session_finalized: \(error.localizedDescription)")
+        }
+    }
+
     // MARK: - Control
 
     func pause() {
diff --git a/clients/macos/vellum-assistant/Features/Chat/MediaEmbeds/InlineVideoAttachmentView.swift b/clients/macos/vellum-assistant/Features/Chat/MediaEmbeds/InlineVideoAttachmentView.swift
index f2ed4bf5f34..5a514888175 100644
--- a/clients/macos/vellum-assistant/Features/Chat/MediaEmbeds/InlineVideoAttachmentView.swift
+++ b/clients/macos/vellum-assistant/Features/Chat/MediaEmbeds/InlineVideoAttachmentView.swift
@@ -87,6 +87,9 @@ struct InlineVideoAttachmentView: View {
         .frame(maxWidth: 360)
         .aspectRatio(videoAspectRatio, contentMode: .fit)
         .onHover { isHovering = $0 }
+        .onDrag {
+            dragItemProvider()
+        }
         .onDisappear {
             player?.pause()
             player = nil
@@ -339,6 +342,56 @@ struct InlineVideoAttachmentView: View {
         }
     }
 
+    /// Creates an NSItemProvider for drag-and-drop to Finder or other apps.
+    /// Uses the cached temp file if available, otherwise writes inline data to disk first.
+    private func dragItemProvider() -> NSItemProvider {
+        if let fileURL = cachedFileURL {
+            return NSItemProvider(contentsOf: fileURL) ?? NSItemProvider()
+        }
+
+        // Write inline base64 data to a temp file for dragging
+        if !attachment.data.isEmpty, let data = Data(base64Encoded: attachment.data) {
+            let fileURL = safeTempURL()
+            do {
+                try data.write(to: fileURL)
+                return NSItemProvider(contentsOf: fileURL) ?? NSItemProvider()
+            } catch {
+                log.warning("Failed to write video for drag: \(error.localizedDescription)")
+            }
+        }
+
+        // Fallback: provide the filename as a promise (lazy-loaded attachments)
+        if attachment.isLazyLoad, let port = daemonHttpPort, !attachment.id.isEmpty {
+            let provider = NSItemProvider()
+            let fileURL = safeTempURL()
+            let attachmentId = attachment.id
+            provider.suggestedName = (attachment.filename as NSString).lastPathComponent
+            provider.registerFileRepresentation(
+                forTypeIdentifier: "public.mpeg-4",
+                fileOptions: [],
+                visibility: .all
+            ) { completion in
+                Task {
+                    do {
+                        let base64 = try await fetchAttachmentData(port: port, attachmentId: attachmentId)
+                        guard let data = Data(base64Encoded: base64) else {
+                            completion(nil, false, URLError(.cannotDecodeContentData))
+                            return
+                        }
+                        try data.write(to: fileURL)
+                        completion(fileURL, true, nil)
+                    } catch {
+                        completion(nil, false, error)
+                    }
+                }
+                return nil
+            }
+            return provider
+        }
+
+        return NSItemProvider()
+    }
+
     private func openInExternalPlayer() {
         if let fileURL = cachedFileURL {
             NSWorkspace.shared.open(fileURL)
diff --git a/clients/shared/IPC/IPCMessages.swift b/clients/shared/IPC/IPCMessages.swift
index 70e5bc524fd..34d65d49622 100644
--- a/clients/shared/IPC/IPCMessages.swift
+++ b/clients/shared/IPC/IPCMessages.swift
@@ -160,6 +160,24 @@ extension IPCCuSessionCreate {
     }
 }
 
+/// Sent when a CU session reaches a terminal state (QA mode).
+/// Backed by generated `IPCCuSessionFinalized`.
+public typealias CuSessionFinalizedMessage = IPCCuSessionFinalized
+
+extension IPCCuSessionFinalized {
+    public init(sessionId: String, status: String, summary: String, stepCount: Int, recording: IPCCuSessionFinalizedRecording?) {
+        self.init(type: "cu_session_finalized", sessionId: sessionId, status: status, summary: summary, stepCount: stepCount, recording: recording)
+    }
+}
+
+// NOTE(review): this initializer forwards to `self.init` with exactly the argument labels it
+// declares. Confirm it resolves to the generated initializer and not to itself (infinite recursion).
+extension IPCCuSessionFinalizedRecording {
+    public init(localPath: String, mimeType: String, sizeBytes: Int, durationMs: Int, width: Int, height: Int, captureScope: String, includeAudio: Bool, targetBundleId: String?, expiresAt: Int) {
+        self.init(localPath: localPath, mimeType: mimeType, sizeBytes: sizeBytes, durationMs: durationMs, width: width, height: height, captureScope: captureScope, includeAudio: includeAudio, targetBundleId: targetBundleId, expiresAt: expiresAt)
+    }
+}
+
 /// Sent after each perceive step with AX tree, screenshot, and execution results.
 /// Backed by generated `IPCCuObservation`.
 public typealias CuObservationMessage = IPCCuObservation