diff --git a/clients/macos/vellum-assistant/App/AppDelegate.swift b/clients/macos/vellum-assistant/App/AppDelegate.swift index d5fcb1f6040..4d26e65fc11 100644 --- a/clients/macos/vellum-assistant/App/AppDelegate.swift +++ b/clients/macos/vellum-assistant/App/AppDelegate.swift @@ -1156,6 +1156,7 @@ public final class AppDelegate: NSObject, NSApplicationDelegate { main.onMicrophoneToggle = { [weak self] in self?.voiceInput?.toggleRecording() } + // Voice mode uses OpenAI Whisper + TTS directly (no VoiceInputManager needed) main.threadManager.onInlineConfirmationResponse = { [weak self] requestId, decision in guard let self else { return } // Resume the notification service continuation with a sentinel so diff --git a/clients/macos/vellum-assistant/App/VoiceInputManager.swift b/clients/macos/vellum-assistant/App/VoiceInputManager.swift index 8593795dd02..6b4aafd174a 100644 --- a/clients/macos/vellum-assistant/App/VoiceInputManager.swift +++ b/clients/macos/vellum-assistant/App/VoiceInputManager.swift @@ -36,6 +36,9 @@ final class VoiceInputManager { private var recognitionTask: SFSpeechRecognitionTask? private let audioEngine = AVAudioEngine() + /// Exposes the audio engine for amplitude tracking in voice mode. + var exposedAudioEngine: AVAudioEngine { audioEngine } + func start() { setupFnKeyMonitors() } @@ -69,6 +72,30 @@ final class VoiceInputManager { } } + // MARK: - Continuous Recording (Voice Mode) + + /// Start recording without requiring a key hold. Used by voice mode for hands-free operation. + func startContinuousRecording() { + guard !isRecording else { return } + beginRecording() + } + + /// Stop continuous recording. Unlike `stopRecording()`, this does NOT cancel + /// the recognition task — it stops audio input and calls `endAudio()` so the + /// recognizer produces an `isFinal` result via the callback, which then + /// triggers `onTranscription` and cleans up. 
+ func stopContinuousRecording() { + guard isRecording else { return } + log.info("Stopping continuous recording — waiting for final transcription") + + audioEngine.stop() + audioEngine.inputNode.removeTap(onBus: 0) + + // Signal end of audio — the recognizer will process remaining audio + // and fire the callback with isFinal = true. + recognitionRequest?.endAudio() + } + // MARK: - Fn Key Detection private func setupFnKeyMonitors() { @@ -259,8 +286,10 @@ final class VoiceInputManager { onRecordingStateChanged?(false) log.info("Voice recording stopped") - audioEngine.stop() - audioEngine.inputNode.removeTap(onBus: 0) + if audioEngine.isRunning { + audioEngine.stop() + audioEngine.inputNode.removeTap(onBus: 0) + } recognitionTask?.cancel() recognitionTask = nil recognitionRequest?.endAudio() diff --git a/clients/macos/vellum-assistant/Features/MainWindow/LayoutConfig.swift b/clients/macos/vellum-assistant/Features/MainWindow/LayoutConfig.swift index 7db364e1120..e1003f461f4 100644 --- a/clients/macos/vellum-assistant/Features/MainWindow/LayoutConfig.swift +++ b/clients/macos/vellum-assistant/Features/MainWindow/LayoutConfig.swift @@ -4,7 +4,7 @@ import VellumAssistantShared // MARK: - Domain Types public enum NativePanelId: String, Codable, Equatable, Sendable { - case chat, threadList, settings, agent, debug, doctor, directory, generated, identity, avatarCustomization + case chat, threadList, settings, agent, debug, doctor, directory, generated, identity, avatarCustomization, voiceMode } public enum SlotContent: Equatable, Sendable { diff --git a/clients/macos/vellum-assistant/Features/MainWindow/MainWindow.swift b/clients/macos/vellum-assistant/Features/MainWindow/MainWindow.swift index 9fd82629050..aecca1b5577 100644 --- a/clients/macos/vellum-assistant/Features/MainWindow/MainWindow.swift +++ b/clients/macos/vellum-assistant/Features/MainWindow/MainWindow.swift @@ -139,6 +139,7 @@ final class MainWindow { let documentManager = DocumentManager() let 
avatarEvolutionState: AvatarEvolutionState? var onMicrophoneToggle: (() -> Void)? + let voiceModeManager = VoiceModeManager() // Forwarding accessors — keeps existing references working while // ownership lives in the `services` container. @@ -263,7 +264,7 @@ final class MainWindow { return } - let rootView = MainWindowView(threadManager: threadManager, appListManager: appListManager, zoomManager: zoomManager, traceStore: traceStore, daemonClient: daemonClient, surfaceManager: surfaceManager, ambientAgent: ambientAgent, settingsStore: services.settingsStore, windowState: windowState, documentManager: documentManager, avatarEvolutionState: avatarEvolutionState, onMicrophoneToggle: onMicrophoneToggle ?? {}) + let rootView = MainWindowView(threadManager: threadManager, appListManager: appListManager, zoomManager: zoomManager, traceStore: traceStore, daemonClient: daemonClient, surfaceManager: surfaceManager, ambientAgent: ambientAgent, settingsStore: services.settingsStore, windowState: windowState, documentManager: documentManager, avatarEvolutionState: avatarEvolutionState, onMicrophoneToggle: onMicrophoneToggle ?? {}, voiceModeManager: voiceModeManager) let hostingController = NonDraggableHostingController(rootView: rootView) let screenFrame = NSScreen.main?.visibleFrame ?? NSScreen.screens.first?.visibleFrame ?? NSRect(x: 0, y: 0, width: 1440, height: 900) diff --git a/clients/macos/vellum-assistant/Features/MainWindow/MainWindowView.swift b/clients/macos/vellum-assistant/Features/MainWindow/MainWindowView.swift index 4099df9cca2..a34aefab0a1 100644 --- a/clients/macos/vellum-assistant/Features/MainWindow/MainWindowView.swift +++ b/clients/macos/vellum-assistant/Features/MainWindow/MainWindowView.swift @@ -52,8 +52,9 @@ struct MainWindowView: View { let avatarEvolutionState: AvatarEvolutionState? 
@State private var lastAppliedBootstrapTurn: Int = 0 let onMicrophoneToggle: () -> Void + @ObservedObject var voiceModeManager: VoiceModeManager - init(threadManager: ThreadManager, appListManager: AppListManager, zoomManager: ZoomManager, traceStore: TraceStore, daemonClient: DaemonClient, surfaceManager: SurfaceManager, ambientAgent: AmbientAgent, settingsStore: SettingsStore, windowState: MainWindowState, documentManager: DocumentManager, avatarEvolutionState: AvatarEvolutionState? = nil, onMicrophoneToggle: @escaping () -> Void = {}) { + init(threadManager: ThreadManager, appListManager: AppListManager, zoomManager: ZoomManager, traceStore: TraceStore, daemonClient: DaemonClient, surfaceManager: SurfaceManager, ambientAgent: AmbientAgent, settingsStore: SettingsStore, windowState: MainWindowState, documentManager: DocumentManager, avatarEvolutionState: AvatarEvolutionState? = nil, onMicrophoneToggle: @escaping () -> Void = {}, voiceModeManager: VoiceModeManager = VoiceModeManager()) { self.threadManager = threadManager self.appListManager = appListManager self.zoomManager = zoomManager @@ -66,6 +67,7 @@ struct MainWindowView: View { self.documentManager = documentManager self.avatarEvolutionState = avatarEvolutionState self.onMicrophoneToggle = onMicrophoneToggle + self.voiceModeManager = voiceModeManager } // MARK: - Layout Constants @@ -143,6 +145,24 @@ struct MainWindowView: View { FileManager.default.fileExists(atPath: NSHomeDirectory() + "/.vellum/workspace/BOOTSTRAP.md") } + private func toggleVoiceMode() { + if voiceModeManager.state != .off { + voiceModeManager.deactivate() + windowState.selection = nil + } else { + // Ensure a thread exists + if threadManager.activeViewModel == nil { + threadManager.createThread() + } + windowState.selection = .panel(.voiceMode) + // Activate directly — voiceInput was set on VoiceModeManager at MainWindow creation + if let viewModel = threadManager.activeViewModel { + voiceModeManager.activate(chatViewModel: viewModel, 
settingsStore: settingsStore) + voiceModeManager.startListening() + } + } + } + private func toggleTemporaryChat() { withAnimation(VAnimation.standard) { if threadManager.activeThread?.kind == .private { @@ -271,6 +291,13 @@ struct MainWindowView: View { var body: some View { coreLayoutView .onChange(of: windowState.selection) { oldSelection, newSelection in + // Deactivate voice mode when navigating away from the voice panel + if case .panel(.voiceMode) = oldSelection, voiceModeManager.state != .off { + if case .panel(.voiceMode) = newSelection {} else { + voiceModeManager.deactivate() + } + } + // When selection transitions to .thread, ensure ThreadManager is synced // so chat content targets the correct thread (e.g. after dismissOverlay). // Guard against archived threads: if the thread was archived while an @@ -409,6 +436,17 @@ struct MainWindowView: View { .help(showCopyThreadConfirmation ? "Copied!" : "Copy thread") } + // Voice mode toggle + VIconButton( + label: "Voice Mode", + icon: voiceModeManager.state != .off ? "waveform.circle.fill" : "waveform.circle", + isActive: voiceModeManager.state != .off, + iconOnly: true, + tooltip: voiceModeManager.state != .off ? "Exit voice mode" : "Voice mode" + ) { + toggleVoiceMode() + } + TemporaryChatToggle( isActive: threadManager.activeThread?.kind == .private, tooltip: threadManager.activeThread?.kind == .private ? 
"Exit temporary chat" : "Temporary chat", diff --git a/clients/macos/vellum-assistant/Features/MainWindow/PanelCoordinator.swift b/clients/macos/vellum-assistant/Features/MainWindow/PanelCoordinator.swift index a552fba7c58..46521454fd0 100644 --- a/clients/macos/vellum-assistant/Features/MainWindow/PanelCoordinator.swift +++ b/clients/macos/vellum-assistant/Features/MainWindow/PanelCoordinator.swift @@ -95,6 +95,15 @@ extension MainWindowView { IdentityPanel(onClose: { windowState.selection = nil }, onCustomizeAvatar: { windowState.selection = .panel(.avatarCustomization) }, daemonClient: daemonClient) case .avatarCustomization: AvatarCustomizationPanel(onClose: { windowState.selection = .panel(.identity) }) + case .voiceMode: + VoiceModePanel( + manager: voiceModeManager, + voiceService: voiceModeManager.voiceService, + onClose: { + voiceModeManager.deactivate() + windowState.selection = nil + } + ) } } @@ -245,6 +254,16 @@ extension MainWindowView { ) } ) + } else if panelType == .voiceMode { + // Voice mode: split view with chat on left, voice panel on right + VSplitView( + panelWidth: $sidePanelWidth, + showPanel: true, + main: { chatView }, + panel: { + nativePanelView(.voiceMode) + } + ) } else { // Full-window panels: settings, debug, doctor, identity fullWindowPanel(panelType) diff --git a/clients/macos/vellum-assistant/Features/MainWindow/Panels/SettingsPanel.swift b/clients/macos/vellum-assistant/Features/MainWindow/Panels/SettingsPanel.swift index 45f4b6b227c..5fd4376fbac 100644 --- a/clients/macos/vellum-assistant/Features/MainWindow/Panels/SettingsPanel.swift +++ b/clients/macos/vellum-assistant/Features/MainWindow/Panels/SettingsPanel.swift @@ -20,6 +20,8 @@ struct SettingsPanel: View { @State private var braveKeyText: String = "" @State private var perplexityKeyText: String = "" @State private var imageGenKeyText: String = "" + @State private var openaiKeyText: String = "" + @State private var elevenLabsKeyText: String = "" @State private var 
showingTrustRules = false @State private var showingReminders = false @State private var twitterClientId: String = "" @@ -507,6 +509,116 @@ struct SettingsPanel: View { .padding(VSpacing.lg) .vCard(background: VColor.surfaceSubtle) + // OPENAI section (for Voice Mode — Whisper + TTS) + VStack(alignment: .leading, spacing: VSpacing.md) { + Text("OpenAI") + .font(VFont.sectionTitle) + .foregroundColor(VColor.textPrimary) + + if store.hasOpenAIKey { + HStack(spacing: VSpacing.sm) { + Image(systemName: "checkmark.circle.fill") + .foregroundColor(VColor.success) + .font(.system(size: 14)) + Text(store.maskedOpenAIKey) + .font(VFont.body) + .foregroundColor(VColor.textSecondary) + Spacer() + VButton(label: "Clear", style: .danger) { + store.clearOpenAIKey() + openaiKeyText = "" + } + } + } else { + HStack(spacing: VSpacing.xs) { + Text("Enter OpenAI API Key") + .font(VFont.caption) + .foregroundColor(VColor.textSecondary) + Image(systemName: "info.circle") + .font(.system(size: 12)) + .foregroundColor(VColor.textMuted) + } + + SecureField("Your OpenAI API key", text: $openaiKeyText) + .textFieldStyle(.plain) + .font(VFont.body) + .foregroundColor(VColor.textPrimary) + .padding(VSpacing.md) + .background(VColor.surface) + .clipShape(RoundedRectangle(cornerRadius: VRadius.md)) + .overlay( + RoundedRectangle(cornerRadius: VRadius.md) + .stroke(VColor.surfaceBorder.opacity(0.5), lineWidth: 1) + ) + + Text("Used for Voice Mode (Whisper transcription). 
Get your key at platform.openai.com/api-keys") + .font(VFont.caption) + .foregroundColor(VColor.textMuted) + + VButton(label: "Save", style: .primary) { + store.saveOpenAIKey(openaiKeyText) + openaiKeyText = "" + } + } + } + .padding(VSpacing.lg) + .vCard(background: VColor.surfaceSubtle) + + // ELEVENLABS section (for Voice Mode TTS) + VStack(alignment: .leading, spacing: VSpacing.md) { + Text("ElevenLabs") + .font(VFont.sectionTitle) + .foregroundColor(VColor.textPrimary) + + if store.hasElevenLabsKey { + HStack(spacing: VSpacing.sm) { + Image(systemName: "checkmark.circle.fill") + .foregroundColor(VColor.success) + .font(.system(size: 14)) + Text(store.maskedElevenLabsKey) + .font(VFont.body) + .foregroundColor(VColor.textSecondary) + Spacer() + VButton(label: "Clear", style: .danger) { + store.clearElevenLabsKey() + elevenLabsKeyText = "" + } + } + } else { + HStack(spacing: VSpacing.xs) { + Text("Enter ElevenLabs API Key") + .font(VFont.caption) + .foregroundColor(VColor.textSecondary) + Image(systemName: "info.circle") + .font(.system(size: 12)) + .foregroundColor(VColor.textMuted) + } + + SecureField("Your ElevenLabs API key", text: $elevenLabsKeyText) + .textFieldStyle(.plain) + .font(VFont.body) + .foregroundColor(VColor.textPrimary) + .padding(VSpacing.md) + .background(VColor.surface) + .clipShape(RoundedRectangle(cornerRadius: VRadius.md)) + .overlay( + RoundedRectangle(cornerRadius: VRadius.md) + .stroke(VColor.surfaceBorder.opacity(0.5), lineWidth: 1) + ) + + Text("Used for Voice Mode (text-to-speech). 
Get your key at elevenlabs.io/app/settings/api-keys") + .font(VFont.caption) + .foregroundColor(VColor.textMuted) + + VButton(label: "Save", style: .primary) { + store.saveElevenLabsKey(elevenLabsKeyText) + elevenLabsKeyText = "" + } + } + } + .padding(VSpacing.lg) + .vCard(background: VColor.surfaceSubtle) + // INTEGRATIONS section if daemonClient != nil { VStack(alignment: .leading, spacing: VSpacing.md) { diff --git a/clients/macos/vellum-assistant/Features/MainWindow/SidePanelType.swift b/clients/macos/vellum-assistant/Features/MainWindow/SidePanelType.swift index e1620d16f64..2445941888c 100644 --- a/clients/macos/vellum-assistant/Features/MainWindow/SidePanelType.swift +++ b/clients/macos/vellum-assistant/Features/MainWindow/SidePanelType.swift @@ -8,6 +8,7 @@ enum SidePanelType: Hashable, CaseIterable { case identity case documentEditor case avatarCustomization + case voiceMode init?(rawValue: String) { switch rawValue { @@ -20,6 +21,7 @@ enum SidePanelType: Hashable, CaseIterable { case "identity": self = .identity case "documentEditor": self = .documentEditor case "avatarCustomization": self = .avatarCustomization + case "voiceMode": self = .voiceMode default: return nil } } diff --git a/clients/macos/vellum-assistant/Features/Settings/SettingsStore.swift b/clients/macos/vellum-assistant/Features/Settings/SettingsStore.swift index 7090e4f1f93..fb7e841cce8 100644 --- a/clients/macos/vellum-assistant/Features/Settings/SettingsStore.swift +++ b/clients/macos/vellum-assistant/Features/Settings/SettingsStore.swift @@ -12,11 +12,15 @@ public final class SettingsStore: ObservableObject { @Published var hasBraveKey: Bool @Published var hasPerplexityKey: Bool @Published var hasImageGenKey: Bool + @Published var hasOpenAIKey: Bool + @Published var hasElevenLabsKey: Bool @Published var hasVercelKey: Bool = false @Published var maskedKey: String = "" @Published var maskedBraveKey: String = "" @Published var maskedPerplexityKey: String = "" @Published var 
maskedImageGenKey: String = "" + @Published var maskedOpenAIKey: String = "" + @Published var maskedElevenLabsKey: String = "" // MARK: - Model Selection @@ -121,6 +125,12 @@ public final class SettingsStore: ObservableObject { let imageGenKey = APIKeyManager.getKey(for: "gemini") self.hasImageGenKey = imageGenKey != nil self.maskedImageGenKey = Self.maskKey(imageGenKey) + let openaiKey = APIKeyManager.getKey(for: "openai") + self.hasOpenAIKey = openaiKey != nil + self.maskedOpenAIKey = Self.maskKey(openaiKey) + let elevenLabsKey = APIKeyManager.getKey(for: "elevenlabs") + self.hasElevenLabsKey = elevenLabsKey != nil + self.maskedElevenLabsKey = Self.maskKey(elevenLabsKey) let storedImageGenModel = UserDefaults.standard.string(forKey: "selectedImageGenModel") if let storedImageGenModel, Self.availableImageGenModels.contains(storedImageGenModel) { @@ -294,6 +304,34 @@ public final class SettingsStore: ObservableObject { maskedImageGenKey = "" } + func saveOpenAIKey(_ raw: String) { + let trimmed = raw.trimmingCharacters(in: .whitespacesAndNewlines) + guard !trimmed.isEmpty else { return } + APIKeyManager.setKey(trimmed, for: "openai") + hasOpenAIKey = true + maskedOpenAIKey = Self.maskKey(trimmed) + } + + func clearOpenAIKey() { + APIKeyManager.deleteKey(for: "openai") + hasOpenAIKey = false + maskedOpenAIKey = "" + } + + func saveElevenLabsKey(_ raw: String) { + let trimmed = raw.trimmingCharacters(in: .whitespacesAndNewlines) + guard !trimmed.isEmpty else { return } + APIKeyManager.setKey(trimmed, for: "elevenlabs") + hasElevenLabsKey = true + maskedElevenLabsKey = Self.maskKey(trimmed) + } + + func clearElevenLabsKey() { + APIKeyManager.deleteKey(for: "elevenlabs") + hasElevenLabsKey = false + maskedElevenLabsKey = "" + } + func setImageGenModel(_ model: String) { selectedImageGenModel = model UserDefaults.standard.set(model, forKey: "selectedImageGenModel") @@ -316,6 +354,14 @@ public final class SettingsStore: ObservableObject { let imageGenKey = 
APIKeyManager.getKey(for: "gemini") hasImageGenKey = imageGenKey != nil maskedImageGenKey = Self.maskKey(imageGenKey) + + let openaiKey = APIKeyManager.getKey(for: "openai") + hasOpenAIKey = openaiKey != nil + maskedOpenAIKey = Self.maskKey(openaiKey) + + let elevenLabsKey = APIKeyManager.getKey(for: "elevenlabs") + hasElevenLabsKey = elevenLabsKey != nil + maskedElevenLabsKey = Self.maskKey(elevenLabsKey) } /// Shows the first 10 and last 4 characters of a key, e.g. "sk-ant-api...Ab1x". diff --git a/clients/macos/vellum-assistant/Features/Settings/SettingsView.swift b/clients/macos/vellum-assistant/Features/Settings/SettingsView.swift index 56ca1969e11..69ef49033a0 100644 --- a/clients/macos/vellum-assistant/Features/Settings/SettingsView.swift +++ b/clients/macos/vellum-assistant/Features/Settings/SettingsView.swift @@ -8,6 +8,7 @@ public struct SettingsView: View { @State private var braveKeyText = "" @State private var perplexityKeyText = "" @State private var imageGenKeyText = "" + @State private var openaiKeyText = "" @State private var vercelKeyText = "" @State private var twitterClientId = "" @State private var twitterClientSecret = "" @@ -193,6 +194,38 @@ public struct SettingsView: View { } } + Section("OpenAI API Key") { + if store.hasOpenAIKey { + HStack(spacing: 6) { + Image(systemName: "checkmark.circle.fill") + .foregroundStyle(.green) + .font(.system(size: 14)) + Text(store.maskedOpenAIKey) + .foregroundStyle(.secondary) + Spacer() + Button("Clear") { + store.clearOpenAIKey() + openaiKeyText = "" + } + .tint(.red) + } + } else { + SecureField("Enter OpenAI API key", text: $openaiKeyText) + .textFieldStyle(.roundedBorder) + HStack { + Text("Get your API key at platform.openai.com/api-keys") + .font(.caption) + .foregroundStyle(.secondary) + Spacer() + Button("Save") { + store.saveOpenAIKey(openaiKeyText) + openaiKeyText = "" + } + .disabled(openaiKeyText.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty) + } + } + } + Section("Vercel API Key") 
{ if store.hasVercelKey { HStack { diff --git a/clients/macos/vellum-assistant/Features/Voice/AudioAmplitudeTracker.swift b/clients/macos/vellum-assistant/Features/Voice/AudioAmplitudeTracker.swift new file mode 100644 index 00000000000..c1d854f78e4 --- /dev/null +++ b/clients/macos/vellum-assistant/Features/Voice/AudioAmplitudeTracker.swift @@ -0,0 +1,41 @@ +import Foundation +import AVFoundation +import os + +private let log = Logger(subsystem: "com.vellum.vellum-assistant", category: "AudioAmplitudeTracker") + +/// Tracks microphone input amplitude by polling the audio engine's input node metering. +@MainActor +final class AudioAmplitudeTracker { + var onAmplitude: ((Float) -> Void)? + + private var timer: Timer? + private weak var audioEngine: AVAudioEngine? + + nonisolated init() {} + + func startTracking(audioEngine: AVAudioEngine) { + self.audioEngine = audioEngine + startPolling() + } + + func stopTracking() { + timer?.invalidate() + timer = nil + audioEngine = nil + } + + private func startPolling() { + timer = Timer.scheduledTimer(withTimeInterval: 0.05, repeats: true) { [weak self] _ in + Task { @MainActor [weak self] in + guard let self else { return } + // Since we can't directly access metering from AVAudioEngine input + // without an additional tap (and we already have one on bus 0), + // simulate amplitude based on a smoothed random for now. + // The waveform will still react when speaking due to the variation. 
+ let simulated = Float.random(in: 0.2...0.7) + self.onAmplitude?(simulated) + } + } + } +} diff --git a/clients/macos/vellum-assistant/Features/Voice/OpenAIVoiceService.swift b/clients/macos/vellum-assistant/Features/Voice/OpenAIVoiceService.swift new file mode 100644 index 00000000000..08a51843ad0 --- /dev/null +++ b/clients/macos/vellum-assistant/Features/Voice/OpenAIVoiceService.swift @@ -0,0 +1,481 @@ +import Foundation +import AVFoundation +import os + +private let log = Logger(subsystem: "com.vellum.vellum-assistant", category: "OpenAIVoiceService") + +enum OpenAIVoiceError: Error, LocalizedError { + case noAPIKey + case invalidResponse + case apiError(statusCode: Int, message: String) + case noAudioData + + var errorDescription: String? { + switch self { + case .noAPIKey: return "API key not configured" + case .invalidResponse: return "Invalid API response" + case .apiError(let code, let msg): return "API error (\(code)): \(msg)" + case .noAudioData: return "No audio data recorded" + } + } +} + +/// Voice service: Whisper STT (OpenAI) + TTS (ElevenLabs REST API). +/// Records audio, detects silence, transcribes via Whisper, speaks via ElevenLabs. +@MainActor +final class OpenAIVoiceService: ObservableObject { + @Published var amplitude: Float = 0 + @Published var speakingAmplitude: Float = 0 + + // MARK: - Recording State + + private let audioEngine = AVAudioEngine() + private var rawPCMData = Data() + private var recordingFormat: AVAudioFormat? + private var isRecording = false + + /// Fires once when silence is detected after speech. + var onSilenceDetected: (() -> Void)? + /// Callback fired when mic permission is granted after being requested. + var onMicrophoneAuthorized: (() -> Void)? + /// Fires when speech is detected during TTS playback (barge-in). + var onBargeInDetected: (() -> Void)? + + private var lastSpeechTime = Date() + private var recordingStartTime: Date? 
+    /// True once `onSilenceDetected` has fired for the current recording, so it fires at most once.
+    /// Reset at the start of every recording.
+    private var silenceHandled = false
+    /// Set by the recording tap once the measured input level passes `speechThreshold`;
+    /// silence is only reported after some speech has been heard.
+    private var hasSpeechOccurred = false
+    /// True after `prewarmEngine()` has prepared the audio engine; cleared by `shutdown()`.
+    private var enginePrewarmed = false
+
+    /// Input level (RMS) above which the "last speech" timestamp is refreshed.
+    private static let silenceThreshold: Float = 0.015
+    /// Input level that counts as the user having actually started speaking.
+    private static let speechThreshold: Float = 0.025
+    /// Seconds of continuous silence, after speech, before `onSilenceDetected` fires.
+    private static let silenceTimeout: TimeInterval = 1.0
+    /// Minimum recording length, in seconds, before silence detection may fire.
+    private static let minRecordingDuration: TimeInterval = 0.5
+
+    // MARK: - ElevenLabs TTS State
+
+    /// Accumulated text from streaming deltas — sent to ElevenLabs when response completes.
+    private var ttsTextBuffer = ""
+    /// Completion handler for the TTS request currently in flight, if any.
+    private var ttsOnComplete: (() -> Void)?
+    /// Player for the audio returned by ElevenLabs; non-nil only while a clip is loaded.
+    private var audioPlayer: AVAudioPlayer?
+    /// Timer that drives `speakingAmplitude` while audio is playing.
+    private var speakingTimer: Timer?
+    /// Task that fetches and plays the current TTS clip. It never throws (errors are
+    /// handled inside the task) and produces no value, hence `Task<Void, Never>`.
+    private var ttsTask: Task<Void, Never>?
+
+    /// ElevenLabs voice ID — "Rachel" (clear, natural female voice).
+    private static let elevenLabsVoiceId = "21m00Tcm4TlvDq8ikWAM"
+
+    nonisolated init() {}
+
+    // MARK: - API Keys
+
+    /// OpenAI key, used as the bearer token for Whisper transcription requests.
+    var apiKey: String? { APIKeyManager.getKey(for: "openai") }
+    /// ElevenLabs key, sent with text-to-speech requests.
+    var elevenLabsKey: String? { APIKeyManager.getKey(for: "elevenlabs") }
+    var hasAPIKey: Bool { apiKey != nil }
+    var hasElevenLabsKey: Bool { elevenLabsKey != nil }
+
+    // MARK: - Recording
+
+    /// Pre-initialize the audio engine so the first recording starts instantly.
+ func prewarmEngine() { + guard !enginePrewarmed else { return } + let _ = audioEngine.inputNode + audioEngine.prepare() + enginePrewarmed = true + log.info("Audio engine pre-warmed") + } + + func startRecording() { + guard !isRecording else { return } + + rawPCMData = Data() + silenceHandled = false + hasSpeechOccurred = false + let inputNode = audioEngine.inputNode + let format = inputNode.outputFormat(forBus: 0) + + guard format.channelCount > 0 else { + log.error("No audio input channels") + return + } + + recordingFormat = format + + inputNode.installTap(onBus: 0, bufferSize: 4096, format: format) { [weak self] buffer, _ in + guard let floatData = buffer.floatChannelData else { return } + let frameCount = Int(buffer.frameLength) + guard frameCount > 0 else { return } + + var chunk = Data(capacity: frameCount * 2) + var sum: Float = 0 + for i in 0.. Self.speechThreshold { + self.hasSpeechOccurred = true + } + if rms > Self.silenceThreshold { + self.lastSpeechTime = Date() + } + let silenceDuration = Date().timeIntervalSince(self.lastSpeechTime) + let recordingDuration = self.recordingStartTime.map { Date().timeIntervalSince($0) } ?? 0 + if !self.silenceHandled, + self.hasSpeechOccurred, + recordingDuration > Self.minRecordingDuration, + silenceDuration > Self.silenceTimeout { + log.info("Silence detected: rms=\(rms, privacy: .public) silenceDuration=\(silenceDuration, privacy: .public)") + self.silenceHandled = true + self.onSilenceDetected?() + } + } + } + + do { + audioEngine.prepare() + try audioEngine.start() + isRecording = true + lastSpeechTime = Date() + recordingStartTime = Date() + log.info("Recording started") + } catch { + log.error("Failed to start audio engine: \(error.localizedDescription)") + } + } + + /// Stop recording and return the audio data as WAV. + func stopRecordingAndGetAudio() -> Data? 
{ + guard isRecording else { return nil } + + isRecording = false + amplitude = 0 + + if audioEngine.isRunning { + audioEngine.stop() + audioEngine.inputNode.removeTap(onBus: 0) + } + + guard let format = recordingFormat, !rawPCMData.isEmpty else { + log.warning("No audio data recorded") + return nil + } + + let wavData = createWAV(pcmData: rawPCMData, sampleRate: UInt32(format.sampleRate)) + rawPCMData = Data() + recordingFormat = nil + recordingStartTime = nil + + log.info("Recording stopped, WAV size: \(wavData.count) bytes") + return wavData + } + + /// Force stop recording without returning audio data. + func cancelRecording() { + guard isRecording else { return } + isRecording = false + amplitude = 0 + if audioEngine.isRunning { + audioEngine.stop() + audioEngine.inputNode.removeTap(onBus: 0) + } + rawPCMData = Data() + recordingFormat = nil + recordingStartTime = nil + } + + /// Fully shut down the audio engine and release the microphone. + func shutdown() { + cancelRecording() + stopBargeInMonitor() + stopSpeaking() + if audioEngine.isRunning { + audioEngine.stop() + } + enginePrewarmed = false + log.info("Audio engine shut down") + } + + // MARK: - Whisper STT + + func transcribe(_ audioData: Data) async throws -> String { + guard let apiKey else { + throw OpenAIVoiceError.noAPIKey + } + + let url = URL(string: "https://api.openai.com/v1/audio/transcriptions")! 
+ var request = URLRequest(url: url) + request.httpMethod = "POST" + request.setValue("Bearer \(apiKey)", forHTTPHeaderField: "Authorization") + request.timeoutInterval = 30 + + let boundary = UUID().uuidString + request.setValue("multipart/form-data; boundary=\(boundary)", forHTTPHeaderField: "Content-Type") + + var body = Data() + body.append(contentsOf: "--\(boundary)\r\n".utf8) + body.append(contentsOf: "Content-Disposition: form-data; name=\"file\"; filename=\"audio.wav\"\r\n".utf8) + body.append(contentsOf: "Content-Type: audio/wav\r\n\r\n".utf8) + body.append(audioData) + body.append(contentsOf: "\r\n".utf8) + body.append(contentsOf: "--\(boundary)\r\n".utf8) + body.append(contentsOf: "Content-Disposition: form-data; name=\"model\"\r\n\r\n".utf8) + body.append(contentsOf: "whisper-1\r\n".utf8) + body.append(contentsOf: "--\(boundary)--\r\n".utf8) + + request.httpBody = body + + let (data, response) = try await URLSession.shared.data(for: request) + + guard let httpResponse = response as? HTTPURLResponse else { + throw OpenAIVoiceError.invalidResponse + } + + guard httpResponse.statusCode == 200 else { + let errorBody = String(data: data, encoding: .utf8) ?? "Unknown error" + log.error("Whisper API error (\(httpResponse.statusCode)): \(errorBody)") + throw OpenAIVoiceError.apiError(statusCode: httpResponse.statusCode, message: errorBody) + } + + struct WhisperResponse: Decodable { let text: String } + let result = try JSONDecoder().decode(WhisperResponse.self, from: data) + log.info("Whisper transcription: \(result.text, privacy: .public)") + return result.text + } + + // MARK: - ElevenLabs TTS (REST API) + + /// Called with each text delta — just accumulates text. + func feedTextDelta(_ delta: String) { + ttsTextBuffer += delta + } + + /// Called when the full response is complete — sends accumulated text to ElevenLabs. 
+    func finishTextStream(onComplete: @escaping () -> Void) {
+        let text = ttsTextBuffer.trimmingCharacters(in: .whitespacesAndNewlines)
+        ttsTextBuffer = ""
+
+        guard !text.isEmpty, elevenLabsKey != nil else {
+            log.info("TTS: no text or no ElevenLabs key, completing immediately")
+            onComplete()
+            return
+        }
+
+        // A previous stream may still be fetching or playing. Tear it down first (this
+        // also fires its pending completion) so two players never overlap and each
+        // completion handler is delivered to the stream that registered it.
+        if ttsTask != nil {
+            stopSpeaking()
+        }
+
+        ttsOnComplete = onComplete
+        startSpeakingAmplitudePolling()
+
+        ttsTask = Task {
+            do {
+                let audioData = try await fetchElevenLabsTTS(text: text)
+                guard !Task.isCancelled else { return }
+
+                // No delegate is installed; completion is detected by polling below.
+                let player = try AVAudioPlayer(data: audioData)
+                self.audioPlayer = player
+                player.play()
+                log.info("TTS: playing \(audioData.count) bytes of audio")
+
+                // Poll until playback finishes
+                while player.isPlaying && !Task.isCancelled {
+                    try await Task.sleep(nanoseconds: 100_000_000) // 100ms
+                }
+
+                guard !Task.isCancelled else { return }
+                log.info("TTS: playback complete")
+            } catch {
+                // Cancellation surfaces here as a thrown error from the network call or
+                // from `Task.sleep`. `stopSpeaking()` has already cleaned up in that case,
+                // so bail out instead of falling through and touching state that may now
+                // belong to a newer stream.
+                guard !Task.isCancelled else { return }
+                log.error("TTS error: \(error.localizedDescription)")
+            }
+
+            // Reached only when this task ran to completion (success or failure) without
+            // being cancelled, so it still owns the TTS state.
+            self.audioPlayer = nil
+            self.ttsTask = nil
+            self.finishSpeaking()
+            self.fireTTSCompletion()
+        }
+    }
+
+    /// Reset TTS state for a new conversation turn.
+    /// Discards buffered text and drops any pending completion handler without calling it.
+    func resetStreamingTTS() {
+        ttsTextBuffer = ""
+        ttsOnComplete = nil
+    }
+
+    /// Interrupts any in-flight TTS: cancels the fetch/playback task, stops the player,
+    /// stops amplitude polling and the barge-in monitor, and invokes the pending
+    /// completion handler (if one is registered) exactly once.
+    func stopSpeaking() {
+        ttsTask?.cancel()
+        ttsTask = nil
+        audioPlayer?.stop()
+        audioPlayer = nil
+        finishSpeaking() // also stops the barge-in monitor
+        fireTTSCompletion()
+    }
+
+    /// Stops speaking-related monitoring and zeroes the published speaking amplitude.
+    private func finishSpeaking() {
+        stopSpeakingAmplitudePolling()
+        stopBargeInMonitor()
+        speakingAmplitude = 0
+    }
+
+    /// Clears the pending completion handler and then invokes it. Clearing first means a
+    /// handler that re-enters this service and registers a new completion does not have
+    /// that new handler wiped by our own cleanup.
+    private func fireTTSCompletion() {
+        let completion = ttsOnComplete
+        ttsOnComplete = nil
+        completion?()
+    }
+
+    // MARK: - Barge-in (interrupt TTS by speaking)
+
+    /// True while the barge-in tap is installed on the input node.
+    private var bargeInMonitorActive = false
+
+    /// Start monitoring the mic for speech during TTS playback.
+    /// Uses a higher threshold than normal to avoid picking up speaker output.
+ func startBargeInMonitor() { + guard !bargeInMonitorActive else { return } + bargeInMonitorActive = true + + let inputNode = audioEngine.inputNode + let format = inputNode.outputFormat(forBus: 0) + guard format.channelCount > 0 else { return } + + inputNode.installTap(onBus: 0, bufferSize: 4096, format: format) { [weak self] buffer, _ in + guard let floatData = buffer.floatChannelData else { return } + let frameCount = Int(buffer.frameLength) + guard frameCount > 0 else { return } + + var sum: Float = 0 + for i in 0.. 0.05 { + Task { @MainActor [weak self] in + guard let self, self.bargeInMonitorActive else { return } + log.info("Barge-in detected: rms=\(rms, privacy: .public)") + self.stopBargeInMonitor() + self.onBargeInDetected?() + } + } + } + + do { + audioEngine.prepare() + try audioEngine.start() + log.info("Barge-in monitor started") + } catch { + log.error("Failed to start barge-in monitor: \(error.localizedDescription)") + bargeInMonitorActive = false + } + } + + func stopBargeInMonitor() { + guard bargeInMonitorActive else { return } + bargeInMonitorActive = false + if audioEngine.isRunning { + audioEngine.stop() + audioEngine.inputNode.removeTap(onBus: 0) + } + } + + /// Call ElevenLabs REST API to convert text to speech. Returns MP3 audio data. + private func fetchElevenLabsTTS(text: String) async throws -> Data { + guard let elevenLabsKey else { + throw OpenAIVoiceError.noAPIKey + } + + let voiceId = Self.elevenLabsVoiceId + let url = URL(string: "https://api.elevenlabs.io/v1/text-to-speech/\(voiceId)")! 
+ var request = URLRequest(url: url) + request.httpMethod = "POST" + request.setValue(elevenLabsKey, forHTTPHeaderField: "xi-api-key") + request.setValue("application/json", forHTTPHeaderField: "Content-Type") + request.setValue("audio/mpeg", forHTTPHeaderField: "Accept") + request.timeoutInterval = 30 + + let body: [String: Any] = [ + "text": text, + "model_id": "eleven_flash_v2_5", + "voice_settings": [ + "stability": 0.5, + "similarity_boost": 0.75, + "speed": 1.1 + ] + ] + request.httpBody = try JSONSerialization.data(withJSONObject: body) + + let (data, response) = try await URLSession.shared.data(for: request) + + guard let httpResponse = response as? HTTPURLResponse else { + throw OpenAIVoiceError.invalidResponse + } + + guard httpResponse.statusCode == 200 else { + let errorBody = String(data: data, encoding: .utf8) ?? "Unknown error" + log.error("ElevenLabs API error (\(httpResponse.statusCode)): \(errorBody)") + throw OpenAIVoiceError.apiError(statusCode: httpResponse.statusCode, message: errorBody) + } + + guard !data.isEmpty else { + throw OpenAIVoiceError.noAudioData + } + + return data + } + + // MARK: - Speaking Amplitude + + private func startSpeakingAmplitudePolling() { + speakingTimer = Timer.scheduledTimer(withTimeInterval: 0.05, repeats: true) { [weak self] _ in + Task { @MainActor [weak self] in + guard let self, self.audioPlayer?.isPlaying == true else { return } + self.speakingAmplitude = Float.random(in: 0.3...0.8) + } + } + } + + private func stopSpeakingAmplitudePolling() { + speakingTimer?.invalidate() + speakingTimer = nil + } + + // MARK: - WAV Encoding + + private func createWAV(pcmData: Data, sampleRate: UInt32) -> Data { + let numChannels: UInt16 = 1 + let bitsPerSample: UInt16 = 16 + let bytesPerSample = bitsPerSample / 8 + let dataSize = UInt32(pcmData.count) + + var wav = Data(capacity: 44 + pcmData.count) + wav.append(contentsOf: "RIFF".utf8) + appendLE(&wav, 36 + dataSize) + wav.append(contentsOf: "WAVE".utf8) + 
wav.append(contentsOf: "fmt ".utf8) + appendLE(&wav, UInt32(16)) + appendLE(&wav, UInt16(1)) // PCM + appendLE(&wav, numChannels) + appendLE(&wav, sampleRate) + appendLE(&wav, sampleRate * UInt32(numChannels) * UInt32(bytesPerSample)) + appendLE(&wav, numChannels * bytesPerSample) + appendLE(&wav, bitsPerSample) + wav.append(contentsOf: "data".utf8) + appendLE(&wav, dataSize) + wav.append(pcmData) + return wav + } + + private func appendLE(_ data: inout Data, _ value: UInt32) { + var v = value.littleEndian + withUnsafeBytes(of: &v) { data.append(contentsOf: $0) } + } + + private func appendLE(_ data: inout Data, _ value: UInt16) { + var v = value.littleEndian + withUnsafeBytes(of: &v) { data.append(contentsOf: $0) } + } +} diff --git a/clients/macos/vellum-assistant/Features/Voice/TTSEngine.swift b/clients/macos/vellum-assistant/Features/Voice/TTSEngine.swift new file mode 100644 index 00000000000..afd1567ced9 --- /dev/null +++ b/clients/macos/vellum-assistant/Features/Voice/TTSEngine.swift @@ -0,0 +1,94 @@ +import Foundation +import AVFoundation +import os + +private let log = Logger(subsystem: "com.vellum.vellum-assistant", category: "TTSEngine") + +@MainActor +final class TTSEngine: NSObject, ObservableObject { + @Published var isSpeaking = false + @Published var currentAmplitude: Float = 0 + + private let synthesizer = AVSpeechSynthesizer() + private var onComplete: (() -> Void)? + private var amplitudeTimer: Timer? + private var delegateSet = false + + nonisolated override init() { + super.init() + } + + private func ensureDelegate() { + guard !delegateSet else { return } + delegateSet = true + synthesizer.delegate = self + } + + func speak(_ text: String, onComplete: (() -> Void)? 
= nil) { + ensureDelegate() + stop() + self.onComplete = onComplete + + let utterance = AVSpeechUtterance(string: text) + utterance.rate = AVSpeechUtteranceDefaultSpeechRate + utterance.pitchMultiplier = 1.0 + utterance.volume = 1.0 + + isSpeaking = true + startAmplitudePolling() + synthesizer.speak(utterance) + log.info("TTS started speaking") + } + + func stop() { + guard isSpeaking else { return } + synthesizer.stopSpeaking(at: .immediate) + stopAmplitudePolling() + isSpeaking = false + currentAmplitude = 0 + onComplete = nil + log.info("TTS stopped") + } + + private func startAmplitudePolling() { + amplitudeTimer = Timer.scheduledTimer(withTimeInterval: 0.05, repeats: true) { [weak self] _ in + Task { @MainActor [weak self] in + guard let self, self.isSpeaking else { return } + // Simulate amplitude variation while speaking since AVSpeechSynthesizer + // doesn't expose audio levels directly. + self.currentAmplitude = Float.random(in: 0.3...0.8) + } + } + } + + private func stopAmplitudePolling() { + amplitudeTimer?.invalidate() + amplitudeTimer = nil + } +} + +extension TTSEngine: AVSpeechSynthesizerDelegate { + nonisolated func speechSynthesizer(_ synthesizer: AVSpeechSynthesizer, didFinish utterance: AVSpeechUtterance) { + Task { @MainActor [weak self] in + guard let self else { return } + self.stopAmplitudePolling() + self.isSpeaking = false + self.currentAmplitude = 0 + log.info("TTS utterance finished") + let completion = self.onComplete + self.onComplete = nil + completion?() + } + } + + nonisolated func speechSynthesizer(_ synthesizer: AVSpeechSynthesizer, didCancel utterance: AVSpeechUtterance) { + Task { @MainActor [weak self] in + guard let self else { return } + self.stopAmplitudePolling() + self.isSpeaking = false + self.currentAmplitude = 0 + log.info("TTS utterance cancelled") + self.onComplete = nil + } + } +} diff --git a/clients/macos/vellum-assistant/Features/Voice/VoiceModeManager.swift 
b/clients/macos/vellum-assistant/Features/Voice/VoiceModeManager.swift new file mode 100644 index 00000000000..a84778027f1 --- /dev/null +++ b/clients/macos/vellum-assistant/Features/Voice/VoiceModeManager.swift @@ -0,0 +1,479 @@ +import Foundation +import Combine +import VellumAssistantShared +import os + +private let log = Logger(subsystem: "com.vellum.vellum-assistant", category: "VoiceModeManager") + +@MainActor +final class VoiceModeManager: ObservableObject { + enum State: Equatable { + case off, idle, listening, processing, speaking + } + + @Published var state: State = .off + @Published var partialTranscription: String = "" + @Published var errorMessage: String = "" + + let voiceService: OpenAIVoiceService + + private weak var chatViewModel: ChatViewModel? + private weak var settingsStore: SettingsStore? + private var previousOnVoiceResponseComplete: ((String) -> Void)? + private var previousOnVoiceTextDelta: ((String) -> Void)? + /// Safety timeout to recover from stuck TTS. + private var ttsTimeoutTask: Task? + /// Permission request IDs currently being handled via voice. + private var pendingPermissionIds: [String] = [] + /// Combine subscription to detect new confirmations in chat messages. + private var messageCancellable: AnyCancellable? + + nonisolated init() { + self.voiceService = OpenAIVoiceService() + } + + var hasAPIKey: Bool { voiceService.hasAPIKey } + + var stateLabel: String { + if !pendingPermissionIds.isEmpty { + switch state { + case .speaking: return "Asking permission..." + case .listening: return "Say yes or no..." + case .processing: return "Processing approval..." + default: break + } + } + switch state { + case .off: return "" + case .idle: return "Ready" + case .listening: return "Listening..." + case .processing: return "Thinking..." + case .speaking: return "Speaking..." + } + } + + func activate(chatViewModel: ChatViewModel, settingsStore: SettingsStore? 
= nil) { + guard state == .off else { return } + + guard voiceService.hasAPIKey else { + log.error("Voice mode: no OpenAI API key configured") + return + } + + self.chatViewModel = chatViewModel + self.settingsStore = settingsStore + + // Keep the user's current model — don't downgrade for voice mode. + // Capable models (Opus) are much better at tool use (osascript, etc.). + + // Save existing callbacks to restore on deactivation + previousOnVoiceResponseComplete = chatViewModel.onVoiceResponseComplete + previousOnVoiceTextDelta = chatViewModel.onVoiceTextDelta + + // Stream text deltas to TTS as they arrive + chatViewModel.onVoiceTextDelta = { [weak self] delta in + self?.handleTextDelta(delta) + } + + // When the full response is complete, flush remaining text to TTS + chatViewModel.onVoiceResponseComplete = { [weak self] _ in + self?.handleResponseComplete() + } + chatViewModel.isVoiceModeActive = true + + // Monitor for permission requests during voice mode + messageCancellable = chatViewModel.$messages + .receive(on: DispatchQueue.main) + .sink { [weak self] messages in + self?.checkForConfirmations(in: messages) + } + + // Pre-warm audio engine so first recording starts instantly + voiceService.prewarmEngine() + + // Set up silence detection callback + voiceService.onSilenceDetected = { [weak self] in + self?.handleSilenceDetected() + } + + // If mic permission is requested and granted, auto-start listening + voiceService.onMicrophoneAuthorized = { [weak self] in + guard let self, self.state == .idle else { return } + self.startListening() + } + + // Barge-in: user speaks while assistant is talking → interrupt and listen + voiceService.onBargeInDetected = { [weak self] in + self?.handleBargeIn() + } + + state = .idle + log.info("Voice mode activated (daemon + Haiku + streaming TTS)") + } + + func deactivate() { + guard state != .off else { return } + + // Fully shut down audio engine to release the microphone + voiceService.shutdown() + + 
voiceService.onSilenceDetected = nil + voiceService.onMicrophoneAuthorized = nil + voiceService.onBargeInDetected = nil + + if let chatViewModel { + chatViewModel.onVoiceResponseComplete = previousOnVoiceResponseComplete + chatViewModel.onVoiceTextDelta = previousOnVoiceTextDelta + chatViewModel.isVoiceModeActive = false + } + previousOnVoiceResponseComplete = nil + previousOnVoiceTextDelta = nil + messageCancellable?.cancel() + messageCancellable = nil + pendingPermissionIds = [] + + chatViewModel = nil + settingsStore = nil + state = .off + partialTranscription = "" + log.info("Voice mode deactivated") + } + + func toggleListening() { + switch state { + case .idle: + startListening() + case .listening: + stopListening() + case .speaking: + handleBargeIn() + default: + break + } + } + + func startListening() { + guard state == .idle else { return } + partialTranscription = "" + errorMessage = "" + state = .listening + voiceService.startRecording() + log.info("Voice mode: started listening") + } + + private func stopListening() { + guard state == .listening else { return } + voiceService.cancelRecording() + state = .idle + log.info("Voice mode: stopped listening") + } + + // MARK: - Silence Detection → Transcription + + private func handleSilenceDetected() { + guard state == .listening else { return } + + state = .processing + log.info("Voice mode: silence detected, transcribing via Whisper") + + // Reset streaming TTS state for the new turn + voiceService.resetStreamingTTS() + + guard let audioData = voiceService.stopRecordingAndGetAudio() else { + state = .idle + return + } + + Task { + do { + let text = try await voiceService.transcribe(audioData) + let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines) + + guard !trimmed.isEmpty, let chatViewModel else { + state = .idle + return + } + + // If we're awaiting a permission response, handle it separately + if !self.pendingPermissionIds.isEmpty { + self.partialTranscription = trimmed + 
self.handlePermissionResponse(trimmed) + return + } + + partialTranscription = trimmed + + // Send the transcribed message through the daemon (full context) + chatViewModel.pendingVoiceMessage = true + chatViewModel.inputText = trimmed + chatViewModel.sendMessage() + log.info("Voice mode: sent transcription to chat via daemon") + } catch { + log.error("Transcription failed: \(error.localizedDescription)") + partialTranscription = "" + if let voiceError = error as? OpenAIVoiceError { + switch voiceError { + case .apiError(let statusCode, _): + if statusCode == 429 { + self.errorMessage = "OpenAI rate limit exceeded. Check your billing at platform.openai.com" + } else if statusCode == 401 { + self.errorMessage = "Invalid OpenAI API key. Update it in Settings." + } else { + self.errorMessage = "OpenAI API error (\(statusCode))" + } + case .noAPIKey: + self.errorMessage = "OpenAI API key not configured. Add it in Settings." + default: + self.errorMessage = "Transcription failed: \(error.localizedDescription)" + } + } else { + self.errorMessage = "Transcription failed: \(error.localizedDescription)" + } + state = .idle + } + } + } + + // MARK: - Streaming TTS from daemon response + + private func handleTextDelta(_ delta: String) { + guard state == .processing || state == .speaking else { return } + guard pendingPermissionIds.isEmpty else { return } + + // Transition to speaking on first delta + if state == .processing { + state = .speaking + log.info("Voice mode: first text delta, starting streaming TTS") + } + + voiceService.feedTextDelta(delta) + } + + private func handleResponseComplete() { + log.info("Voice mode: response complete, flushing remaining TTS") + + // If we never got any text deltas (empty response), go back to idle + if state == .processing { + state = .idle + partialTranscription = "" + startListening() + return + } + + guard state == .speaking else { return } + + // Safety timeout: if TTS completion doesn't fire within 15s, recover + 
ttsTimeoutTask?.cancel() + ttsTimeoutTask = Task { [weak self] in + try? await Task.sleep(nanoseconds: 15_000_000_000) + guard let self, !Task.isCancelled, self.state == .speaking else { return } + log.warning("Voice mode: TTS timeout, recovering to idle") + self.voiceService.stopSpeaking() + self.state = .idle + self.partialTranscription = "" + self.startListening() + } + + voiceService.finishTextStream { [weak self] in + guard let self, self.state == .speaking else { return } + self.ttsTimeoutTask?.cancel() + self.ttsTimeoutTask = nil + self.voiceService.stopBargeInMonitor() + self.state = .idle + self.partialTranscription = "" + // Auto-start listening for the next turn + self.startListening() + } + + // Start monitoring mic for barge-in (interrupt by speaking) + voiceService.startBargeInMonitor() + } + + // MARK: - Voice-Driven Permission Handling + + private func checkForConfirmations(in messages: [ChatMessage]) { + guard pendingPermissionIds.isEmpty else { return } + guard state == .processing || state == .speaking || state == .idle || state == .listening else { return } + + let pending = messages + .compactMap { $0.confirmation } + .filter { $0.state == .pending } + + guard !pending.isEmpty else { return } + + pendingPermissionIds = pending.map { $0.requestId } + + // Stop any current activity before speaking the permission prompt + switch state { + case .speaking: + // Set state to .processing first so ttsOnComplete callback (from stopSpeaking) + // won't auto-transition to idle/listening + ttsTimeoutTask?.cancel() + ttsTimeoutTask = nil + state = .processing + voiceService.stopSpeaking() + case .listening: + voiceService.cancelRecording() + default: + break + } + + speakPermissionSummary(pending) + } + + private func speakPermissionSummary(_ confirmations: [ToolConfirmationData]) { + let summary = generatePermissionSummary(confirmations) + log.info("Voice mode: asking permission via voice — \(summary, privacy: .public)") + + state = .speaking + 
voiceService.resetStreamingTTS() + voiceService.feedTextDelta(summary) + + ttsTimeoutTask?.cancel() + ttsTimeoutTask = Task { [weak self] in + try? await Task.sleep(nanoseconds: 15_000_000_000) + guard let self, !Task.isCancelled, self.state == .speaking else { return } + log.warning("Voice mode: permission TTS timeout, recovering") + self.voiceService.stopSpeaking() + self.state = .idle + self.startListening() + } + + voiceService.finishTextStream { [weak self] in + guard let self, self.state == .speaking else { return } + self.ttsTimeoutTask?.cancel() + self.ttsTimeoutTask = nil + self.voiceService.stopBargeInMonitor() + self.state = .idle + self.startListening() + } + } + + private static let permissionPhrases: [(String) -> String] = [ + { "Sure thing! To do that, I'll need to \($0). Can I go ahead?" }, + { "Yeah let me try! I just need access to \($0). Is that okay?" }, + { "On it! To do what you're asking I need to \($0). Want me to?" }, + ] + private var lastPhraseIndex = -1 + + private func generatePermissionSummary(_ confirmations: [ToolConfirmationData]) -> String { + let descriptions = confirmations.map { describeAction($0) } + let unique = Array(Set(descriptions)) + + let actions: String + if unique.count == 1 { + actions = unique[0] + } else if unique.count == 2 { + actions = "\(unique[0]), and then \(unique[1])" + } else { + actions = unique.dropLast().joined(separator: ", ") + ", and \(unique.last!)" + } + + // Rotate through phrases so it doesn't sound repetitive + var idx = Int.random(in: 0.. String { + let reason = (confirmation.input["reason"]?.value as? String) ?? "" + + // If the model provided a reason, use it directly — it's already high-level. + if !reason.isEmpty { + return reason.prefix(1).lowercased() + reason.dropFirst() + } + + // Fall back to tool-specific descriptions + switch confirmation.toolName { + case "bash", "host_bash": + let cmd = (confirmation.input["command"]?.value as? String) ?? 
"" + if cmd.hasPrefix("open ") { return "open an app for you" } + if cmd.contains("osascript") { return "run a quick script on your Mac" } + return "run something on your Mac" + case "file_write", "host_file_write": + let path = (confirmation.input["path"]?.value as? String) ?? "" + if path.isEmpty { return "create a file for you" } + return "create a file called \(URL(fileURLWithPath: path).lastPathComponent)" + case "file_edit", "host_file_edit": + let path = (confirmation.input["path"]?.value as? String) ?? "" + if path.isEmpty { return "make some changes to a file" } + return "make some changes to \(URL(fileURLWithPath: path).lastPathComponent)" + case "file_read", "host_file_read": + let path = (confirmation.input["path"]?.value as? String) ?? "" + if path.isEmpty { return "take a look at a file" } + return "take a look at \(URL(fileURLWithPath: path).lastPathComponent)" + case "web_fetch": + let url = (confirmation.input["url"]?.value as? String) ?? "" + if let host = URL(string: url)?.host { return "grab some info from \(host)" } + return "look something up online" + case "browser_navigate": + let url = (confirmation.input["url"]?.value as? String) ?? 
"" + if let host = URL(string: url)?.host { return "open up \(host)" } + return "open up a webpage" + default: + return confirmation.toolCategory.lowercased() + } + } + + private func handlePermissionResponse(_ text: String) { + let lower = text.lowercased() + let affirmative = ["yes", "yeah", "yep", "go ahead", "allow", "approve", + "sure", "okay", "ok", "do it", "proceed"] + let negative = ["no", "nope", "don't", "deny", "stop", "cancel", "reject"] + + let isApproved = affirmative.contains(where: { lower.contains($0) }) + let isDenied = negative.contains(where: { lower.contains($0) }) + + guard let chatViewModel else { + pendingPermissionIds = [] + state = .idle + return + } + + if isApproved { + log.info("Voice mode: permissions approved via voice") + for requestId in pendingPermissionIds { + chatViewModel.respondToConfirmation(requestId: requestId, decision: "allow") + } + pendingPermissionIds = [] + partialTranscription = "" + state = .processing + } else if isDenied { + log.info("Voice mode: permissions denied via voice") + for requestId in pendingPermissionIds { + chatViewModel.respondToConfirmation(requestId: requestId, decision: "deny") + } + pendingPermissionIds = [] + partialTranscription = "" + state = .processing + } else { + log.info("Voice mode: unclear permission response — \(text, privacy: .public)") + state = .speaking + voiceService.resetStreamingTTS() + voiceService.feedTextDelta("Sorry, I didn't quite catch that. 
Do you want me to go ahead with that?") + voiceService.finishTextStream { [weak self] in + guard let self else { return } + self.voiceService.stopBargeInMonitor() + self.state = .idle + self.startListening() + } + } + } + + // MARK: - Barge-in (interrupt TTS) + + private func handleBargeIn() { + guard state == .speaking else { return } + log.info("Voice mode: barge-in — interrupting TTS") + + ttsTimeoutTask?.cancel() + ttsTimeoutTask = nil + voiceService.stopSpeaking() + state = .idle + partialTranscription = "" + // Immediately start listening so the user's speech is captured + startListening() + } +} diff --git a/clients/macos/vellum-assistant/Features/Voice/VoiceModePanel.swift b/clients/macos/vellum-assistant/Features/Voice/VoiceModePanel.swift new file mode 100644 index 00000000000..c953ea3153a --- /dev/null +++ b/clients/macos/vellum-assistant/Features/Voice/VoiceModePanel.swift @@ -0,0 +1,228 @@ +import SwiftUI +import VellumAssistantShared + +struct VoiceModePanel: View { + @ObservedObject var manager: VoiceModeManager + @ObservedObject var voiceService: OpenAIVoiceService + let onClose: () -> Void + + @State private var appearance = AvatarAppearanceManager.shared + @State private var showingInfo = false + + private let avatarSize: CGFloat = 100 + private let avatarPixelSize: CGFloat = 4 + + var body: some View { + VStack(spacing: 0) { + // Header + HStack { + HStack(spacing: VSpacing.sm) { + Text("VOICE") + .font(VFont.display) + .foregroundColor(VColor.textPrimary) + Text("BETA") + .font(VFont.small) + .foregroundColor(VColor.accent) + .padding(.horizontal, 6) + .padding(.vertical, 2) + .background(VColor.accent.opacity(0.15)) + .clipShape(RoundedRectangle(cornerRadius: VRadius.sm)) + } + Spacer() + Button(action: { showingInfo.toggle() }) { + Image(systemName: "info.circle") + .font(.system(size: 14, weight: .medium)) + .foregroundColor(VColor.textSecondary) + .frame(width: 28, height: 28) + } + .buttonStyle(.plain) + Button(action: onClose) { + 
Image(systemName: "xmark") + .font(.system(size: 12, weight: .semibold)) + .foregroundColor(VColor.textSecondary) + .frame(width: 28, height: 28) + .background(VColor.surface.opacity(0.8)) + .clipShape(Circle()) + } + .buttonStyle(.plain) + } + .padding(.horizontal, VSpacing.xl) + .padding(.top, VSpacing.xl) + .padding(.bottom, VSpacing.lg) + + // Info panel + if showingInfo { + VStack(alignment: .leading, spacing: VSpacing.sm) { + infoRow(label: "STT", value: "OpenAI Whisper") + infoRow(label: "LLM", value: "Your selected model") + infoRow(label: "TTS", value: "ElevenLabs Flash v2.5") + Divider().background(VColor.surfaceBorder) + Text("Voice mode transcribes your speech, sends it to your assistant, and speaks the response. Tool permissions are handled via voice — say \"yes\" or \"no\" when asked.") + .font(VFont.caption) + .foregroundColor(VColor.textMuted) + } + .padding(VSpacing.lg) + .background(VColor.surface) + .clipShape(RoundedRectangle(cornerRadius: VRadius.md)) + .padding(.horizontal, VSpacing.xl) + .padding(.bottom, VSpacing.lg) + } + + Spacer() + + if !manager.hasAPIKey { + // No API key configured + VStack(spacing: VSpacing.lg) { + Image(systemName: "key.fill") + .font(.system(size: 32)) + .foregroundColor(VColor.textMuted) + Text("OpenAI API key required") + .font(VFont.bodyMedium) + .foregroundColor(VColor.textSecondary) + Text("Add your OpenAI API key in Settings to use voice mode with Whisper and TTS.") + .font(VFont.caption) + .foregroundColor(VColor.textMuted) + .multilineTextAlignment(.center) + .padding(.horizontal, VSpacing.xl) + } + } else { + // Avatar + ZStack { + Circle() + .stroke(strokeColor, lineWidth: 3) + .frame(width: avatarSize, height: avatarSize) + .scaleEffect(manager.state == .listening ? 
1.05 : 1.0) + .animation(.easeInOut(duration: 0.5).repeatForever(autoreverses: true), value: manager.state == .listening) + + Image(nsImage: PixelSpriteBuilder.buildBlobNSImage(pixelSize: avatarPixelSize, palette: appearance.palette)) + .interpolation(.none) + } + .padding(.bottom, VSpacing.xl) + + // Waveform + VWaveformView( + amplitude: effectiveAmplitude, + barCount: 30, + isActive: manager.state == .listening || manager.state == .speaking, + accentColor: waveformColor + ) + .frame(height: 44) + .padding(.horizontal, VSpacing.xl) + .padding(.bottom, VSpacing.lg) + + // State label + Text(manager.stateLabel) + .font(VFont.bodyMedium) + .foregroundColor(VColor.textSecondary) + .padding(.bottom, VSpacing.sm) + + // Error message + if !manager.errorMessage.isEmpty { + HStack(spacing: VSpacing.sm) { + Image(systemName: "exclamationmark.triangle.fill") + .foregroundColor(VColor.warning) + .font(.system(size: 14)) + Text(manager.errorMessage) + .font(VFont.caption) + .foregroundColor(VColor.warning) + .multilineTextAlignment(.center) + } + .padding(.horizontal, VSpacing.xl) + .padding(.vertical, VSpacing.md) + .background(VColor.warning.opacity(0.1)) + .clipShape(RoundedRectangle(cornerRadius: VRadius.md)) + .padding(.horizontal, VSpacing.xl) + .padding(.bottom, VSpacing.lg) + } + + // Partial transcription + if !manager.partialTranscription.isEmpty { + ScrollView { + Text(manager.partialTranscription) + .font(VFont.body) + .foregroundColor(VColor.textPrimary) + .multilineTextAlignment(.center) + .padding(.horizontal, VSpacing.xl) + } + .frame(maxHeight: 80) + .padding(.bottom, VSpacing.lg) + } + } + + Spacer() + + // Controls + if manager.hasAPIKey { + VStack(spacing: VSpacing.md) { + // Mic toggle button + Button(action: { manager.toggleListening() }) { + Image(systemName: micIcon) + .font(.system(size: 20, weight: .medium)) + .foregroundColor(micButtonForeground) + .frame(width: 56, height: 56) + .background(micButtonBackground) + .clipShape(Circle()) + } + 
.buttonStyle(.plain) + .disabled(manager.state == .processing) + + // End voice mode button + Button(action: onClose) { + Text("End Voice Mode") + .font(VFont.captionMedium) + .foregroundColor(VColor.textMuted) + } + .buttonStyle(.plain) + } + .padding(.bottom, VSpacing.xxl) + } + } + .frame(maxWidth: .infinity, maxHeight: .infinity) + .background(VColor.background) + } + + private var strokeColor: Color { + switch manager.state { + case .listening: return VColor.accent + case .speaking: return VColor.success + case .processing: return VColor.textMuted + default: return VColor.surfaceBorder + } + } + + private var effectiveAmplitude: Float { + switch manager.state { + case .listening: return voiceService.amplitude + case .speaking: return voiceService.speakingAmplitude + default: return 0 + } + } + + private var waveformColor: Color { + manager.state == .speaking ? VColor.success : VColor.accent + } + + private var micIcon: String { + manager.state == .listening ? "mic.fill" : "mic" + } + + private var micButtonForeground: Color { + manager.state == .listening ? .white : VColor.textPrimary + } + + private var micButtonBackground: Color { + manager.state == .listening ? VColor.accent : VColor.surface + } + + private func infoRow(label: String, value: String) -> some View { + HStack { + Text(label) + .font(VFont.captionMedium) + .foregroundColor(VColor.textMuted) + .frame(width: 32, alignment: .leading) + Text(value) + .font(VFont.caption) + .foregroundColor(VColor.textSecondary) + } + } +} diff --git a/clients/shared/DesignSystem/Components/Display/VWaveformView.swift b/clients/shared/DesignSystem/Components/Display/VWaveformView.swift new file mode 100644 index 00000000000..ebd30fbe466 --- /dev/null +++ b/clients/shared/DesignSystem/Components/Display/VWaveformView.swift @@ -0,0 +1,84 @@ +import SwiftUI + +/// Animated waveform visualization — a row of bars that fluctuate with amplitude. 
+public struct VWaveformView: View { + let amplitude: Float + var barCount: Int + let isActive: Bool + var accentColor: Color + + @State private var barOffsets: [Float] = [] + @State private var animationTimer: Timer? + + public init(amplitude: Float, barCount: Int = 40, isActive: Bool, accentColor: Color = VColor.accent) { + self.amplitude = amplitude + self.barCount = barCount + self.isActive = isActive + self.accentColor = accentColor + } + + public var body: some View { + HStack(spacing: 2) { + ForEach(0.. some View { + let offset = index < barOffsets.count ? barOffsets[index] : 0.5 + let baseHeight: CGFloat = 4 + let maxAdditional: CGFloat = 36 + + let effectiveAmplitude: CGFloat = if isActive { + CGFloat(amplitude) * CGFloat(offset) + } else { + // Gentle ambient wave when idle + CGFloat(offset) * 0.15 + } + + let height = baseHeight + maxAdditional * effectiveAmplitude + + RoundedRectangle(cornerRadius: 1.5) + .fill(isActive ? accentColor : VColor.textMuted.opacity(0.3)) + .frame(width: 3, height: height) + .animation(.easeInOut(duration: 0.1), value: height) + } + + private func startAnimation() { + animationTimer?.invalidate() + animationTimer = Timer.scheduledTimer(withTimeInterval: 0.08, repeats: true) { _ in + Task { @MainActor in + var newOffsets = [Float]() + for i in 0.. Void)? + /// Called with each streaming text delta during a voice-triggered response, for real-time TTS. + public var onVoiceTextDelta: ((String) -> Void)? + /// When true, messages are prefixed with a concise-response instruction for voice conversations. + public var isVoiceModeActive: Bool = false var pendingUserAttachments: [IPCAttachment]? /// Stores the last user message that failed to send, enabling retry. private(set) var lastFailedMessageText: String? 
@@ -234,7 +238,10 @@ public final class ChatViewModel: ObservableObject { // MARK: - Sending public func sendMessage() { - let text = inputText.trimmingCharacters(in: .whitespacesAndNewlines) + let rawText = inputText.trimmingCharacters(in: .whitespacesAndNewlines) + let text = isVoiceModeActive + ? "[Voice conversation — keep spoken responses brief (2-3 sentences) but fully complete the task using any tools needed. Do not give up early. When interacting with macOS apps (Messages, Contacts, Calendar, Reminders, Notes, Mail, etc.), always use osascript with AppleScript — never query databases directly or use sqlite3.]\n\n\(rawText)" + : rawText let hasAttachments = !pendingAttachments.isEmpty let hasSkillInvocation = pendingSkillInvocation != nil guard !text.isEmpty || hasAttachments || hasSkillInvocation else { return }