Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions clients/macos/vellum-assistant/App/AppDelegate.swift
Original file line number Diff line number Diff line change
Expand Up @@ -1156,6 +1156,7 @@ public final class AppDelegate: NSObject, NSApplicationDelegate {
main.onMicrophoneToggle = { [weak self] in
self?.voiceInput?.toggleRecording()
}
// Note: no voice-mode wiring is needed here — voice mode uses OpenAI Whisper + TTS directly (no VoiceInputManager needed).
main.threadManager.onInlineConfirmationResponse = { [weak self] requestId, decision in
guard let self else { return }
// Resume the notification service continuation with a sentinel so
Expand Down
33 changes: 31 additions & 2 deletions clients/macos/vellum-assistant/App/VoiceInputManager.swift
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@ final class VoiceInputManager {
private var recognitionTask: SFSpeechRecognitionTask?
private let audioEngine = AVAudioEngine()

/// Exposes the audio engine for amplitude tracking in voice mode.
///
/// Returns the same private `audioEngine` instance this manager owns.
var exposedAudioEngine: AVAudioEngine { audioEngine }

/// Starts the manager by installing the Fn-key monitors via `setupFnKeyMonitors()`.
func start() {
setupFnKeyMonitors()
}
Expand Down Expand Up @@ -69,6 +72,30 @@ final class VoiceInputManager {
}
}

// MARK: - Continuous Recording (Voice Mode)

/// Begins a recording session that does not depend on a key being held down.
/// Voice mode calls this for hands-free operation.
///
/// Does nothing when a recording is already in progress.
func startContinuousRecording() {
    if isRecording { return }
    beginRecording()
}

/// Stop continuous recording. Unlike `stopRecording()`, this does NOT cancel
/// the recognition task — it stops audio input and calls `endAudio()` so the
/// recognizer produces an `isFinal` result via the callback, which then
/// triggers `onTranscription` and cleans up.
///
/// No-op when no recording is in progress.
///
/// NOTE(review): `isRecording` is not reset here; presumably the recognition
/// callback resets it once the final result (or an error) arrives — confirm,
/// since otherwise a later `startContinuousRecording()` would be ignored.
func stopContinuousRecording() {
guard isRecording else { return }
log.info("Stopping continuous recording — waiting for final transcription")

// Stop feeding microphone audio first: halt the engine, then drop the
// input tap on bus 0.
audioEngine.stop()
audioEngine.inputNode.removeTap(onBus: 0)

// Signal end of audio — the recognizer will process remaining audio
// and fire the callback with isFinal = true.
recognitionRequest?.endAudio()
}

// MARK: - Fn Key Detection

private func setupFnKeyMonitors() {
Expand Down Expand Up @@ -259,8 +286,10 @@ final class VoiceInputManager {
onRecordingStateChanged?(false)
log.info("Voice recording stopped")

audioEngine.stop()
audioEngine.inputNode.removeTap(onBus: 0)
if audioEngine.isRunning {
audioEngine.stop()
audioEngine.inputNode.removeTap(onBus: 0)
}
recognitionTask?.cancel()
recognitionTask = nil
recognitionRequest?.endAudio()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ import VellumAssistantShared
// MARK: - Domain Types

public enum NativePanelId: String, Codable, Equatable, Sendable {
case chat, threadList, settings, agent, debug, doctor, directory, generated, identity, avatarCustomization
case chat, threadList, settings, agent, debug, doctor, directory, generated, identity, avatarCustomization, voiceMode
}

public enum SlotContent: Equatable, Sendable {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,7 @@ final class MainWindow {
let documentManager = DocumentManager()
let avatarEvolutionState: AvatarEvolutionState?
var onMicrophoneToggle: (() -> Void)?
let voiceModeManager = VoiceModeManager()

// Forwarding accessors — keeps existing references working while
// ownership lives in the `services` container.
Expand Down Expand Up @@ -263,7 +264,7 @@ final class MainWindow {
return
}

let rootView = MainWindowView(threadManager: threadManager, appListManager: appListManager, zoomManager: zoomManager, traceStore: traceStore, daemonClient: daemonClient, surfaceManager: surfaceManager, ambientAgent: ambientAgent, settingsStore: services.settingsStore, windowState: windowState, documentManager: documentManager, avatarEvolutionState: avatarEvolutionState, onMicrophoneToggle: onMicrophoneToggle ?? {})
let rootView = MainWindowView(threadManager: threadManager, appListManager: appListManager, zoomManager: zoomManager, traceStore: traceStore, daemonClient: daemonClient, surfaceManager: surfaceManager, ambientAgent: ambientAgent, settingsStore: services.settingsStore, windowState: windowState, documentManager: documentManager, avatarEvolutionState: avatarEvolutionState, onMicrophoneToggle: onMicrophoneToggle ?? {}, voiceModeManager: voiceModeManager)
let hostingController = NonDraggableHostingController(rootView: rootView)

let screenFrame = NSScreen.main?.visibleFrame ?? NSScreen.screens.first?.visibleFrame ?? NSRect(x: 0, y: 0, width: 1440, height: 900)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,9 @@ struct MainWindowView: View {
let avatarEvolutionState: AvatarEvolutionState?
@State private var lastAppliedBootstrapTurn: Int = 0
let onMicrophoneToggle: () -> Void
@ObservedObject var voiceModeManager: VoiceModeManager

init(threadManager: ThreadManager, appListManager: AppListManager, zoomManager: ZoomManager, traceStore: TraceStore, daemonClient: DaemonClient, surfaceManager: SurfaceManager, ambientAgent: AmbientAgent, settingsStore: SettingsStore, windowState: MainWindowState, documentManager: DocumentManager, avatarEvolutionState: AvatarEvolutionState? = nil, onMicrophoneToggle: @escaping () -> Void = {}) {
init(threadManager: ThreadManager, appListManager: AppListManager, zoomManager: ZoomManager, traceStore: TraceStore, daemonClient: DaemonClient, surfaceManager: SurfaceManager, ambientAgent: AmbientAgent, settingsStore: SettingsStore, windowState: MainWindowState, documentManager: DocumentManager, avatarEvolutionState: AvatarEvolutionState? = nil, onMicrophoneToggle: @escaping () -> Void = {}, voiceModeManager: VoiceModeManager = VoiceModeManager()) {
self.threadManager = threadManager
self.appListManager = appListManager
self.zoomManager = zoomManager
Expand All @@ -66,6 +67,7 @@ struct MainWindowView: View {
self.documentManager = documentManager
self.avatarEvolutionState = avatarEvolutionState
self.onMicrophoneToggle = onMicrophoneToggle
self.voiceModeManager = voiceModeManager
}

// MARK: - Layout Constants
Expand Down Expand Up @@ -143,6 +145,24 @@ struct MainWindowView: View {
FileManager.default.fileExists(atPath: NSHomeDirectory() + "/.vellum/workspace/BOOTSTRAP.md")
}

/// Toolbar action that switches voice mode on or off.
///
/// - If voice mode is in any state other than `.off`, it is deactivated and
///   the window selection is cleared.
/// - Otherwise a thread is created when none is active, the voice mode panel
///   is selected, and the manager is activated against the active chat view
///   model and told to start listening.
private func toggleVoiceMode() {
    guard voiceModeManager.state == .off else {
        // Voice mode is currently running: shut it down and leave the panel.
        voiceModeManager.deactivate()
        windowState.selection = nil
        return
    }

    // Voice mode needs a thread to talk to; create one if none is active.
    if threadManager.activeViewModel == nil {
        threadManager.createThread()
    }

    windowState.selection = .panel(.voiceMode)

    // NOTE(review): if no view model is available even after createThread(),
    // the panel is shown but voice mode stays off — confirm this is intended.
    guard let chatViewModel = threadManager.activeViewModel else { return }
    voiceModeManager.activate(chatViewModel: chatViewModel, settingsStore: settingsStore)
    voiceModeManager.startListening()
}

private func toggleTemporaryChat() {
withAnimation(VAnimation.standard) {
if threadManager.activeThread?.kind == .private {
Expand Down Expand Up @@ -271,6 +291,13 @@ struct MainWindowView: View {
var body: some View {
coreLayoutView
.onChange(of: windowState.selection) { oldSelection, newSelection in
// Deactivate voice mode when navigating away from the voice panel
if case .panel(.voiceMode) = oldSelection, voiceModeManager.state != .off {
if case .panel(.voiceMode) = newSelection {} else {
voiceModeManager.deactivate()
}
}

// When selection transitions to .thread, ensure ThreadManager is synced
// so chat content targets the correct thread (e.g. after dismissOverlay).
// Guard against archived threads: if the thread was archived while an
Expand Down Expand Up @@ -409,6 +436,17 @@ struct MainWindowView: View {
.help(showCopyThreadConfirmation ? "Copied!" : "Copy thread")
}

// Voice mode toggle
VIconButton(
label: "Voice Mode",
icon: voiceModeManager.state != .off ? "waveform.circle.fill" : "waveform.circle",
isActive: voiceModeManager.state != .off,
iconOnly: true,
tooltip: voiceModeManager.state != .off ? "Exit voice mode" : "Voice mode"
) {
toggleVoiceMode()
}

TemporaryChatToggle(
isActive: threadManager.activeThread?.kind == .private,
tooltip: threadManager.activeThread?.kind == .private ? "Exit temporary chat" : "Temporary chat",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,15 @@ extension MainWindowView {
IdentityPanel(onClose: { windowState.selection = nil }, onCustomizeAvatar: { windowState.selection = .panel(.avatarCustomization) }, daemonClient: daemonClient)
case .avatarCustomization:
AvatarCustomizationPanel(onClose: { windowState.selection = .panel(.identity) })
case .voiceMode:
VoiceModePanel(
manager: voiceModeManager,
voiceService: voiceModeManager.voiceService,
onClose: {
voiceModeManager.deactivate()
windowState.selection = nil
}
)
}
}

Expand Down Expand Up @@ -245,6 +254,16 @@ extension MainWindowView {
)
}
)
} else if panelType == .voiceMode {
// Voice mode: split view with chat on left, voice panel on right
VSplitView(
panelWidth: $sidePanelWidth,
showPanel: true,
main: { chatView },
panel: {
nativePanelView(.voiceMode)
}
)
} else {
// Full-window panels: settings, debug, doctor, identity
fullWindowPanel(panelType)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ struct SettingsPanel: View {
@State private var braveKeyText: String = ""
@State private var perplexityKeyText: String = ""
@State private var imageGenKeyText: String = ""
@State private var openaiKeyText: String = ""
@State private var elevenLabsKeyText: String = ""
@State private var showingTrustRules = false
@State private var showingReminders = false
@State private var twitterClientId: String = ""
Expand Down Expand Up @@ -507,6 +509,116 @@ struct SettingsPanel: View {
.padding(VSpacing.lg)
.vCard(background: VColor.surfaceSubtle)

// OPENAI section (for Voice Mode — Whisper + TTS)
VStack(alignment: .leading, spacing: VSpacing.md) {
Text("OpenAI")
.font(VFont.sectionTitle)
.foregroundColor(VColor.textPrimary)

if store.hasOpenAIKey {
HStack(spacing: VSpacing.sm) {
Image(systemName: "checkmark.circle.fill")
.foregroundColor(VColor.success)
.font(.system(size: 14))
Text(store.maskedOpenAIKey)
.font(VFont.body)
.foregroundColor(VColor.textSecondary)
Spacer()
VButton(label: "Clear", style: .danger) {
store.clearOpenAIKey()
openaiKeyText = ""
}
}
} else {
HStack(spacing: VSpacing.xs) {
Text("Enter OpenAI API Key")
.font(VFont.caption)
.foregroundColor(VColor.textSecondary)
Image(systemName: "info.circle")
.font(.system(size: 12))
.foregroundColor(VColor.textMuted)
}

SecureField("Your OpenAI API key", text: $openaiKeyText)
.textFieldStyle(.plain)
.font(VFont.body)
.foregroundColor(VColor.textPrimary)
.padding(VSpacing.md)
.background(VColor.surface)
.clipShape(RoundedRectangle(cornerRadius: VRadius.md))
.overlay(
RoundedRectangle(cornerRadius: VRadius.md)
.stroke(VColor.surfaceBorder.opacity(0.5), lineWidth: 1)
)

Text("Used for Voice Mode (Whisper transcription). Get your key at platform.openai.com/api-keys")
.font(VFont.caption)
.foregroundColor(VColor.textMuted)

VButton(label: "Save", style: .primary) {
store.saveOpenAIKey(openaiKeyText)
openaiKeyText = ""
}
}
}
.padding(VSpacing.lg)
.vCard(background: VColor.surfaceSubtle)

// ELEVENLABS section (for Voice Mode TTS)
VStack(alignment: .leading, spacing: VSpacing.md) {
Text("ElevenLabs")
.font(VFont.sectionTitle)
.foregroundColor(VColor.textPrimary)

if store.hasElevenLabsKey {
HStack(spacing: VSpacing.sm) {
Image(systemName: "checkmark.circle.fill")
.foregroundColor(VColor.success)
.font(.system(size: 14))
Text(store.maskedElevenLabsKey)
.font(VFont.body)
.foregroundColor(VColor.textSecondary)
Spacer()
VButton(label: "Clear", style: .danger) {
store.clearElevenLabsKey()
elevenLabsKeyText = ""
}
}
} else {
HStack(spacing: VSpacing.xs) {
Text("Enter ElevenLabs API Key")
.font(VFont.caption)
.foregroundColor(VColor.textSecondary)
Image(systemName: "info.circle")
.font(.system(size: 12))
.foregroundColor(VColor.textMuted)
}

SecureField("Your ElevenLabs API key", text: $elevenLabsKeyText)
.textFieldStyle(.plain)
.font(VFont.body)
.foregroundColor(VColor.textPrimary)
.padding(VSpacing.md)
.background(VColor.surface)
.clipShape(RoundedRectangle(cornerRadius: VRadius.md))
.overlay(
RoundedRectangle(cornerRadius: VRadius.md)
.stroke(VColor.surfaceBorder.opacity(0.5), lineWidth: 1)
)

Text("Used for Voice Mode (text-to-speech). Get your key at elevenlabs.io/app/settings/api-keys")
.font(VFont.caption)
.foregroundColor(VColor.textMuted)

VButton(label: "Save", style: .primary) {
store.saveElevenLabsKey(elevenLabsKeyText)
elevenLabsKeyText = ""
}
}
}
.padding(VSpacing.lg)
.vCard(background: VColor.surfaceSubtle)

// INTEGRATIONS section
if daemonClient != nil {
VStack(alignment: .leading, spacing: VSpacing.md) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ enum SidePanelType: Hashable, CaseIterable {
case identity
case documentEditor
case avatarCustomization
case voiceMode

init?(rawValue: String) {
switch rawValue {
Expand All @@ -20,6 +21,7 @@ enum SidePanelType: Hashable, CaseIterable {
case "identity": self = .identity
case "documentEditor": self = .documentEditor
case "avatarCustomization": self = .avatarCustomization
case "voiceMode": self = .voiceMode
default: return nil
}
}
Expand Down
Loading