From accedb4c21d284d82b6d57291a992059165f263a Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Mon, 6 Apr 2026 18:55:27 +0000
Subject: [PATCH 1/5] Make dictation engine start non-blocking and improve
 audio resilience

- Add installTapAndStartAsync to AudioEngineController for non-blocking
  engine start using Swift concurrency (withCheckedContinuation)
- Extract installTapAndStartImpl to share logic between sync/async paths
- Listen for AVAudioEngineConfigurationChange to re-prewarm inputNode
  after Bluetooth device connect/disconnect and AirPods mode switches
- Restructure VoiceInputManager.beginRecording() to show recording UI
  and play activation chime immediately, then start engine async via Task
- Move DictationContextCapture off the critical path: the engine starts
  on its audio queue while context capture runs concurrently on the main
  thread
- Add SFSpeechRecognizer transient unavailability retry (recreate if
  isAvailable returns false after sleep/wake or heavy use)
- Handle edge case where push-to-talk (PTT) is released before the async
  engine start completes (stopRecordingForDictation cleans up directly)

Co-Authored-By: tkheyfets <timur@vellum.ai>
---
 .../App/VoiceInputManager.swift               | 185 +++++++++++-------
 .../Voice/AudioEngineController.swift         |  94 +++++++--
 .../Features/Voice/OpenAIVoiceService.swift   |   3 +-
 3 files changed, 191 insertions(+), 91 deletions(-)

diff --git a/clients/macos/vellum-assistant/App/VoiceInputManager.swift b/clients/macos/vellum-assistant/App/VoiceInputManager.swift
index ce491b45629..097bdee93d7 100644
--- a/clients/macos/vellum-assistant/App/VoiceInputManager.swift
+++ b/clients/macos/vellum-assistant/App/VoiceInputManager.swift
@@ -110,7 +110,7 @@ final class VoiceInputManager {
         PTTActivator.cached
     }
 
-    private let speechRecognizer = SFSpeechRecognizer(locale: Locale(identifier: "en-US"))
+    private var speechRecognizer: SFSpeechRecognizer? = SFSpeechRecognizer(locale: Locale(identifier: "en-US"))
     private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest?
     private var recognitionTask: SFSpeechRecognitionTask?
     private let engineController = AudioEngineController(label: "com.vellum.audioEngine.voiceInput")
@@ -549,18 +549,22 @@ final class VoiceInputManager {
         holdTask = nil
     }
 
-    /// Capture frontmost app context (for dictation) and begin recording.
+    /// Start recording immediately for instant UI feedback, then capture
+    /// frontmost app context. The engine starts asynchronously on its audio
+    /// queue while context capture runs on the main thread — both happen
+    /// concurrently, eliminating the sequential 600ms + 2s worst-case delay.
+    ///
     /// When Vellum itself is the frontmost app, skip context capture so the
     /// transcription falls through to the conversation path (auto-submit to chat)
     /// instead of going through DictationTextInserter which would double-insert.
     private func captureContextAndBeginRecording() {
+        beginRecording()
         if currentMode == .dictation {
             let isVellumFrontmost = NSWorkspace.shared.frontmostApplication?.bundleIdentifier == Bundle.main.bundleIdentifier
             if !isVellumFrontmost {
                 currentDictationContext = DictationContextCapture.capture()
             }
         }
-        beginRecording()
     }
 
     /// Stop recording using the appropriate method for the current mode.
@@ -592,6 +596,12 @@ final class VoiceInputManager {
     // MARK: - Recording
 
     private func beginRecording() {
+        // Recreate speech recognizer if transiently unavailable (e.g. after
+        // sleep/wake, heavy use, or audio route changes).
+        if speechRecognizer?.isAvailable != true {
+            log.warning("Speech recognizer unavailable — recreating")
+            speechRecognizer = SFSpeechRecognizer(locale: Locale(identifier: "en-US"))
+        }
         guard let speechRecognizer = speechRecognizer, speechRecognizer.isAvailable else {
             log.error("Speech recognizer not available")
             currentDictationContext = nil
@@ -637,6 +647,9 @@ final class VoiceInputManager {
             return
         }
 
+        // Show recording state and play chime immediately for instant feedback.
+        // The audio engine starts asynchronously below — the user hears/sees the
+        // activation before the engine is ready, hiding hardware latency.
         isRecording = true
         onRecordingStateChanged?(true)
         if currentMode == .dictation {
@@ -647,6 +660,7 @@ final class VoiceInputManager {
             }
         }
         log.info("Voice recording started")
+        VoiceFeedback.playActivationChime()
 
         let request = SFSpeechAudioBufferRecognitionRequest()
         request.shouldReportPartialResults = true
@@ -655,87 +669,93 @@ final class VoiceInputManager {
         let ampState = amplitudeState
         ampState.reset()
 
-        // Atomically read the hardware format, install the tap, and start the
-        // engine in a single dispatch to the audio queue. This prevents the
-        // TOCTOU race where a format read via `inputNodeFormat()` becomes stale
-        // before a separate `installTap()` async block executes — which crashes
-        // with NSInternalInconsistencyException on first use after permission grant.
-        guard engineController.installTapAndStart(
-            bufferSize: 1024,
-            block: { [weak self] buffer, _ in
-                request.append(buffer)
-
-                guard let channelData = buffer.floatChannelData else { return }
-                let frameLength = Int(buffer.frameLength)
-                guard frameLength > 0 else { return }
-
-                let channelDataArray = Array(UnsafeBufferPointer(start: channelData[0], count: frameLength))
-                let rawRMS = vDSP.rootMeanSquare(channelDataArray)
-
-                let smoothed = 0.5 * rawRMS + 0.5 * ampState.previousSmoothed
-                ampState.previousSmoothed = smoothed
-
-                // Scale amplitude to 0-1 range for waveform visualization.
-                // Speech RMS is typically 0.01-0.1; multiply to fill the visual range.
-                let scaled = min(smoothed * 14.0, 1.0)
-
-                let now = CFAbsoluteTimeGetCurrent()
-                guard now - ampState.lastEmissionTime >= 0.033 else { return }
-                ampState.lastEmissionTime = now
-
-                VoiceInputManager.amplitudeSubject.send(scaled)
-                DispatchQueue.main.async { [weak self] in
-                    self?.onAmplitudeChanged?(scaled)
-                }
+        let tapBlock: AVAudioNodeTapBlock = { [weak self] buffer, _ in
+            request.append(buffer)
+
+            guard let channelData = buffer.floatChannelData else { return }
+            let frameLength = Int(buffer.frameLength)
+            guard frameLength > 0 else { return }
+
+            let channelDataArray = Array(UnsafeBufferPointer(start: channelData[0], count: frameLength))
+            let rawRMS = vDSP.rootMeanSquare(channelDataArray)
+
+            let smoothed = 0.5 * rawRMS + 0.5 * ampState.previousSmoothed
+            ampState.previousSmoothed = smoothed
+
+            // Scale amplitude to 0-1 range for waveform visualization.
+            // Speech RMS is typically 0.01-0.1; multiply to fill the visual range.
+            let scaled = min(smoothed * 14.0, 1.0)
+
+            let now = CFAbsoluteTimeGetCurrent()
+            guard now - ampState.lastEmissionTime >= 0.033 else { return }
+            ampState.lastEmissionTime = now
+
+            VoiceInputManager.amplitudeSubject.send(scaled)
+            DispatchQueue.main.async { [weak self] in
+                self?.onAmplitudeChanged?(scaled)
             }
-        ) else {
-            log.error("Audio engine failed to start — invalid format or engine error")
-            isRecording = false
-            onRecordingStateChanged?(false)
-            currentDictationContext = nil
-            recognitionRequest = nil
-            overlayWindow.dismiss()
-            resetAudioEngine()
-            return
         }
-        hasInstalledTap = true
 
-        recognitionTask = speechRecognizer.recognitionTask(with: request) { [weak self] result, error in
-            Task { @MainActor in
-                guard let self = self else { return }
-                // Ignore late callbacks delivered after recording was stopped
-                // (e.g. endAudio() triggering a delayed isFinal via Task dispatch).
-                guard self.isRecording else { return }
-
-                if let result = result {
-                    let text = result.bestTranscription.formattedString
-                    if result.isFinal {
-                        log.info("Transcription: \(text, privacy: .public)")
-                        if !text.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty {
-                            self.handleFinalTranscription(text)
+        // Start the audio engine asynchronously to avoid blocking the main
+        // thread during Bluetooth negotiation or hardware initialization.
+        // The recognition task is started in the completion after the engine
+        // is running. This eliminates the 2+ second main-thread stall that
+        // occurs with queue.sync when coreaudiod is contended.
+        Task { [weak self] in
+            guard let self else { return }
+            let success = await self.engineController.installTapAndStartAsync(
+                bufferSize: 1024,
+                block: tapBlock
+            )
+            // Recording may have been stopped while the engine was starting
+            guard self.isRecording else { return }
+            guard success else {
+                log.error("Audio engine failed to start — invalid format or engine error")
+                self.isRecording = false
+                self.onRecordingStateChanged?(false)
+                self.currentDictationContext = nil
+                self.recognitionRequest = nil
+                self.overlayWindow.dismiss()
+                self.resetAudioEngine()
+                return
+            }
+            self.hasInstalledTap = true
+
+            self.recognitionTask = speechRecognizer.recognitionTask(with: request) { [weak self] result, error in
+                Task { @MainActor in
+                    guard let self = self else { return }
+                    // Ignore late callbacks delivered after recording was stopped
+                    // (e.g. endAudio() triggering a delayed isFinal via Task dispatch).
+                    guard self.isRecording else { return }
+
+                    if let result = result {
+                        let text = result.bestTranscription.formattedString
+                        if result.isFinal {
+                            log.info("Transcription: \(text, privacy: .public)")
+                            if !text.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty {
+                                self.handleFinalTranscription(text)
+                            } else {
+                                VoiceFeedback.playDeactivationChime()
+                            }
+                            self.recognitionTask = nil
+                            self.stopRecording()
                         } else {
-                            VoiceFeedback.playDeactivationChime()
+                            self.onPartialTranscription?(text)
+                            if self.currentMode == .dictation {
+                                self.overlayWindow.updatePartialTranscription(text)
+                            }
                         }
+                    }
+
+                    if let error = error {
+                        log.error("Recognition error: \(error.localizedDescription)")
                         self.recognitionTask = nil
+                        VoiceFeedback.playDeactivationChime()
                         self.stopRecording()
-                    } else {
-                        self.onPartialTranscription?(text)
-                        if self.currentMode == .dictation {
-                            self.overlayWindow.updatePartialTranscription(text)
-                        }
                     }
                 }
-
-                if let error = error {
-                    log.error("Recognition error: \(error.localizedDescription)")
-                    self.recognitionTask = nil
-                    VoiceFeedback.playDeactivationChime()
-                    self.stopRecording()
-                }
             }
         }
-
-        VoiceFeedback.playActivationChime()
     }
 
     // MARK: - Permission Prompt
@@ -765,10 +785,10 @@ final class VoiceInputManager {
 
         log.info("Permissions granted — starting recording")
         prewarmEngine()
+        self.beginRecording()
         if self.currentMode == .dictation {
             self.currentDictationContext = DictationContextCapture.capture()
         }
-        self.beginRecording()
     }
 
 
@@ -844,6 +864,23 @@ final class VoiceInputManager {
         }
         hasInstalledTap = false
 
+        // If the recognition task hasn't been started yet (async engine start
+        // still in progress), there's no callback to deliver isFinal.
+        // Clean up directly instead of waiting for a callback that won't come.
+        guard recognitionTask != nil else {
+            log.info("Recognition task not yet started — cleaning up directly")
+            recognitionRequest = nil
+            isRecording = false
+            currentDictationContext = nil
+            activeOrigin = .hotkey
+            amplitudeState.reset()
+            Self.amplitudeSubject.send(0)
+            onAmplitudeChanged?(0)
+            overlayWindow.dismiss()
+            VoiceFeedback.playDeactivationChime()
+            return
+        }
+
         // Signal end of audio — the recognizer will process remaining audio
         // and fire the callback with isFinal = true.
         recognitionRequest?.endAudio()
diff --git a/clients/macos/vellum-assistant/Features/Voice/AudioEngineController.swift b/clients/macos/vellum-assistant/Features/Voice/AudioEngineController.swift
index cfe6ed34f3b..0bf0419b484 100644
--- a/clients/macos/vellum-assistant/Features/Voice/AudioEngineController.swift
+++ b/clients/macos/vellum-assistant/Features/Voice/AudioEngineController.swift
@@ -16,14 +16,49 @@ private let log = Logger(subsystem: Bundle.appBundleIdentifier, category: "Audio
 /// ensure `prewarm()` has run first so `inputNode` is already initialized and
 /// sync calls complete in sub-milliseconds.
 ///
+/// Listens for `AVAudioEngineConfigurationChange` notifications to re-warm
+/// `inputNode` after audio route changes (Bluetooth connect/disconnect,
+/// AirPods mode switch, USB mic plug/unplug).
+///
 /// See: https://developer.apple.com/documentation/avfaudio/avaudionode/1387122-installtap
 final class AudioEngineController: @unchecked Sendable {
 
     private let audioEngine = AVAudioEngine()
     private let queue: DispatchQueue
+    private var configChangeObserver: (any NSObjectProtocol)?
 
     init(label: String = "com.vellum.audioEngine") {
         self.queue = DispatchQueue(label: label, qos: .userInitiated)
+        observeConfigurationChanges()
+    }
+
+    deinit {
+        if let observer = configChangeObserver {
+            NotificationCenter.default.removeObserver(observer)
+        }
+    }
+
+    // MARK: - Configuration Change Monitoring
+
+    /// Re-prewarm `inputNode` when the audio hardware configuration changes
+    /// (Bluetooth device connect/disconnect, USB mic plug/unplug, AirPods
+    /// mode switch). Keeps the cached inputNode format fresh so subsequent
+    /// `installTapAndStart` calls complete in sub-milliseconds.
+    ///
+    /// See: https://developer.apple.com/documentation/avfaudio/avaudioengine/1386063-configurationchangenotification
+    private func observeConfigurationChanges() {
+        configChangeObserver = NotificationCenter.default.addObserver(
+            forName: .AVAudioEngineConfigurationChange,
+            object: audioEngine,
+            queue: nil
+        ) { [weak self] _ in
+            guard let self else { return }
+            log.info("Audio configuration changed — re-warming inputNode")
+            self.queue.async {
+                let _ = self.audioEngine.inputNode
+                log.info("Audio engine re-warmed after configuration change")
+            }
+        }
     }
 
     // MARK: - Pre-warm
@@ -94,25 +129,52 @@ final class AudioEngineController: @unchecked Sendable {
         block: @escaping AVAudioNodeTapBlock
     ) -> Bool {
         queue.sync { [self] in
-            let inputNode = audioEngine.inputNode
-            let format = inputNode.outputFormat(forBus: 0)
-            guard format.channelCount > 0, format.sampleRate > 0 else {
-                log.error("Invalid audio format — channels: \(format.channelCount), sampleRate: \(format.sampleRate)")
-                return false
+            installTapAndStartImpl(bufferSize: bufferSize, block: block)
+        }
+    }
+
+    /// Non-blocking variant of `installTapAndStart` using Swift concurrency.
+    /// Dispatches to the audio queue asynchronously and returns the result via
+    /// async/await, keeping the caller's thread free during engine initialization.
+    ///
+    /// Use this for latency-sensitive flows (e.g. PTT dictation) where showing
+    /// immediate UI feedback before the engine is ready improves perceived
+    /// responsiveness.
+    func installTapAndStartAsync(
+        bufferSize: AVAudioFrameCount,
+        block: @escaping AVAudioNodeTapBlock
+    ) async -> Bool {
+        await withCheckedContinuation { continuation in
+            queue.async { [self] in
+                let success = installTapAndStartImpl(bufferSize: bufferSize, block: block)
+                continuation.resume(returning: success)
             }
+        }
+    }
 
+    /// Shared implementation for both sync and async tap+start paths.
+    private func installTapAndStartImpl(
+        bufferSize: AVAudioFrameCount,
+        block: @escaping AVAudioNodeTapBlock
+    ) -> Bool {
+        let inputNode = audioEngine.inputNode
+        let format = inputNode.outputFormat(forBus: 0)
+        guard format.channelCount > 0, format.sampleRate > 0 else {
+            log.error("Invalid audio format — channels: \(format.channelCount), sampleRate: \(format.sampleRate)")
+            return false
+        }
+
+        inputNode.removeTap(onBus: 0)
+        inputNode.installTap(onBus: 0, bufferSize: bufferSize, format: nil, block: block)
+
+        audioEngine.prepare()
+        do {
+            try audioEngine.start()
+            return true
+        } catch {
+            log.error("Failed to start audio engine: \(error.localizedDescription)")
             inputNode.removeTap(onBus: 0)
-            inputNode.installTap(onBus: 0, bufferSize: bufferSize, format: nil, block: block)
-
-            audioEngine.prepare()
-            do {
-                try audioEngine.start()
-                return true
-            } catch {
-                log.error("Failed to start audio engine: \(error.localizedDescription)")
-                inputNode.removeTap(onBus: 0)
-                return false
-            }
+            return false
         }
     }
 
diff --git a/clients/macos/vellum-assistant/Features/Voice/OpenAIVoiceService.swift b/clients/macos/vellum-assistant/Features/Voice/OpenAIVoiceService.swift
index 3bb94bf899c..50aa1b807ef 100644
--- a/clients/macos/vellum-assistant/Features/Voice/OpenAIVoiceService.swift
+++ b/clients/macos/vellum-assistant/Features/Voice/OpenAIVoiceService.swift
@@ -154,7 +154,8 @@ final class OpenAIVoiceService: VoiceServiceProtocol {
 
         // Reuse existing SFSpeechRecognizer across turns to avoid OS resource
         // release delays that make isAvailable return false on the second turn.
-        if speechRecognizer == nil {
+        // Recreate if transiently unavailable (e.g. after sleep/wake or heavy use).
+        if speechRecognizer == nil || speechRecognizer?.isAvailable != true {
             speechRecognizer = SFSpeechRecognizer(locale: Locale(identifier: "en-US"))
         }
         guard let recognizer = speechRecognizer, recognizer.isAvailable else {

From 86a87f29f131b5a1a1fccb9faeb78a242bbaf843 Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Mon, 6 Apr 2026 19:01:11 +0000
Subject: [PATCH 2/5] Tear down engine when async startup outlives recording
 session

When PTT is released before installTapAndStartAsync completes, the
isRecording guard now stops the engine and removes its tap if the engine
started successfully, preventing the mic path from staying alive with no
active recording session.

Co-Authored-By: tkheyfets <timur@vellum.ai>
---
 .../vellum-assistant/App/VoiceInputManager.swift     | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/clients/macos/vellum-assistant/App/VoiceInputManager.swift b/clients/macos/vellum-assistant/App/VoiceInputManager.swift
index 097bdee93d7..9a0929ab74f 100644
--- a/clients/macos/vellum-assistant/App/VoiceInputManager.swift
+++ b/clients/macos/vellum-assistant/App/VoiceInputManager.swift
@@ -707,8 +707,16 @@ final class VoiceInputManager {
                 bufferSize: 1024,
                 block: tapBlock
             )
-            // Recording may have been stopped while the engine was starting
-            guard self.isRecording else { return }
+            // Recording may have been stopped while the engine was starting.
+            // If so, tear down the engine that just started to avoid leaving
+            // the mic path alive with no active recording session.
+            guard self.isRecording else {
+                if success {
+                    self.engineController.stopAndRemoveTap()
+                    log.info("Engine started after recording stopped — tore down")
+                }
+                return
+            }
             guard success else {
                 log.error("Audio engine failed to start — invalid format or engine error")
                 self.isRecording = false

From f1b8089ff2541010080c20004b0398df82102172 Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Mon, 6 Apr 2026 19:15:45 +0000
Subject: [PATCH 3/5] Add recording generation token and gate context capture
 on start success

Co-Authored-By: tkheyfets <timur@vellum.ai>
---
 .../App/VoiceInputManager.swift               | 21 ++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/clients/macos/vellum-assistant/App/VoiceInputManager.swift b/clients/macos/vellum-assistant/App/VoiceInputManager.swift
index 9a0929ab74f..ea5bcb69e07 100644
--- a/clients/macos/vellum-assistant/App/VoiceInputManager.swift
+++ b/clients/macos/vellum-assistant/App/VoiceInputManager.swift
@@ -82,6 +82,12 @@ final class VoiceInputManager {
     /// Guards against double-start/double-stop from rapid key events.
     private var isActivatorHeld = false
 
+    /// Monotonically increasing counter identifying the current recording
+    /// session. The async engine-start Task captures this value and checks
+    /// it after `await` — if it no longer matches, the completion belongs
+    /// to a stale session and is discarded.
+    private var recordingGeneration: UInt64 = 0
+
     /// Whether `start()` has been called (monitors are active).
     /// Used to guard against duplicate registration from deferred startup.
     private(set) var hasStarted = false
@@ -559,6 +565,7 @@ final class VoiceInputManager {
     /// instead of going through DictationTextInserter which would double-insert.
     private func captureContextAndBeginRecording() {
         beginRecording()
+        guard isRecording else { return }
         if currentMode == .dictation {
             let isVellumFrontmost = NSWorkspace.shared.frontmostApplication?.bundleIdentifier == Bundle.main.bundleIdentifier
             if !isVellumFrontmost {
@@ -650,6 +657,8 @@ final class VoiceInputManager {
         // Show recording state and play chime immediately for instant feedback.
         // The audio engine starts asynchronously below — the user hears/sees the
         // activation before the engine is ready, hiding hardware latency.
+        recordingGeneration &+= 1
+        let generation = recordingGeneration
         isRecording = true
         onRecordingStateChanged?(true)
         if currentMode == .dictation {
@@ -707,13 +716,14 @@ final class VoiceInputManager {
                 bufferSize: 1024,
                 block: tapBlock
             )
-            // Recording may have been stopped while the engine was starting.
-            // If so, tear down the engine that just started to avoid leaving
-            // the mic path alive with no active recording session.
-            guard self.isRecording else {
+            // Verify this completion belongs to the current recording session.
+            // A quick release/retry can cause session A's completion to arrive
+            // while session B is active — using the stale request would
+            // desynchronize recognitionTask/recognitionRequest ownership.
+            guard self.isRecording, self.recordingGeneration == generation else {
                 if success {
                     self.engineController.stopAndRemoveTap()
-                    log.info("Engine started after recording stopped — tore down")
+                    log.info("Engine started for stale generation \(generation) (current \(self.recordingGeneration)) — tore down")
                 }
                 return
             }
@@ -794,6 +804,7 @@ final class VoiceInputManager {
         log.info("Permissions granted — starting recording")
         prewarmEngine()
         self.beginRecording()
+        guard self.isRecording else { return }
         if self.currentMode == .dictation {
             self.currentDictationContext = DictationContextCapture.capture()
         }

From 0abb0c93a77264acb6549873e8f10a5abc213670 Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Mon, 6 Apr 2026 19:24:54 +0000
Subject: [PATCH 4/5] Guard stale teardown against active sessions and gate
 rewarm on mic auth

Co-Authored-By: tkheyfets <timur@vellum.ai>
---
 .../macos/vellum-assistant/App/VoiceInputManager.swift   | 9 +++++++--
 .../Features/Voice/AudioEngineController.swift           | 5 +++++
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/clients/macos/vellum-assistant/App/VoiceInputManager.swift b/clients/macos/vellum-assistant/App/VoiceInputManager.swift
index ea5bcb69e07..bc035d9fd97 100644
--- a/clients/macos/vellum-assistant/App/VoiceInputManager.swift
+++ b/clients/macos/vellum-assistant/App/VoiceInputManager.swift
@@ -721,9 +721,14 @@ final class VoiceInputManager {
             // while session B is active — using the stale request would
             // desynchronize recognitionTask/recognitionRequest ownership.
             guard self.isRecording, self.recordingGeneration == generation else {
-                if success {
+                // Only tear down if no session is currently active. When a newer
+                // session is running (isRecording true, generation mismatch),
+                // it owns the engine — tearing down here would remove its tap.
+                if success, !self.isRecording {
                     self.engineController.stopAndRemoveTap()
-                    log.info("Engine started for stale generation \(generation) (current \(self.recordingGeneration)) — tore down")
+                    log.info("Engine started for stale generation \(generation) — tore down (no active session)")
+                } else if success {
+                    log.info("Stale generation \(generation) completed — skipping teardown, session \(self.recordingGeneration) owns engine")
                 }
                 return
             }
diff --git a/clients/macos/vellum-assistant/Features/Voice/AudioEngineController.swift b/clients/macos/vellum-assistant/Features/Voice/AudioEngineController.swift
index 0bf0419b484..d0ecd229b3c 100644
--- a/clients/macos/vellum-assistant/Features/Voice/AudioEngineController.swift
+++ b/clients/macos/vellum-assistant/Features/Voice/AudioEngineController.swift
@@ -1,4 +1,5 @@
 import AVFoundation
+import AVFAudio
 import os
 
 private let log = Logger(subsystem: Bundle.appBundleIdentifier, category: "AudioEngineController")
@@ -53,6 +54,10 @@ final class AudioEngineController: @unchecked Sendable {
             queue: nil
         ) { [weak self] _ in
             guard let self else { return }
+            guard AVCaptureDevice.authorizationStatus(for: .audio) == .authorized else {
+                log.info("Audio configuration changed — skipping re-warm (mic not authorized)")
+                return
+            }
             log.info("Audio configuration changed — re-warming inputNode")
             self.queue.async {
                 let _ = self.audioEngine.inputNode

From d757193141f9f8ac1a8f747bc7d24723981030aa Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Mon, 6 Apr 2026 19:37:15 +0000
Subject: [PATCH 5/5] Move context capture to Task.detached to avoid blocking
 main actor

Co-Authored-By: tkheyfets <timur@vellum.ai>
---
 .../App/VoiceInputManager.swift               | 27 +++++++++++++++----
 1 file changed, 22 insertions(+), 5 deletions(-)

diff --git a/clients/macos/vellum-assistant/App/VoiceInputManager.swift b/clients/macos/vellum-assistant/App/VoiceInputManager.swift
index bc035d9fd97..84fc1d800c3 100644
--- a/clients/macos/vellum-assistant/App/VoiceInputManager.swift
+++ b/clients/macos/vellum-assistant/App/VoiceInputManager.swift
@@ -556,9 +556,10 @@ final class VoiceInputManager {
     }
 
     /// Start recording immediately for instant UI feedback, then capture
-    /// frontmost app context. The engine starts asynchronously on its audio
-    /// queue while context capture runs on the main thread — both happen
-    /// concurrently, eliminating the sequential 600ms + 2s worst-case delay.
+    /// frontmost app context off the main actor. The engine starts
+    /// asynchronously on its audio queue while context capture runs on a
+    /// detached Task — both happen concurrently without blocking the main
+    /// actor, so key-up events are processed immediately.
     ///
     /// When Vellum itself is the frontmost app, skip context capture so the
     /// transcription falls through to the conversation path (auto-submit to chat)
@@ -569,7 +570,15 @@ final class VoiceInputManager {
         if currentMode == .dictation {
             let isVellumFrontmost = NSWorkspace.shared.frontmostApplication?.bundleIdentifier == Bundle.main.bundleIdentifier
             if !isVellumFrontmost {
-                currentDictationContext = DictationContextCapture.capture()
+                let generation = recordingGeneration
+                Task.detached { [weak self] in
+                    let context = DictationContextCapture.capture()
+                    await MainActor.run { [weak self] in
+                        guard let self else { return }
+                        guard self.isRecording, self.recordingGeneration == generation else { return }
+                        self.currentDictationContext = context
+                    }
+                }
             }
         }
     }
@@ -811,7 +820,15 @@ final class VoiceInputManager {
         self.beginRecording()
         guard self.isRecording else { return }
         if self.currentMode == .dictation {
-            self.currentDictationContext = DictationContextCapture.capture()
+            let generation = self.recordingGeneration
+            Task.detached { [weak self] in
+                let context = DictationContextCapture.capture()
+                await MainActor.run { [weak self] in
+                    guard let self else { return }
+                    guard self.isRecording, self.recordingGeneration == generation else { return }
+                    self.currentDictationContext = context
+                }
+            }
         }
     }