Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 27 additions & 7 deletions clients/macos/vellum-assistant/App/VoiceInputManager.swift
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,10 @@ final class VoiceInputManager {
/// Whether the microphone is currently recording for PTT/dictation.
private(set) var isRecording = false

/// Timestamp when the current recording session started. Used to detect
/// micro-recordings that stop almost immediately (likely failures).
private var recordingStartTime: CFAbsoluteTime = 0

/// Guards against double-start/double-stop from rapid key events.
private var isActivatorHeld = false

Expand Down Expand Up @@ -612,21 +616,23 @@ final class VoiceInputManager {
// MARK: - Recording

private func beginRecording() {
log.info("beginRecording() called — origin=\(String(describing: activeOrigin)) mode=\(String(describing: currentMode)) isRecording=\(isRecording)")

// Recreate speech recognizer if transiently unavailable (e.g. after
// sleep/wake, heavy use, or audio route changes).
if speechRecognizer?.isAvailable != true {
log.warning("Speech recognizer unavailable — recreating")
log.warning("Speech recognizer unavailable (nil=\(speechRecognizer == nil)) — recreating")
speechRecognizer = SFSpeechRecognizer(locale: Locale(identifier: "en-US"))
}
guard let speechRecognizer = speechRecognizer, speechRecognizer.isAvailable else {
log.error("Speech recognizer not available")
log.error("Speech recognizer not available after recreation attempt (nil=\(speechRecognizer == nil), available=\(speechRecognizer?.isAvailable ?? false))")
currentDictationContext = nil
return
}

// Don't start if a previous recognition task is still processing
if recognitionTask != nil {
log.warning("Previous recognition task still active, skipping")
log.warning("Previous recognition task still active (state=\(String(describing: recognitionTask?.state))), skipping")
currentDictationContext = nil
return
}
Expand All @@ -636,9 +642,11 @@ final class VoiceInputManager {
// silently opening System Settings.
let micStatus = AVCaptureDevice.authorizationStatus(for: .audio)
let speechStatus = SFSpeechRecognizer.authorizationStatus()
log.info("Permissions — mic=\(String(describing: micStatus)) speech=\(String(describing: speechStatus))")

if micStatus == .notDetermined || speechStatus == .notDetermined {
// Show a primer explaining why we need mic access, then request.
log.info("Showing permission primer (mic=\(String(describing: micStatus)) speech=\(String(describing: speechStatus)))")
currentDictationContext = nil
permissionOverlay.show(kind: .firstUse, onDismiss: {}, onContinue: { [weak self] in
Task { @MainActor in
Expand All @@ -658,6 +666,7 @@ final class VoiceInputManager {
} else {
deniedPermission = .speechRecognition
}
log.warning("Permission denied — showing overlay (mic=\(String(describing: micStatus)) speech=\(String(describing: speechStatus)))")
permissionOverlay.show(kind: .denied(deniedPermission), onDismiss: {}, onContinue: {})
currentDictationContext = nil
return
Expand All @@ -669,6 +678,7 @@ final class VoiceInputManager {
recordingGeneration &+= 1
let generation = recordingGeneration
isRecording = true
recordingStartTime = CFAbsoluteTimeGetCurrent()
onRecordingStateChanged?(true)
if currentMode == .dictation {
if activeOrigin == .chatComposer {
Expand Down Expand Up @@ -742,7 +752,8 @@ final class VoiceInputManager {
return
}
guard success else {
log.error("Audio engine failed to start — invalid format or engine error")
let elapsed = CFAbsoluteTimeGetCurrent() - self.recordingStartTime
log.error("Audio engine failed to start after \(String(format: "%.1f", elapsed))s — invalid format or engine error. Resetting engine for next attempt.")
self.isRecording = false
self.onRecordingStateChanged?(false)
self.currentDictationContext = nil
Expand All @@ -763,10 +774,12 @@ final class VoiceInputManager {
if let result = result {
let text = result.bestTranscription.formattedString
if result.isFinal {
log.info("Transcription: \(text, privacy: .public)")
let elapsed = CFAbsoluteTimeGetCurrent() - self.recordingStartTime
log.info("Final transcription after \(String(format: "%.1f", elapsed))s: \"\(text, privacy: .public)\"")
if !text.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty {
self.handleFinalTranscription(text)
} else {
log.warning("Empty final transcription after \(String(format: "%.1f", elapsed))s — stopping recording")
VoiceFeedback.playDeactivationChime()
}
self.recognitionTask = nil
Expand All @@ -780,7 +793,8 @@ final class VoiceInputManager {
}

if let error = error {
log.error("Recognition error: \(error.localizedDescription)")
let elapsed = CFAbsoluteTimeGetCurrent() - self.recordingStartTime
log.error("Recognition error after \(String(format: "%.1f", elapsed))s: \(error.localizedDescription) (domain=\((error as NSError).domain) code=\((error as NSError).code))")
self.recognitionTask = nil
VoiceFeedback.playDeactivationChime()
self.stopRecording()
Expand Down Expand Up @@ -929,13 +943,19 @@ final class VoiceInputManager {

private func stopRecording() {
guard isRecording else {
log.info("stopRecording() called but isRecording=false — tearing down audio state only")
// Even when isRecording is false, audio state may be inconsistent
// (e.g. a prior error set isRecording=false without fully cleaning up).
// Tear down unconditionally so the cancel button always works.
tearDownAudioState()
return
}

let elapsed = CFAbsoluteTimeGetCurrent() - recordingStartTime
if elapsed < 1.0 {
log.warning("Micro-recording detected: recording stopped after only \(String(format: "%.2f", elapsed))s — likely a failure, not user action")
}

isRecording = false
onRecordingStateChanged?(false)
currentDictationContext = nil
Expand All @@ -949,7 +969,7 @@ final class VoiceInputManager {
overlayWindow.dismiss()
}
awaitingDaemonResponse = false // reset for next recording
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🚩 Pre-existing: awaitingDaemonResponse reset in stopRecording() after overlay check

At line 971, awaitingDaemonResponse is reset to false immediately after the overlay-dismiss check at line 968. In the dictation flow, handleFinalTranscription sets awaitingDaemonResponse = true (line 878); the recognition callback then calls stopRecording(), which checks the flag (preserving the overlay) and then resets it. By the time handleDictationResponse arrives (lines 893-894), the flag is already false, so any code that checks awaitingDaemonResponse between stopRecording() and handleDictationResponse will see false. This is pre-existing behavior unaffected by this PR. The only consumer of the flag during that window appears to be another stopRecording() call, which would dismiss the overlay prematurely if triggered. Worth noting, but not a regression.

Open in Devin Review

Was this helpful? React with 👍 or 👎 to provide feedback.

log.info("Voice recording stopped")
log.info("Voice recording stopped after \(String(format: "%.1f", elapsed))s")

tearDownAudioState()
}
Expand Down