diff --git a/clients/macos/vellum-assistant/Features/Voice/AudioEngineController.swift b/clients/macos/vellum-assistant/Features/Voice/AudioEngineController.swift index 8ebdeaa3d55..cfe6ed34f3b 100644 --- a/clients/macos/vellum-assistant/Features/Voice/AudioEngineController.swift +++ b/clients/macos/vellum-assistant/Features/Voice/AudioEngineController.swift @@ -10,13 +10,11 @@ private let log = Logger(subsystem: Bundle.appBundleIdentifier, category: "Audio /// audio-subsystem queue. When that queue is contended (hardware state changes, /// Bluetooth negotiation, coreaudiod latency), the wait can exceed 2 seconds. /// -/// Fire-and-forget operations (`installTap`, `removeTap`, `stop`, `reset`) -/// use `queue.async` so the caller never blocks. Methods that return a value -/// (`inputNodeFormat`, `prepareAndStart`) or that require ordering guarantees -/// (`tearDown`, `stopAndRemoveTap` — callers call `endAudio()` immediately -/// after) use `queue.sync`. Callers should ensure `prewarm()` has run first -/// so `inputNode` is already initialized and sync calls complete in -/// sub-milliseconds. +/// Fire-and-forget operations (`stop`, `reset`) use `queue.async` so the caller +/// never blocks. Methods that require ordering guarantees (`tearDown`, +/// `stopAndRemoveTap`, `installTapAndStart`) use `queue.sync`. Callers should +/// ensure `prewarm()` has run first so `inputNode` is already initialized and +/// sync calls complete in sub-milliseconds. /// /// See: https://developer.apple.com/documentation/avfaudio/avaudionode/1387122-installtap final class AudioEngineController: @unchecked Sendable { @@ -28,18 +26,6 @@ final class AudioEngineController: @unchecked Sendable { self.queue = DispatchQueue(label: label, qos: .userInitiated) } - // MARK: - Input Node Format - - /// Returns the input node's output format for bus 0. - /// Returns `nil` if the format has zero channels or zero sample rate. - func inputNodeFormat() -> AVAudioFormat? { - queue.sync { [self] in - let format = audioEngine.inputNode.outputFormat(forBus: 0) - guard format.channelCount > 0, format.sampleRate > 0 else { return nil } - return format - } - } - // MARK: - Pre-warm /// Touch `inputNode` to force lazy initialization of the audio subsystem. @@ -51,46 +37,8 @@ final class AudioEngineController: @unchecked Sendable { } } - // MARK: - Tap Management - - /// Remove any existing tap on bus 0, then install a new one. - /// Uses `async` — the next `queue.sync` call (e.g. `prepareAndStart`) will - /// wait for this to complete thanks to serial queue ordering. - func installTap( - bufferSize: AVAudioFrameCount, - format: AVAudioFormat?, - block: @escaping AVAudioNodeTapBlock - ) { - queue.async { [weak self] in - guard let self else { return } - let inputNode = self.audioEngine.inputNode - inputNode.removeTap(onBus: 0) - inputNode.installTap(onBus: 0, bufferSize: bufferSize, format: format, block: block) - } - } - - /// Remove the tap on bus 0 from the input node. - func removeTap() { - queue.async { [weak self] in - guard let self else { return } - self.audioEngine.inputNode.removeTap(onBus: 0) - } - } - // MARK: - Engine Lifecycle - func prepare() { - queue.async { [weak self] in - self?.audioEngine.prepare() - } - } - - func start() throws { - try queue.sync { [self] in - try audioEngine.start() - } - } - func stop() { queue.async { [weak self] in guard let self else { return } @@ -122,18 +70,25 @@ final class AudioEngineController: @unchecked Sendable { // MARK: - Combined Operations - /// Atomically reads the input format, installs a tap, and starts the engine - /// in a single synchronous dispatch to the audio queue. + /// Atomically validates audio input, installs a tap with `nil` format, and + /// starts the engine in a single synchronous dispatch to the audio queue. + /// + /// Passing `nil` for `installTap`'s format parameter lets AVAudioEngine use + /// its own internal hardware format, which is always self-consistent. This + /// prevents `NSInternalInconsistencyException` crashes caused by + /// `format.sampleRate != hwFormat.sampleRate` — the cached format from + /// `outputFormat(forBus:)` can diverge from the engine's internal hardware + /// format after audio route changes (Bluetooth, USB mic, AirPods mode + /// switch), even within a single synchronous block. /// - /// Eliminates the TOCTOU race where the format read by `inputNodeFormat()` - /// becomes stale before the separate `installTap()` async block executes — - /// which crashes with `NSInternalInconsistencyException` when the hardware - /// format changes between calls (common on first use after permission grant). + /// The format validation (channels > 0, sampleRate > 0) is kept as a + /// pre-check to detect "no audio input available" — but the validated format + /// is **not** forwarded to `installTap`. /// - /// Returns `true` on success, or `false` if the format is invalid or the - /// engine fails to start. + /// Returns `true` on success, or `false` if no audio input is available or + /// the engine fails to start. /// - /// See: https://developer.apple.com/documentation/avfaudio/avaudionode/1387122-installtap + /// See: https://developer.apple.com/documentation/avfaudio/avaudionode/installtap(onbus:buffersize:format:block:) func installTapAndStart( bufferSize: AVAudioFrameCount, block: @escaping AVAudioNodeTapBlock @@ -147,7 +102,7 @@ final class AudioEngineController: @unchecked Sendable { } inputNode.removeTap(onBus: 0) - inputNode.installTap(onBus: 0, bufferSize: bufferSize, format: format, block: block) + inputNode.installTap(onBus: 0, bufferSize: bufferSize, format: nil, block: block) audioEngine.prepare() do { @@ -161,23 +116,6 @@ final class AudioEngineController: @unchecked Sendable { } } - /// Prepare and start the engine. Returns `true` on success. - /// On failure, removes tap and returns `false`. - @discardableResult - func prepareAndStart() -> Bool { - queue.sync { [self] in - audioEngine.prepare() - do { - try audioEngine.start() - return true - } catch { - log.error("Failed to start audio engine: \(error.localizedDescription)") - audioEngine.inputNode.removeTap(onBus: 0) - return false - } - } - } - /// Stop the engine and remove the input tap (if running). /// Uses `sync` because callers depend on the tap being removed before /// they call `recognitionRequest?.endAudio()` — appending audio after diff --git a/clients/macos/vellum-assistant/Features/Voice/OpenAIVoiceService.swift b/clients/macos/vellum-assistant/Features/Voice/OpenAIVoiceService.swift index 2628a2a197a..3bb94bf899c 100644 --- a/clients/macos/vellum-assistant/Features/Voice/OpenAIVoiceService.swift +++ b/clients/macos/vellum-assistant/Features/Voice/OpenAIVoiceService.swift @@ -152,11 +152,6 @@ final class OpenAIVoiceService: VoiceServiceProtocol { latestTranscription = "" livePartialText = "" - guard let format = engineController.inputNodeFormat() else { - log.error("No audio input channels") - return false - } - // Reuse existing SFSpeechRecognizer across turns to avoid OS resource // release delays that make isAvailable return false on the second turn. if speechRecognizer == nil { @@ -225,8 +220,10 @@ final class OpenAIVoiceService: VoiceServiceProtocol { } } - // Install audio tap — feeds buffers to SFSpeechRecognizer + computes RMS for amplitude - engineController.installTap(bufferSize: 4096, format: format) { [weak self] buffer, _ in + // Atomically validate format, install tap, and start engine. + // Passes nil for format so AVAudioEngine uses its internal hardware + // format, preventing sampleRate mismatch crashes. + guard engineController.installTapAndStart(bufferSize: 4096, block: { [weak self] buffer, _ in guard let floatData = buffer.floatChannelData else { return } let frameCount = Int(buffer.frameLength) guard frameCount > 0 else { return } @@ -269,24 +266,17 @@ final class OpenAIVoiceService: VoiceServiceProtocol { self.onSilenceDetected?() } } - } - - // prepare() is async, start() is sync — serial queue guarantees - // prepare() completes before start() executes. - engineController.prepare() - do { - try engineController.start() - isRecording = true - lastSpeechTime = Date() - recordingStartTime = Date() - log.info("Recording started (SFSpeechRecognizer, onDevice: \(recognizer.supportsOnDeviceRecognition, privacy: .public))") - return true - } catch { - log.error("Failed to start audio engine: \(error.localizedDescription)") - engineController.removeTap() + }) else { + log.error("Failed to start audio engine for recording") tearDownRecognition() return false } + + isRecording = true + lastSpeechTime = Date() + recordingStartTime = Date() + log.info("Recording started (SFSpeechRecognizer, onDevice: \(recognizer.supportsOnDeviceRecognition, privacy: .public))") + return true } /// Stop recording and return the transcription from SFSpeechRecognizer. @@ -453,12 +443,10 @@ final class OpenAIVoiceService: VoiceServiceProtocol { guard !bargeInMonitorActive else { return } bargeInMonitorActive = true - guard let format = engineController.inputNodeFormat() else { - bargeInMonitorActive = false - return - } - - engineController.installTap(bufferSize: 4096, format: format) { [weak self] buffer, _ in + // Atomically validate format, install tap, and start engine. + // Passes nil for format so AVAudioEngine uses its internal hardware + // format, preventing sampleRate mismatch crashes. + if engineController.installTapAndStart(bufferSize: 4096, block: { [weak self] buffer, _ in guard let floatData = buffer.floatChannelData else { return } let frameCount = Int(buffer.frameLength) guard frameCount > 0 else { return } @@ -479,9 +467,7 @@ final class OpenAIVoiceService: VoiceServiceProtocol { self.onBargeInDetected?() } } - } - - if engineController.prepareAndStart() { + }) { log.info("Barge-in monitor started") } else { log.error("Failed to start barge-in monitor")