Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,11 @@ private let log = Logger(subsystem: Bundle.appBundleIdentifier, category: "Audio
/// audio-subsystem queue. When that queue is contended (hardware state changes,
/// Bluetooth negotiation, coreaudiod latency), the wait can exceed 2 seconds.
///
/// Fire-and-forget operations (`installTap`, `removeTap`, `stop`, `reset`)
/// use `queue.async` so the caller never blocks. Methods that return a value
/// (`inputNodeFormat`, `prepareAndStart`) or that require ordering guarantees
/// (`tearDown`, `stopAndRemoveTap` — callers call `endAudio()` immediately
/// after) use `queue.sync`. Callers should ensure `prewarm()` has run first
/// so `inputNode` is already initialized and sync calls complete in
/// sub-milliseconds.
/// Fire-and-forget operations (`stop`, `reset`) use `queue.async` so the caller
/// never blocks. Methods that require ordering guarantees (`tearDown`,
/// `stopAndRemoveTap`, `installTapAndStart`) use `queue.sync`. Callers should
/// ensure `prewarm()` has run first so `inputNode` is already initialized and
/// sync calls complete in sub-milliseconds.
///
/// See: https://developer.apple.com/documentation/avfaudio/avaudionode/1387122-installtap
final class AudioEngineController: @unchecked Sendable {
Expand All @@ -28,18 +26,6 @@ final class AudioEngineController: @unchecked Sendable {
self.queue = DispatchQueue(label: label, qos: .userInitiated)
}

// MARK: - Input Node Format

/// Reads the format the input node currently reports on output bus 0.
///
/// The read is dispatched synchronously onto `queue`, so the caller blocks
/// until it has completed.
///
/// - Returns: The reported format, or `nil` when its channel count or its
///   sample rate is not greater than zero.
func inputNodeFormat() -> AVAudioFormat? {
    queue.sync { [self] () -> AVAudioFormat? in
        let busFormat = audioEngine.inputNode.outputFormat(forBus: 0)
        // A format without channels or without a positive sample rate is
        // reported to the caller as "no format".
        let isUsable = busFormat.channelCount > 0 && busFormat.sampleRate > 0
        return isUsable ? busFormat : nil
    }
}

// MARK: - Pre-warm

/// Touch `inputNode` to force lazy initialization of the audio subsystem.
Expand All @@ -51,46 +37,8 @@ final class AudioEngineController: @unchecked Sendable {
}
}

// MARK: - Tap Management

/// Replaces whatever tap is installed on input bus 0 with a new one.
///
/// The work is submitted with `queue.async`, so this call returns before the
/// tap is actually installed. Because `queue` is serial, a later `queue.sync`
/// call (e.g. `prepareAndStart`) runs only after this work has finished.
/// If the controller has been deallocated by the time the work runs, it is
/// skipped.
///
/// - Parameters:
///   - bufferSize: Buffer size forwarded to the input node's `installTap`.
///   - format: Format forwarded to the input node's `installTap`; may be `nil`.
///   - block: Tap callback forwarded to the input node's `installTap`.
func installTap(
    bufferSize: AVAudioFrameCount,
    format: AVAudioFormat?,
    block: @escaping AVAudioNodeTapBlock
) {
    queue.async { [weak self] in
        guard let controller = self else { return }
        let micNode = controller.audioEngine.inputNode
        // Clear any previous tap first, then install the replacement.
        micNode.removeTap(onBus: 0)
        micNode.installTap(
            onBus: 0,
            bufferSize: bufferSize,
            format: format,
            block: block
        )
    }
}

/// Asynchronously removes the bus-0 tap from the input node.
///
/// Submitted with `queue.async`, so the caller does not wait for the removal.
/// If the controller has been deallocated by the time the work runs, it is
/// skipped.
func removeTap() {
    queue.async { [weak self] in
        guard let controller = self else { return }
        let micNode = controller.audioEngine.inputNode
        micNode.removeTap(onBus: 0)
    }
}

// MARK: - Engine Lifecycle

/// Asynchronously calls `prepare()` on the audio engine.
///
/// Submitted with `queue.async`, so the caller does not wait. If the
/// controller has been deallocated by the time the work runs, it is skipped.
func prepare() {
    queue.async { [weak self] in
        guard let controller = self else { return }
        controller.audioEngine.prepare()
    }
}

/// Starts the audio engine synchronously on `queue`.
///
/// The caller blocks until the start attempt has finished.
///
/// - Throws: Rethrows whatever error `audioEngine.start()` throws.
func start() throws {
    try queue.sync { [self] () throws -> Void in
        try self.audioEngine.start()
    }
}

func stop() {
queue.async { [weak self] in
guard let self else { return }
Expand Down Expand Up @@ -122,18 +70,25 @@ final class AudioEngineController: @unchecked Sendable {

// MARK: - Combined Operations

/// Atomically reads the input format, installs a tap, and starts the engine
/// in a single synchronous dispatch to the audio queue.
/// Atomically validates audio input, installs a tap with `nil` format, and
/// starts the engine in a single synchronous dispatch to the audio queue.
///
/// Passing `nil` for `installTap`'s format parameter lets AVAudioEngine use
/// its own internal hardware format, which is always self-consistent. This
/// prevents `NSInternalInconsistencyException` crashes caused by
/// `format.sampleRate != hwFormat.sampleRate` — the cached format from
/// `outputFormat(forBus:)` can diverge from the engine's internal hardware
/// format after audio route changes (Bluetooth, USB mic, AirPods mode
/// switch), even within a single synchronous block.
///
/// Eliminates the TOCTOU race where the format read by `inputNodeFormat()`
/// becomes stale before the separate `installTap()` async block executes —
/// which crashes with `NSInternalInconsistencyException` when the hardware
/// format changes between calls (common on first use after permission grant).
/// The format validation (channels > 0, sampleRate > 0) is kept as a
/// pre-check to detect "no audio input available" — but the validated format
/// is **not** forwarded to `installTap`.
///
/// Returns `true` on success, or `false` if the format is invalid or the
/// engine fails to start.
/// Returns `true` on success, or `false` if no audio input is available or
/// the engine fails to start.
///
/// See: https://developer.apple.com/documentation/avfaudio/avaudionode/1387122-installtap
/// See: https://developer.apple.com/documentation/avfaudio/avaudionode/installtap(onbus:buffersize:format:block:)
func installTapAndStart(
bufferSize: AVAudioFrameCount,
block: @escaping AVAudioNodeTapBlock
Expand All @@ -147,7 +102,7 @@ final class AudioEngineController: @unchecked Sendable {
}

inputNode.removeTap(onBus: 0)
inputNode.installTap(onBus: 0, bufferSize: bufferSize, format: format, block: block)
inputNode.installTap(onBus: 0, bufferSize: bufferSize, format: nil, block: block)

audioEngine.prepare()
do {
Expand All @@ -161,23 +116,6 @@ final class AudioEngineController: @unchecked Sendable {
}
}

/// Prepares the engine and attempts to start it in a single synchronous
/// dispatch onto `queue`.
///
/// If starting throws, the error is logged and the bus-0 tap is removed from
/// the input node.
///
/// - Returns: `true` when the engine started, `false` when `start()` threw.
@discardableResult
func prepareAndStart() -> Bool {
    queue.sync { [self] () -> Bool in
        audioEngine.prepare()
        do {
            try audioEngine.start()
        } catch {
            log.error("Failed to start audio engine: \(error.localizedDescription)")
            // Remove the tap on the failure path.
            audioEngine.inputNode.removeTap(onBus: 0)
            return false
        }
        return true
    }
}

/// Stop the engine and remove the input tap (if running).
/// Uses `sync` because callers depend on the tap being removed before
/// they call `recognitionRequest?.endAudio()` — appending audio after
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -152,11 +152,6 @@ final class OpenAIVoiceService: VoiceServiceProtocol {
latestTranscription = ""
livePartialText = ""

guard let format = engineController.inputNodeFormat() else {
log.error("No audio input channels")
return false
}

// Reuse existing SFSpeechRecognizer across turns to avoid OS resource
// release delays that make isAvailable return false on the second turn.
if speechRecognizer == nil {
Expand Down Expand Up @@ -225,8 +220,10 @@ final class OpenAIVoiceService: VoiceServiceProtocol {
}
}

// Install audio tap — feeds buffers to SFSpeechRecognizer + computes RMS for amplitude
engineController.installTap(bufferSize: 4096, format: format) { [weak self] buffer, _ in
// Atomically validate format, install tap, and start engine.
// Passes nil for format so AVAudioEngine uses its internal hardware
// format, preventing sampleRate mismatch crashes.
guard engineController.installTapAndStart(bufferSize: 4096, block: { [weak self] buffer, _ in
guard let floatData = buffer.floatChannelData else { return }
let frameCount = Int(buffer.frameLength)
guard frameCount > 0 else { return }
Expand Down Expand Up @@ -269,24 +266,17 @@ final class OpenAIVoiceService: VoiceServiceProtocol {
self.onSilenceDetected?()
}
}
}

// prepare() is async, start() is sync — serial queue guarantees
// prepare() completes before start() executes.
engineController.prepare()
do {
try engineController.start()
isRecording = true
lastSpeechTime = Date()
recordingStartTime = Date()
log.info("Recording started (SFSpeechRecognizer, onDevice: \(recognizer.supportsOnDeviceRecognition, privacy: .public))")
return true
} catch {
log.error("Failed to start audio engine: \(error.localizedDescription)")
engineController.removeTap()
}) else {
log.error("Failed to start audio engine for recording")
tearDownRecognition()
return false
}

isRecording = true
lastSpeechTime = Date()
recordingStartTime = Date()
log.info("Recording started (SFSpeechRecognizer, onDevice: \(recognizer.supportsOnDeviceRecognition, privacy: .public))")
return true
}

/// Stop recording and return the transcription from SFSpeechRecognizer.
Expand Down Expand Up @@ -453,12 +443,10 @@ final class OpenAIVoiceService: VoiceServiceProtocol {
guard !bargeInMonitorActive else { return }
bargeInMonitorActive = true

guard let format = engineController.inputNodeFormat() else {
bargeInMonitorActive = false
return
}

engineController.installTap(bufferSize: 4096, format: format) { [weak self] buffer, _ in
// Atomically validate format, install tap, and start engine.
// Passes nil for format so AVAudioEngine uses its internal hardware
// format, preventing sampleRate mismatch crashes.
if engineController.installTapAndStart(bufferSize: 4096, block: { [weak self] buffer, _ in
guard let floatData = buffer.floatChannelData else { return }
let frameCount = Int(buffer.frameLength)
guard frameCount > 0 else { return }
Expand All @@ -479,9 +467,7 @@ final class OpenAIVoiceService: VoiceServiceProtocol {
self.onBargeInDetected?()
}
}
}

if engineController.prepareAndStart() {
}) {
log.info("Barge-in monitor started")
} else {
log.error("Failed to start barge-in monitor")
Expand Down