fix(macos): lazy-init AVAudioEngine to prevent Bluetooth audio ducking

Creating AVAudioEngine at singleton init time causes macOS to switch
Bluetooth headphones from A2DP (high quality) to HFP (headset) profile,
resulting in degraded audio quality even when Voice Wake is disabled.

This change makes audioEngine optional and only creates it when voice
recognition actually starts, preventing the profile switch for users
who don't use Voice Wake.

Fixes #30

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Tu Nombre Real
2025-12-14 08:14:50 +01:00
committed by Peter Steinberger
parent 2b0f846f1b
commit 5e8c8367f3
2 changed files with 39 additions and 17 deletions

View File

@@ -80,7 +80,9 @@ actor VoicePushToTalk {
     private let logger = Logger(subsystem: "com.steipete.clawdis", category: "voicewake.ptt")
     private var recognizer: SFSpeechRecognizer?
-    private var audioEngine = AVAudioEngine()
+    // Lazily created on begin() to avoid creating an AVAudioEngine at app launch, which can switch Bluetooth
+    // headphones into the low-quality headset profile even if push-to-talk is never used.
+    private var audioEngine: AVAudioEngine?
     private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest?
     private var recognitionTask: SFSpeechRecognitionTask?
     private var tapInstalled = false
@@ -166,7 +168,7 @@ actor VoicePushToTalk {
         // Stop feeding Speech buffers first, then end the request. Stopping the engine here can race with
         // Speech draining its converter chain (and we already stop/cancel in finalize).
         if self.tapInstalled {
-            self.audioEngine.inputNode.removeTap(onBus: 0)
+            self.audioEngine?.inputNode.removeTap(onBus: 0)
             self.tapInstalled = false
         }
         self.recognitionRequest?.endAudio()
@@ -201,7 +203,13 @@ actor VoicePushToTalk {
         self.recognitionRequest?.shouldReportPartialResults = true
         guard let request = self.recognitionRequest else { return }
-        let input = self.audioEngine.inputNode
+        // Lazily create the engine here so app launch doesn't grab audio resources / trigger Bluetooth HFP.
+        if self.audioEngine == nil {
+            self.audioEngine = AVAudioEngine()
+        }
+        guard let audioEngine = self.audioEngine else { return }
+        let input = audioEngine.inputNode
         let format = input.outputFormat(forBus: 0)
         if self.tapInstalled {
             input.removeTap(onBus: 0)
@@ -213,8 +221,8 @@ actor VoicePushToTalk {
         }
         self.tapInstalled = true
-        self.audioEngine.prepare()
-        try self.audioEngine.start()
+        audioEngine.prepare()
+        try audioEngine.start()
         self.recognitionTask = recognizer.recognitionTask(with: request) { [weak self] result, error in
             guard let self else { return }
@@ -301,13 +309,15 @@ actor VoicePushToTalk {
         self.recognitionRequest = nil
         self.recognitionTask = nil
         if self.tapInstalled {
-            self.audioEngine.inputNode.removeTap(onBus: 0)
+            self.audioEngine?.inputNode.removeTap(onBus: 0)
             self.tapInstalled = false
         }
-        if self.audioEngine.isRunning {
-            self.audioEngine.stop()
-            self.audioEngine.reset()
+        if self.audioEngine?.isRunning == true {
+            self.audioEngine?.stop()
+            self.audioEngine?.reset()
         }
+        // Release the engine so we also release any audio session/resources when push-to-talk ends.
+        self.audioEngine = nil
         self.committed = ""
         self.volatile = ""

View File

@@ -15,7 +15,9 @@ actor VoiceWakeRuntime {
     private let logger = Logger(subsystem: "com.steipete.clawdis", category: "voicewake.runtime")
     private var recognizer: SFSpeechRecognizer?
-    private var audioEngine = AVAudioEngine()
+    // Lazily created on start to avoid creating an AVAudioEngine at app launch, which can switch Bluetooth
+    // headphones into the low-quality headset profile even if Voice Wake is disabled.
+    private var audioEngine: AVAudioEngine?
     private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest?
     private var recognitionTask: SFSpeechRecognitionTask?
     private var recognitionGeneration: Int = 0 // drop stale callbacks after restarts
@@ -54,8 +56,10 @@ actor VoiceWakeRuntime {
         self.recognitionTask = nil
         self.recognitionRequest?.endAudio()
         self.recognitionRequest = nil
-        self.audioEngine.inputNode.removeTap(onBus: 0)
-        self.audioEngine.stop()
+        self.audioEngine?.inputNode.removeTap(onBus: 0)
+        self.audioEngine?.stop()
+        // Release the engine so we also release any audio session/resources when Voice Wake is idle.
+        self.audioEngine = nil
     }

     struct RuntimeConfig: Equatable {
@@ -115,7 +119,13 @@ actor VoiceWakeRuntime {
         self.recognitionRequest?.shouldReportPartialResults = true
         guard let request = self.recognitionRequest else { return }
-        let input = self.audioEngine.inputNode
+        // Lazily create the engine here so app launch doesn't grab audio resources / trigger Bluetooth HFP.
+        if self.audioEngine == nil {
+            self.audioEngine = AVAudioEngine()
+        }
+        guard let audioEngine = self.audioEngine else { return }
+        let input = audioEngine.inputNode
         let format = input.outputFormat(forBus: 0)
         input.removeTap(onBus: 0)
         input.installTap(onBus: 0, bufferSize: 2048, format: format) { [weak self, weak request] buffer, _ in
@@ -127,8 +137,8 @@ actor VoiceWakeRuntime {
             }
         }
-        self.audioEngine.prepare()
-        try self.audioEngine.start()
+        audioEngine.prepare()
+        try audioEngine.start()
         self.currentConfig = config
         self.lastHeard = Date()
@@ -168,8 +178,10 @@ actor VoiceWakeRuntime {
         self.recognitionTask = nil
         self.recognitionRequest?.endAudio()
         self.recognitionRequest = nil
-        self.audioEngine.inputNode.removeTap(onBus: 0)
-        self.audioEngine.stop()
+        self.audioEngine?.inputNode.removeTap(onBus: 0)
+        self.audioEngine?.stop()
+        // Release the engine so we also release any audio session/resources when Voice Wake is disabled/stopped.
+        self.audioEngine = nil
         self.currentConfig = nil
         self.listeningState = .idle
         self.logger.debug("voicewake runtime stopped")