fix(ptt): ignore stale recognition callbacks

This commit is contained in:
Peter Steinberger
2025-12-09 19:17:08 +01:00
parent a3bf2bdd8c
commit 0d4bf1c15a

View File

@@ -84,6 +84,9 @@ actor VoicePushToTalk {
private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest? private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest?
private var recognitionTask: SFSpeechRecognitionTask? private var recognitionTask: SFSpeechRecognitionTask?
// Session token used to drop stale callbacks when a new capture starts.
private var sessionID = UUID()
private var committed: String = "" private var committed: String = ""
private var volatile: String = "" private var volatile: String = ""
private var activeConfig: Config? private var activeConfig: Config?
@@ -106,6 +109,10 @@ actor VoicePushToTalk {
guard voiceWakeSupported else { return } guard voiceWakeSupported else { return }
guard !self.isCapturing else { return } guard !self.isCapturing else { return }
// Start a fresh session and invalidate any in-flight callbacks tied to an older one.
let sessionID = UUID()
self.sessionID = sessionID
// Ensure permissions up front. // Ensure permissions up front.
let granted = await PermissionManager.ensureVoiceWakePermissions(interactive: true) let granted = await PermissionManager.ensureVoiceWakePermissions(interactive: true)
guard granted else { return } guard granted else { return }
@@ -139,7 +146,7 @@ actor VoicePushToTalk {
} }
do { do {
try await self.startRecognition(localeID: config.localeID) try await self.startRecognition(localeID: config.localeID, sessionID: sessionID)
} catch { } catch {
await MainActor.run { await MainActor.run {
VoiceWakeOverlayController.shared.dismiss() VoiceWakeOverlayController.shared.dismiss()
@@ -151,6 +158,7 @@ actor VoicePushToTalk {
func end() async { func end() async {
guard self.isCapturing else { return } guard self.isCapturing else { return }
self.isCapturing = false self.isCapturing = false
let sessionID = self.sessionID
self.recognitionRequest?.endAudio() self.recognitionRequest?.endAudio()
self.audioEngine.inputNode.removeTap(onBus: 0) self.audioEngine.inputNode.removeTap(onBus: 0)
@@ -160,13 +168,13 @@ actor VoicePushToTalk {
self.timeoutTask?.cancel() self.timeoutTask?.cancel()
self.timeoutTask = Task { [weak self] in self.timeoutTask = Task { [weak self] in
try? await Task.sleep(nanoseconds: 1_500_000_000) // 1.5s grace period to await final result try? await Task.sleep(nanoseconds: 1_500_000_000) // 1.5s grace period to await final result
await self?.finalize(transcriptOverride: nil, reason: "timeout") await self?.finalize(transcriptOverride: nil, reason: "timeout", sessionID: sessionID)
} }
} }
// MARK: - Private // MARK: - Private
private func startRecognition(localeID: String?) async throws { private func startRecognition(localeID: String?, sessionID: UUID) async throws {
let locale = localeID.flatMap { Locale(identifier: $0) } ?? Locale(identifier: Locale.current.identifier) let locale = localeID.flatMap { Locale(identifier: $0) } ?? Locale(identifier: Locale.current.identifier)
self.recognizer = SFSpeechRecognizer(locale: locale) self.recognizer = SFSpeechRecognizer(locale: locale)
guard let recognizer, recognizer.isAvailable else { guard let recognizer, recognizer.isAvailable else {
@@ -199,17 +207,21 @@ actor VoicePushToTalk {
let transcript = result?.bestTranscription.formattedString let transcript = result?.bestTranscription.formattedString
let isFinal = result?.isFinal ?? false let isFinal = result?.isFinal ?? false
// Hop to a Task so UI updates stay off the Speech callback thread. // Hop to a Task so UI updates stay off the Speech callback thread.
Task.detached { [weak self, transcript, isFinal] in Task.detached { [weak self, transcript, isFinal, sessionID] in
guard let self else { return } guard let self else { return }
await self.handle(transcript: transcript, isFinal: isFinal) await self.handle(transcript: transcript, isFinal: isFinal, sessionID: sessionID)
if isFinal { if isFinal {
await self.finalize(transcriptOverride: transcript, reason: "speechFinal") await self.finalize(transcriptOverride: transcript, reason: "speechFinal", sessionID: sessionID)
} }
} }
} }
} }
private func handle(transcript: String?, isFinal: Bool) async { private func handle(transcript: String?, isFinal: Bool, sessionID: UUID) async {
guard sessionID == self.sessionID else {
self.logger.debug("push-to-talk drop transcript for stale session")
return
}
guard let transcript else { return } guard let transcript else { return }
if isFinal { if isFinal {
self.committed = transcript self.committed = transcript
@@ -231,9 +243,14 @@ actor VoicePushToTalk {
} }
} }
private func finalize(transcriptOverride: String?, reason: String) async { private func finalize(transcriptOverride: String?, reason: String, sessionID: UUID?) async {
if self.finalized { return } if self.finalized { return }
if let sessionID, sessionID != self.sessionID {
self.logger.debug("push-to-talk drop finalize for stale session")
return
}
self.finalized = true self.finalized = true
self.isCapturing = false
self.timeoutTask?.cancel(); self.timeoutTask = nil self.timeoutTask?.cancel(); self.timeoutTask = nil
let finalRecognized: String = { let finalRecognized: String = {