From d084a37e11163ec493a45924a8f107f00f0ae644 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Tue, 9 Dec 2025 04:35:13 +0100 Subject: [PATCH] feat(mac): tokenized voice overlay adoption --- .../Sources/Clawdis/VoicePushToTalk.swift | 69 ++++++++++++++----- .../Sources/Clawdis/VoiceWakeChime.swift | 8 ++- .../Sources/Clawdis/VoiceWakeRuntime.swift | 47 +++++++++---- docs/mac/voice-overlay.md | 8 ++- 4 files changed, 99 insertions(+), 33 deletions(-) diff --git a/apps/macos/Sources/Clawdis/VoicePushToTalk.swift b/apps/macos/Sources/Clawdis/VoicePushToTalk.swift index 65253fd59..4b00785e6 100644 --- a/apps/macos/Sources/Clawdis/VoicePushToTalk.swift +++ b/apps/macos/Sources/Clawdis/VoicePushToTalk.swift @@ -77,6 +77,8 @@ final class VoicePushToTalkHotkey { actor VoicePushToTalk { static let shared = VoicePushToTalk() + private let logger = Logger(subsystem: "com.steipete.clawdis", category: "voicewake.ptt") + private var recognizer: SFSpeechRecognizer? private var audioEngine = AVAudioEngine() private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest? @@ -89,6 +91,8 @@ actor VoicePushToTalk { private var triggerChimePlayed = false private var finalized = false private var timeoutTask: Task? + private var overlayToken: UUID? + private var adoptedPrefix: String = "" private struct Config { let micID: String? @@ -112,14 +116,22 @@ actor VoicePushToTalk { self.triggerChimePlayed = false self.finalized = false self.timeoutTask?.cancel(); self.timeoutTask = nil + let snapshot = await MainActor.run { VoiceWakeOverlayController.shared.snapshot() } + self.adoptedPrefix = snapshot.isVisible ? snapshot.text.trimmingCharacters(in: .whitespacesAndNewlines) : "" + self.logger.info("ptt begin adopted_prefix_len=\(self.adoptedPrefix.count, privacy: .public)") if config.triggerChime != .none { self.triggerChimePlayed = true - await MainActor.run { VoiceWakeChimePlayer.play(config.triggerChime) } + await MainActor.run { VoiceWakeChimePlayer.play(config.triggerChime, reason: "ptt.trigger") } } // Pause the always-on wake word recognizer so both pipelines don't fight over the mic tap. await VoiceWakeRuntime.shared.pauseForPushToTalk() - await MainActor.run { - VoiceWakeOverlayController.shared.showPartial(transcript: "") + let adoptedPrefix = self.adoptedPrefix + let adoptedAttributed: NSAttributedString? = adoptedPrefix.isEmpty ? nil : Self.makeAttributed(committed: adoptedPrefix, volatile: "", isFinal: false) + self.overlayToken = await MainActor.run { + VoiceWakeOverlayController.shared.startSession( + source: .pushToTalk, + transcript: adoptedPrefix, + attributed: adoptedAttributed) } do { @@ -143,7 +155,7 @@ actor VoicePushToTalk { // Give Speech a brief window to deliver the final result; otherwise fall back to current text. self.timeoutTask?.cancel() self.timeoutTask = Task { [weak self] in - try? await Task.sleep(nanoseconds: 700_000_000) // 700ms grace period + try? await Task.sleep(nanoseconds: 1_500_000_000) // 1.5s grace period to await final result await self?.finalize(transcriptOverride: nil, reason: "timeout") } } @@ -175,8 +187,7 @@ actor VoicePushToTalk { self.recognitionTask = recognizer.recognitionTask(with: request) { [weak self] result, error in guard let self else { return } if let error { - Logger(subsystem: "com.steipete.clawdis", category: "voicewake.ptt") - .debug("push-to-talk error: \(error.localizedDescription, privacy: .public)") + self.logger.debug("push-to-talk error: \(error.localizedDescription, privacy: .public)") } let transcript = result?.bestTranscription.formattedString let isFinal = result?.isFinal ?? false @@ -200,10 +211,13 @@ actor VoicePushToTalk { self.volatile = Self.delta(after: self.committed, current: transcript) } - let snapshot = self.committed + self.volatile - let attributed = Self.makeAttributed(committed: self.committed, volatile: self.volatile, isFinal: isFinal) - await MainActor.run { - VoiceWakeOverlayController.shared.showPartial(transcript: snapshot, attributed: attributed) + let committedWithPrefix = Self.join(self.adoptedPrefix, self.committed) + let snapshot = Self.join(committedWithPrefix, self.volatile) + let attributed = Self.makeAttributed(committed: committedWithPrefix, volatile: self.volatile, isFinal: isFinal) + if let token = self.overlayToken { + await MainActor.run { + VoiceWakeOverlayController.shared.updatePartial(token: token, transcript: snapshot, attributed: attributed) + } } } @@ -212,14 +226,18 @@ actor VoicePushToTalk { self.finalized = true self.timeoutTask?.cancel(); self.timeoutTask = nil - let finalText: String = { + let finalRecognized: String = { if let override = transcriptOverride?.trimmingCharacters(in: .whitespacesAndNewlines) { return override } return (self.committed + self.volatile).trimmingCharacters(in: .whitespacesAndNewlines) }() + let finalText = Self.join(self.adoptedPrefix, finalRecognized) - let attributed = Self.makeAttributed(committed: self.committed, volatile: self.volatile, isFinal: true) + let attributed = Self.makeAttributed( + committed: Self.join(self.adoptedPrefix, self.committed), + volatile: self.volatile, + isFinal: true) let forward: VoiceWakeForwardConfig if let cached = self.activeConfig?.forwardConfig { forward = cached @@ -228,19 +246,28 @@ actor VoicePushToTalk { } let chime = finalText.isEmpty ? .none : (self.activeConfig?.sendChime ?? .none) + let token = self.overlayToken + let logger = self.logger await MainActor.run { - Logger(subsystem: "com.steipete.clawdis", category: "voicewake.ptt") - .info("ptt finalize reason=\(reason, privacy: .public) len=\(finalText.count, privacy: .public)") + logger.info("ptt finalize reason=\(reason, privacy: .public) len=\(finalText.count, privacy: .public)") if finalText.isEmpty { - VoiceWakeOverlayController.shared.dismiss(reason: .empty) - } else { + VoiceWakeOverlayController.shared.dismiss(token: token, reason: .empty) + } else if let token { VoiceWakeOverlayController.shared.presentFinal( + token: token, transcript: finalText, forwardConfig: forward, autoSendAfter: nil, sendChime: chime, attributed: attributed) - VoiceWakeOverlayController.shared.sendNow(sendChime: chime) + VoiceWakeOverlayController.shared.sendNow(token: token, sendChime: chime) + } else { + if chime != .none { + VoiceWakeChimePlayer.play(chime, reason: "ptt.fallback_send") + } + Task.detached { + await VoiceWakeForwarder.forward(transcript: finalText, config: forward) + } } } @@ -254,6 +281,8 @@ actor VoicePushToTalk { self.volatile = "" self.activeConfig = nil self.triggerChimePlayed = false + self.overlayToken = nil + self.adoptedPrefix = "" // Resume the wake-word runtime after push-to-talk finishes. await VoiceWakeRuntime.shared.applyPushToTalkCooldown() @@ -284,6 +313,12 @@ actor VoicePushToTalk { return (committedColor, volatileColor) } + private static func join(_ prefix: String, _ suffix: String) -> String { + if prefix.isEmpty { return suffix } + if suffix.isEmpty { return prefix } + return "\(prefix) \(suffix)" + } + private static func delta(after committed: String, current: String) -> String { if current.hasPrefix(committed) { let start = current.index(current.startIndex, offsetBy: committed.count) diff --git a/apps/macos/Sources/Clawdis/VoiceWakeChime.swift b/apps/macos/Sources/Clawdis/VoiceWakeChime.swift index 426531e93..3eface4eb 100644 --- a/apps/macos/Sources/Clawdis/VoiceWakeChime.swift +++ b/apps/macos/Sources/Clawdis/VoiceWakeChime.swift @@ -44,9 +44,13 @@ enum VoiceWakeChimePlayer { private static let logger = Logger(subsystem: "com.steipete.clawdis", category: "voicewake.chime") private static var lastSound: NSSound? - static func play(_ chime: VoiceWakeChime) { + static func play(_ chime: VoiceWakeChime, reason: String? = nil) { guard let sound = self.sound(for: chime) else { return } - self.logger.log(level: .info, "chime play") + if let reason { + self.logger.log(level: .info, "chime play reason=\(reason, privacy: .public)") + } else { + self.logger.log(level: .info, "chime play") + } SoundEffectPlayer.play(sound) } diff --git a/apps/macos/Sources/Clawdis/VoiceWakeRuntime.swift b/apps/macos/Sources/Clawdis/VoiceWakeRuntime.swift index e35f63236..7ef4c3544 100644 --- a/apps/macos/Sources/Clawdis/VoiceWakeRuntime.swift +++ b/apps/macos/Sources/Clawdis/VoiceWakeRuntime.swift @@ -32,6 +32,7 @@ actor VoiceWakeRuntime { private var cooldownUntil: Date? private var currentConfig: RuntimeConfig? private var listeningState: ListeningState = .idle + private var overlayToken: UUID? // Tunables // Silence threshold once we've captured user speech (post-trigger). @@ -162,9 +163,11 @@ actor VoiceWakeRuntime { self.listeningState = .idle self.logger.debug("voicewake runtime stopped") + let token = self.overlayToken + self.overlayToken = nil guard dismissOverlay else { return } Task { @MainActor in - VoiceWakeOverlayController.shared.dismiss() + VoiceWakeOverlayController.shared.dismiss(token: token) } } @@ -208,8 +211,10 @@ actor VoiceWakeRuntime { volatile: self.volatileTranscript, isFinal: isFinal) let snapshot = self.committedTranscript + self.volatileTranscript - await MainActor.run { - VoiceWakeOverlayController.shared.showPartial(transcript: snapshot, attributed: attributed) + if let token = self.overlayToken { + await MainActor.run { + VoiceWakeOverlayController.shared.updatePartial(token: token, transcript: snapshot, attributed: attributed) + } } } } @@ -249,7 +254,7 @@ actor VoiceWakeRuntime { if config.triggerChime != .none, !self.triggerChimePlayed { self.triggerChimePlayed = true - await MainActor.run { VoiceWakeChimePlayer.play(config.triggerChime) } + await MainActor.run { VoiceWakeChimePlayer.play(config.triggerChime, reason: "voicewake.trigger") } } let snapshot = self.committedTranscript + self.volatileTranscript @@ -257,8 +262,11 @@ actor VoiceWakeRuntime { committed: self.committedTranscript, volatile: self.volatileTranscript, isFinal: false) - await MainActor.run { - VoiceWakeOverlayController.shared.showPartial(transcript: snapshot, attributed: attributed) + self.overlayToken = await MainActor.run { + VoiceWakeOverlayController.shared.startSession( + source: .wakeWord, + transcript: snapshot, + attributed: attributed) } // Keep the "ears" boosted for the capture window so the status icon animates while recording. @@ -309,7 +317,9 @@ actor VoiceWakeRuntime { self.triggerChimePlayed = false await MainActor.run { AppStateStore.shared.stopVoiceEars() } - await MainActor.run { VoiceWakeOverlayController.shared.updateLevel(0) } + if let token = self.overlayToken { + await MainActor.run { VoiceWakeOverlayController.shared.updateLevel(token: token, 0) } + } let forwardConfig = await MainActor.run { AppStateStore.shared.voiceWakeForwardConfig } // Auto-send should fire as soon as the silence threshold is satisfied (2s after speech, 5s after trigger-only). @@ -320,14 +330,25 @@ actor VoiceWakeRuntime { volatile: "", isFinal: true) let sendChime = finalTranscript.isEmpty ? .none : config.sendChime - await MainActor.run { - VoiceWakeOverlayController.shared.presentFinal( - transcript: finalTranscript, + if let token = self.overlayToken { + await MainActor.run { + VoiceWakeOverlayController.shared.presentFinal( + token: token, + transcript: finalTranscript, forwardConfig: forwardConfig, autoSendAfter: delay, sendChime: sendChime, attributed: finalAttributed) + } + } else if forwardConfig.enabled, !finalTranscript.isEmpty { + if sendChime != .none { + await MainActor.run { VoiceWakeChimePlayer.play(sendChime, reason: "voicewake.send") } + } + Task.detached { + await VoiceWakeForwarder.forward(transcript: finalTranscript, config: forwardConfig) + } } + self.overlayToken = nil self.cooldownUntil = Date().addingTimeInterval(self.debounceAfterSend) self.restartRecognizer() @@ -349,8 +370,10 @@ actor VoiceWakeRuntime { // Normalize against the adaptive threshold so the UI meter stays roughly 0...1 across devices. let clamped = min(1.0, max(0.0, rms / max(self.minSpeechRMS, threshold))) - Task { @MainActor in - VoiceWakeOverlayController.shared.updateLevel(clamped) + if let token = self.overlayToken { + Task { @MainActor in + VoiceWakeOverlayController.shared.updateLevel(token: token, clamped) + } } } diff --git a/docs/mac/voice-overlay.md b/docs/mac/voice-overlay.md index be741fc6e..dd96fb0f1 100644 --- a/docs/mac/voice-overlay.md +++ b/docs/mac/voice-overlay.md @@ -6,7 +6,12 @@ Audience: macOS app contributors. Goal: keep the voice overlay predictable when - If the overlay is already visible from wake-word and the user presses the hotkey, the hotkey session *adopts* the existing text instead of resetting it. The overlay stays up while the hotkey is held. When the user releases: send if there is trimmed text, otherwise dismiss. - Wake-word alone still auto-sends on silence; push-to-talk sends immediately on release. -### Proposed architecture (to implement next) +### Implemented (Dec 9, 2025) +- Overlay sessions now carry a token per capture (wake-word or push-to-talk). Partial/final/send/dismiss/level updates are dropped when the token doesn’t match, avoiding stale callbacks. +- Push-to-talk adopts any visible overlay text as a prefix (so pressing the hotkey while the wake overlay is up keeps the text and appends new speech). It waits up to 1.5s for a final transcript before falling back to the current text. +- Chime/overlay logging is emitted at `info` in categories `voicewake.overlay`, `voicewake.ptt`, and `voicewake.chime` (session start, partial, final, send, dismiss, chime reason). + +### Next steps 1. **VoiceSessionCoordinator (actor)** - Owns exactly one `VoiceSession` at a time. - API (token-based): `beginWakeCapture`, `beginPushToTalk`, `updatePartial`, `endCapture`, `cancel`, `applyCooldown`. @@ -40,4 +45,3 @@ Audience: macOS app contributors. Goal: keep the voice overlay predictable when 3. Refactor `VoicePushToTalk` to adopt existing sessions and call `endCapture` on release; apply runtime cooldown. 4. Wire `VoiceWakeOverlayController` to the publisher; remove direct calls from runtime/PTT. 5. Add integration tests for session adoption, cooldown, and empty-text dismissal. -