feat(mac): tokenized voice overlay adoption

This commit is contained in:
Peter Steinberger
2025-12-09 04:35:13 +01:00
parent cfd2c41c21
commit d084a37e11
4 changed files with 99 additions and 33 deletions

View File

@@ -77,6 +77,8 @@ final class VoicePushToTalkHotkey {
actor VoicePushToTalk {
static let shared = VoicePushToTalk()
private let logger = Logger(subsystem: "com.steipete.clawdis", category: "voicewake.ptt")
private var recognizer: SFSpeechRecognizer?
private var audioEngine = AVAudioEngine()
private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest?
@@ -89,6 +91,8 @@ actor VoicePushToTalk {
private var triggerChimePlayed = false
private var finalized = false
private var timeoutTask: Task<Void, Never>?
private var overlayToken: UUID?
private var adoptedPrefix: String = ""
private struct Config {
let micID: String?
@@ -112,14 +116,22 @@ actor VoicePushToTalk {
self.triggerChimePlayed = false
self.finalized = false
self.timeoutTask?.cancel(); self.timeoutTask = nil
let snapshot = await MainActor.run { VoiceWakeOverlayController.shared.snapshot() }
self.adoptedPrefix = snapshot.isVisible ? snapshot.text.trimmingCharacters(in: .whitespacesAndNewlines) : ""
self.logger.info("ptt begin adopted_prefix_len=\(self.adoptedPrefix.count, privacy: .public)")
if config.triggerChime != .none {
self.triggerChimePlayed = true
await MainActor.run { VoiceWakeChimePlayer.play(config.triggerChime) }
await MainActor.run { VoiceWakeChimePlayer.play(config.triggerChime, reason: "ptt.trigger") }
}
// Pause the always-on wake word recognizer so both pipelines don't fight over the mic tap.
await VoiceWakeRuntime.shared.pauseForPushToTalk()
await MainActor.run {
VoiceWakeOverlayController.shared.showPartial(transcript: "")
let adoptedPrefix = self.adoptedPrefix
let adoptedAttributed: NSAttributedString? = adoptedPrefix.isEmpty ? nil : Self.makeAttributed(committed: adoptedPrefix, volatile: "", isFinal: false)
self.overlayToken = await MainActor.run {
VoiceWakeOverlayController.shared.startSession(
source: .pushToTalk,
transcript: adoptedPrefix,
attributed: adoptedAttributed)
}
do {
@@ -143,7 +155,7 @@ actor VoicePushToTalk {
// Give Speech a brief window to deliver the final result; otherwise fall back to current text.
self.timeoutTask?.cancel()
self.timeoutTask = Task { [weak self] in
try? await Task.sleep(nanoseconds: 700_000_000) // 700ms grace period
try? await Task.sleep(nanoseconds: 1_500_000_000) // 1.5s grace period to await final result
await self?.finalize(transcriptOverride: nil, reason: "timeout")
}
}
@@ -175,8 +187,7 @@ actor VoicePushToTalk {
self.recognitionTask = recognizer.recognitionTask(with: request) { [weak self] result, error in
guard let self else { return }
if let error {
Logger(subsystem: "com.steipete.clawdis", category: "voicewake.ptt")
.debug("push-to-talk error: \(error.localizedDescription, privacy: .public)")
self.logger.debug("push-to-talk error: \(error.localizedDescription, privacy: .public)")
}
let transcript = result?.bestTranscription.formattedString
let isFinal = result?.isFinal ?? false
@@ -200,10 +211,13 @@ actor VoicePushToTalk {
self.volatile = Self.delta(after: self.committed, current: transcript)
}
let snapshot = self.committed + self.volatile
let attributed = Self.makeAttributed(committed: self.committed, volatile: self.volatile, isFinal: isFinal)
await MainActor.run {
VoiceWakeOverlayController.shared.showPartial(transcript: snapshot, attributed: attributed)
let committedWithPrefix = Self.join(self.adoptedPrefix, self.committed)
let snapshot = Self.join(committedWithPrefix, self.volatile)
let attributed = Self.makeAttributed(committed: committedWithPrefix, volatile: self.volatile, isFinal: isFinal)
if let token = self.overlayToken {
await MainActor.run {
VoiceWakeOverlayController.shared.updatePartial(token: token, transcript: snapshot, attributed: attributed)
}
}
}
@@ -212,14 +226,18 @@ actor VoicePushToTalk {
self.finalized = true
self.timeoutTask?.cancel(); self.timeoutTask = nil
let finalText: String = {
let finalRecognized: String = {
if let override = transcriptOverride?.trimmingCharacters(in: .whitespacesAndNewlines) {
return override
}
return (self.committed + self.volatile).trimmingCharacters(in: .whitespacesAndNewlines)
}()
let finalText = Self.join(self.adoptedPrefix, finalRecognized)
let attributed = Self.makeAttributed(committed: self.committed, volatile: self.volatile, isFinal: true)
let attributed = Self.makeAttributed(
committed: Self.join(self.adoptedPrefix, self.committed),
volatile: self.volatile,
isFinal: true)
let forward: VoiceWakeForwardConfig
if let cached = self.activeConfig?.forwardConfig {
forward = cached
@@ -228,19 +246,28 @@ actor VoicePushToTalk {
}
let chime = finalText.isEmpty ? .none : (self.activeConfig?.sendChime ?? .none)
let token = self.overlayToken
let logger = self.logger
await MainActor.run {
Logger(subsystem: "com.steipete.clawdis", category: "voicewake.ptt")
.info("ptt finalize reason=\(reason, privacy: .public) len=\(finalText.count, privacy: .public)")
logger.info("ptt finalize reason=\(reason, privacy: .public) len=\(finalText.count, privacy: .public)")
if finalText.isEmpty {
VoiceWakeOverlayController.shared.dismiss(reason: .empty)
} else {
VoiceWakeOverlayController.shared.dismiss(token: token, reason: .empty)
} else if let token {
VoiceWakeOverlayController.shared.presentFinal(
token: token,
transcript: finalText,
forwardConfig: forward,
autoSendAfter: nil,
sendChime: chime,
attributed: attributed)
VoiceWakeOverlayController.shared.sendNow(sendChime: chime)
VoiceWakeOverlayController.shared.sendNow(token: token, sendChime: chime)
} else {
if chime != .none {
VoiceWakeChimePlayer.play(chime, reason: "ptt.fallback_send")
}
Task.detached {
await VoiceWakeForwarder.forward(transcript: finalText, config: forward)
}
}
}
@@ -254,6 +281,8 @@ actor VoicePushToTalk {
self.volatile = ""
self.activeConfig = nil
self.triggerChimePlayed = false
self.overlayToken = nil
self.adoptedPrefix = ""
// Resume the wake-word runtime after push-to-talk finishes.
await VoiceWakeRuntime.shared.applyPushToTalkCooldown()
@@ -284,6 +313,12 @@ actor VoicePushToTalk {
return (committedColor, volatileColor)
}
/// Joins two transcript fragments, separating them with a single space when needed.
///
/// - Parameters:
///   - prefix: Leading text; returned unchanged when `suffix` is empty.
///   - suffix: Trailing text; returned unchanged when `prefix` is empty.
/// - Returns: The combined text. A space is inserted between the fragments only
///   when neither the end of `prefix` nor the start of `suffix` is already
///   whitespace, so a fragment that carries its own boundary spacing does not
///   produce a doubled separator.
private static func join(_ prefix: String, _ suffix: String) -> String {
    guard let boundaryTail = prefix.last else { return suffix }
    guard let boundaryHead = suffix.first else { return prefix }
    // One side already supplies the separator: concatenate as-is.
    if boundaryTail.isWhitespace || boundaryHead.isWhitespace {
        return prefix + suffix
    }
    return "\(prefix) \(suffix)"
}
private static func delta(after committed: String, current: String) -> String {
if current.hasPrefix(committed) {
let start = current.index(current.startIndex, offsetBy: committed.count)

View File

@@ -44,9 +44,13 @@ enum VoiceWakeChimePlayer {
private static let logger = Logger(subsystem: "com.steipete.clawdis", category: "voicewake.chime")
private static var lastSound: NSSound?
static func play(_ chime: VoiceWakeChime) {
static func play(_ chime: VoiceWakeChime, reason: String? = nil) {
guard let sound = self.sound(for: chime) else { return }
self.logger.log(level: .info, "chime play")
if let reason {
self.logger.log(level: .info, "chime play reason=\(reason, privacy: .public)")
} else {
self.logger.log(level: .info, "chime play")
}
SoundEffectPlayer.play(sound)
}

View File

@@ -32,6 +32,7 @@ actor VoiceWakeRuntime {
private var cooldownUntil: Date?
private var currentConfig: RuntimeConfig?
private var listeningState: ListeningState = .idle
private var overlayToken: UUID?
// Tunables
// Silence threshold once we've captured user speech (post-trigger).
@@ -162,9 +163,11 @@ actor VoiceWakeRuntime {
self.listeningState = .idle
self.logger.debug("voicewake runtime stopped")
let token = self.overlayToken
self.overlayToken = nil
guard dismissOverlay else { return }
Task { @MainActor in
VoiceWakeOverlayController.shared.dismiss()
VoiceWakeOverlayController.shared.dismiss(token: token)
}
}
@@ -208,8 +211,10 @@ actor VoiceWakeRuntime {
volatile: self.volatileTranscript,
isFinal: isFinal)
let snapshot = self.committedTranscript + self.volatileTranscript
await MainActor.run {
VoiceWakeOverlayController.shared.showPartial(transcript: snapshot, attributed: attributed)
if let token = self.overlayToken {
await MainActor.run {
VoiceWakeOverlayController.shared.updatePartial(token: token, transcript: snapshot, attributed: attributed)
}
}
}
}
@@ -249,7 +254,7 @@ actor VoiceWakeRuntime {
if config.triggerChime != .none, !self.triggerChimePlayed {
self.triggerChimePlayed = true
await MainActor.run { VoiceWakeChimePlayer.play(config.triggerChime) }
await MainActor.run { VoiceWakeChimePlayer.play(config.triggerChime, reason: "voicewake.trigger") }
}
let snapshot = self.committedTranscript + self.volatileTranscript
@@ -257,8 +262,11 @@ actor VoiceWakeRuntime {
committed: self.committedTranscript,
volatile: self.volatileTranscript,
isFinal: false)
await MainActor.run {
VoiceWakeOverlayController.shared.showPartial(transcript: snapshot, attributed: attributed)
self.overlayToken = await MainActor.run {
VoiceWakeOverlayController.shared.startSession(
source: .wakeWord,
transcript: snapshot,
attributed: attributed)
}
// Keep the "ears" boosted for the capture window so the status icon animates while recording.
@@ -309,7 +317,9 @@ actor VoiceWakeRuntime {
self.triggerChimePlayed = false
await MainActor.run { AppStateStore.shared.stopVoiceEars() }
await MainActor.run { VoiceWakeOverlayController.shared.updateLevel(0) }
if let token = self.overlayToken {
await MainActor.run { VoiceWakeOverlayController.shared.updateLevel(token: token, 0) }
}
let forwardConfig = await MainActor.run { AppStateStore.shared.voiceWakeForwardConfig }
// Auto-send should fire as soon as the silence threshold is satisfied (2s after speech, 5s after trigger-only).
@@ -320,14 +330,25 @@ actor VoiceWakeRuntime {
volatile: "",
isFinal: true)
let sendChime = finalTranscript.isEmpty ? .none : config.sendChime
await MainActor.run {
VoiceWakeOverlayController.shared.presentFinal(
transcript: finalTranscript,
if let token = self.overlayToken {
await MainActor.run {
VoiceWakeOverlayController.shared.presentFinal(
token: token,
transcript: finalTranscript,
forwardConfig: forwardConfig,
autoSendAfter: delay,
sendChime: sendChime,
attributed: finalAttributed)
}
} else if forwardConfig.enabled, !finalTranscript.isEmpty {
if sendChime != .none {
await MainActor.run { VoiceWakeChimePlayer.play(sendChime, reason: "voicewake.send") }
}
Task.detached {
await VoiceWakeForwarder.forward(transcript: finalTranscript, config: forwardConfig)
}
}
self.overlayToken = nil
self.cooldownUntil = Date().addingTimeInterval(self.debounceAfterSend)
self.restartRecognizer()
@@ -349,8 +370,10 @@ actor VoiceWakeRuntime {
// Normalize against the adaptive threshold so the UI meter stays roughly 0...1 across devices.
let clamped = min(1.0, max(0.0, rms / max(self.minSpeechRMS, threshold)))
Task { @MainActor in
VoiceWakeOverlayController.shared.updateLevel(clamped)
if let token = self.overlayToken {
Task { @MainActor in
VoiceWakeOverlayController.shared.updateLevel(token: token, clamped)
}
}
}

View File

@@ -6,7 +6,12 @@ Audience: macOS app contributors. Goal: keep the voice overlay predictable when
- If the overlay is already visible from wake-word and the user presses the hotkey, the hotkey session *adopts* the existing text instead of resetting it. The overlay stays up while the hotkey is held. When the user releases: send if there is trimmed text, otherwise dismiss.
- Wake-word alone still auto-sends on silence; push-to-talk sends immediately on release.
### Proposed architecture (to implement next)
### Implemented (Dec 9, 2025)
- Overlay sessions now carry a token per capture (wake-word or push-to-talk). Partial/final/send/dismiss/level updates are dropped when the token doesn't match, avoiding stale callbacks.
- Push-to-talk adopts any visible overlay text as a prefix (so pressing the hotkey while the wake overlay is up keeps the text and appends new speech). It waits up to 1.5s for a final transcript before falling back to the current text.
- Chime/overlay logging is emitted at `info` in categories `voicewake.overlay`, `voicewake.ptt`, and `voicewake.chime` (session start, partial, final, send, dismiss, chime reason).
### Next steps
1. **VoiceSessionCoordinator (actor)**
- Owns exactly one `VoiceSession` at a time.
- API (token-based): `beginWakeCapture`, `beginPushToTalk`, `updatePartial`, `endCapture`, `cancel`, `applyCooldown`.
@@ -40,4 +45,3 @@ Audience: macOS app contributors. Goal: keep the voice overlay predictable when
3. Refactor `VoicePushToTalk` to adopt existing sessions and call `endCapture` on release; apply runtime cooldown.
4. Wire `VoiceWakeOverlayController` to the publisher; remove direct calls from runtime/PTT.
5. Add integration tests for session adoption, cooldown, and empty-text dismissal.