feat(mac): tokenized voice overlay adoption
This commit is contained in:
@@ -77,6 +77,8 @@ final class VoicePushToTalkHotkey {
|
||||
actor VoicePushToTalk {
|
||||
static let shared = VoicePushToTalk()
|
||||
|
||||
private let logger = Logger(subsystem: "com.steipete.clawdis", category: "voicewake.ptt")
|
||||
|
||||
private var recognizer: SFSpeechRecognizer?
|
||||
private var audioEngine = AVAudioEngine()
|
||||
private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest?
|
||||
@@ -89,6 +91,8 @@ actor VoicePushToTalk {
|
||||
private var triggerChimePlayed = false
|
||||
private var finalized = false
|
||||
private var timeoutTask: Task<Void, Never>?
|
||||
private var overlayToken: UUID?
|
||||
private var adoptedPrefix: String = ""
|
||||
|
||||
private struct Config {
|
||||
let micID: String?
|
||||
@@ -112,14 +116,22 @@ actor VoicePushToTalk {
|
||||
self.triggerChimePlayed = false
|
||||
self.finalized = false
|
||||
self.timeoutTask?.cancel(); self.timeoutTask = nil
|
||||
let snapshot = await MainActor.run { VoiceWakeOverlayController.shared.snapshot() }
|
||||
self.adoptedPrefix = snapshot.isVisible ? snapshot.text.trimmingCharacters(in: .whitespacesAndNewlines) : ""
|
||||
self.logger.info("ptt begin adopted_prefix_len=\(self.adoptedPrefix.count, privacy: .public)")
|
||||
if config.triggerChime != .none {
|
||||
self.triggerChimePlayed = true
|
||||
await MainActor.run { VoiceWakeChimePlayer.play(config.triggerChime) }
|
||||
await MainActor.run { VoiceWakeChimePlayer.play(config.triggerChime, reason: "ptt.trigger") }
|
||||
}
|
||||
// Pause the always-on wake word recognizer so both pipelines don't fight over the mic tap.
|
||||
await VoiceWakeRuntime.shared.pauseForPushToTalk()
|
||||
await MainActor.run {
|
||||
VoiceWakeOverlayController.shared.showPartial(transcript: "")
|
||||
let adoptedPrefix = self.adoptedPrefix
|
||||
let adoptedAttributed: NSAttributedString? = adoptedPrefix.isEmpty ? nil : Self.makeAttributed(committed: adoptedPrefix, volatile: "", isFinal: false)
|
||||
self.overlayToken = await MainActor.run {
|
||||
VoiceWakeOverlayController.shared.startSession(
|
||||
source: .pushToTalk,
|
||||
transcript: adoptedPrefix,
|
||||
attributed: adoptedAttributed)
|
||||
}
|
||||
|
||||
do {
|
||||
@@ -143,7 +155,7 @@ actor VoicePushToTalk {
|
||||
// Give Speech a brief window to deliver the final result; otherwise fall back to current text.
|
||||
self.timeoutTask?.cancel()
|
||||
self.timeoutTask = Task { [weak self] in
|
||||
try? await Task.sleep(nanoseconds: 700_000_000) // 700ms grace period
|
||||
try? await Task.sleep(nanoseconds: 1_500_000_000) // 1.5s grace period to await final result
|
||||
await self?.finalize(transcriptOverride: nil, reason: "timeout")
|
||||
}
|
||||
}
|
||||
@@ -175,8 +187,7 @@ actor VoicePushToTalk {
|
||||
self.recognitionTask = recognizer.recognitionTask(with: request) { [weak self] result, error in
|
||||
guard let self else { return }
|
||||
if let error {
|
||||
Logger(subsystem: "com.steipete.clawdis", category: "voicewake.ptt")
|
||||
.debug("push-to-talk error: \(error.localizedDescription, privacy: .public)")
|
||||
self.logger.debug("push-to-talk error: \(error.localizedDescription, privacy: .public)")
|
||||
}
|
||||
let transcript = result?.bestTranscription.formattedString
|
||||
let isFinal = result?.isFinal ?? false
|
||||
@@ -200,10 +211,13 @@ actor VoicePushToTalk {
|
||||
self.volatile = Self.delta(after: self.committed, current: transcript)
|
||||
}
|
||||
|
||||
let snapshot = self.committed + self.volatile
|
||||
let attributed = Self.makeAttributed(committed: self.committed, volatile: self.volatile, isFinal: isFinal)
|
||||
await MainActor.run {
|
||||
VoiceWakeOverlayController.shared.showPartial(transcript: snapshot, attributed: attributed)
|
||||
let committedWithPrefix = Self.join(self.adoptedPrefix, self.committed)
|
||||
let snapshot = Self.join(committedWithPrefix, self.volatile)
|
||||
let attributed = Self.makeAttributed(committed: committedWithPrefix, volatile: self.volatile, isFinal: isFinal)
|
||||
if let token = self.overlayToken {
|
||||
await MainActor.run {
|
||||
VoiceWakeOverlayController.shared.updatePartial(token: token, transcript: snapshot, attributed: attributed)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -212,14 +226,18 @@ actor VoicePushToTalk {
|
||||
self.finalized = true
|
||||
self.timeoutTask?.cancel(); self.timeoutTask = nil
|
||||
|
||||
let finalText: String = {
|
||||
let finalRecognized: String = {
|
||||
if let override = transcriptOverride?.trimmingCharacters(in: .whitespacesAndNewlines) {
|
||||
return override
|
||||
}
|
||||
return (self.committed + self.volatile).trimmingCharacters(in: .whitespacesAndNewlines)
|
||||
}()
|
||||
let finalText = Self.join(self.adoptedPrefix, finalRecognized)
|
||||
|
||||
let attributed = Self.makeAttributed(committed: self.committed, volatile: self.volatile, isFinal: true)
|
||||
let attributed = Self.makeAttributed(
|
||||
committed: Self.join(self.adoptedPrefix, self.committed),
|
||||
volatile: self.volatile,
|
||||
isFinal: true)
|
||||
let forward: VoiceWakeForwardConfig
|
||||
if let cached = self.activeConfig?.forwardConfig {
|
||||
forward = cached
|
||||
@@ -228,19 +246,28 @@ actor VoicePushToTalk {
|
||||
}
|
||||
let chime = finalText.isEmpty ? .none : (self.activeConfig?.sendChime ?? .none)
|
||||
|
||||
let token = self.overlayToken
|
||||
let logger = self.logger
|
||||
await MainActor.run {
|
||||
Logger(subsystem: "com.steipete.clawdis", category: "voicewake.ptt")
|
||||
.info("ptt finalize reason=\(reason, privacy: .public) len=\(finalText.count, privacy: .public)")
|
||||
logger.info("ptt finalize reason=\(reason, privacy: .public) len=\(finalText.count, privacy: .public)")
|
||||
if finalText.isEmpty {
|
||||
VoiceWakeOverlayController.shared.dismiss(reason: .empty)
|
||||
} else {
|
||||
VoiceWakeOverlayController.shared.dismiss(token: token, reason: .empty)
|
||||
} else if let token {
|
||||
VoiceWakeOverlayController.shared.presentFinal(
|
||||
token: token,
|
||||
transcript: finalText,
|
||||
forwardConfig: forward,
|
||||
autoSendAfter: nil,
|
||||
sendChime: chime,
|
||||
attributed: attributed)
|
||||
VoiceWakeOverlayController.shared.sendNow(sendChime: chime)
|
||||
VoiceWakeOverlayController.shared.sendNow(token: token, sendChime: chime)
|
||||
} else {
|
||||
if chime != .none {
|
||||
VoiceWakeChimePlayer.play(chime, reason: "ptt.fallback_send")
|
||||
}
|
||||
Task.detached {
|
||||
await VoiceWakeForwarder.forward(transcript: finalText, config: forward)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -254,6 +281,8 @@ actor VoicePushToTalk {
|
||||
self.volatile = ""
|
||||
self.activeConfig = nil
|
||||
self.triggerChimePlayed = false
|
||||
self.overlayToken = nil
|
||||
self.adoptedPrefix = ""
|
||||
|
||||
// Resume the wake-word runtime after push-to-talk finishes.
|
||||
await VoiceWakeRuntime.shared.applyPushToTalkCooldown()
|
||||
@@ -284,6 +313,12 @@ actor VoicePushToTalk {
|
||||
return (committedColor, volatileColor)
|
||||
}
|
||||
|
||||
/// Combines two text fragments into one string separated by a single space.
///
/// Empty fragments are dropped before joining, so the result never carries a
/// leading or trailing separator; if both fragments are empty the result is "".
///
/// - Parameters:
///   - prefix: Text placed first.
///   - suffix: Text placed after `prefix`.
/// - Returns: The non-empty fragments joined by one space.
private static func join(_ prefix: String, _ suffix: String) -> String {
    [prefix, suffix]
        .filter { !$0.isEmpty }
        .joined(separator: " ")
}
|
||||
|
||||
private static func delta(after committed: String, current: String) -> String {
|
||||
if current.hasPrefix(committed) {
|
||||
let start = current.index(current.startIndex, offsetBy: committed.count)
|
||||
|
||||
@@ -44,9 +44,13 @@ enum VoiceWakeChimePlayer {
|
||||
private static let logger = Logger(subsystem: "com.steipete.clawdis", category: "voicewake.chime")
|
||||
private static var lastSound: NSSound?
|
||||
|
||||
static func play(_ chime: VoiceWakeChime) {
|
||||
static func play(_ chime: VoiceWakeChime, reason: String? = nil) {
|
||||
guard let sound = self.sound(for: chime) else { return }
|
||||
self.logger.log(level: .info, "chime play")
|
||||
if let reason {
|
||||
self.logger.log(level: .info, "chime play reason=\(reason, privacy: .public)")
|
||||
} else {
|
||||
self.logger.log(level: .info, "chime play")
|
||||
}
|
||||
SoundEffectPlayer.play(sound)
|
||||
}
|
||||
|
||||
|
||||
@@ -32,6 +32,7 @@ actor VoiceWakeRuntime {
|
||||
private var cooldownUntil: Date?
|
||||
private var currentConfig: RuntimeConfig?
|
||||
private var listeningState: ListeningState = .idle
|
||||
private var overlayToken: UUID?
|
||||
|
||||
// Tunables
|
||||
// Silence threshold once we've captured user speech (post-trigger).
|
||||
@@ -162,9 +163,11 @@ actor VoiceWakeRuntime {
|
||||
self.listeningState = .idle
|
||||
self.logger.debug("voicewake runtime stopped")
|
||||
|
||||
let token = self.overlayToken
|
||||
self.overlayToken = nil
|
||||
guard dismissOverlay else { return }
|
||||
Task { @MainActor in
|
||||
VoiceWakeOverlayController.shared.dismiss()
|
||||
VoiceWakeOverlayController.shared.dismiss(token: token)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -208,8 +211,10 @@ actor VoiceWakeRuntime {
|
||||
volatile: self.volatileTranscript,
|
||||
isFinal: isFinal)
|
||||
let snapshot = self.committedTranscript + self.volatileTranscript
|
||||
await MainActor.run {
|
||||
VoiceWakeOverlayController.shared.showPartial(transcript: snapshot, attributed: attributed)
|
||||
if let token = self.overlayToken {
|
||||
await MainActor.run {
|
||||
VoiceWakeOverlayController.shared.updatePartial(token: token, transcript: snapshot, attributed: attributed)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -249,7 +254,7 @@ actor VoiceWakeRuntime {
|
||||
|
||||
if config.triggerChime != .none, !self.triggerChimePlayed {
|
||||
self.triggerChimePlayed = true
|
||||
await MainActor.run { VoiceWakeChimePlayer.play(config.triggerChime) }
|
||||
await MainActor.run { VoiceWakeChimePlayer.play(config.triggerChime, reason: "voicewake.trigger") }
|
||||
}
|
||||
|
||||
let snapshot = self.committedTranscript + self.volatileTranscript
|
||||
@@ -257,8 +262,11 @@ actor VoiceWakeRuntime {
|
||||
committed: self.committedTranscript,
|
||||
volatile: self.volatileTranscript,
|
||||
isFinal: false)
|
||||
await MainActor.run {
|
||||
VoiceWakeOverlayController.shared.showPartial(transcript: snapshot, attributed: attributed)
|
||||
self.overlayToken = await MainActor.run {
|
||||
VoiceWakeOverlayController.shared.startSession(
|
||||
source: .wakeWord,
|
||||
transcript: snapshot,
|
||||
attributed: attributed)
|
||||
}
|
||||
|
||||
// Keep the "ears" boosted for the capture window so the status icon animates while recording.
|
||||
@@ -309,7 +317,9 @@ actor VoiceWakeRuntime {
|
||||
self.triggerChimePlayed = false
|
||||
|
||||
await MainActor.run { AppStateStore.shared.stopVoiceEars() }
|
||||
await MainActor.run { VoiceWakeOverlayController.shared.updateLevel(0) }
|
||||
if let token = self.overlayToken {
|
||||
await MainActor.run { VoiceWakeOverlayController.shared.updateLevel(token: token, 0) }
|
||||
}
|
||||
|
||||
let forwardConfig = await MainActor.run { AppStateStore.shared.voiceWakeForwardConfig }
|
||||
// Auto-send should fire as soon as the silence threshold is satisfied (2s after speech, 5s after trigger-only).
|
||||
@@ -320,14 +330,25 @@ actor VoiceWakeRuntime {
|
||||
volatile: "",
|
||||
isFinal: true)
|
||||
let sendChime = finalTranscript.isEmpty ? .none : config.sendChime
|
||||
await MainActor.run {
|
||||
VoiceWakeOverlayController.shared.presentFinal(
|
||||
transcript: finalTranscript,
|
||||
if let token = self.overlayToken {
|
||||
await MainActor.run {
|
||||
VoiceWakeOverlayController.shared.presentFinal(
|
||||
token: token,
|
||||
transcript: finalTranscript,
|
||||
forwardConfig: forwardConfig,
|
||||
autoSendAfter: delay,
|
||||
sendChime: sendChime,
|
||||
attributed: finalAttributed)
|
||||
}
|
||||
} else if forwardConfig.enabled, !finalTranscript.isEmpty {
|
||||
if sendChime != .none {
|
||||
await MainActor.run { VoiceWakeChimePlayer.play(sendChime, reason: "voicewake.send") }
|
||||
}
|
||||
Task.detached {
|
||||
await VoiceWakeForwarder.forward(transcript: finalTranscript, config: forwardConfig)
|
||||
}
|
||||
}
|
||||
self.overlayToken = nil
|
||||
|
||||
self.cooldownUntil = Date().addingTimeInterval(self.debounceAfterSend)
|
||||
self.restartRecognizer()
|
||||
@@ -349,8 +370,10 @@ actor VoiceWakeRuntime {
|
||||
|
||||
// Normalize against the adaptive threshold so the UI meter stays roughly 0...1 across devices.
|
||||
let clamped = min(1.0, max(0.0, rms / max(self.minSpeechRMS, threshold)))
|
||||
Task { @MainActor in
|
||||
VoiceWakeOverlayController.shared.updateLevel(clamped)
|
||||
if let token = self.overlayToken {
|
||||
Task { @MainActor in
|
||||
VoiceWakeOverlayController.shared.updateLevel(token: token, clamped)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -6,7 +6,12 @@ Audience: macOS app contributors. Goal: keep the voice overlay predictable when
|
||||
- If the overlay is already visible from wake-word and the user presses the hotkey, the hotkey session *adopts* the existing text instead of resetting it. The overlay stays up while the hotkey is held. When the user releases: send if there is trimmed text, otherwise dismiss.
|
||||
- Wake-word alone still auto-sends on silence; push-to-talk sends immediately on release.
|
||||
|
||||
### Proposed architecture (to implement next)
|
||||
### Implemented (Dec 9, 2025)
|
||||
- Overlay sessions now carry a token per capture (wake-word or push-to-talk). Partial/final/send/dismiss/level updates are dropped when their token doesn’t match the current session’s token, so stale callbacks from an earlier capture are ignored.
|
||||
- Push-to-talk adopts any visible overlay text as a prefix (so pressing the hotkey while the wake overlay is up keeps the text and appends new speech). It waits up to 1.5s for a final transcript before falling back to the current text.
|
||||
- Chime/overlay logging is emitted at `info` in categories `voicewake.overlay`, `voicewake.ptt`, and `voicewake.chime` (session start, partial, final, send, dismiss, chime reason).
|
||||
|
||||
### Next steps
|
||||
1. **VoiceSessionCoordinator (actor)**
|
||||
- Owns exactly one `VoiceSession` at a time.
|
||||
- API (token-based): `beginWakeCapture`, `beginPushToTalk`, `updatePartial`, `endCapture`, `cancel`, `applyCooldown`.
|
||||
@@ -40,4 +45,3 @@ Audience: macOS app contributors. Goal: keep the voice overlay predictable when
|
||||
3. Refactor `VoicePushToTalk` to adopt existing sessions and call `endCapture` on release; apply runtime cooldown.
|
||||
4. Wire `VoiceWakeOverlayController` to the publisher; remove direct calls from runtime/PTT.
|
||||
5. Add integration tests for session adoption, cooldown, and empty-text dismissal.
|
||||
|
||||
|
||||
Reference in New Issue
Block a user