diff --git a/apps/macos/Sources/Clawdbot/VoiceWakeRuntime.swift b/apps/macos/Sources/Clawdbot/VoiceWakeRuntime.swift
index e025f9ccc..b2ef1d442 100644
--- a/apps/macos/Sources/Clawdbot/VoiceWakeRuntime.swift
+++ b/apps/macos/Sources/Clawdbot/VoiceWakeRuntime.swift
@@ -38,6 +38,15 @@ actor VoiceWakeRuntime {
     private var overlayToken: UUID?
     private var activeTriggerEndTime: TimeInterval?
     private var scheduledRestartTask: Task<Void, Never>?
+    private var lastLoggedText: String?
+    private var lastLoggedAt: Date?
+    private var lastTapLogAt: Date?
+    private var lastCallbackLogAt: Date?
+    private var lastTranscript: String?
+    private var lastTranscriptAt: Date?
+    private var preDetectTask: Task<Void, Never>?
+    private var isStarting: Bool = false
+    private var triggerOnlyTask: Task<Void, Never>?
 
     // Tunables
     // Silence threshold once we've captured user speech (post-trigger).
@@ -50,6 +59,8 @@
     // Voice activity detection parameters (RMS-based).
     private let minSpeechRMS: Double = 1e-3
     private let speechBoostFactor: Double = 6.0 // how far above noise floor we require to mark speech
+    private let preDetectSilenceWindow: TimeInterval = 1.0
+    private let triggerPauseWindow: TimeInterval = 0.55
 
     /// Stops the active Speech pipeline without clearing the stored config, so we can restart cleanly.
     private func haltRecognitionPipeline() {
@@ -106,6 +117,19 @@
 
         let config = snapshot.1
 
+        if self.isStarting {
+            return
+        }
+
+        if self.scheduledRestartTask != nil, config == self.currentConfig, self.recognitionTask == nil {
+            return
+        }
+
+        if self.scheduledRestartTask != nil {
+            self.scheduledRestartTask?.cancel()
+            self.scheduledRestartTask = nil
+        }
+
         if config == self.currentConfig, self.recognitionTask != nil {
             return
         }
@@ -115,6 +139,11 @@
     }
 
     private func start(with config: RuntimeConfig) async {
+        if self.isStarting {
+            return
+        }
+        self.isStarting = true
+        defer { self.isStarting = false }
         do {
             self.recognitionGeneration &+= 1
             let generation = self.recognitionGeneration
@@ -148,10 +177,10 @@
             input.removeTap(onBus: 0)
             input.installTap(onBus: 0, bufferSize: 2048, format: format) { [weak self, weak request] buffer, _ in
                 request?.append(buffer)
-                if let rms = Self.rmsLevel(buffer: buffer) {
-                    Task.detached { [weak self] in
-                        await self?.noteAudioLevel(rms: rms)
-                    }
+                guard let rms = Self.rmsLevel(buffer: buffer) else { return }
+                Task.detached { [weak self] in
+                    await self?.noteAudioLevel(rms: rms)
+                    await self?.noteAudioTap(rms: rms)
                 }
             }
 
@@ -170,6 +199,7 @@
                     .map { WakeWordSpeechSegments.from(transcription: result.bestTranscription, transcript: $0) }
                 } ?? []
                 let isFinal = result?.isFinal ?? false
+                Task { await self.noteRecognitionCallback(transcript: transcript, isFinal: isFinal, error: error) }
                 let update = RecognitionUpdate(
                     transcript: transcript,
                     segments: segments,
@@ -205,6 +235,12 @@
         self.capturedTranscript = ""
         self.captureStartedAt = nil
         self.triggerChimePlayed = false
+        self.lastTranscript = nil
+        self.lastTranscriptAt = nil
+        self.preDetectTask?.cancel()
+        self.preDetectTask = nil
+        self.triggerOnlyTask?.cancel()
+        self.triggerOnlyTask = nil
         self.haltRecognitionPipeline()
         self.recognizer = nil
         self.currentConfig = nil
@@ -244,7 +280,19 @@
         let now = Date()
         if !transcript.isEmpty {
             self.lastHeard = now
+            if !self.isCapturing {
+                self.lastTranscript = transcript
+                self.lastTranscriptAt = now
+            }
             if self.isCapturing {
+                self.maybeLogRecognition(
+                    transcript: transcript,
+                    segments: update.segments,
+                    triggers: config.triggers,
+                    isFinal: update.isFinal,
+                    match: nil,
+                    usedFallback: false,
+                    capturing: true)
                 let trimmed = Self.commandAfterTrigger(
                     transcript: transcript,
                     segments: update.segments,
@@ -278,25 +326,209 @@
         if self.isCapturing { return }
 
         let gateConfig = WakeWordGateConfig(triggers: config.triggers)
-        if let match = WakeWordGate.match(transcript: transcript, segments: update.segments, config: gateConfig) {
+        var usedFallback = false
+        var match = WakeWordGate.match(transcript: transcript, segments: update.segments, config: gateConfig)
+        if match == nil, update.isFinal {
+            match = self.textOnlyFallbackMatch(
+                transcript: transcript,
+                triggers: config.triggers,
+                config: gateConfig)
+            usedFallback = match != nil
+        }
+        self.maybeLogRecognition(
+            transcript: transcript,
+            segments: update.segments,
+            triggers: config.triggers,
+            isFinal: update.isFinal,
+            match: match,
+            usedFallback: usedFallback,
+            capturing: false)
+
+        if let match {
             if let cooldown = cooldownUntil, now < cooldown {
                 return
             }
+            if usedFallback {
+                self.logger.info("voicewake runtime detected (text-only fallback) len=\(match.command.count)")
+            } else {
+                self.logger.info("voicewake runtime detected len=\(match.command.count)")
+            }
             await self.beginCapture(command: match.command, triggerEndTime: match.triggerEndTime, config: config)
-        } else if update.isFinal {
-            let trimmed = Self.trimmedAfterTrigger(transcript, triggers: config.triggers)
-            if WakeWordGate.matchesTextOnly(text: transcript, triggers: config.triggers),
-               Self.startsWithTrigger(transcript: transcript, triggers: config.triggers),
-               !trimmed.isEmpty
-            {
-                if let cooldown = cooldownUntil, now < cooldown {
-                    return
-                }
-                await self.beginCapture(command: trimmed, triggerEndTime: nil, config: config)
+        } else if !transcript.isEmpty, update.error == nil {
+            if self.isTriggerOnly(transcript: transcript, triggers: config.triggers) {
+                self.preDetectTask?.cancel()
+                self.preDetectTask = nil
+                self.scheduleTriggerOnlyPauseCheck(triggers: config.triggers, config: config)
+            } else {
+                self.triggerOnlyTask?.cancel()
+                self.triggerOnlyTask = nil
+                self.schedulePreDetectSilenceCheck(
+                    triggers: config.triggers,
+                    gateConfig: gateConfig,
+                    config: config)
             }
         }
     }
 
+    private func maybeLogRecognition(
+        transcript: String,
+        segments: [WakeWordSegment],
+        triggers: [String],
+        isFinal: Bool,
+        match: WakeWordGateMatch?,
+        usedFallback: Bool,
+        capturing: Bool
+    ) {
+        guard !transcript.isEmpty else { return }
+        if transcript == self.lastLoggedText, !isFinal {
+            if let last = self.lastLoggedAt, Date().timeIntervalSince(last) < 0.25 {
+                return
+            }
+        }
+        self.lastLoggedText = transcript
+        self.lastLoggedAt = Date()
+
+        let textOnly = WakeWordGate.matchesTextOnly(text: transcript, triggers: triggers)
+        let timingCount = segments.filter { $0.start > 0 || $0.duration > 0 }.count
+        let matchSummary = match.map {
+            "match=true gap=\(String(format: "%.2f", $0.postGap))s cmdLen=\($0.command.count)"
+        } ?? "match=false"
+        let segmentSummary = segments.map { seg in
+            let start = String(format: "%.2f", seg.start)
+            let end = String(format: "%.2f", seg.end)
+            return "\(seg.text)@\(start)-\(end)"
+        }.joined(separator: ", ")
+
+        self.logger.info(
+            "voicewake runtime transcript='\(transcript, privacy: .public)' textOnly=\(textOnly) " +
+                "isFinal=\(isFinal) timing=\(timingCount)/\(segments.count) " +
+                "capturing=\(capturing) fallback=\(usedFallback) " +
+                "\(matchSummary) segments=[\(segmentSummary, privacy: .public)]")
+    }
+
+    private func noteAudioTap(rms: Double) {
+        let now = Date()
+        if let last = self.lastTapLogAt, now.timeIntervalSince(last) < 1.0 {
+            return
+        }
+        self.lastTapLogAt = now
+        let db = 20 * log10(max(rms, 1e-7))
+        self.logger.debug(
+            "voicewake runtime audio tap rms=\(String(format: "%.6f", rms)) " +
+                "db=\(String(format: "%.1f", db)) capturing=\(self.isCapturing)")
+    }
+
+    private func noteRecognitionCallback(transcript: String?, isFinal: Bool, error: Error?) {
+        guard transcript?.isEmpty ?? true else { return }
+        let now = Date()
+        if let last = self.lastCallbackLogAt, now.timeIntervalSince(last) < 1.0 {
+            return
+        }
+        self.lastCallbackLogAt = now
+        let errorSummary = error?.localizedDescription ?? "none"
+        self.logger.debug(
+            "voicewake runtime callback empty transcript isFinal=\(isFinal) error=\(errorSummary, privacy: .public)")
+    }
+
+    private func scheduleTriggerOnlyPauseCheck(triggers: [String], config: RuntimeConfig) {
+        self.triggerOnlyTask?.cancel()
+        let lastSeenAt = self.lastTranscriptAt
+        let lastText = self.lastTranscript
+        let windowNanos = UInt64(self.triggerPauseWindow * 1_000_000_000)
+        self.triggerOnlyTask = Task { [weak self, lastSeenAt, lastText] in
+            try? await Task.sleep(nanoseconds: windowNanos)
+            guard let self else { return }
+            await self.triggerOnlyPauseCheck(
+                lastSeenAt: lastSeenAt,
+                lastText: lastText,
+                triggers: triggers,
+                config: config)
+        }
+    }
+
+    private func schedulePreDetectSilenceCheck(
+        triggers: [String],
+        gateConfig: WakeWordGateConfig,
+        config: RuntimeConfig)
+    {
+        self.preDetectTask?.cancel()
+        let lastSeenAt = self.lastTranscriptAt
+        let lastText = self.lastTranscript
+        let windowNanos = UInt64(self.preDetectSilenceWindow * 1_000_000_000)
+        self.preDetectTask = Task { [weak self, lastSeenAt, lastText] in
+            try? await Task.sleep(nanoseconds: windowNanos)
+            guard let self else { return }
+            await self.preDetectSilenceCheck(
+                lastSeenAt: lastSeenAt,
+                lastText: lastText,
+                triggers: triggers,
+                gateConfig: gateConfig,
+                config: config)
+        }
+    }
+
+    private func triggerOnlyPauseCheck(
+        lastSeenAt: Date?,
+        lastText: String?,
+        triggers: [String],
+        config: RuntimeConfig
+    ) async {
+        guard !Task.isCancelled else { return }
+        guard !self.isCapturing else { return }
+        guard let lastSeenAt, let lastText else { return }
+        guard self.lastTranscriptAt == lastSeenAt, self.lastTranscript == lastText else { return }
+        guard self.isTriggerOnly(transcript: lastText, triggers: triggers) else { return }
+        if let cooldown = self.cooldownUntil, Date() < cooldown {
+            return
+        }
+        self.logger.info("voicewake runtime detected (trigger-only pause)")
+        await self.beginCapture(command: "", triggerEndTime: nil, config: config)
+    }
+
+    private func textOnlyFallbackMatch(
+        transcript: String,
+        triggers: [String],
+        config: WakeWordGateConfig
+    ) -> WakeWordGateMatch? {
+        guard WakeWordGate.matchesTextOnly(text: transcript, triggers: triggers) else { return nil }
+        guard Self.startsWithTrigger(transcript: transcript, triggers: triggers) else { return nil }
+        let trimmed = Self.trimmedAfterTrigger(transcript, triggers: triggers)
+        guard trimmed.count >= config.minCommandLength else { return nil }
+        return WakeWordGateMatch(triggerEndTime: 0, postGap: 0, command: trimmed)
+    }
+
+    private func isTriggerOnly(transcript: String, triggers: [String]) -> Bool {
+        guard WakeWordGate.matchesTextOnly(text: transcript, triggers: triggers) else { return false }
+        guard Self.startsWithTrigger(transcript: transcript, triggers: triggers) else { return false }
+        return Self.trimmedAfterTrigger(transcript, triggers: triggers).isEmpty
+    }
+
+    private func preDetectSilenceCheck(
+        lastSeenAt: Date?,
+        lastText: String?,
+        triggers: [String],
+        gateConfig: WakeWordGateConfig,
+        config: RuntimeConfig) async
+    {
+        guard !Task.isCancelled else { return }
+        guard !self.isCapturing else { return }
+        guard let lastSeenAt, let lastText else { return }
+        guard self.lastTranscriptAt == lastSeenAt, self.lastTranscript == lastText else { return }
+        guard let match = self.textOnlyFallbackMatch(
+            transcript: lastText,
+            triggers: triggers,
+            config: gateConfig)
+        else { return }
+        if let cooldown = self.cooldownUntil, Date() < cooldown {
+            return
+        }
+        self.logger.info("voicewake runtime detected (silence fallback) len=\(match.command.count)")
+        await self.beginCapture(
+            command: match.command,
+            triggerEndTime: match.triggerEndTime,
+            config: config)
+    }
+
     private func beginCapture(command: String, triggerEndTime: TimeInterval?, config: RuntimeConfig) async {
         self.listeningState = .voiceWake
         self.isCapturing = true
@@ -309,6 +541,10 @@
         self.heardBeyondTrigger = !command.isEmpty
         self.triggerChimePlayed = false
         self.activeTriggerEndTime = triggerEndTime
+        self.preDetectTask?.cancel()
+        self.preDetectTask = nil
+        self.triggerOnlyTask?.cancel()
+        self.triggerOnlyTask = nil
 
         if config.triggerChime != .none, !self.triggerChimePlayed {
             self.triggerChimePlayed = true
@@ -381,6 +617,12 @@
         self.heardBeyondTrigger = false
         self.triggerChimePlayed = false
         self.activeTriggerEndTime = nil
+        self.lastTranscript = nil
+        self.lastTranscriptAt = nil
+        self.preDetectTask?.cancel()
+        self.preDetectTask = nil
+        self.triggerOnlyTask?.cancel()
+        self.triggerOnlyTask = nil
         await MainActor.run { AppStateStore.shared.stopVoiceEars() }
 
         if let token = self.overlayToken {
@@ -464,10 +706,15 @@
             let nanos = UInt64(max(0, delay) * 1_000_000_000)
             try? await Task.sleep(nanoseconds: nanos)
             guard let self else { return }
+            await self.consumeScheduledRestart()
             await self.restartRecognizerIfIdleAndOverlayHidden()
         }
     }
 
+    private func consumeScheduledRestart() {
+        self.scheduledRestartTask = nil
+    }
+
     func applyPushToTalkCooldown() {
         self.cooldownUntil = Date().addingTimeInterval(self.debounceAfterSend)
     }
diff --git a/docs/platforms/mac/voicewake.md b/docs/platforms/mac/voicewake.md
index 33ab02e31..44db6df75 100644
--- a/docs/platforms/mac/voicewake.md
+++ b/docs/platforms/mac/voicewake.md
@@ -5,7 +5,7 @@ read_when:
 ---
 
 # Voice Wake & Push-to-Talk
-Updated: 2025-12-23 · Owners: mac app
+Updated: 2026-01-08 · Owners: mac app
 
 ## Modes
 - **Wake-word mode** (default): always-on Speech recognizer waits for trigger tokens (`swabbleTriggerWords`). On match it starts capture, shows the overlay with partial text, and auto-sends after silence.
@@ -13,7 +13,7 @@ Updated: 2025-12-23 · Owners: mac app
 
 ## Runtime behavior (wake-word)
 - Speech recognizer lives in `VoiceWakeRuntime`.
-- Trigger only fires when there’s a **meaningful pause** between the wake word and the next word (~0.45s gap).
+- Trigger only fires when there’s a **meaningful pause** between the wake word and the next word (~0.55s gap). The overlay/chime can start on the pause even before the command begins.
 - Silence windows: 2.0s when speech is flowing, 5.0s if only the trigger was heard.
 - Hard stop: 120s to prevent runaway sessions.
 - Debounce between sessions: 350ms.
@@ -42,6 +42,7 @@ Hardening:
 - **Voice Wake** toggle: enables wake-word runtime.
 - **Hold Cmd+Fn to talk**: enables the push-to-talk monitor. Disabled on macOS < 26.
 - Language & mic pickers, live level meter, trigger-word table, tester (local-only; does not forward).
+- Mic picker preserves the last selection if a device disconnects, shows a disconnected hint, and temporarily falls back to the system default until it returns.
 - **Sounds**: chimes on trigger detect and on send; defaults to the macOS “Glass” system sound. You can pick any `NSSound`-loadable file (e.g. MP3/WAV/AIFF) for each event or choose **No Sound**.
 
 ## Forwarding behavior
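
---

Reviewer note on the pause paths: both `scheduleTriggerOnlyPauseCheck` and `schedulePreDetectSilenceCheck` use the same snapshot-and-compare debounce — capture the last partial transcript and its timestamp, sleep for the window, then fire only if nothing newer arrived in the meantime. A minimal standalone sketch of that pattern (the `PauseGate` actor and `onPause` callback are illustrative names, not part of this diff):

```swift
import Foundation

// Sketch of the snapshot-and-compare debounce used by the trigger-pause checks.
// `PauseGate` and `onPause` are illustrative; VoiceWakeRuntime additionally
// guards on isCapturing, cooldownUntil, and trigger matching before firing.
actor PauseGate {
    private var lastText: String?
    private var lastSeenAt: Date?
    private var pending: Task<Void, Never>?

    /// Record a new partial transcript and re-arm the pause timer.
    func note(_ text: String, window: TimeInterval, onPause: @escaping @Sendable (String) -> Void) {
        let seenAt = Date()
        self.lastText = text
        self.lastSeenAt = seenAt
        self.pending?.cancel()
        self.pending = Task { [text, seenAt] in
            try? await Task.sleep(nanoseconds: UInt64(window * 1_000_000_000))
            guard !Task.isCancelled else { return }
            // Fire only if no newer transcript arrived while we slept.
            guard self.lastText == text, self.lastSeenAt == seenAt else { return }
            onPause(text)
        }
    }
}
```

The runtime layers its real guards (capture state, cooldown, trigger-only check) on top of this skeleton, and cancels the pending task in `stop`/`beginCapture` so a stale timer can never fire across sessions.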
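On the audio-tap log line: `noteAudioTap` converts RMS to decibels as `20 * log10(max(rms, 1e-7))`, so the `1e-7` floor clamps silent buffers to -140 dB instead of `-inf`. A quick worked check (`rmsToDb` is a hypothetical helper, not in the diff):

```swift
import Foundation

// RMS → dB as in noteAudioTap; the 1e-7 floor caps silence at -140 dB.
func rmsToDb(_ rms: Double) -> Double {
    20 * log10(max(rms, 1e-7))
}

rmsToDb(0.01) // -40.0  (typical speech-level RMS)
rmsToDb(0.0)  // -140.0 (silent buffer, clamped by the floor)
```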