From 0f1a262ae1a1e683ca321c3c477d4980b02c6e98 Mon Sep 17 00:00:00 2001
From: Xaden Ryan
Date: Wed, 7 Jan 2026 14:48:37 -0700
Subject: [PATCH] Mac: stabilize voice wake test flow

Why: voice wake tests often delivered partial/final transcripts without
reliable word timings, so timing-based trigger matching failed, the
timeout overwrote real detections, and test runs and mic capture kept
running after the UI moved on.

What: add a text-only prefix fallback and silence-based detection to the
test flow, stop and clean up any prior test before starting a new one,
cancel the timeout task on detection/stop, and tear down the meter and
any running test when the Voice Wake tab becomes inactive. Runtime
detection now falls back to final text-only matches when word timing is
missing. The UI gains a finalizing state so a stopped test can flush its
last transcript instead of hanging.
---
 .../Sources/Clawdbot/SettingsRootView.swift  |   2 +-
 .../Sources/Clawdbot/VoiceWakeRuntime.swift  |  43 ++-
 .../Sources/Clawdbot/VoiceWakeSettings.swift |  96 ++++++-
 .../Sources/Clawdbot/VoiceWakeTestCard.swift |   6 +
 .../Sources/Clawdbot/VoiceWakeTester.swift   | 264 +++++++++++++++++-
 5 files changed, 393 insertions(+), 18 deletions(-)

diff --git a/apps/macos/Sources/Clawdbot/SettingsRootView.swift b/apps/macos/Sources/Clawdbot/SettingsRootView.swift
index c99e88bab..22c348ea1 100644
--- a/apps/macos/Sources/Clawdbot/SettingsRootView.swift
+++ b/apps/macos/Sources/Clawdbot/SettingsRootView.swift
@@ -31,7 +31,7 @@ struct SettingsRootView: View {
                 .tabItem { Label("Connections", systemImage: "link") }
                 .tag(SettingsTab.connections)

-            VoiceWakeSettings(state: self.state)
+            VoiceWakeSettings(state: self.state, isActive: self.selectedTab == .voiceWake)
                 .tabItem { Label("Voice Wake", systemImage: "waveform.circle") }
                 .tag(SettingsTab.voiceWake)

diff --git a/apps/macos/Sources/Clawdbot/VoiceWakeRuntime.swift b/apps/macos/Sources/Clawdbot/VoiceWakeRuntime.swift
index e120710db..a0f661051 100644
--- a/apps/macos/Sources/Clawdbot/VoiceWakeRuntime.swift
+++ b/apps/macos/Sources/Clawdbot/VoiceWakeRuntime.swift
@@ -128,6 +128,7 @@ actor VoiceWakeRuntime {

         self.recognitionRequest = SFSpeechAudioBufferRecognitionRequest()
         self.recognitionRequest?.shouldReportPartialResults = true
+        self.recognitionRequest?.taskHint = .dictation
         guard let request = self.recognitionRequest else { return }

         // Lazily create the engine here so app launch doesn't grab audio resources / trigger Bluetooth HFP.
@@ -217,6 +218,7 @@ actor VoiceWakeRuntime {
     private func configureSession(localeID: String?) {
         let locale = localeID.flatMap { Locale(identifier: $0) } ??
             Locale(identifier: Locale.current.identifier)
         self.recognizer = SFSpeechRecognizer(locale: locale)
+        self.recognizer?.defaultTaskHint = .dictation
     }

     private func handleRecognition(_ update: RecognitionUpdate, config: RuntimeConfig) async {
@@ -271,10 +273,21 @@ actor VoiceWakeRuntime {
                 return
             }
             await self.beginCapture(command: match.command, triggerEndTime: match.triggerEndTime, config: config)
+        } else if update.isFinal {
+            let trimmed = Self.trimmedAfterTrigger(transcript, triggers: config.triggers)
+            if WakeWordGate.matchesTextOnly(text: transcript, triggers: config.triggers),
+               Self.startsWithTrigger(transcript: transcript, triggers: config.triggers),
+               !trimmed.isEmpty
+            {
+                if let cooldown = cooldownUntil, now < cooldown {
+                    return
+                }
+                await self.beginCapture(command: trimmed, triggerEndTime: nil, config: config)
+            }
         }
     }

-    private func beginCapture(command: String, triggerEndTime: TimeInterval, config: RuntimeConfig) async {
+    private func beginCapture(command: String, triggerEndTime: TimeInterval?, config: RuntimeConfig) async {
         self.listeningState = .voiceWake
         self.isCapturing = true
         DiagnosticsFileLog.shared.log(category: "voicewake.runtime", event: "beginCapture")
@@ -472,6 +485,34 @@ actor VoiceWakeRuntime {
         return text
     }

+    private static func startsWithTrigger(transcript: String, triggers: [String]) -> Bool {
+        let tokens = transcript
+            .split(whereSeparator: { $0.isWhitespace })
+            .map { normalizeToken(String($0)) }
+            .filter { !$0.isEmpty }
+        guard !tokens.isEmpty else { return false }
+        for trigger in triggers {
+            let triggerTokens = trigger
+                .split(whereSeparator: { $0.isWhitespace })
+                .map { normalizeToken(String($0)) }
+                .filter { !$0.isEmpty }
+            guard !triggerTokens.isEmpty, tokens.count >= triggerTokens.count else { continue }
+            if zip(triggerTokens, tokens.prefix(triggerTokens.count)).allSatisfy({ $0 == $1 }) {
+                return true
+            }
+        }
+        return false
+    }
+
+    private static func normalizeToken(_ token: String) -> String {
+        token
+            .trimmingCharacters(in: Self.whitespaceAndPunctuation)
+            .lowercased()
+    }
+
+    private static let whitespaceAndPunctuation = CharacterSet.whitespacesAndNewlines
+        .union(.punctuationCharacters)
+
     private static func commandAfterTrigger(
         transcript: String,
         segments: [WakeWordSegment],
diff --git a/apps/macos/Sources/Clawdbot/VoiceWakeSettings.swift b/apps/macos/Sources/Clawdbot/VoiceWakeSettings.swift
index a888c1558..4f486b653 100644
--- a/apps/macos/Sources/Clawdbot/VoiceWakeSettings.swift
+++ b/apps/macos/Sources/Clawdbot/VoiceWakeSettings.swift
@@ -1,15 +1,18 @@
 import AppKit
 import AVFoundation
 import Observation
+import SwabbleKit
 import Speech
 import SwiftUI
 import UniformTypeIdentifiers

 struct VoiceWakeSettings: View {
     @Bindable var state: AppState
+    let isActive: Bool
     @State private var testState: VoiceWakeTestState = .idle
     @State private var tester = VoiceWakeTester()
     @State private var isTesting = false
+    @State private var testTimeoutTask: Task<Void, Never>?
     @State private var availableMics: [AudioInputDevice] = []
     @State private var loadingMics = false
     @State private var meterLevel: Double = 0
@@ -101,8 +104,22 @@ struct VoiceWakeSettings: View {
             guard !self.isPreview else { return }
             Task { await self.restartMeter() }
         }
+        .onChange(of: self.isActive) { _, active in
+            guard !self.isPreview else { return }
+            if !active {
+                self.tester.stop()
+                self.isTesting = false
+                self.testState = .idle
+                self.testTimeoutTask?.cancel()
+                Task { await self.meter.stop() }
+            }
+        }
         .onDisappear {
             guard !self.isPreview else { return }
+            self.tester.stop()
+            self.isTesting = false
+            self.testState = .idle
+            self.testTimeoutTask?.cancel()
             Task { await self.meter.stop() }
         }
     }
@@ -205,13 +222,23 @@ struct VoiceWakeSettings: View {
             return
         }
         if self.isTesting {
-            self.tester.stop()
+            self.tester.finalize()
             self.isTesting = false
-            self.testState = .idle
+            self.testState = .finalizing
+            Task { @MainActor in
+                try? await Task.sleep(nanoseconds: 2_000_000_000)
+                if self.testState == .finalizing {
+                    self.tester.stop()
+                    self.testState = .failed("Stopped")
+                }
+            }
+            self.testTimeoutTask?.cancel()
             return
         }
         let triggers = self.sanitizedTriggers()
+        self.tester.stop()
+        self.testTimeoutTask?.cancel()
         self.isTesting = true
         self.testState = .requesting
         Task { @MainActor in
@@ -225,18 +252,31 @@
                     self.testState = newState
                     if case .detected = newState { self.isTesting = false }
                     if case .failed = newState { self.isTesting = false }
+                    if case .detected = newState { self.testTimeoutTask?.cancel() }
+                    if case .failed = newState { self.testTimeoutTask?.cancel() }
                 }
             })
-            try await Task.sleep(nanoseconds: 10 * 1_000_000_000)
-            if self.isTesting {
-                self.tester.stop()
-                self.testState = .failed("Timeout: no trigger heard")
-                self.isTesting = false
+            self.testTimeoutTask?.cancel()
+            self.testTimeoutTask = Task { @MainActor in
+                try? await Task.sleep(nanoseconds: 10 * 1_000_000_000)
+                guard !Task.isCancelled else { return }
+                if self.isTesting {
+                    self.tester.stop()
+                    if case let .hearing(text) = self.testState,
+                       let command = Self.textOnlyCommand(from: text, triggers: triggers)
+                    {
+                        self.testState = .detected(command)
+                    } else {
+                        self.testState = .failed("Timeout: no trigger heard")
+                    }
+                    self.isTesting = false
+                }
             }
             } catch {
                 self.tester.stop()
                 self.testState = .failed(error.localizedDescription)
                 self.isTesting = false
+                self.testTimeoutTask?.cancel()
             }
         }
     }
@@ -314,6 +354,44 @@ struct VoiceWakeSettings: View {
         sanitizeVoiceWakeTriggers(self.state.swabbleTriggerWords)
     }

+    private static func textOnlyCommand(from transcript: String, triggers: [String]) -> String? {
+        guard !transcript.isEmpty else { return nil }
+        let normalized = normalizeToken(transcript)
+        guard !normalized.isEmpty else { return nil }
+        guard startsWithTrigger(transcript: transcript, triggers: triggers) else { return nil }
+        guard WakeWordGate.matchesTextOnly(text: transcript, triggers: triggers) else { return nil }
+        let trimmed = WakeWordGate.stripWake(text: transcript, triggers: triggers)
+        return trimmed.isEmpty ? nil : trimmed
+    }
+
+    private static func startsWithTrigger(transcript: String, triggers: [String]) -> Bool {
+        let tokens = transcript
+            .split(whereSeparator: { $0.isWhitespace })
+            .map { normalizeToken(String($0)) }
+            .filter { !$0.isEmpty }
+        guard !tokens.isEmpty else { return false }
+        for trigger in triggers {
+            let triggerTokens = trigger
+                .split(whereSeparator: { $0.isWhitespace })
+                .map { normalizeToken(String($0)) }
+                .filter { !$0.isEmpty }
+            guard !triggerTokens.isEmpty, tokens.count >= triggerTokens.count else { continue }
+            if zip(triggerTokens, tokens.prefix(triggerTokens.count)).allSatisfy({ $0 == $1 }) {
+                return true
+            }
+        }
+        return false
+    }
+
+    private static func normalizeToken(_ token: String) -> String {
+        token
+            .trimmingCharacters(in: Self.whitespaceAndPunctuation)
+            .lowercased()
+    }
+
+    private static let whitespaceAndPunctuation = CharacterSet.whitespacesAndNewlines
+        .union(.punctuationCharacters)
+
     private var micPicker: some View {
         VStack(alignment: .leading, spacing: 6) {
             HStack(alignment: .firstTextBaseline, spacing: 10) {
@@ -506,7 +584,7 @@ struct VoiceWakeSettings: View {
 #if DEBUG
 struct VoiceWakeSettings_Previews: PreviewProvider {
     static var previews: some View {
-        VoiceWakeSettings(state: .preview)
+        VoiceWakeSettings(state: .preview, isActive: true)
             .frame(width: SettingsTab.windowWidth, height: SettingsTab.windowHeight)
     }
 }
@@ -519,7 +597,7 @@ extension VoiceWakeSettings {
         state.voicePushToTalkEnabled = true
         state.swabbleTriggerWords = ["Claude", "Hey"]

-        let view = VoiceWakeSettings(state: state)
+        let view = VoiceWakeSettings(state: state, isActive: true)
         view.availableMics = [AudioInputDevice(uid: "mic-1", name: "Built-in")]
         view.availableLocales = [Locale(identifier: "en_US")]
         view.meterLevel = 0.42
diff --git a/apps/macos/Sources/Clawdbot/VoiceWakeTestCard.swift b/apps/macos/Sources/Clawdbot/VoiceWakeTestCard.swift
index e13018fd0..7de20885a 100644
--- a/apps/macos/Sources/Clawdbot/VoiceWakeTestCard.swift
+++ b/apps/macos/Sources/Clawdbot/VoiceWakeTestCard.swift
@@ -57,6 +57,9 @@ struct VoiceWakeTestCard: View {
                     .symbolEffect(.pulse)
                     .foregroundStyle(Color.accentColor))

+        case .finalizing:
+            AnyView(ProgressView().controlSize(.small))
+
         case .detected:
             AnyView(Image(systemName: "checkmark.circle.fill").foregroundStyle(.green))

@@ -79,6 +82,9 @@
         case let .hearing(text):
             "Heard: \(text)"

+        case .finalizing:
+            "Finalizing…"
+
         case .detected:
             "Voice wake detected!"

diff --git a/apps/macos/Sources/Clawdbot/VoiceWakeTester.swift b/apps/macos/Sources/Clawdbot/VoiceWakeTester.swift
index 85de8c67a..b35a2c099 100644
--- a/apps/macos/Sources/Clawdbot/VoiceWakeTester.swift
+++ b/apps/macos/Sources/Clawdbot/VoiceWakeTester.swift
@@ -8,6 +8,7 @@ enum VoiceWakeTestState: Equatable {
     case requesting
     case listening
     case hearing(String)
+    case finalizing
     case detected(String)
     case failed(String)
 }
@@ -18,8 +19,15 @@ final class VoiceWakeTester {
     private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest?
     private var recognitionTask: SFSpeechRecognitionTask?
     private var isStopping = false
+    private var isFinalizing = false
     private var detectionStart: Date?
     private var lastHeard: Date?
+    private var lastLoggedText: String?
+    private var lastLoggedAt: Date?
+    private var lastTranscript: String?
+    private var lastTranscriptAt: Date?
+    private var silenceTask: Task<Void, Never>?
+    private var currentTriggers: [String] = []
     private var holdingAfterDetect = false
     private var detectedText: String?
     private let logger = Logger(subsystem: "com.clawdbot", category: "voicewake")
@@ -37,6 +45,17 @@
     {
         guard self.recognitionTask == nil else { return }
         self.isStopping = false
+        self.isFinalizing = false
+        self.holdingAfterDetect = false
+        self.detectedText = nil
+        self.lastHeard = nil
+        self.lastLoggedText = nil
+        self.lastLoggedAt = nil
+        self.lastTranscript = nil
+        self.lastTranscriptAt = nil
+        self.silenceTask?.cancel()
+        self.silenceTask = nil
+        self.currentTriggers = triggers
         let chosenLocale = localeID.flatMap { Locale(identifier: $0) } ?? Locale.current
         let recognizer = SFSpeechRecognizer(locale: chosenLocale)
         guard let recognizer, recognizer.isAvailable else {
             throw NSError(
                 code: 1,
                 userInfo: [NSLocalizedDescriptionKey: "Speech recognition unavailable"])
         }
+        recognizer.defaultTaskHint = .dictation

         guard Self.hasPrivacyStrings else {
             throw NSError(
@@ -70,6 +90,7 @@

         self.recognitionRequest = SFSpeechAudioBufferRecognitionRequest()
         self.recognitionRequest?.shouldReportPartialResults = true
+        self.recognitionRequest?.taskHint = .dictation

         let request = self.recognitionRequest
         let inputNode = self.audioEngine.inputNode
@@ -96,9 +117,21 @@
             let segments = result.map { WakeWordSpeechSegments.from(
                 transcription: $0.bestTranscription, transcript: text) } ?? []
-            let gateConfig = WakeWordGateConfig(triggers: triggers)
-            let match = WakeWordGate.match(transcript: text, segments: segments, config: gateConfig)
             let isFinal = result?.isFinal ?? false
+            let gateConfig = WakeWordGateConfig(triggers: triggers)
+            var match = WakeWordGate.match(transcript: text, segments: segments, config: gateConfig)
+            if match == nil, isFinal {
+                match = self.textOnlyFallbackMatch(
+                    transcript: text,
+                    triggers: triggers,
+                    config: gateConfig)
+            }
+            self.maybeLogDebug(
+                transcript: text,
+                segments: segments,
+                triggers: triggers,
+                match: match,
+                isFinal: isFinal)
             let errorMessage = error?.localizedDescription

             Task { [weak self] in
@@ -114,13 +147,47 @@
     }

     func stop() {
-        self.isStopping = true
+        self.stop(force: true)
+    }
+
+    func finalize(timeout: TimeInterval = 1.5) {
+        guard self.recognitionTask != nil else {
+            self.stop(force: true)
+            return
+        }
+        self.isFinalizing = true
+        self.audioEngine.inputNode.removeTap(onBus: 0)
+        self.recognitionRequest?.endAudio()
+        self.audioEngine.stop()
+        Task { [weak self] in
+            guard let self else { return }
+            try? await Task.sleep(nanoseconds: UInt64(timeout * 1_000_000_000))
+            if !self.isStopping {
+                self.stop(force: true)
+            }
+        }
+    }
+
+    private func stop(force: Bool) {
+        if force { self.isStopping = true }
+        self.isFinalizing = false
         self.audioEngine.stop()
         self.recognitionRequest?.endAudio()
         self.recognitionTask?.cancel()
         self.recognitionTask = nil
         self.recognitionRequest = nil
         self.audioEngine.inputNode.removeTap(onBus: 0)
+        self.holdingAfterDetect = false
+        self.detectedText = nil
+        self.lastHeard = nil
+        self.detectionStart = nil
+        self.lastLoggedText = nil
+        self.lastLoggedAt = nil
+        self.lastTranscript = nil
+        self.lastTranscriptAt = nil
+        self.silenceTask?.cancel()
+        self.silenceTask = nil
+        self.currentTriggers = []
     }

     private func handleResult(
@@ -132,6 +199,11 @@
     {
         if !text.isEmpty {
             self.lastHeard = Date()
+            self.lastTranscript = text
+            self.lastTranscriptAt = Date()
+        }
+        if self.holdingAfterDetect {
+            return
         }
         if let match, !match.command.isEmpty {
             self.holdingAfterDetect = true
@@ -141,17 +213,28 @@
             Task.detached {
                 await VoiceWakeForwarder.forward(transcript: match.command)
             }
-            Task { @MainActor in onUpdate(.detected(match.command)) }
-            self.holdUntilSilence(onUpdate: onUpdate)
+            self.stop()
+            await MainActor.run {
+                AppStateStore.shared.stopVoiceEars()
+                onUpdate(.detected(match.command))
+            }
             return
         }
+        if !isFinal, !text.isEmpty {
+            self.scheduleSilenceCheck(
+                triggers: self.currentTriggers,
+                onUpdate: onUpdate)
+        }
+        if self.isFinalizing {
+            Task { @MainActor in onUpdate(.finalizing) }
+        }
         if let errorMessage {
-            self.stop()
+            self.stop(force: true)
             Task { @MainActor in onUpdate(.failed(errorMessage)) }
             return
         }
         if isFinal {
-            self.stop()
+            self.stop(force: true)
             let state: VoiceWakeTestState = text.isEmpty
                 ? .failed("No speech detected")
                 : .failed("No trigger heard: “\(text)”")
@@ -162,6 +245,139 @@
         }
     }

+    private func maybeLogDebug(
+        transcript: String,
+        segments: [WakeWordSegment],
+        triggers: [String],
+        match: WakeWordGateMatch?,
+        isFinal: Bool) {
+        guard !transcript.isEmpty else { return }
+        if transcript == self.lastLoggedText, !isFinal {
+            if let last = self.lastLoggedAt, Date().timeIntervalSince(last) < 0.25 {
+                return
+            }
+        }
+        self.lastLoggedText = transcript
+        self.lastLoggedAt = Date()
+
+        let textOnly = WakeWordGate.matchesTextOnly(text: transcript, triggers: triggers)
+        let gaps = Self.debugCandidateGaps(triggers: triggers, segments: segments)
+        let segmentSummary = Self.debugSegments(segments)
+        let timingCount = segments.filter { $0.start > 0 || $0.duration > 0 }.count
+        let matchSummary = match.map {
+            "match=true gap=\(String(format: "%.2f", $0.postGap))s cmdLen=\($0.command.count)"
+        } ?? "match=false"
"match=false" + + self.logger.info( + "voicewake test transcript='\(transcript, privacy: .public)' textOnly=\(textOnly) " + + "isFinal=\(isFinal) timing=\(timingCount)/\(segments.count) " + + "\(matchSummary) gaps=[\(gaps, privacy: .public)] segments=[\(segmentSummary, privacy: .public)]") + } + + private static func debugSegments(_ segments: [WakeWordSegment]) -> String { + segments.map { seg in + let start = String(format: "%.2f", seg.start) + let end = String(format: "%.2f", seg.end) + return "\(seg.text)@\(start)-\(end)" + }.joined(separator: ", ") + } + + private static func debugCandidateGaps(triggers: [String], segments: [WakeWordSegment]) -> String { + let tokens = normalizeSegments(segments) + guard !tokens.isEmpty else { return "" } + let triggerTokens = normalizeTriggers(triggers) + var gaps: [String] = [] + + for trigger in triggerTokens { + let count = trigger.tokens.count + guard count > 0, tokens.count > count else { continue } + for i in 0...(tokens.count - count - 1) { + let matched = (0.. [DebugTriggerTokens] { + var output: [DebugTriggerTokens] = [] + for trigger in triggers { + let tokens = trigger + .split(whereSeparator: { $0.isWhitespace }) + .map { normalizeToken(String($0)) } + .filter { !$0.isEmpty } + if tokens.isEmpty { continue } + output.append(DebugTriggerTokens(tokens: tokens)) + } + return output + } + + private static func normalizeSegments(_ segments: [WakeWordSegment]) -> [DebugToken] { + segments.compactMap { segment in + let normalized = normalizeToken(segment.text) + guard !normalized.isEmpty else { return nil } + return DebugToken( + normalized: normalized, + start: segment.start, + end: segment.end) + } + } + + private static func normalizeToken(_ token: String) -> String { + token + .trimmingCharacters(in: Self.whitespaceAndPunctuation) + .lowercased() + } + + private static let whitespaceAndPunctuation = CharacterSet.whitespacesAndNewlines + .union(.punctuationCharacters) + + private func textOnlyFallbackMatch( + transcript: String, + triggers: [String], + config: WakeWordGateConfig + ) -> WakeWordGateMatch? 
+    {
+        guard WakeWordGate.matchesTextOnly(text: transcript, triggers: triggers) else { return nil }
+        guard Self.startsWithTrigger(transcript: transcript, triggers: triggers) else { return nil }
+        let trimmed = WakeWordGate.stripWake(text: transcript, triggers: triggers)
+        guard trimmed.count >= config.minCommandLength else { return nil }
+        return WakeWordGateMatch(triggerEndTime: 0, postGap: 0, command: trimmed)
+    }
+
+    private static func startsWithTrigger(transcript: String, triggers: [String]) -> Bool {
+        let tokens = transcript
+            .split(whereSeparator: { $0.isWhitespace })
+            .map { normalizeToken(String($0)) }
+            .filter { !$0.isEmpty }
+        guard !tokens.isEmpty else { return false }
+        for trigger in triggers {
+            let triggerTokens = trigger
+                .split(whereSeparator: { $0.isWhitespace })
+                .map { normalizeToken(String($0)) }
+                .filter { !$0.isEmpty }
+            guard !triggerTokens.isEmpty, tokens.count >= triggerTokens.count else { continue }
+            if zip(triggerTokens, tokens.prefix(triggerTokens.count)).allSatisfy({ $0 == $1 }) {
+                return true
+            }
+        }
+        return false
+    }
+
     private func holdUntilSilence(onUpdate: @escaping @Sendable (VoiceWakeTestState) -> Void) {
         Task { [weak self] in
             guard let self else { return }
@@ -187,6 +403,40 @@
         }
     }

+    private func scheduleSilenceCheck(
+        triggers: [String],
+        onUpdate: @escaping @Sendable (VoiceWakeTestState) -> Void
+    ) {
+        self.silenceTask?.cancel()
+        let lastSeenAt = self.lastTranscriptAt
+        let lastText = self.lastTranscript
+        self.silenceTask = Task { [weak self] in
+            guard let self else { return }
+            try? await Task.sleep(nanoseconds: UInt64(self.silenceWindow * 1_000_000_000))
+            guard !Task.isCancelled else { return }
+            guard !self.isStopping, !self.holdingAfterDetect else { return }
+            guard let lastSeenAt, let lastText else { return }
+            guard self.lastTranscriptAt == lastSeenAt, self.lastTranscript == lastText else { return }
+            guard let match = self.textOnlyFallbackMatch(
+                transcript: lastText,
+                triggers: triggers,
+                config: WakeWordGateConfig(triggers: triggers)
+            ) else { return }
+            self.holdingAfterDetect = true
+            self.detectedText = match.command
+            self.logger.info("voice wake detected (silence); forwarding (len=\(match.command.count))")
+            await MainActor.run { AppStateStore.shared.triggerVoiceEars(ttl: nil) }
+            Task.detached {
+                await VoiceWakeForwarder.forward(transcript: match.command)
+            }
+            self.stop()
+            await MainActor.run {
+                AppStateStore.shared.stopVoiceEars()
+                onUpdate(.detected(match.command))
+            }
+        }
+    }
+
     private func configureSession(preferredMicID: String?) {
         _ = preferredMicID
     }
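-- 
Reviewer note, outside the patch proper (everything after the "-- " trailer
is stripped by git am): the heart of the new fallback path is a token-level
prefix check that needs no word timings. Below is a minimal standalone
sketch of that check, with normalizeToken/startsWithTrigger re-declared
here for illustration only; the patch itself routes full-text matching
through SwabbleKit's WakeWordGate, which is not reproduced here.

    import Foundation

    // Lowercase a token and trim surrounding whitespace/punctuation,
    // mirroring normalizeToken in the patch.
    func normalizeToken(_ token: String) -> String {
        token.trimmingCharacters(
            in: CharacterSet.whitespacesAndNewlines.union(.punctuationCharacters))
            .lowercased()
    }

    // True when the transcript's leading tokens equal one trigger's tokens,
    // mirroring startsWithTrigger in the patch.
    func startsWithTrigger(_ transcript: String, triggers: [String]) -> Bool {
        let tokens = transcript.split(whereSeparator: { $0.isWhitespace })
            .map { normalizeToken(String($0)) }
            .filter { !$0.isEmpty }
        return triggers.contains { trigger in
            let triggerTokens = trigger.split(whereSeparator: { $0.isWhitespace })
                .map { normalizeToken(String($0)) }
                .filter { !$0.isEmpty }
            return !triggerTokens.isEmpty
                && tokens.count >= triggerTokens.count
                && Array(tokens.prefix(triggerTokens.count)) == triggerTokens
        }
    }

    // Punctuation and casing are ignored; the trigger must be a prefix.
    print(startsWithTrigger("Claude, open the garage", triggers: ["Claude"])) // true
    print(startsWithTrigger("OK Claude open it", triggers: ["Claude"]))       // false

Requiring the trigger as a prefix (rather than anywhere in the transcript)
is what lets the final text-only match run safely without segment timings:
everything after the trigger tokens is, by construction, the command.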