Mac: stabilize voice wake test flow

Why: voice wake tests often delivered partial/final transcripts without reliable word timings, so trigger matching failed, timeouts overwrote detections, and test runs/mic capture kept running after UI changes.

What: add text-only/prefix fallback and silence-based detection in the test flow, stop/clean up any prior test, cancel timeout on detection/stop, and tear down meter/test when the Voice Wake tab is inactive. Runtime detection now falls back on final text-only matches when timing is missing. UI state now reflects finalizing and prevents hanging tests.
This commit is contained in:
Xaden Ryan
2026-01-07 14:48:37 -07:00
committed by Peter Steinberger
parent 2140caaf67
commit 0f1a262ae1
5 changed files with 393 additions and 18 deletions

View File

@@ -8,6 +8,7 @@ enum VoiceWakeTestState: Equatable {
case requesting
case listening
case hearing(String)
case finalizing
case detected(String)
case failed(String)
}
@@ -18,8 +19,15 @@ final class VoiceWakeTester {
private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest?
private var recognitionTask: SFSpeechRecognitionTask?
private var isStopping = false
private var isFinalizing = false
private var detectionStart: Date?
private var lastHeard: Date?
private var lastLoggedText: String?
private var lastLoggedAt: Date?
private var lastTranscript: String?
private var lastTranscriptAt: Date?
private var silenceTask: Task<Void, Never>?
private var currentTriggers: [String] = []
private var holdingAfterDetect = false
private var detectedText: String?
private let logger = Logger(subsystem: "com.clawdbot", category: "voicewake")
@@ -37,6 +45,17 @@ final class VoiceWakeTester {
{
guard self.recognitionTask == nil else { return }
self.isStopping = false
self.isFinalizing = false
self.holdingAfterDetect = false
self.detectedText = nil
self.lastHeard = nil
self.lastLoggedText = nil
self.lastLoggedAt = nil
self.lastTranscript = nil
self.lastTranscriptAt = nil
self.silenceTask?.cancel()
self.silenceTask = nil
self.currentTriggers = triggers
let chosenLocale = localeID.flatMap { Locale(identifier: $0) } ?? Locale.current
let recognizer = SFSpeechRecognizer(locale: chosenLocale)
guard let recognizer, recognizer.isAvailable else {
@@ -45,6 +64,7 @@ final class VoiceWakeTester {
code: 1,
userInfo: [NSLocalizedDescriptionKey: "Speech recognition unavailable"])
}
recognizer.defaultTaskHint = .dictation
guard Self.hasPrivacyStrings else {
throw NSError(
@@ -70,6 +90,7 @@ final class VoiceWakeTester {
self.recognitionRequest = SFSpeechAudioBufferRecognitionRequest()
self.recognitionRequest?.shouldReportPartialResults = true
self.recognitionRequest?.taskHint = .dictation
let request = self.recognitionRequest
let inputNode = self.audioEngine.inputNode
@@ -96,9 +117,21 @@ final class VoiceWakeTester {
let segments = result.map { WakeWordSpeechSegments.from(
transcription: $0.bestTranscription,
transcript: text) } ?? []
let gateConfig = WakeWordGateConfig(triggers: triggers)
let match = WakeWordGate.match(transcript: text, segments: segments, config: gateConfig)
let isFinal = result?.isFinal ?? false
let gateConfig = WakeWordGateConfig(triggers: triggers)
var match = WakeWordGate.match(transcript: text, segments: segments, config: gateConfig)
if match == nil, isFinal {
match = self.textOnlyFallbackMatch(
transcript: text,
triggers: triggers,
config: gateConfig)
}
self.maybeLogDebug(
transcript: text,
segments: segments,
triggers: triggers,
match: match,
isFinal: isFinal)
let errorMessage = error?.localizedDescription
Task { [weak self] in
@@ -114,13 +147,47 @@ final class VoiceWakeTester {
}
/// Stops the current voice wake test immediately and resets all per-run state.
///
/// This is the public entry point; the actual teardown lives in `stop(force:)`.
func stop() {
    // `stop(force: true)` already sets `isStopping`, so a separate flag write here is redundant.
    self.stop(force: true)
}
/// Ends audio capture and gives the recognizer a short grace period to deliver a final result.
///
/// When no recognition task is active, everything is torn down right away. Otherwise the
/// input tap is removed, the request is told that audio has ended, and the engine is stopped.
/// A watchdog then force-stops the session if that same session is still alive after `timeout`.
///
/// - Parameter timeout: Grace period in seconds. Negative or NaN values are treated as 0 and
///   very large values are capped, because converting them to `UInt64` would otherwise trap.
func finalize(timeout: TimeInterval = 1.5) {
    guard let pendingTask = self.recognitionTask else {
        self.stop(force: true)
        return
    }
    // Remember which session this watchdog belongs to. `start` resets `isStopping`, so the flag
    // alone cannot tell a finished session apart from a newly started one.
    let pendingID = ObjectIdentifier(pendingTask)

    self.isFinalizing = true
    self.audioEngine.inputNode.removeTap(onBus: 0)
    self.recognitionRequest?.endAudio()
    self.audioEngine.stop()

    // `UInt64(_: Double)` traps on negative, NaN, or unrepresentable input, so clamp first.
    let graceSeconds = timeout.isNaN ? 0 : min(max(timeout, 0), 1_000_000)
    let graceNanoseconds = UInt64(graceSeconds * 1_000_000_000)

    Task { [weak self] in
        guard let self else { return }
        try? await Task.sleep(nanoseconds: graceNanoseconds)
        // Only tear down if nobody stopped us in the meantime and the active recognition task
        // is still the one this watchdog was created for.
        guard !self.isStopping,
              self.recognitionTask.map({ ObjectIdentifier($0) }) == pendingID
        else { return }
        self.stop(force: true)
    }
}
/// Tears down audio capture and recognition, then clears every piece of per-run state.
///
/// - Parameter force: When `true`, also marks the tester as stopping via `isStopping`.
private func stop(force: Bool) {
    if force {
        self.isStopping = true
    }
    self.isFinalizing = false

    // Audio and recognition teardown, in the same order as before.
    self.audioEngine.stop()
    if let request = self.recognitionRequest {
        request.endAudio()
    }
    if let task = self.recognitionTask {
        task.cancel()
    }
    self.recognitionTask = nil
    self.recognitionRequest = nil
    self.audioEngine.inputNode.removeTap(onBus: 0)

    // Pending silence check, if any.
    if let pendingSilenceCheck = self.silenceTask {
        pendingSilenceCheck.cancel()
    }
    self.silenceTask = nil

    // Detection bookkeeping.
    self.holdingAfterDetect = false
    self.detectedText = nil
    self.detectionStart = nil
    self.currentTriggers = []

    // Transcript and logging bookkeeping.
    self.lastHeard = nil
    self.lastTranscript = nil
    self.lastTranscriptAt = nil
    self.lastLoggedText = nil
    self.lastLoggedAt = nil
}
private func handleResult(
@@ -132,6 +199,11 @@ final class VoiceWakeTester {
{
if !text.isEmpty {
self.lastHeard = Date()
self.lastTranscript = text
self.lastTranscriptAt = Date()
}
if self.holdingAfterDetect {
return
}
if let match, !match.command.isEmpty {
self.holdingAfterDetect = true
@@ -141,17 +213,28 @@ final class VoiceWakeTester {
Task.detached {
await VoiceWakeForwarder.forward(transcript: match.command)
}
Task { @MainActor in onUpdate(.detected(match.command)) }
self.holdUntilSilence(onUpdate: onUpdate)
self.stop()
await MainActor.run {
AppStateStore.shared.stopVoiceEars()
onUpdate(.detected(match.command))
}
return
}
if !isFinal, !text.isEmpty {
self.scheduleSilenceCheck(
triggers: self.currentTriggers,
onUpdate: onUpdate)
}
if self.isFinalizing {
Task { @MainActor in onUpdate(.finalizing) }
}
if let errorMessage {
self.stop()
self.stop(force: true)
Task { @MainActor in onUpdate(.failed(errorMessage)) }
return
}
if isFinal {
self.stop()
self.stop(force: true)
let state: VoiceWakeTestState = text.isEmpty
? .failed("No speech detected")
: .failed("No trigger heard: “\(text)")
@@ -162,6 +245,139 @@ final class VoiceWakeTester {
}
}
/// Writes one diagnostic log line describing the current transcript and how it matched.
///
/// Empty transcripts are ignored. A non-final transcript identical to the last logged one is
/// skipped when the previous log line is less than 0.25 seconds old.
///
/// - Parameters:
///   - transcript: The recognized text so far.
///   - segments: Segments derived from the transcription.
///   - triggers: Wake phrases being tested.
///   - match: The gate match for this transcript, if any.
///   - isFinal: Whether the recognizer marked this result as final.
private func maybeLogDebug(
    transcript: String,
    segments: [WakeWordSegment],
    triggers: [String],
    match: WakeWordGateMatch?,
    isFinal: Bool) {
    if transcript.isEmpty { return }

    let repeatsLastPartial = transcript == self.lastLoggedText && !isFinal
    if repeatsLastPartial,
       let previous = self.lastLoggedAt,
       Date().timeIntervalSince(previous) < 0.25
    {
        return
    }
    self.lastLoggedText = transcript
    self.lastLoggedAt = Date()

    let textOnly = WakeWordGate.matchesTextOnly(text: transcript, triggers: triggers)
    let gaps = Self.debugCandidateGaps(triggers: triggers, segments: segments)
    let segmentSummary = Self.debugSegments(segments)

    // Count segments that carry any timing information at all.
    var timingCount = 0
    for segment in segments where segment.start > 0 || segment.duration > 0 {
        timingCount += 1
    }

    let matchSummary: String
    if let match {
        matchSummary = "match=true gap=\(String(format: "%.2f", match.postGap))s cmdLen=\(match.command.count)"
    } else {
        matchSummary = "match=false"
    }

    self.logger.info(
        "voicewake test transcript='\(transcript, privacy: .public)' textOnly=\(textOnly) " +
        "isFinal=\(isFinal) timing=\(timingCount)/\(segments.count) " +
        "\(matchSummary) gaps=[\(gaps, privacy: .public)] segments=[\(segmentSummary, privacy: .public)]")
}
/// Renders segments as a comma-separated list of `text@start-end`, with times to two decimals.
private static func debugSegments(_ segments: [WakeWordSegment]) -> String {
    var rendered: [String] = []
    for segment in segments {
        let from = String(format: "%.2f", segment.start)
        let to = String(format: "%.2f", segment.end)
        rendered.append("\(segment.text)@\(from)-\(to)")
    }
    return rendered.joined(separator: ", ")
}
/// Describes, for every place a trigger phrase occurs in the segments, the pause between the
/// end of the trigger and the start of the token that follows it.
///
/// Occurrences at the very end of the transcript are not reported because there is no
/// following token to measure against.
///
/// - Returns: Entries of the form `trigger words:0.00s`, joined by `", "`; empty when there are
///   no usable segments or no occurrences.
private static func debugCandidateGaps(triggers: [String], segments: [WakeWordSegment]) -> String {
    let spoken = normalizeSegments(segments)
    if spoken.isEmpty { return "" }

    var report: [String] = []
    for phrase in normalizeTriggers(triggers) {
        let words = phrase.tokens
        let width = words.count
        // At least one token must follow the trigger, otherwise there is no gap to measure.
        if width == 0 || spoken.count <= width { continue }

        for offset in 0..<(spoken.count - width) {
            var isHit = true
            for k in 0..<width where spoken[offset + k].normalized != words[k] {
                isHit = false
                break
            }
            guard isHit else { continue }

            let follower = spoken[offset + width]
            let pause = follower.start - spoken[offset + width - 1].end
            let pauseText = String(format: "%.2f", pause)
            report.append("\(words.joined(separator: " ")):\(pauseText)s")
        }
    }
    return report.joined(separator: ", ")
}
/// A transcript segment reduced to what the debug gap report needs.
private struct DebugToken {
/// Segment text after `normalizeToken` (trimmed of whitespace/punctuation and lowercased).
let normalized: String
/// Copied from the segment's `start`.
let start: TimeInterval
/// Copied from the segment's `end`.
let end: TimeInterval
}
/// A trigger phrase split into normalized words, as produced by `normalizeTriggers`.
private struct DebugTriggerTokens {
/// The phrase's non-empty normalized words, in order.
let tokens: [String]
}
/// Splits each trigger phrase on whitespace and normalizes its words.
///
/// Phrases that end up with no words after normalization are left out.
private static func normalizeTriggers(_ triggers: [String]) -> [DebugTriggerTokens] {
    triggers.compactMap { phrase -> DebugTriggerTokens? in
        var words: [String] = []
        for piece in phrase.split(whereSeparator: { $0.isWhitespace }) {
            let word = normalizeToken(String(piece))
            if !word.isEmpty {
                words.append(word)
            }
        }
        return words.isEmpty ? nil : DebugTriggerTokens(tokens: words)
    }
}
/// Converts segments into debug tokens, dropping any whose text normalizes to an empty string.
private static func normalizeSegments(_ segments: [WakeWordSegment]) -> [DebugToken] {
    var tokens: [DebugToken] = []
    for segment in segments {
        let word = normalizeToken(segment.text)
        if word.isEmpty { continue }
        tokens.append(DebugToken(normalized: word, start: segment.start, end: segment.end))
    }
    return tokens
}
/// Strips leading/trailing whitespace and punctuation from a word, then lowercases it.
private static func normalizeToken(_ token: String) -> String {
    let stripped = token.trimmingCharacters(in: Self.whitespaceAndPunctuation)
    return stripped.lowercased()
}
/// Characters trimmed from the edges of a token: whitespace, newlines, and punctuation.
private static let whitespaceAndPunctuation: CharacterSet =
    CharacterSet.punctuationCharacters.union(.whitespacesAndNewlines)
/// Builds a match from text alone, for transcripts that lack usable word timings.
///
/// A match is produced only when the transcript passes `WakeWordGate.matchesTextOnly`, begins
/// with one of the triggers, and the text left after stripping the wake phrase is at least
/// `config.minCommandLength` characters long.
///
/// - Returns: A match with zeroed timing fields, or `nil` when any check fails.
private func textOnlyFallbackMatch(
    transcript: String,
    triggers: [String],
    config: WakeWordGateConfig
) -> WakeWordGateMatch? {
    let mentionsTrigger = WakeWordGate.matchesTextOnly(text: transcript, triggers: triggers)
    if !mentionsTrigger || !Self.startsWithTrigger(transcript: transcript, triggers: triggers) {
        return nil
    }

    let command = WakeWordGate.stripWake(text: transcript, triggers: triggers)
    if command.count < config.minCommandLength {
        return nil
    }
    return WakeWordGateMatch(triggerEndTime: 0, postGap: 0, command: command)
}
/// Reports whether the transcript's leading words equal the words of any trigger phrase.
///
/// Both sides are split on whitespace and normalized word by word before comparison.
private static func startsWithTrigger(transcript: String, triggers: [String]) -> Bool {
    // Shared tokenizer for the transcript and each trigger phrase.
    func normalizedWords(_ phrase: String) -> [String] {
        var words: [String] = []
        for piece in phrase.split(whereSeparator: { $0.isWhitespace }) {
            let word = normalizeToken(String(piece))
            if !word.isEmpty {
                words.append(word)
            }
        }
        return words
    }

    let spoken = normalizedWords(transcript)
    if spoken.isEmpty { return false }

    for phrase in triggers {
        let wanted = normalizedWords(phrase)
        if wanted.isEmpty || spoken.count < wanted.count { continue }
        if Array(spoken.prefix(wanted.count)) == wanted {
            return true
        }
    }
    return false
}
private func holdUntilSilence(onUpdate: @escaping @Sendable (VoiceWakeTestState) -> Void) {
Task { [weak self] in
guard let self else { return }
@@ -187,6 +403,40 @@ final class VoiceWakeTester {
}
}
/// Re-arms the silence timer: after `silenceWindow` seconds with no newer transcript, tries a
/// text-only match on the last transcript and reports a detection if it succeeds.
///
/// Any previously scheduled check is cancelled first. The check does nothing if it was
/// cancelled, the tester is stopping, a detection is already being held, or the transcript
/// changed while it was waiting.
///
/// - Parameters:
///   - triggers: Wake phrases to match against.
///   - onUpdate: Receives `.detected` with the matched command.
private func scheduleSilenceCheck(
    triggers: [String],
    onUpdate: @escaping @Sendable (VoiceWakeTestState) -> Void
) {
    self.silenceTask?.cancel()

    // Snapshot what was heard last, so the check can tell whether anything newer arrived.
    let snapshotAt = self.lastTranscriptAt
    let snapshotText = self.lastTranscript

    self.silenceTask = Task { [weak self] in
        guard let self else { return }
        try? await Task.sleep(nanoseconds: UInt64(self.silenceWindow * 1_000_000_000))

        if Task.isCancelled || self.isStopping || self.holdingAfterDetect { return }
        guard let snapshotAt, let snapshotText,
              self.lastTranscriptAt == snapshotAt,
              self.lastTranscript == snapshotText
        else { return }

        let gate = WakeWordGateConfig(triggers: triggers)
        guard let match = self.textOnlyFallbackMatch(
            transcript: snapshotText,
            triggers: triggers,
            config: gate)
        else { return }

        self.holdingAfterDetect = true
        self.detectedText = match.command
        self.logger.info("voice wake detected (silence); forwarding (len=\(match.command.count))")

        await MainActor.run { AppStateStore.shared.triggerVoiceEars(ttl: nil) }
        Task.detached {
            await VoiceWakeForwarder.forward(transcript: match.command)
        }

        // Tear down capture before telling the UI about the detection.
        self.stop()
        await MainActor.run {
            AppStateStore.shared.stopVoiceEars()
            onUpdate(.detected(match.command))
        }
    }
}
/// Currently a no-op: the argument is explicitly discarded and nothing is configured.
/// NOTE(review): presumably a hook for selecting a preferred microphone — confirm intent.
/// - Parameter preferredMicID: Identifier of the preferred microphone, if any; unused here.
private func configureSession(preferredMicID: String?) {
// Explicitly discard the value; no other work is done with it.
_ = preferredMicID
}