Voice Wake: trigger on wake-word pause

This commit is contained in:
Xaden Ryan
2026-01-07 17:59:26 -07:00
committed by Peter Steinberger
parent 804177b1f5
commit da972b119d
2 changed files with 265 additions and 17 deletions

View File

@@ -38,6 +38,15 @@ actor VoiceWakeRuntime {
private var overlayToken: UUID?
private var activeTriggerEndTime: TimeInterval?
private var scheduledRestartTask: Task<Void, Never>?
private var lastLoggedText: String?
private var lastLoggedAt: Date?
private var lastTapLogAt: Date?
private var lastCallbackLogAt: Date?
private var lastTranscript: String?
private var lastTranscriptAt: Date?
private var preDetectTask: Task<Void, Never>?
private var isStarting: Bool = false
private var triggerOnlyTask: Task<Void, Never>?
// Tunables
// Silence threshold once we've captured user speech (post-trigger).
@@ -50,6 +59,8 @@ actor VoiceWakeRuntime {
// Voice activity detection parameters (RMS-based).
private let minSpeechRMS: Double = 1e-3
private let speechBoostFactor: Double = 6.0 // how far above noise floor we require to mark speech
private let preDetectSilenceWindow: TimeInterval = 1.0
private let triggerPauseWindow: TimeInterval = 0.55
/// Stops the active Speech pipeline without clearing the stored config, so we can restart cleanly.
private func haltRecognitionPipeline() {
@@ -106,6 +117,19 @@ actor VoiceWakeRuntime {
let config = snapshot.1
if self.isStarting {
return
}
if self.scheduledRestartTask != nil, config == self.currentConfig, self.recognitionTask == nil {
return
}
if self.scheduledRestartTask != nil {
self.scheduledRestartTask?.cancel()
self.scheduledRestartTask = nil
}
if config == self.currentConfig, self.recognitionTask != nil {
return
}
@@ -115,6 +139,11 @@ actor VoiceWakeRuntime {
}
private func start(with config: RuntimeConfig) async {
if self.isStarting {
return
}
self.isStarting = true
defer { self.isStarting = false }
do {
self.recognitionGeneration &+= 1
let generation = self.recognitionGeneration
@@ -148,10 +177,10 @@ actor VoiceWakeRuntime {
input.removeTap(onBus: 0)
input.installTap(onBus: 0, bufferSize: 2048, format: format) { [weak self, weak request] buffer, _ in
request?.append(buffer)
if let rms = Self.rmsLevel(buffer: buffer) {
Task.detached { [weak self] in
await self?.noteAudioLevel(rms: rms)
}
guard let rms = Self.rmsLevel(buffer: buffer) else { return }
Task.detached { [weak self] in
await self?.noteAudioLevel(rms: rms)
await self?.noteAudioTap(rms: rms)
}
}
@@ -170,6 +199,7 @@ actor VoiceWakeRuntime {
.map { WakeWordSpeechSegments.from(transcription: result.bestTranscription, transcript: $0) }
} ?? []
let isFinal = result?.isFinal ?? false
Task { await self.noteRecognitionCallback(transcript: transcript, isFinal: isFinal, error: error) }
let update = RecognitionUpdate(
transcript: transcript,
segments: segments,
@@ -205,6 +235,12 @@ actor VoiceWakeRuntime {
self.capturedTranscript = ""
self.captureStartedAt = nil
self.triggerChimePlayed = false
self.lastTranscript = nil
self.lastTranscriptAt = nil
self.preDetectTask?.cancel()
self.preDetectTask = nil
self.triggerOnlyTask?.cancel()
self.triggerOnlyTask = nil
self.haltRecognitionPipeline()
self.recognizer = nil
self.currentConfig = nil
@@ -244,7 +280,19 @@ actor VoiceWakeRuntime {
let now = Date()
if !transcript.isEmpty {
self.lastHeard = now
if !self.isCapturing {
self.lastTranscript = transcript
self.lastTranscriptAt = now
}
if self.isCapturing {
self.maybeLogRecognition(
transcript: transcript,
segments: update.segments,
triggers: config.triggers,
isFinal: update.isFinal,
match: nil,
usedFallback: false,
capturing: true)
let trimmed = Self.commandAfterTrigger(
transcript: transcript,
segments: update.segments,
@@ -278,25 +326,209 @@ actor VoiceWakeRuntime {
if self.isCapturing { return }
let gateConfig = WakeWordGateConfig(triggers: config.triggers)
if let match = WakeWordGate.match(transcript: transcript, segments: update.segments, config: gateConfig) {
var usedFallback = false
var match = WakeWordGate.match(transcript: transcript, segments: update.segments, config: gateConfig)
if match == nil, update.isFinal {
match = self.textOnlyFallbackMatch(
transcript: transcript,
triggers: config.triggers,
config: gateConfig)
usedFallback = match != nil
}
self.maybeLogRecognition(
transcript: transcript,
segments: update.segments,
triggers: config.triggers,
isFinal: update.isFinal,
match: match,
usedFallback: usedFallback,
capturing: false)
if let match {
if let cooldown = cooldownUntil, now < cooldown {
return
}
if usedFallback {
self.logger.info("voicewake runtime detected (text-only fallback) len=\(match.command.count)")
} else {
self.logger.info("voicewake runtime detected len=\(match.command.count)")
}
await self.beginCapture(command: match.command, triggerEndTime: match.triggerEndTime, config: config)
} else if update.isFinal {
let trimmed = Self.trimmedAfterTrigger(transcript, triggers: config.triggers)
if WakeWordGate.matchesTextOnly(text: transcript, triggers: config.triggers),
Self.startsWithTrigger(transcript: transcript, triggers: config.triggers),
!trimmed.isEmpty
{
if let cooldown = cooldownUntil, now < cooldown {
return
}
await self.beginCapture(command: trimmed, triggerEndTime: nil, config: config)
} else if !transcript.isEmpty, update.error == nil {
if self.isTriggerOnly(transcript: transcript, triggers: config.triggers) {
self.preDetectTask?.cancel()
self.preDetectTask = nil
self.scheduleTriggerOnlyPauseCheck(triggers: config.triggers, config: config)
} else {
self.triggerOnlyTask?.cancel()
self.triggerOnlyTask = nil
self.schedulePreDetectSilenceCheck(
triggers: config.triggers,
gateConfig: gateConfig,
config: config)
}
}
}
private func maybeLogRecognition(
transcript: String,
segments: [WakeWordSegment],
triggers: [String],
isFinal: Bool,
match: WakeWordGateMatch?,
usedFallback: Bool,
capturing: Bool
) {
guard !transcript.isEmpty else { return }
if transcript == self.lastLoggedText, !isFinal {
if let last = self.lastLoggedAt, Date().timeIntervalSince(last) < 0.25 {
return
}
}
self.lastLoggedText = transcript
self.lastLoggedAt = Date()
let textOnly = WakeWordGate.matchesTextOnly(text: transcript, triggers: triggers)
let timingCount = segments.filter { $0.start > 0 || $0.duration > 0 }.count
let matchSummary = match.map {
"match=true gap=\(String(format: "%.2f", $0.postGap))s cmdLen=\($0.command.count)"
} ?? "match=false"
let segmentSummary = segments.map { seg in
let start = String(format: "%.2f", seg.start)
let end = String(format: "%.2f", seg.end)
return "\(seg.text)@\(start)-\(end)"
}.joined(separator: ", ")
self.logger.info(
"voicewake runtime transcript='\(transcript, privacy: .public)' textOnly=\(textOnly) " +
"isFinal=\(isFinal) timing=\(timingCount)/\(segments.count) " +
"capturing=\(capturing) fallback=\(usedFallback) " +
"\(matchSummary) segments=[\(segmentSummary, privacy: .public)]")
}
private func noteAudioTap(rms: Double) {
let now = Date()
if let last = self.lastTapLogAt, now.timeIntervalSince(last) < 1.0 {
return
}
self.lastTapLogAt = now
let db = 20 * log10(max(rms, 1e-7))
self.logger.debug(
"voicewake runtime audio tap rms=\(String(format: "%.6f", rms)) " +
"db=\(String(format: "%.1f", db)) capturing=\(self.isCapturing)")
}
private func noteRecognitionCallback(transcript: String?, isFinal: Bool, error: Error?) {
guard transcript?.isEmpty ?? true else { return }
let now = Date()
if let last = self.lastCallbackLogAt, now.timeIntervalSince(last) < 1.0 {
return
}
self.lastCallbackLogAt = now
let errorSummary = error?.localizedDescription ?? "none"
self.logger.debug(
"voicewake runtime callback empty transcript isFinal=\(isFinal) error=\(errorSummary, privacy: .public)")
}
private func scheduleTriggerOnlyPauseCheck(triggers: [String], config: RuntimeConfig) {
self.triggerOnlyTask?.cancel()
let lastSeenAt = self.lastTranscriptAt
let lastText = self.lastTranscript
let windowNanos = UInt64(self.triggerPauseWindow * 1_000_000_000)
self.triggerOnlyTask = Task { [weak self, lastSeenAt, lastText] in
try? await Task.sleep(nanoseconds: windowNanos)
guard let self else { return }
await self.triggerOnlyPauseCheck(
lastSeenAt: lastSeenAt,
lastText: lastText,
triggers: triggers,
config: config)
}
}
private func schedulePreDetectSilenceCheck(
triggers: [String],
gateConfig: WakeWordGateConfig,
config: RuntimeConfig)
{
self.preDetectTask?.cancel()
let lastSeenAt = self.lastTranscriptAt
let lastText = self.lastTranscript
let windowNanos = UInt64(self.preDetectSilenceWindow * 1_000_000_000)
self.preDetectTask = Task { [weak self, lastSeenAt, lastText] in
try? await Task.sleep(nanoseconds: windowNanos)
guard let self else { return }
await self.preDetectSilenceCheck(
lastSeenAt: lastSeenAt,
lastText: lastText,
triggers: triggers,
gateConfig: gateConfig,
config: config)
}
}
private func triggerOnlyPauseCheck(
lastSeenAt: Date?,
lastText: String?,
triggers: [String],
config: RuntimeConfig
) async {
guard !Task.isCancelled else { return }
guard !self.isCapturing else { return }
guard let lastSeenAt, let lastText else { return }
guard self.lastTranscriptAt == lastSeenAt, self.lastTranscript == lastText else { return }
guard self.isTriggerOnly(transcript: lastText, triggers: triggers) else { return }
if let cooldown = self.cooldownUntil, Date() < cooldown {
return
}
self.logger.info("voicewake runtime detected (trigger-only pause)")
await self.beginCapture(command: "", triggerEndTime: nil, config: config)
}
private func textOnlyFallbackMatch(
transcript: String,
triggers: [String],
config: WakeWordGateConfig
) -> WakeWordGateMatch? {
guard WakeWordGate.matchesTextOnly(text: transcript, triggers: triggers) else { return nil }
guard Self.startsWithTrigger(transcript: transcript, triggers: triggers) else { return nil }
let trimmed = Self.trimmedAfterTrigger(transcript, triggers: triggers)
guard trimmed.count >= config.minCommandLength else { return nil }
return WakeWordGateMatch(triggerEndTime: 0, postGap: 0, command: trimmed)
}
private func isTriggerOnly(transcript: String, triggers: [String]) -> Bool {
guard WakeWordGate.matchesTextOnly(text: transcript, triggers: triggers) else { return false }
guard Self.startsWithTrigger(transcript: transcript, triggers: triggers) else { return false }
return Self.trimmedAfterTrigger(transcript, triggers: triggers).isEmpty
}
private func preDetectSilenceCheck(
lastSeenAt: Date?,
lastText: String?,
triggers: [String],
gateConfig: WakeWordGateConfig,
config: RuntimeConfig) async
{
guard !Task.isCancelled else { return }
guard !self.isCapturing else { return }
guard let lastSeenAt, let lastText else { return }
guard self.lastTranscriptAt == lastSeenAt, self.lastTranscript == lastText else { return }
guard let match = self.textOnlyFallbackMatch(
transcript: lastText,
triggers: triggers,
config: gateConfig)
else { return }
if let cooldown = self.cooldownUntil, Date() < cooldown {
return
}
self.logger.info("voicewake runtime detected (silence fallback) len=\(match.command.count)")
await self.beginCapture(
command: match.command,
triggerEndTime: match.triggerEndTime,
config: config)
}
private func beginCapture(command: String, triggerEndTime: TimeInterval?, config: RuntimeConfig) async {
self.listeningState = .voiceWake
self.isCapturing = true
@@ -309,6 +541,10 @@ actor VoiceWakeRuntime {
self.heardBeyondTrigger = !command.isEmpty
self.triggerChimePlayed = false
self.activeTriggerEndTime = triggerEndTime
self.preDetectTask?.cancel()
self.preDetectTask = nil
self.triggerOnlyTask?.cancel()
self.triggerOnlyTask = nil
if config.triggerChime != .none, !self.triggerChimePlayed {
self.triggerChimePlayed = true
@@ -381,6 +617,12 @@ actor VoiceWakeRuntime {
self.heardBeyondTrigger = false
self.triggerChimePlayed = false
self.activeTriggerEndTime = nil
self.lastTranscript = nil
self.lastTranscriptAt = nil
self.preDetectTask?.cancel()
self.preDetectTask = nil
self.triggerOnlyTask?.cancel()
self.triggerOnlyTask = nil
await MainActor.run { AppStateStore.shared.stopVoiceEars() }
if let token = self.overlayToken {
@@ -464,10 +706,15 @@ actor VoiceWakeRuntime {
let nanos = UInt64(max(0, delay) * 1_000_000_000)
try? await Task.sleep(nanoseconds: nanos)
guard let self else { return }
await self.consumeScheduledRestart()
await self.restartRecognizerIfIdleAndOverlayHidden()
}
}
private func consumeScheduledRestart() {
self.scheduledRestartTask = nil
}
func applyPushToTalkCooldown() {
self.cooldownUntil = Date().addingTimeInterval(self.debounceAfterSend)
}