VoiceWake: capture utterance and add prefix

Author: Peter Steinberger
Date: 2025-12-08 01:35:42 +01:00
parent 6415ae79be
commit a6e0ec38e7
7 changed files with 164 additions and 17 deletions

View File

@@ -178,15 +178,24 @@ final class AppState: ObservableObject {
Task { await VoiceWakeRuntime.shared.refresh(state: self) }
}
func triggerVoiceEars(ttl: TimeInterval = 5) {
func triggerVoiceEars(ttl: TimeInterval? = 5) {
    // Show the "ears" (listening) indicator, optionally auto-hiding it after
    // `ttl` seconds. Passing `ttl: nil` pins the indicator open until
    // `stopVoiceEars()` is called explicitly.
    self.earBoostTask?.cancel()
    self.earBoostActive = true
    guard let ttl else { return }
    self.earBoostTask = Task { [weak self] in
        do {
            try await Task.sleep(nanoseconds: UInt64(ttl * 1_000_000_000))
        } catch {
            // Cancelled: a newer boost (or stopVoiceEars) now owns the flag.
            // The previous `try?` swallowed cancellation, so a superseded task
            // would fall through and clear a freshly started boost.
            return
        }
        await MainActor.run { [weak self] in self?.earBoostActive = false }
    }
}
func stopVoiceEars() {
    // Immediately hide the listening indicator and discard any pending
    // auto-hide task so it cannot fire later.
    if let task = self.earBoostTask {
        task.cancel()
        self.earBoostTask = nil
    }
    self.earBoostActive = false
}
func setVoiceWakeEnabled(_ enabled: Bool) async {
guard voiceWakeSupported else {
self.swabbleEnabled = false

View File

@@ -28,6 +28,19 @@ enum VoiceWakeForwarder {
private static let cliSearchCandidates = ["clawdis-mac"] + cliHelperSearchPaths.map { "\($0)/clawdis-mac" }
private static let cliCache = CLICache()
/// Wraps a raw voice transcript with a context line telling the receiving
/// agent that the text came from voice recognition on a specific machine.
/// - Parameters:
///   - transcript: The recognized utterance, appended verbatim after a blank line.
///   - machineName: Optional override for the machine name. Falls back to the
///     Mac's localized name, then its network host name, then "this Mac".
/// - Returns: The prefixed message ready to forward.
static func prefixedTranscript(_ transcript: String, machineName: String? = nil) -> String {
    // Trim every candidate, not just `machineName`: previously a
    // whitespace-only localizedName passed the isEmpty check and produced a
    // blank-looking machine name in the message.
    let candidates = [machineName, Host.current().localizedName, ProcessInfo.processInfo.hostName]
    let resolved = candidates
        .compactMap { $0?.trimmingCharacters(in: .whitespacesAndNewlines) }
        .first { !$0.isEmpty }
    let safeMachine = resolved ?? "this Mac"
    return "User talked via voice recognition on \(safeMachine) - repeat prompt first + remember some words might be incorrectly transcribed.\n\n\(transcript)"
}
static func clearCliCache() {
    // Drop the cached CLI resolution so the next forward re-resolves the
    // binary from `cliSearchCandidates`.
    // NOTE(review): assumes `CLICache.set(nil)` clears the stored value — confirm.
    self.cliCache.set(nil)
}

View File

@@ -14,9 +14,18 @@ actor VoiceWakeRuntime {
private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest?
private var recognitionTask: SFSpeechRecognitionTask?
private var lastHeard: Date?
private var captureStartedAt: Date?
private var captureTask: Task<Void, Never>?
private var capturedTranscript: String = ""
private var isCapturing: Bool = false
private var cooldownUntil: Date?
private var currentConfig: RuntimeConfig?
// Tunables
private let silenceWindow: TimeInterval = 1.0
private let captureHardStop: TimeInterval = 8.0
private let debounceAfterSend: TimeInterval = 0.35
struct RuntimeConfig: Equatable {
let triggers: [String]
let micID: String?
@@ -95,6 +104,11 @@ actor VoiceWakeRuntime {
}
private func stop() {
self.captureTask?.cancel()
self.captureTask = nil
self.isCapturing = false
self.capturedTranscript = ""
self.captureStartedAt = nil
self.recognitionTask?.cancel()
self.recognitionTask = nil
self.recognitionRequest?.endAudio()
@@ -120,21 +134,22 @@ actor VoiceWakeRuntime {
}
guard let transcript else { return }
if !transcript.isEmpty { self.lastHeard = Date() }
let now = Date()
if !transcript.isEmpty {
self.lastHeard = now
if self.isCapturing {
self.capturedTranscript = transcript
}
}
if self.isCapturing { return }
if Self.matches(text: transcript, triggers: config.triggers) {
let now = Date()
if let cooldown = cooldownUntil, now < cooldown {
return
}
self.cooldownUntil = now.addingTimeInterval(2.5)
await MainActor.run { AppStateStore.shared.triggerVoiceEars() }
let forwardConfig = await MainActor.run { AppStateStore.shared.voiceWakeForwardConfig }
if forwardConfig.enabled {
Task.detached {
await VoiceWakeForwarder.forward(transcript: transcript, config: forwardConfig)
}
}
await self.beginCapture(transcript: transcript, config: config)
}
}
@@ -149,6 +164,77 @@ actor VoiceWakeRuntime {
return false
}
private func beginCapture(transcript: String, config: RuntimeConfig) async {
    // Switch the runtime into capture mode: seed the captured transcript with
    // the trigger utterance, stamp the start time, clear the trigger
    // cooldown, pin the listening indicator open, and start the
    // silence/hard-stop monitor.
    self.isCapturing = true
    self.capturedTranscript = transcript
    self.captureStartedAt = Date()
    self.cooldownUntil = nil
    // ttl: nil keeps the ears indicator up until the capture is finalized.
    await MainActor.run { AppStateStore.shared.triggerVoiceEars(ttl: nil) }
    self.captureTask?.cancel()
    self.captureTask = Task { [weak self] in
        await self?.monitorCapture(config: config)
    }
}
private func monitorCapture(config: RuntimeConfig) async {
    // Polls while a capture is active and finalizes it when either the
    // silence window elapses after the last recognized speech or the overall
    // hard stop is reached.
    let start = self.captureStartedAt ?? Date()
    let hardStop = start.addingTimeInterval(self.captureHardStop)
    while self.isCapturing {
        // Bail out once this task is cancelled (stop()/finalizeCapture()).
        // Without this check, a cancelled `Task.sleep` throws immediately and
        // the loop spins at full speed until `isCapturing` flips.
        if Task.isCancelled { return }
        let now = Date()
        if now >= hardStop {
            await self.finalizeCapture(config: config)
            return
        }
        if let last = self.lastHeard, now.timeIntervalSince(last) >= self.silenceWindow {
            await self.finalizeCapture(config: config)
            return
        }
        // Poll every 200 ms; coarse enough to stay cheap, fine enough for a
        // 1 s silence window.
        try? await Task.sleep(nanoseconds: 200_000_000)
    }
}
private func finalizeCapture(config: RuntimeConfig) async {
    // End the current capture exactly once: snapshot and clear the captured
    // state, dismiss the listening indicator, forward the utterance if any,
    // arm a short debounce, and restart the recognizer.
    guard self.isCapturing else { return }
    self.isCapturing = false
    self.captureTask?.cancel()
    self.captureTask = nil
    // Snapshot before clearing so the send below uses the full utterance.
    let finalTranscript = self.capturedTranscript.trimmingCharacters(in: .whitespacesAndNewlines)
    self.capturedTranscript = ""
    self.captureStartedAt = nil
    self.lastHeard = nil
    await MainActor.run { AppStateStore.shared.stopVoiceEars() }
    if !finalTranscript.isEmpty {
        await self.send(transcript: finalTranscript, config: config)
    }
    // Brief cooldown so the tail of this utterance cannot re-trigger.
    self.cooldownUntil = Date().addingTimeInterval(self.debounceAfterSend)
    // Restart the recognizer so we listen for the next trigger with a clean buffer.
    let current = self.currentConfig
    self.stop()
    if let current { await self.start(with: current) }
}
private func send(transcript: String, config: RuntimeConfig) async {
    // Forward a finished utterance, prefixed with the voice-recognition
    // context line, using the current forward configuration from app state.
    let forwardConfig = await MainActor.run { AppStateStore.shared.voiceWakeForwardConfig }
    if !forwardConfig.enabled { return }
    let payload = VoiceWakeForwarder.prefixedTranscript(transcript)
    // Detached fire-and-forget task; the forward result is not awaited here.
    Task.detached {
        await VoiceWakeForwarder.forward(transcript: payload, config: forwardConfig)
    }
}
#if DEBUG
static func _testMatches(text: String, triggers: [String]) -> Bool {
self.matches(text: text, triggers: triggers)

View File

@@ -23,6 +23,7 @@ final class VoiceWakeTester {
private var holdingAfterDetect = false
private var detectedText: String?
private let logger = Logger(subsystem: "com.steipete.clawdis", category: "voicewake")
private let silenceWindow: TimeInterval = 1.0
init(locale: Locale = .current) {
self.recognizer = SFSpeechRecognizer(locale: locale)
@@ -132,10 +133,11 @@ final class VoiceWakeTester {
self.holdingAfterDetect = true
self.detectedText = text
self.logger.info("voice wake detected; forwarding (len=\(text.count))")
await MainActor.run { AppStateStore.shared.triggerVoiceEars() }
await MainActor.run { AppStateStore.shared.triggerVoiceEars(ttl: nil) }
let config = await MainActor.run { AppStateStore.shared.voiceWakeForwardConfig }
Task.detached {
await VoiceWakeForwarder.forward(transcript: text, config: config)
let payload = VoiceWakeForwarder.prefixedTranscript(text)
await VoiceWakeForwarder.forward(transcript: payload, config: config)
}
Task { @MainActor in onUpdate(.detected(text)) }
self.holdUntilSilence(onUpdate: onUpdate)
@@ -162,8 +164,7 @@ final class VoiceWakeTester {
Task { [weak self] in
guard let self else { return }
let detectedAt = Date()
let hardStop = detectedAt.addingTimeInterval(3) // cap overall listen after trigger
let silenceWindow: TimeInterval = 0.8
let hardStop = detectedAt.addingTimeInterval(6) // cap overall listen after trigger
while !self.isStopping {
let now = Date()
@@ -175,6 +176,7 @@ final class VoiceWakeTester {
}
if !self.isStopping {
self.stop()
await MainActor.run { AppStateStore.shared.stopVoiceEars() }
if let detectedText {
self.logger.info("voice wake hold finished; len=\(detectedText.count)")
Task { @MainActor in onUpdate(.detected(detectedText)) }