From a6e0ec38e7ed03226c7b089e2555a90afca128a9 Mon Sep 17 00:00:00 2001
From: Peter Steinberger
Date: Mon, 8 Dec 2025 01:35:42 +0100
Subject: [PATCH] VoiceWake: capture utterance and add prefix

---
 apps/macos/Sources/Clawdis/AppState.swift     |  11 +-
 .../Sources/Clawdis/VoiceWakeForwarder.swift  |  13 +++
 .../Sources/Clawdis/VoiceWakeRuntime.swift    | 106 ++++++++++++++++--
 .../Sources/Clawdis/VoiceWakeTester.swift     |  10 +-
 .../VoiceWakeForwarderTests.swift             |   9 ++
 docs/mac/icon.md                              |   4 +-
 docs/mac/voicewake.md                         |  28 +++++
 7 files changed, 164 insertions(+), 17 deletions(-)
 create mode 100644 docs/mac/voicewake.md

diff --git a/apps/macos/Sources/Clawdis/AppState.swift b/apps/macos/Sources/Clawdis/AppState.swift
index 91ce13a16..39346fd9d 100644
--- a/apps/macos/Sources/Clawdis/AppState.swift
+++ b/apps/macos/Sources/Clawdis/AppState.swift
@@ -178,15 +178,24 @@ final class AppState: ObservableObject {
         Task { await VoiceWakeRuntime.shared.refresh(state: self) }
     }

-    func triggerVoiceEars(ttl: TimeInterval = 5) {
+    func triggerVoiceEars(ttl: TimeInterval? = 5) {
         self.earBoostTask?.cancel()
         self.earBoostActive = true
+
+        guard let ttl else { return }
+
         self.earBoostTask = Task { [weak self] in
             try? await Task.sleep(nanoseconds: UInt64(ttl * 1_000_000_000))
             await MainActor.run { [weak self] in self?.earBoostActive = false }
         }
     }

+    func stopVoiceEars() {
+        self.earBoostTask?.cancel()
+        self.earBoostTask = nil
+        self.earBoostActive = false
+    }
+
     func setVoiceWakeEnabled(_ enabled: Bool) async {
         guard voiceWakeSupported else {
             self.swabbleEnabled = false
diff --git a/apps/macos/Sources/Clawdis/VoiceWakeForwarder.swift b/apps/macos/Sources/Clawdis/VoiceWakeForwarder.swift
index 09376ec28..dbff1200b 100644
--- a/apps/macos/Sources/Clawdis/VoiceWakeForwarder.swift
+++ b/apps/macos/Sources/Clawdis/VoiceWakeForwarder.swift
@@ -28,6 +28,19 @@ enum VoiceWakeForwarder {
     private static let cliSearchCandidates = ["clawdis-mac"] + cliHelperSearchPaths.map { "\($0)/clawdis-mac" }
     private static let cliCache = CLICache()

+    static func prefixedTranscript(_ transcript: String, machineName: String? = nil) -> String {
+        let resolvedMachine = machineName
+            .flatMap { name -> String? in
+                let trimmed = name.trimmingCharacters(in: .whitespacesAndNewlines)
+                return trimmed.isEmpty ? nil : trimmed
+            }
+            ?? Host.current().localizedName
+            ?? ProcessInfo.processInfo.hostName
+
+        let safeMachine = resolvedMachine.isEmpty ? "this Mac" : resolvedMachine
+        return "User talked via voice recognition on \(safeMachine) - repeat prompt first + remember some words might be incorrectly transcribed.\n\n\(transcript)"
+    }
+
     static func clearCliCache() {
         self.cliCache.set(nil)
     }
diff --git a/apps/macos/Sources/Clawdis/VoiceWakeRuntime.swift b/apps/macos/Sources/Clawdis/VoiceWakeRuntime.swift
index c72398639..354735324 100644
--- a/apps/macos/Sources/Clawdis/VoiceWakeRuntime.swift
+++ b/apps/macos/Sources/Clawdis/VoiceWakeRuntime.swift
@@ -14,9 +14,18 @@ actor VoiceWakeRuntime {
     private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest?
     private var recognitionTask: SFSpeechRecognitionTask?
     private var lastHeard: Date?
+    private var captureStartedAt: Date?
+    private var captureTask: Task<Void, Never>?
+    private var capturedTranscript: String = ""
+    private var isCapturing: Bool = false
     private var cooldownUntil: Date?
     private var currentConfig: RuntimeConfig?

+    // Tunables
+    private let silenceWindow: TimeInterval = 1.0
+    private let captureHardStop: TimeInterval = 8.0
+    private let debounceAfterSend: TimeInterval = 0.35
+
     struct RuntimeConfig: Equatable {
         let triggers: [String]
         let micID: String?
@@ -95,6 +104,11 @@
     }

     private func stop() {
+        self.captureTask?.cancel()
+        self.captureTask = nil
+        self.isCapturing = false
+        self.capturedTranscript = ""
+        self.captureStartedAt = nil
         self.recognitionTask?.cancel()
         self.recognitionTask = nil
         self.recognitionRequest?.endAudio()
@@ -120,21 +134,22 @@
         }

         guard let transcript else { return }
-        if !transcript.isEmpty { self.lastHeard = Date() }
+
+        let now = Date()
+        if !transcript.isEmpty {
+            self.lastHeard = now
+            if self.isCapturing {
+                self.capturedTranscript = transcript
+            }
+        }
+
+        if self.isCapturing { return }

         if Self.matches(text: transcript, triggers: config.triggers) {
-            let now = Date()
             if let cooldown = cooldownUntil, now < cooldown { return }
-            self.cooldownUntil = now.addingTimeInterval(2.5)
-            await MainActor.run { AppStateStore.shared.triggerVoiceEars() }
-            let forwardConfig = await MainActor.run { AppStateStore.shared.voiceWakeForwardConfig }
-            if forwardConfig.enabled {
-                Task.detached {
-                    await VoiceWakeForwarder.forward(transcript: transcript, config: forwardConfig)
-                }
-            }
+            await self.beginCapture(transcript: transcript, config: config)
         }
     }
@@ -149,6 +164,77 @@
         return false
     }

+    private func beginCapture(transcript: String, config: RuntimeConfig) async {
+        self.isCapturing = true
+        self.capturedTranscript = transcript
+        self.captureStartedAt = Date()
+        self.cooldownUntil = nil
+
+        await MainActor.run { AppStateStore.shared.triggerVoiceEars(ttl: nil) }
+
+        self.captureTask?.cancel()
+        self.captureTask = Task { [weak self] in
+            guard let self else { return }
+            await self.monitorCapture(config: config)
+        }
+    }
+
+    private func monitorCapture(config: RuntimeConfig) async {
+        let start = self.captureStartedAt ?? Date()
+        let hardStop = start.addingTimeInterval(self.captureHardStop)
+
+        while self.isCapturing {
+            let now = Date()
+            if now >= hardStop {
+                await self.finalizeCapture(config: config)
+                return
+            }
+
+            if let last = self.lastHeard, now.timeIntervalSince(last) >= self.silenceWindow {
+                await self.finalizeCapture(config: config)
+                return
+            }
+
+            try? await Task.sleep(nanoseconds: 200_000_000)
+        }
+    }
+
+    private func finalizeCapture(config: RuntimeConfig) async {
+        guard self.isCapturing else { return }
+        self.isCapturing = false
+        self.captureTask?.cancel()
+        self.captureTask = nil
+
+        let finalTranscript = self.capturedTranscript.trimmingCharacters(in: .whitespacesAndNewlines)
+        self.capturedTranscript = ""
+        self.captureStartedAt = nil
+        self.lastHeard = nil
+
+        await MainActor.run { AppStateStore.shared.stopVoiceEars() }
+
+        if !finalTranscript.isEmpty {
+            await self.send(transcript: finalTranscript, config: config)
+        }
+
+        self.cooldownUntil = Date().addingTimeInterval(self.debounceAfterSend)
+
+        // Restart the recognizer so we listen for the next trigger with a clean buffer.
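+        // stop() below also clears the capture state, so no stale partial transcript can leak into the next run.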
+        let current = self.currentConfig
+        self.stop()
+        if let current { await self.start(with: current) }
+    }
+
+    private func send(transcript: String, config: RuntimeConfig) async {
+        let forwardConfig = await MainActor.run { AppStateStore.shared.voiceWakeForwardConfig }
+        guard forwardConfig.enabled else { return }
+
+        let payload = VoiceWakeForwarder.prefixedTranscript(transcript)
+
+        Task.detached {
+            await VoiceWakeForwarder.forward(transcript: payload, config: forwardConfig)
+        }
+    }
+
     #if DEBUG
     static func _testMatches(text: String, triggers: [String]) -> Bool {
         self.matches(text: text, triggers: triggers)
diff --git a/apps/macos/Sources/Clawdis/VoiceWakeTester.swift b/apps/macos/Sources/Clawdis/VoiceWakeTester.swift
index 72302821d..c1d521d26 100644
--- a/apps/macos/Sources/Clawdis/VoiceWakeTester.swift
+++ b/apps/macos/Sources/Clawdis/VoiceWakeTester.swift
@@ -23,6 +23,7 @@ final class VoiceWakeTester {
     private var holdingAfterDetect = false
     private var detectedText: String?
     private let logger = Logger(subsystem: "com.steipete.clawdis", category: "voicewake")
+    private let silenceWindow: TimeInterval = 1.0

     init(locale: Locale = .current) {
         self.recognizer = SFSpeechRecognizer(locale: locale)
@@ -132,10 +133,11 @@
         self.holdingAfterDetect = true
         self.detectedText = text
         self.logger.info("voice wake detected; forwarding (len=\(text.count))")
-        await MainActor.run { AppStateStore.shared.triggerVoiceEars() }
+        await MainActor.run { AppStateStore.shared.triggerVoiceEars(ttl: nil) }
         let config = await MainActor.run { AppStateStore.shared.voiceWakeForwardConfig }
         Task.detached {
-            await VoiceWakeForwarder.forward(transcript: text, config: config)
+            let payload = VoiceWakeForwarder.prefixedTranscript(text)
+            await VoiceWakeForwarder.forward(transcript: payload, config: config)
         }
         Task { @MainActor in onUpdate(.detected(text)) }
         self.holdUntilSilence(onUpdate: onUpdate)
@@ -162,8 +164,7 @@
         Task { [weak self] in
             guard let self else { return }
             let detectedAt = Date()
-            let hardStop = detectedAt.addingTimeInterval(3) // cap overall listen after trigger
-            let silenceWindow: TimeInterval = 0.8
+            let hardStop = detectedAt.addingTimeInterval(6) // cap overall listen after trigger

             while !self.isStopping {
                 let now = Date()
@@ -175,6 +176,7 @@
             }
             if !self.isStopping {
                 self.stop()
+                await MainActor.run { AppStateStore.shared.stopVoiceEars() }
                 if let detectedText {
                     self.logger.info("voice wake hold finished; len=\(detectedText.count)")
                     Task { @MainActor in onUpdate(.detected(detectedText)) }
diff --git a/apps/macos/Tests/ClawdisIPCTests/VoiceWakeForwarderTests.swift b/apps/macos/Tests/ClawdisIPCTests/VoiceWakeForwarderTests.swift
index 5acf448ab..86ad1d4db 100644
--- a/apps/macos/Tests/ClawdisIPCTests/VoiceWakeForwarderTests.swift
+++ b/apps/macos/Tests/ClawdisIPCTests/VoiceWakeForwarderTests.swift
@@ -73,4 +73,13 @@ import Testing
         let escapedQuote = VoiceWakeForwarder.shellEscape(textWithQuote)
         #expect(escapedQuote == "'Debug test works (and a funny pun)'\\'''")
     }
+
+    @Test func prefixedTranscriptUsesMachineName() {
+        let transcript = "hello world"
+        let prefixed = VoiceWakeForwarder.prefixedTranscript(transcript, machineName: "My-Mac")
+
+        #expect(prefixed.starts(with: "User talked via voice recognition on"))
+        #expect(prefixed.contains("My-Mac"))
+        #expect(prefixed.hasSuffix("\n\nhello world"))
+    }
 }
diff --git a/docs/mac/icon.md b/docs/mac/icon.md
index 2aad77d56..1d238a808 100644
--- a/docs/mac/icon.md
+++ b/docs/mac/icon.md
@@ -4,11 +4,11 @@ Author: steipete · Updated: 2025-12-06 · Scope: macOS app (`apps/macos`)

 - **Idle:** Normal icon animation (blink, occasional wiggle).
 - **Paused:** Status item uses `appearsDisabled`; no motion.
-- **Voice trigger (big ears):** Voice wake detector calls `AppState.triggerVoiceEars()` → `earBoostActive=true` for ~5s. Ears scale up (1.9x), get circular ear holes for readability, then auto-reset. Only fired from the in-app voice pipeline.
+- **Voice trigger (big ears):** Voice wake detector calls `AppState.triggerVoiceEars(ttl: nil)` when the wake word is heard, keeping `earBoostActive=true` while the utterance is captured. Ears scale up (1.9x), get circular ear holes for readability, then drop via `stopVoiceEars()` after 1s of silence. Only fired from the in-app voice pipeline.
 - **Working (agent running):** `AppState.isWorking=true` drives a “tail/leg scurry” micro-motion: faster leg wiggle and slight offset while work is in-flight. Currently toggled around WebChat agent runs; add the same toggle around other long tasks when you wire them.

 Wiring points
-- Voice wake: see `VoiceWakeTester.handleResult` in `AppMain.swift`—on detection it calls `triggerVoiceEars()`.
+- Voice wake: the runtime and tester call `AppState.triggerVoiceEars(ttl: nil)` on trigger and `stopVoiceEars()` after 1s of silence to match the capture window.
 - Agent activity: set `AppStateStore.shared.setWorking(true/false)` around work spans (already done in WebChat agent call). Keep spans short and reset in `defer` blocks to avoid stuck animations.

 Shapes & sizes
diff --git a/docs/mac/voicewake.md b/docs/mac/voicewake.md
new file mode 100644
index 000000000..095642cca
--- /dev/null
+++ b/docs/mac/voicewake.md
@@ -0,0 +1,28 @@
+# Voice Wake Pipeline
+
+Updated: 2025-12-08 · Owners: mac app
+
+## Runtime behavior
+- Always-on listener (Speech framework) waits for any trigger word.
+- On first trigger hit: start capture, raise the ears immediately via `AppState.triggerVoiceEars(ttl: nil)`, and reset the capture buffer.
+- While capturing: keep the buffer in sync with partial transcripts; update `lastHeard` whenever audio arrives.
+- End capture after 1.0s of silence (or at the 8s hard stop), then call `stopVoiceEars()`, prepend the voice-prefix string, send once to Claude, and restart the recognizer for a clean next trigger. A short 350ms debounce prevents double-fires.
+
+## Visual states
+- **Listening for trigger:** idle icon.
+- **Wake word detected / capturing:** ears enlarged with holes; they stay up until the silence window ends, not on a fixed timer.
+- **After send:** ears drop immediately when the silence window elapses; the icon returns to idle.
+
+## Forwarding payload
+- Uses `VoiceWakeForwarder.prefixedTranscript(_:)` to prepend the model hint (see the example at the end of this doc):
+  `User talked via voice recognition on <machine> - repeat prompt first + remember some words might be incorrectly transcribed.`
+- Machine name resolves to `Host.localizedName`, falling back to `hostName`; callers can override it for tests.
+
+## Testing hooks
+- The Settings tester mirrors the runtime: same capture/silence flow, same prefix, same ear behavior.
+- Unit test: `VoiceWakeForwarderTests.prefixedTranscriptUsesMachineName` covers the prefix format.
+
+## Tuning knobs (Swift constants)
+- Silence window: 1.0s (`silenceWindow` in `VoiceWakeRuntime`).
+- Hard stop after trigger: 8s (`captureHardStop`).
+- Post-send debounce: 0.35s (`debounceAfterSend`).
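+
+## Example payload
+A sketch of the forwarded text, assuming the machine resolves to "My-Mac" and a made-up utterance; the format comes from `VoiceWakeForwarder.prefixedTranscript(_:)`:
+
+```
+User talked via voice recognition on My-Mac - repeat prompt first + remember some words might be incorrectly transcribed.
+
+what's the weather tomorrow
+```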