clawdbot/apps/macos/Sources/Clawdis/VoiceWakeRuntime.swift

import AVFoundation
import Foundation
import OSLog
import Speech

/// Background listener that keeps the voice-wake pipeline alive outside the settings test view.
actor VoiceWakeRuntime {
    static let shared = VoiceWakeRuntime()

    private let logger = Logger(subsystem: "com.steipete.clawdis", category: "voicewake.runtime")
    private var recognizer: SFSpeechRecognizer?
    private var audioEngine = AVAudioEngine()
    private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest?
    private var recognitionTask: SFSpeechRecognitionTask?
    private var lastHeard: Date?
    private var captureStartedAt: Date?
    private var captureTask: Task<Void, Never>?
    private var capturedTranscript: String = ""
    private var isCapturing: Bool = false
    private var cooldownUntil: Date?
    private var currentConfig: RuntimeConfig?

    // Tunables
    /// Seconds of silence after the last partial result before a capture ends.
    private let silenceWindow: TimeInterval = 1.0
    /// Upper bound on a single capture, regardless of ongoing speech.
    private let captureHardStop: TimeInterval = 8.0
    /// Cooldown after a send so the trigger word cannot immediately re-arm the listener.
    private let debounceAfterSend: TimeInterval = 0.35

    struct RuntimeConfig: Equatable {
        let triggers: [String]
        let micID: String?
        let localeID: String?
    }
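
    /// Re-syncs the runtime with app state: stops when voice wake is disabled or
    /// permissions are missing, and restarts whenever the trigger/mic/locale config changes.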
    func refresh(state: AppState) async {
        let snapshot = await MainActor.run { () -> (enabled: Bool, config: RuntimeConfig) in
            let enabled = state.swabbleEnabled
            let config = RuntimeConfig(
                triggers: state.swabbleTriggerWords,
                micID: state.voiceWakeMicID.isEmpty ? nil : state.voiceWakeMicID,
                localeID: state.voiceWakeLocaleID.isEmpty ? nil : state.voiceWakeLocaleID)
            return (enabled, config)
        }
        guard voiceWakeSupported, snapshot.enabled else {
            self.stop()
            return
        }
        guard PermissionManager.voiceWakePermissionsGranted() else {
            self.logger.debug("voicewake runtime not starting: permissions missing")
            self.stop()
            return
        }
        let config = snapshot.config
        if config == self.currentConfig, self.recognitionTask != nil {
            return
        }
        self.stop()
        await self.start(with: config)
    }
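
    /// Builds the recognizer for the configured locale, taps the audio engine's input,
    /// and starts a streaming recognition task whose partials feed `handleRecognition`.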
    private func start(with config: RuntimeConfig) async {
        do {
            self.configureSession(localeID: config.localeID)
            guard let recognizer, recognizer.isAvailable else {
                self.logger.error("voicewake runtime: speech recognizer unavailable")
                return
            }
            self.recognitionRequest = SFSpeechAudioBufferRecognitionRequest()
            self.recognitionRequest?.shouldReportPartialResults = true
            guard let request = self.recognitionRequest else { return }
            let input = self.audioEngine.inputNode
            let format = input.outputFormat(forBus: 0)
            input.removeTap(onBus: 0)
            // Capture the request weakly so a stale tap cannot keep a cancelled request alive.
            input.installTap(onBus: 0, bufferSize: 2048, format: format) { [weak request] buffer, _ in
                request?.append(buffer)
            }
            self.audioEngine.prepare()
            try self.audioEngine.start()
            self.currentConfig = config
            self.lastHeard = Date()
            self.cooldownUntil = nil
            self.recognitionTask = recognizer.recognitionTask(with: request) { [weak self] result, error in
                guard let self else { return }
                let transcript = result?.bestTranscription.formattedString
                // Hop back onto the actor; the recognizer delivers results on its own queue.
                Task { await self.handleRecognition(transcript: transcript, error: error, config: config) }
            }
            self.logger.info("voicewake runtime started")
        } catch {
            self.logger.error("voicewake runtime failed to start: \(error.localizedDescription, privacy: .public)")
            self.stop()
        }
    }
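
    /// Cancels the capture monitor and recognition task, removes the input tap,
    /// and stops the audio engine, returning the runtime to idle.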
    private func stop() {
        self.captureTask?.cancel()
        self.captureTask = nil
        self.isCapturing = false
        self.capturedTranscript = ""
        self.captureStartedAt = nil
        self.recognitionTask?.cancel()
        self.recognitionTask = nil
        self.recognitionRequest?.endAudio()
        self.recognitionRequest = nil
        self.audioEngine.inputNode.removeTap(onBus: 0)
        self.audioEngine.stop()
        self.currentConfig = nil
        self.logger.debug("voicewake runtime stopped")
    }
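
    /// Creates a speech recognizer for the requested locale, falling back to the system locale.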
    private func configureSession(localeID: String?) {
        let locale = localeID.flatMap { Locale(identifier: $0) } ?? Locale.current
        self.recognizer = SFSpeechRecognizer(locale: locale)
    }
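
    /// Handles each partial result: refreshes the silence timer, records the transcript
    /// while capturing, and begins a capture when a trigger matches outside the cooldown.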
    private func handleRecognition(
        transcript: String?,
        error: Error?,
        config: RuntimeConfig) async
    {
        if let error {
            self.logger.debug("voicewake recognition error: \(error.localizedDescription, privacy: .public)")
        }
        guard let transcript else { return }
        let now = Date()
        if !transcript.isEmpty {
            self.lastHeard = now
            if self.isCapturing {
                self.capturedTranscript = transcript
            }
        }
        if self.isCapturing { return }
        if Self.matches(text: transcript, triggers: config.triggers) {
            if let cooldown = cooldownUntil, now < cooldown {
                return
            }
            await self.beginCapture(transcript: transcript, config: config)
        }
    }
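
    /// Returns true when any non-empty trigger appears in the transcript,
    /// compared case-insensitively after trimming whitespace.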
    private static func matches(text: String, triggers: [String]) -> Bool {
        guard !text.isEmpty else { return false }
        let normalized = text.lowercased()
        for trigger in triggers {
            let t = trigger.lowercased().trimmingCharacters(in: .whitespacesAndNewlines)
            if t.isEmpty { continue }
            if normalized.contains(t) { return true }
        }
        return false
    }
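
    /// Enters capture mode, signals the UI via `triggerVoiceEars`, and spawns
    /// the monitor task that ends the capture on silence or timeout.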
    private func beginCapture(transcript: String, config: RuntimeConfig) async {
        self.isCapturing = true
        self.capturedTranscript = transcript
        self.captureStartedAt = Date()
        self.cooldownUntil = nil
        await MainActor.run { AppStateStore.shared.triggerVoiceEars(ttl: nil) }
        self.captureTask?.cancel()
        self.captureTask = Task { [weak self] in
            guard let self else { return }
            await self.monitorCapture(config: config)
        }
    }
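
    /// Polls while capturing and finalizes once the silence window elapses
    /// or the hard-stop deadline passes.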
    private func monitorCapture(config: RuntimeConfig) async {
        let start = self.captureStartedAt ?? Date()
        let hardStop = start.addingTimeInterval(self.captureHardStop)
        while self.isCapturing {
            let now = Date()
            if now >= hardStop {
                await self.finalizeCapture(config: config)
                return
            }
            if let last = self.lastHeard, now.timeIntervalSince(last) >= self.silenceWindow {
                await self.finalizeCapture(config: config)
                return
            }
            // Poll at 200 ms granularity between silence/deadline checks.
            try? await Task.sleep(nanoseconds: 200_000_000)
        }
    }
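
    /// Ends the capture: forwards a non-empty transcript, arms the post-send
    /// cooldown, and restarts the recognizer with a clean buffer.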
    private func finalizeCapture(config: RuntimeConfig) async {
        guard self.isCapturing else { return }
        self.isCapturing = false
        self.captureTask?.cancel()
        self.captureTask = nil
        let finalTranscript = self.capturedTranscript.trimmingCharacters(in: .whitespacesAndNewlines)
        self.capturedTranscript = ""
        self.captureStartedAt = nil
        self.lastHeard = nil
        await MainActor.run { AppStateStore.shared.stopVoiceEars() }
        if !finalTranscript.isEmpty {
            await self.send(transcript: finalTranscript, config: config)
        }
        self.cooldownUntil = Date().addingTimeInterval(self.debounceAfterSend)
        // Restart the recognizer so we listen for the next trigger with a clean buffer.
        let current = self.currentConfig
        self.stop()
        if let current { await self.start(with: current) }
    }
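
    /// Forwards the finished transcript through `VoiceWakeForwarder` when
    /// forwarding is enabled in app state.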
    private func send(transcript: String, config: RuntimeConfig) async {
        let forwardConfig = await MainActor.run { AppStateStore.shared.voiceWakeForwardConfig }
        guard forwardConfig.enabled else { return }
        let payload = VoiceWakeForwarder.prefixedTranscript(transcript)
        // Fire-and-forget so finalizeCapture can restart the recognizer without waiting on delivery.
        Task.detached {
            await VoiceWakeForwarder.forward(transcript: payload, config: forwardConfig)
        }
    }

    #if DEBUG
    static func _testMatches(text: String, triggers: [String]) -> Bool {
        self.matches(text: text, triggers: triggers)
    }
    #endif
}
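
// Usage sketch (assumption, not from this file: an app-state observer re-syncs the
// runtime whenever voice-wake settings change):
//
//     Task { await VoiceWakeRuntime.shared.refresh(state: appState) }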