Files
clawdbot/apps/ios/Sources/Voice/VoiceWakeManager.swift
2025-12-13 00:27:15 +00:00

207 lines
6.4 KiB
Swift

import AVFAudio
import Foundation
import Speech
/// Namespace for building the audio-tap block that is installed on the engine's input node.
///
/// NOTE(review): presumably this lives outside `VoiceWakeManager` so the closure is not
/// formed inside a `@MainActor`-isolated context — confirm before inlining it.
enum SpeechAudioTapFactory {
    /// Builds a tap block that forwards every captured buffer to `requestBox`.
    ///
    /// - Parameter requestBox: Wrapper around the recognition request that receives the audio.
    /// - Returns: A `@Sendable` closure that appends the buffer and ignores the timestamp.
    static func makeAppendTap(requestBox: SpeechRequestBox) -> @Sendable (AVAudioPCMBuffer, AVAudioTime) -> Void {
        let appendTap: @Sendable (AVAudioPCMBuffer, AVAudioTime) -> Void = { pcmBuffer, _ in
            requestBox.append(pcmBuffer)
        }
        return appendTap
    }
}
/// Reference wrapper that lets an `SFSpeechAudioBufferRecognitionRequest` be captured by a
/// `@Sendable` closure.
///
/// The `@unchecked Sendable` conformance is a manual promise: nothing in this type
/// synchronizes access to the wrapped request.
/// NOTE(review): assumes `append(_:)` on the request may be called from the audio tap while
/// the recognizer consumes it — confirm against the Speech framework documentation.
final class SpeechRequestBox: @unchecked Sendable {
    /// The wrapped recognition request.
    let request: SFSpeechAudioBufferRecognitionRequest

    /// - Parameter request: The recognition request that will receive audio buffers.
    init(request: SFSpeechAudioBufferRecognitionRequest) {
        self.request = request
    }

    /// Forwards one captured audio buffer to the wrapped request.
    ///
    /// - Parameter buffer: PCM audio to append.
    func append(_ buffer: AVAudioPCMBuffer) {
        request.append(buffer)
    }
}
/// Wake-word listener: feeds microphone buffers into an `SFSpeechRecognizer` session, looks
/// for the wake word "clawdis" in each transcript, and passes the text that follows it to the
/// handler registered with `configure(onCommand:)`.
///
/// All mutable state is main-actor isolated. Recognizer callbacks extract plain `String`
/// values and hop to the main actor before touching that state.
@MainActor
final class VoiceWakeManager: NSObject, ObservableObject {
    /// Word that must precede a command; matched against the lowercased transcript.
    private static let wakeWord = "clawdis"
    /// Pause before retrying after the recognizer reports an error.
    private static let restartDelayNanoseconds: UInt64 = 700_000_000

    /// Whether wake-word listening has been requested.
    @Published var isEnabled: Bool = false
    /// Whether a recognition session is currently running.
    @Published var isListening: Bool = false
    /// Human-readable state for display.
    @Published var statusText: String = "Off"

    private let audioEngine = AVAudioEngine()
    private var speechRecognizer: SFSpeechRecognizer?
    private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest?
    private var recognitionTask: SFSpeechRecognitionTask?
    /// True while this manager has a tap installed on the input node's bus 0.
    private var isTapInstalled = false
    /// Bumped whenever a session is started or torn down, so that callbacks belonging to an
    /// older (cancelled or replaced) session can be recognised and ignored.
    private var recognitionGeneration = 0
    /// Last command handed to `onCommand` in the current session; suppresses repeats caused
    /// by successive partial results carrying the same text.
    private var lastDispatched: String?
    private var onCommand: (@Sendable (String) async -> Void)?

    /// Registers the handler invoked with each recognised command.
    ///
    /// - Parameter onCommand: Receives the lowercased, trimmed text that followed the wake word.
    func configure(onCommand: @escaping @Sendable (String) async -> Void) {
        self.onCommand = onCommand
    }

    /// Turns wake-word listening on or off.
    ///
    /// - Parameter enabled: `true` starts listening asynchronously; `false` stops immediately.
    func setEnabled(_ enabled: Bool) {
        self.isEnabled = enabled
        if enabled {
            Task { await self.start() }
        } else {
            self.stop()
        }
    }

    /// Requests permissions and begins a recognition session.
    ///
    /// Does nothing unless `isEnabled` is true and no session is running. Failures are
    /// reported through `statusText`; nothing is thrown.
    func start() async {
        guard self.isEnabled, !self.isListening else { return }

        self.statusText = "Requesting permissions…"

        let micOk = await Self.requestMicrophonePermission()
        // State may have changed while suspended: `stop()` may have run, or another
        // `start()` may already be listening. In either case leave its state alone.
        guard self.isEnabled, !self.isListening else { return }
        guard micOk else {
            self.statusText = "Microphone permission denied"
            self.isListening = false
            return
        }

        let speechOk = await Self.requestSpeechPermission()
        guard self.isEnabled, !self.isListening else { return }
        guard speechOk else {
            self.statusText = "Speech recognition permission denied"
            self.isListening = false
            return
        }

        self.speechRecognizer = SFSpeechRecognizer()
        guard self.speechRecognizer != nil else {
            self.statusText = "Speech recognizer unavailable"
            self.isListening = false
            return
        }

        do {
            try Self.configureAudioSession()
            try self.startRecognition()
            self.isListening = true
            self.statusText = "Listening"
        } catch {
            // Release whatever was set up before the failure (tap, request, audio session).
            self.tearDownRecognition()
            self.isListening = false
            self.statusText = "Start failed: \(error.localizedDescription)"
        }
    }

    /// Disables listening and releases the recognition session, audio tap, and audio session.
    func stop() {
        self.isEnabled = false
        self.isListening = false
        self.statusText = "Off"
        self.tearDownRecognition()
    }

    /// Cancels the current recognition task, stops the engine, removes the tap this manager
    /// installed, and deactivates the shared audio session. Safe to call when nothing is running.
    private func tearDownRecognition() {
        // Invalidate the session first so the cancelled task's callback is ignored.
        self.recognitionGeneration &+= 1

        self.recognitionTask?.cancel()
        self.recognitionTask = nil
        self.recognitionRequest = nil

        if self.audioEngine.isRunning {
            self.audioEngine.stop()
        }
        // Keyed on our own flag, not `isRunning`: the tap may exist while the engine is
        // stopped (for example when `audioEngine.start()` threw after the tap was installed).
        if self.isTapInstalled {
            self.audioEngine.inputNode.removeTap(onBus: 0)
            self.isTapInstalled = false
        }

        try? AVAudioSession.sharedInstance().setActive(false, options: .notifyOthersOnDeactivation)
    }

    /// Creates a fresh recognition request, taps the microphone into it, starts the engine,
    /// and starts the recognition task.
    ///
    /// - Throws: An error when the input format is unusable or the audio engine fails to start.
    private func startRecognition() throws {
        self.recognitionTask?.cancel()
        self.recognitionTask = nil

        // New session: callbacks from any earlier session become stale, and the dedupe
        // memory is cleared so a command repeated in a later session is dispatched again.
        self.recognitionGeneration &+= 1
        let generation = self.recognitionGeneration
        self.lastDispatched = nil

        let request = SFSpeechAudioBufferRecognitionRequest()
        request.shouldReportPartialResults = true
        self.recognitionRequest = request

        let inputNode = self.audioEngine.inputNode
        inputNode.removeTap(onBus: 0)
        self.isTapInstalled = false

        let recordingFormat = inputNode.outputFormat(forBus: 0)
        // Installing a tap with a 0 Hz / 0-channel format raises an Objective-C exception
        // that Swift cannot catch; surface it as a normal start failure instead.
        guard recordingFormat.sampleRate > 0, recordingFormat.channelCount > 0 else {
            throw NSError(
                domain: "VoiceWakeManager",
                code: 1,
                userInfo: [NSLocalizedDescriptionKey: "Audio input unavailable"])
        }

        let requestBox = SpeechRequestBox(request: request)
        let tap = SpeechAudioTapFactory.makeAppendTap(requestBox: requestBox)
        inputNode.installTap(onBus: 0, bufferSize: 1024, format: recordingFormat, block: tap)
        self.isTapInstalled = true

        self.audioEngine.prepare()
        try self.audioEngine.start()

        self.recognitionTask = self.speechRecognizer?
            .recognitionTask(with: request) { [weak manager = self] result, error in
                // Reduce to Sendable values here so no framework object crosses actors.
                let transcript = result?.bestTranscription.formattedString
                let errorText = error?.localizedDescription
                Task { @MainActor in
                    manager?.handleRecognitionCallback(
                        transcript: transcript,
                        errorText: errorText,
                        generation: generation)
                }
            }
    }

    /// Processes one recognizer callback on the main actor.
    ///
    /// - Parameters:
    ///   - transcript: Best transcription so far, if the callback carried a result.
    ///   - errorText: Localized error description, if the callback carried an error.
    ///   - generation: Session identifier captured when the recognition task was created.
    private func handleRecognitionCallback(transcript: String?, errorText: String?, generation: Int) {
        // Drop callbacks from a session that was stopped or replaced; otherwise the error
        // delivered for a cancelled task would overwrite "Off" or trigger a needless restart.
        guard self.isEnabled, generation == self.recognitionGeneration else { return }

        if let errorText {
            self.statusText = "Recognizer error: \(errorText)"
            self.isListening = false

            let delay = Self.restartDelayNanoseconds
            Task { [weak self] in
                try? await Task.sleep(nanoseconds: delay)
                // `start()` re-checks `isEnabled`, so a `stop()` during the delay wins.
                await self?.start()
            }
            return
        }

        guard let transcript, let cmd = self.extractCommand(from: transcript) else { return }
        guard cmd != self.lastDispatched else { return }

        self.lastDispatched = cmd
        self.statusText = "Triggered"

        Task { [weak self] in
            guard let self else { return }
            await self.onCommand?(cmd)
            await self.startIfEnabled()
        }
    }

    /// Returns to the listening state after a command has been handled.
    ///
    /// If the session is still running, only the status text is restored, because `start()`
    /// returns immediately while `isListening` is true. Otherwise a new session is started.
    private func startIfEnabled() async {
        guard self.isEnabled else { return }
        if self.isListening {
            self.statusText = "Listening"
            return
        }
        await self.start()
    }

    /// Extracts the command that follows the last occurrence of the wake word.
    ///
    /// - Parameter transcript: Full transcript reported by the recognizer.
    /// - Returns: The lowercased, whitespace-trimmed text after the wake word, or `nil` when
    ///   the wake word is absent or nothing follows it.
    private func extractCommand(from transcript: String) -> String? {
        let lowered = transcript.lowercased()
        guard let wakeRange = lowered.range(of: Self.wakeWord, options: .backwards) else { return nil }

        let command = lowered[wakeRange.upperBound...].trimmingCharacters(in: .whitespacesAndNewlines)
        return command.isEmpty ? nil : command
    }

    /// Configures and activates the shared audio session for simultaneous playback and recording.
    ///
    /// - Throws: Any error reported by `AVAudioSession` while setting the category or activating.
    private static func configureAudioSession() throws {
        let session = AVAudioSession.sharedInstance()
        try session.setCategory(.playAndRecord, mode: .measurement, options: [
            .duckOthers,
            .mixWithOthers,
            .allowBluetoothHFP,
            .defaultToSpeaker,
        ])
        try session.setActive(true, options: [])
    }

    /// Asks for microphone access.
    ///
    /// - Returns: `true` when recording permission was granted.
    private nonisolated static func requestMicrophonePermission() async -> Bool {
        await withCheckedContinuation(isolation: nil) { cont in
            AVAudioApplication.requestRecordPermission { ok in
                cont.resume(returning: ok)
            }
        }
    }

    /// Asks for speech-recognition access.
    ///
    /// - Returns: `true` only when the authorization status is `.authorized`.
    private nonisolated static func requestSpeechPermission() async -> Bool {
        await withCheckedContinuation(isolation: nil) { cont in
            SFSpeechRecognizer.requestAuthorization { status in
                cont.resume(returning: status == .authorized)
            }
        }
    }
}