Files
clawdbot/apps/macos/Sources/Clawdis/VoicePushToTalk.swift
2025-12-09 00:08:19 +01:00

265 lines
9.7 KiB
Swift

import AppKit
import AVFoundation
import OSLog
import Speech
/// Observes right Option and starts a push-to-talk capture while it is held.
///
/// Uses a listen-only global `flagsChanged` monitor; receiving events for other
/// apps requires the Input Monitoring permission.
@MainActor
final class VoicePushToTalkHotkey {
    static let shared = VoicePushToTalkHotkey()

    private var monitor: Any?
    private var optionDown = false // right option only
    private var active = false // true while a push-to-talk capture is in flight

    /// Turns the global hotkey monitor on or off.
    func setEnabled(_ enabled: Bool) {
        if enabled {
            self.startMonitoring()
        } else {
            self.stopMonitoring()
        }
    }

    /// Installs the global modifier-flags monitor (idempotent).
    private func startMonitoring() {
        guard self.monitor == nil else { return }
        // Listen-only global monitor; we rely on Input Monitoring permission to receive events.
        self.monitor = NSEvent.addGlobalMonitorForEvents(matching: .flagsChanged) { [weak self] event in
            guard let self else { return }
            self.updateModifierState(from: event)
        }
    }

    /// Removes the monitor and finishes any capture that is still in flight.
    private func stopMonitoring() {
        if let monitor {
            NSEvent.removeMonitor(monitor)
            self.monitor = nil
        }
        self.optionDown = false
        // Fix: previously `active` was cleared without ending the capture, so
        // disabling the hotkey while the chord was held left the recognizer,
        // audio engine, and overlay running with no way to stop them.
        if self.active {
            self.active = false
            Task {
                await VoicePushToTalk.shared.end()
            }
        }
    }

    /// Tracks the right Option key and begins/ends capture on press/release edges.
    private func updateModifierState(from event: NSEvent) {
        // Right Option (keyCode 61) acts as a hold-to-talk modifier.
        if event.keyCode == 61 {
            self.optionDown = event.modifierFlags.contains(.option)
        }
        let chordActive = self.optionDown
        if chordActive && !self.active {
            self.active = true
            Task {
                await VoicePushToTalk.shared.begin()
            }
        } else if !chordActive && self.active {
            self.active = false
            Task {
                await VoicePushToTalk.shared.end()
            }
        }
    }
}
/// Short-lived speech recognizer that records while the hotkey is held.
///
/// `begin()` pauses the always-on wake-word runtime, taps the default input via
/// `AVAudioEngine`, and streams buffers into an `SFSpeechAudioBufferRecognitionRequest`,
/// pushing partial transcripts into the overlay. `end()` tears the pipeline down,
/// presents the final transcript, and schedules a wake-word runtime refresh.
actor VoicePushToTalk {
    static let shared = VoicePushToTalk()

    private var recognizer: SFSpeechRecognizer?
    private var audioEngine = AVAudioEngine()
    private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest?
    private var recognitionTask: SFSpeechRecognitionTask?
    // Transcript state: `committed` is text the recognizer has finalized,
    // `volatile` is the still-changing tail of the current utterance.
    private var committed: String = ""
    private var volatile: String = ""
    private var activeConfig: Config?
    private var isCapturing = false
    private var triggerChimePlayed = false

    /// Settings snapshotted on the main actor when a capture starts, so the
    /// whole session uses one consistent configuration.
    private struct Config {
        // NOTE(review): micID is captured but nothing in this file selects an
        // input device with it — confirm whether device selection is intended.
        let micID: String?
        let localeID: String?
        let forwardConfig: VoiceWakeForwardConfig
        let triggerChime: VoiceWakeChime
        let sendChime: VoiceWakeChime
    }

    /// Starts a capture. No-op if unsupported, already capturing, or the user
    /// denies permissions.
    func begin() async {
        guard voiceWakeSupported else { return }
        guard !self.isCapturing else { return }
        // Ensure permissions up front.
        let granted = await PermissionManager.ensureVoiceWakePermissions(interactive: true)
        guard granted else { return }
        let config = await MainActor.run { self.makeConfig() }
        self.activeConfig = config
        self.isCapturing = true
        self.triggerChimePlayed = false
        if config.triggerChime != .none {
            self.triggerChimePlayed = true
            await MainActor.run { VoiceWakeChimePlayer.play(config.triggerChime) }
        }
        // Pause the always-on wake word recognizer so both pipelines don't fight over the mic tap.
        await VoiceWakeRuntime.shared.pauseForPushToTalk()
        await MainActor.run {
            VoiceWakeOverlayController.shared.showPartial(transcript: "")
        }
        do {
            try await self.startRecognition(localeID: config.localeID)
        } catch {
            await MainActor.run {
                VoiceWakeOverlayController.shared.dismiss()
            }
            self.isCapturing = false
            self.activeConfig = nil
            // Fix: the wake-word runtime was paused above; without resuming it
            // here, a failed start left wake-word detection dead until the next
            // successful push-to-talk cycle.
            _ = await MainActor.run {
                Task {
                    await VoiceWakeRuntime.shared.refresh(state: AppStateStore.shared)
                }
            }
        }
    }

    /// Stops the capture, presents the final transcript in the overlay, and
    /// resumes the wake-word runtime. No-op when not capturing.
    func end() async {
        guard self.isCapturing else { return }
        self.isCapturing = false
        // Cancel rather than finish: the final text is assembled from the
        // partials accumulated in `committed` + `volatile`, so we don't wait
        // for the recognizer's final result.
        self.recognitionTask?.cancel()
        self.recognitionRequest?.endAudio()
        self.recognitionRequest = nil
        self.recognitionTask = nil
        self.audioEngine.inputNode.removeTap(onBus: 0)
        self.audioEngine.stop()
        let finalText = (self.committed + self.volatile).trimmingCharacters(in: .whitespacesAndNewlines)
        let attributed = Self.makeAttributed(committed: self.committed, volatile: self.volatile, isFinal: true)
        // Prefer the config snapshotted at begin(); fall back to live state if
        // end() somehow runs without a cached config.
        let forward: VoiceWakeForwardConfig
        if let cached = self.activeConfig?.forwardConfig {
            forward = cached
        } else {
            forward = await MainActor.run { AppStateStore.shared.voiceWakeForwardConfig }
        }
        // No chime (and no send delay) for an empty transcript.
        let chime = finalText.isEmpty ? .none : (self.activeConfig?.sendChime ?? .none)
        await MainActor.run {
            VoiceWakeOverlayController.shared.presentFinal(
                transcript: finalText,
                forwardConfig: forward,
                delay: finalText.isEmpty ? 0.0 : 0.8,
                sendChime: chime,
                attributed: attributed)
        }
        self.committed = ""
        self.volatile = ""
        self.activeConfig = nil
        self.triggerChimePlayed = false
        // Resume the wake-word runtime after push-to-talk finishes.
        _ = await MainActor.run {
            Task {
                await VoiceWakeRuntime.shared.refresh(state: AppStateStore.shared)
            }
        }
    }

    // MARK: - Private

    /// Builds the recognizer + request, taps the input node, and starts the
    /// audio engine. Throws if the recognizer is unavailable or the engine
    /// fails to start.
    private func startRecognition(localeID: String?) async throws {
        let locale = localeID.flatMap { Locale(identifier: $0) } ?? Locale.current
        self.recognizer = SFSpeechRecognizer(locale: locale)
        guard let recognizer, recognizer.isAvailable else {
            throw NSError(domain: "VoicePushToTalk", code: 1, userInfo: [NSLocalizedDescriptionKey: "Recognizer unavailable"])
        }
        self.recognitionRequest = SFSpeechAudioBufferRecognitionRequest()
        self.recognitionRequest?.shouldReportPartialResults = true
        guard let request = self.recognitionRequest else { return }
        let input = self.audioEngine.inputNode
        let format = input.outputFormat(forBus: 0)
        input.removeTap(onBus: 0)
        // Pipe raw mic buffers into the Speech request while the chord is held.
        input.installTap(onBus: 0, bufferSize: 2048, format: format) { [weak request] buffer, _ in
            request?.append(buffer)
        }
        self.audioEngine.prepare()
        do {
            try self.audioEngine.start()
        } catch {
            // Fix: don't leave a stale tap installed when the engine fails to start.
            input.removeTap(onBus: 0)
            throw error
        }
        self.recognitionTask = recognizer.recognitionTask(with: request) { [weak self] result, error in
            guard let self else { return }
            if let error {
                Logger(subsystem: "com.steipete.clawdis", category: "voicewake.ptt")
                    .debug("push-to-talk error: \(error.localizedDescription, privacy: .public)")
            }
            let transcript = result?.bestTranscription.formattedString
            let isFinal = result?.isFinal ?? false
            // Hop to a Task so UI updates stay off the Speech callback thread.
            Task.detached { [weak self, transcript, isFinal] in
                guard let self else { return }
                await self.handle(transcript: transcript, isFinal: isFinal)
            }
        }
    }

    /// Folds a recognizer callback into `committed`/`volatile` and refreshes
    /// the overlay with the combined snapshot.
    private func handle(transcript: String?, isFinal: Bool) async {
        guard let transcript else { return }
        if isFinal {
            self.committed = transcript
            self.volatile = ""
        } else {
            self.volatile = Self.delta(after: self.committed, current: transcript)
        }
        let snapshot = self.committed + self.volatile
        let attributed = Self.makeAttributed(committed: self.committed, volatile: self.volatile, isFinal: isFinal)
        await MainActor.run {
            VoiceWakeOverlayController.shared.showPartial(transcript: snapshot, attributed: attributed)
        }
    }

    /// Snapshots the relevant app-state settings; must run on the main actor.
    @MainActor
    private func makeConfig() -> Config {
        let state = AppStateStore.shared
        return Config(
            micID: state.voiceWakeMicID.isEmpty ? nil : state.voiceWakeMicID,
            localeID: state.voiceWakeLocaleID,
            forwardConfig: state.voiceWakeForwardConfig,
            triggerChime: state.voiceWakeTriggerChime,
            sendChime: state.voiceWakeSendChime)
    }

    // MARK: - Test helpers

    /// Exposes `delta(after:current:)` for unit tests.
    static func _testDelta(committed: String, current: String) -> String {
        self.delta(after: committed, current: current)
    }

    /// Exposes the committed/volatile foreground colors for unit tests.
    static func _testAttributedColors(isFinal: Bool) -> (NSColor, NSColor) {
        let sample = self.makeAttributed(committed: "a", volatile: "b", isFinal: isFinal)
        let committedColor = sample.attribute(.foregroundColor, at: 0, effectiveRange: nil) as? NSColor ?? .clear
        let volatileColor = sample.attribute(.foregroundColor, at: 1, effectiveRange: nil) as? NSColor ?? .clear
        return (committedColor, volatileColor)
    }

    /// Returns the suffix of `current` that extends `committed`; if `current`
    /// no longer starts with `committed` (the recognizer revised earlier text),
    /// returns `current` wholesale.
    private static func delta(after committed: String, current: String) -> String {
        if current.hasPrefix(committed) {
            // Safe: hasPrefix guarantees at least `committed.count` characters.
            let start = current.index(current.startIndex, offsetBy: committed.count)
            return String(current[start...])
        }
        return current
    }

    /// Renders committed text in the label color and the volatile tail in the
    /// tertiary label color (promoted to label color once final).
    private static func makeAttributed(committed: String, volatile: String, isFinal: Bool) -> NSAttributedString {
        let full = NSMutableAttributedString()
        let committedAttr: [NSAttributedString.Key: Any] = [
            .foregroundColor: NSColor.labelColor,
            .font: NSFont.systemFont(ofSize: 13, weight: .regular),
        ]
        full.append(NSAttributedString(string: committed, attributes: committedAttr))
        let volatileColor: NSColor = isFinal ? .labelColor : NSColor.tertiaryLabelColor
        let volatileAttr: [NSAttributedString.Key: Any] = [
            .foregroundColor: volatileColor,
            .font: NSFont.systemFont(ofSize: 13, weight: .regular),
        ]
        full.append(NSAttributedString(string: volatile, attributes: volatileAttr))
        return full
    }
}