265 lines
9.7 KiB
Swift
265 lines
9.7 KiB
Swift
import AppKit
|
|
import AVFoundation
|
|
import OSLog
|
|
import Speech
|
|
|
|
/// Observes right Option and starts a push-to-talk capture while it is held.
///
/// A listen-only global `flagsChanged` monitor tracks the key. A press is forwarded to
/// `VoicePushToTalk.shared.begin()` and a release to `VoicePushToTalk.shared.end()`.
@MainActor
final class VoicePushToTalkHotkey {
    static let shared = VoicePushToTalkHotkey()

    /// Virtual key code that `flagsChanged` events report for the right Option key.
    private static let rightOptionKeyCode: UInt16 = 61
    /// Device-dependent modifier bits for the left and right Option keys
    /// (`NX_DEVICELALTKEYMASK` / `NX_DEVICERALTKEYMASK` in IOKit's `IOLLEvent.h`).
    /// They live in the low bits of the raw modifier flags, below `deviceIndependentFlagsMask`.
    private static let leftOptionDeviceMask: UInt = 0x20
    private static let rightOptionDeviceMask: UInt = 0x40

    /// Token returned by `NSEvent.addGlobalMonitorForEvents`; `nil` while monitoring is off.
    private var monitor: Any?
    private var optionDown = false // right option only
    /// True between the `begin()` and `end()` calls dispatched by this class.
    private var active = false

    /// Turns the global hotkey monitor on or off.
    ///
    /// - Parameter enabled: `true` installs the monitor (no-op if already installed);
    ///   `false` removes it and ends any capture that is still in progress.
    func setEnabled(_ enabled: Bool) {
        if enabled {
            self.startMonitoring()
        } else {
            self.stopMonitoring()
        }
    }

    /// Installs the global `flagsChanged` monitor once.
    private func startMonitoring() {
        guard self.monitor == nil else { return }
        // Listen-only global monitor; we rely on Input Monitoring permission to receive events.
        self.monitor = NSEvent.addGlobalMonitorForEvents(matching: .flagsChanged) { [weak self] event in
            guard let self else { return }
            self.updateModifierState(from: event)
        }
    }

    /// Removes the monitor and resets the tracked key state.
    private func stopMonitoring() {
        if let monitor {
            NSEvent.removeMonitor(monitor)
            self.monitor = nil
        }
        self.optionDown = false
        // Without the monitor the key release will never be observed, so a capture that is
        // still running must be ended here or it would keep recording indefinitely.
        if self.active {
            self.active = false
            Task {
                await VoicePushToTalk.shared.end()
            }
        }
    }

    /// Updates the right-Option state from a `flagsChanged` event and dispatches
    /// `begin()` / `end()` on the transitions.
    private func updateModifierState(from event: NSEvent) {
        let flags = event.modifierFlags
        if !flags.contains(.option) {
            // No Option key is down at all, so right Option cannot be held. Clearing on every
            // event also recovers from a missed right-Option release.
            self.optionDown = false
        } else if event.keyCode == Self.rightOptionKeyCode {
            // Right Option (keyCode 61) acts as a hold-to-talk modifier. `.option` alone is
            // ambiguous here: it stays set when right Option is released while left Option
            // is still held. Prefer the device-dependent bits when the event carries them.
            let deviceBits = flags.rawValue & (Self.leftOptionDeviceMask | Self.rightOptionDeviceMask)
            if deviceBits != 0 {
                self.optionDown = (deviceBits & Self.rightOptionDeviceMask) != 0
            } else {
                // No device-dependent bits on this event; fall back to the
                // device-independent flag, which is known to be set in this branch.
                self.optionDown = true
            }
        }

        let chordActive = self.optionDown
        if chordActive && !self.active {
            self.active = true
            Task {
                await VoicePushToTalk.shared.begin()
            }
        } else if !chordActive && self.active {
            self.active = false
            Task {
                await VoicePushToTalk.shared.end()
            }
        }
    }
}
|
|
|
|
/// Short-lived speech recognizer that records while the hotkey is held.
///
/// `begin()` starts a capture session and `end()` finishes it and hands the transcript to
/// the overlay. Both are `async` and suspend while talking to the main actor, so the actor
/// can interleave them; the `isStarting` / `releasedWhileStarting` flags make sure a release
/// that arrives while `begin()` is still suspended is not lost.
actor VoicePushToTalk {
    static let shared = VoicePushToTalk()

    private static let logger = Logger(subsystem: "com.steipete.clawdis", category: "voicewake.ptt")

    private var recognizer: SFSpeechRecognizer?
    private var audioEngine = AVAudioEngine()
    private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest?
    private var recognitionTask: SFSpeechRecognitionTask?

    /// Text from the last final recognition result of the current session.
    private var committed: String = ""
    /// Text of the latest partial result that follows `committed`.
    private var volatile: String = ""
    private var activeConfig: Config?
    /// True while the audio tap and recognition task of a session are live.
    private var isCapturing = false
    /// True while `begin()` is running, including its suspension points.
    private var isStarting = false
    /// Set by `end()` when the key is released before `begin()` finished starting.
    private var releasedWhileStarting = false
    private var triggerChimePlayed = false

    /// Settings snapshotted from `AppStateStore` when a session begins.
    private struct Config {
        // NOTE(review): `micID` is captured but not applied anywhere in this type; the
        // engine's default input node is used for recording.
        let micID: String?
        let localeID: String?
        let forwardConfig: VoiceWakeForwardConfig
        let triggerChime: VoiceWakeChime
        let sendChime: VoiceWakeChime
    }

    /// Starts a push-to-talk capture session.
    ///
    /// Does nothing when voice wake is unsupported, when a session is already starting or
    /// capturing, or when permissions are not granted. If `end()` is called while this
    /// method is suspended, the start is abandoned instead of leaving the microphone open.
    func begin() async {
        guard voiceWakeSupported else { return }
        guard !self.isCapturing, !self.isStarting else { return }
        self.isStarting = true
        self.releasedWhileStarting = false
        defer { self.isStarting = false }

        // Ensure permissions up front.
        let granted = await PermissionManager.ensureVoiceWakePermissions(interactive: true)
        // The key may have been released while the permission check was pending.
        guard granted, !self.releasedWhileStarting else { return }

        let config = await MainActor.run { self.makeConfig() }
        guard !self.releasedWhileStarting else { return }

        self.activeConfig = config
        // Start from a clean transcript so nothing from an earlier session leaks in.
        self.committed = ""
        self.volatile = ""
        self.triggerChimePlayed = false
        if config.triggerChime != .none {
            self.triggerChimePlayed = true
            await MainActor.run { VoiceWakeChimePlayer.play(config.triggerChime) }
        }
        // Pause the always-on wake word recognizer so both pipelines don't fight over the mic tap.
        await VoiceWakeRuntime.shared.pauseForPushToTalk()
        await MainActor.run {
            VoiceWakeOverlayController.shared.showPartial(transcript: "")
        }

        // Last chance to notice a release: from here to the end of the success path there
        // are no suspension points, so `end()` cannot interleave with the actual start.
        guard !self.releasedWhileStarting else {
            await self.abandonStart()
            return
        }

        self.isCapturing = true
        do {
            try self.startRecognition(localeID: config.localeID)
        } catch {
            self.isCapturing = false
            Self.logger.error("push-to-talk failed to start: \(error.localizedDescription, privacy: .public)")
            await self.abandonStart()
        }
    }

    /// Finishes the current capture session and presents the final transcript.
    ///
    /// When no session is capturing yet but `begin()` is still starting, the release is
    /// recorded so `begin()` can abort; otherwise the call is a no-op.
    func end() async {
        guard self.isCapturing else {
            if self.isStarting { self.releasedWhileStarting = true }
            return
        }
        self.isCapturing = false

        self.recognitionTask?.cancel()
        self.recognitionRequest?.endAudio()
        self.recognitionRequest = nil
        self.recognitionTask = nil
        self.audioEngine.inputNode.removeTap(onBus: 0)
        self.audioEngine.stop()

        let finalText = (self.committed + self.volatile).trimmingCharacters(in: .whitespacesAndNewlines)
        let attributed = Self.makeAttributed(committed: self.committed, volatile: self.volatile, isFinal: true)
        let config = self.activeConfig

        // Reset session state before the first suspension point: a new `begin()` may run
        // while this method awaits the main actor, and must not have its state wiped.
        self.committed = ""
        self.volatile = ""
        self.activeConfig = nil
        self.triggerChimePlayed = false

        let forward: VoiceWakeForwardConfig
        if let cached = config?.forwardConfig {
            forward = cached
        } else {
            forward = await MainActor.run { AppStateStore.shared.voiceWakeForwardConfig }
        }

        let chime: VoiceWakeChime = finalText.isEmpty ? .none : (config?.sendChime ?? .none)

        await MainActor.run {
            VoiceWakeOverlayController.shared.presentFinal(
                transcript: finalText,
                forwardConfig: forward,
                delay: finalText.isEmpty ? 0.0 : 0.8,
                sendChime: chime,
                attributed: attributed)
        }

        // Resume the wake-word runtime after push-to-talk finishes.
        await self.resumeWakeRuntime()
    }

    // MARK: - Private

    /// Undoes the visible effects of a start that was aborted or failed: clears the
    /// session config, dismisses the overlay and lets the wake-word runtime resume.
    private func abandonStart() async {
        self.activeConfig = nil
        self.triggerChimePlayed = false
        await MainActor.run {
            VoiceWakeOverlayController.shared.dismiss()
        }
        await self.resumeWakeRuntime()
    }

    /// Asks the wake-word runtime to refresh itself from the current app state.
    private func resumeWakeRuntime() async {
        _ = await MainActor.run {
            Task {
                await VoiceWakeRuntime.shared.refresh(state: AppStateStore.shared)
            }
        }
    }

    /// Creates the recognizer, installs the microphone tap, starts the audio engine and
    /// launches the recognition task.
    ///
    /// Deliberately synchronous so the caller can start a session without a suspension point.
    ///
    /// - Parameter localeID: Locale identifier for the recognizer; `nil` or empty falls
    ///   back to the current locale.
    /// - Throws: An `NSError` in domain `"VoicePushToTalk"` when the recognizer is
    ///   unavailable (code 1) or the input node reports an unusable format (code 2), or the
    ///   error thrown by `AVAudioEngine.start()`.
    private func startRecognition(localeID: String?) throws {
        let locale = localeID.flatMap { $0.isEmpty ? nil : Locale(identifier: $0) }
            ?? Locale(identifier: Locale.current.identifier)
        let recognizer = SFSpeechRecognizer(locale: locale)
        self.recognizer = recognizer
        guard let recognizer, recognizer.isAvailable else {
            throw NSError(domain: "VoicePushToTalk", code: 1, userInfo: [NSLocalizedDescriptionKey: "Recognizer unavailable"])
        }

        let request = SFSpeechAudioBufferRecognitionRequest()
        request.shouldReportPartialResults = true

        let input = self.audioEngine.inputNode
        let format = input.outputFormat(forBus: 0)
        // With no usable input device the node reports a 0 Hz / 0-channel format, and
        // installing a tap with it raises an Objective-C exception that Swift cannot catch.
        guard format.sampleRate > 0, format.channelCount > 0 else {
            throw NSError(domain: "VoicePushToTalk", code: 2, userInfo: [NSLocalizedDescriptionKey: "No audio input available"])
        }
        input.removeTap(onBus: 0)
        // Pipe raw mic buffers into the Speech request while the chord is held.
        input.installTap(onBus: 0, bufferSize: 2048, format: format) { [weak request] buffer, _ in
            request?.append(buffer)
        }

        self.audioEngine.prepare()
        do {
            try self.audioEngine.start()
        } catch {
            // Don't leave a tap behind on an engine that never started.
            input.removeTap(onBus: 0)
            throw error
        }

        self.recognitionRequest = request
        self.recognitionTask = recognizer.recognitionTask(with: request) { [weak self] result, error in
            guard let self else { return }
            if let error {
                VoicePushToTalk.logger.debug("push-to-talk error: \(error.localizedDescription, privacy: .public)")
            }
            let transcript = result?.bestTranscription.formattedString
            let isFinal = result?.isFinal ?? false
            // Hop to a Task so UI updates stay off the Speech callback thread.
            Task.detached { [weak self, transcript, isFinal] in
                guard let self else { return }
                await self.handle(transcript: transcript, isFinal: isFinal)
            }
        }
    }

    /// Folds one recognition result into `committed` / `volatile` and shows it as a partial
    /// transcript on the overlay.
    ///
    /// Results that arrive when no session is capturing are dropped, so late callbacks
    /// cannot write stale text after `end()` has presented the final transcript.
    private func handle(transcript: String?, isFinal: Bool) async {
        guard self.isCapturing, let transcript else { return }
        if isFinal {
            self.committed = transcript
            self.volatile = ""
        } else {
            self.volatile = Self.delta(after: self.committed, current: transcript)
        }

        let snapshot = self.committed + self.volatile
        let attributed = Self.makeAttributed(committed: self.committed, volatile: self.volatile, isFinal: isFinal)
        await MainActor.run {
            VoiceWakeOverlayController.shared.showPartial(transcript: snapshot, attributed: attributed)
        }
    }

    /// Snapshots the voice-wake settings from `AppStateStore.shared`.
    @MainActor
    private func makeConfig() -> Config {
        let state = AppStateStore.shared
        return Config(
            micID: state.voiceWakeMicID.isEmpty ? nil : state.voiceWakeMicID,
            localeID: state.voiceWakeLocaleID,
            forwardConfig: state.voiceWakeForwardConfig,
            triggerChime: state.voiceWakeTriggerChime,
            sendChime: state.voiceWakeSendChime)
    }

    // MARK: - Test helpers

    /// Exposes `delta(after:current:)` to tests.
    static func _testDelta(committed: String, current: String) -> String {
        self.delta(after: committed, current: current)
    }

    /// Returns the foreground colors applied to the committed and volatile parts of a
    /// two-character sample string, or `.clear` where no color attribute is found.
    static func _testAttributedColors(isFinal: Bool) -> (NSColor, NSColor) {
        let sample = self.makeAttributed(committed: "a", volatile: "b", isFinal: isFinal)
        let committedColor = sample.attribute(.foregroundColor, at: 0, effectiveRange: nil) as? NSColor ?? .clear
        let volatileColor = sample.attribute(.foregroundColor, at: 1, effectiveRange: nil) as? NSColor ?? .clear
        return (committedColor, volatileColor)
    }

    /// Returns the part of `current` that follows `committed`, or all of `current` when it
    /// does not start with `committed`.
    private static func delta(after committed: String, current: String) -> String {
        guard current.hasPrefix(committed) else { return current }
        return String(current.dropFirst(committed.count))
    }

    /// Builds the overlay string: `committed` in the label color followed by `volatile`,
    /// which is dimmed to the tertiary label color unless `isFinal` is true.
    private static func makeAttributed(committed: String, volatile: String, isFinal: Bool) -> NSAttributedString {
        let font = NSFont.systemFont(ofSize: 13, weight: .regular)
        let committedAttr: [NSAttributedString.Key: Any] = [
            .foregroundColor: NSColor.labelColor,
            .font: font,
        ]
        let volatileColor: NSColor = isFinal ? .labelColor : .tertiaryLabelColor
        let volatileAttr: [NSAttributedString.Key: Any] = [
            .foregroundColor: volatileColor,
            .font: font,
        ]

        let full = NSMutableAttributedString()
        full.append(NSAttributedString(string: committed, attributes: committedAttr))
        full.append(NSAttributedString(string: volatile, attributes: volatileAttr))
        return full
    }
}
|