// clawdbot/apps/macos/Sources/Clawdbot/VoicePushToTalk.swift

import AppKit
import AVFoundation
import Dispatch
import OSLog
import Speech

/// Observes right Option and starts a push-to-talk capture while it is held.
final class VoicePushToTalkHotkey: @unchecked Sendable {
    static let shared = VoicePushToTalkHotkey()

    private var globalMonitor: Any?
    private var localMonitor: Any?
    private var optionDown = false // right option only
    private var active = false

    private let beginAction: @Sendable () async -> Void
    private let endAction: @Sendable () async -> Void

    init(
        beginAction: @escaping @Sendable () async -> Void = { await VoicePushToTalk.shared.begin() },
        endAction: @escaping @Sendable () async -> Void = { await VoicePushToTalk.shared.end() })
    {
        self.beginAction = beginAction
        self.endAction = endAction
    }
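
    /// Turns the hotkey monitors on or off. Safe to call from any thread; the
    /// actual monitor changes are marshalled onto the main thread. No-op while running tests.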
    func setEnabled(_ enabled: Bool) {
        if ProcessInfo.processInfo.isRunningTests { return }
        self.withMainThread { [weak self] in
            guard let self else { return }
            if enabled {
                self.startMonitoring()
            } else {
                self.stopMonitoring()
            }
        }
    }

    private func startMonitoring() {
        // assert(Thread.isMainThread) - Removed for Swift 6
        guard self.globalMonitor == nil, self.localMonitor == nil else { return }
        // Listen-only global monitor; we rely on Input Monitoring permission to receive events.
        self.globalMonitor = NSEvent.addGlobalMonitorForEvents(matching: .flagsChanged) { [weak self] event in
            let keyCode = event.keyCode
            let flags = event.modifierFlags
            self?.handleFlagsChanged(keyCode: keyCode, modifierFlags: flags)
        }
        // Also listen locally so we still catch events when the app is active/focused.
        self.localMonitor = NSEvent.addLocalMonitorForEvents(matching: .flagsChanged) { [weak self] event in
            let keyCode = event.keyCode
            let flags = event.modifierFlags
            self?.handleFlagsChanged(keyCode: keyCode, modifierFlags: flags)
            return event
        }
    }

    private func stopMonitoring() {
        // assert(Thread.isMainThread) - Removed for Swift 6
        if let globalMonitor {
            NSEvent.removeMonitor(globalMonitor)
            self.globalMonitor = nil
        }
        if let localMonitor {
            NSEvent.removeMonitor(localMonitor)
            self.localMonitor = nil
        }
        self.optionDown = false
        self.active = false
    }

    private func handleFlagsChanged(keyCode: UInt16, modifierFlags: NSEvent.ModifierFlags) {
        self.withMainThread { [weak self] in
            self?.updateModifierState(keyCode: keyCode, modifierFlags: modifierFlags)
        }
    }

    private func withMainThread(_ block: @escaping @Sendable () -> Void) {
        DispatchQueue.main.async(execute: block)
    }
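
    /// Tracks the right-Option key state and fires the begin/end actions exactly
    /// once per press-and-hold, however many flagsChanged events arrive.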
    private func updateModifierState(keyCode: UInt16, modifierFlags: NSEvent.ModifierFlags) {
        // assert(Thread.isMainThread) - Removed for Swift 6
        // Right Option (keyCode 61) acts as a hold-to-talk modifier.
        if keyCode == 61 {
            self.optionDown = modifierFlags.contains(.option)
        }
        let chordActive = self.optionDown
        if chordActive, !self.active {
            self.active = true
            Task {
                Logger(subsystem: "com.clawdbot", category: "voicewake.ptt")
                    .info("ptt hotkey down")
                await self.beginAction()
            }
        } else if !chordActive, self.active {
            self.active = false
            Task {
                Logger(subsystem: "com.clawdbot", category: "voicewake.ptt")
                    .info("ptt hotkey up")
                await self.endAction()
            }
        }
    }

    func _testUpdateModifierState(keyCode: UInt16, modifierFlags: NSEvent.ModifierFlags) {
        self.updateModifierState(keyCode: keyCode, modifierFlags: modifierFlags)
    }
}

/// Short-lived speech recognizer that records while the hotkey is held.
actor VoicePushToTalk {
    static let shared = VoicePushToTalk()

    private let logger = Logger(subsystem: "com.clawdbot", category: "voicewake.ptt")
    private var recognizer: SFSpeechRecognizer?
    // Lazily created on begin() to avoid creating an AVAudioEngine at app launch, which can switch Bluetooth
    // headphones into the low-quality headset profile even if push-to-talk is never used.
    private var audioEngine: AVAudioEngine?
    private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest?
    private var recognitionTask: SFSpeechRecognitionTask?
    private var tapInstalled = false
    // Session token used to drop stale callbacks when a new capture starts.
    private var sessionID = UUID()
    private var committed: String = ""
    private var volatile: String = ""
    private var activeConfig: Config?
    private var isCapturing = false
    private var triggerChimePlayed = false
    private var finalized = false
    private var timeoutTask: Task<Void, Never>?
    private var overlayToken: UUID?
    private var adoptedPrefix: String = ""
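
    /// User settings snapshotted once at begin(), so preference changes mid-capture don't affect the session.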
    private struct Config {
        let micID: String?
        let localeID: String?
        let triggerChime: VoiceWakeChime
        let sendChime: VoiceWakeChime
    }
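
    /// Starts a push-to-talk capture: checks permissions, pauses the wake-word
    /// recognizer, adopts any visible overlay text as a prefix, and begins
    /// streaming mic audio into a fresh speech-recognition session.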
    func begin() async {
        guard voiceWakeSupported else { return }
        guard !self.isCapturing else { return }
        // Start a fresh session and invalidate any in-flight callbacks tied to an older one.
        let sessionID = UUID()
        self.sessionID = sessionID
        // Ensure permissions up front.
        let granted = await PermissionManager.ensureVoiceWakePermissions(interactive: true)
        guard granted else { return }
        let config = await MainActor.run { self.makeConfig() }
        self.activeConfig = config
        self.isCapturing = true
        self.triggerChimePlayed = false
        self.finalized = false
        self.timeoutTask?.cancel()
        self.timeoutTask = nil
        let snapshot = await MainActor.run { VoiceSessionCoordinator.shared.snapshot() }
        self.adoptedPrefix = snapshot.visible ? snapshot.text.trimmingCharacters(in: .whitespacesAndNewlines) : ""
        self.logger.info("ptt begin adopted_prefix_len=\(self.adoptedPrefix.count, privacy: .public)")
        if config.triggerChime != .none {
            self.triggerChimePlayed = true
            await MainActor.run { VoiceWakeChimePlayer.play(config.triggerChime, reason: "ptt.trigger") }
        }
        // Pause the always-on wake word recognizer so both pipelines don't fight over the mic tap.
        await VoiceWakeRuntime.shared.pauseForPushToTalk()
        let adoptedPrefix = self.adoptedPrefix
        let adoptedAttributed: NSAttributedString? = adoptedPrefix.isEmpty ? nil : Self.makeAttributed(
            committed: adoptedPrefix,
            volatile: "",
            isFinal: false)
        self.overlayToken = await MainActor.run {
            VoiceSessionCoordinator.shared.startSession(
                source: .pushToTalk,
                text: adoptedPrefix,
                attributed: adoptedAttributed,
                forwardEnabled: true)
        }
        do {
            try await self.startRecognition(localeID: config.localeID, sessionID: sessionID)
        } catch {
            await MainActor.run {
                VoiceWakeOverlayController.shared.dismiss()
            }
            self.isCapturing = false
            // If push-to-talk fails to start after pausing wake-word, ensure we resume listening.
            await VoiceWakeRuntime.shared.applyPushToTalkCooldown()
            await VoiceWakeRuntime.shared.refresh(state: AppStateStore.shared)
        }
    }
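
    /// Called when the hotkey is released: stops feeding audio and gives the
    /// recognizer a short grace period to deliver its final result before
    /// finalizing with whatever text has arrived.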
    func end() async {
        guard self.isCapturing else { return }
        self.isCapturing = false
        let sessionID = self.sessionID
        // Stop feeding Speech buffers first, then end the request. Stopping the engine here can race with
        // Speech draining its converter chain (and we already stop/cancel in finalize).
        if self.tapInstalled {
            self.audioEngine?.inputNode.removeTap(onBus: 0)
            self.tapInstalled = false
        }
        self.recognitionRequest?.endAudio()
        // If we captured nothing, dismiss immediately when the user lets go.
        if self.committed.isEmpty, self.volatile.isEmpty, self.adoptedPrefix.isEmpty {
            await self.finalize(transcriptOverride: "", reason: "emptyOnRelease", sessionID: sessionID)
            return
        }
        // Otherwise, give Speech a brief window to deliver the final result; then fall back.
        self.timeoutTask?.cancel()
        self.timeoutTask = Task { [weak self] in
            try? await Task.sleep(nanoseconds: 1_500_000_000) // 1.5s grace period to await final result
            await self?.finalize(transcriptOverride: nil, reason: "timeout", sessionID: sessionID)
        }
    }

    // MARK: - Private
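
    /// Builds the recognizer for the configured locale, installs a mic tap, and
    /// streams buffers into an SFSpeechAudioBufferRecognitionRequest with
    /// partial results enabled.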
    private func startRecognition(localeID: String?, sessionID: UUID) async throws {
        let locale = localeID.flatMap { Locale(identifier: $0) } ?? Locale(identifier: Locale.current.identifier)
        self.recognizer = SFSpeechRecognizer(locale: locale)
        guard let recognizer, recognizer.isAvailable else {
            throw NSError(
                domain: "VoicePushToTalk",
                code: 1,
                userInfo: [NSLocalizedDescriptionKey: "Recognizer unavailable"])
        }
        self.recognitionRequest = SFSpeechAudioBufferRecognitionRequest()
        self.recognitionRequest?.shouldReportPartialResults = true
        guard let request = self.recognitionRequest else { return }
        // Lazily create the engine here so app launch doesn't grab audio resources / trigger Bluetooth HFP.
        if self.audioEngine == nil {
            self.audioEngine = AVAudioEngine()
        }
        guard let audioEngine = self.audioEngine else { return }
        let input = audioEngine.inputNode
        let format = input.outputFormat(forBus: 0)
        if self.tapInstalled {
            input.removeTap(onBus: 0)
            self.tapInstalled = false
        }
        // Pipe raw mic buffers into the Speech request while the chord is held.
        input.installTap(onBus: 0, bufferSize: 2048, format: format) { [weak request] buffer, _ in
            request?.append(buffer)
        }
        self.tapInstalled = true
        audioEngine.prepare()
        try audioEngine.start()
        self.recognitionTask = recognizer.recognitionTask(with: request) { [weak self] result, error in
            guard let self else { return }
            if let error {
                self.logger.debug("push-to-talk error: \(error.localizedDescription, privacy: .public)")
            }
            let transcript = result?.bestTranscription.formattedString
            let isFinal = result?.isFinal ?? false
            // Hop to a Task so UI updates stay off the Speech callback thread.
            Task.detached { [weak self, transcript, isFinal, sessionID] in
                guard let self else { return }
                await self.handle(transcript: transcript, isFinal: isFinal, sessionID: sessionID)
            }
        }
    }
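
    /// Merges a recognizer callback into the committed/volatile transcript
    /// state and pushes the updated text to the overlay, dropping callbacks
    /// that belong to an older session.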
    private func handle(transcript: String?, isFinal: Bool, sessionID: UUID) async {
        guard sessionID == self.sessionID else {
            self.logger.debug("push-to-talk drop transcript for stale session")
            return
        }
        guard let transcript else { return }
        if isFinal {
            self.committed = transcript
            self.volatile = ""
        } else {
            self.volatile = Self.delta(after: self.committed, current: transcript)
        }
        let committedWithPrefix = Self.join(self.adoptedPrefix, self.committed)
        let snapshot = Self.join(committedWithPrefix, self.volatile)
        let attributed = Self.makeAttributed(committed: committedWithPrefix, volatile: self.volatile, isFinal: isFinal)
        if let token = self.overlayToken {
            await MainActor.run {
                VoiceSessionCoordinator.shared.updatePartial(
                    token: token,
                    text: snapshot,
                    attributed: attributed)
            }
        }
    }
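
    /// Tears down the capture exactly once: hands the final text to the session
    /// coordinator (or forwards it directly when no overlay session exists),
    /// releases the audio engine, resets state, and resumes the wake-word runtime.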
    private func finalize(transcriptOverride: String?, reason: String, sessionID: UUID?) async {
        if self.finalized { return }
        if let sessionID, sessionID != self.sessionID {
            self.logger.debug("push-to-talk drop finalize for stale session")
            return
        }
        self.finalized = true
        self.isCapturing = false
        self.timeoutTask?.cancel()
        self.timeoutTask = nil
        let finalRecognized: String = {
            if let override = transcriptOverride?.trimmingCharacters(in: .whitespacesAndNewlines) {
                return override
            }
            return (self.committed + self.volatile).trimmingCharacters(in: .whitespacesAndNewlines)
        }()
        let finalText = Self.join(self.adoptedPrefix, finalRecognized)
        let chime: VoiceWakeChime = finalText.isEmpty ? .none : (self.activeConfig?.sendChime ?? .none)
        let token = self.overlayToken
        let logger = self.logger
        await MainActor.run {
            logger.info("ptt finalize reason=\(reason, privacy: .public) len=\(finalText.count, privacy: .public)")
            if let token {
                VoiceSessionCoordinator.shared.finalize(
                    token: token,
                    text: finalText,
                    sendChime: chime,
                    autoSendAfter: nil)
                VoiceSessionCoordinator.shared.sendNow(token: token, reason: reason)
            } else if !finalText.isEmpty {
                if chime != .none {
                    VoiceWakeChimePlayer.play(chime, reason: "ptt.fallback_send")
                }
                Task.detached {
                    await VoiceWakeForwarder.forward(transcript: finalText)
                }
            }
        }
        self.recognitionTask?.cancel()
        self.recognitionRequest = nil
        self.recognitionTask = nil
        if self.tapInstalled {
            self.audioEngine?.inputNode.removeTap(onBus: 0)
            self.tapInstalled = false
        }
        if self.audioEngine?.isRunning == true {
            self.audioEngine?.stop()
            self.audioEngine?.reset()
        }
        // Release the engine so we also release any audio session/resources when push-to-talk ends.
        self.audioEngine = nil
        self.committed = ""
        self.volatile = ""
        self.activeConfig = nil
        self.triggerChimePlayed = false
        self.overlayToken = nil
        self.adoptedPrefix = ""
        // Resume the wake-word runtime after push-to-talk finishes.
        await VoiceWakeRuntime.shared.applyPushToTalkCooldown()
        _ = await MainActor.run { Task { await VoiceWakeRuntime.shared.refresh(state: AppStateStore.shared) } }
    }

    @MainActor
    private func makeConfig() -> Config {
        let state = AppStateStore.shared
        return Config(
            micID: state.voiceWakeMicID.isEmpty ? nil : state.voiceWakeMicID,
            localeID: state.voiceWakeLocaleID,
            triggerChime: state.voiceWakeTriggerChime,
            sendChime: state.voiceWakeSendChime)
    }

    // MARK: - Test helpers

    static func _testDelta(committed: String, current: String) -> String {
        self.delta(after: committed, current: current)
    }

    static func _testAttributedColors(isFinal: Bool) -> (NSColor, NSColor) {
        let sample = self.makeAttributed(committed: "a", volatile: "b", isFinal: isFinal)
        let committedColor = sample.attribute(.foregroundColor, at: 0, effectiveRange: nil) as? NSColor ?? .clear
        let volatileColor = sample.attribute(.foregroundColor, at: 1, effectiveRange: nil) as? NSColor ?? .clear
        return (committedColor, volatileColor)
    }
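
    /// Joins two fragments with a single space, skipping the space when either side is empty.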
    private static func join(_ prefix: String, _ suffix: String) -> String {
        if prefix.isEmpty { return suffix }
        if suffix.isEmpty { return prefix }
        return "\(prefix) \(suffix)"
    }
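
    /// Returns the suffix of `current` that extends `committed`, or all of
    /// `current` when the recognizer rewrote earlier text, e.g.
    /// delta(after: "hello", current: "hello world") == " world".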
    private static func delta(after committed: String, current: String) -> String {
        if current.hasPrefix(committed) {
            let start = current.index(current.startIndex, offsetBy: committed.count)
            return String(current[start...])
        }
        return current
    }
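
    /// Renders committed text in the primary label color and, until the result
    /// is final, volatile text in a dimmed tertiary color.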
    private static func makeAttributed(committed: String, volatile: String, isFinal: Bool) -> NSAttributedString {
        let full = NSMutableAttributedString()
        let committedAttr: [NSAttributedString.Key: Any] = [
            .foregroundColor: NSColor.labelColor,
            .font: NSFont.systemFont(ofSize: 13, weight: .regular),
        ]
        full.append(NSAttributedString(string: committed, attributes: committedAttr))
        let volatileColor: NSColor = isFinal ? .labelColor : NSColor.tertiaryLabelColor
        let volatileAttr: [NSAttributedString.Key: Any] = [
            .foregroundColor: volatileColor,
            .font: NSFont.systemFont(ofSize: 13, weight: .regular),
        ]
        full.append(NSAttributedString(string: volatile, attributes: volatileAttr))
        return full
    }
}