chore: rename project to clawdbot
This commit is contained in:
425
apps/macos/Sources/Clawdbot/VoicePushToTalk.swift
Normal file
425
apps/macos/Sources/Clawdbot/VoicePushToTalk.swift
Normal file
@@ -0,0 +1,425 @@
|
||||
import AppKit
|
||||
import AVFoundation
|
||||
import Dispatch
|
||||
import OSLog
|
||||
import Speech
|
||||
|
||||
/// Observes right Option and starts a push-to-talk capture while it is held.
///
/// Event monitors are installed, and all modifier state is mutated, on the main thread only.
/// The async begin/end actions run in unstructured tasks that are chained together so they
/// always execute in the order the key events arrived.
final class VoicePushToTalkHotkey: @unchecked Sendable {
    static let shared = VoicePushToTalkHotkey()

    /// Key code carried by `flagsChanged` events for the right Option key.
    private static let rightOptionKeyCode: UInt16 = 61
    /// Device-dependent modifier bits (`NX_DEVICELALTKEYMASK` / `NX_DEVICERALTKEYMASK` in IOKit's
    /// `IOLLEvent.h`) that tell the two physical Option keys apart.
    private static let leftOptionDeviceMask: UInt = 0x20
    private static let rightOptionDeviceMask: UInt = 0x40

    private let logger = Logger(subsystem: "com.clawdbot", category: "voicewake.ptt")

    private var globalMonitor: Any?
    private var localMonitor: Any?
    private var optionDown = false // right option only
    private var active = false
    /// Tail of the begin/end action chain; each new action awaits the previous one first.
    private var pendingAction: Task<Void, Never>?

    private let beginAction: @Sendable () async -> Void
    private let endAction: @Sendable () async -> Void

    /// Creates a hotkey observer.
    /// - Parameters:
    ///   - beginAction: Invoked when the right Option key goes down. Defaults to starting the shared capture.
    ///   - endAction: Invoked when the right Option key is released. Defaults to ending the shared capture.
    init(
        beginAction: @escaping @Sendable () async -> Void = { await VoicePushToTalk.shared.begin() },
        endAction: @escaping @Sendable () async -> Void = { await VoicePushToTalk.shared.end() })
    {
        self.beginAction = beginAction
        self.endAction = endAction
    }

    /// Installs or removes the event monitors. Does nothing while running under tests.
    func setEnabled(_ enabled: Bool) {
        if ProcessInfo.processInfo.isRunningTests { return }
        self.withMainThread { [weak self] in
            guard let self else { return }
            if enabled {
                self.startMonitoring()
            } else {
                self.stopMonitoring()
            }
        }
    }

    /// Registers the global and local `flagsChanged` monitors unless one is already installed.
    private func startMonitoring() {
        assert(Thread.isMainThread)
        guard self.globalMonitor == nil, self.localMonitor == nil else { return }
        // Listen-only global monitor; we rely on Input Monitoring permission to receive events.
        self.globalMonitor = NSEvent.addGlobalMonitorForEvents(matching: .flagsChanged) { [weak self] event in
            self?.handleFlagsChanged(keyCode: event.keyCode, modifierFlags: event.modifierFlags)
        }
        // Also listen locally so we still catch events when the app is active/focused.
        self.localMonitor = NSEvent.addLocalMonitorForEvents(matching: .flagsChanged) { [weak self] event in
            self?.handleFlagsChanged(keyCode: event.keyCode, modifierFlags: event.modifierFlags)
            return event
        }
    }

    /// Removes both monitors and resets the tracked key state.
    private func stopMonitoring() {
        assert(Thread.isMainThread)
        if let globalMonitor {
            NSEvent.removeMonitor(globalMonitor)
            self.globalMonitor = nil
        }
        if let localMonitor {
            NSEvent.removeMonitor(localMonitor)
            self.localMonitor = nil
        }
        // Once the monitors are gone the key release can no longer be observed, so a capture that is
        // still being held must be ended here or it would never receive its end action.
        let wasActive = self.active
        self.optionDown = false
        self.active = false
        if wasActive {
            self.logger.info("ptt hotkey up")
            self.enqueue(self.endAction)
        }
    }

    /// Forwards a modifier change to `updateModifierState` on the main thread.
    private func handleFlagsChanged(keyCode: UInt16, modifierFlags: NSEvent.ModifierFlags) {
        self.withMainThread { [weak self] in
            self?.updateModifierState(keyCode: keyCode, modifierFlags: modifierFlags)
        }
    }

    /// Runs `block` immediately when already on the main thread, otherwise dispatches it there asynchronously.
    private func withMainThread(_ block: @escaping @Sendable () -> Void) {
        if Thread.isMainThread {
            block()
        } else {
            DispatchQueue.main.async(execute: block)
        }
    }

    /// Updates the held state from a modifier event and fires the begin/end action on each transition.
    private func updateModifierState(keyCode: UInt16, modifierFlags: NSEvent.ModifierFlags) {
        assert(Thread.isMainThread)
        // Right Option (keyCode 61) acts as a hold-to-talk modifier.
        if keyCode == Self.rightOptionKeyCode {
            self.optionDown = Self.isRightOptionDown(in: modifierFlags)
        }

        let chordActive = self.optionDown
        guard chordActive != self.active else { return }
        self.active = chordActive
        if chordActive {
            self.logger.info("ptt hotkey down")
            self.enqueue(self.beginAction)
        } else {
            self.logger.info("ptt hotkey up")
            self.enqueue(self.endAction)
        }
    }

    /// Reports whether the right Option key is down according to `flags`.
    ///
    /// The device-independent `.option` flag stays set while *either* Option key is held, so releasing
    /// right Option with left Option still down would look like "still pressed". When the
    /// device-dependent left/right bits are present they are authoritative; flags without them
    /// (for example values synthesized by callers) fall back to `.option`.
    private static func isRightOptionDown(in flags: NSEvent.ModifierFlags) -> Bool {
        let raw = flags.rawValue
        if raw & (Self.leftOptionDeviceMask | Self.rightOptionDeviceMask) != 0 {
            return raw & Self.rightOptionDeviceMask != 0
        }
        return flags.contains(.option)
    }

    /// Appends `action` to the serialized action chain.
    ///
    /// Independent unstructured tasks have no ordering guarantee, so a quick tap could run the end
    /// action before the begin action had finished starting the capture, and the release would be
    /// lost. Awaiting the previous task first keeps the actions in key-event order.
    private func enqueue(_ action: @escaping @Sendable () async -> Void) {
        assert(Thread.isMainThread)
        let previous = self.pendingAction
        self.pendingAction = Task {
            await previous?.value
            await action()
        }
    }

    /// Test hook that feeds a modifier change straight into the state machine.
    func _testUpdateModifierState(keyCode: UInt16, modifierFlags: NSEvent.ModifierFlags) {
        self.updateModifierState(keyCode: keyCode, modifierFlags: modifierFlags)
    }
}
|
||||
|
||||
/// Short-lived speech recognizer that records while the hotkey is held.
///
/// `begin()` starts a capture session, `end()` stops feeding audio and schedules finalization, and
/// `finalize` hands the transcript to `VoiceSessionCoordinator` and releases the audio resources.
actor VoicePushToTalk {
    static let shared = VoicePushToTalk()

    private let logger = Logger(subsystem: "com.clawdbot", category: "voicewake.ptt")

    private var recognizer: SFSpeechRecognizer?
    // Lazily created on begin() to avoid creating an AVAudioEngine at app launch, which can switch Bluetooth
    // headphones into the low-quality headset profile even if push-to-talk is never used.
    private var audioEngine: AVAudioEngine?
    private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest?
    private var recognitionTask: SFSpeechRecognitionTask?
    private var tapInstalled = false

    // Session token used to drop stale callbacks when a new capture starts.
    private var sessionID = UUID()

    private var committed: String = ""
    private var volatile: String = ""
    private var activeConfig: Config?
    private var isCapturing = false
    private var triggerChimePlayed = false
    private var finalized = false
    private var timeoutTask: Task<Void, Never>?
    private var overlayToken: UUID?
    private var adoptedPrefix: String = ""

    /// Settings snapshot taken from `AppStateStore` when a capture begins.
    private struct Config {
        let micID: String?
        let localeID: String?
        let triggerChime: VoiceWakeChime
        let sendChime: VoiceWakeChime
    }

    /// Starts a capture session: checks permissions, adopts any visible overlay text as a prefix,
    /// pauses the wake-word runtime, opens an overlay session and starts speech recognition.
    ///
    /// Does nothing when voice wake is unsupported, a capture is already running, or permissions
    /// are not granted.
    func begin() async {
        guard voiceWakeSupported else { return }
        guard !self.isCapturing else { return }

        // Start a fresh session and invalidate any in-flight callbacks tied to an older one.
        let sessionID = UUID()
        self.sessionID = sessionID

        // Ensure permissions up front.
        let granted = await PermissionManager.ensureVoiceWakePermissions(interactive: true)
        guard granted else { return }

        let config = await MainActor.run { self.makeConfig() }
        self.activeConfig = config
        self.isCapturing = true
        self.triggerChimePlayed = false
        self.finalized = false
        self.timeoutTask?.cancel(); self.timeoutTask = nil
        let snapshot = await MainActor.run { VoiceSessionCoordinator.shared.snapshot() }
        self.adoptedPrefix = snapshot.visible ? snapshot.text.trimmingCharacters(in: .whitespacesAndNewlines) : ""
        self.logger.info("ptt begin adopted_prefix_len=\(self.adoptedPrefix.count, privacy: .public)")
        if config.triggerChime != .none {
            self.triggerChimePlayed = true
            await MainActor.run { VoiceWakeChimePlayer.play(config.triggerChime, reason: "ptt.trigger") }
        }
        // Pause the always-on wake word recognizer so both pipelines don't fight over the mic tap.
        await VoiceWakeRuntime.shared.pauseForPushToTalk()
        let adoptedPrefix = self.adoptedPrefix
        let adoptedAttributed: NSAttributedString? = adoptedPrefix.isEmpty ? nil : Self.makeAttributed(
            committed: adoptedPrefix,
            volatile: "",
            isFinal: false)
        self.overlayToken = await MainActor.run {
            VoiceSessionCoordinator.shared.startSession(
                source: .pushToTalk,
                text: adoptedPrefix,
                attributed: adoptedAttributed,
                forwardEnabled: true)
        }

        do {
            try await self.startRecognition(localeID: config.localeID, sessionID: sessionID)
        } catch {
            // Record why the capture could not start instead of discarding the error, and release
            // whatever startRecognition had already set up (tap, request, engine) before any
            // suspension point so no half-started pipeline is left behind.
            self.logger.error("ptt failed to start: \(error.localizedDescription, privacy: .public)")
            self.isCapturing = false
            self.teardownRecognition()
            await MainActor.run {
                VoiceWakeOverlayController.shared.dismiss()
            }
            // If push-to-talk fails to start after pausing wake-word, ensure we resume listening.
            await VoiceWakeRuntime.shared.applyPushToTalkCooldown()
            await VoiceWakeRuntime.shared.refresh(state: AppStateStore.shared)
        }
    }

    /// Ends the capture when the hotkey is released.
    ///
    /// Finalizes immediately when nothing was captured; otherwise waits up to 1.5 s for finalization.
    func end() async {
        guard self.isCapturing else { return }
        self.isCapturing = false
        let sessionID = self.sessionID

        // Stop feeding Speech buffers first, then end the request. Stopping the engine here can race with
        // Speech draining its converter chain (and we already stop/cancel in finalize).
        if self.tapInstalled {
            self.audioEngine?.inputNode.removeTap(onBus: 0)
            self.tapInstalled = false
        }
        self.recognitionRequest?.endAudio()

        // If we captured nothing, dismiss immediately when the user lets go.
        if self.committed.isEmpty, self.volatile.isEmpty, self.adoptedPrefix.isEmpty {
            await self.finalize(transcriptOverride: "", reason: "emptyOnRelease", sessionID: sessionID)
            return
        }

        // Otherwise, give Speech a brief window to deliver the final result; then fall back.
        self.timeoutTask?.cancel()
        self.timeoutTask = Task { [weak self] in
            try? await Task.sleep(nanoseconds: 1_500_000_000) // 1.5s grace period to await final result
            await self?.finalize(transcriptOverride: nil, reason: "timeout", sessionID: sessionID)
        }
    }

    // MARK: - Private

    /// Creates the recognizer, request and audio tap, starts the engine and begins recognition.
    /// - Parameters:
    ///   - localeID: Locale identifier for the recognizer; the current locale is used when `nil`.
    ///   - sessionID: Token attached to every recognition callback so stale ones can be dropped.
    /// - Throws: An `NSError` in the `VoicePushToTalk` domain when the recognizer is unavailable
    ///   (code 1) or the audio input reports no usable format (code 2), or whatever error
    ///   `AVAudioEngine.start()` throws. The caller is expected to tear down partial state.
    private func startRecognition(localeID: String?, sessionID: UUID) async throws {
        let locale = localeID.flatMap { Locale(identifier: $0) } ?? Locale(identifier: Locale.current.identifier)
        self.recognizer = SFSpeechRecognizer(locale: locale)
        guard let recognizer, recognizer.isAvailable else {
            throw NSError(
                domain: "VoicePushToTalk",
                code: 1,
                userInfo: [NSLocalizedDescriptionKey: "Recognizer unavailable"])
        }

        let request = SFSpeechAudioBufferRecognitionRequest()
        request.shouldReportPartialResults = true
        self.recognitionRequest = request

        // Lazily create the engine here so app launch doesn't grab audio resources / trigger Bluetooth HFP.
        let audioEngine = self.audioEngine ?? AVAudioEngine()
        self.audioEngine = audioEngine

        let input = audioEngine.inputNode
        let format = input.outputFormat(forBus: 0)
        // With no usable input device the node reports a 0 Hz / 0 channel format. Installing a tap
        // with that format raises an Objective-C exception, which Swift cannot catch, so fail early.
        guard format.sampleRate > 0, format.channelCount > 0 else {
            throw NSError(
                domain: "VoicePushToTalk",
                code: 2,
                userInfo: [NSLocalizedDescriptionKey: "Audio input unavailable"])
        }
        if self.tapInstalled {
            input.removeTap(onBus: 0)
            self.tapInstalled = false
        }
        // Pipe raw mic buffers into the Speech request while the chord is held.
        input.installTap(onBus: 0, bufferSize: 2048, format: format) { [weak request] buffer, _ in
            request?.append(buffer)
        }
        self.tapInstalled = true

        audioEngine.prepare()
        try audioEngine.start()

        self.recognitionTask = recognizer.recognitionTask(with: request) { [weak self] result, error in
            guard let self else { return }
            if let error {
                self.logger.debug("push-to-talk error: \(error.localizedDescription, privacy: .public)")
            }
            let transcript = result?.bestTranscription.formattedString
            let isFinal = result?.isFinal ?? false
            // Hop to a Task so UI updates stay off the Speech callback thread.
            Task.detached { [weak self, transcript, isFinal, sessionID] in
                guard let self else { return }
                await self.handle(transcript: transcript, isFinal: isFinal, sessionID: sessionID)
            }
        }
    }

    /// Applies a recognition update to the committed/volatile text and pushes it to the overlay.
    /// Updates from a session other than the current one are dropped.
    private func handle(transcript: String?, isFinal: Bool, sessionID: UUID) async {
        guard sessionID == self.sessionID else {
            self.logger.debug("push-to-talk drop transcript for stale session")
            return
        }
        guard let transcript else { return }
        if isFinal {
            self.committed = transcript
            self.volatile = ""
        } else {
            self.volatile = Self.delta(after: self.committed, current: transcript)
        }

        let committedWithPrefix = Self.join(self.adoptedPrefix, self.committed)
        let snapshot = Self.join(committedWithPrefix, self.volatile)
        // `join` may insert a separating space before the volatile text. Derive the volatile segment
        // from the joined text so the attributed string reads exactly like the plain text; passing
        // `self.volatile` directly would render "prefixhello" next to the text "prefix hello".
        let volatileSegment = String(snapshot.dropFirst(committedWithPrefix.count))
        let attributed = Self.makeAttributed(
            committed: committedWithPrefix,
            volatile: volatileSegment,
            isFinal: isFinal)
        if let token = self.overlayToken {
            await MainActor.run {
                VoiceSessionCoordinator.shared.updatePartial(
                    token: token,
                    text: snapshot,
                    attributed: attributed)
            }
        }
    }

    /// Completes the session once: delivers the final text, releases audio resources, resets the
    /// per-session state and resumes the wake-word runtime.
    /// - Parameters:
    ///   - transcriptOverride: Text to use instead of the recognized transcript; `nil` uses what was recognized.
    ///   - reason: Short tag written to the log and passed to `sendNow`.
    ///   - sessionID: When non-nil and different from the current session, the call is ignored as stale.
    private func finalize(transcriptOverride: String?, reason: String, sessionID: UUID?) async {
        if self.finalized { return }
        if let sessionID, sessionID != self.sessionID {
            self.logger.debug("push-to-talk drop finalize for stale session")
            return
        }
        self.finalized = true
        self.isCapturing = false
        self.timeoutTask?.cancel(); self.timeoutTask = nil

        let finalRecognized = (transcriptOverride ?? (self.committed + self.volatile))
            .trimmingCharacters(in: .whitespacesAndNewlines)
        let finalText = Self.join(self.adoptedPrefix, finalRecognized)
        let chime = finalText.isEmpty ? .none : (self.activeConfig?.sendChime ?? .none)

        let token = self.overlayToken
        let logger = self.logger
        await MainActor.run {
            logger.info("ptt finalize reason=\(reason, privacy: .public) len=\(finalText.count, privacy: .public)")
            if let token {
                VoiceSessionCoordinator.shared.finalize(
                    token: token,
                    text: finalText,
                    sendChime: chime,
                    autoSendAfter: nil)
                VoiceSessionCoordinator.shared.sendNow(token: token, reason: reason)
            } else if !finalText.isEmpty {
                // No overlay session to send through: play the chime and forward directly.
                if chime != .none {
                    VoiceWakeChimePlayer.play(chime, reason: "ptt.fallback_send")
                }
                Task.detached {
                    await VoiceWakeForwarder.forward(transcript: finalText)
                }
            }
        }

        self.teardownRecognition()

        self.committed = ""
        self.volatile = ""
        self.activeConfig = nil
        self.triggerChimePlayed = false
        self.overlayToken = nil
        self.adoptedPrefix = ""

        // Resume the wake-word runtime after push-to-talk finishes.
        await VoiceWakeRuntime.shared.applyPushToTalkCooldown()
        _ = await MainActor.run { Task { await VoiceWakeRuntime.shared.refresh(state: AppStateStore.shared) } }
    }

    /// Cancels recognition and releases the tap and engine. Safe to call when nothing was started.
    private func teardownRecognition() {
        self.recognitionTask?.cancel()
        self.recognitionRequest = nil
        self.recognitionTask = nil
        if self.tapInstalled {
            self.audioEngine?.inputNode.removeTap(onBus: 0)
            self.tapInstalled = false
        }
        if self.audioEngine?.isRunning == true {
            self.audioEngine?.stop()
            self.audioEngine?.reset()
        }
        // Release the engine so we also release any audio session/resources when push-to-talk ends.
        self.audioEngine = nil
    }

    /// Reads the current voice settings from `AppStateStore`; an empty mic ID becomes `nil`.
    @MainActor
    private func makeConfig() -> Config {
        let state = AppStateStore.shared
        return Config(
            micID: state.voiceWakeMicID.isEmpty ? nil : state.voiceWakeMicID,
            localeID: state.voiceWakeLocaleID,
            triggerChime: state.voiceWakeTriggerChime,
            sendChime: state.voiceWakeSendChime)
    }

    // MARK: - Test helpers

    /// Test hook exposing `delta(after:current:)`.
    static func _testDelta(committed: String, current: String) -> String {
        self.delta(after: committed, current: current)
    }

    /// Test hook returning the foreground colors of the committed and volatile segments.
    static func _testAttributedColors(isFinal: Bool) -> (NSColor, NSColor) {
        let sample = self.makeAttributed(committed: "a", volatile: "b", isFinal: isFinal)
        let committedColor = sample.attribute(.foregroundColor, at: 0, effectiveRange: nil) as? NSColor ?? .clear
        let volatileColor = sample.attribute(.foregroundColor, at: 1, effectiveRange: nil) as? NSColor ?? .clear
        return (committedColor, volatileColor)
    }

    /// Joins two fragments with a single space, returning the other one unchanged when either is empty.
    private static func join(_ prefix: String, _ suffix: String) -> String {
        if prefix.isEmpty { return suffix }
        if suffix.isEmpty { return prefix }
        return "\(prefix) \(suffix)"
    }

    /// Returns the part of `current` that follows `committed`, or all of `current` when it does not
    /// start with `committed`.
    private static func delta(after committed: String, current: String) -> String {
        guard current.hasPrefix(committed) else { return current }
        // dropFirst clamps at the end of the string, so no index arithmetic can run out of bounds.
        return String(current.dropFirst(committed.count))
    }

    /// Builds the overlay text: committed text in the label color followed by volatile text, which
    /// uses the tertiary label color until the result is final.
    private static func makeAttributed(committed: String, volatile: String, isFinal: Bool) -> NSAttributedString {
        let font = NSFont.systemFont(ofSize: 13, weight: .regular)
        let volatileColor: NSColor = isFinal ? .labelColor : NSColor.tertiaryLabelColor
        let committedAttr: [NSAttributedString.Key: Any] = [
            .foregroundColor: NSColor.labelColor,
            .font: font,
        ]
        let volatileAttr: [NSAttributedString.Key: Any] = [
            .foregroundColor: volatileColor,
            .font: font,
        ]
        let full = NSMutableAttributedString()
        full.append(NSAttributedString(string: committed, attributes: committedAttr))
        full.append(NSAttributedString(string: volatile, attributes: volatileAttr))
        return full
    }
}
|
||||
Reference in New Issue
Block a user