feat(mac): add push-to-talk hotkey

This commit is contained in:
Peter Steinberger
2025-12-08 17:23:44 +01:00
parent a34ab1d36e
commit 0317eec10d
7 changed files with 297 additions and 0 deletions

View File

@@ -97,6 +97,10 @@ final class AppState: ObservableObject {
didSet { UserDefaults.standard.set(self.voiceWakeForwardCommand, forKey: voiceWakeForwardCommandKey) }
}
/// Whether the "Hold Cmd+Fn to talk" push-to-talk hotkey is enabled.
/// Each assignment is written to `UserDefaults.standard` under `voicePushToTalkEnabledKey`.
@Published var voicePushToTalkEnabled: Bool {
didSet { UserDefaults.standard.set(self.voicePushToTalkEnabled, forKey: voicePushToTalkEnabledKey) }
}
@Published var isWorking: Bool = false
@Published var earBoostActive: Bool = false
@Published var heartbeatsEnabled: Bool {
@@ -158,6 +162,9 @@ final class AppState: ObservableObject {
.string(forKey: voiceWakeForwardTargetKey) ?? legacyTarget
self.voiceWakeForwardIdentity = UserDefaults.standard.string(forKey: voiceWakeForwardIdentityKey) ?? ""
self.voicePushToTalkEnabled = UserDefaults.standard
.object(forKey: voicePushToTalkEnabledKey) as? Bool ?? false
var storedForwardCommand = UserDefaults.standard
.string(forKey: voiceWakeForwardCommandKey) ?? defaultVoiceWakeForwardCommand
// Guard against older prefs missing flags; the forwarder depends on these for replies.

View File

@@ -20,6 +20,7 @@ let voiceWakeForwardUserKey = "clawdis.voiceWakeForwardUser"
let voiceWakeForwardPortKey = "clawdis.voiceWakeForwardPort"
let voiceWakeForwardIdentityKey = "clawdis.voiceWakeForwardIdentity"
let voiceWakeForwardCommandKey = "clawdis.voiceWakeForwardCommand"
let voicePushToTalkEnabledKey = "clawdis.voicePushToTalkEnabled"
let connectionModeKey = "clawdis.connectionMode"
let remoteTargetKey = "clawdis.remoteTarget"
let remoteIdentityKey = "clawdis.remoteIdentity"

View File

@@ -92,6 +92,12 @@ private struct MenuContent: View {
await self.loadMicrophones(force: true)
}
}
.task {
VoicePushToTalkHotkey.shared.setEnabled(voiceWakeSupported && self.state.voicePushToTalkEnabled)
}
.onChange(of: self.state.voicePushToTalkEnabled) { _, enabled in
VoicePushToTalkHotkey.shared.setEnabled(voiceWakeSupported && enabled)
}
}
private func open(tab: SettingsTab) {

View File

@@ -0,0 +1,249 @@
import AppKit
import AVFoundation
import OSLog
import Speech
/// Observes Cmd+Fn and starts a push-to-talk capture while both are held.
///
/// The chord is tracked from `.flagsChanged` events delivered by a global `NSEvent` monitor.
/// Pressing the chord enqueues `VoicePushToTalk.begin()`; releasing either key (or disabling the
/// hotkey while the chord is held) enqueues `VoicePushToTalk.end()`. Transitions are chained so they
/// reach the recognizer strictly in the order the key events occurred.
///
/// - Note: A global monitor only observes events destined for other applications, so the chord is
///   not seen while one of this app's own windows is handling key events.
@MainActor
final class VoicePushToTalkHotkey {
    static let shared = VoicePushToTalkHotkey()

    /// Hardware key codes reported by `.flagsChanged` for the keys that form the chord.
    private enum KeyCode {
        static let function: UInt16 = 63
        static let leftCommand: UInt16 = 55
        static let rightCommand: UInt16 = 54
    }

    /// A capture state change requested by the hotkey.
    private enum Transition {
        case begin
        case end
    }

    private var monitor: Any?
    private var fnDown = false
    private var commandDown = false
    private var active = false
    /// Tail of the begin/end chain; each new transition waits for this one to finish first.
    private var pendingTransition: Task<Void, Never>?

    /// Installs the global modifier monitor when `enabled` is true, otherwise removes it.
    ///
    /// Disabling while the chord is held also ends the capture that is in flight.
    func setEnabled(_ enabled: Bool) {
        if enabled {
            self.startMonitoring()
        } else {
            self.stopMonitoring()
        }
    }

    /// Installs the monitor once; repeated calls while a monitor exists are no-ops.
    private func startMonitoring() {
        guard self.monitor == nil else { return }
        // Listen-only global monitor; Fn only surfaces on .flagsChanged and cannot be registered as a hotkey.
        self.monitor = NSEvent.addGlobalMonitorForEvents(matching: .flagsChanged) { [weak self] event in
            guard let self else { return }
            self.updateModifierState(from: event)
        }
    }

    /// Removes the monitor, resets the tracked key state and ends any capture started by the chord.
    private func stopMonitoring() {
        if let monitor {
            NSEvent.removeMonitor(monitor)
            self.monitor = nil
        }
        let wasActive = self.active
        self.fnDown = false
        self.commandDown = false
        self.active = false
        // Without the monitor the key release will never be observed, so finish the capture now;
        // otherwise the recognizer would keep recording after the hotkey was disabled.
        if wasActive {
            self.enqueue(.end)
        }
    }

    /// Updates the tracked Fn/Command state from a `.flagsChanged` event and begins or ends the
    /// capture when the chord becomes complete or is broken.
    private func updateModifierState(from event: NSEvent) {
        switch event.keyCode {
        case KeyCode.function:
            self.fnDown = event.modifierFlags.contains(.function)
        case KeyCode.leftCommand, KeyCode.rightCommand:
            self.commandDown = event.modifierFlags.contains(.command)
        default:
            break
        }

        // Walkie-talkie chord is live only while both keys stay down.
        let chordActive = self.fnDown && self.commandDown
        guard chordActive != self.active else { return }
        self.active = chordActive
        self.enqueue(chordActive ? .begin : .end)
    }

    /// Runs `transition` after every previously enqueued transition has completed.
    ///
    /// `VoicePushToTalk.begin()` suspends before it marks itself as capturing. If `end()` were
    /// started as an independent task it could run during that suspension, find nothing to stop,
    /// and leave the capture running after the keys were released. Chaining keeps the order intact.
    private func enqueue(_ transition: Transition) {
        let previous = self.pendingTransition
        self.pendingTransition = Task {
            _ = await previous?.value
            switch transition {
            case .begin:
                await VoicePushToTalk.shared.begin()
            case .end:
                await VoicePushToTalk.shared.end()
            }
        }
    }
}
/// Short-lived speech recognizer that records while the hotkey is held.
///
/// `begin()` pauses the wake-word runtime, shows the overlay and streams microphone buffers into a
/// speech recognition request. `end()` tears the capture down, hands the transcript to the overlay
/// and asks the wake-word runtime to refresh. Because actors are reentrant, both methods re-validate
/// their state after every suspension point.
actor VoicePushToTalk {
    static let shared = VoicePushToTalk()

    private static let logger = Logger(subsystem: "com.steipete.clawdis", category: "voicewake.ptt")

    private var recognizer: SFSpeechRecognizer?
    private let audioEngine = AVAudioEngine()
    private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest?
    private var recognitionTask: SFSpeechRecognitionTask?
    /// Text from the last final recognition result.
    private var committed: String = ""
    /// Text from the latest partial result that extends `committed`.
    private var volatile: String = ""
    private var activeConfig: Config?
    private var isCapturing = false
    /// Incremented for every capture so results from an earlier capture can be told apart.
    private var sessionID = 0

    /// Settings snapshot taken on the main actor when a capture starts.
    private struct Config {
        /// Selected microphone ID, if any. Not read anywhere inside this type.
        let micID: String?
        let localeID: String?
        let forwardConfig: VoiceWakeForwardConfig
    }

    /// Starts a capture: checks permissions, pauses the wake-word runtime, shows an empty overlay
    /// and starts speech recognition.
    ///
    /// Does nothing when voice wake is unsupported, a capture is already running, or permissions
    /// are denied. If recognition cannot be started, the overlay is dismissed and the wake-word
    /// runtime is resumed.
    func begin() async {
        guard voiceWakeSupported else { return }
        guard !self.isCapturing else { return }

        // Ensure permissions up front.
        let granted = await PermissionManager.ensureVoiceWakePermissions(interactive: true)
        guard granted else { return }
        let config = await MainActor.run { self.makeConfig() }

        // Another begin() may have started a capture while this call was suspended above.
        guard !self.isCapturing else { return }

        self.sessionID += 1
        let session = self.sessionID
        self.activeConfig = config
        self.committed = ""
        self.volatile = ""
        self.isCapturing = true

        await VoiceWakeRuntime.shared.pauseForPushToTalk()
        await MainActor.run {
            VoiceWakeOverlayController.shared.showPartial(transcript: "")
        }

        // end() may have run during the suspensions above; it has already cleaned up and resumed
        // the wake-word runtime, so starting recognition now would leave a capture nobody stops.
        guard self.isCapturing, self.sessionID == session else { return }

        do {
            try self.startRecognition(localeID: config.localeID, session: session)
        } catch {
            Self.logger.debug("push-to-talk failed to start: \(error.localizedDescription, privacy: .public)")
            // Undo everything begin() set up: a half-installed tap, the cached config, the overlay
            // and the paused wake-word runtime.
            self.teardownRecognition()
            self.isCapturing = false
            self.activeConfig = nil
            await MainActor.run {
                VoiceWakeOverlayController.shared.dismiss()
            }
            await self.resumeWakeRuntime()
        }
    }

    /// Stops the running capture, presents the final transcript in the overlay and resumes the
    /// wake-word runtime. Does nothing when no capture is running.
    func end() async {
        guard self.isCapturing else { return }
        self.isCapturing = false
        self.teardownRecognition()

        // Snapshot and reset all session state before the first suspension so a begin() that
        // interleaves with the awaits below starts from a clean slate and is not clobbered.
        let finalText = (self.committed + self.volatile).trimmingCharacters(in: .whitespacesAndNewlines)
        let attributed = Self.makeAttributed(committed: self.committed, volatile: self.volatile, isFinal: true)
        let cachedForward = self.activeConfig?.forwardConfig
        self.committed = ""
        self.volatile = ""
        self.activeConfig = nil

        let forward: VoiceWakeForwardConfig
        if let cachedForward {
            forward = cachedForward
        } else {
            forward = await MainActor.run { AppStateStore.shared.voiceWakeForwardConfig }
        }

        await MainActor.run {
            VoiceWakeOverlayController.shared.presentFinal(
                transcript: finalText,
                forwardConfig: forward,
                delay: finalText.isEmpty ? 0.0 : 0.8,
                attributed: attributed)
        }

        // Resume the wake-word runtime after push-to-talk finishes.
        await self.resumeWakeRuntime()
    }

    // MARK: - Private

    /// Creates the recognizer and request, taps the microphone and starts the recognition task.
    ///
    /// - Parameters:
    ///   - localeID: Locale identifier for the recognizer; the current locale is used when `nil`.
    ///   - session: The capture this recognition belongs to; forwarded with every result.
    /// - Throws: An `NSError` in the `VoicePushToTalk` domain when the recognizer is unavailable or
    ///   no usable audio input exists, or whatever `AVAudioEngine.start()` throws.
    private func startRecognition(localeID: String?, session: Int) throws {
        let locale = localeID.map { Locale(identifier: $0) } ?? Locale.current
        self.recognizer = SFSpeechRecognizer(locale: locale)
        guard let recognizer, recognizer.isAvailable else {
            throw NSError(
                domain: "VoicePushToTalk",
                code: 1,
                userInfo: [NSLocalizedDescriptionKey: "Recognizer unavailable"])
        }

        let request = SFSpeechAudioBufferRecognitionRequest()
        request.shouldReportPartialResults = true
        self.recognitionRequest = request

        let input = self.audioEngine.inputNode
        let format = input.outputFormat(forBus: 0)
        // Installing a tap with an empty format (no input device) raises an Objective-C exception,
        // which cannot be caught from Swift; fail with a regular error instead.
        guard format.sampleRate > 0, format.channelCount > 0 else {
            throw NSError(
                domain: "VoicePushToTalk",
                code: 2,
                userInfo: [NSLocalizedDescriptionKey: "No audio input available"])
        }

        input.removeTap(onBus: 0)
        // Pipe raw mic buffers into the Speech request while the chord is held.
        input.installTap(onBus: 0, bufferSize: 2048, format: format) { [weak request] buffer, _ in
            request?.append(buffer)
        }
        self.audioEngine.prepare()
        try self.audioEngine.start()

        self.recognitionTask = recognizer.recognitionTask(with: request) { [weak self] result, error in
            if let error {
                VoicePushToTalk.logger.debug("push-to-talk error: \(error.localizedDescription, privacy: .public)")
            }
            // Extract plain values here so only Sendable data crosses into the actor.
            let transcript = result?.bestTranscription.formattedString
            let isFinal = result?.isFinal ?? false
            Task.detached { [weak self] in
                await self?.handle(transcript: transcript, isFinal: isFinal, session: session)
            }
        }
    }

    /// Cancels the recognition task, closes the request and stops feeding microphone audio.
    /// Safe to call when recognition was never (or only partially) started.
    private func teardownRecognition() {
        self.recognitionTask?.cancel()
        self.recognitionRequest?.endAudio()
        self.recognitionRequest = nil
        self.recognitionTask = nil
        self.audioEngine.inputNode.removeTap(onBus: 0)
        self.audioEngine.stop()
    }

    /// Asks the wake-word runtime to refresh from the current app state without waiting for it.
    private func resumeWakeRuntime() async {
        _ = await MainActor.run {
            Task {
                await VoiceWakeRuntime.shared.refresh(state: AppStateStore.shared)
            }
        }
    }

    /// Folds one recognition result into `committed`/`volatile` and updates the overlay.
    ///
    /// Results without a transcript, and results that arrive after their capture ended, are ignored
    /// so a late callback cannot re-show the overlay or leak text into the next capture.
    private func handle(transcript: String?, isFinal: Bool, session: Int) async {
        guard let transcript else { return }
        guard self.isCapturing, self.sessionID == session else { return }

        if isFinal {
            self.committed = transcript
            self.volatile = ""
        } else {
            self.volatile = Self.delta(after: self.committed, current: transcript)
        }

        let attributed = Self.makeAttributed(committed: self.committed, volatile: self.volatile, isFinal: isFinal)
        let snapshot = self.committed + self.volatile
        await MainActor.run {
            VoiceWakeOverlayController.shared.showPartial(transcript: snapshot, attributed: attributed)
        }
    }

    /// Reads the voice settings needed for one capture from the shared app state.
    @MainActor
    private func makeConfig() -> Config {
        let state = AppStateStore.shared
        return Config(
            micID: state.voiceWakeMicID.isEmpty ? nil : state.voiceWakeMicID,
            localeID: state.voiceWakeLocaleID,
            forwardConfig: state.voiceWakeForwardConfig)
    }

    // MARK: - Test helpers

    /// Exposes `delta(after:current:)` to tests.
    static func _testDelta(committed: String, current: String) -> String {
        self.delta(after: committed, current: current)
    }

    /// Returns the foreground colors of the committed and volatile segments for a sample string.
    static func _testAttributedColors(isFinal: Bool) -> (NSColor, NSColor) {
        let sample = self.makeAttributed(committed: "a", volatile: "b", isFinal: isFinal)
        let committedColor = sample.attribute(.foregroundColor, at: 0, effectiveRange: nil) as? NSColor ?? .clear
        let volatileColor = sample.attribute(.foregroundColor, at: 1, effectiveRange: nil) as? NSColor ?? .clear
        return (committedColor, volatileColor)
    }

    /// Returns the part of `current` that follows `committed`, or all of `current` when it does not
    /// start with `committed`.
    private static func delta(after committed: String, current: String) -> String {
        guard current.hasPrefix(committed) else { return current }
        return String(current.dropFirst(committed.count))
    }

    /// Builds the overlay text: committed text in the label color, volatile text in the secondary
    /// label color until the result is final.
    private static func makeAttributed(committed: String, volatile: String, isFinal: Bool) -> NSAttributedString {
        let volatileColor: NSColor = isFinal ? .labelColor : .secondaryLabelColor
        let full = NSMutableAttributedString(
            string: committed,
            attributes: [.foregroundColor: NSColor.labelColor])
        full.append(NSAttributedString(string: volatile, attributes: [.foregroundColor: volatileColor]))
        return full
    }
}

View File

@@ -288,6 +288,10 @@ actor VoiceWakeRuntime {
}
}
/// Stops this runtime by delegating to `stop()`.
/// `VoicePushToTalk.begin()` calls this before it starts its own recognition.
/// Nothing in this method restarts the runtime afterwards.
func pauseForPushToTalk() {
self.stop()
}
private func updateHeardBeyondTrigger(withTrimmed trimmed: String) {
if !self.heardBeyondTrigger, !trimmed.isEmpty {
self.heardBeyondTrigger = true

View File

@@ -47,6 +47,12 @@ struct VoiceWakeSettings: View {
binding: self.voiceWakeBinding)
.disabled(!voiceWakeSupported)
SettingsToggleRow(
title: "Hold Cmd+Fn to talk",
subtitle: "Push-to-talk mode that starts listening while you hold the hotkey and shows the preview overlay.",
binding: self.$state.voicePushToTalkEnabled)
.disabled(!voiceWakeSupported)
if !voiceWakeSupported {
Label("Voice Wake requires macOS 26 or newer.", systemImage: "exclamationmark.triangle.fill")
.font(.callout)

View File

@@ -0,0 +1,24 @@
import Testing
@testable import Clawdis
/// Covers the pure helpers of `VoicePushToTalk`: transcript delta computation and overlay coloring.
@Suite struct VoicePushToTalkTests {
    /// The already committed prefix is removed from the current transcript.
    @Test func deltaTrimsCommittedPrefix() {
        let remainder = VoicePushToTalk._testDelta(committed: "hello ", current: "hello world again")
        #expect(remainder == "world again")
    }

    /// When the transcript does not start with the committed text it is returned unchanged.
    @Test func deltaFallsBackWhenPrefixDiffers() {
        let remainder = VoicePushToTalk._testDelta(committed: "goodbye", current: "hello world")
        #expect(remainder == "hello world")
    }

    /// Partial results render the volatile segment in a different color than the committed one.
    @Test func attributedColorsDifferWhenNotFinal() {
        let (committedColor, volatileColor) = VoicePushToTalk._testAttributedColors(isFinal: false)
        #expect(committedColor != volatileColor)
    }

    /// Final results render both segments in the same color.
    @Test func attributedColorsMatchWhenFinal() {
        let (committedColor, volatileColor) = VoicePushToTalk._testAttributedColors(isFinal: true)
        #expect(committedColor == volatileColor)
    }
}