From 0317eec10d6d2fc24dfad61c96a9a2dce0f3c54d Mon Sep 17 00:00:00 2001
From: Peter Steinberger
Date: Mon, 8 Dec 2025 17:23:44 +0100
Subject: [PATCH] feat(mac): add push-to-talk hotkey

---
 apps/macos/Sources/Clawdis/AppState.swift     |   7 +
 apps/macos/Sources/Clawdis/Constants.swift    |   1 +
 apps/macos/Sources/Clawdis/MenuBar.swift      |   6 +
 .../Sources/Clawdis/VoicePushToTalk.swift     | 335 ++++++++++++++++++
 .../Sources/Clawdis/VoiceWakeRuntime.swift    |   4 +
 .../Sources/Clawdis/VoiceWakeSettings.swift   |   6 +
 .../VoicePushToTalkTests.swift                |  24 ++
 7 files changed, 383 insertions(+)
 create mode 100644 apps/macos/Sources/Clawdis/VoicePushToTalk.swift
 create mode 100644 apps/macos/Tests/ClawdisIPCTests/VoicePushToTalkTests.swift

diff --git a/apps/macos/Sources/Clawdis/AppState.swift b/apps/macos/Sources/Clawdis/AppState.swift
index 7e049065f..4706b5971 100644
--- a/apps/macos/Sources/Clawdis/AppState.swift
+++ b/apps/macos/Sources/Clawdis/AppState.swift
@@ -97,6 +97,10 @@ final class AppState: ObservableObject {
         didSet { UserDefaults.standard.set(self.voiceWakeForwardCommand, forKey: voiceWakeForwardCommandKey) }
     }
 
+    @Published var voicePushToTalkEnabled: Bool {
+        didSet { UserDefaults.standard.set(self.voicePushToTalkEnabled, forKey: voicePushToTalkEnabledKey) }
+    }
+
     @Published var isWorking: Bool = false
     @Published var earBoostActive: Bool = false
     @Published var heartbeatsEnabled: Bool {
@@ -158,6 +162,9 @@ final class AppState: ObservableObject {
             .string(forKey: voiceWakeForwardTargetKey) ?? legacyTarget
         self.voiceWakeForwardIdentity = UserDefaults.standard.string(forKey: voiceWakeForwardIdentityKey) ?? ""
 
+        self.voicePushToTalkEnabled = UserDefaults.standard
+            .object(forKey: voicePushToTalkEnabledKey) as? Bool ?? false
+
         var storedForwardCommand = UserDefaults.standard
             .string(forKey: voiceWakeForwardCommandKey) ?? defaultVoiceWakeForwardCommand
         // Guard against older prefs missing flags; the forwarder depends on these for replies.
diff --git a/apps/macos/Sources/Clawdis/Constants.swift b/apps/macos/Sources/Clawdis/Constants.swift
index d88268549..eff3e7421 100644
--- a/apps/macos/Sources/Clawdis/Constants.swift
+++ b/apps/macos/Sources/Clawdis/Constants.swift
@@ -20,6 +20,7 @@ let voiceWakeForwardUserKey = "clawdis.voiceWakeForwardUser"
 let voiceWakeForwardPortKey = "clawdis.voiceWakeForwardPort"
 let voiceWakeForwardIdentityKey = "clawdis.voiceWakeForwardIdentity"
 let voiceWakeForwardCommandKey = "clawdis.voiceWakeForwardCommand"
+let voicePushToTalkEnabledKey = "clawdis.voicePushToTalkEnabled"
 let connectionModeKey = "clawdis.connectionMode"
 let remoteTargetKey = "clawdis.remoteTarget"
 let remoteIdentityKey = "clawdis.remoteIdentity"
diff --git a/apps/macos/Sources/Clawdis/MenuBar.swift b/apps/macos/Sources/Clawdis/MenuBar.swift
index 2554529de..14d7d119c 100644
--- a/apps/macos/Sources/Clawdis/MenuBar.swift
+++ b/apps/macos/Sources/Clawdis/MenuBar.swift
@@ -92,6 +92,12 @@ private struct MenuContent: View {
                 await self.loadMicrophones(force: true)
             }
         }
+        .task {
+            VoicePushToTalkHotkey.shared.setEnabled(voiceWakeSupported && self.state.voicePushToTalkEnabled)
+        }
+        .onChange(of: self.state.voicePushToTalkEnabled) { _, enabled in
+            VoicePushToTalkHotkey.shared.setEnabled(voiceWakeSupported && enabled)
+        }
     }
 
     private func open(tab: SettingsTab) {
diff --git a/apps/macos/Sources/Clawdis/VoicePushToTalk.swift b/apps/macos/Sources/Clawdis/VoicePushToTalk.swift
new file mode 100644
index 000000000..22948acb6
--- /dev/null
+++ b/apps/macos/Sources/Clawdis/VoicePushToTalk.swift
@@ -0,0 +1,335 @@
+import AppKit
+import AVFoundation
+import OSLog
+import Speech
+
+/// Observes Cmd+Fn and starts a push-to-talk capture while both are held.
+@MainActor
+final class VoicePushToTalkHotkey {
+    static let shared = VoicePushToTalkHotkey()
+
+    private var globalMonitor: Any?
+    private var localMonitor: Any?
+    private var fnDown = false
+    private var commandDown = false
+    private var active = false
+
+    /// Starts or stops observing the chord; calling it again with the same value is a no-op.
+    func setEnabled(_ enabled: Bool) {
+        if enabled {
+            self.startMonitoring()
+        } else {
+            self.stopMonitoring()
+        }
+    }
+
+    private func startMonitoring() {
+        guard self.globalMonitor == nil else { return }
+        // Listen-only global monitor; Fn only surfaces on .flagsChanged and cannot be registered as a hotkey.
+        self.globalMonitor = NSEvent.addGlobalMonitorForEvents(matching: .flagsChanged) { [weak self] event in
+            self?.updateModifierState(from: event)
+        }
+        // Global monitors never see events addressed to our own app, so also watch locally
+        // (otherwise the chord is dead while one of our windows is frontmost).
+        self.localMonitor = NSEvent.addLocalMonitorForEvents(matching: .flagsChanged) { [weak self] event in
+            self?.updateModifierState(from: event)
+            return event
+        }
+    }
+
+    private func stopMonitoring() {
+        if let globalMonitor = self.globalMonitor {
+            NSEvent.removeMonitor(globalMonitor)
+            self.globalMonitor = nil
+        }
+        if let localMonitor = self.localMonitor {
+            NSEvent.removeMonitor(localMonitor)
+            self.localMonitor = nil
+        }
+        self.fnDown = false
+        self.commandDown = false
+        // No key-up will reach us once the monitors are gone, so finish any capture in flight here.
+        self.setChordActive(false)
+    }
+
+    private func updateModifierState(from event: NSEvent) {
+        switch event.keyCode {
+        case 63: // Fn
+            self.fnDown = event.modifierFlags.contains(.function)
+        case 55, 54: // Left / Right command
+            self.commandDown = event.modifierFlags.contains(.command)
+        default:
+            break
+        }
+
+        // “Walkie-talkie” chord is live only while both keys stay down.
+        self.setChordActive(self.fnDown && self.commandDown)
+    }
+
+    /// Forwards chord edges (pressed / released) to the recorder; repeated identical states are ignored.
+    private func setChordActive(_ chordActive: Bool) {
+        guard chordActive != self.active else { return }
+        self.active = chordActive
+        Task {
+            if chordActive {
+                await VoicePushToTalk.shared.begin()
+            } else {
+                await VoicePushToTalk.shared.end()
+            }
+        }
+    }
+}
+
+/// Short-lived speech recognizer that records while the hotkey is held.
+actor VoicePushToTalk {
+    static let shared = VoicePushToTalk()
+
+    private static let logger = Logger(subsystem: "com.steipete.clawdis", category: "voicewake.ptt")
+
+    /// Lifecycle of one hold-to-talk session.
+    private enum Phase {
+        case idle
+        /// `begin()` is suspended on permissions or setup; nothing is recording yet.
+        case starting
+        case capturing
+    }
+
+    private var recognizer: SFSpeechRecognizer?
+    private var audioEngine = AVAudioEngine()
+    private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest?
+    private var recognitionTask: SFSpeechRecognitionTask?
+
+    private var committed: String = ""
+    private var volatile: String = ""
+    private var activeConfig: Config?
+    private var phase: Phase = .idle
+    /// Bumped by every `begin()` so suspended setup and late recognizer callbacks can detect they are stale.
+    private var sessionID = 0
+
+    private struct Config {
+        let micID: String?
+        let localeID: String?
+        let forwardConfig: VoiceWakeForwardConfig
+    }
+
+    /// Starts a capture session; ignored while another session is starting or capturing.
+    func begin() async {
+        guard voiceWakeSupported else { return }
+        guard self.phase == .idle else { return }
+
+        // Claim the session before the first suspension point: actors are re-entrant, so a second
+        // begin() or an early end() can interleave while we wait for permissions below.
+        self.phase = .starting
+        self.sessionID &+= 1
+        let session = self.sessionID
+
+        // Ensure permissions up front.
+        let granted = await PermissionManager.ensureVoiceWakePermissions(interactive: true)
+        guard self.isStarting(session) else { return }
+        guard granted else {
+            self.phase = .idle
+            return
+        }
+
+        let config = await MainActor.run { self.makeConfig() }
+        guard self.isStarting(session) else { return }
+        self.activeConfig = config
+
+        await VoiceWakeRuntime.shared.pauseForPushToTalk()
+        await MainActor.run {
+            VoiceWakeOverlayController.shared.showPartial(transcript: "")
+        }
+        // The chord may have been released while we were suspended above.
+        guard self.isStarting(session) else {
+            await self.abortStart(session: session)
+            return
+        }
+
+        do {
+            try self.startRecognition(localeID: config.localeID, session: session)
+            self.phase = .capturing
+        } catch {
+            Self.logger.error("push-to-talk failed to start: \(error.localizedDescription, privacy: .public)")
+            await self.abortStart(session: session)
+        }
+    }
+
+    /// Stops the session and hands the transcript to the overlay; no-op when nothing is in flight.
+    func end() async {
+        switch self.phase {
+        case .idle:
+            return
+        case .starting:
+            // begin() is still suspended; mark the session cancelled and let it unwind itself.
+            self.phase = .idle
+            return
+        case .capturing:
+            self.phase = .idle
+        }
+
+        self.tearDownRecognition()
+
+        // Snapshot and reset before suspending so a session that starts meanwhile is not clobbered.
+        let finalText = (self.committed + self.volatile).trimmingCharacters(in: .whitespacesAndNewlines)
+        let attributed = Self.makeAttributed(committed: self.committed, volatile: self.volatile, isFinal: true)
+        let cachedForward = self.activeConfig?.forwardConfig
+        self.resetSessionState()
+
+        let forward: VoiceWakeForwardConfig
+        if let cachedForward {
+            forward = cachedForward
+        } else {
+            forward = await MainActor.run { AppStateStore.shared.voiceWakeForwardConfig }
+        }
+
+        await MainActor.run {
+            VoiceWakeOverlayController.shared.presentFinal(
+                transcript: finalText,
+                forwardConfig: forward,
+                delay: finalText.isEmpty ? 0.0 : 0.8,
+                attributed: attributed)
+        }
+
+        // Resume the wake-word runtime after push-to-talk finishes.
+        await self.resumeWakeRuntime()
+    }
+
+    // MARK: - Private
+
+    private func isStarting(_ session: Int) -> Bool {
+        self.phase == .starting && self.sessionID == session
+    }
+
+    /// Undoes a start that failed or was cancelled after the wake-word runtime had been paused.
+    private func abortStart(session: Int) async {
+        // A newer session owns the recognizer, overlay and wake-word pause now; leave them alone.
+        guard self.sessionID == session else { return }
+        self.phase = .idle
+        self.tearDownRecognition()
+        self.resetSessionState()
+        await MainActor.run {
+            VoiceWakeOverlayController.shared.dismiss()
+        }
+        await self.resumeWakeRuntime()
+    }
+
+    private func tearDownRecognition() {
+        self.recognitionTask?.cancel()
+        self.recognitionRequest?.endAudio()
+        // Only touch the engine when startRecognition got far enough to install the tap.
+        if self.recognitionRequest != nil {
+            self.audioEngine.inputNode.removeTap(onBus: 0)
+            self.audioEngine.stop()
+        }
+        self.recognitionRequest = nil
+        self.recognitionTask = nil
+        self.recognizer = nil
+    }
+
+    private func resetSessionState() {
+        self.committed = ""
+        self.volatile = ""
+        self.activeConfig = nil
+    }
+
+    /// Restarts wake-word listening unless another push-to-talk session has already taken over.
+    private func resumeWakeRuntime() async {
+        guard self.phase == .idle else { return }
+        _ = await MainActor.run {
+            Task {
+                await VoiceWakeRuntime.shared.refresh(state: AppStateStore.shared)
+            }
+        }
+    }
+
+    private func startRecognition(localeID: String?, session: Int) throws {
+        let locale = localeID.flatMap { $0.isEmpty ? nil : Locale(identifier: $0) } ?? Locale.current
+        let recognizer = SFSpeechRecognizer(locale: locale)
+        self.recognizer = recognizer
+        guard let recognizer, recognizer.isAvailable else {
+            throw NSError(domain: "VoicePushToTalk", code: 1, userInfo: [NSLocalizedDescriptionKey: "Recognizer unavailable"])
+        }
+
+        let request = SFSpeechAudioBufferRecognitionRequest()
+        request.shouldReportPartialResults = true
+        self.recognitionRequest = request
+
+        let input = self.audioEngine.inputNode
+        let format = input.outputFormat(forBus: 0)
+        input.removeTap(onBus: 0)
+        // Pipe raw mic buffers into the Speech request while the chord is held.
+        input.installTap(onBus: 0, bufferSize: 2048, format: format) { [weak request] buffer, _ in
+            request?.append(buffer)
+        }
+
+        self.audioEngine.prepare()
+        try self.audioEngine.start()
+
+        self.recognitionTask = recognizer.recognitionTask(with: request) { [weak self] result, error in
+            if let error {
+                VoicePushToTalk.logger.debug("push-to-talk error: \(error.localizedDescription, privacy: .public)")
+            }
+            let transcript = result?.bestTranscription.formattedString
+            let isFinal = result?.isFinal ?? false
+            Task.detached { [weak self] in
+                await self?.handle(transcript: transcript, isFinal: isFinal, session: session)
+            }
+        }
+    }
+
+    private func handle(transcript: String?, isFinal: Bool, session: Int) async {
+        // Drop callbacks that arrive after end() or that belong to an earlier session.
+        guard self.phase == .capturing, self.sessionID == session else { return }
+        guard let transcript else { return }
+        if isFinal {
+            self.committed = transcript
+            self.volatile = ""
+        } else {
+            self.volatile = Self.delta(after: self.committed, current: transcript)
+        }
+
+        let attributed = Self.makeAttributed(committed: self.committed, volatile: self.volatile, isFinal: isFinal)
+        let snapshot = self.committed + self.volatile
+        await MainActor.run {
+            VoiceWakeOverlayController.shared.showPartial(transcript: snapshot, attributed: attributed)
+        }
+    }
+
+    @MainActor
+    private func makeConfig() -> Config {
+        let state = AppStateStore.shared
+        return Config(
+            micID: state.voiceWakeMicID.isEmpty ? nil : state.voiceWakeMicID,
+            localeID: state.voiceWakeLocaleID,
+            forwardConfig: state.voiceWakeForwardConfig)
+    }
+
+    // MARK: - Test helpers
+
+    static func _testDelta(committed: String, current: String) -> String {
+        self.delta(after: committed, current: current)
+    }
+
+    static func _testAttributedColors(isFinal: Bool) -> (NSColor, NSColor) {
+        let sample = self.makeAttributed(committed: "a", volatile: "b", isFinal: isFinal)
+        let committedColor = sample.attribute(.foregroundColor, at: 0, effectiveRange: nil) as? NSColor ?? .clear
+        let volatileColor = sample.attribute(.foregroundColor, at: 1, effectiveRange: nil) as? NSColor ?? .clear
+        return (committedColor, volatileColor)
+    }
+
+    /// Returns the part of `current` that follows `committed`, or all of `current` when it does not extend it.
+    private static func delta(after committed: String, current: String) -> String {
+        guard current.hasPrefix(committed) else { return current }
+        return String(current.dropFirst(committed.count))
+    }
+
+    private static func makeAttributed(committed: String, volatile: String, isFinal: Bool) -> NSAttributedString {
+        let full = NSMutableAttributedString()
+        let committedAttr: [NSAttributedString.Key: Any] = [.foregroundColor: NSColor.labelColor]
+        full.append(NSAttributedString(string: committed, attributes: committedAttr))
+        let volatileColor: NSColor = isFinal ? .labelColor : .secondaryLabelColor
+        let volatileAttr: [NSAttributedString.Key: Any] = [.foregroundColor: volatileColor]
+        full.append(NSAttributedString(string: volatile, attributes: volatileAttr))
+        return full
+    }
+}
diff --git a/apps/macos/Sources/Clawdis/VoiceWakeRuntime.swift b/apps/macos/Sources/Clawdis/VoiceWakeRuntime.swift
index 49f6c6bca..2e63fcc8e 100644
--- a/apps/macos/Sources/Clawdis/VoiceWakeRuntime.swift
+++ b/apps/macos/Sources/Clawdis/VoiceWakeRuntime.swift
@@ -288,6 +288,10 @@ actor VoiceWakeRuntime {
         }
     }
 
+    func pauseForPushToTalk() {
+        self.stop()
+    }
+
     private func updateHeardBeyondTrigger(withTrimmed trimmed: String) {
         if !self.heardBeyondTrigger, !trimmed.isEmpty {
             self.heardBeyondTrigger = true
diff --git a/apps/macos/Sources/Clawdis/VoiceWakeSettings.swift b/apps/macos/Sources/Clawdis/VoiceWakeSettings.swift
index 3db253c09..09d30224c 100644
--- a/apps/macos/Sources/Clawdis/VoiceWakeSettings.swift
+++ b/apps/macos/Sources/Clawdis/VoiceWakeSettings.swift
@@ -47,6 +47,12 @@ struct VoiceWakeSettings: View {
                     binding: self.voiceWakeBinding)
                     .disabled(!voiceWakeSupported)
 
+                SettingsToggleRow(
+                    title: "Hold Cmd+Fn to talk",
+                    subtitle: "Push-to-talk mode that starts listening while you hold the hotkey and shows the preview overlay.",
+                    binding: self.$state.voicePushToTalkEnabled)
+                    .disabled(!voiceWakeSupported)
+
                 if !voiceWakeSupported {
                     Label("Voice Wake requires macOS 26 or newer.", systemImage: "exclamationmark.triangle.fill")
                         .font(.callout)
diff --git a/apps/macos/Tests/ClawdisIPCTests/VoicePushToTalkTests.swift b/apps/macos/Tests/ClawdisIPCTests/VoicePushToTalkTests.swift
new file mode 100644
index 000000000..9d586aa38
--- /dev/null
+++ b/apps/macos/Tests/ClawdisIPCTests/VoicePushToTalkTests.swift
@@ -0,0 +1,24 @@
+import Testing
+@testable import Clawdis
+
+@Suite struct VoicePushToTalkTests {
+    @Test func deltaTrimsCommittedPrefix() {
+        let delta = VoicePushToTalk._testDelta(committed: "hello ", current: "hello world again")
+        #expect(delta == "world again")
+    }
+
+    @Test func deltaFallsBackWhenPrefixDiffers() {
+        let delta = VoicePushToTalk._testDelta(committed: "goodbye", current: "hello world")
+        #expect(delta == "hello world")
+    }
+
+    @Test func attributedColorsDifferWhenNotFinal() {
+        let colors = VoicePushToTalk._testAttributedColors(isFinal: false)
+        #expect(colors.0 != colors.1)
+    }
+
+    @Test func attributedColorsMatchWhenFinal() {
+        let colors = VoicePushToTalk._testAttributedColors(isFinal: true)
+        #expect(colors.0 == colors.1)
+    }
+}