// clawdbot/apps/macos/Sources/Clawdbot/VoicePushToTalk.swift

import AppKit
import AVFoundation
import Dispatch
import OSLog
import Speech

/// Observes right Option and starts a push-to-talk capture while it is held.
final class VoicePushToTalkHotkey: @unchecked Sendable {
    static let shared = VoicePushToTalkHotkey()

    private var globalMonitor: Any?
    private var localMonitor: Any?
    private var optionDown = false // right option only
    private var active = false

    private let beginAction: @Sendable () async -> Void
    private let endAction: @Sendable () async -> Void

    init(
        beginAction: @escaping @Sendable () async -> Void = { await VoicePushToTalk.shared.begin() },
        endAction: @escaping @Sendable () async -> Void = { await VoicePushToTalk.shared.end() })
    {
        self.beginAction = beginAction
        self.endAction = endAction
    }
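
    /// Turns the hotkey monitors on or off. Safe to call from any thread; the
    /// actual monitor changes are marshalled onto the main thread. No-op while running tests.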
    func setEnabled(_ enabled: Bool) {
        if ProcessInfo.processInfo.isRunningTests { return }
        self.withMainThread { [weak self] in
            guard let self else { return }
            if enabled {
                self.startMonitoring()
            } else {
                self.stopMonitoring()
            }
        }
    }

    private func startMonitoring() {
        // assert(Thread.isMainThread) - Removed for Swift 6
        guard self.globalMonitor == nil, self.localMonitor == nil else { return }
        // Listen-only global monitor; we rely on Input Monitoring permission to receive events.
        self.globalMonitor = NSEvent.addGlobalMonitorForEvents(matching: .flagsChanged) { [weak self] event in
            let keyCode = event.keyCode
            let flags = event.modifierFlags
            self?.handleFlagsChanged(keyCode: keyCode, modifierFlags: flags)
        }
        // Also listen locally so we still catch events when the app is active/focused.
        self.localMonitor = NSEvent.addLocalMonitorForEvents(matching: .flagsChanged) { [weak self] event in
            let keyCode = event.keyCode
            let flags = event.modifierFlags
            self?.handleFlagsChanged(keyCode: keyCode, modifierFlags: flags)
            return event
        }
    }

    private func stopMonitoring() {
        // assert(Thread.isMainThread) - Removed for Swift 6
        if let globalMonitor {
            NSEvent.removeMonitor(globalMonitor)
            self.globalMonitor = nil
        }
        if let localMonitor {
            NSEvent.removeMonitor(localMonitor)
            self.localMonitor = nil
        }
        self.optionDown = false
        self.active = false
    }

    private func handleFlagsChanged(keyCode: UInt16, modifierFlags: NSEvent.ModifierFlags) {
        self.withMainThread { [weak self] in
            self?.updateModifierState(keyCode: keyCode, modifierFlags: modifierFlags)
        }
    }

    private func withMainThread(_ block: @escaping @Sendable () -> Void) {
        DispatchQueue.main.async(execute: block)
    }
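
    /// Tracks the right-Option key state and fires the begin/end actions exactly
    /// once per press-and-hold, however many flagsChanged events arrive.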
    private func updateModifierState(keyCode: UInt16, modifierFlags: NSEvent.ModifierFlags) {
        // assert(Thread.isMainThread) - Removed for Swift 6
        // Right Option (keyCode 61) acts as a hold-to-talk modifier.
        if keyCode == 61 {
            self.optionDown = modifierFlags.contains(.option)
        }
        let chordActive = self.optionDown
        if chordActive, !self.active {
            self.active = true
            Task {
                Logger(subsystem: "com.clawdbot", category: "voicewake.ptt")
                    .info("ptt hotkey down")
                await self.beginAction()
            }
        } else if !chordActive, self.active {
            self.active = false
            Task {
                Logger(subsystem: "com.clawdbot", category: "voicewake.ptt")
                    .info("ptt hotkey up")
                await self.endAction()
            }
        }
    }

    func _testUpdateModifierState(keyCode: UInt16, modifierFlags: NSEvent.ModifierFlags) {
        self.updateModifierState(keyCode: keyCode, modifierFlags: modifierFlags)
    }
}

/// Short-lived speech recognizer that records while the hotkey is held.
actor VoicePushToTalk {
    static let shared = VoicePushToTalk()

    private let logger = Logger(subsystem: "com.clawdbot", category: "voicewake.ptt")
    private var recognizer: SFSpeechRecognizer?
    // Lazily created on begin() to avoid creating an AVAudioEngine at app launch, which can switch Bluetooth
    // headphones into the low-quality headset profile even if push-to-talk is never used.
    private var audioEngine: AVAudioEngine?
    private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest?
    private var recognitionTask: SFSpeechRecognitionTask?
    private var tapInstalled = false
    // Session token used to drop stale callbacks when a new capture starts.
    private var sessionID = UUID()
    private var committed: String = ""
    private var volatile: String = ""
    private var activeConfig: Config?
    private var isCapturing = false
    private var triggerChimePlayed = false
    private var finalized = false
    private var timeoutTask: Task<Void, Never>?
    private var overlayToken: UUID?
    private var adoptedPrefix: String = ""
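
    /// User settings snapshotted once at begin(), so preference changes mid-capture don't affect the session.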
    private struct Config {
        let micID: String?
        let localeID: String?
        let triggerChime: VoiceWakeChime
        let sendChime: VoiceWakeChime
    }
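
    /// Starts a push-to-talk capture: checks permissions, pauses the wake-word
    /// recognizer, adopts any visible overlay text as a prefix, and begins
    /// streaming mic audio into a fresh speech-recognition session.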
    func begin() async {
        guard voiceWakeSupported else { return }
        guard !self.isCapturing else { return }
        // Start a fresh session and invalidate any in-flight callbacks tied to an older one.
        let sessionID = UUID()
        self.sessionID = sessionID
        // Ensure permissions up front.
        let granted = await PermissionManager.ensureVoiceWakePermissions(interactive: true)
        guard granted else { return }
        let config = await MainActor.run { self.makeConfig() }
        self.activeConfig = config
        self.isCapturing = true
        self.triggerChimePlayed = false
        self.finalized = false
        self.timeoutTask?.cancel()
        self.timeoutTask = nil
        let snapshot = await MainActor.run { VoiceSessionCoordinator.shared.snapshot() }
        self.adoptedPrefix = snapshot.visible ? snapshot.text.trimmingCharacters(in: .whitespacesAndNewlines) : ""
        self.logger.info("ptt begin adopted_prefix_len=\(self.adoptedPrefix.count, privacy: .public)")
        if config.triggerChime != .none {
            self.triggerChimePlayed = true
            await MainActor.run { VoiceWakeChimePlayer.play(config.triggerChime, reason: "ptt.trigger") }
        }
        // Pause the always-on wake word recognizer so both pipelines don't fight over the mic tap.
        await VoiceWakeRuntime.shared.pauseForPushToTalk()
        let adoptedPrefix = self.adoptedPrefix
        let adoptedAttributed: NSAttributedString? = adoptedPrefix.isEmpty ? nil : Self.makeAttributed(
            committed: adoptedPrefix,
            volatile: "",
            isFinal: false)
        self.overlayToken = await MainActor.run {
            VoiceSessionCoordinator.shared.startSession(
                source: .pushToTalk,
                text: adoptedPrefix,
                attributed: adoptedAttributed,
                forwardEnabled: true)
        }
        do {
            try await self.startRecognition(localeID: config.localeID, sessionID: sessionID)
        } catch {
            await MainActor.run {
                VoiceWakeOverlayController.shared.dismiss()
            }
            self.isCapturing = false
            // If push-to-talk fails to start after pausing wake-word, ensure we resume listening.
            await VoiceWakeRuntime.shared.applyPushToTalkCooldown()
            await VoiceWakeRuntime.shared.refresh(state: AppStateStore.shared)
        }
    }
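
    /// Called when the hotkey is released: stops feeding audio and gives the
    /// recognizer a short grace period to deliver its final result before
    /// finalizing with whatever text has arrived.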
    func end() async {
        guard self.isCapturing else { return }
        self.isCapturing = false
        let sessionID = self.sessionID
        // Stop feeding Speech buffers first, then end the request. Stopping the engine here can race with
        // Speech draining its converter chain (and we already stop/cancel in finalize).
        if self.tapInstalled {
            self.audioEngine?.inputNode.removeTap(onBus: 0)
            self.tapInstalled = false
        }
        self.recognitionRequest?.endAudio()
        // If we captured nothing, dismiss immediately when the user lets go.
        if self.committed.isEmpty, self.volatile.isEmpty, self.adoptedPrefix.isEmpty {
            await self.finalize(transcriptOverride: "", reason: "emptyOnRelease", sessionID: sessionID)
            return
        }
        // Otherwise, give Speech a brief window to deliver the final result; then fall back.
        self.timeoutTask?.cancel()
        self.timeoutTask = Task { [weak self] in
            try? await Task.sleep(nanoseconds: 1_500_000_000) // 1.5s grace period to await final result
            await self?.finalize(transcriptOverride: nil, reason: "timeout", sessionID: sessionID)
        }
    }

    // MARK: - Private
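
    /// Builds the recognizer for the configured locale, installs a mic tap, and
    /// streams buffers into an SFSpeechAudioBufferRecognitionRequest with
    /// partial results enabled.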
    private func startRecognition(localeID: String?, sessionID: UUID) async throws {
        let locale = localeID.flatMap { Locale(identifier: $0) } ?? Locale(identifier: Locale.current.identifier)
        self.recognizer = SFSpeechRecognizer(locale: locale)
        guard let recognizer, recognizer.isAvailable else {
            throw NSError(
                domain: "VoicePushToTalk",
                code: 1,
                userInfo: [NSLocalizedDescriptionKey: "Recognizer unavailable"])
        }
        self.recognitionRequest = SFSpeechAudioBufferRecognitionRequest()
        self.recognitionRequest?.shouldReportPartialResults = true
        guard let request = self.recognitionRequest else { return }
        // Lazily create the engine here so app launch doesn't grab audio resources / trigger Bluetooth HFP.
        if self.audioEngine == nil {
            self.audioEngine = AVAudioEngine()
        }
        guard let audioEngine = self.audioEngine else { return }
        let input = audioEngine.inputNode
        let format = input.outputFormat(forBus: 0)
        if self.tapInstalled {
            input.removeTap(onBus: 0)
            self.tapInstalled = false
        }
        // Pipe raw mic buffers into the Speech request while the chord is held.
        input.installTap(onBus: 0, bufferSize: 2048, format: format) { [weak request] buffer, _ in
            request?.append(buffer)
        }
        self.tapInstalled = true
        audioEngine.prepare()
        try audioEngine.start()
        self.recognitionTask = recognizer.recognitionTask(with: request) { [weak self] result, error in
            guard let self else { return }
            if let error {
                self.logger.debug("push-to-talk error: \(error.localizedDescription, privacy: .public)")
            }
            let transcript = result?.bestTranscription.formattedString
            let isFinal = result?.isFinal ?? false
            // Hop to a Task so UI updates stay off the Speech callback thread.
            Task.detached { [weak self, transcript, isFinal, sessionID] in
                guard let self else { return }
                await self.handle(transcript: transcript, isFinal: isFinal, sessionID: sessionID)
            }
        }
    }
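
    /// Merges a recognizer callback into the committed/volatile transcript
    /// state and pushes the updated text to the overlay, dropping callbacks
    /// that belong to an older session.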
    private func handle(transcript: String?, isFinal: Bool, sessionID: UUID) async {
        guard sessionID == self.sessionID else {
            self.logger.debug("push-to-talk drop transcript for stale session")
            return
        }
        guard let transcript else { return }
        if isFinal {
            self.committed = transcript
            self.volatile = ""
        } else {
            self.volatile = Self.delta(after: self.committed, current: transcript)
        }
        let committedWithPrefix = Self.join(self.adoptedPrefix, self.committed)
        let snapshot = Self.join(committedWithPrefix, self.volatile)
        let attributed = Self.makeAttributed(committed: committedWithPrefix, volatile: self.volatile, isFinal: isFinal)
        if let token = self.overlayToken {
            await MainActor.run {
                VoiceSessionCoordinator.shared.updatePartial(
                    token: token,
                    text: snapshot,
                    attributed: attributed)
            }
        }
    }
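
    /// Tears down the capture exactly once: hands the final text to the session
    /// coordinator (or forwards it directly when no overlay session exists),
    /// releases the audio engine, resets state, and resumes the wake-word runtime.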
    private func finalize(transcriptOverride: String?, reason: String, sessionID: UUID?) async {
        if self.finalized { return }
        if let sessionID, sessionID != self.sessionID {
            self.logger.debug("push-to-talk drop finalize for stale session")
            return
        }
        self.finalized = true
        self.isCapturing = false
        self.timeoutTask?.cancel()
        self.timeoutTask = nil
        let finalRecognized: String = {
            if let override = transcriptOverride?.trimmingCharacters(in: .whitespacesAndNewlines) {
                return override
            }
            return (self.committed + self.volatile).trimmingCharacters(in: .whitespacesAndNewlines)
        }()
        let finalText = Self.join(self.adoptedPrefix, finalRecognized)
        let chime: VoiceWakeChime = finalText.isEmpty ? .none : (self.activeConfig?.sendChime ?? .none)
        let token = self.overlayToken
        let logger = self.logger
        await MainActor.run {
            logger.info("ptt finalize reason=\(reason, privacy: .public) len=\(finalText.count, privacy: .public)")
            if let token {
                VoiceSessionCoordinator.shared.finalize(
                    token: token,
                    text: finalText,
                    sendChime: chime,
                    autoSendAfter: nil)
                VoiceSessionCoordinator.shared.sendNow(token: token, reason: reason)
            } else if !finalText.isEmpty {
                if chime != .none {
                    VoiceWakeChimePlayer.play(chime, reason: "ptt.fallback_send")
                }
                Task.detached {
                    await VoiceWakeForwarder.forward(transcript: finalText)
                }
            }
        }
        self.recognitionTask?.cancel()
        self.recognitionRequest = nil
        self.recognitionTask = nil
        if self.tapInstalled {
            self.audioEngine?.inputNode.removeTap(onBus: 0)
            self.tapInstalled = false
        }
        if self.audioEngine?.isRunning == true {
            self.audioEngine?.stop()
            self.audioEngine?.reset()
        }
        // Release the engine so we also release any audio session/resources when push-to-talk ends.
        self.audioEngine = nil
        self.committed = ""
        self.volatile = ""
        self.activeConfig = nil
        self.triggerChimePlayed = false
        self.overlayToken = nil
        self.adoptedPrefix = ""
        // Resume the wake-word runtime after push-to-talk finishes.
        await VoiceWakeRuntime.shared.applyPushToTalkCooldown()
        _ = await MainActor.run { Task { await VoiceWakeRuntime.shared.refresh(state: AppStateStore.shared) } }
    }

    @MainActor
    private func makeConfig() -> Config {
        let state = AppStateStore.shared
        return Config(
            micID: state.voiceWakeMicID.isEmpty ? nil : state.voiceWakeMicID,
            localeID: state.voiceWakeLocaleID,
            triggerChime: state.voiceWakeTriggerChime,
            sendChime: state.voiceWakeSendChime)
    }

    // MARK: - Test helpers

    static func _testDelta(committed: String, current: String) -> String {
        self.delta(after: committed, current: current)
    }

    static func _testAttributedColors(isFinal: Bool) -> (NSColor, NSColor) {
        let sample = self.makeAttributed(committed: "a", volatile: "b", isFinal: isFinal)
        let committedColor = sample.attribute(.foregroundColor, at: 0, effectiveRange: nil) as? NSColor ?? .clear
        let volatileColor = sample.attribute(.foregroundColor, at: 1, effectiveRange: nil) as? NSColor ?? .clear
        return (committedColor, volatileColor)
    }
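
    /// Joins two fragments with a single space, skipping the space when either side is empty.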
    private static func join(_ prefix: String, _ suffix: String) -> String {
        if prefix.isEmpty { return suffix }
        if suffix.isEmpty { return prefix }
        return "\(prefix) \(suffix)"
    }
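
    /// Returns the suffix of `current` that extends `committed`, or all of
    /// `current` when the recognizer rewrote earlier text, e.g.
    /// delta(after: "hello", current: "hello world") == " world".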
    private static func delta(after committed: String, current: String) -> String {
        if current.hasPrefix(committed) {
            let start = current.index(current.startIndex, offsetBy: committed.count)
            return String(current[start...])
        }
        return current
    }
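
    /// Renders committed text in the primary label color and, until the result
    /// is final, volatile text in a dimmed tertiary color.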
    private static func makeAttributed(committed: String, volatile: String, isFinal: Bool) -> NSAttributedString {
        let full = NSMutableAttributedString()
        let committedAttr: [NSAttributedString.Key: Any] = [
            .foregroundColor: NSColor.labelColor,
            .font: NSFont.systemFont(ofSize: 13, weight: .regular),
        ]
        full.append(NSAttributedString(string: committed, attributes: committedAttr))
        let volatileColor: NSColor = isFinal ? .labelColor : NSColor.tertiaryLabelColor
        let volatileAttr: [NSAttributedString.Key: Any] = [
            .foregroundColor: volatileColor,
            .font: NSFont.systemFont(ofSize: 13, weight: .regular),
        ]
        full.append(NSAttributedString(string: volatile, attributes: volatileAttr))
        return full
    }
}