feat: add talk mode across nodes
This commit is contained in:
@@ -121,6 +121,15 @@ final class AppState {
|
||||
forKey: voicePushToTalkEnabledKey) } }
|
||||
}
|
||||
|
||||
// Whether talk mode is on. Persisted to UserDefaults and mirrored into
// TalkModeController whenever it changes (both skipped in SwiftUI previews).
var talkEnabled: Bool {
    didSet {
        self.ifNotPreview {
            UserDefaults.standard.set(self.talkEnabled, forKey: talkEnabledKey)
            // Controller toggle is async; fire-and-forget keeps the setter synchronous.
            Task { await TalkModeController.shared.setEnabled(self.talkEnabled) }
        }
    }
}
|
||||
|
||||
var iconOverride: IconOverrideSelection {
|
||||
didSet { self.ifNotPreview { UserDefaults.standard.set(self.iconOverride.rawValue, forKey: iconOverrideKey) } }
|
||||
}
|
||||
@@ -216,6 +225,7 @@ final class AppState {
|
||||
.stringArray(forKey: voiceWakeAdditionalLocalesKey) ?? []
|
||||
self.voicePushToTalkEnabled = UserDefaults.standard
|
||||
.object(forKey: voicePushToTalkEnabledKey) as? Bool ?? false
|
||||
self.talkEnabled = UserDefaults.standard.bool(forKey: talkEnabledKey)
|
||||
if let storedHeartbeats = UserDefaults.standard.object(forKey: heartbeatsEnabledKey) as? Bool {
|
||||
self.heartbeatsEnabled = storedHeartbeats
|
||||
} else {
|
||||
@@ -256,9 +266,13 @@ final class AppState {
|
||||
if self.swabbleEnabled, !PermissionManager.voiceWakePermissionsGranted() {
|
||||
self.swabbleEnabled = false
|
||||
}
|
||||
if self.talkEnabled, !PermissionManager.voiceWakePermissionsGranted() {
|
||||
self.talkEnabled = false
|
||||
}
|
||||
|
||||
if !self.isPreview {
|
||||
Task { await VoiceWakeRuntime.shared.refresh(state: self) }
|
||||
Task { await TalkModeController.shared.setEnabled(self.talkEnabled) }
|
||||
}
|
||||
}
|
||||
|
||||
@@ -312,6 +326,23 @@ final class AppState {
|
||||
Task { await VoiceWakeRuntime.shared.refresh(state: self) }
|
||||
}
|
||||
|
||||
/// Enables or disables talk mode, acquiring mic/speech permissions when needed.
/// On unsupported platforms the flag is forced off. When enabling without
/// permissions, an interactive request runs and the flag ends up reflecting
/// whether permissions were actually granted.
func setTalkEnabled(_ enabled: Bool) async {
    guard voiceWakeSupported else {
        self.talkEnabled = false
        return
    }

    // Set optimistically; the property's didSet persists and notifies the controller.
    self.talkEnabled = enabled
    guard !self.isPreview else { return }

    if !enabled { return }

    if PermissionManager.voiceWakePermissionsGranted() { return }

    // Interactive permission prompt; revert to the actual grant result.
    let granted = await PermissionManager.ensureVoiceWakePermissions(interactive: true)
    self.talkEnabled = granted
}
|
||||
|
||||
// MARK: - Global wake words sync (Gateway-owned)
|
||||
|
||||
func applyGlobalVoiceWakeTriggers(_ triggers: [String]) {
|
||||
@@ -367,6 +398,7 @@ extension AppState {
|
||||
state.voiceWakeLocaleID = Locale.current.identifier
|
||||
state.voiceWakeAdditionalLocaleIDs = ["en-US", "de-DE"]
|
||||
state.voicePushToTalkEnabled = false
|
||||
state.talkEnabled = false
|
||||
state.iconOverride = .system
|
||||
state.heartbeatsEnabled = true
|
||||
state.connectionMode = .local
|
||||
|
||||
@@ -30,6 +30,10 @@ struct ConfigSettings: View {
|
||||
@State private var browserColorHex: String = "#FF4500"
|
||||
@State private var browserAttachOnly: Bool = false
|
||||
|
||||
// Talk mode settings (stored in ~/.clawdis/clawdis.json under "talk")
|
||||
@State private var talkVoiceId: String = ""
|
||||
@State private var talkInterruptOnSpeech: Bool = true
|
||||
|
||||
var body: some View {
|
||||
ScrollView { self.content }
|
||||
.onChange(of: self.modelCatalogPath) { _, _ in
|
||||
@@ -53,6 +57,7 @@ struct ConfigSettings: View {
|
||||
self.header
|
||||
self.agentSection
|
||||
self.heartbeatSection
|
||||
self.talkSection
|
||||
self.browserSection
|
||||
Spacer(minLength: 0)
|
||||
}
|
||||
@@ -266,6 +271,37 @@ struct ConfigSettings: View {
|
||||
.frame(maxWidth: .infinity, alignment: .leading)
|
||||
}
|
||||
|
||||
// "Talk Mode" settings group: ElevenLabs voice ID combo box plus the
// barge-in ("Interrupt") toggle. Both controls autosave the config on change.
private var talkSection: some View {
    GroupBox("Talk Mode") {
        Grid(alignment: .leadingFirstTextBaseline, horizontalSpacing: 14, verticalSpacing: 10) {
            GridRow {
                self.gridLabel("Voice ID")
                VStack(alignment: .leading, spacing: 6) {
                    // Editable combo box seeded with known voice-ID suggestions.
                    ComboBox("ElevenLabs voice ID", text: self.$talkVoiceId) {
                        ForEach(self.talkVoiceSuggestions, id: \.self) { value in
                            Text(value).tag(value)
                        }
                    }
                    .textFieldStyle(.roundedBorder)
                    .frame(maxWidth: .infinity)
                    .onChange(of: self.talkVoiceId) { _, _ in self.autosaveConfig() }
                    Text("Defaults to ELEVENLABS_VOICE_ID / SAG_VOICE_ID if unset.")
                        .font(.footnote)
                        .foregroundStyle(.secondary)
                }
            }
            GridRow {
                self.gridLabel("Interrupt")
                Toggle("Stop speaking when you start talking", isOn: self.$talkInterruptOnSpeech)
                    .labelsHidden()
                    .toggleStyle(.checkbox)
                    .onChange(of: self.talkInterruptOnSpeech) { _, _ in self.autosaveConfig() }
            }
        }
    }
    .frame(maxWidth: .infinity, alignment: .leading)
}
|
||||
|
||||
private func gridLabel(_ text: String) -> some View {
|
||||
Text(text)
|
||||
.foregroundStyle(.secondary)
|
||||
@@ -278,6 +314,7 @@ struct ConfigSettings: View {
|
||||
let heartbeatMinutes = agent?["heartbeatMinutes"] as? Int
|
||||
let heartbeatBody = agent?["heartbeatBody"] as? String
|
||||
let browser = parsed["browser"] as? [String: Any]
|
||||
let talk = parsed["talk"] as? [String: Any]
|
||||
|
||||
let loadedModel = (agent?["model"] as? String) ?? ""
|
||||
if !loadedModel.isEmpty {
|
||||
@@ -297,6 +334,13 @@ struct ConfigSettings: View {
|
||||
if let color = browser["color"] as? String, !color.isEmpty { self.browserColorHex = color }
|
||||
if let attachOnly = browser["attachOnly"] as? Bool { self.browserAttachOnly = attachOnly }
|
||||
}
|
||||
|
||||
if let talk {
|
||||
if let voice = talk["voiceId"] as? String { self.talkVoiceId = voice }
|
||||
if let interrupt = talk["interruptOnSpeech"] as? Bool {
|
||||
self.talkInterruptOnSpeech = interrupt
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private func autosaveConfig() {
|
||||
@@ -312,6 +356,7 @@ struct ConfigSettings: View {
|
||||
var root = self.loadConfigDict()
|
||||
var agent = root["agent"] as? [String: Any] ?? [:]
|
||||
var browser = root["browser"] as? [String: Any] ?? [:]
|
||||
var talk = root["talk"] as? [String: Any] ?? [:]
|
||||
|
||||
let chosenModel = (self.configModel == "__custom__" ? self.customModel : self.configModel)
|
||||
.trimmingCharacters(in: .whitespacesAndNewlines)
|
||||
@@ -337,6 +382,15 @@ struct ConfigSettings: View {
|
||||
browser["attachOnly"] = self.browserAttachOnly
|
||||
root["browser"] = browser
|
||||
|
||||
let trimmedVoice = self.talkVoiceId.trimmingCharacters(in: .whitespacesAndNewlines)
|
||||
if trimmedVoice.isEmpty {
|
||||
talk.removeValue(forKey: "voiceId")
|
||||
} else {
|
||||
talk["voiceId"] = trimmedVoice
|
||||
}
|
||||
talk["interruptOnSpeech"] = self.talkInterruptOnSpeech
|
||||
root["talk"] = talk
|
||||
|
||||
ClawdisConfigFile.saveDict(root)
|
||||
}
|
||||
|
||||
@@ -354,6 +408,20 @@ struct ConfigSettings: View {
|
||||
return Color(red: r, green: g, blue: b)
|
||||
}
|
||||
|
||||
// Candidate voice IDs for the combo box: the current field value plus the
// two environment fallbacks, trimmed, de-blanked, and de-duplicated while
// preserving first-seen order.
private var talkVoiceSuggestions: [String] {
    let env = ProcessInfo.processInfo.environment
    let raw = [
        self.talkVoiceId,
        env["ELEVENLABS_VOICE_ID"] ?? "",
        env["SAG_VOICE_ID"] ?? "",
    ]
    var seen = Set<String>()
    var suggestions: [String] = []
    for candidate in raw {
        let trimmed = candidate.trimmingCharacters(in: .whitespacesAndNewlines)
        if !trimmed.isEmpty, seen.insert(trimmed).inserted {
            suggestions.append(trimmed)
        }
    }
    return suggestions
}
|
||||
|
||||
private var browserPathLabel: String? {
|
||||
guard self.browserEnabled else { return nil }
|
||||
|
||||
|
||||
@@ -16,6 +16,7 @@ let voiceWakeMicKey = "clawdis.voiceWakeMicID"
|
||||
let voiceWakeLocaleKey = "clawdis.voiceWakeLocaleID"
|
||||
let voiceWakeAdditionalLocalesKey = "clawdis.voiceWakeAdditionalLocaleIDs"
|
||||
let voicePushToTalkEnabledKey = "clawdis.voicePushToTalkEnabled"
|
||||
let talkEnabledKey = "clawdis.talkEnabled"
|
||||
let iconOverrideKey = "clawdis.iconOverride"
|
||||
let connectionModeKey = "clawdis.connectionMode"
|
||||
let remoteTargetKey = "clawdis.remoteTarget"
|
||||
|
||||
@@ -72,6 +72,11 @@ struct MenuContent: View {
|
||||
if self.showVoiceWakeMicPicker {
|
||||
self.voiceWakeMicMenu
|
||||
}
|
||||
Toggle(isOn: self.talkBinding) {
|
||||
Label("Talk", systemImage: "bubble.left.and.waveform")
|
||||
}
|
||||
.disabled(!voiceWakeSupported)
|
||||
.opacity(voiceWakeSupported ? 1 : 0.5)
|
||||
Divider()
|
||||
Button {
|
||||
Task { @MainActor in
|
||||
@@ -331,6 +336,14 @@ struct MenuContent: View {
|
||||
})
|
||||
}
|
||||
|
||||
// Bridges the menu "Talk" toggle to AppState. Writes go through the async
// setTalkEnabled path so permission prompts can run before the flag settles.
private var talkBinding: Binding<Bool> {
    Binding(
        get: { self.state.talkEnabled },
        set: { newValue in
            Task { await self.state.setTalkEnabled(newValue) }
        })
}
|
||||
|
||||
private var showVoiceWakeMicPicker: Bool {
|
||||
voiceWakeSupported && self.state.swabbleEnabled
|
||||
}
|
||||
|
||||
54
apps/macos/Sources/Clawdis/TalkAudioPlayer.swift
Normal file
54
apps/macos/Sources/Clawdis/TalkAudioPlayer.swift
Normal file
@@ -0,0 +1,54 @@
|
||||
import AVFoundation
|
||||
import Foundation
|
||||
import OSLog
|
||||
|
||||
/// Plays one TTS clip at a time and suspends the caller until playback ends.
/// All state is main-actor confined; a pending continuation is resumed exactly
/// once by `stopInternal`.
@MainActor
final class TalkAudioPlayer: NSObject, AVAudioPlayerDelegate {
    static let shared = TalkAudioPlayer()

    private let logger = Logger(subsystem: "com.steipete.clawdis", category: "talk.tts")
    private var player: AVAudioPlayer?
    // Pending `play(data:)` caller; resumed exactly once in stopInternal.
    private var continuation: CheckedContinuation<TalkPlaybackResult, Never>?

    /// Decodes `data` and plays it, suspending until playback ends or is stopped.
    /// Any playback already in flight is stopped first and reported as interrupted.
    /// - Returns: whether playback finished, and where it was cut off if not.
    func play(data: Data) async -> TalkPlaybackResult {
        self.stopInternal(interrupted: true)
        do {
            let player = try AVAudioPlayer(data: data)
            self.player = player
            player.delegate = self
            player.prepareToPlay()
            // FIX: `play()` can return false (playback failed to start). The
            // original ignored the result, so the continuation below was never
            // resumed and the caller awaited forever.
            guard player.play() else {
                self.logger.error("talk audio player could not start playback")
                self.player = nil
                return TalkPlaybackResult(finished: false, interruptedAt: nil)
            }
            return await withCheckedContinuation { continuation in
                self.continuation = continuation
            }
        } catch {
            self.logger.error("talk audio player failed: \(error.localizedDescription, privacy: .public)")
            return TalkPlaybackResult(finished: false, interruptedAt: nil)
        }
    }

    /// Stops playback, resuming the pending `play` call as interrupted.
    /// - Returns: the position (seconds) at which playback was cut, or nil if idle.
    func stop() -> Double? {
        guard let player else { return nil }
        let time = player.currentTime
        self.stopInternal(interrupted: true, interruptedAt: time)
        return time
    }

    /// AVAudioPlayerDelegate: natural end of playback (`flag` false on decode failure).
    func audioPlayerDidFinishPlaying(_: AVAudioPlayer, successfully flag: Bool) {
        self.stopInternal(interrupted: !flag)
    }

    /// Tears down the player and resumes the pending continuation exactly once.
    private func stopInternal(interrupted: Bool, interruptedAt: Double? = nil) {
        self.player?.stop()
        self.player = nil
        if let continuation {
            self.continuation = nil
            continuation.resume(returning: TalkPlaybackResult(finished: !interrupted, interruptedAt: interruptedAt))
        }
    }
}
|
||||
|
||||
/// Outcome of a single `TalkAudioPlayer.play(data:)` call.
struct TalkPlaybackResult: Sendable {
    /// True when playback reached the end of the clip without being stopped.
    let finished: Bool
    /// Playback position (seconds) at the moment of interruption; nil when
    /// playback finished naturally or never started.
    let interruptedAt: Double?
}
|
||||
42
apps/macos/Sources/Clawdis/TalkModeController.swift
Normal file
42
apps/macos/Sources/Clawdis/TalkModeController.swift
Normal file
@@ -0,0 +1,42 @@
|
||||
import Observation
|
||||
import OSLog
|
||||
|
||||
/// Main-actor façade that fans talk-mode changes out to the overlay UI and
/// the background runtime actor.
@MainActor
@Observable
final class TalkModeController {
    static let shared = TalkModeController()

    private let logger = Logger(subsystem: "com.steipete.clawdis", category: "talk.controller")

    /// Shows or hides the overlay, then mirrors the flag into the runtime.
    func setEnabled(_ enabled: Bool) async {
        self.logger.info("talk enabled=\(enabled)")
        let overlay = TalkOverlayController.shared
        if enabled {
            overlay.present()
        } else {
            overlay.dismiss()
        }
        await TalkModeRuntime.shared.setEnabled(enabled)
    }

    /// Forwards the runtime's current phase to the overlay.
    func updatePhase(_ phase: TalkModePhase) {
        TalkOverlayController.shared.updatePhase(phase)
    }

    /// Forwards the current input level to the overlay meter.
    func updateLevel(_ level: Double) {
        TalkOverlayController.shared.updateLevel(level)
    }

    /// Asks the runtime to cut off in-flight assistant speech.
    func stopSpeaking(reason: TalkStopReason = .userTap) {
        Task { await TalkModeRuntime.shared.stopSpeaking(reason: reason) }
    }

    /// Turns talk mode off via the app state store.
    func exitTalkMode() {
        Task { await AppStateStore.shared.setTalkEnabled(false) }
    }
}
|
||||
|
||||
/// Why assistant speech was stopped. `.speech` causes the interruption
/// timestamp to be fed back into the next prompt (see TalkModeRuntime).
enum TalkStopReason {
    case userTap   // user-initiated (default for TalkModeController.stopSpeaking)
    case speech    // the user started talking while the assistant was speaking
    case manual    // programmatic stop, e.g. when talk mode is disabled
}
|
||||
684
apps/macos/Sources/Clawdis/TalkModeRuntime.swift
Normal file
684
apps/macos/Sources/Clawdis/TalkModeRuntime.swift
Normal file
@@ -0,0 +1,684 @@
|
||||
import AVFoundation
|
||||
import ClawdisChatUI
|
||||
import ClawdisKit
|
||||
import Foundation
|
||||
import OSLog
|
||||
import Speech
|
||||
|
||||
actor TalkModeRuntime {
|
||||
static let shared = TalkModeRuntime()
|
||||
|
||||
private let logger = Logger(subsystem: "com.steipete.clawdis", category: "talk.runtime")
|
||||
|
||||
private var recognizer: SFSpeechRecognizer?
|
||||
private var audioEngine: AVAudioEngine?
|
||||
private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest?
|
||||
private var recognitionTask: SFSpeechRecognitionTask?
|
||||
private var recognitionGeneration: Int = 0
|
||||
|
||||
private var captureTask: Task<Void, Never>?
|
||||
private var silenceTask: Task<Void, Never>?
|
||||
private var phase: TalkModePhase = .idle
|
||||
private var isEnabled = false
|
||||
|
||||
private var lastHeard: Date?
|
||||
private var noiseFloorRMS: Double = 1e-4
|
||||
private var lastTranscript: String = ""
|
||||
private var lastSpeechEnergyAt: Date?
|
||||
|
||||
private var defaultVoiceId: String?
|
||||
private var currentVoiceId: String?
|
||||
private var defaultModelId: String?
|
||||
private var currentModelId: String?
|
||||
private var voiceOverrideActive = false
|
||||
private var modelOverrideActive = false
|
||||
private var defaultOutputFormat: String?
|
||||
private var interruptOnSpeech: Bool = true
|
||||
private var lastInterruptedAtSeconds: Double?
|
||||
private var lastSpokenText: String?
|
||||
|
||||
private let silenceWindow: TimeInterval = 0.7
|
||||
private let minSpeechRMS: Double = 1e-3
|
||||
private let speechBoostFactor: Double = 6.0
|
||||
|
||||
// MARK: - Lifecycle
|
||||
|
||||
/// Idempotent on/off switch for the runtime; only acts on actual changes.
func setEnabled(_ enabled: Bool) async {
    guard self.isEnabled != enabled else { return }
    self.isEnabled = enabled
    switch enabled {
    case true:
        await self.start()
    case false:
        await self.stop()
    }
}
|
||||
|
||||
/// Brings the runtime up: requires platform support and mic/speech
/// permissions, loads talk config, starts recognition, flips to .listening,
/// and kicks off the silence watchdog.
private func start() async {
    guard voiceWakeSupported else { return }
    guard PermissionManager.voiceWakePermissionsGranted() else {
        self.logger.debug("talk runtime not starting: permissions missing")
        return
    }
    await self.reloadConfig()
    await self.startRecognition()
    self.phase = .listening
    // Overlay mirrors the phase on the main actor.
    await MainActor.run { TalkModeController.shared.updatePhase(.listening) }
    self.startSilenceMonitor()
}
|
||||
|
||||
/// Tears everything down: cancels watchdog/capture tasks, clears transcript
/// state, stops any in-flight speech, stops recognition, and resets the
/// phase and overlay to idle.
private func stop() async {
    self.captureTask?.cancel()
    self.captureTask = nil
    self.silenceTask?.cancel()
    self.silenceTask = nil
    self.lastTranscript = ""
    self.lastHeard = nil
    self.lastSpeechEnergyAt = nil
    // FIX: stop audio while `phase` still reflects .speaking. `stopSpeaking`
    // guards on phase == .speaking, so the original order (phase = .idle
    // first) made it a no-op and TTS kept playing after talk mode was
    // turned off.
    await self.stopSpeaking(reason: .manual)
    await self.stopRecognition()
    self.phase = .idle
    await MainActor.run {
        TalkModeController.shared.updateLevel(0)
        TalkModeController.shared.updatePhase(.idle)
    }
}
|
||||
|
||||
// MARK: - Speech recognition
|
||||
|
||||
/// Immutable snapshot of one SFSpeechRecognizer callback, tagged with the
/// recognition generation so stale callbacks can be dropped.
private struct RecognitionUpdate {
    let transcript: String?                  // best transcription so far, if any
    let segments: [SFTranscriptionSegment]   // per-segment detail (confidence drives barge-in)
    let isFinal: Bool                        // recognizer considers the utterance complete
    let error: Error?                        // recognition error, if any
    let generation: Int                      // compared against recognitionGeneration
}
|
||||
|
||||
/// (Re)starts the audio-engine + speech-recognizer pipeline. The generation
/// counter is bumped so callbacks from any previous session are ignored.
private func startRecognition() async {
    await self.stopRecognition()
    // Capture this session's generation; handleRecognition drops mismatches.
    self.recognitionGeneration &+= 1
    let generation = self.recognitionGeneration

    // Locale follows the voice-wake setting held on the main actor.
    let locale = await MainActor.run { AppStateStore.shared.voiceWakeLocaleID }
    self.recognizer = SFSpeechRecognizer(locale: Locale(identifier: locale))
    guard let recognizer, recognizer.isAvailable else {
        self.logger.error("talk recognizer unavailable")
        return
    }

    self.recognitionRequest = SFSpeechAudioBufferRecognitionRequest()
    self.recognitionRequest?.shouldReportPartialResults = true
    guard let request = self.recognitionRequest else { return }

    if self.audioEngine == nil {
        self.audioEngine = AVAudioEngine()
    }
    guard let audioEngine = self.audioEngine else { return }

    let input = audioEngine.inputNode
    let format = input.outputFormat(forBus: 0)
    input.removeTap(onBus: 0)
    // The tap feeds the recognizer and, off the audio callback, the level meter.
    input.installTap(onBus: 0, bufferSize: 2048, format: format) { [weak self, weak request] buffer, _ in
        request?.append(buffer)
        if let rms = Self.rmsLevel(buffer: buffer) {
            Task.detached { [weak self] in
                await self?.noteAudioLevel(rms: rms)
            }
        }
    }

    audioEngine.prepare()
    do {
        try audioEngine.start()
    } catch {
        self.logger.error("talk audio engine start failed: \(error.localizedDescription, privacy: .public)")
        return
    }

    // Recognizer callback arrives off-actor; hop back in via a Task.
    self.recognitionTask = recognizer.recognitionTask(with: request) { [weak self, generation] result, error in
        guard let self else { return }
        let transcript = result?.bestTranscription.formattedString
        let update = RecognitionUpdate(
            transcript: transcript,
            segments: result?.bestTranscription.segments ?? [],
            isFinal: result?.isFinal ?? false,
            error: error,
            generation: generation)
        Task { await self.handleRecognition(update) }
    }
}
|
||||
|
||||
/// Tears down the recognition pipeline. The generation bump invalidates any
/// callbacks still in flight from the session being stopped.
private func stopRecognition() async {
    self.recognitionGeneration &+= 1
    self.recognitionTask?.cancel()
    self.recognitionTask = nil
    self.recognitionRequest?.endAudio()
    self.recognitionRequest = nil
    self.audioEngine?.inputNode.removeTap(onBus: 0)
    self.audioEngine?.stop()
    self.audioEngine = nil
    self.recognizer = nil
}
|
||||
|
||||
/// Routes a recognizer callback by phase: while speaking, transcripts are
/// only used to detect user barge-in; while listening, they accumulate into
/// lastTranscript for the silence watchdog to finalize.
private func handleRecognition(_ update: RecognitionUpdate) async {
    // Drop callbacks from a previous recognition session.
    guard update.generation == self.recognitionGeneration else { return }
    if let error = update.error {
        self.logger.debug("talk recognition error: \(error.localizedDescription, privacy: .public)")
    }
    guard let transcript = update.transcript else { return }

    let trimmed = transcript.trimmingCharacters(in: .whitespacesAndNewlines)
    if self.phase == .speaking, self.interruptOnSpeech {
        if await self.shouldInterrupt(transcript: trimmed, segments: update.segments) {
            await self.stopSpeaking(reason: .speech)
            self.lastTranscript = ""
            self.lastHeard = nil
            await self.startListening()
        }
        return
    }

    guard self.phase == .listening else { return }

    if !trimmed.isEmpty {
        self.lastTranscript = trimmed
        self.lastHeard = Date()
    }

    // On a final result, take the transcript as-is (even if empty) so the
    // watchdog works from the recognizer's last word.
    if update.isFinal {
        self.lastTranscript = trimmed
    }
}
|
||||
|
||||
// MARK: - Silence handling
|
||||
|
||||
/// Polls every 200 ms while talk mode is enabled, finalizing the pending
/// transcript after a long-enough gap of silence. Replaces any previous
/// watchdog task.
private func startSilenceMonitor() {
    self.silenceTask?.cancel()
    self.silenceTask = Task { [weak self] in
        guard let self else { return }
        // NOTE(review): the loop exits when isEnabled flips false; task
        // cancellation only shortens the sleep and is not checked explicitly.
        while self.isEnabled {
            try? await Task.sleep(nanoseconds: 200_000_000)
            await self.checkSilence()
        }
    }
}
|
||||
|
||||
/// Finalizes the pending transcript once nothing has been heard for at
/// least `silenceWindow` while listening.
private func checkSilence() async {
    guard self.phase == .listening, let lastHeard else { return }
    let pending = self.lastTranscript.trimmingCharacters(in: .whitespacesAndNewlines)
    guard !pending.isEmpty,
          Date().timeIntervalSince(lastHeard) >= self.silenceWindow
    else { return }
    await self.finalizeTranscript(pending)
}
|
||||
|
||||
/// Resets transcript state and returns the runtime (and overlay) to the
/// listening phase with a zeroed level meter.
private func startListening() async {
    self.phase = .listening
    self.lastTranscript = ""
    self.lastHeard = nil
    await MainActor.run {
        let controller = TalkModeController.shared
        controller.updatePhase(.listening)
        controller.updateLevel(0)
    }
}
|
||||
|
||||
/// Commits a finished utterance: clears listening state, shows .thinking,
/// stops recognition (mic released during the gateway round-trip), then
/// sends the text off to be answered and spoken.
private func finalizeTranscript(_ text: String) async {
    self.lastTranscript = ""
    self.lastHeard = nil
    self.phase = .thinking
    await MainActor.run { TalkModeController.shared.updatePhase(.thinking) }
    await self.stopRecognition()
    await self.sendAndSpeak(text)
}
|
||||
|
||||
// MARK: - Gateway + TTS
|
||||
|
||||
/// Sends the finalized transcript to the gateway, waits for the run to reach
/// a terminal state, speaks the assistant's reply, then resumes listening.
/// Every exit path resumes listening so the runtime cannot get stuck in
/// `.thinking` (the original repeated that tail in four places).
private func sendAndSpeak(_ transcript: String) async {
    await self.reloadConfig()
    let prompt = self.buildPrompt(transcript: transcript)
    let runId = UUID().uuidString

    do {
        let response = try await GatewayConnection.shared.chatSend(
            sessionKey: "main",
            message: prompt,
            thinking: "low",
            idempotencyKey: runId,
            attachments: [])
        let completion = await self.waitForChatCompletion(
            runId: response.runId,
            timeoutSeconds: 120)
        if completion == .final,
           let assistantText = await self.latestAssistantText(sessionKey: "main")
        {
            await self.playAssistant(text: assistantText)
        }
    } catch {
        self.logger.error("talk chat.send failed: \(error.localizedDescription, privacy: .public)")
    }
    await self.resumeListening()
}

/// Shared tail for every `sendAndSpeak` exit path: back to listening with a
/// fresh recognition session.
private func resumeListening() async {
    await self.startListening()
    await self.startRecognition()
}
|
||||
|
||||
/// Assembles the gateway prompt: talk-mode instructions, an optional note
/// about where the previous utterance was interrupted (consumed once), a
/// blank separator line, then the user's transcript.
private func buildPrompt(transcript: String) -> String {
    var parts: [String] = [
        "Talk Mode active. Reply in a concise, spoken tone.",
        "You may optionally prefix the response with JSON (first line) to set ElevenLabs voice, e.g. {\"voice\":\"<id>\",\"once\":true}.",
    ]

    if let interrupted = self.lastInterruptedAtSeconds {
        parts.append("Assistant speech interrupted at \(String(format: "%.1f", interrupted))s.")
        // One-shot: consumed by the first prompt after the interruption.
        self.lastInterruptedAtSeconds = nil
    }

    parts.append("")
    parts.append(transcript)
    return parts.joined(separator: "\n")
}
|
||||
|
||||
/// Terminal outcome of a gateway chat run, as observed on the event stream.
private enum ChatCompletionState {
    case final     // run completed; an assistant reply should exist
    case aborted   // run was aborted upstream
    case error     // run failed
    case timeout   // no terminal event arrived within the deadline
}
|
||||
|
||||
/// Subscribes to gateway push events and waits for the given run to reach a
/// terminal chat state, racing against a timeout; the first task to finish
/// wins and the other is cancelled.
private func waitForChatCompletion(runId: String, timeoutSeconds: Int) async -> ChatCompletionState {
    await withTaskGroup(of: ChatCompletionState.self) { group in
        group.addTask { [runId] in
            let stream = GatewayConnection.shared.subscribe()
            for await push in stream {
                if case let .event(evt) = push, evt.event == "chat", let payload = evt.payload {
                    // Payload is an opaque JSON value; round-trip through
                    // Codable to obtain a typed chat event.
                    if let chat = try? JSONDecoder().decode(
                        ClawdisChatEventPayload.self,
                        from: JSONEncoder().encode(payload))
                    {
                        guard chat.runId == runId else { continue }
                        switch chat.state {
                        case .some("final"): return .final
                        case .some("aborted"): return .aborted
                        case .some("error"): return .error
                        default: break
                        }
                    }
                }
            }
            // Stream ended without a terminal state for this run.
            return .timeout
        }
        group.addTask {
            try? await Task.sleep(nanoseconds: UInt64(timeoutSeconds) * 1_000_000_000)
            return .timeout
        }
        let result = await group.next() ?? .timeout
        group.cancelAll()
        return result
    }
}
|
||||
|
||||
/// Fetches chat history and returns the text of the most recent assistant
/// message, or nil when none exists, the text is blank, or the fetch fails.
private func latestAssistantText(sessionKey: String) async -> String? {
    do {
        let history = try await GatewayConnection.shared.chatHistory(sessionKey: sessionKey)
        let messages = history.messages ?? []
        // Messages arrive as opaque JSON; round-trip through Codable to type them.
        let decoded = messages.compactMap { item in
            guard let data = try? JSONEncoder().encode(item) else { return nil }
            return try? JSONDecoder().decode(ClawdisChatMessage.self, from: data)
        }
        guard let assistant = decoded.last(where: { $0.role == "assistant" }) else { return nil }
        let text = assistant.content.compactMap { $0.text }.joined(separator: "\n")
        let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines)
        return trimmed.isEmpty ? nil : trimmed
    } catch {
        self.logger.error("talk history fetch failed: \(error.localizedDescription, privacy: .public)")
        return nil
    }
}
|
||||
|
||||
/// Speaks one assistant reply: parses the optional leading JSON directive
/// (voice/model/delivery overrides), resolves the voice and API key, restarts
/// recognition so barge-in can be detected, synthesizes via ElevenLabs, and
/// plays the audio. Always ends back in `.thinking`; callers resume listening.
private func playAssistant(text: String) async {
    let parse = TalkDirectiveParser.parse(text)
    let directive = parse.directive
    let cleaned = parse.stripped.trimmingCharacters(in: .whitespacesAndNewlines)
    guard !cleaned.isEmpty else { return }

    if !parse.unknownKeys.isEmpty {
        self.logger.warning("talk directive ignored keys: \(parse.unknownKeys.joined(separator: ","), privacy: .public)")
    }

    // "once" overrides apply to this utterance only; otherwise they stick.
    if let voice = directive?.voiceId {
        if directive?.once == true {
            self.logger.info("talk voice override (once) voiceId=\(voice, privacy: .public)")
        } else {
            self.currentVoiceId = voice
            self.voiceOverrideActive = true
            self.logger.info("talk voice override voiceId=\(voice, privacy: .public)")
        }
    }

    if let model = directive?.modelId {
        if directive?.once == true {
            self.logger.info("talk model override (once) modelId=\(model, privacy: .public)")
        } else {
            self.currentModelId = model
            self.modelOverrideActive = true
        }
    }

    // Precedence: this utterance's directive, then sticky override, then default.
    let voiceId =
        directive?.voiceId ??
        self.currentVoiceId ??
        self.defaultVoiceId

    guard let voiceId, !voiceId.isEmpty else {
        self.logger.error("talk missing voiceId; set talk.voiceId or ELEVENLABS_VOICE_ID")
        return
    }

    let apiKey = ProcessInfo.processInfo.environment["ELEVENLABS_API_KEY"] ?? ""
    if apiKey.isEmpty {
        self.logger.error("talk missing ELEVENLABS_API_KEY")
        return
    }

    // Keep the recognizer running while we speak so the user can barge in.
    await self.startRecognition()
    await MainActor.run { TalkModeController.shared.updatePhase(.speaking) }
    self.phase = .speaking
    self.lastSpokenText = cleaned

    let resolvedSpeed = Self.resolveSpeed(
        speed: directive?.speed,
        rateWPM: directive?.rateWPM,
        logger: self.logger)

    let request = ElevenLabsRequest(
        text: cleaned,
        modelId: directive?.modelId ?? self.currentModelId ?? self.defaultModelId,
        outputFormat: directive?.outputFormat ?? self.defaultOutputFormat,
        speed: resolvedSpeed,
        stability: Self.validatedUnit(directive?.stability, name: "stability", logger: self.logger),
        similarity: Self.validatedUnit(directive?.similarity, name: "similarity", logger: self.logger),
        style: Self.validatedUnit(directive?.style, name: "style", logger: self.logger),
        speakerBoost: directive?.speakerBoost,
        seed: Self.validatedSeed(directive?.seed, logger: self.logger),
        normalize: Self.validatedNormalize(directive?.normalize, logger: self.logger),
        language: Self.validatedLanguage(directive?.language, logger: self.logger))

    do {
        let audio = try await ElevenLabsClient(apiKey: apiKey).synthesize(
            voiceId: voiceId,
            request: request)
        // FIX: the original wrapped this in `MainActor.run { await ... }`,
        // which does not compile — `MainActor.run` takes a synchronous
        // closure. `play(data:)` is @MainActor, so awaiting it directly
        // performs the actor hop.
        let result = await TalkAudioPlayer.shared.play(data: audio)
        if !result.finished, let interruptedAt = result.interruptedAt, self.phase == .speaking {
            if self.interruptOnSpeech {
                self.lastInterruptedAtSeconds = interruptedAt
            }
        }
    } catch {
        self.logger.error("talk TTS failed: \(error.localizedDescription, privacy: .public)")
    }

    self.phase = .thinking
    await MainActor.run { TalkModeController.shared.updatePhase(.thinking) }
}
|
||||
|
||||
/// Cuts off in-flight TTS; no-op unless currently speaking. When the user's
/// own speech caused the cut, the playback offset is remembered so the next
/// prompt can mention where the assistant was interrupted.
func stopSpeaking(reason: TalkStopReason) async {
    guard self.phase == .speaking else { return }
    let interruptedAt = await MainActor.run { TalkAudioPlayer.shared.stop() }
    if reason == .speech, let interruptedAt {
        self.lastInterruptedAtSeconds = interruptedAt
    }
    self.phase = .thinking
    await MainActor.run { TalkModeController.shared.updatePhase(.thinking) }
}
|
||||
|
||||
// MARK: - Config
|
||||
|
||||
/// Refreshes cached talk settings from the gateway config. Directive-driven
/// voice/model overrides survive a reload; only the defaults are replaced.
private func reloadConfig() async {
    let cfg = await self.fetchTalkConfig()
    self.defaultVoiceId = cfg.voiceId
    if !self.voiceOverrideActive {
        self.currentVoiceId = cfg.voiceId
    }
    self.defaultModelId = cfg.modelId
    if !self.modelOverrideActive {
        self.currentModelId = cfg.modelId
    }
    self.defaultOutputFormat = cfg.outputFormat
    self.interruptOnSpeech = cfg.interruptOnSpeech
}
|
||||
|
||||
/// Talk settings resolved from gateway config plus environment fallbacks.
private struct TalkRuntimeConfig {
    let voiceId: String?          // ElevenLabs voice; nil when unresolved
    let modelId: String?          // model override from config, if any
    let outputFormat: String?     // TTS output format from config, if any
    let interruptOnSpeech: Bool   // barge-in enabled (defaults to true)
}
|
||||
|
||||
/// Resolves talk settings: gateway config first, then ELEVENLABS_VOICE_ID /
/// SAG_VOICE_ID environment fallbacks. On any gateway error, falls back to
/// environment-only defaults rather than failing.
private func fetchTalkConfig() async -> TalkRuntimeConfig {
    let env = ProcessInfo.processInfo.environment
    let envVoice = env["ELEVENLABS_VOICE_ID"]?.trimmingCharacters(in: .whitespacesAndNewlines)
    let sagVoice = env["SAG_VOICE_ID"]?.trimmingCharacters(in: .whitespacesAndNewlines)
    // Shared environment fallback chain (previously duplicated in both the
    // success and error paths).
    let envFallback =
        (envVoice?.isEmpty == false ? envVoice : nil) ??
        (sagVoice?.isEmpty == false ? sagVoice : nil)

    do {
        let snap: ConfigSnapshot = try await GatewayConnection.shared.requestDecoded(
            method: .configGet,
            params: nil,
            timeoutMs: 8000)
        let talk = snap.config?["talk"]?.dictionaryValue
        let voice = talk?["voiceId"]?.stringValue
        let model = talk?["modelId"]?.stringValue
        let outputFormat = talk?["outputFormat"]?.stringValue
        let interrupt = talk?["interruptOnSpeech"]?.boolValue
        // Config voice wins when non-blank; it is returned as configured
        // (untrimmed), matching the original behavior.
        let resolvedVoice =
            (voice?.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty == false ? voice : nil) ??
            envFallback
        return TalkRuntimeConfig(
            voiceId: resolvedVoice,
            modelId: model,
            outputFormat: outputFormat,
            interruptOnSpeech: interrupt ?? true)
    } catch {
        return TalkRuntimeConfig(
            voiceId: envFallback,
            modelId: nil,
            outputFormat: nil,
            interruptOnSpeech: true)
    }
}
|
||||
|
||||
// MARK: - Audio level handling
|
||||
|
||||
/// Ingests one RMS sample from the mic tap: tracks an adaptive noise floor,
/// timestamps above-threshold speech energy, and drives the overlay level
/// meter while listening.
private func noteAudioLevel(rms: Double) async {
    if self.phase != .listening && self.phase != .speaking { return }
    // Asymmetric smoothing: the floor falls quickly (alpha 0.08) and rises
    // slowly (0.01) so brief speech does not inflate it.
    let alpha: Double = rms < self.noiseFloorRMS ? 0.08 : 0.01
    self.noiseFloorRMS = max(1e-7, self.noiseFloorRMS + (rms - self.noiseFloorRMS) * alpha)

    // Speech threshold: a fixed minimum or a multiple of the noise floor.
    let threshold = max(self.minSpeechRMS, self.noiseFloorRMS * self.speechBoostFactor)
    if rms >= threshold {
        let now = Date()
        self.lastHeard = now
        self.lastSpeechEnergyAt = now
    }

    if self.phase == .listening {
        // Normalize to 0...1 for the overlay meter.
        let clamped = min(1.0, max(0.0, rms / max(self.minSpeechRMS, threshold)))
        await MainActor.run { TalkModeController.shared.updateLevel(clamped) }
    }
}
|
||||
|
||||
/// Root-mean-square amplitude of the buffer's first channel, or nil when the
/// buffer has no float data or no frames.
private static func rmsLevel(buffer: AVAudioPCMBuffer) -> Double? {
    guard let samples = buffer.floatChannelData?.pointee else { return nil }
    let count = Int(buffer.frameLength)
    guard count > 0 else { return nil }
    let energy = (0..<count).reduce(into: 0.0) { acc, index in
        let sample = Double(samples[index])
        acc += sample * sample
    }
    return (energy / Double(count)).squareRoot()
}
|
||||
|
||||
/// Decides whether a transcript heard while speaking is a genuine user
/// barge-in: at least 3 characters, not an echo of our own TTS, backed by
/// mic energy within the last 350 ms, and containing at least one segment
/// with confidence above 0.6.
private func shouldInterrupt(transcript: String, segments: [SFTranscriptionSegment]) async -> Bool {
    let trimmed = transcript.trimmingCharacters(in: .whitespacesAndNewlines)
    guard trimmed.count >= 3 else { return false }
    if self.isLikelyEcho(of: trimmed) { return false }
    let now = Date()
    // Reject stale partials that are not accompanied by recent speech energy.
    if let lastSpeechEnergyAt, now.timeIntervalSince(lastSpeechEnergyAt) > 0.35 {
        return false
    }
    let hasConfidence = segments.contains { $0.confidence > 0.6 }
    return hasConfidence
}
|
||||
|
||||
/// Returns `true` when `transcript` appears to be an echo of the text most
/// recently spoken via TTS (i.e. the recognizer heard our own audio output).
///
/// Matching is a case-insensitive substring test against `lastSpokenText`;
/// the previous short/long split performed the identical check in both
/// branches, so it collapses to a single test.
private func isLikelyEcho(of transcript: String) -> Bool {
    guard let spoken = self.lastSpokenText?.lowercased(), !spoken.isEmpty else { return false }
    return spoken.contains(transcript.lowercased())
}
|
||||
|
||||
/// Resolves a playback speed multiplier from either an explicit `speed` or a
/// words-per-minute rate; `rateWPM` takes precedence when positive, with
/// 175 WPM mapping to 1.0x. Values not strictly inside (0.5, 2.0) are
/// rejected with a warning, and `nil` means "no override".
private static func resolveSpeed(speed: Double?, rateWPM: Int?, logger: Logger) -> Double? {
    if let rateWPM, rateWPM > 0 {
        let multiplier = Double(rateWPM) / 175.0
        guard multiplier > 0.5, multiplier < 2.0 else {
            logger.warning("talk rateWPM out of range: \(rateWPM, privacy: .public)")
            return nil
        }
        return multiplier
    }
    guard let speed else { return nil }
    // Negated form of the original comparison, preserving its exact semantics.
    guard !(speed <= 0.5 || speed >= 2.0) else {
        logger.warning("talk speed out of range: \(speed, privacy: .public)")
        return nil
    }
    return speed
}
|
||||
|
||||
/// Validates an optional unit-interval voice setting. Values outside 0...1
/// log a warning under `name` and are dropped (treated as unset).
private static func validatedUnit(_ value: Double?, name: String, logger: Logger) -> Double? {
    guard let value else { return nil }
    // Negated form of the original range test, preserving its exact semantics.
    guard !(value < 0 || value > 1) else {
        logger.warning("talk \(name, privacy: .public) out of range: \(value, privacy: .public)")
        return nil
    }
    return value
}
|
||||
|
||||
/// Validates an optional RNG seed, accepting only values representable as
/// `UInt32`. Out-of-range values log a warning and are dropped.
private static func validatedSeed(_ value: Int?, logger: Logger) -> UInt32? {
    guard let value else { return nil }
    // UInt32(exactly:) rejects negatives and anything above UInt32.max,
    // replacing the previous magic literal 4294967295 with the type's bounds.
    guard let seed = UInt32(exactly: value) else {
        logger.warning("talk seed out of range: \(value, privacy: .public)")
        return nil
    }
    return seed
}
|
||||
|
||||
/// Validates the text-normalization mode. Only "auto", "on", or "off"
/// (after trimming and lowercasing) are accepted; anything else logs a
/// warning and is dropped.
private static func validatedNormalize(_ value: String?, logger: Logger) -> String? {
    guard let value else { return nil }
    let mode = value.trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
    switch mode {
    case "auto", "on", "off":
        return mode
    default:
        logger.warning("talk normalize invalid: \(mode, privacy: .public)")
        return nil
    }
}
|
||||
|
||||
/// Validates a two-letter language code: exactly two ASCII letters after
/// trimming and lowercasing. Invalid values log a warning and are dropped.
private static func validatedLanguage(_ value: String?, logger: Logger) -> String? {
    guard let value else { return nil }
    let code = value.trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
    let isTwoAsciiLetters = code.count == 2 && code.allSatisfy { ("a"..."z").contains($0) }
    guard isTwoAsciiLetters else {
        logger.warning("talk language invalid: \(code, privacy: .public)")
        return nil
    }
    return code
}
|
||||
}
|
||||
|
||||
/// Parameters for a single ElevenLabs text-to-speech request.
/// Every field except `text` is optional; `nil` fields are omitted from the
/// request payload so the server's defaults apply.
private struct ElevenLabsRequest {
    /// The text to synthesize.
    let text: String
    /// Model identifier, sent as "model_id" when non-empty.
    let modelId: String?
    /// Desired audio output format; server default when nil or empty.
    let outputFormat: String?
    /// Playback speed voice setting, sent as "speed".
    let speed: Double?
    /// Voice stability setting, sent as "stability".
    let stability: Double?
    /// Similarity setting, sent as "similarity_boost".
    let similarity: Double?
    /// Style setting, sent as "style".
    let style: Double?
    /// Speaker-boost toggle, sent as "use_speaker_boost".
    let speakerBoost: Bool?
    /// RNG seed for reproducible synthesis, sent as "seed".
    let seed: UInt32?
    /// Text-normalization mode, sent as "apply_text_normalization".
    let normalize: String?
    /// Two-letter language code, sent as "language_code".
    let language: String?
}
|
||||
|
||||
/// Minimal HTTP client for the ElevenLabs text-to-speech endpoint.
private struct ElevenLabsClient {
    /// API key, sent via the "xi-api-key" header on every request.
    let apiKey: String
    let baseUrl: URL = URL(string: "https://api.elevenlabs.io")!

    /// Synthesizes `request.text` with the given voice and returns the raw
    /// audio bytes.
    ///
    /// - Throws: transport errors from `URLSession`, or an `NSError` in the
    ///   "TalkTTS" domain carrying the HTTP status code and server message
    ///   for any response with status >= 400.
    func synthesize(voiceId: String, request: ElevenLabsRequest) async throws -> Data {
        var url = self.baseUrl
        url.appendPathComponent("v1")
        url.appendPathComponent("text-to-speech")
        url.appendPathComponent(voiceId)

        // Fix: the ElevenLabs TTS endpoint takes `output_format` as a URL
        // query parameter, not a JSON body field — in the body it is ignored
        // and the requested format never applied.
        if let outputFormat = request.outputFormat, !outputFormat.isEmpty,
           var components = URLComponents(url: url, resolvingAgainstBaseURL: false) {
            components.queryItems = [URLQueryItem(name: "output_format", value: outputFormat)]
            url = components.url ?? url
        }

        var payload: [String: Any] = [
            "text": request.text,
        ]
        if let modelId = request.modelId, !modelId.isEmpty {
            payload["model_id"] = modelId
        }
        if let seed = request.seed {
            payload["seed"] = seed
        }
        if let normalize = request.normalize {
            payload["apply_text_normalization"] = normalize
        }
        if let language = request.language {
            payload["language_code"] = language
        }

        // Voice settings are attached only when at least one override is
        // present, so the per-voice server defaults apply otherwise.
        var voiceSettings: [String: Any] = [:]
        if let speed = request.speed { voiceSettings["speed"] = speed }
        if let stability = request.stability { voiceSettings["stability"] = stability }
        if let similarity = request.similarity { voiceSettings["similarity_boost"] = similarity }
        if let style = request.style { voiceSettings["style"] = style }
        if let speakerBoost = request.speakerBoost { voiceSettings["use_speaker_boost"] = speakerBoost }
        if !voiceSettings.isEmpty {
            payload["voice_settings"] = voiceSettings
        }

        let body = try JSONSerialization.data(withJSONObject: payload, options: [])
        var req = URLRequest(url: url)
        req.httpMethod = "POST"
        req.httpBody = body
        req.setValue("application/json", forHTTPHeaderField: "Content-Type")
        req.setValue("audio/mpeg", forHTTPHeaderField: "Accept")
        req.setValue(self.apiKey, forHTTPHeaderField: "xi-api-key")

        let (data, response) = try await URLSession.shared.data(for: req)
        if let http = response as? HTTPURLResponse, http.statusCode >= 400 {
            // Surface the server's error text so failures are diagnosable.
            let message = String(data: data, encoding: .utf8) ?? "unknown"
            throw NSError(domain: "TalkTTS", code: http.statusCode, userInfo: [
                NSLocalizedDescriptionKey: "ElevenLabs failed: \(http.statusCode) \(message)",
            ])
        }
        return data
    }
}
|
||||
8
apps/macos/Sources/Clawdis/TalkModeTypes.swift
Normal file
8
apps/macos/Sources/Clawdis/TalkModeTypes.swift
Normal file
@@ -0,0 +1,8 @@
|
||||
import Foundation
|
||||
|
||||
/// Lifecycle phases of talk mode; the overlay renders a distinct animation
/// per phase, and raw string values are used for logging.
enum TalkModePhase: String {
    /// No active talk interaction.
    case idle
    /// Capturing microphone input.
    case listening
    /// Waiting for a response (no audio in or out).
    case thinking
    /// Playing back synthesized speech.
    case speaking
}
|
||||
119
apps/macos/Sources/Clawdis/TalkOverlay.swift
Normal file
119
apps/macos/Sources/Clawdis/TalkOverlay.swift
Normal file
@@ -0,0 +1,119 @@
|
||||
import AppKit
|
||||
import Observation
|
||||
import OSLog
|
||||
import SwiftUI
|
||||
|
||||
/// Manages the floating talk-mode overlay: a small borderless, non-activating
/// panel pinned to the top-right corner of the main screen that visualizes
/// the current talk phase and microphone level.
@MainActor
@Observable
final class TalkOverlayController {
    static let shared = TalkOverlayController()

    private let logger = Logger(subsystem: "com.steipete.clawdis", category: "talk.overlay")

    /// Observable state rendered by `TalkOverlayView`.
    struct Model {
        var isVisible: Bool = false
        var phase: TalkModePhase = .idle
        var level: Double = 0  // normalized microphone level, 0...1
    }

    var model = Model()
    // Panel and hosting view are created lazily on first `present()`.
    private var window: NSPanel?
    private var hostingView: NSHostingView<TalkOverlayView>?

    // Fixed panel geometry; `padding` insets it from the screen corner.
    private let width: CGFloat = 92
    private let height: CGFloat = 92
    private let padding: CGFloat = 8

    /// Shows the overlay. On first presentation the panel slides up from
    /// 6 pt below its target while fading in; when already visible it is
    /// simply repositioned and re-ordered to front.
    func present() {
        self.ensureWindow()
        // Refresh the root view so it binds to the current controller state.
        self.hostingView?.rootView = TalkOverlayView(controller: self)
        let target = self.targetFrame()

        guard let window else { return }
        if !self.model.isVisible {
            self.model.isVisible = true
            let start = target.offsetBy(dx: 0, dy: -6)
            window.setFrame(start, display: true)
            window.alphaValue = 0
            window.orderFrontRegardless()
            NSAnimationContext.runAnimationGroup { context in
                context.duration = 0.18
                context.timingFunction = CAMediaTimingFunction(name: .easeOut)
                window.animator().setFrame(target, display: true)
                window.animator().alphaValue = 1
            }
        } else {
            window.setFrame(target, display: true)
            window.orderFrontRegardless()
        }
    }

    /// Hides the overlay with a short drift-and-fade animation, then orders
    /// the panel out. With no panel yet, just clears the visibility flag.
    func dismiss() {
        guard let window else {
            self.model.isVisible = false
            return
        }

        let target = window.frame.offsetBy(dx: 6, dy: 6)
        NSAnimationContext.runAnimationGroup { context in
            context.duration = 0.16
            context.timingFunction = CAMediaTimingFunction(name: .easeOut)
            window.animator().setFrame(target, display: true)
            window.animator().alphaValue = 0
        } completionHandler: {
            // Completion may run off the main actor; hop back before
            // touching AppKit and observable state.
            Task { @MainActor in
                window.orderOut(nil)
                self.model.isVisible = false
            }
        }
    }

    /// Records a phase transition (no-op when unchanged) and logs it.
    func updatePhase(_ phase: TalkModePhase) {
        guard self.model.phase != phase else { return }
        self.logger.info("talk overlay phase=\(phase.rawValue, privacy: .public)")
        self.model.phase = phase
    }

    /// Updates the displayed microphone level, clamped to 0...1.
    /// Ignored while the overlay is hidden.
    func updateLevel(_ level: Double) {
        guard self.model.isVisible else { return }
        self.model.level = max(0, min(1, level))
    }

    // MARK: - Private

    /// Lazily creates the overlay panel: transparent, borderless,
    /// non-activating, floating just below the pop-up-menu window level,
    /// visible on all Spaces and over full-screen apps.
    private func ensureWindow() {
        if self.window != nil { return }
        let panel = NSPanel(
            contentRect: NSRect(x: 0, y: 0, width: self.width, height: self.height),
            styleMask: [.nonactivatingPanel, .borderless],
            backing: .buffered,
            defer: false)
        panel.isOpaque = false
        panel.backgroundColor = .clear
        panel.hasShadow = false
        panel.level = NSWindow.Level(rawValue: NSWindow.Level.popUpMenu.rawValue - 4)
        panel.collectionBehavior = [.canJoinAllSpaces, .fullScreenAuxiliary, .transient]
        panel.hidesOnDeactivate = false
        panel.isMovable = false
        panel.isFloatingPanel = true
        panel.becomesKeyOnlyIfNeeded = true
        panel.titleVisibility = .hidden
        panel.titlebarAppearsTransparent = true

        let host = NSHostingView(rootView: TalkOverlayView(controller: self))
        host.translatesAutoresizingMaskIntoConstraints = false
        panel.contentView = host
        self.hostingView = host
        self.window = panel
    }

    /// Computes the panel frame: top-right corner of the main screen's
    /// visible frame, inset by `padding`. Returns `.zero` with no screen.
    private func targetFrame() -> NSRect {
        guard let screen = NSScreen.main else { return .zero }
        let size = NSSize(width: self.width, height: self.height)
        let visible = screen.visibleFrame
        let origin = CGPoint(
            x: visible.maxX - size.width - self.padding,
            y: visible.maxY - size.height - self.padding)
        return NSRect(origin: origin, size: size)
    }
}
|
||||
139
apps/macos/Sources/Clawdis/TalkOverlayView.swift
Normal file
139
apps/macos/Sources/Clawdis/TalkOverlayView.swift
Normal file
@@ -0,0 +1,139 @@
|
||||
import SwiftUI
|
||||
|
||||
/// Root SwiftUI view for the talk overlay: the animated cloud with a small
/// close button layered in the top-leading corner.
struct TalkOverlayView: View {
    var controller: TalkOverlayController
    @State private var hovering = false

    var body: some View {
        ZStack(alignment: .topLeading) {
            self.cloud
            self.closeButton
        }
        .frame(width: 92, height: 92, alignment: .center)
    }

    /// The phase/level-driven cloud; tapping it stops active speech.
    private var cloud: some View {
        TalkCloudView(phase: self.controller.model.phase, level: self.controller.model.level)
            .frame(width: 76, height: 64)
            .contentShape(Rectangle())
            .onTapGesture {
                TalkModeController.shared.stopSpeaking(reason: .userTap)
            }
            .padding(8)
    }

    /// Close button that exits talk mode; brightens while hovered.
    private var closeButton: some View {
        Button {
            TalkModeController.shared.exitTalkMode()
        } label: {
            Image(systemName: "xmark")
                .font(.system(size: 10, weight: .bold))
                .foregroundStyle(Color.white.opacity(self.hovering ? 0.95 : 0.7))
                .frame(width: 18, height: 18)
                .background(Color.black.opacity(self.hovering ? 0.45 : 0.3))
                .clipShape(Circle())
        }
        .buttonStyle(.plain)
        .contentShape(Circle())
        .padding(4)
        .onHover { self.hovering = $0 }
    }
}
|
||||
|
||||
/// The animated cloud graphic. A `TimelineView` clock drives all motion:
/// speaking pulses the cloud and shows expanding rings; thinking shrinks,
/// bobs, and shows pulsing dots; listening scales with the mic level and
/// shows a level-tracking ring.
private struct TalkCloudView: View {
    let phase: TalkModePhase
    /// Normalized microphone level, 0...1.
    let level: Double

    var body: some View {
        TimelineView(.animation) { context in
            // Wall-clock time in seconds drives the periodic motion below.
            let t = context.date.timeIntervalSinceReferenceDate
            // Speaking: ±4% scale pulse.
            let pulse = phase == .speaking ? (1 + 0.04 * sin(t * 6)) : 1
            // Thinking: vertical bob between 1 and 5 pt.
            let sink = phase == .thinking ? (3 + 2 * sin(t * 2)) : 0
            // Listening: grow up to +14% with the mic level.
            let listenScale = phase == .listening ? (1 + CGFloat(self.level) * 0.14) : 1
            // Thinking: slightly shrunken base size.
            let baseScale = phase == .thinking ? 0.94 : 1

            ZStack {
                CloudShape()
                    .fill(self.cloudGradient)
                    .overlay(
                        CloudShape()
                            .stroke(Color.white.opacity(0.35), lineWidth: 0.8))
                    .shadow(color: Color.black.opacity(0.18), radius: 8, x: 0, y: 4)
                    .scaleEffect(baseScale * pulse * listenScale)
                    .offset(y: sink)

                if phase == .listening {
                    // Ring whose size and opacity track the mic level.
                    Circle()
                        .stroke(self.ringGradient, lineWidth: 1)
                        .scaleEffect(1 + CGFloat(self.level) * 0.45)
                        .opacity(0.3 + CGFloat(self.level) * 0.4)
                        .animation(.easeOut(duration: 0.08), value: self.level)
                }

                if phase == .thinking {
                    TalkThinkingDots(time: t)
                        .offset(y: 18)
                }

                if phase == .speaking {
                    TalkSpeakingRings(time: t)
                }
            }
        }
    }

    /// Pale blue top-left to deeper blue bottom-right fill.
    private var cloudGradient: LinearGradient {
        LinearGradient(
            colors: [Color(red: 0.95, green: 0.98, blue: 1.0), Color(red: 0.75, green: 0.88, blue: 1.0)],
            startPoint: .topLeading,
            endPoint: .bottomTrailing)
    }

    /// White ring stroke fading toward the bottom.
    private var ringGradient: LinearGradient {
        LinearGradient(
            colors: [Color.white.opacity(0.6), Color.white.opacity(0.1)],
            startPoint: .top,
            endPoint: .bottom)
    }
}
|
||||
|
||||
/// Three small dots whose opacity cycles in a staggered wave, shown while
/// in the thinking phase. `time` is the driving clock in seconds.
private struct TalkThinkingDots: View {
    let time: TimeInterval

    var body: some View {
        HStack(spacing: 4) {
            ForEach(0..<3, id: \.self) { index in
                // Saw-tooth 0...1 cycle, offset 0.45 s per dot.
                let cycle = (self.time * 2 + Double(index) * 0.45)
                    .truncatingRemainder(dividingBy: 1)
                Circle()
                    .fill(Color.white.opacity(0.75))
                    .frame(width: 5, height: 5)
                    .opacity(0.35 + 0.55 * cycle)
            }
        }
    }
}
|
||||
|
||||
/// Three concentric rings that expand and fade outward while speech is
/// playing. `time` is the driving clock in seconds.
private struct TalkSpeakingRings: View {
    let time: TimeInterval

    var body: some View {
        ZStack {
            ForEach(0..<3, id: \.self) { ring in
                // Saw-tooth 0...1 progress, phase-shifted a third per ring.
                let progress = (self.time * 1.1 + Double(ring) / 3)
                    .truncatingRemainder(dividingBy: 1)
                Circle()
                    .stroke(Color.white.opacity(0.6 - progress * 0.5), lineWidth: 1)
                    .scaleEffect(0.8 + progress * 0.7)
                    .opacity(0.6 - progress * 0.6)
            }
        }
    }
}
|
||||
|
||||
/// A cloud silhouette: a capsule-like base with three overlapping elliptical
/// "puffs" on top, all sized proportionally to the bounding rect.
private struct CloudShape: Shape {
    func path(in rect: CGRect) -> Path {
        let width = rect.width
        let height = rect.height

        // Base: rounded rect occupying the lower 44% of the rect.
        let baseHeight = height * 0.44
        let base = CGRect(x: rect.minX, y: rect.minY + height * 0.46, width: width, height: baseHeight)

        var cloud = Path()
        cloud.addRoundedRect(in: base, cornerSize: CGSize(width: baseHeight / 2, height: baseHeight / 2))
        // Puffs: left, center (largest), and right bumps above the base.
        cloud.addEllipse(in: CGRect(
            x: rect.minX + width * 0.05, y: rect.minY + height * 0.28,
            width: width * 0.36, height: height * 0.36))
        cloud.addEllipse(in: CGRect(
            x: rect.minX + width * 0.28, y: rect.minY + height * 0.05,
            width: width * 0.44, height: height * 0.44))
        cloud.addEllipse(in: CGRect(
            x: rect.minX + width * 0.62, y: rect.minY + height * 0.3,
            width: width * 0.3, height: height * 0.3))
        return cloud
    }
}
|
||||
Reference in New Issue
Block a user