feat: add talk mode across nodes

Peter Steinberger
2025-12-29 23:21:05 +01:00
parent 6927b0fb8d
commit 20d7882033
26 changed files with 3087 additions and 0 deletions

View File

@@ -121,6 +121,15 @@ final class AppState {
forKey: voicePushToTalkEnabledKey) } }
}
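// Menu toggle state; persisted and forwarded to TalkModeController outside previews.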
var talkEnabled: Bool {
didSet {
self.ifNotPreview {
UserDefaults.standard.set(self.talkEnabled, forKey: talkEnabledKey)
Task { await TalkModeController.shared.setEnabled(self.talkEnabled) }
}
}
}
var iconOverride: IconOverrideSelection {
didSet { self.ifNotPreview { UserDefaults.standard.set(self.iconOverride.rawValue, forKey: iconOverrideKey) } }
}
@@ -216,6 +225,7 @@ final class AppState {
.stringArray(forKey: voiceWakeAdditionalLocalesKey) ?? []
self.voicePushToTalkEnabled = UserDefaults.standard
.object(forKey: voicePushToTalkEnabledKey) as? Bool ?? false
self.talkEnabled = UserDefaults.standard.bool(forKey: talkEnabledKey)
if let storedHeartbeats = UserDefaults.standard.object(forKey: heartbeatsEnabledKey) as? Bool {
self.heartbeatsEnabled = storedHeartbeats
} else {
@@ -256,9 +266,13 @@ final class AppState {
if self.swabbleEnabled, !PermissionManager.voiceWakePermissionsGranted() {
self.swabbleEnabled = false
}
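// Talk mode reuses the voice-wake permission check; drop the toggle on launch if permissions were revoked.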
if self.talkEnabled, !PermissionManager.voiceWakePermissionsGranted() {
self.talkEnabled = false
}
if !self.isPreview {
Task { await VoiceWakeRuntime.shared.refresh(state: self) }
Task { await TalkModeController.shared.setEnabled(self.talkEnabled) }
}
}
@@ -312,6 +326,23 @@ final class AppState {
Task { await VoiceWakeRuntime.shared.refresh(state: self) }
}
func setTalkEnabled(_ enabled: Bool) async {
guard voiceWakeSupported else {
self.talkEnabled = false
return
}
self.talkEnabled = enabled
guard !self.isPreview else { return }
if !enabled { return }
if PermissionManager.voiceWakePermissionsGranted() { return }
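// Enabled optimistically above; prompt interactively and revert if the user declines.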
let granted = await PermissionManager.ensureVoiceWakePermissions(interactive: true)
self.talkEnabled = granted
}
// MARK: - Global wake words sync (Gateway-owned)
func applyGlobalVoiceWakeTriggers(_ triggers: [String]) {
@@ -367,6 +398,7 @@ extension AppState {
state.voiceWakeLocaleID = Locale.current.identifier
state.voiceWakeAdditionalLocaleIDs = ["en-US", "de-DE"]
state.voicePushToTalkEnabled = false
state.talkEnabled = false
state.iconOverride = .system
state.heartbeatsEnabled = true
state.connectionMode = .local

View File

@@ -30,6 +30,10 @@ struct ConfigSettings: View {
@State private var browserColorHex: String = "#FF4500"
@State private var browserAttachOnly: Bool = false
// Talk mode settings (stored in ~/.clawdis/clawdis.json under "talk")
@State private var talkVoiceId: String = ""
@State private var talkInterruptOnSpeech: Bool = true
var body: some View {
ScrollView { self.content }
.onChange(of: self.modelCatalogPath) { _, _ in
@@ -53,6 +57,7 @@ struct ConfigSettings: View {
self.header
self.agentSection
self.heartbeatSection
self.talkSection
self.browserSection
Spacer(minLength: 0)
}
@@ -266,6 +271,37 @@ struct ConfigSettings: View {
.frame(maxWidth: .infinity, alignment: .leading)
}
private var talkSection: some View {
GroupBox("Talk Mode") {
Grid(alignment: .leadingFirstTextBaseline, horizontalSpacing: 14, verticalSpacing: 10) {
GridRow {
self.gridLabel("Voice ID")
VStack(alignment: .leading, spacing: 6) {
ComboBox("ElevenLabs voice ID", text: self.$talkVoiceId) {
ForEach(self.talkVoiceSuggestions, id: \.self) { value in
Text(value).tag(value)
}
}
.textFieldStyle(.roundedBorder)
.frame(maxWidth: .infinity)
.onChange(of: self.talkVoiceId) { _, _ in self.autosaveConfig() }
Text("Defaults to ELEVENLABS_VOICE_ID / SAG_VOICE_ID if unset.")
.font(.footnote)
.foregroundStyle(.secondary)
}
}
GridRow {
self.gridLabel("Interrupt")
Toggle("Stop speaking when you start talking", isOn: self.$talkInterruptOnSpeech)
.labelsHidden()
.toggleStyle(.checkbox)
.onChange(of: self.talkInterruptOnSpeech) { _, _ in self.autosaveConfig() }
}
}
}
.frame(maxWidth: .infinity, alignment: .leading)
}
private func gridLabel(_ text: String) -> some View {
Text(text)
.foregroundStyle(.secondary)
@@ -278,6 +314,7 @@ struct ConfigSettings: View {
let heartbeatMinutes = agent?["heartbeatMinutes"] as? Int
let heartbeatBody = agent?["heartbeatBody"] as? String
let browser = parsed["browser"] as? [String: Any]
let talk = parsed["talk"] as? [String: Any]
let loadedModel = (agent?["model"] as? String) ?? ""
if !loadedModel.isEmpty {
@@ -297,6 +334,13 @@ struct ConfigSettings: View {
if let color = browser["color"] as? String, !color.isEmpty { self.browserColorHex = color }
if let attachOnly = browser["attachOnly"] as? Bool { self.browserAttachOnly = attachOnly }
}
if let talk {
if let voice = talk["voiceId"] as? String { self.talkVoiceId = voice }
if let interrupt = talk["interruptOnSpeech"] as? Bool {
self.talkInterruptOnSpeech = interrupt
}
}
}
private func autosaveConfig() {
@@ -312,6 +356,7 @@ struct ConfigSettings: View {
var root = self.loadConfigDict()
var agent = root["agent"] as? [String: Any] ?? [:]
var browser = root["browser"] as? [String: Any] ?? [:]
var talk = root["talk"] as? [String: Any] ?? [:]
let chosenModel = (self.configModel == "__custom__" ? self.customModel : self.configModel)
.trimmingCharacters(in: .whitespacesAndNewlines)
@@ -337,6 +382,15 @@ struct ConfigSettings: View {
browser["attachOnly"] = self.browserAttachOnly
root["browser"] = browser
let trimmedVoice = self.talkVoiceId.trimmingCharacters(in: .whitespacesAndNewlines)
if trimmedVoice.isEmpty {
talk.removeValue(forKey: "voiceId")
} else {
talk["voiceId"] = trimmedVoice
}
talk["interruptOnSpeech"] = self.talkInterruptOnSpeech
root["talk"] = talk
ClawdisConfigFile.saveDict(root)
}
@@ -354,6 +408,20 @@ struct ConfigSettings: View {
return Color(red: r, green: g, blue: b)
}
private var talkVoiceSuggestions: [String] {
let env = ProcessInfo.processInfo.environment
let candidates = [
self.talkVoiceId,
env["ELEVENLABS_VOICE_ID"] ?? "",
env["SAG_VOICE_ID"] ?? "",
]
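// Order-preserving dedupe; blanks are trimmed away so only real IDs surface as suggestions.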
var seen = Set<String>()
return candidates
.map { $0.trimmingCharacters(in: .whitespacesAndNewlines) }
.filter { !$0.isEmpty }
.filter { seen.insert($0).inserted }
}
private var browserPathLabel: String? {
guard self.browserEnabled else { return nil }

View File

@@ -16,6 +16,7 @@ let voiceWakeMicKey = "clawdis.voiceWakeMicID"
let voiceWakeLocaleKey = "clawdis.voiceWakeLocaleID"
let voiceWakeAdditionalLocalesKey = "clawdis.voiceWakeAdditionalLocaleIDs"
let voicePushToTalkEnabledKey = "clawdis.voicePushToTalkEnabled"
let talkEnabledKey = "clawdis.talkEnabled"
let iconOverrideKey = "clawdis.iconOverride"
let connectionModeKey = "clawdis.connectionMode"
let remoteTargetKey = "clawdis.remoteTarget"

View File

@@ -72,6 +72,11 @@ struct MenuContent: View {
if self.showVoiceWakeMicPicker {
self.voiceWakeMicMenu
}
Toggle(isOn: self.talkBinding) {
Label("Talk", systemImage: "bubble.left.and.waveform")
}
.disabled(!voiceWakeSupported)
.opacity(voiceWakeSupported ? 1 : 0.5)
Divider()
Button {
Task { @MainActor in
@@ -331,6 +336,14 @@ struct MenuContent: View {
})
}
private var talkBinding: Binding<Bool> {
Binding(
get: { self.state.talkEnabled },
set: { newValue in
Task { await self.state.setTalkEnabled(newValue) }
})
}
private var showVoiceWakeMicPicker: Bool {
voiceWakeSupported && self.state.swabbleEnabled
}

View File

@@ -0,0 +1,54 @@
import AVFoundation
import Foundation
import OSLog
@MainActor
final class TalkAudioPlayer: NSObject, AVAudioPlayerDelegate {
static let shared = TalkAudioPlayer()
private let logger = Logger(subsystem: "com.steipete.clawdis", category: "talk.tts")
private var player: AVAudioPlayer?
private var continuation: CheckedContinuation<TalkPlaybackResult, Never>?
func play(data: Data) async -> TalkPlaybackResult {
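// Tear down any in-flight playback first; its awaiting caller resumes as interrupted.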
self.stopInternal(interrupted: true)
do {
let player = try AVAudioPlayer(data: data)
self.player = player
player.delegate = self
player.prepareToPlay()
player.play()
return await withCheckedContinuation { continuation in
self.continuation = continuation
}
} catch {
self.logger.error("talk audio player failed: \(error.localizedDescription, privacy: .public)")
return TalkPlaybackResult(finished: false, interruptedAt: nil)
}
}
func stop() -> Double? {
guard let player else { return nil }
let time = player.currentTime
self.stopInternal(interrupted: true, interruptedAt: time)
return time
}
func audioPlayerDidFinishPlaying(_: AVAudioPlayer, successfully flag: Bool) {
self.stopInternal(interrupted: !flag)
}
private func stopInternal(interrupted: Bool, interruptedAt: Double? = nil) {
self.player?.stop()
self.player = nil
if let continuation {
self.continuation = nil
continuation.resume(returning: TalkPlaybackResult(finished: !interrupted, interruptedAt: interruptedAt))
}
}
}
struct TalkPlaybackResult: Sendable {
let finished: Bool
let interruptedAt: Double?
}

View File

@@ -0,0 +1,42 @@
import Observation
import OSLog
@MainActor
@Observable
final class TalkModeController {
static let shared = TalkModeController()
private let logger = Logger(subsystem: "com.steipete.clawdis", category: "talk.controller")
func setEnabled(_ enabled: Bool) async {
self.logger.info("talk enabled=\(enabled)")
if enabled {
TalkOverlayController.shared.present()
} else {
TalkOverlayController.shared.dismiss()
}
await TalkModeRuntime.shared.setEnabled(enabled)
}
func updatePhase(_ phase: TalkModePhase) {
TalkOverlayController.shared.updatePhase(phase)
}
func updateLevel(_ level: Double) {
TalkOverlayController.shared.updateLevel(level)
}
func stopSpeaking(reason: TalkStopReason = .userTap) {
Task { await TalkModeRuntime.shared.stopSpeaking(reason: reason) }
}
func exitTalkMode() {
Task { await AppStateStore.shared.setTalkEnabled(false) }
}
}
enum TalkStopReason {
case userTap
case speech
case manual
}

View File

@@ -0,0 +1,684 @@
import AVFoundation
import ClawdisChatUI
import ClawdisKit
import Foundation
import OSLog
import Speech
actor TalkModeRuntime {
static let shared = TalkModeRuntime()
private let logger = Logger(subsystem: "com.steipete.clawdis", category: "talk.runtime")
private var recognizer: SFSpeechRecognizer?
private var audioEngine: AVAudioEngine?
private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest?
private var recognitionTask: SFSpeechRecognitionTask?
private var recognitionGeneration: Int = 0
private var captureTask: Task<Void, Never>?
private var silenceTask: Task<Void, Never>?
private var phase: TalkModePhase = .idle
private var isEnabled = false
private var lastHeard: Date?
private var noiseFloorRMS: Double = 1e-4
private var lastTranscript: String = ""
private var lastSpeechEnergyAt: Date?
private var defaultVoiceId: String?
private var currentVoiceId: String?
private var defaultModelId: String?
private var currentModelId: String?
private var voiceOverrideActive = false
private var modelOverrideActive = false
private var defaultOutputFormat: String?
private var interruptOnSpeech: Bool = true
private var lastInterruptedAtSeconds: Double?
private var lastSpokenText: String?
private let silenceWindow: TimeInterval = 0.7
private let minSpeechRMS: Double = 1e-3
private let speechBoostFactor: Double = 6.0
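// Tuning: finalize a transcript after 0.7 s of silence; audio counts as speech once RMS exceeds max(minSpeechRMS, noiseFloor * speechBoostFactor).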
// MARK: - Lifecycle
func setEnabled(_ enabled: Bool) async {
guard enabled != self.isEnabled else { return }
self.isEnabled = enabled
if enabled {
await self.start()
} else {
await self.stop()
}
}
private func start() async {
guard voiceWakeSupported else { return }
guard PermissionManager.voiceWakePermissionsGranted() else {
self.logger.debug("talk runtime not starting: permissions missing")
return
}
await self.reloadConfig()
await self.startRecognition()
self.phase = .listening
await MainActor.run { TalkModeController.shared.updatePhase(.listening) }
self.startSilenceMonitor()
}
private func stop() async {
self.captureTask?.cancel()
self.captureTask = nil
self.silenceTask?.cancel()
self.silenceTask = nil
self.lastTranscript = ""
self.lastHeard = nil
self.lastSpeechEnergyAt = nil
self.phase = .idle
await self.stopRecognition()
await self.stopSpeaking(reason: .manual)
await MainActor.run {
TalkModeController.shared.updateLevel(0)
TalkModeController.shared.updatePhase(.idle)
}
}
// MARK: - Speech recognition
private struct RecognitionUpdate {
let transcript: String?
let segments: [SFTranscriptionSegment]
let isFinal: Bool
let error: Error?
let generation: Int
}
private func startRecognition() async {
await self.stopRecognition()
self.recognitionGeneration &+= 1
let generation = self.recognitionGeneration
let locale = await MainActor.run { AppStateStore.shared.voiceWakeLocaleID }
self.recognizer = SFSpeechRecognizer(locale: Locale(identifier: locale))
guard let recognizer, recognizer.isAvailable else {
self.logger.error("talk recognizer unavailable")
return
}
self.recognitionRequest = SFSpeechAudioBufferRecognitionRequest()
self.recognitionRequest?.shouldReportPartialResults = true
guard let request = self.recognitionRequest else { return }
if self.audioEngine == nil {
self.audioEngine = AVAudioEngine()
}
guard let audioEngine = self.audioEngine else { return }
let input = audioEngine.inputNode
let format = input.outputFormat(forBus: 0)
input.removeTap(onBus: 0)
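// A single tap feeds both consumers: buffers go to the recognizer, RMS levels to the runtime meter.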
input.installTap(onBus: 0, bufferSize: 2048, format: format) { [weak self, weak request] buffer, _ in
request?.append(buffer)
if let rms = Self.rmsLevel(buffer: buffer) {
Task.detached { [weak self] in
await self?.noteAudioLevel(rms: rms)
}
}
}
audioEngine.prepare()
do {
try audioEngine.start()
} catch {
self.logger.error("talk audio engine start failed: \(error.localizedDescription, privacy: .public)")
return
}
self.recognitionTask = recognizer.recognitionTask(with: request) { [weak self, generation] result, error in
guard let self else { return }
let transcript = result?.bestTranscription.formattedString
let update = RecognitionUpdate(
transcript: transcript,
segments: result?.bestTranscription.segments ?? [],
isFinal: result?.isFinal ?? false,
error: error,
generation: generation)
Task { await self.handleRecognition(update) }
}
}
private func stopRecognition() async {
self.recognitionGeneration &+= 1
self.recognitionTask?.cancel()
self.recognitionTask = nil
self.recognitionRequest?.endAudio()
self.recognitionRequest = nil
self.audioEngine?.inputNode.removeTap(onBus: 0)
self.audioEngine?.stop()
self.audioEngine = nil
self.recognizer = nil
}
private func handleRecognition(_ update: RecognitionUpdate) async {
guard update.generation == self.recognitionGeneration else { return }
if let error = update.error {
self.logger.debug("talk recognition error: \(error.localizedDescription, privacy: .public)")
}
guard let transcript = update.transcript else { return }
let trimmed = transcript.trimmingCharacters(in: .whitespacesAndNewlines)
if self.phase == .speaking, self.interruptOnSpeech {
if await self.shouldInterrupt(transcript: trimmed, segments: update.segments) {
await self.stopSpeaking(reason: .speech)
self.lastTranscript = ""
self.lastHeard = nil
await self.startListening()
}
return
}
guard self.phase == .listening else { return }
if !trimmed.isEmpty {
self.lastTranscript = trimmed
self.lastHeard = Date()
}
if update.isFinal {
self.lastTranscript = trimmed
}
}
// MARK: - Silence handling
private func startSilenceMonitor() {
self.silenceTask?.cancel()
self.silenceTask = Task { [weak self] in
guard let self else { return }
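// Poll every 200 ms; checkSilence finalizes the transcript once the silence window elapses.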
while self.isEnabled {
try? await Task.sleep(nanoseconds: 200_000_000)
await self.checkSilence()
}
}
}
private func checkSilence() async {
guard self.phase == .listening else { return }
let transcript = self.lastTranscript.trimmingCharacters(in: .whitespacesAndNewlines)
guard !transcript.isEmpty else { return }
guard let lastHeard else { return }
let elapsed = Date().timeIntervalSince(lastHeard)
guard elapsed >= self.silenceWindow else { return }
await self.finalizeTranscript(transcript)
}
private func startListening() async {
self.phase = .listening
self.lastTranscript = ""
self.lastHeard = nil
await MainActor.run {
TalkModeController.shared.updatePhase(.listening)
TalkModeController.shared.updateLevel(0)
}
}
private func finalizeTranscript(_ text: String) async {
self.lastTranscript = ""
self.lastHeard = nil
self.phase = .thinking
await MainActor.run { TalkModeController.shared.updatePhase(.thinking) }
await self.stopRecognition()
await self.sendAndSpeak(text)
}
// MARK: - Gateway + TTS
private func sendAndSpeak(_ transcript: String) async {
await self.reloadConfig()
let prompt = self.buildPrompt(transcript: transcript)
let runId = UUID().uuidString
do {
let response = try await GatewayConnection.shared.chatSend(
sessionKey: "main",
message: prompt,
thinking: "low",
idempotencyKey: runId,
attachments: [])
let completion = await self.waitForChatCompletion(
runId: response.runId,
timeoutSeconds: 120)
guard completion == .final else {
await self.startListening()
await self.startRecognition()
return
}
guard let assistantText = await self.latestAssistantText(sessionKey: "main") else {
await self.startListening()
await self.startRecognition()
return
}
await self.playAssistant(text: assistantText)
await self.startListening()
await self.startRecognition()
return
} catch {
self.logger.error("talk chat.send failed: \(error.localizedDescription, privacy: .public)")
await self.startListening()
await self.startRecognition()
return
}
}
private func buildPrompt(transcript: String) -> String {
var lines: [String] = [
"Talk Mode active. Reply in a concise, spoken tone.",
"You may optionally prefix the response with JSON (first line) to set ElevenLabs voice, e.g. {\"voice\":\"<id>\",\"once\":true}.",
]
if let interrupted = self.lastInterruptedAtSeconds {
let formatted = String(format: "%.1f", interrupted)
lines.append("Assistant speech interrupted at \(formatted)s.")
self.lastInterruptedAtSeconds = nil
}
lines.append("")
lines.append(transcript)
return lines.joined(separator: "\n")
}
private enum ChatCompletionState {
case final
case aborted
case error
case timeout
}
private func waitForChatCompletion(runId: String, timeoutSeconds: Int) async -> ChatCompletionState {
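// Race the gateway push stream against a timeout; the first child task to finish wins and the rest are cancelled.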
await withTaskGroup(of: ChatCompletionState.self) { group in
group.addTask { [runId] in
let stream = GatewayConnection.shared.subscribe()
for await push in stream {
if case let .event(evt) = push, evt.event == "chat", let payload = evt.payload {
if let chat = try? JSONDecoder().decode(
ClawdisChatEventPayload.self,
from: JSONEncoder().encode(payload))
{
guard chat.runId == runId else { continue }
switch chat.state {
case .some("final"): return .final
case .some("aborted"): return .aborted
case .some("error"): return .error
default: break
}
}
}
}
return .timeout
}
group.addTask {
try? await Task.sleep(nanoseconds: UInt64(timeoutSeconds) * 1_000_000_000)
return .timeout
}
let result = await group.next() ?? .timeout
group.cancelAll()
return result
}
}
private func latestAssistantText(sessionKey: String) async -> String? {
do {
let history = try await GatewayConnection.shared.chatHistory(sessionKey: sessionKey)
let messages = history.messages ?? []
let decoded = messages.compactMap { item in
guard let data = try? JSONEncoder().encode(item) else { return nil }
return try? JSONDecoder().decode(ClawdisChatMessage.self, from: data)
}
guard let assistant = decoded.last(where: { $0.role == "assistant" }) else { return nil }
let text = assistant.content.compactMap { $0.text }.joined(separator: "\n")
let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines)
return trimmed.isEmpty ? nil : trimmed
} catch {
self.logger.error("talk history fetch failed: \(error.localizedDescription, privacy: .public)")
return nil
}
}
private func playAssistant(text: String) async {
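// The assistant may prefix its reply with a one-line JSON directive (voice/model/speed overrides); parse and strip it before TTS.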
let parse = TalkDirectiveParser.parse(text)
let directive = parse.directive
let cleaned = parse.stripped.trimmingCharacters(in: .whitespacesAndNewlines)
guard !cleaned.isEmpty else { return }
if !parse.unknownKeys.isEmpty {
self.logger.warning("talk directive ignored keys: \(parse.unknownKeys.joined(separator: ","), privacy: .public)")
}
if let voice = directive?.voiceId {
if directive?.once == true {
self.logger.info("talk voice override (once) voiceId=\(voice, privacy: .public)")
} else {
self.currentVoiceId = voice
self.voiceOverrideActive = true
self.logger.info("talk voice override voiceId=\(voice, privacy: .public)")
}
}
if let model = directive?.modelId {
if directive?.once == true {
self.logger.info("talk model override (once) modelId=\(model, privacy: .public)")
} else {
self.currentModelId = model
self.modelOverrideActive = true
}
}
let voiceId =
directive?.voiceId ??
self.currentVoiceId ??
self.defaultVoiceId
guard let voiceId, !voiceId.isEmpty else {
self.logger.error("talk missing voiceId; set talk.voiceId or ELEVENLABS_VOICE_ID")
return
}
let apiKey = ProcessInfo.processInfo.environment["ELEVENLABS_API_KEY"] ?? ""
if apiKey.isEmpty {
self.logger.error("talk missing ELEVENLABS_API_KEY")
return
}
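// Restart recognition before speaking so barge-in (interruptOnSpeech) can be detected.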
await self.startRecognition()
await MainActor.run { TalkModeController.shared.updatePhase(.speaking) }
self.phase = .speaking
self.lastSpokenText = cleaned
let resolvedSpeed = Self.resolveSpeed(
speed: directive?.speed,
rateWPM: directive?.rateWPM,
logger: self.logger)
let request = ElevenLabsRequest(
text: cleaned,
modelId: directive?.modelId ?? self.currentModelId ?? self.defaultModelId,
outputFormat: directive?.outputFormat ?? self.defaultOutputFormat,
speed: resolvedSpeed,
stability: Self.validatedUnit(directive?.stability, name: "stability", logger: self.logger),
similarity: Self.validatedUnit(directive?.similarity, name: "similarity", logger: self.logger),
style: Self.validatedUnit(directive?.style, name: "style", logger: self.logger),
speakerBoost: directive?.speakerBoost,
seed: Self.validatedSeed(directive?.seed, logger: self.logger),
normalize: Self.validatedNormalize(directive?.normalize, logger: self.logger),
language: Self.validatedLanguage(directive?.language, logger: self.logger))
do {
let audio = try await ElevenLabsClient(apiKey: apiKey).synthesize(
voiceId: voiceId,
request: request)
// TalkAudioPlayer is @MainActor; awaiting its async method hops to the main actor directly.
let result = await TalkAudioPlayer.shared.play(data: audio)
if !result.finished, let interruptedAt = result.interruptedAt, self.phase == .speaking {
if self.interruptOnSpeech {
self.lastInterruptedAtSeconds = interruptedAt
}
}
} catch {
self.logger.error("talk TTS failed: \(error.localizedDescription, privacy: .public)")
}
self.phase = .thinking
await MainActor.run { TalkModeController.shared.updatePhase(.thinking) }
}
func stopSpeaking(reason: TalkStopReason) async {
guard self.phase == .speaking else { return }
let interruptedAt = await MainActor.run { TalkAudioPlayer.shared.stop() }
if reason == .speech, let interruptedAt {
self.lastInterruptedAtSeconds = interruptedAt
}
self.phase = .thinking
await MainActor.run { TalkModeController.shared.updatePhase(.thinking) }
}
// MARK: - Config
private func reloadConfig() async {
let cfg = await self.fetchTalkConfig()
self.defaultVoiceId = cfg.voiceId
if !self.voiceOverrideActive {
self.currentVoiceId = cfg.voiceId
}
self.defaultModelId = cfg.modelId
if !self.modelOverrideActive {
self.currentModelId = cfg.modelId
}
self.defaultOutputFormat = cfg.outputFormat
self.interruptOnSpeech = cfg.interruptOnSpeech
}
private struct TalkRuntimeConfig {
let voiceId: String?
let modelId: String?
let outputFormat: String?
let interruptOnSpeech: Bool
}
private func fetchTalkConfig() async -> TalkRuntimeConfig {
let env = ProcessInfo.processInfo.environment
let envVoice = env["ELEVENLABS_VOICE_ID"]?.trimmingCharacters(in: .whitespacesAndNewlines)
let sagVoice = env["SAG_VOICE_ID"]?.trimmingCharacters(in: .whitespacesAndNewlines)
do {
let snap: ConfigSnapshot = try await GatewayConnection.shared.requestDecoded(
method: .configGet,
params: nil,
timeoutMs: 8000)
let talk = snap.config?["talk"]?.dictionaryValue
let voice = talk?["voiceId"]?.stringValue
let model = talk?["modelId"]?.stringValue
let outputFormat = talk?["outputFormat"]?.stringValue
let interrupt = talk?["interruptOnSpeech"]?.boolValue
let resolvedVoice =
(voice?.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty == false ? voice : nil) ??
(envVoice?.isEmpty == false ? envVoice : nil) ??
(sagVoice?.isEmpty == false ? sagVoice : nil)
return TalkRuntimeConfig(
voiceId: resolvedVoice,
modelId: model,
outputFormat: outputFormat,
interruptOnSpeech: interrupt ?? true)
} catch {
let resolvedVoice =
(envVoice?.isEmpty == false ? envVoice : nil) ??
(sagVoice?.isEmpty == false ? sagVoice : nil)
return TalkRuntimeConfig(
voiceId: resolvedVoice,
modelId: nil,
outputFormat: nil,
interruptOnSpeech: true)
}
}
// MARK: - Audio level handling
private func noteAudioLevel(rms: Double) async {
if self.phase != .listening && self.phase != .speaking { return }
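// Asymmetric EMA: the noise floor falls quickly (alpha 0.08) but rises slowly (alpha 0.01), so speech bursts do not inflate it.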
let alpha: Double = rms < self.noiseFloorRMS ? 0.08 : 0.01
self.noiseFloorRMS = max(1e-7, self.noiseFloorRMS + (rms - self.noiseFloorRMS) * alpha)
let threshold = max(self.minSpeechRMS, self.noiseFloorRMS * self.speechBoostFactor)
if rms >= threshold {
let now = Date()
self.lastHeard = now
self.lastSpeechEnergyAt = now
}
if self.phase == .listening {
let clamped = min(1.0, max(0.0, rms / max(self.minSpeechRMS, threshold)))
await MainActor.run { TalkModeController.shared.updateLevel(clamped) }
}
}
private static func rmsLevel(buffer: AVAudioPCMBuffer) -> Double? {
guard let channelData = buffer.floatChannelData?.pointee else { return nil }
let frameCount = Int(buffer.frameLength)
guard frameCount > 0 else { return nil }
var sum: Double = 0
for i in 0..<frameCount {
let sample = Double(channelData[i])
sum += sample * sample
}
return sqrt(sum / Double(frameCount))
}
private func shouldInterrupt(transcript: String, segments: [SFTranscriptionSegment]) async -> Bool {
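// Barge-in heuristics: at least 3 characters, not an echo of our own TTS, speech energy within the last 0.35 s, and one segment above 0.6 confidence.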
let trimmed = transcript.trimmingCharacters(in: .whitespacesAndNewlines)
guard trimmed.count >= 3 else { return false }
if self.isLikelyEcho(of: trimmed) { return false }
let now = Date()
if let lastSpeechEnergyAt, now.timeIntervalSince(lastSpeechEnergyAt) > 0.35 {
return false
}
let hasConfidence = segments.contains { $0.confidence > 0.6 }
return hasConfidence
}
private func isLikelyEcho(of transcript: String) -> Bool {
guard let spoken = self.lastSpokenText?.lowercased(), !spoken.isEmpty else { return false }
let probe = transcript.lowercased()
// Treat the transcript as echo when our own spoken text contains it verbatim.
return spoken.contains(probe)
}
private static func resolveSpeed(speed: Double?, rateWPM: Int?, logger: Logger) -> Double? {
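// rateWPM takes precedence over speed; ~175 wpm is treated as the 1.0x baseline.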
if let rateWPM, rateWPM > 0 {
let resolved = Double(rateWPM) / 175.0
if resolved <= 0.5 || resolved >= 2.0 {
logger.warning("talk rateWPM out of range: \(rateWPM, privacy: .public)")
return nil
}
return resolved
}
if let speed {
if speed <= 0.5 || speed >= 2.0 {
logger.warning("talk speed out of range: \(speed, privacy: .public)")
return nil
}
return speed
}
return nil
}
private static func validatedUnit(_ value: Double?, name: String, logger: Logger) -> Double? {
guard let value else { return nil }
if value < 0 || value > 1 {
logger.warning("talk \(name, privacy: .public) out of range: \(value, privacy: .public)")
return nil
}
return value
}
private static func validatedSeed(_ value: Int?, logger: Logger) -> UInt32? {
guard let value else { return nil }
if value < 0 || value > Int(UInt32.max) {
logger.warning("talk seed out of range: \(value, privacy: .public)")
return nil
}
return UInt32(value)
}
private static func validatedNormalize(_ value: String?, logger: Logger) -> String? {
guard let value else { return nil }
let normalized = value.trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
guard ["auto", "on", "off"].contains(normalized) else {
logger.warning("talk normalize invalid: \(normalized, privacy: .public)")
return nil
}
return normalized
}
private static func validatedLanguage(_ value: String?, logger: Logger) -> String? {
guard let value else { return nil }
let normalized = value.trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
guard normalized.count == 2, normalized.allSatisfy({ $0 >= "a" && $0 <= "z" }) else {
logger.warning("talk language invalid: \(normalized, privacy: .public)")
return nil
}
return normalized
}
}
private struct ElevenLabsRequest {
let text: String
let modelId: String?
let outputFormat: String?
let speed: Double?
let stability: Double?
let similarity: Double?
let style: Double?
let speakerBoost: Bool?
let seed: UInt32?
let normalize: String?
let language: String?
}
private struct ElevenLabsClient {
let apiKey: String
let baseUrl: URL = URL(string: "https://api.elevenlabs.io")!
func synthesize(voiceId: String, request: ElevenLabsRequest) async throws -> Data {
var url = self.baseUrl
url.appendPathComponent("v1")
url.appendPathComponent("text-to-speech")
url.appendPathComponent(voiceId)
var payload: [String: Any] = [
"text": request.text,
]
if let modelId = request.modelId, !modelId.isEmpty {
payload["model_id"] = modelId
}
if let outputFormat = request.outputFormat, !outputFormat.isEmpty {
payload["output_format"] = outputFormat
}
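// Note: ElevenLabs documents output_format as a query parameter; a body field here may be ignored by the API.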
if let seed = request.seed {
payload["seed"] = seed
}
if let normalize = request.normalize {
payload["apply_text_normalization"] = normalize
}
if let language = request.language {
payload["language_code"] = language
}
var voiceSettings: [String: Any] = [:]
if let speed = request.speed { voiceSettings["speed"] = speed }
if let stability = request.stability { voiceSettings["stability"] = stability }
if let similarity = request.similarity { voiceSettings["similarity_boost"] = similarity }
if let style = request.style { voiceSettings["style"] = style }
if let speakerBoost = request.speakerBoost { voiceSettings["use_speaker_boost"] = speakerBoost }
if !voiceSettings.isEmpty {
payload["voice_settings"] = voiceSettings
}
let body = try JSONSerialization.data(withJSONObject: payload, options: [])
var req = URLRequest(url: url)
req.httpMethod = "POST"
req.httpBody = body
req.setValue("application/json", forHTTPHeaderField: "Content-Type")
req.setValue("audio/mpeg", forHTTPHeaderField: "Accept")
req.setValue(self.apiKey, forHTTPHeaderField: "xi-api-key")
let (data, response) = try await URLSession.shared.data(for: req)
if let http = response as? HTTPURLResponse, http.statusCode >= 400 {
let message = String(data: data, encoding: .utf8) ?? "unknown"
throw NSError(domain: "TalkTTS", code: http.statusCode, userInfo: [
NSLocalizedDescriptionKey: "ElevenLabs failed: \(http.statusCode) \(message)",
])
}
return data
}
}

View File

@@ -0,0 +1,8 @@
import Foundation
enum TalkModePhase: String {
case idle
case listening
case thinking
case speaking
}

View File

@@ -0,0 +1,119 @@
import AppKit
import Observation
import OSLog
import SwiftUI
@MainActor
@Observable
final class TalkOverlayController {
static let shared = TalkOverlayController()
private let logger = Logger(subsystem: "com.steipete.clawdis", category: "talk.overlay")
struct Model {
var isVisible: Bool = false
var phase: TalkModePhase = .idle
var level: Double = 0
}
var model = Model()
private var window: NSPanel?
private var hostingView: NSHostingView<TalkOverlayView>?
private let width: CGFloat = 92
private let height: CGFloat = 92
private let padding: CGFloat = 8
func present() {
self.ensureWindow()
self.hostingView?.rootView = TalkOverlayView(controller: self)
let target = self.targetFrame()
guard let window else { return }
if !self.model.isVisible {
self.model.isVisible = true
let start = target.offsetBy(dx: 0, dy: -6)
window.setFrame(start, display: true)
window.alphaValue = 0
window.orderFrontRegardless()
NSAnimationContext.runAnimationGroup { context in
context.duration = 0.18
context.timingFunction = CAMediaTimingFunction(name: .easeOut)
window.animator().setFrame(target, display: true)
window.animator().alphaValue = 1
}
} else {
window.setFrame(target, display: true)
window.orderFrontRegardless()
}
}
func dismiss() {
guard let window else {
self.model.isVisible = false
return
}
let target = window.frame.offsetBy(dx: 6, dy: 6)
NSAnimationContext.runAnimationGroup { context in
context.duration = 0.16
context.timingFunction = CAMediaTimingFunction(name: .easeOut)
window.animator().setFrame(target, display: true)
window.animator().alphaValue = 0
} completionHandler: {
Task { @MainActor in
window.orderOut(nil)
self.model.isVisible = false
}
}
}
func updatePhase(_ phase: TalkModePhase) {
guard self.model.phase != phase else { return }
self.logger.info("talk overlay phase=\(phase.rawValue, privacy: .public)")
self.model.phase = phase
}
func updateLevel(_ level: Double) {
guard self.model.isVisible else { return }
self.model.level = max(0, min(1, level))
}
// MARK: - Private
private func ensureWindow() {
if self.window != nil { return }
let panel = NSPanel(
contentRect: NSRect(x: 0, y: 0, width: self.width, height: self.height),
styleMask: [.nonactivatingPanel, .borderless],
backing: .buffered,
defer: false)
panel.isOpaque = false
panel.backgroundColor = .clear
panel.hasShadow = false
panel.level = NSWindow.Level(rawValue: NSWindow.Level.popUpMenu.rawValue - 4)
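// Sit a few levels below pop-up menus so the overlay never covers an open menu.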
panel.collectionBehavior = [.canJoinAllSpaces, .fullScreenAuxiliary, .transient]
panel.hidesOnDeactivate = false
panel.isMovable = false
panel.isFloatingPanel = true
panel.becomesKeyOnlyIfNeeded = true
panel.titleVisibility = .hidden
panel.titlebarAppearsTransparent = true
let host = NSHostingView(rootView: TalkOverlayView(controller: self))
host.translatesAutoresizingMaskIntoConstraints = false
panel.contentView = host
self.hostingView = host
self.window = panel
}
private func targetFrame() -> NSRect {
guard let screen = NSScreen.main else { return .zero }
let size = NSSize(width: self.width, height: self.height)
let visible = screen.visibleFrame
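// Pin the panel to the top-right corner of the main screen's visible area.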
let origin = CGPoint(
x: visible.maxX - size.width - self.padding,
y: visible.maxY - size.height - self.padding)
return NSRect(origin: origin, size: size)
}
}

View File

@@ -0,0 +1,139 @@
import SwiftUI
struct TalkOverlayView: View {
var controller: TalkOverlayController
@State private var hovering = false
var body: some View {
ZStack(alignment: .topLeading) {
TalkCloudView(phase: self.controller.model.phase, level: self.controller.model.level)
.frame(width: 76, height: 64)
.contentShape(Rectangle())
.onTapGesture {
TalkModeController.shared.stopSpeaking(reason: .userTap)
}
.padding(8)
Button {
TalkModeController.shared.exitTalkMode()
} label: {
Image(systemName: "xmark")
.font(.system(size: 10, weight: .bold))
.foregroundStyle(Color.white.opacity(self.hovering ? 0.95 : 0.7))
.frame(width: 18, height: 18)
.background(Color.black.opacity(self.hovering ? 0.45 : 0.3))
.clipShape(Circle())
}
.buttonStyle(.plain)
.contentShape(Circle())
.padding(4)
.onHover { self.hovering = $0 }
}
.frame(width: 92, height: 92, alignment: .center)
}
}
private struct TalkCloudView: View {
let phase: TalkModePhase
let level: Double
var body: some View {
TimelineView(.animation) { context in
let t = context.date.timeIntervalSinceReferenceDate
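// Phase-driven motion: pulse while speaking, bob downward while thinking, scale with the mic level while listening.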
let pulse = phase == .speaking ? (1 + 0.04 * sin(t * 6)) : 1
let sink = phase == .thinking ? (3 + 2 * sin(t * 2)) : 0
let listenScale = phase == .listening ? (1 + CGFloat(self.level) * 0.14) : 1
let baseScale = phase == .thinking ? 0.94 : 1
ZStack {
CloudShape()
.fill(self.cloudGradient)
.overlay(
CloudShape()
.stroke(Color.white.opacity(0.35), lineWidth: 0.8))
.shadow(color: Color.black.opacity(0.18), radius: 8, x: 0, y: 4)
.scaleEffect(baseScale * pulse * listenScale)
.offset(y: sink)
if phase == .listening {
Circle()
.stroke(self.ringGradient, lineWidth: 1)
.scaleEffect(1 + CGFloat(self.level) * 0.45)
.opacity(0.3 + CGFloat(self.level) * 0.4)
.animation(.easeOut(duration: 0.08), value: self.level)
}
if phase == .thinking {
TalkThinkingDots(time: t)
.offset(y: 18)
}
if phase == .speaking {
TalkSpeakingRings(time: t)
}
}
}
}
private var cloudGradient: LinearGradient {
LinearGradient(
colors: [Color(red: 0.95, green: 0.98, blue: 1.0), Color(red: 0.75, green: 0.88, blue: 1.0)],
startPoint: .topLeading,
endPoint: .bottomTrailing)
}
private var ringGradient: LinearGradient {
LinearGradient(
colors: [Color.white.opacity(0.6), Color.white.opacity(0.1)],
startPoint: .top,
endPoint: .bottom)
}
}
private struct TalkThinkingDots: View {
let time: TimeInterval
var body: some View {
HStack(spacing: 4) {
ForEach(0..<3, id: \.self) { idx in
let phase = (time * 2 + Double(idx) * 0.45).truncatingRemainder(dividingBy: 1)
Circle()
.fill(Color.white.opacity(0.75))
.frame(width: 5, height: 5)
.opacity(0.35 + 0.55 * phase)
}
}
}
}
private struct TalkSpeakingRings: View {
let time: TimeInterval
var body: some View {
ZStack {
ForEach(0..<3, id: \.self) { idx in
let phase = (time * 1.1 + Double(idx) / 3).truncatingRemainder(dividingBy: 1)
Circle()
.stroke(Color.white.opacity(0.6 - phase * 0.5), lineWidth: 1)
.scaleEffect(0.8 + phase * 0.7)
.opacity(0.6 - phase * 0.6)
}
}
}
}
private struct CloudShape: Shape {
func path(in rect: CGRect) -> Path {
let w = rect.width
let h = rect.height
let baseHeight = h * 0.44
let baseRect = CGRect(x: rect.minX, y: rect.minY + h * 0.46, width: w, height: baseHeight)
var path = Path()
path.addRoundedRect(in: baseRect, cornerSize: CGSize(width: baseHeight / 2, height: baseHeight / 2))
path.addEllipse(in: CGRect(x: rect.minX + w * 0.05, y: rect.minY + h * 0.28, width: w * 0.36, height: h * 0.36))
path.addEllipse(in: CGRect(x: rect.minX + w * 0.28, y: rect.minY + h * 0.05, width: w * 0.44, height: h * 0.44))
path.addEllipse(in: CGRect(x: rect.minX + w * 0.62, y: rect.minY + h * 0.3, width: w * 0.3, height: h * 0.3))
return path
}
}