From 39fccc36998a1152d801e6070f6afb6205e0b1b0 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Tue, 30 Dec 2025 00:51:17 +0100 Subject: [PATCH] fix: talk overlay + elevenlabs defaults --- CHANGELOG.md | 1 + .../Sources/Clawdis/ConfigSettings.swift | 63 ++++++++++ .../Sources/Clawdis/MenuContentView.swift | 2 +- .../Sources/Clawdis/TalkModeRuntime.swift | 112 +++++++++++++++-- .../Sources/Clawdis/TalkOverlayView.swift | 118 +++++++----------- 5 files changed, 208 insertions(+), 88 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cd94a3ce5..62c9357ac 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ ### Fixes - macOS: Voice Wake now fully tears down the Speech pipeline when disabled (cancel pending restarts, drop stale callbacks) to avoid high CPU in the background. - macOS menu: add a Talk Mode action alongside the Open Dashboard/Chat/Canvas entries. +- macOS Talk Mode: orb overlay refresh, ElevenLabs request logging, API key status in settings, and auto-select first voice when none is configured. - iOS/Android nodes: enable scrolling for loaded web pages in the Canvas WebView (default scaffold stays touch-first). - macOS menu: device list now uses `node.list` (devices only; no agent/tool presence entries). - macOS menu: device list now shows connected nodes only. diff --git a/apps/macos/Sources/Clawdis/ConfigSettings.swift b/apps/macos/Sources/Clawdis/ConfigSettings.swift index 7c0867d79..eb22490c0 100644 --- a/apps/macos/Sources/Clawdis/ConfigSettings.swift +++ b/apps/macos/Sources/Clawdis/ConfigSettings.swift @@ -33,6 +33,7 @@ struct ConfigSettings: View { // Talk mode settings (stored in ~/.clawdis/clawdis.json under "talk") @State private var talkVoiceId: String = "" @State private var talkInterruptOnSpeech: Bool = true + @State private var talkApiKey: String = "" var body: some View { ScrollView { self.content } @@ -301,6 +302,30 @@ struct ConfigSettings: View { .foregroundStyle(.secondary) } } + GridRow { + self.gridLabel("API key") + VStack(alignment: .leading, spacing: 6) { + HStack(spacing: 8) { + SecureField("ELEVENLABS_API_KEY", text: self.$talkApiKey) + .textFieldStyle(.roundedBorder) + .frame(maxWidth: .infinity) + .disabled(self.hasEnvApiKey) + .onChange(of: self.talkApiKey) { _, _ in self.autosaveConfig() } + if !self.hasEnvApiKey && !self.talkApiKey.isEmpty { + Button("Clear") { + self.talkApiKey = "" + self.autosaveConfig() + } + } + } + self.statusLine(label: self.apiKeyStatusLabel, color: self.apiKeyStatusColor) + if self.hasEnvApiKey { + Text("Using ELEVENLABS_API_KEY from the environment.") + .font(.footnote) + .foregroundStyle(.secondary) + } + } + } GridRow { self.gridLabel("Interrupt") Toggle("Stop speaking when you start talking", isOn: self.$talkInterruptOnSpeech) @@ -319,6 +344,18 @@ struct ConfigSettings: View { .frame(width: self.labelColumnWidth, alignment: .leading) } + private func statusLine(label: String, color: Color) -> some View { + HStack(spacing: 6) { + Circle() + .fill(color) + .frame(width: 6, height: 6) + Text(label) + .font(.footnote) + .foregroundStyle(.secondary) + } + .padding(.top, 2) + } + private func loadConfig() { let parsed = self.loadConfigDict() let agent = parsed["agent"] as? [String: Any] @@ -348,6 +385,7 @@ struct ConfigSettings: View { if let talk { if let voice = talk["voiceId"] as? String { self.talkVoiceId = voice } + if let apiKey = talk["apiKey"] as? String { self.talkApiKey = apiKey } if let interrupt = talk["interruptOnSpeech"] as? Bool { self.talkInterruptOnSpeech = interrupt } @@ -399,6 +437,12 @@ struct ConfigSettings: View { } else { talk["voiceId"] = trimmedVoice } + let trimmedApiKey = self.talkApiKey.trimmingCharacters(in: .whitespacesAndNewlines) + if trimmedApiKey.isEmpty { + talk.removeValue(forKey: "apiKey") + } else { + talk["apiKey"] = trimmedApiKey + } talk["interruptOnSpeech"] = self.talkInterruptOnSpeech root["talk"] = talk @@ -433,6 +477,25 @@ struct ConfigSettings: View { .filter { seen.insert($0).inserted } } + private var hasEnvApiKey: Bool { + let raw = ProcessInfo.processInfo.environment["ELEVENLABS_API_KEY"] ?? "" + return !raw.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty + } + + private var apiKeyStatusLabel: String { + if self.hasEnvApiKey { return "ElevenLabs API key: found (environment)" } + if !self.talkApiKey.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty { + return "ElevenLabs API key: stored in config" + } + return "ElevenLabs API key: missing" + } + + private var apiKeyStatusColor: Color { + if self.hasEnvApiKey { return .green } + if !self.talkApiKey.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty { return .green } + return .red + } + private var browserPathLabel: String? { guard self.browserEnabled else { return nil } diff --git a/apps/macos/Sources/Clawdis/MenuContentView.swift b/apps/macos/Sources/Clawdis/MenuContentView.swift index e1453e5a2..c43986925 100644 --- a/apps/macos/Sources/Clawdis/MenuContentView.swift +++ b/apps/macos/Sources/Clawdis/MenuContentView.swift @@ -113,7 +113,7 @@ struct MenuContent: View { Button { Task { await self.state.setTalkEnabled(!self.state.talkEnabled) } } label: { - Label(self.state.talkEnabled ? "Stop Talk Mode" : "Talk Mode", systemImage: "bubble.left.and.waveform") + Label(self.state.talkEnabled ? "Stop Talk Mode" : "Talk Mode", systemImage: "waveform.circle.fill") } .disabled(!voiceWakeSupported) .opacity(voiceWakeSupported ? 1 : 0.5) diff --git a/apps/macos/Sources/Clawdis/TalkModeRuntime.swift b/apps/macos/Sources/Clawdis/TalkModeRuntime.swift index 3be350d9d..0443e26ea 100644 --- a/apps/macos/Sources/Clawdis/TalkModeRuntime.swift +++ b/apps/macos/Sources/Clawdis/TalkModeRuntime.swift @@ -9,6 +9,7 @@ actor TalkModeRuntime { static let shared = TalkModeRuntime() private let logger = Logger(subsystem: "com.steipete.clawdis", category: "talk.runtime") + private let ttsLogger = Logger(subsystem: "com.steipete.clawdis", category: "talk.tts") private var recognizer: SFSpeechRecognizer? private var audioEngine: AVAudioEngine? @@ -36,6 +37,8 @@ actor TalkModeRuntime { private var interruptOnSpeech: Bool = true private var lastInterruptedAtSeconds: Double? private var lastSpokenText: String? + private var apiKey: String? + private var fallbackVoiceId: String? private let silenceWindow: TimeInterval = 0.7 private let minSpeechRMS: Double = 1e-3 @@ -379,19 +382,17 @@ actor TalkModeRuntime { } } - let voiceId = - directive?.voiceId ?? - self.currentVoiceId ?? - self.defaultVoiceId - - guard let voiceId, !voiceId.isEmpty else { - self.logger.error("talk missing voiceId; set talk.voiceId or ELEVENLABS_VOICE_ID") + guard let apiKey = self.apiKey, !apiKey.isEmpty else { + self.logger.error("talk missing ELEVENLABS_API_KEY") return } - let apiKey = ProcessInfo.processInfo.environment["ELEVENLABS_API_KEY"] ?? "" - if apiKey.isEmpty { - self.logger.error("talk missing ELEVENLABS_API_KEY") + let requestedVoice = + directive?.voiceId ?? + self.currentVoiceId ?? + self.defaultVoiceId + guard let voiceId = await self.resolveVoiceId(preferred: requestedVoice, apiKey: apiKey) else { + self.logger.error("talk missing voiceId; set talk.voiceId or ELEVENLABS_VOICE_ID") return } @@ -419,7 +420,7 @@ actor TalkModeRuntime { language: Self.validatedLanguage(directive?.language, logger: self.logger)) do { - let audio = try await ElevenLabsClient(apiKey: apiKey).synthesize( + let audio = try await ElevenLabsClient(apiKey: apiKey, logger: self.ttsLogger).synthesize( voiceId: voiceId, request: request) let result = await TalkAudioPlayer.shared.play(data: audio) @@ -436,6 +437,33 @@ actor TalkModeRuntime { await MainActor.run { TalkModeController.shared.updatePhase(.thinking) } } + private func resolveVoiceId(preferred: String?, apiKey: String) async -> String? { + let trimmed = preferred?.trimmingCharacters(in: .whitespacesAndNewlines) ?? "" + if !trimmed.isEmpty { return trimmed } + if let fallbackVoiceId { return fallbackVoiceId } + + do { + let voices = try await ElevenLabsClient(apiKey: apiKey, logger: self.ttsLogger).listVoices() + guard let first = voices.first else { + self.ttsLogger.error("elevenlabs voices list empty") + return nil + } + self.fallbackVoiceId = first.voiceId + if self.defaultVoiceId == nil { + self.defaultVoiceId = first.voiceId + } + if !self.voiceOverrideActive { + self.currentVoiceId = first.voiceId + } + let name = first.name ?? "unknown" + self.ttsLogger.info("talk default voice selected \(name, privacy: .public) (\(first.voiceId, privacy: .public))") + return first.voiceId + } catch { + self.ttsLogger.error("elevenlabs list voices failed: \(error.localizedDescription, privacy: .public)") + return nil + } + } + func stopSpeaking(reason: TalkStopReason) async { guard self.phase == .speaking else { return } let interruptedAt = await MainActor.run { TalkAudioPlayer.shared.stop() } @@ -460,6 +488,7 @@ actor TalkModeRuntime { } self.defaultOutputFormat = cfg.outputFormat self.interruptOnSpeech = cfg.interruptOnSpeech + self.apiKey = cfg.apiKey } private struct TalkRuntimeConfig { @@ -467,12 +496,14 @@ actor TalkModeRuntime { let modelId: String? let outputFormat: String? let interruptOnSpeech: Bool + let apiKey: String? } private func fetchTalkConfig() async -> TalkRuntimeConfig { let env = ProcessInfo.processInfo.environment let envVoice = env["ELEVENLABS_VOICE_ID"]?.trimmingCharacters(in: .whitespacesAndNewlines) let sagVoice = env["SAG_VOICE_ID"]?.trimmingCharacters(in: .whitespacesAndNewlines) + let envApiKey = env["ELEVENLABS_API_KEY"]?.trimmingCharacters(in: .whitespacesAndNewlines) do { let snap: ConfigSnapshot = try await GatewayConnection.shared.requestDecoded( @@ -484,24 +515,31 @@ actor TalkModeRuntime { let model = talk?["modelId"]?.stringValue let outputFormat = talk?["outputFormat"]?.stringValue let interrupt = talk?["interruptOnSpeech"]?.boolValue + let apiKey = talk?["apiKey"]?.stringValue let resolvedVoice = (voice?.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty == false ? voice : nil) ?? (envVoice?.isEmpty == false ? envVoice : nil) ?? (sagVoice?.isEmpty == false ? sagVoice : nil) + let resolvedApiKey = + (envApiKey?.isEmpty == false ? envApiKey : nil) ?? + (apiKey?.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty == false ? apiKey : nil) return TalkRuntimeConfig( voiceId: resolvedVoice, modelId: model, outputFormat: outputFormat, - interruptOnSpeech: interrupt ?? true) + interruptOnSpeech: interrupt ?? true, + apiKey: resolvedApiKey) } catch { let resolvedVoice = (envVoice?.isEmpty == false ? envVoice : nil) ?? (sagVoice?.isEmpty == false ? sagVoice : nil) + let resolvedApiKey = envApiKey?.isEmpty == false ? envApiKey : nil return TalkRuntimeConfig( voiceId: resolvedVoice, modelId: nil, outputFormat: nil, - interruptOnSpeech: true) + interruptOnSpeech: true, + apiKey: resolvedApiKey) } } @@ -631,6 +669,7 @@ private struct ElevenLabsRequest { private struct ElevenLabsClient { let apiKey: String + let logger: Logger let baseUrl: URL = URL(string: "https://api.elevenlabs.io")! func synthesize(voiceId: String, request: ElevenLabsRequest) async throws -> Data { @@ -639,6 +678,11 @@ private struct ElevenLabsClient { url.appendPathComponent("text-to-speech") url.appendPathComponent(voiceId) + let charCount = request.text.count + self.logger.info( + "elevenlabs tts request voice=\(voiceId, privacy: .public) model=\(request.modelId ?? "default", privacy: .public) chars=\(charCount, privacy: .public)") + let startedAt = Date() + var payload: [String: Any] = [ "text": request.text, ] @@ -678,10 +722,52 @@ private struct ElevenLabsClient { let (data, response) = try await URLSession.shared.data(for: req) if let http = response as? HTTPURLResponse, http.statusCode >= 400 { let message = String(data: data, encoding: .utf8) ?? "unknown" + self.logger.error( + "elevenlabs tts failed status=\(http.statusCode, privacy: .public) message=\(message, privacy: .public)") throw NSError(domain: "TalkTTS", code: http.statusCode, userInfo: [ NSLocalizedDescriptionKey: "ElevenLabs failed: \(http.statusCode) \(message)", ]) } + let elapsed = Date().timeIntervalSince(startedAt) + self.logger.info("elevenlabs tts ok bytes=\(data.count, privacy: .public) dur=\(elapsed, privacy: .public)s") return data } + + func listVoices() async throws -> [ElevenLabsVoice] { + var url = self.baseUrl + url.appendPathComponent("v1") + url.appendPathComponent("voices") + + self.logger.info("elevenlabs voices list request") + var req = URLRequest(url: url) + req.httpMethod = "GET" + req.setValue(self.apiKey, forHTTPHeaderField: "xi-api-key") + + let (data, response) = try await URLSession.shared.data(for: req) + if let http = response as? HTTPURLResponse, http.statusCode >= 400 { + let message = String(data: data, encoding: .utf8) ?? "unknown" + self.logger.error( + "elevenlabs voices list failed status=\(http.statusCode, privacy: .public) message=\(message, privacy: .public)") + throw NSError(domain: "TalkTTS", code: http.statusCode, userInfo: [ + NSLocalizedDescriptionKey: "ElevenLabs voices failed: \(http.statusCode) \(message)", + ]) + } + + let decoded = try JSONDecoder().decode(ElevenLabsVoicesResponse.self, from: data) + return decoded.voices + } +} + +private struct ElevenLabsVoice: Decodable { + let voiceId: String + let name: String? + + enum CodingKeys: String, CodingKey { + case voiceId = "voice_id" + case name + } +} + +private struct ElevenLabsVoicesResponse: Decodable { + let voices: [ElevenLabsVoice] } diff --git a/apps/macos/Sources/Clawdis/TalkOverlayView.swift b/apps/macos/Sources/Clawdis/TalkOverlayView.swift index 2f2be75ca..29d7a6914 100644 --- a/apps/macos/Sources/Clawdis/TalkOverlayView.swift +++ b/apps/macos/Sources/Clawdis/TalkOverlayView.swift @@ -6,13 +6,13 @@ struct TalkOverlayView: View { var body: some View { ZStack(alignment: .topLeading) { - TalkCloudView(phase: self.controller.model.phase, level: self.controller.model.level) - .frame(width: 76, height: 64) + TalkOrbView(phase: self.controller.model.phase, level: self.controller.model.level) + .frame(width: 72, height: 72) .contentShape(Rectangle()) .onTapGesture { TalkModeController.shared.stopSpeaking(reason: .userTap) } - .padding(8) + .padding(10) Button { TalkModeController.shared.exitTalkMode() @@ -33,107 +33,77 @@ struct TalkOverlayView: View { } } -private struct TalkCloudView: View { +private struct TalkOrbView: View { let phase: TalkModePhase let level: Double var body: some View { TimelineView(.animation) { context in let t = context.date.timeIntervalSinceReferenceDate - let pulse = phase == .speaking ? (1 + 0.04 * sin(t * 6)) : 1 - let sink = phase == .thinking ? (3 + 2 * sin(t * 2)) : 0 - let listenScale = phase == .listening ? (1 + CGFloat(self.level) * 0.14) : 1 - let baseScale = phase == .thinking ? 0.94 : 1 + let listenScale = phase == .listening ? (1 + CGFloat(self.level) * 0.12) : 1 + let pulse = phase == .speaking ? (1 + 0.06 * sin(t * 6)) : 1 ZStack { - CloudShape() - .fill(self.cloudGradient) - .overlay( - CloudShape() - .stroke(Color.white.opacity(0.35), lineWidth: 0.8)) - .shadow(color: Color.black.opacity(0.18), radius: 8, x: 0, y: 4) - .scaleEffect(baseScale * pulse * listenScale) - .offset(y: sink) + Circle() + .fill(self.orbGradient) + .overlay(Circle().stroke(Color.white.opacity(0.45), lineWidth: 1)) + .shadow(color: Color.black.opacity(0.22), radius: 10, x: 0, y: 5) + .scaleEffect(pulse * listenScale) - if phase == .listening { - Circle() - .stroke(self.ringGradient, lineWidth: 1) - .scaleEffect(1 + CGFloat(self.level) * 0.45) - .opacity(0.3 + CGFloat(self.level) * 0.4) - .animation(.easeOut(duration: 0.08), value: self.level) - } + TalkWaveRings(phase: phase, level: level, time: t) if phase == .thinking { - TalkThinkingDots(time: t) - .offset(y: 18) - } - - if phase == .speaking { - TalkSpeakingRings(time: t) + TalkOrbitArcs(time: t) } } } } - private var cloudGradient: LinearGradient { - LinearGradient( - colors: [Color(red: 0.95, green: 0.98, blue: 1.0), Color(red: 0.75, green: 0.88, blue: 1.0)], - startPoint: .topLeading, - endPoint: .bottomTrailing) - } - - private var ringGradient: LinearGradient { - LinearGradient( - colors: [Color.white.opacity(0.6), Color.white.opacity(0.1)], - startPoint: .top, - endPoint: .bottom) + private var orbGradient: RadialGradient { + RadialGradient( + colors: [Color.white, Color(red: 0.62, green: 0.88, blue: 1.0)], + center: .topLeading, + startRadius: 4, + endRadius: 52) } } -private struct TalkThinkingDots: View { - let time: TimeInterval - - var body: some View { - HStack(spacing: 4) { - ForEach(0..<3, id: \.self) { idx in - let phase = (time * 2 + Double(idx) * 0.45).truncatingRemainder(dividingBy: 1) - Circle() - .fill(Color.white.opacity(0.75)) - .frame(width: 5, height: 5) - .opacity(0.35 + 0.55 * phase) - } - } - } -} - -private struct TalkSpeakingRings: View { +private struct TalkWaveRings: View { + let phase: TalkModePhase + let level: Double let time: TimeInterval var body: some View { ZStack { ForEach(0..<3, id: \.self) { idx in - let phase = (time * 1.1 + Double(idx) / 3).truncatingRemainder(dividingBy: 1) + let speed = phase == .speaking ? 1.4 : phase == .listening ? 0.9 : 0.6 + let progress = (time * speed + Double(idx) * 0.28).truncatingRemainder(dividingBy: 1) + let amplitude = phase == .speaking ? 0.95 : phase == .listening ? 0.5 + level * 0.7 : 0.35 + let scale = 0.75 + progress * amplitude + (phase == .listening ? level * 0.15 : 0) + let alpha = phase == .speaking ? 0.55 : phase == .listening ? 0.45 + level * 0.25 : 0.28 Circle() - .stroke(Color.white.opacity(0.6 - phase * 0.5), lineWidth: 1) - .scaleEffect(0.8 + phase * 0.7) - .opacity(0.6 - phase * 0.6) + .stroke(Color.white.opacity(alpha - progress * 0.35), lineWidth: 1.2) + .scaleEffect(scale) + .opacity(alpha - progress * 0.6) } } } } -private struct CloudShape: Shape { - func path(in rect: CGRect) -> Path { - let w = rect.width - let h = rect.height - let baseHeight = h * 0.44 - let baseRect = CGRect(x: rect.minX, y: rect.minY + h * 0.46, width: w, height: baseHeight) +private struct TalkOrbitArcs: View { + let time: TimeInterval - var path = Path() - path.addRoundedRect(in: baseRect, cornerSize: CGSize(width: baseHeight / 2, height: baseHeight / 2)) - path.addEllipse(in: CGRect(x: rect.minX + w * 0.05, y: rect.minY + h * 0.28, width: w * 0.36, height: h * 0.36)) - path.addEllipse(in: CGRect(x: rect.minX + w * 0.28, y: rect.minY + h * 0.05, width: w * 0.44, height: h * 0.44)) - path.addEllipse(in: CGRect(x: rect.minX + w * 0.62, y: rect.minY + h * 0.3, width: w * 0.3, height: h * 0.3)) - return path + var body: some View { + ZStack { + Circle() + .trim(from: 0.08, to: 0.26) + .stroke(Color.white.opacity(0.75), style: StrokeStyle(lineWidth: 1.4, lineCap: .round)) + .rotationEffect(.degrees(time * 42)) + Circle() + .trim(from: 0.62, to: 0.86) + .stroke(Color.white.opacity(0.55), style: StrokeStyle(lineWidth: 1.2, lineCap: .round)) + .rotationEffect(.degrees(-time * 35)) + } + .scaleEffect(1.05) } }