fix: talk overlay + elevenlabs defaults

This commit is contained in:
Peter Steinberger
2025-12-30 00:51:17 +01:00
parent 53eccc1c1e
commit 39fccc3699
5 changed files with 208 additions and 88 deletions

View File

@@ -8,6 +8,7 @@
### Fixes
- macOS: Voice Wake now fully tears down the Speech pipeline when disabled (cancel pending restarts, drop stale callbacks) to avoid high CPU in the background.
- macOS menu: add a Talk Mode action alongside the Open Dashboard/Chat/Canvas entries.
- macOS Talk Mode: orb overlay refresh, ElevenLabs request logging, API key status in settings, and auto-select first voice when none is configured.
- iOS/Android nodes: enable scrolling for loaded web pages in the Canvas WebView (default scaffold stays touch-first).
- macOS menu: device list now uses `node.list` (devices only; no agent/tool presence entries).
- macOS menu: device list now shows connected nodes only.

View File

@@ -33,6 +33,7 @@ struct ConfigSettings: View {
// Talk mode settings (stored in ~/.clawdis/clawdis.json under "talk")
@State private var talkVoiceId: String = ""
@State private var talkInterruptOnSpeech: Bool = true
@State private var talkApiKey: String = ""
var body: some View {
ScrollView { self.content }
@@ -301,6 +302,30 @@ struct ConfigSettings: View {
.foregroundStyle(.secondary)
}
}
GridRow {
self.gridLabel("API key")
VStack(alignment: .leading, spacing: 6) {
HStack(spacing: 8) {
SecureField("ELEVENLABS_API_KEY", text: self.$talkApiKey)
.textFieldStyle(.roundedBorder)
.frame(maxWidth: .infinity)
.disabled(self.hasEnvApiKey)
.onChange(of: self.talkApiKey) { _, _ in self.autosaveConfig() }
if !self.hasEnvApiKey && !self.talkApiKey.isEmpty {
Button("Clear") {
self.talkApiKey = ""
self.autosaveConfig()
}
}
}
self.statusLine(label: self.apiKeyStatusLabel, color: self.apiKeyStatusColor)
if self.hasEnvApiKey {
Text("Using ELEVENLABS_API_KEY from the environment.")
.font(.footnote)
.foregroundStyle(.secondary)
}
}
}
GridRow {
self.gridLabel("Interrupt")
Toggle("Stop speaking when you start talking", isOn: self.$talkInterruptOnSpeech)
@@ -319,6 +344,18 @@ struct ConfigSettings: View {
.frame(width: self.labelColumnWidth, alignment: .leading)
}
/// Renders a compact status row: a 6pt colored indicator dot followed by
/// footnote-style secondary text.
private func statusLine(label: String, color: Color) -> some View {
    HStack(spacing: 6) {
        Circle().fill(color).frame(width: 6, height: 6)
        Text(label).font(.footnote).foregroundStyle(.secondary)
    }
    .padding(.top, 2)
}
private func loadConfig() {
let parsed = self.loadConfigDict()
let agent = parsed["agent"] as? [String: Any]
@@ -348,6 +385,7 @@ struct ConfigSettings: View {
if let talk {
if let voice = talk["voiceId"] as? String { self.talkVoiceId = voice }
if let apiKey = talk["apiKey"] as? String { self.talkApiKey = apiKey }
if let interrupt = talk["interruptOnSpeech"] as? Bool {
self.talkInterruptOnSpeech = interrupt
}
@@ -399,6 +437,12 @@ struct ConfigSettings: View {
} else {
talk["voiceId"] = trimmedVoice
}
let trimmedApiKey = self.talkApiKey.trimmingCharacters(in: .whitespacesAndNewlines)
if trimmedApiKey.isEmpty {
talk.removeValue(forKey: "apiKey")
} else {
talk["apiKey"] = trimmedApiKey
}
talk["interruptOnSpeech"] = self.talkInterruptOnSpeech
root["talk"] = talk
@@ -433,6 +477,25 @@ struct ConfigSettings: View {
.filter { seen.insert($0).inserted }
}
/// True when `ELEVENLABS_API_KEY` is present in the process environment
/// and is not blank after trimming whitespace/newlines.
private var hasEnvApiKey: Bool {
    guard let raw = ProcessInfo.processInfo.environment["ELEVENLABS_API_KEY"] else { return false }
    return !raw.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty
}
/// Human-readable status for the ElevenLabs API key: environment wins over
/// a key stored in config; otherwise the key is reported missing.
private var apiKeyStatusLabel: String {
    if self.hasEnvApiKey { return "ElevenLabs API key: found (environment)" }
    let stored = self.talkApiKey.trimmingCharacters(in: .whitespacesAndNewlines)
    return stored.isEmpty ? "ElevenLabs API key: missing" : "ElevenLabs API key: stored in config"
}
/// Indicator color matching `apiKeyStatusLabel`: green when a key is
/// available from either the environment or the config field, red otherwise.
private var apiKeyStatusColor: Color {
    let hasStoredKey = !self.talkApiKey.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty
    return (self.hasEnvApiKey || hasStoredKey) ? .green : .red
}
private var browserPathLabel: String? {
guard self.browserEnabled else { return nil }

View File

@@ -113,7 +113,7 @@ struct MenuContent: View {
Button {
Task { await self.state.setTalkEnabled(!self.state.talkEnabled) }
} label: {
Label(self.state.talkEnabled ? "Stop Talk Mode" : "Talk Mode", systemImage: "bubble.left.and.waveform")
Label(self.state.talkEnabled ? "Stop Talk Mode" : "Talk Mode", systemImage: "waveform.circle.fill")
}
.disabled(!voiceWakeSupported)
.opacity(voiceWakeSupported ? 1 : 0.5)

View File

@@ -9,6 +9,7 @@ actor TalkModeRuntime {
static let shared = TalkModeRuntime()
private let logger = Logger(subsystem: "com.steipete.clawdis", category: "talk.runtime")
private let ttsLogger = Logger(subsystem: "com.steipete.clawdis", category: "talk.tts")
private var recognizer: SFSpeechRecognizer?
private var audioEngine: AVAudioEngine?
@@ -36,6 +37,8 @@ actor TalkModeRuntime {
private var interruptOnSpeech: Bool = true
private var lastInterruptedAtSeconds: Double?
private var lastSpokenText: String?
private var apiKey: String?
private var fallbackVoiceId: String?
private let silenceWindow: TimeInterval = 0.7
private let minSpeechRMS: Double = 1e-3
@@ -379,19 +382,17 @@ actor TalkModeRuntime {
}
}
let voiceId =
directive?.voiceId ??
self.currentVoiceId ??
self.defaultVoiceId
guard let voiceId, !voiceId.isEmpty else {
self.logger.error("talk missing voiceId; set talk.voiceId or ELEVENLABS_VOICE_ID")
guard let apiKey = self.apiKey, !apiKey.isEmpty else {
self.logger.error("talk missing ELEVENLABS_API_KEY")
return
}
let apiKey = ProcessInfo.processInfo.environment["ELEVENLABS_API_KEY"] ?? ""
if apiKey.isEmpty {
self.logger.error("talk missing ELEVENLABS_API_KEY")
let requestedVoice =
directive?.voiceId ??
self.currentVoiceId ??
self.defaultVoiceId
guard let voiceId = await self.resolveVoiceId(preferred: requestedVoice, apiKey: apiKey) else {
self.logger.error("talk missing voiceId; set talk.voiceId or ELEVENLABS_VOICE_ID")
return
}
@@ -419,7 +420,7 @@ actor TalkModeRuntime {
language: Self.validatedLanguage(directive?.language, logger: self.logger))
do {
let audio = try await ElevenLabsClient(apiKey: apiKey).synthesize(
let audio = try await ElevenLabsClient(apiKey: apiKey, logger: self.ttsLogger).synthesize(
voiceId: voiceId,
request: request)
let result = await TalkAudioPlayer.shared.play(data: audio)
@@ -436,6 +437,33 @@ actor TalkModeRuntime {
await MainActor.run { TalkModeController.shared.updatePhase(.thinking) }
}
/// Resolves which ElevenLabs voice id to use for speech synthesis.
///
/// Resolution order:
///   1. `preferred` (directive or configured voice), if non-blank after trimming.
///   2. A fallback voice cached from a previous voices lookup.
///   3. The first voice returned by the ElevenLabs voices API; that voice is
///      cached and, where unset, also adopted as the default/current voice.
///
/// - Parameters:
///   - preferred: Caller-supplied voice id; may be nil or blank.
///   - apiKey: ElevenLabs API key used for the voices lookup.
/// - Returns: A usable voice id, or nil when nothing is configured and the
///   API lookup fails or returns an empty list (both cases are logged).
private func resolveVoiceId(preferred: String?, apiKey: String) async -> String? {
let trimmed = preferred?.trimmingCharacters(in: .whitespacesAndNewlines) ?? ""
if !trimmed.isEmpty { return trimmed }
// Reuse the cached auto-selected voice so repeat calls skip the network.
if let fallbackVoiceId { return fallbackVoiceId }
do {
let voices = try await ElevenLabsClient(apiKey: apiKey, logger: self.ttsLogger).listVoices()
guard let first = voices.first else {
self.ttsLogger.error("elevenlabs voices list empty")
return nil
}
// Cache the auto-selected voice for subsequent resolutions.
self.fallbackVoiceId = first.voiceId
if self.defaultVoiceId == nil {
self.defaultVoiceId = first.voiceId
}
// Do not clobber an explicitly overridden voice; otherwise adopt the fallback.
if !self.voiceOverrideActive {
self.currentVoiceId = first.voiceId
}
let name = first.name ?? "unknown"
self.ttsLogger.info("talk default voice selected \(name, privacy: .public) (\(first.voiceId, privacy: .public))")
return first.voiceId
} catch {
self.ttsLogger.error("elevenlabs list voices failed: \(error.localizedDescription, privacy: .public)")
return nil
}
}
func stopSpeaking(reason: TalkStopReason) async {
guard self.phase == .speaking else { return }
let interruptedAt = await MainActor.run { TalkAudioPlayer.shared.stop() }
@@ -460,6 +488,7 @@ actor TalkModeRuntime {
}
self.defaultOutputFormat = cfg.outputFormat
self.interruptOnSpeech = cfg.interruptOnSpeech
self.apiKey = cfg.apiKey
}
private struct TalkRuntimeConfig {
@@ -467,12 +496,14 @@ actor TalkModeRuntime {
let modelId: String?
let outputFormat: String?
let interruptOnSpeech: Bool
let apiKey: String?
}
private func fetchTalkConfig() async -> TalkRuntimeConfig {
let env = ProcessInfo.processInfo.environment
let envVoice = env["ELEVENLABS_VOICE_ID"]?.trimmingCharacters(in: .whitespacesAndNewlines)
let sagVoice = env["SAG_VOICE_ID"]?.trimmingCharacters(in: .whitespacesAndNewlines)
let envApiKey = env["ELEVENLABS_API_KEY"]?.trimmingCharacters(in: .whitespacesAndNewlines)
do {
let snap: ConfigSnapshot = try await GatewayConnection.shared.requestDecoded(
@@ -484,24 +515,31 @@ actor TalkModeRuntime {
let model = talk?["modelId"]?.stringValue
let outputFormat = talk?["outputFormat"]?.stringValue
let interrupt = talk?["interruptOnSpeech"]?.boolValue
let apiKey = talk?["apiKey"]?.stringValue
let resolvedVoice =
(voice?.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty == false ? voice : nil) ??
(envVoice?.isEmpty == false ? envVoice : nil) ??
(sagVoice?.isEmpty == false ? sagVoice : nil)
let resolvedApiKey =
(envApiKey?.isEmpty == false ? envApiKey : nil) ??
(apiKey?.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty == false ? apiKey : nil)
return TalkRuntimeConfig(
voiceId: resolvedVoice,
modelId: model,
outputFormat: outputFormat,
interruptOnSpeech: interrupt ?? true)
interruptOnSpeech: interrupt ?? true,
apiKey: resolvedApiKey)
} catch {
let resolvedVoice =
(envVoice?.isEmpty == false ? envVoice : nil) ??
(sagVoice?.isEmpty == false ? sagVoice : nil)
let resolvedApiKey = envApiKey?.isEmpty == false ? envApiKey : nil
return TalkRuntimeConfig(
voiceId: resolvedVoice,
modelId: nil,
outputFormat: nil,
interruptOnSpeech: true)
interruptOnSpeech: true,
apiKey: resolvedApiKey)
}
}
@@ -631,6 +669,7 @@ private struct ElevenLabsRequest {
private struct ElevenLabsClient {
let apiKey: String
let logger: Logger
let baseUrl: URL = URL(string: "https://api.elevenlabs.io")!
func synthesize(voiceId: String, request: ElevenLabsRequest) async throws -> Data {
@@ -639,6 +678,11 @@ private struct ElevenLabsClient {
url.appendPathComponent("text-to-speech")
url.appendPathComponent(voiceId)
let charCount = request.text.count
self.logger.info(
"elevenlabs tts request voice=\(voiceId, privacy: .public) model=\(request.modelId ?? "default", privacy: .public) chars=\(charCount, privacy: .public)")
let startedAt = Date()
var payload: [String: Any] = [
"text": request.text,
]
@@ -678,10 +722,52 @@ private struct ElevenLabsClient {
let (data, response) = try await URLSession.shared.data(for: req)
if let http = response as? HTTPURLResponse, http.statusCode >= 400 {
let message = String(data: data, encoding: .utf8) ?? "unknown"
self.logger.error(
"elevenlabs tts failed status=\(http.statusCode, privacy: .public) message=\(message, privacy: .public)")
throw NSError(domain: "TalkTTS", code: http.statusCode, userInfo: [
NSLocalizedDescriptionKey: "ElevenLabs failed: \(http.statusCode) \(message)",
])
}
let elapsed = Date().timeIntervalSince(startedAt)
self.logger.info("elevenlabs tts ok bytes=\(data.count, privacy: .public) dur=\(elapsed, privacy: .public)s")
return data
}
/// Fetches the account's available voices from `GET /v1/voices`.
///
/// - Returns: The decoded list of voices for this API key.
/// - Throws: An `NSError` in domain "TalkTTS" carrying the HTTP status code
///   and response body when the server answers with status >= 400, or any
///   transport/decoding error from `URLSession`/`JSONDecoder`.
func listVoices() async throws -> [ElevenLabsVoice] {
    let endpoint = self.baseUrl
        .appendingPathComponent("v1")
        .appendingPathComponent("voices")
    self.logger.info("elevenlabs voices list request")

    var request = URLRequest(url: endpoint)
    request.httpMethod = "GET"
    request.setValue(self.apiKey, forHTTPHeaderField: "xi-api-key")

    let (body, response) = try await URLSession.shared.data(for: request)
    if let http = response as? HTTPURLResponse, http.statusCode >= 400 {
        let message = String(data: body, encoding: .utf8) ?? "unknown"
        self.logger.error(
            "elevenlabs voices list failed status=\(http.statusCode, privacy: .public) message=\(message, privacy: .public)")
        throw NSError(domain: "TalkTTS", code: http.statusCode, userInfo: [
            NSLocalizedDescriptionKey: "ElevenLabs voices failed: \(http.statusCode) \(message)",
        ])
    }
    return try JSONDecoder().decode(ElevenLabsVoicesResponse.self, from: body).voices
}
}
/// One voice entry as returned by the ElevenLabs voices API.
private struct ElevenLabsVoice: Decodable {
    /// Stable voice identifier, decoded from the API's snake_case `voice_id` field.
    let voiceId: String
    /// Human-readable display name; absent for some voices.
    let name: String?

    enum CodingKeys: String, CodingKey {
        case voiceId = "voice_id"
        case name
    }
}
/// Top-level payload of the ElevenLabs voices endpoint: wraps the voice array.
private struct ElevenLabsVoicesResponse: Decodable {
let voices: [ElevenLabsVoice]
}

View File

@@ -6,13 +6,13 @@ struct TalkOverlayView: View {
var body: some View {
ZStack(alignment: .topLeading) {
TalkCloudView(phase: self.controller.model.phase, level: self.controller.model.level)
.frame(width: 76, height: 64)
TalkOrbView(phase: self.controller.model.phase, level: self.controller.model.level)
.frame(width: 72, height: 72)
.contentShape(Rectangle())
.onTapGesture {
TalkModeController.shared.stopSpeaking(reason: .userTap)
}
.padding(8)
.padding(10)
Button {
TalkModeController.shared.exitTalkMode()
@@ -33,107 +33,77 @@ struct TalkOverlayView: View {
}
}
private struct TalkCloudView: View {
private struct TalkOrbView: View {
let phase: TalkModePhase
let level: Double
var body: some View {
TimelineView(.animation) { context in
let t = context.date.timeIntervalSinceReferenceDate
let pulse = phase == .speaking ? (1 + 0.04 * sin(t * 6)) : 1
let sink = phase == .thinking ? (3 + 2 * sin(t * 2)) : 0
let listenScale = phase == .listening ? (1 + CGFloat(self.level) * 0.14) : 1
let baseScale = phase == .thinking ? 0.94 : 1
let listenScale = phase == .listening ? (1 + CGFloat(self.level) * 0.12) : 1
let pulse = phase == .speaking ? (1 + 0.06 * sin(t * 6)) : 1
ZStack {
CloudShape()
.fill(self.cloudGradient)
.overlay(
CloudShape()
.stroke(Color.white.opacity(0.35), lineWidth: 0.8))
.shadow(color: Color.black.opacity(0.18), radius: 8, x: 0, y: 4)
.scaleEffect(baseScale * pulse * listenScale)
.offset(y: sink)
Circle()
.fill(self.orbGradient)
.overlay(Circle().stroke(Color.white.opacity(0.45), lineWidth: 1))
.shadow(color: Color.black.opacity(0.22), radius: 10, x: 0, y: 5)
.scaleEffect(pulse * listenScale)
if phase == .listening {
Circle()
.stroke(self.ringGradient, lineWidth: 1)
.scaleEffect(1 + CGFloat(self.level) * 0.45)
.opacity(0.3 + CGFloat(self.level) * 0.4)
.animation(.easeOut(duration: 0.08), value: self.level)
}
TalkWaveRings(phase: phase, level: level, time: t)
if phase == .thinking {
TalkThinkingDots(time: t)
.offset(y: 18)
}
if phase == .speaking {
TalkSpeakingRings(time: t)
TalkOrbitArcs(time: t)
}
}
}
}
private var cloudGradient: LinearGradient {
LinearGradient(
colors: [Color(red: 0.95, green: 0.98, blue: 1.0), Color(red: 0.75, green: 0.88, blue: 1.0)],
startPoint: .topLeading,
endPoint: .bottomTrailing)
}
private var ringGradient: LinearGradient {
LinearGradient(
colors: [Color.white.opacity(0.6), Color.white.opacity(0.1)],
startPoint: .top,
endPoint: .bottom)
private var orbGradient: RadialGradient {
RadialGradient(
colors: [Color.white, Color(red: 0.62, green: 0.88, blue: 1.0)],
center: .topLeading,
startRadius: 4,
endRadius: 52)
}
}
/// Three small white dots whose opacity ramps on staggered sawtooth waves,
/// giving a "typing"-style animation while the orb is thinking.
private struct TalkThinkingDots: View {
    /// Current animation clock, typically fed from a `TimelineView`.
    let time: TimeInterval

    var body: some View {
        HStack(spacing: 4) {
            ForEach(0..<3, id: \.self) { index in
                self.dot(at: index)
            }
        }
    }

    /// Builds a single dot; each index is offset by 0.45s so the dots pulse in sequence.
    private func dot(at index: Int) -> some View {
        let wave = (time * 2 + Double(index) * 0.45).truncatingRemainder(dividingBy: 1)
        return Circle()
            .fill(Color.white.opacity(0.75))
            .frame(width: 5, height: 5)
            .opacity(0.35 + 0.55 * wave)
    }
}
private struct TalkSpeakingRings: View {
private struct TalkWaveRings: View {
let phase: TalkModePhase
let level: Double
let time: TimeInterval
var body: some View {
ZStack {
ForEach(0..<3, id: \.self) { idx in
let phase = (time * 1.1 + Double(idx) / 3).truncatingRemainder(dividingBy: 1)
let speed = phase == .speaking ? 1.4 : phase == .listening ? 0.9 : 0.6
let progress = (time * speed + Double(idx) * 0.28).truncatingRemainder(dividingBy: 1)
let amplitude = phase == .speaking ? 0.95 : phase == .listening ? 0.5 + level * 0.7 : 0.35
let scale = 0.75 + progress * amplitude + (phase == .listening ? level * 0.15 : 0)
let alpha = phase == .speaking ? 0.55 : phase == .listening ? 0.45 + level * 0.25 : 0.28
Circle()
.stroke(Color.white.opacity(0.6 - phase * 0.5), lineWidth: 1)
.scaleEffect(0.8 + phase * 0.7)
.opacity(0.6 - phase * 0.6)
.stroke(Color.white.opacity(alpha - progress * 0.35), lineWidth: 1.2)
.scaleEffect(scale)
.opacity(alpha - progress * 0.6)
}
}
}
}
private struct CloudShape: Shape {
func path(in rect: CGRect) -> Path {
let w = rect.width
let h = rect.height
let baseHeight = h * 0.44
let baseRect = CGRect(x: rect.minX, y: rect.minY + h * 0.46, width: w, height: baseHeight)
private struct TalkOrbitArcs: View {
let time: TimeInterval
var path = Path()
path.addRoundedRect(in: baseRect, cornerSize: CGSize(width: baseHeight / 2, height: baseHeight / 2))
path.addEllipse(in: CGRect(x: rect.minX + w * 0.05, y: rect.minY + h * 0.28, width: w * 0.36, height: h * 0.36))
path.addEllipse(in: CGRect(x: rect.minX + w * 0.28, y: rect.minY + h * 0.05, width: w * 0.44, height: h * 0.44))
path.addEllipse(in: CGRect(x: rect.minX + w * 0.62, y: rect.minY + h * 0.3, width: w * 0.3, height: h * 0.3))
return path
var body: some View {
ZStack {
Circle()
.trim(from: 0.08, to: 0.26)
.stroke(Color.white.opacity(0.75), style: StrokeStyle(lineWidth: 1.4, lineCap: .round))
.rotationEffect(.degrees(time * 42))
Circle()
.trim(from: 0.62, to: 0.86)
.stroke(Color.white.opacity(0.55), style: StrokeStyle(lineWidth: 1.2, lineCap: .round))
.rotationEffect(.degrees(-time * 35))
}
.scaleEffect(1.05)
}
}