From e119a823345f8c73770b9d8f354e423012d4deba Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Tue, 30 Dec 2025 01:57:45 +0100 Subject: [PATCH] feat: talk mode key distribution and tts polling --- CHANGELOG.md | 2 + .../clawdis/node/voice/TalkModeManager.kt | 35 +++++++++++-- apps/ios/Sources/Voice/TalkModeManager.swift | 34 +++++++++++-- apps/macos/Sources/Clawdis/AppState.swift | 12 ++++- .../Sources/Clawdis/ConfigSettings.swift | 22 +++++++++ .../Sources/Clawdis/GatewayConnection.swift | 7 +++ .../Sources/Clawdis/TalkModeController.swift | 1 + .../Sources/Clawdis/TalkModeRuntime.swift | 32 ++++++++++-- apps/macos/Sources/Clawdis/TalkOverlay.swift | 6 +-- .../Sources/Clawdis/TalkOverlayView.swift | 17 ++++--- docs/configuration.md | 2 + docs/talk.md | 2 + src/config/config.test.ts | 47 ++++++++++++++++++ src/config/config.ts | 49 ++++++++++++++++++- src/gateway/protocol/index.ts | 5 ++ src/gateway/protocol/schema.ts | 10 ++++ src/gateway/server.ts | 44 +++++++++++++++++ 17 files changed, 303 insertions(+), 24 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8f8317f04..97643cee5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,8 @@ - macOS menu: add a Talk Mode action alongside the Open Dashboard/Chat/Canvas entries. - macOS Debug: hide “Restart Gateway” when the app won’t start a local gateway (remote mode / attach-only). - macOS Talk Mode: orb overlay refresh, ElevenLabs request logging, API key status in settings, and auto-select first voice when none is configured. +- Talk Mode: wait for chat history to surface the assistant reply before starting TTS (macOS/iOS/Android). +- Gateway config: inject `talk.apiKey` from `ELEVENLABS_API_KEY`/shell profile so nodes can fetch it on demand. - iOS/Android nodes: enable scrolling for loaded web pages in the Canvas WebView (default scaffold stays touch-first). - macOS menu: device list now uses `node.list` (devices only; no agent/tool presence entries). - macOS menu: device list now shows connected nodes only. diff --git a/apps/android/app/src/main/java/com/steipete/clawdis/node/voice/TalkModeManager.kt b/apps/android/app/src/main/java/com/steipete/clawdis/node/voice/TalkModeManager.kt index 920466739..5cad01140 100644 --- a/apps/android/app/src/main/java/com/steipete/clawdis/node/voice/TalkModeManager.kt +++ b/apps/android/app/src/main/java/com/steipete/clawdis/node/voice/TalkModeManager.kt @@ -76,6 +76,7 @@ class TalkModeManager( private var defaultModelId: String? = null private var currentModelId: String? = null private var defaultOutputFormat: String? = null + private var apiKey: String? = null private var interruptOnSpeech: Boolean = true private var voiceOverrideActive = false private var modelOverrideActive = false @@ -268,6 +269,7 @@ class TalkModeManager( } try { + val startedAt = System.currentTimeMillis().toDouble() / 1000.0 val runId = sendChat(prompt, bridge) val ok = waitForChatFinal(runId) if (!ok) { @@ -275,7 +277,7 @@ class TalkModeManager( start() return } - val assistant = fetchLatestAssistantText(bridge) + val assistant = waitForAssistantText(bridge, startedAt, 12_000) if (assistant.isNullOrBlank()) { _statusText.value = "No reply" start() @@ -345,13 +347,34 @@ class TalkModeManager( return result } - private suspend fun fetchLatestAssistantText(bridge: BridgeSession): String? { + private suspend fun waitForAssistantText( + bridge: BridgeSession, + sinceSeconds: Double, + timeoutMs: Long, + ): String? { + val deadline = SystemClock.elapsedRealtime() + timeoutMs + while (SystemClock.elapsedRealtime() < deadline) { + val text = fetchLatestAssistantText(bridge, sinceSeconds) + if (!text.isNullOrBlank()) return text + delay(300) + } + return null + } + + private suspend fun fetchLatestAssistantText( + bridge: BridgeSession, + sinceSeconds: Double? = null, + ): String? { val res = bridge.request("chat.history", "{\"sessionKey\":\"main\"}") val root = json.parseToJsonElement(res).asObjectOrNull() ?: return null val messages = root["messages"] as? JsonArray ?: return null for (item in messages.reversed()) { val obj = item.asObjectOrNull() ?: continue if (obj["role"].asStringOrNull() != "assistant") continue + if (sinceSeconds != null) { + val timestamp = obj["timestamp"].asDoubleOrNull() + if (timestamp != null && timestamp < sinceSeconds - 0.5) continue + } val content = obj["content"] as? JsonArray ?: continue val text = content.mapNotNull { entry -> @@ -390,7 +413,9 @@ class TalkModeManager( return } - val apiKey = System.getenv("ELEVENLABS_API_KEY")?.trim() + val apiKey = + apiKey?.trim()?.takeIf { it.isNotEmpty() } + ?: System.getenv("ELEVENLABS_API_KEY")?.trim() if (apiKey.isNullOrEmpty()) { _statusText.value = "Missing ELEVENLABS_API_KEY" return @@ -495,6 +520,7 @@ class TalkModeManager( val bridge = session ?: return val envVoice = System.getenv("ELEVENLABS_VOICE_ID")?.trim() val sagVoice = System.getenv("SAG_VOICE_ID")?.trim() + val envKey = System.getenv("ELEVENLABS_API_KEY")?.trim() try { val res = bridge.request("config.get", "{}") val root = json.parseToJsonElement(res).asObjectOrNull() @@ -503,6 +529,7 @@ class TalkModeManager( val voice = talk?.get("voiceId")?.asStringOrNull()?.trim()?.takeIf { it.isNotEmpty() } val model = talk?.get("modelId")?.asStringOrNull()?.trim()?.takeIf { it.isNotEmpty() } val outputFormat = talk?.get("outputFormat")?.asStringOrNull()?.trim()?.takeIf { it.isNotEmpty() } + val key = talk?.get("apiKey")?.asStringOrNull()?.trim()?.takeIf { it.isNotEmpty() } val interrupt = talk?.get("interruptOnSpeech")?.asBooleanOrNull() defaultVoiceId = voice ?: envVoice?.takeIf { it.isNotEmpty() } ?: sagVoice?.takeIf { it.isNotEmpty() } @@ -510,9 +537,11 @@ class TalkModeManager( defaultModelId = model if (!modelOverrideActive) currentModelId = defaultModelId defaultOutputFormat = outputFormat + apiKey = key ?: envKey?.takeIf { it.isNotEmpty() } if (interrupt != null) interruptOnSpeech = interrupt } catch (_: Throwable) { defaultVoiceId = envVoice?.takeIf { it.isNotEmpty() } ?: sagVoice?.takeIf { it.isNotEmpty() } + apiKey = envKey?.takeIf { it.isNotEmpty() } } } diff --git a/apps/ios/Sources/Voice/TalkModeManager.swift b/apps/ios/Sources/Voice/TalkModeManager.swift index 82cd451c3..3766845b1 100644 --- a/apps/ios/Sources/Voice/TalkModeManager.swift +++ b/apps/ios/Sources/Voice/TalkModeManager.swift @@ -28,6 +28,7 @@ final class TalkModeManager: NSObject { private var defaultModelId: String? private var currentModelId: String? private var defaultOutputFormat: String? + private var apiKey: String? private var interruptOnSpeech: Bool = true private var bridge: BridgeSession? @@ -189,6 +190,7 @@ final class TalkModeManager: NSObject { } do { + let startedAt = Date().timeIntervalSince1970 let runId = try await self.sendChat(prompt, bridge: bridge) let ok = await self.waitForChatFinal(runId: runId, bridge: bridge) if !ok { @@ -197,7 +199,11 @@ final class TalkModeManager: NSObject { return } - guard let assistantText = try await self.fetchLatestAssistantText(bridge: bridge) else { + guard let assistantText = try await self.waitForAssistantText( + bridge: bridge, + since: startedAt, + timeoutSeconds: 12) + else { self.statusText = "No reply" await self.start() return @@ -259,7 +265,22 @@ final class TalkModeManager: NSObject { return false } - private func fetchLatestAssistantText(bridge: BridgeSession) async throws -> String? { + private func waitForAssistantText( + bridge: BridgeSession, + since: Double, + timeoutSeconds: Int) async throws -> String? + { + let deadline = Date().addingTimeInterval(TimeInterval(timeoutSeconds)) + while Date() < deadline { + if let text = try await self.fetchLatestAssistantText(bridge: bridge, since: since) { + return text + } + try? await Task.sleep(nanoseconds: 300_000_000) + } + return nil + } + + private func fetchLatestAssistantText(bridge: BridgeSession, since: Double? = nil) async throws -> String? { let res = try await bridge.request( method: "chat.history", paramsJSON: "{\"sessionKey\":\"main\"}", @@ -268,6 +289,9 @@ final class TalkModeManager: NSObject { guard let messages = json["messages"] as? [[String: Any]] else { return nil } for msg in messages.reversed() { guard (msg["role"] as? String) == "assistant" else { continue } + if let since, let timestamp = msg["timestamp"] as? Double, timestamp < since - 0.5 { + continue + } guard let content = msg["content"] as? [[String: Any]] else { continue } let text = content.compactMap { $0["text"] as? String }.joined(separator: "\n") let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines) @@ -299,7 +323,10 @@ final class TalkModeManager: NSObject { return } - guard let apiKey = ProcessInfo.processInfo.environment["ELEVENLABS_API_KEY"], !apiKey.isEmpty else { + let resolvedKey = + (self.apiKey?.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty == false ? self.apiKey : nil) ?? + ProcessInfo.processInfo.environment["ELEVENLABS_API_KEY"] + guard let apiKey = resolvedKey, !apiKey.isEmpty else { self.statusText = "Missing ELEVENLABS_API_KEY" return } @@ -375,6 +402,7 @@ final class TalkModeManager: NSObject { self.currentModelId = self.defaultModelId self.defaultOutputFormat = (talk?["outputFormat"] as? String)? .trimmingCharacters(in: .whitespacesAndNewlines) + self.apiKey = (talk?["apiKey"] as? String)?.trimmingCharacters(in: .whitespacesAndNewlines) if let interrupt = talk?["interruptOnSpeech"] as? Bool { self.interruptOnSpeech = interrupt } diff --git a/apps/macos/Sources/Clawdis/AppState.swift b/apps/macos/Sources/Clawdis/AppState.swift index 94e20538a..c73383241 100644 --- a/apps/macos/Sources/Clawdis/AppState.swift +++ b/apps/macos/Sources/Clawdis/AppState.swift @@ -329,18 +329,26 @@ final class AppState { func setTalkEnabled(_ enabled: Bool) async { guard voiceWakeSupported else { self.talkEnabled = false + await GatewayConnection.shared.talkMode(enabled: false, phase: "disabled") return } self.talkEnabled = enabled guard !self.isPreview else { return } - if !enabled { return } + if !enabled { + await GatewayConnection.shared.talkMode(enabled: false, phase: "disabled") + return + } - if PermissionManager.voiceWakePermissionsGranted() { return } + if PermissionManager.voiceWakePermissionsGranted() { + await GatewayConnection.shared.talkMode(enabled: true, phase: "enabled") + return + } let granted = await PermissionManager.ensureVoiceWakePermissions(interactive: true) self.talkEnabled = granted + await GatewayConnection.shared.talkMode(enabled: granted, phase: granted ? "enabled" : "denied") } // MARK: - Global wake words sync (Gateway-owned) diff --git a/apps/macos/Sources/Clawdis/ConfigSettings.swift b/apps/macos/Sources/Clawdis/ConfigSettings.swift index eb22490c0..784fe7a71 100644 --- a/apps/macos/Sources/Clawdis/ConfigSettings.swift +++ b/apps/macos/Sources/Clawdis/ConfigSettings.swift @@ -34,6 +34,7 @@ struct ConfigSettings: View { @State private var talkVoiceId: String = "" @State private var talkInterruptOnSpeech: Bool = true @State private var talkApiKey: String = "" + @State private var gatewayApiKeyFound = false var body: some View { ScrollView { self.content } @@ -49,6 +50,7 @@ struct ConfigSettings: View { self.hasLoaded = true self.loadConfig() await self.loadModels() + await self.refreshGatewayTalkApiKey() self.allowAutosave = true } } @@ -323,6 +325,10 @@ struct ConfigSettings: View { Text("Using ELEVENLABS_API_KEY from the environment.") .font(.footnote) .foregroundStyle(.secondary) + } else if self.gatewayApiKeyFound && self.talkApiKey.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty { + Text("Using API key from the gateway profile.") + .font(.footnote) + .foregroundStyle(.secondary) } } } @@ -392,6 +398,20 @@ struct ConfigSettings: View { } } + private func refreshGatewayTalkApiKey() async { + do { + let snap: ConfigSnapshot = try await GatewayConnection.shared.requestDecoded( + method: .configGet, + params: nil, + timeoutMs: 8000) + let talk = snap.config?["talk"]?.dictionaryValue + let apiKey = talk?["apiKey"]?.stringValue?.trimmingCharacters(in: .whitespacesAndNewlines) + self.gatewayApiKeyFound = !(apiKey ?? "").isEmpty + } catch { + self.gatewayApiKeyFound = false + } + } + private func autosaveConfig() { guard self.allowAutosave else { return } Task { await self.saveConfig() } @@ -487,12 +507,14 @@ struct ConfigSettings: View { if !self.talkApiKey.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty { return "ElevenLabs API key: stored in config" } + if self.gatewayApiKeyFound { return "ElevenLabs API key: found (gateway)" } return "ElevenLabs API key: missing" } private var apiKeyStatusColor: Color { if self.hasEnvApiKey { return .green } if !self.talkApiKey.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty { return .green } + if self.gatewayApiKeyFound { return .green } return .red } diff --git a/apps/macos/Sources/Clawdis/GatewayConnection.swift b/apps/macos/Sources/Clawdis/GatewayConnection.swift index f255f69b9..289507cc5 100644 --- a/apps/macos/Sources/Clawdis/GatewayConnection.swift +++ b/apps/macos/Sources/Clawdis/GatewayConnection.swift @@ -51,6 +51,7 @@ actor GatewayConnection { case providersStatus = "providers.status" case configGet = "config.get" case configSet = "config.set" + case talkMode = "talk.mode" case webLoginStart = "web.login.start" case webLoginWait = "web.login.wait" case webLogout = "web.logout" @@ -483,6 +484,12 @@ extension GatewayConnection { return res.aborted ?? false } + func talkMode(enabled: Bool, phase: String? = nil) async { + var params: [String: AnyCodable] = ["enabled": AnyCodable(enabled)] + if let phase { params["phase"] = AnyCodable(phase) } + try? await self.requestVoid(method: .talkMode, params: params) + } + // MARK: - VoiceWake func voiceWakeGetTriggers() async throws -> [String] { diff --git a/apps/macos/Sources/Clawdis/TalkModeController.swift b/apps/macos/Sources/Clawdis/TalkModeController.swift index 920af0539..707b56995 100644 --- a/apps/macos/Sources/Clawdis/TalkModeController.swift +++ b/apps/macos/Sources/Clawdis/TalkModeController.swift @@ -20,6 +20,7 @@ final class TalkModeController { func updatePhase(_ phase: TalkModePhase) { TalkOverlayController.shared.updatePhase(phase) + Task { await GatewayConnection.shared.talkMode(enabled: AppStateStore.shared.talkEnabled, phase: phase.rawValue) } } func updateLevel(_ level: Double) { diff --git a/apps/macos/Sources/Clawdis/TalkModeRuntime.swift b/apps/macos/Sources/Clawdis/TalkModeRuntime.swift index 0443e26ea..54804337e 100644 --- a/apps/macos/Sources/Clawdis/TalkModeRuntime.swift +++ b/apps/macos/Sources/Clawdis/TalkModeRuntime.swift @@ -244,6 +244,7 @@ actor TalkModeRuntime { await self.reloadConfig() let prompt = self.buildPrompt(transcript: transcript) let runId = UUID().uuidString + let startedAt = Date().timeIntervalSince1970 do { let response = try await GatewayConnection.shared.chatSend( @@ -261,7 +262,11 @@ actor TalkModeRuntime { return } - guard let assistantText = await self.latestAssistantText(sessionKey: "main") else { + guard let assistantText = await self.waitForAssistantText( + sessionKey: "main", + since: startedAt, + timeoutSeconds: 12) + else { await self.startListening() await self.startRecognition() return @@ -335,7 +340,22 @@ actor TalkModeRuntime { } } - private func latestAssistantText(sessionKey: String) async -> String? { + private func waitForAssistantText( + sessionKey: String, + since: Double, + timeoutSeconds: Int) async -> String? + { + let deadline = Date().addingTimeInterval(TimeInterval(timeoutSeconds)) + while Date() < deadline { + if let text = await self.latestAssistantText(sessionKey: sessionKey, since: since) { + return text + } + try? await Task.sleep(nanoseconds: 300_000_000) + } + return nil + } + + private func latestAssistantText(sessionKey: String, since: Double? = nil) async -> String? { do { let history = try await GatewayConnection.shared.chatHistory(sessionKey: sessionKey) let messages = history.messages ?? [] @@ -343,7 +363,13 @@ actor TalkModeRuntime { guard let data = try? JSONEncoder().encode(item) else { return nil } return try? JSONDecoder().decode(ClawdisChatMessage.self, from: data) } - guard let assistant = decoded.last(where: { $0.role == "assistant" }) else { return nil } + let assistant = decoded.last { message in + guard message.role == "assistant" else { return false } + guard let since else { return true } + guard let timestamp = message.timestamp else { return false } + return timestamp >= since - 0.5 + } + guard let assistant else { return nil } let text = assistant.content.compactMap { $0.text }.joined(separator: "\n") let trimmed = text.trimmingCharacters(in: CharacterSet.whitespacesAndNewlines) return trimmed.isEmpty ? nil : trimmed diff --git a/apps/macos/Sources/Clawdis/TalkOverlay.swift b/apps/macos/Sources/Clawdis/TalkOverlay.swift index 59555a104..e41d758f7 100644 --- a/apps/macos/Sources/Clawdis/TalkOverlay.swift +++ b/apps/macos/Sources/Clawdis/TalkOverlay.swift @@ -20,9 +20,9 @@ final class TalkOverlayController { private var window: NSPanel? private var hostingView: NSHostingView? - private let width: CGFloat = 120 - private let height: CGFloat = 120 - private let padding: CGFloat = 6 + private let width: CGFloat = 160 + private let height: CGFloat = 160 + private let padding: CGFloat = 8 func present() { self.ensureWindow() diff --git a/apps/macos/Sources/Clawdis/TalkOverlayView.swift b/apps/macos/Sources/Clawdis/TalkOverlayView.swift index f5484c439..d7b400ed3 100644 --- a/apps/macos/Sources/Clawdis/TalkOverlayView.swift +++ b/apps/macos/Sources/Clawdis/TalkOverlayView.swift @@ -7,12 +7,12 @@ struct TalkOverlayView: View { var body: some View { ZStack(alignment: .topLeading) { TalkOrbView(phase: self.controller.model.phase, level: self.controller.model.level) - .frame(width: 80, height: 80) + .frame(width: 96, height: 96) .contentShape(Rectangle()) .onTapGesture { TalkModeController.shared.stopSpeaking(reason: .userTap) } - .padding(16) + .padding(26) Button { TalkModeController.shared.exitTalkMode() @@ -29,7 +29,7 @@ struct TalkOverlayView: View { .padding(4) .onHover { self.hovering = $0 } } - .frame(width: 120, height: 120, alignment: .center) + .frame(width: 160, height: 160, alignment: .center) } } @@ -72,6 +72,7 @@ private struct TalkWaveRings: View { let phase: TalkModePhase let level: Double let time: TimeInterval + private let ringColor = Color(red: 0.82, green: 0.94, blue: 1.0) var body: some View { ZStack { @@ -80,9 +81,9 @@ private struct TalkWaveRings: View { let progress = (time * speed + Double(idx) * 0.28).truncatingRemainder(dividingBy: 1) let amplitude = phase == .speaking ? 0.95 : phase == .listening ? 0.5 + level * 0.7 : 0.35 let scale = 0.75 + progress * amplitude + (phase == .listening ? level * 0.15 : 0) - let alpha = phase == .speaking ? 0.55 : phase == .listening ? 0.45 + level * 0.25 : 0.28 + let alpha = phase == .speaking ? 0.72 : phase == .listening ? 0.58 + level * 0.28 : 0.4 Circle() - .stroke(Color.white.opacity(alpha - progress * 0.35), lineWidth: 1.2) + .stroke(self.ringColor.opacity(alpha - progress * 0.3), lineWidth: 1.6) .scaleEffect(scale) .opacity(alpha - progress * 0.6) } @@ -97,13 +98,13 @@ private struct TalkOrbitArcs: View { ZStack { Circle() .trim(from: 0.08, to: 0.26) - .stroke(Color.white.opacity(0.75), style: StrokeStyle(lineWidth: 1.4, lineCap: .round)) + .stroke(Color.white.opacity(0.88), style: StrokeStyle(lineWidth: 1.6, lineCap: .round)) .rotationEffect(.degrees(time * 42)) Circle() .trim(from: 0.62, to: 0.86) - .stroke(Color.white.opacity(0.55), style: StrokeStyle(lineWidth: 1.2, lineCap: .round)) + .stroke(Color.white.opacity(0.7), style: StrokeStyle(lineWidth: 1.4, lineCap: .round)) .rotationEffect(.degrees(-time * 35)) } - .scaleEffect(1.05) + .scaleEffect(1.08) } } diff --git a/docs/configuration.md b/docs/configuration.md index f15a8f046..a49e916f8 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -198,6 +198,7 @@ Controls inbound/outbound prefixes and timestamps. ### `talk` Defaults for Talk mode (macOS/iOS/Android). Voice IDs fall back to `ELEVENLABS_VOICE_ID` or `SAG_VOICE_ID` when unset. +`apiKey` falls back to `ELEVENLABS_API_KEY` (or the gateway’s shell profile) when unset. ```json5 { @@ -205,6 +206,7 @@ Defaults for Talk mode (macOS/iOS/Android). Voice IDs fall back to `ELEVENLABS_V voiceId: "elevenlabs_voice_id", modelId: "eleven_v3", outputFormat: "mp3_44100_128", + apiKey: "elevenlabs_api_key", interruptOnSpeech: true } } diff --git a/docs/talk.md b/docs/talk.md index 4c3cf53cb..41f8239a4 100644 --- a/docs/talk.md +++ b/docs/talk.md @@ -47,6 +47,7 @@ Supported keys: "voiceId": "elevenlabs_voice_id", "modelId": "eleven_v3", "outputFormat": "mp3_44100_128", + "apiKey": "elevenlabs_api_key", "interruptOnSpeech": true } } @@ -55,6 +56,7 @@ Supported keys: Defaults: - `interruptOnSpeech`: true - `voiceId`: falls back to `ELEVENLABS_VOICE_ID` / `SAG_VOICE_ID` +- `apiKey`: falls back to `ELEVENLABS_API_KEY` (or gateway shell profile if available) ## macOS UI - Menu bar toggle: **Talk** diff --git a/src/config/config.test.ts b/src/config/config.test.ts index c6669848a..ff2009a35 100644 --- a/src/config/config.test.ts +++ b/src/config/config.test.ts @@ -174,3 +174,50 @@ describe("config identity defaults", () => { }); }); }); + +describe("talk api key fallback", () => { + let previousEnv: string | undefined; + + beforeEach(() => { + previousEnv = process.env.ELEVENLABS_API_KEY; + delete process.env.ELEVENLABS_API_KEY; + }); + + afterEach(() => { + process.env.ELEVENLABS_API_KEY = previousEnv; + }); + + it("injects talk.apiKey from profile when config is missing", async () => { + await withTempHome(async (home) => { + await fs.writeFile( + path.join(home, ".profile"), + "export ELEVENLABS_API_KEY=profile-key\n", + "utf-8", + ); + + vi.resetModules(); + const { readConfigFileSnapshot } = await import("./config.js"); + const snap = await readConfigFileSnapshot(); + + expect(snap.config?.talk?.apiKey).toBe("profile-key"); + expect(snap.exists).toBe(false); + }); + }); + + it("prefers ELEVENLABS_API_KEY env over profile", async () => { + await withTempHome(async (home) => { + await fs.writeFile( + path.join(home, ".profile"), + "export ELEVENLABS_API_KEY=profile-key\n", + "utf-8", + ); + process.env.ELEVENLABS_API_KEY = "env-key"; + + vi.resetModules(); + const { readConfigFileSnapshot } = await import("./config.js"); + const snap = await readConfigFileSnapshot(); + + expect(snap.config?.talk?.apiKey).toBe("env-key"); + }); + }); +}); diff --git a/src/config/config.ts b/src/config/config.ts index 40ae5da06..870ba9621 100644 --- a/src/config/config.ts +++ b/src/config/config.ts @@ -226,6 +226,8 @@ export type TalkConfig = { modelId?: string; /** Default ElevenLabs output format (e.g. mp3_44100_128). */ outputFormat?: string; + /** ElevenLabs API key (optional; falls back to ELEVENLABS_API_KEY). */ + apiKey?: string; /** Stop speaking when user starts talking (default: true). */ interruptOnSpeech?: boolean; }; @@ -802,6 +804,7 @@ const ClawdisSchema = z.object({ voiceId: z.string().optional(), modelId: z.string().optional(), outputFormat: z.string().optional(), + apiKey: z.string().optional(), interruptOnSpeech: z.boolean().optional(), }) .optional(), @@ -964,17 +967,59 @@ export function parseConfigJson5( } } +function readTalkApiKeyFromProfile(): string | null { + const home = os.homedir(); + const candidates = [".profile", ".zprofile", ".zshrc", ".bashrc"].map( + (name) => path.join(home, name), + ); + for (const candidate of candidates) { + if (!fs.existsSync(candidate)) continue; + try { + const text = fs.readFileSync(candidate, "utf-8"); + const match = text.match( + /(?:^|\n)\s*(?:export\s+)?ELEVENLABS_API_KEY\s*=\s*["']?([^\n"']+)["']?/, + ); + const value = match?.[1]?.trim(); + if (value) return value; + } catch { + // Ignore profile read errors. + } + } + return null; +} + +function resolveTalkApiKey(): string | null { + const envValue = (process.env.ELEVENLABS_API_KEY ?? "").trim(); + if (envValue) return envValue; + return readTalkApiKeyFromProfile(); +} + +function applyTalkApiKey(config: ClawdisConfig): ClawdisConfig { + const resolved = resolveTalkApiKey(); + if (!resolved) return config; + const existing = config.talk?.apiKey?.trim(); + if (existing) return config; + return { + ...config, + talk: { + ...config.talk, + apiKey: resolved, + }, + }; +} + export async function readConfigFileSnapshot(): Promise { const configPath = CONFIG_PATH_CLAWDIS; const exists = fs.existsSync(configPath); if (!exists) { + const config = applyTalkApiKey({}); return { path: configPath, exists: false, raw: null, parsed: {}, valid: true, - config: {}, + config, issues: [], }; } @@ -1015,7 +1060,7 @@ export async function readConfigFileSnapshot(): Promise { raw, parsed: parsedRes.parsed, valid: true, - config: validated.config, + config: applyTalkApiKey(validated.config), issues: [], }; } catch (err) { diff --git a/src/gateway/protocol/index.ts b/src/gateway/protocol/index.ts index 725f37cc6..42a46160a 100644 --- a/src/gateway/protocol/index.ts +++ b/src/gateway/protocol/index.ts @@ -95,6 +95,8 @@ import { SnapshotSchema, type StateVersion, StateVersionSchema, + type TalkModeParams, + TalkModeParamsSchema, type TickEvent, TickEventSchema, type WakeParams, @@ -169,6 +171,8 @@ export const validateConfigGetParams = ajv.compile( export const validateConfigSetParams = ajv.compile( ConfigSetParamsSchema, ); +export const validateTalkModeParams = + ajv.compile(TalkModeParamsSchema); export const validateProvidersStatusParams = ajv.compile( ProvidersStatusParamsSchema, ); @@ -297,6 +301,7 @@ export type { NodePairApproveParams, ConfigGetParams, ConfigSetParams, + TalkModeParams, ProvidersStatusParams, WebLoginStartParams, WebLoginWaitParams, diff --git a/src/gateway/protocol/schema.ts b/src/gateway/protocol/schema.ts index 7ea3fc23a..ffd5260f8 100644 --- a/src/gateway/protocol/schema.ts +++ b/src/gateway/protocol/schema.ts @@ -339,6 +339,14 @@ export const ConfigSetParamsSchema = Type.Object( { additionalProperties: false }, ); +export const TalkModeParamsSchema = Type.Object( + { + enabled: Type.Boolean(), + phase: Type.Optional(Type.String()), + }, + { additionalProperties: false }, +); + export const ProvidersStatusParamsSchema = Type.Object( { probe: Type.Optional(Type.Boolean()), @@ -668,6 +676,7 @@ export const ProtocolSchemas: Record = { SessionsCompactParams: SessionsCompactParamsSchema, ConfigGetParams: ConfigGetParamsSchema, ConfigSetParams: ConfigSetParamsSchema, + TalkModeParams: TalkModeParamsSchema, ProvidersStatusParams: ProvidersStatusParamsSchema, WebLoginStartParams: WebLoginStartParamsSchema, WebLoginWaitParams: WebLoginWaitParamsSchema, @@ -724,6 +733,7 @@ export type SessionsDeleteParams = Static; export type SessionsCompactParams = Static; export type ConfigGetParams = Static; export type ConfigSetParams = Static; +export type TalkModeParams = Static; export type ProvidersStatusParams = Static; export type WebLoginStartParams = Static; export type WebLoginWaitParams = Static; diff --git a/src/gateway/server.ts b/src/gateway/server.ts index 819e8fefb..db6e506f8 100644 --- a/src/gateway/server.ts +++ b/src/gateway/server.ts @@ -393,6 +393,7 @@ import { validateSkillsInstallParams, validateSkillsStatusParams, validateSkillsUpdateParams, + validateTalkModeParams, validateWakeParams, validateWebLoginStartParams, validateWebLoginWaitParams, @@ -469,6 +470,7 @@ const METHODS = [ "status", "config.get", "config.set", + "talk.mode", "models.list", "skills.status", "skills.install", @@ -518,6 +520,7 @@ const EVENTS = [ "chat", "presence", "tick", + "talk.mode", "shutdown", "health", "heartbeat", @@ -2379,6 +2382,25 @@ export async function startGatewayServer( }), }; } + case "talk.mode": { + const params = parseParams(); + if (!validateTalkModeParams(params)) { + return { + ok: false, + error: { + code: ErrorCodes.INVALID_REQUEST, + message: `invalid talk.mode params: ${formatValidationErrors(validateTalkModeParams.errors)}`, + }, + }; + } + const payload = { + enabled: (params as { enabled: boolean }).enabled, + phase: (params as { phase?: string }).phase ?? null, + ts: Date.now(), + }; + broadcast("talk.mode", payload, { dropIfSlow: true }); + return { ok: true, payloadJSON: JSON.stringify(payload) }; + } case "models.list": { const params = parseParams(); if (!validateModelsListParams(params)) { @@ -4615,6 +4637,28 @@ export async function startGatewayServer( ); break; } + case "talk.mode": { + const params = (req.params ?? {}) as Record; + if (!validateTalkModeParams(params)) { + respond( + false, + undefined, + errorShape( + ErrorCodes.INVALID_REQUEST, + `invalid talk.mode params: ${formatValidationErrors(validateTalkModeParams.errors)}`, + ), + ); + break; + } + const payload = { + enabled: (params as { enabled: boolean }).enabled, + phase: (params as { phase?: string }).phase ?? null, + ts: Date.now(), + }; + broadcast("talk.mode", payload, { dropIfSlow: true }); + respond(true, payload, undefined); + break; + } case "skills.status": { const params = (req.params ?? {}) as Record; if (!validateSkillsStatusParams(params)) {