feat: talk mode key distribution and tts polling
This commit is contained in:
@@ -76,6 +76,7 @@ class TalkModeManager(
|
||||
private var defaultModelId: String? = null
|
||||
private var currentModelId: String? = null
|
||||
private var defaultOutputFormat: String? = null
|
||||
private var apiKey: String? = null
|
||||
private var interruptOnSpeech: Boolean = true
|
||||
private var voiceOverrideActive = false
|
||||
private var modelOverrideActive = false
|
||||
@@ -268,6 +269,7 @@ class TalkModeManager(
|
||||
}
|
||||
|
||||
try {
|
||||
val startedAt = System.currentTimeMillis().toDouble() / 1000.0
|
||||
val runId = sendChat(prompt, bridge)
|
||||
val ok = waitForChatFinal(runId)
|
||||
if (!ok) {
|
||||
@@ -275,7 +277,7 @@ class TalkModeManager(
|
||||
start()
|
||||
return
|
||||
}
|
||||
val assistant = fetchLatestAssistantText(bridge)
|
||||
val assistant = waitForAssistantText(bridge, startedAt, 12_000)
|
||||
if (assistant.isNullOrBlank()) {
|
||||
_statusText.value = "No reply"
|
||||
start()
|
||||
@@ -345,13 +347,34 @@ class TalkModeManager(
|
||||
return result
|
||||
}
|
||||
|
||||
private suspend fun fetchLatestAssistantText(bridge: BridgeSession): String? {
|
||||
private suspend fun waitForAssistantText(
|
||||
bridge: BridgeSession,
|
||||
sinceSeconds: Double,
|
||||
timeoutMs: Long,
|
||||
): String? {
|
||||
val deadline = SystemClock.elapsedRealtime() + timeoutMs
|
||||
while (SystemClock.elapsedRealtime() < deadline) {
|
||||
val text = fetchLatestAssistantText(bridge, sinceSeconds)
|
||||
if (!text.isNullOrBlank()) return text
|
||||
delay(300)
|
||||
}
|
||||
return null
|
||||
}
|
||||
|
||||
private suspend fun fetchLatestAssistantText(
|
||||
bridge: BridgeSession,
|
||||
sinceSeconds: Double? = null,
|
||||
): String? {
|
||||
val res = bridge.request("chat.history", "{\"sessionKey\":\"main\"}")
|
||||
val root = json.parseToJsonElement(res).asObjectOrNull() ?: return null
|
||||
val messages = root["messages"] as? JsonArray ?: return null
|
||||
for (item in messages.reversed()) {
|
||||
val obj = item.asObjectOrNull() ?: continue
|
||||
if (obj["role"].asStringOrNull() != "assistant") continue
|
||||
if (sinceSeconds != null) {
|
||||
val timestamp = obj["timestamp"].asDoubleOrNull()
|
||||
if (timestamp != null && timestamp < sinceSeconds - 0.5) continue
|
||||
}
|
||||
val content = obj["content"] as? JsonArray ?: continue
|
||||
val text =
|
||||
content.mapNotNull { entry ->
|
||||
@@ -390,7 +413,9 @@ class TalkModeManager(
|
||||
return
|
||||
}
|
||||
|
||||
val apiKey = System.getenv("ELEVENLABS_API_KEY")?.trim()
|
||||
val apiKey =
|
||||
apiKey?.trim()?.takeIf { it.isNotEmpty() }
|
||||
?: System.getenv("ELEVENLABS_API_KEY")?.trim()
|
||||
if (apiKey.isNullOrEmpty()) {
|
||||
_statusText.value = "Missing ELEVENLABS_API_KEY"
|
||||
return
|
||||
@@ -495,6 +520,7 @@ class TalkModeManager(
|
||||
val bridge = session ?: return
|
||||
val envVoice = System.getenv("ELEVENLABS_VOICE_ID")?.trim()
|
||||
val sagVoice = System.getenv("SAG_VOICE_ID")?.trim()
|
||||
val envKey = System.getenv("ELEVENLABS_API_KEY")?.trim()
|
||||
try {
|
||||
val res = bridge.request("config.get", "{}")
|
||||
val root = json.parseToJsonElement(res).asObjectOrNull()
|
||||
@@ -503,6 +529,7 @@ class TalkModeManager(
|
||||
val voice = talk?.get("voiceId")?.asStringOrNull()?.trim()?.takeIf { it.isNotEmpty() }
|
||||
val model = talk?.get("modelId")?.asStringOrNull()?.trim()?.takeIf { it.isNotEmpty() }
|
||||
val outputFormat = talk?.get("outputFormat")?.asStringOrNull()?.trim()?.takeIf { it.isNotEmpty() }
|
||||
val key = talk?.get("apiKey")?.asStringOrNull()?.trim()?.takeIf { it.isNotEmpty() }
|
||||
val interrupt = talk?.get("interruptOnSpeech")?.asBooleanOrNull()
|
||||
|
||||
defaultVoiceId = voice ?: envVoice?.takeIf { it.isNotEmpty() } ?: sagVoice?.takeIf { it.isNotEmpty() }
|
||||
@@ -510,9 +537,11 @@ class TalkModeManager(
|
||||
defaultModelId = model
|
||||
if (!modelOverrideActive) currentModelId = defaultModelId
|
||||
defaultOutputFormat = outputFormat
|
||||
apiKey = key ?: envKey?.takeIf { it.isNotEmpty() }
|
||||
if (interrupt != null) interruptOnSpeech = interrupt
|
||||
} catch (_: Throwable) {
|
||||
defaultVoiceId = envVoice?.takeIf { it.isNotEmpty() } ?: sagVoice?.takeIf { it.isNotEmpty() }
|
||||
apiKey = envKey?.takeIf { it.isNotEmpty() }
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -28,6 +28,7 @@ final class TalkModeManager: NSObject {
|
||||
private var defaultModelId: String?
|
||||
private var currentModelId: String?
|
||||
private var defaultOutputFormat: String?
|
||||
private var apiKey: String?
|
||||
private var interruptOnSpeech: Bool = true
|
||||
|
||||
private var bridge: BridgeSession?
|
||||
@@ -189,6 +190,7 @@ final class TalkModeManager: NSObject {
|
||||
}
|
||||
|
||||
do {
|
||||
let startedAt = Date().timeIntervalSince1970
|
||||
let runId = try await self.sendChat(prompt, bridge: bridge)
|
||||
let ok = await self.waitForChatFinal(runId: runId, bridge: bridge)
|
||||
if !ok {
|
||||
@@ -197,7 +199,11 @@ final class TalkModeManager: NSObject {
|
||||
return
|
||||
}
|
||||
|
||||
guard let assistantText = try await self.fetchLatestAssistantText(bridge: bridge) else {
|
||||
guard let assistantText = try await self.waitForAssistantText(
|
||||
bridge: bridge,
|
||||
since: startedAt,
|
||||
timeoutSeconds: 12)
|
||||
else {
|
||||
self.statusText = "No reply"
|
||||
await self.start()
|
||||
return
|
||||
@@ -259,7 +265,22 @@ final class TalkModeManager: NSObject {
|
||||
return false
|
||||
}
|
||||
|
||||
private func fetchLatestAssistantText(bridge: BridgeSession) async throws -> String? {
|
||||
private func waitForAssistantText(
|
||||
bridge: BridgeSession,
|
||||
since: Double,
|
||||
timeoutSeconds: Int) async throws -> String?
|
||||
{
|
||||
let deadline = Date().addingTimeInterval(TimeInterval(timeoutSeconds))
|
||||
while Date() < deadline {
|
||||
if let text = try await self.fetchLatestAssistantText(bridge: bridge, since: since) {
|
||||
return text
|
||||
}
|
||||
try? await Task.sleep(nanoseconds: 300_000_000)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
private func fetchLatestAssistantText(bridge: BridgeSession, since: Double? = nil) async throws -> String? {
|
||||
let res = try await bridge.request(
|
||||
method: "chat.history",
|
||||
paramsJSON: "{\"sessionKey\":\"main\"}",
|
||||
@@ -268,6 +289,9 @@ final class TalkModeManager: NSObject {
|
||||
guard let messages = json["messages"] as? [[String: Any]] else { return nil }
|
||||
for msg in messages.reversed() {
|
||||
guard (msg["role"] as? String) == "assistant" else { continue }
|
||||
if let since, let timestamp = msg["timestamp"] as? Double, timestamp < since - 0.5 {
|
||||
continue
|
||||
}
|
||||
guard let content = msg["content"] as? [[String: Any]] else { continue }
|
||||
let text = content.compactMap { $0["text"] as? String }.joined(separator: "\n")
|
||||
let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines)
|
||||
@@ -299,7 +323,10 @@ final class TalkModeManager: NSObject {
|
||||
return
|
||||
}
|
||||
|
||||
guard let apiKey = ProcessInfo.processInfo.environment["ELEVENLABS_API_KEY"], !apiKey.isEmpty else {
|
||||
let resolvedKey =
|
||||
(self.apiKey?.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty == false ? self.apiKey : nil) ??
|
||||
ProcessInfo.processInfo.environment["ELEVENLABS_API_KEY"]
|
||||
guard let apiKey = resolvedKey, !apiKey.isEmpty else {
|
||||
self.statusText = "Missing ELEVENLABS_API_KEY"
|
||||
return
|
||||
}
|
||||
@@ -375,6 +402,7 @@ final class TalkModeManager: NSObject {
|
||||
self.currentModelId = self.defaultModelId
|
||||
self.defaultOutputFormat = (talk?["outputFormat"] as? String)?
|
||||
.trimmingCharacters(in: .whitespacesAndNewlines)
|
||||
self.apiKey = (talk?["apiKey"] as? String)?.trimmingCharacters(in: .whitespacesAndNewlines)
|
||||
if let interrupt = talk?["interruptOnSpeech"] as? Bool {
|
||||
self.interruptOnSpeech = interrupt
|
||||
}
|
||||
|
||||
@@ -329,18 +329,26 @@ final class AppState {
|
||||
func setTalkEnabled(_ enabled: Bool) async {
|
||||
guard voiceWakeSupported else {
|
||||
self.talkEnabled = false
|
||||
await GatewayConnection.shared.talkMode(enabled: false, phase: "disabled")
|
||||
return
|
||||
}
|
||||
|
||||
self.talkEnabled = enabled
|
||||
guard !self.isPreview else { return }
|
||||
|
||||
if !enabled { return }
|
||||
if !enabled {
|
||||
await GatewayConnection.shared.talkMode(enabled: false, phase: "disabled")
|
||||
return
|
||||
}
|
||||
|
||||
if PermissionManager.voiceWakePermissionsGranted() { return }
|
||||
if PermissionManager.voiceWakePermissionsGranted() {
|
||||
await GatewayConnection.shared.talkMode(enabled: true, phase: "enabled")
|
||||
return
|
||||
}
|
||||
|
||||
let granted = await PermissionManager.ensureVoiceWakePermissions(interactive: true)
|
||||
self.talkEnabled = granted
|
||||
await GatewayConnection.shared.talkMode(enabled: granted, phase: granted ? "enabled" : "denied")
|
||||
}
|
||||
|
||||
// MARK: - Global wake words sync (Gateway-owned)
|
||||
|
||||
@@ -34,6 +34,7 @@ struct ConfigSettings: View {
|
||||
@State private var talkVoiceId: String = ""
|
||||
@State private var talkInterruptOnSpeech: Bool = true
|
||||
@State private var talkApiKey: String = ""
|
||||
@State private var gatewayApiKeyFound = false
|
||||
|
||||
var body: some View {
|
||||
ScrollView { self.content }
|
||||
@@ -49,6 +50,7 @@ struct ConfigSettings: View {
|
||||
self.hasLoaded = true
|
||||
self.loadConfig()
|
||||
await self.loadModels()
|
||||
await self.refreshGatewayTalkApiKey()
|
||||
self.allowAutosave = true
|
||||
}
|
||||
}
|
||||
@@ -323,6 +325,10 @@ struct ConfigSettings: View {
|
||||
Text("Using ELEVENLABS_API_KEY from the environment.")
|
||||
.font(.footnote)
|
||||
.foregroundStyle(.secondary)
|
||||
} else if self.gatewayApiKeyFound && self.talkApiKey.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty {
|
||||
Text("Using API key from the gateway profile.")
|
||||
.font(.footnote)
|
||||
.foregroundStyle(.secondary)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -392,6 +398,20 @@ struct ConfigSettings: View {
|
||||
}
|
||||
}
|
||||
|
||||
private func refreshGatewayTalkApiKey() async {
|
||||
do {
|
||||
let snap: ConfigSnapshot = try await GatewayConnection.shared.requestDecoded(
|
||||
method: .configGet,
|
||||
params: nil,
|
||||
timeoutMs: 8000)
|
||||
let talk = snap.config?["talk"]?.dictionaryValue
|
||||
let apiKey = talk?["apiKey"]?.stringValue?.trimmingCharacters(in: .whitespacesAndNewlines)
|
||||
self.gatewayApiKeyFound = !(apiKey ?? "").isEmpty
|
||||
} catch {
|
||||
self.gatewayApiKeyFound = false
|
||||
}
|
||||
}
|
||||
|
||||
private func autosaveConfig() {
|
||||
guard self.allowAutosave else { return }
|
||||
Task { await self.saveConfig() }
|
||||
@@ -487,12 +507,14 @@ struct ConfigSettings: View {
|
||||
if !self.talkApiKey.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty {
|
||||
return "ElevenLabs API key: stored in config"
|
||||
}
|
||||
if self.gatewayApiKeyFound { return "ElevenLabs API key: found (gateway)" }
|
||||
return "ElevenLabs API key: missing"
|
||||
}
|
||||
|
||||
private var apiKeyStatusColor: Color {
|
||||
if self.hasEnvApiKey { return .green }
|
||||
if !self.talkApiKey.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty { return .green }
|
||||
if self.gatewayApiKeyFound { return .green }
|
||||
return .red
|
||||
}
|
||||
|
||||
|
||||
@@ -51,6 +51,7 @@ actor GatewayConnection {
|
||||
case providersStatus = "providers.status"
|
||||
case configGet = "config.get"
|
||||
case configSet = "config.set"
|
||||
case talkMode = "talk.mode"
|
||||
case webLoginStart = "web.login.start"
|
||||
case webLoginWait = "web.login.wait"
|
||||
case webLogout = "web.logout"
|
||||
@@ -483,6 +484,12 @@ extension GatewayConnection {
|
||||
return res.aborted ?? false
|
||||
}
|
||||
|
||||
func talkMode(enabled: Bool, phase: String? = nil) async {
|
||||
var params: [String: AnyCodable] = ["enabled": AnyCodable(enabled)]
|
||||
if let phase { params["phase"] = AnyCodable(phase) }
|
||||
try? await self.requestVoid(method: .talkMode, params: params)
|
||||
}
|
||||
|
||||
// MARK: - VoiceWake
|
||||
|
||||
func voiceWakeGetTriggers() async throws -> [String] {
|
||||
|
||||
@@ -20,6 +20,7 @@ final class TalkModeController {
|
||||
|
||||
func updatePhase(_ phase: TalkModePhase) {
|
||||
TalkOverlayController.shared.updatePhase(phase)
|
||||
Task { await GatewayConnection.shared.talkMode(enabled: AppStateStore.shared.talkEnabled, phase: phase.rawValue) }
|
||||
}
|
||||
|
||||
func updateLevel(_ level: Double) {
|
||||
|
||||
@@ -244,6 +244,7 @@ actor TalkModeRuntime {
|
||||
await self.reloadConfig()
|
||||
let prompt = self.buildPrompt(transcript: transcript)
|
||||
let runId = UUID().uuidString
|
||||
let startedAt = Date().timeIntervalSince1970
|
||||
|
||||
do {
|
||||
let response = try await GatewayConnection.shared.chatSend(
|
||||
@@ -261,7 +262,11 @@ actor TalkModeRuntime {
|
||||
return
|
||||
}
|
||||
|
||||
guard let assistantText = await self.latestAssistantText(sessionKey: "main") else {
|
||||
guard let assistantText = await self.waitForAssistantText(
|
||||
sessionKey: "main",
|
||||
since: startedAt,
|
||||
timeoutSeconds: 12)
|
||||
else {
|
||||
await self.startListening()
|
||||
await self.startRecognition()
|
||||
return
|
||||
@@ -335,7 +340,22 @@ actor TalkModeRuntime {
|
||||
}
|
||||
}
|
||||
|
||||
private func latestAssistantText(sessionKey: String) async -> String? {
|
||||
private func waitForAssistantText(
|
||||
sessionKey: String,
|
||||
since: Double,
|
||||
timeoutSeconds: Int) async -> String?
|
||||
{
|
||||
let deadline = Date().addingTimeInterval(TimeInterval(timeoutSeconds))
|
||||
while Date() < deadline {
|
||||
if let text = await self.latestAssistantText(sessionKey: sessionKey, since: since) {
|
||||
return text
|
||||
}
|
||||
try? await Task.sleep(nanoseconds: 300_000_000)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
private func latestAssistantText(sessionKey: String, since: Double? = nil) async -> String? {
|
||||
do {
|
||||
let history = try await GatewayConnection.shared.chatHistory(sessionKey: sessionKey)
|
||||
let messages = history.messages ?? []
|
||||
@@ -343,7 +363,13 @@ actor TalkModeRuntime {
|
||||
guard let data = try? JSONEncoder().encode(item) else { return nil }
|
||||
return try? JSONDecoder().decode(ClawdisChatMessage.self, from: data)
|
||||
}
|
||||
guard let assistant = decoded.last(where: { $0.role == "assistant" }) else { return nil }
|
||||
let assistant = decoded.last { message in
|
||||
guard message.role == "assistant" else { return false }
|
||||
guard let since else { return true }
|
||||
guard let timestamp = message.timestamp else { return false }
|
||||
return timestamp >= since - 0.5
|
||||
}
|
||||
guard let assistant else { return nil }
|
||||
let text = assistant.content.compactMap { $0.text }.joined(separator: "\n")
|
||||
let trimmed = text.trimmingCharacters(in: CharacterSet.whitespacesAndNewlines)
|
||||
return trimmed.isEmpty ? nil : trimmed
|
||||
|
||||
@@ -20,9 +20,9 @@ final class TalkOverlayController {
|
||||
private var window: NSPanel?
|
||||
private var hostingView: NSHostingView<TalkOverlayView>?
|
||||
|
||||
private let width: CGFloat = 120
|
||||
private let height: CGFloat = 120
|
||||
private let padding: CGFloat = 6
|
||||
private let width: CGFloat = 160
|
||||
private let height: CGFloat = 160
|
||||
private let padding: CGFloat = 8
|
||||
|
||||
func present() {
|
||||
self.ensureWindow()
|
||||
|
||||
@@ -7,12 +7,12 @@ struct TalkOverlayView: View {
|
||||
var body: some View {
|
||||
ZStack(alignment: .topLeading) {
|
||||
TalkOrbView(phase: self.controller.model.phase, level: self.controller.model.level)
|
||||
.frame(width: 80, height: 80)
|
||||
.frame(width: 96, height: 96)
|
||||
.contentShape(Rectangle())
|
||||
.onTapGesture {
|
||||
TalkModeController.shared.stopSpeaking(reason: .userTap)
|
||||
}
|
||||
.padding(16)
|
||||
.padding(26)
|
||||
|
||||
Button {
|
||||
TalkModeController.shared.exitTalkMode()
|
||||
@@ -29,7 +29,7 @@ struct TalkOverlayView: View {
|
||||
.padding(4)
|
||||
.onHover { self.hovering = $0 }
|
||||
}
|
||||
.frame(width: 120, height: 120, alignment: .center)
|
||||
.frame(width: 160, height: 160, alignment: .center)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -72,6 +72,7 @@ private struct TalkWaveRings: View {
|
||||
let phase: TalkModePhase
|
||||
let level: Double
|
||||
let time: TimeInterval
|
||||
private let ringColor = Color(red: 0.82, green: 0.94, blue: 1.0)
|
||||
|
||||
var body: some View {
|
||||
ZStack {
|
||||
@@ -80,9 +81,9 @@ private struct TalkWaveRings: View {
|
||||
let progress = (time * speed + Double(idx) * 0.28).truncatingRemainder(dividingBy: 1)
|
||||
let amplitude = phase == .speaking ? 0.95 : phase == .listening ? 0.5 + level * 0.7 : 0.35
|
||||
let scale = 0.75 + progress * amplitude + (phase == .listening ? level * 0.15 : 0)
|
||||
let alpha = phase == .speaking ? 0.55 : phase == .listening ? 0.45 + level * 0.25 : 0.28
|
||||
let alpha = phase == .speaking ? 0.72 : phase == .listening ? 0.58 + level * 0.28 : 0.4
|
||||
Circle()
|
||||
.stroke(Color.white.opacity(alpha - progress * 0.35), lineWidth: 1.2)
|
||||
.stroke(self.ringColor.opacity(alpha - progress * 0.3), lineWidth: 1.6)
|
||||
.scaleEffect(scale)
|
||||
.opacity(alpha - progress * 0.6)
|
||||
}
|
||||
@@ -97,13 +98,13 @@ private struct TalkOrbitArcs: View {
|
||||
ZStack {
|
||||
Circle()
|
||||
.trim(from: 0.08, to: 0.26)
|
||||
.stroke(Color.white.opacity(0.75), style: StrokeStyle(lineWidth: 1.4, lineCap: .round))
|
||||
.stroke(Color.white.opacity(0.88), style: StrokeStyle(lineWidth: 1.6, lineCap: .round))
|
||||
.rotationEffect(.degrees(time * 42))
|
||||
Circle()
|
||||
.trim(from: 0.62, to: 0.86)
|
||||
.stroke(Color.white.opacity(0.55), style: StrokeStyle(lineWidth: 1.2, lineCap: .round))
|
||||
.stroke(Color.white.opacity(0.7), style: StrokeStyle(lineWidth: 1.4, lineCap: .round))
|
||||
.rotationEffect(.degrees(-time * 35))
|
||||
}
|
||||
.scaleEffect(1.05)
|
||||
.scaleEffect(1.08)
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user