feat: talk mode key distribution and tts polling

This commit is contained in:
Peter Steinberger
2025-12-30 01:57:45 +01:00
parent 02db68aa67
commit e119a82334
17 changed files with 303 additions and 24 deletions

View File

@@ -329,18 +329,26 @@ final class AppState {
/// Turns talk mode on or off and reports the resulting state to the gateway.
///
/// Every exit path except the preview short-circuit sends a `talkMode` update so the
/// gateway sees the same state as `talkEnabled`:
/// - voice wake unsupported: forces `talkEnabled` to `false`, phase `"disabled"`.
/// - `enabled == false`: phase `"disabled"`.
/// - permissions already granted: phase `"enabled"`.
/// - permissions requested: `talkEnabled` follows the outcome, phase `"enabled"` or `"denied"`.
///
/// - Parameter enabled: The requested talk mode state.
func setTalkEnabled(_ enabled: Bool) async {
    guard voiceWakeSupported else {
        self.talkEnabled = false
        await GatewayConnection.shared.talkMode(enabled: false, phase: "disabled")
        return
    }

    self.talkEnabled = enabled
    // Previews only update local state; they never talk to the gateway or ask for permissions.
    guard !self.isPreview else { return }

    // A bare early return here would skip the gateway notification, so the
    // "disabled" update is sent before leaving.
    if !enabled {
        await GatewayConnection.shared.talkMode(enabled: false, phase: "disabled")
        return
    }

    // Likewise, the already-granted path must still announce "enabled".
    if PermissionManager.voiceWakePermissionsGranted() {
        await GatewayConnection.shared.talkMode(enabled: true, phase: "enabled")
        return
    }

    // Permissions are missing: request them (interactive: true) and let the outcome
    // decide both the local flag and the phase reported to the gateway.
    let granted = await PermissionManager.ensureVoiceWakePermissions(interactive: true)
    self.talkEnabled = granted
    await GatewayConnection.shared.talkMode(enabled: granted, phase: granted ? "enabled" : "denied")
}
// MARK: - Global wake words sync (Gateway-owned)

View File

@@ -34,6 +34,7 @@ struct ConfigSettings: View {
@State private var talkVoiceId: String = ""
@State private var talkInterruptOnSpeech: Bool = true
@State private var talkApiKey: String = ""
@State private var gatewayApiKeyFound = false
var body: some View {
ScrollView { self.content }
@@ -49,6 +50,7 @@ struct ConfigSettings: View {
self.hasLoaded = true
self.loadConfig()
await self.loadModels()
await self.refreshGatewayTalkApiKey()
self.allowAutosave = true
}
}
@@ -323,6 +325,10 @@ struct ConfigSettings: View {
Text("Using ELEVENLABS_API_KEY from the environment.")
.font(.footnote)
.foregroundStyle(.secondary)
} else if self.gatewayApiKeyFound && self.talkApiKey.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty {
Text("Using API key from the gateway profile.")
.font(.footnote)
.foregroundStyle(.secondary)
}
}
}
@@ -392,6 +398,20 @@ struct ConfigSettings: View {
}
}
/// Fetches the gateway config via `config.get` and records whether it carries a
/// non-blank `talk.apiKey`.
///
/// Any request or decoding failure is treated as "no key found".
private func refreshGatewayTalkApiKey() async {
    let snapshot: ConfigSnapshot
    do {
        snapshot = try await GatewayConnection.shared.requestDecoded(
            method: .configGet,
            params: nil,
            timeoutMs: 8000)
    } catch {
        self.gatewayApiKeyFound = false
        return
    }

    // A missing section, missing key, or non-string value all collapse to an empty string.
    let talkSection = snapshot.config?["talk"]?.dictionaryValue
    let rawKey = talkSection?["apiKey"]?.stringValue ?? ""
    let trimmedKey = rawKey.trimmingCharacters(in: .whitespacesAndNewlines)
    self.gatewayApiKeyFound = !trimmedKey.isEmpty
}
private func autosaveConfig() {
guard self.allowAutosave else { return }
Task { await self.saveConfig() }
@@ -487,12 +507,14 @@ struct ConfigSettings: View {
if !self.talkApiKey.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty {
return "ElevenLabs API key: stored in config"
}
if self.gatewayApiKeyFound { return "ElevenLabs API key: found (gateway)" }
return "ElevenLabs API key: missing"
}
/// Status colour for the API key indicator: green when a key is available from the
/// environment, the local config field, or the gateway; red otherwise.
private var apiKeyStatusColor: Color {
    let hasStoredKey = !self.talkApiKey.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty
    let hasAnyKey = self.hasEnvApiKey || hasStoredKey || self.gatewayApiKeyFound
    return hasAnyKey ? .green : .red
}

View File

@@ -51,6 +51,7 @@ actor GatewayConnection {
case providersStatus = "providers.status"
case configGet = "config.get"
case configSet = "config.set"
case talkMode = "talk.mode"
case webLoginStart = "web.login.start"
case webLoginWait = "web.login.wait"
case webLogout = "web.logout"
@@ -483,6 +484,12 @@ extension GatewayConnection {
return res.aborted ?? false
}
/// Sends a `talk.mode` request describing the current talk mode state.
///
/// The call is best-effort: a failed request is ignored and nothing is reported
/// back to the caller.
///
/// - Parameters:
///   - enabled: Whether talk mode is currently on.
///   - phase: Optional phase label; the `"phase"` key is omitted when this is `nil`.
func talkMode(enabled: Bool, phase: String? = nil) async {
    var payload: [String: AnyCodable] = [:]
    payload["enabled"] = AnyCodable(enabled)
    if let phase {
        payload["phase"] = AnyCodable(phase)
    }

    do {
        try await self.requestVoid(method: .talkMode, params: payload)
    } catch {
        // Intentionally ignored: this notification is best-effort.
    }
}
// MARK: - VoiceWake
func voiceWakeGetTriggers() async throws -> [String] {

View File

@@ -20,6 +20,7 @@ final class TalkModeController {
/// Forwards a phase change to the talk overlay and reports it to the gateway.
///
/// - Parameter phase: The new talk mode phase; its `rawValue` is sent as the gateway phase label.
func updatePhase(_ phase: TalkModePhase) {
TalkOverlayController.shared.updatePhase(phase)
// Fire-and-forget: the gateway update runs in its own task so this synchronous
// method returns without waiting for it.
Task { await GatewayConnection.shared.talkMode(enabled: AppStateStore.shared.talkEnabled, phase: phase.rawValue) }
}
func updateLevel(_ level: Double) {

View File

@@ -244,6 +244,7 @@ actor TalkModeRuntime {
await self.reloadConfig()
let prompt = self.buildPrompt(transcript: transcript)
let runId = UUID().uuidString
let startedAt = Date().timeIntervalSince1970
do {
let response = try await GatewayConnection.shared.chatSend(
@@ -261,7 +262,11 @@ actor TalkModeRuntime {
return
}
guard let assistantText = await self.latestAssistantText(sessionKey: "main") else {
guard let assistantText = await self.waitForAssistantText(
sessionKey: "main",
since: startedAt,
timeoutSeconds: 12)
else {
await self.startListening()
await self.startRecognition()
return
@@ -335,7 +340,22 @@ actor TalkModeRuntime {
}
}
private func latestAssistantText(sessionKey: String) async -> String? {
/// Polls the chat history until an assistant message newer than `since` appears.
///
/// The history is queried roughly every 300 ms. Polling stops when text is found,
/// the timeout elapses, or the surrounding task is cancelled.
///
/// - Parameters:
///   - sessionKey: Session whose history is polled.
///   - since: Lower-bound timestamp passed through to `latestAssistantText`.
///   - timeoutSeconds: Maximum time to keep polling, in seconds.
/// - Returns: The assistant text, or `nil` on timeout or cancellation.
private func waitForAssistantText(
    sessionKey: String,
    since: Double,
    timeoutSeconds: Int) async -> String?
{
    let deadline = Date().addingTimeInterval(TimeInterval(timeoutSeconds))
    while Date() < deadline, !Task.isCancelled {
        if let text = await self.latestAssistantText(sessionKey: sessionKey, since: since) {
            return text
        }
        do {
            try await Task.sleep(nanoseconds: 300_000_000)
        } catch {
            // Task.sleep throws only when the task is cancelled. Swallowing that would
            // turn this loop into back-to-back gateway requests until the deadline.
            return nil
        }
    }
    return nil
}
private func latestAssistantText(sessionKey: String, since: Double? = nil) async -> String? {
do {
let history = try await GatewayConnection.shared.chatHistory(sessionKey: sessionKey)
let messages = history.messages ?? []
@@ -343,7 +363,13 @@ actor TalkModeRuntime {
guard let data = try? JSONEncoder().encode(item) else { return nil }
return try? JSONDecoder().decode(ClawdisChatMessage.self, from: data)
}
guard let assistant = decoded.last(where: { $0.role == "assistant" }) else { return nil }
let assistant = decoded.last { message in
guard message.role == "assistant" else { return false }
guard let since else { return true }
guard let timestamp = message.timestamp else { return false }
return timestamp >= since - 0.5
}
guard let assistant else { return nil }
let text = assistant.content.compactMap { $0.text }.joined(separator: "\n")
let trimmed = text.trimmingCharacters(in: CharacterSet.whitespacesAndNewlines)
return trimmed.isEmpty ? nil : trimmed

View File

@@ -20,9 +20,9 @@ final class TalkOverlayController {
private var window: NSPanel?
private var hostingView: NSHostingView<TalkOverlayView>?
private let width: CGFloat = 120
private let height: CGFloat = 120
private let padding: CGFloat = 6
private let width: CGFloat = 160
private let height: CGFloat = 160
private let padding: CGFloat = 8
func present() {
self.ensureWindow()

View File

@@ -7,12 +7,12 @@ struct TalkOverlayView: View {
var body: some View {
ZStack(alignment: .topLeading) {
TalkOrbView(phase: self.controller.model.phase, level: self.controller.model.level)
.frame(width: 80, height: 80)
.frame(width: 96, height: 96)
.contentShape(Rectangle())
.onTapGesture {
TalkModeController.shared.stopSpeaking(reason: .userTap)
}
.padding(16)
.padding(26)
Button {
TalkModeController.shared.exitTalkMode()
@@ -29,7 +29,7 @@ struct TalkOverlayView: View {
.padding(4)
.onHover { self.hovering = $0 }
}
.frame(width: 120, height: 120, alignment: .center)
.frame(width: 160, height: 160, alignment: .center)
}
}
@@ -72,6 +72,7 @@ private struct TalkWaveRings: View {
let phase: TalkModePhase
let level: Double
let time: TimeInterval
private let ringColor = Color(red: 0.82, green: 0.94, blue: 1.0)
var body: some View {
ZStack {
@@ -80,9 +81,9 @@ private struct TalkWaveRings: View {
let progress = (time * speed + Double(idx) * 0.28).truncatingRemainder(dividingBy: 1)
let amplitude = phase == .speaking ? 0.95 : phase == .listening ? 0.5 + level * 0.7 : 0.35
let scale = 0.75 + progress * amplitude + (phase == .listening ? level * 0.15 : 0)
let alpha = phase == .speaking ? 0.55 : phase == .listening ? 0.45 + level * 0.25 : 0.28
let alpha = phase == .speaking ? 0.72 : phase == .listening ? 0.58 + level * 0.28 : 0.4
Circle()
.stroke(Color.white.opacity(alpha - progress * 0.35), lineWidth: 1.2)
.stroke(self.ringColor.opacity(alpha - progress * 0.3), lineWidth: 1.6)
.scaleEffect(scale)
.opacity(alpha - progress * 0.6)
}
@@ -97,13 +98,13 @@ private struct TalkOrbitArcs: View {
ZStack {
Circle()
.trim(from: 0.08, to: 0.26)
.stroke(Color.white.opacity(0.75), style: StrokeStyle(lineWidth: 1.4, lineCap: .round))
.stroke(Color.white.opacity(0.88), style: StrokeStyle(lineWidth: 1.6, lineCap: .round))
.rotationEffect(.degrees(time * 42))
Circle()
.trim(from: 0.62, to: 0.86)
.stroke(Color.white.opacity(0.55), style: StrokeStyle(lineWidth: 1.2, lineCap: .round))
.stroke(Color.white.opacity(0.7), style: StrokeStyle(lineWidth: 1.4, lineCap: .round))
.rotationEffect(.degrees(-time * 35))
}
.scaleEffect(1.05)
.scaleEffect(1.08)
}
}