diff --git a/apps/android/app/src/main/java/com/steipete/clawdis/node/voice/TalkModeManager.kt b/apps/android/app/src/main/java/com/steipete/clawdis/node/voice/TalkModeManager.kt index 2bf35a276..de32c95c3 100644 --- a/apps/android/app/src/main/java/com/steipete/clawdis/node/voice/TalkModeManager.kt +++ b/apps/android/app/src/main/java/com/steipete/clawdis/node/voice/TalkModeManager.kt @@ -84,20 +84,24 @@ class TalkModeManager( private var session: BridgeSession? = null private var pendingRunId: String? = null private var pendingFinal: CompletableDeferred? = null + private var chatSubscribed = false private var player: MediaPlayer? = null private var currentAudioFile: File? = null fun attachSession(session: BridgeSession) { this.session = session + chatSubscribed = false } fun setEnabled(enabled: Boolean) { if (_isEnabled.value == enabled) return _isEnabled.value = enabled if (enabled) { + Log.d(tag, "enabled") start() } else { + Log.d(tag, "disabled") stop() } } @@ -127,9 +131,11 @@ class TalkModeManager( if (_isListening.value) return@post stopRequested = false listeningMode = true + Log.d(tag, "start") if (!SpeechRecognizer.isRecognitionAvailable(context)) { _statusText.value = "Speech recognizer unavailable" + Log.w(tag, "speech recognizer unavailable") return@post } @@ -138,6 +144,7 @@ class TalkModeManager( PackageManager.PERMISSION_GRANTED if (!micOk) { _statusText.value = "Microphone permission required" + Log.w(tag, "microphone permission required") return@post } @@ -146,8 +153,10 @@ class TalkModeManager( recognizer = SpeechRecognizer.createSpeechRecognizer(context).also { it.setRecognitionListener(listener) } startListeningInternal(markListening = true) startSilenceMonitor() + Log.d(tag, "listening") } catch (err: Throwable) { _statusText.value = "Start failed: ${err.message ?: err::class.simpleName}" + Log.w(tag, "start failed: ${err.message ?: err::class.simpleName}") } } } @@ -164,6 +173,7 @@ class TalkModeManager( _isListening.value = false _statusText.value = "Off" stopSpeaking() + chatSubscribed = false mainHandler.post { recognizer?.cancel() @@ -264,28 +274,36 @@ class TalkModeManager( val bridge = session if (bridge == null) { _statusText.value = "Bridge not connected" + Log.w(tag, "finalize: bridge not connected") start() return } try { val startedAt = System.currentTimeMillis().toDouble() / 1000.0 + subscribeChatIfNeeded(bridge = bridge, sessionKey = "main") + Log.d(tag, "chat.send start chars=${prompt.length}") val runId = sendChat(prompt, bridge) + Log.d(tag, "chat.send ok runId=$runId") val ok = waitForChatFinal(runId) if (!ok) { _statusText.value = "No reply" + Log.w(tag, "chat final timeout runId=$runId") start() return } val assistant = waitForAssistantText(bridge, startedAt, 12_000) if (assistant.isNullOrBlank()) { _statusText.value = "No reply" + Log.w(tag, "assistant text timeout runId=$runId") start() return } + Log.d(tag, "assistant text ok chars=${assistant.length}") playAssistant(assistant) } catch (err: Throwable) { _statusText.value = "Talk failed: ${err.message ?: err::class.simpleName}" + Log.w(tag, "finalize failed: ${err.message ?: err::class.simpleName}") } if (_isEnabled.value) { @@ -293,6 +311,19 @@ class TalkModeManager( } } + private suspend fun subscribeChatIfNeeded(bridge: BridgeSession, sessionKey: String) { + if (chatSubscribed) return + val key = sessionKey.trim() + if (key.isEmpty()) return + try { + bridge.sendEvent("chat.subscribe", """{"sessionKey":"$key"}""") + chatSubscribed = true + Log.d(tag, "chat.subscribe ok sessionKey=$key") + } catch (err: Throwable) { + Log.w(tag, "chat.subscribe failed sessionKey=$key err=${err.message ?: err::class.java.simpleName}") + } + } + private fun buildPrompt(transcript: String): String { val lines = mutableListOf( "Talk Mode active. Reply in a concise, spoken tone.", @@ -410,6 +441,7 @@ class TalkModeManager( val voiceId = directive?.voiceId ?: currentVoiceId ?: defaultVoiceId if (voiceId.isNullOrBlank()) { _statusText.value = "Missing voice ID" + Log.w(tag, "missing voiceId") return } @@ -418,6 +450,7 @@ class TalkModeManager( ?: System.getenv("ELEVENLABS_API_KEY")?.trim() if (apiKey.isNullOrEmpty()) { _statusText.value = "Missing ELEVENLABS_API_KEY" + Log.w(tag, "missing ELEVENLABS_API_KEY") return } @@ -427,6 +460,7 @@ class TalkModeManager( ensureInterruptListener() try { + val ttsStarted = SystemClock.elapsedRealtime() val request = ElevenLabsRequest( text = cleaned, @@ -442,9 +476,11 @@ class TalkModeManager( language = TalkModeRuntime.validatedLanguage(directive?.language), ) val audio = synthesize(voiceId = voiceId, apiKey = apiKey, request = request) + Log.d(tag, "elevenlabs ok bytes=${audio.size} durMs=${SystemClock.elapsedRealtime() - ttsStarted}") playAudio(audio) } catch (err: Throwable) { _statusText.value = "Speak failed: ${err.message ?: err::class.simpleName}" + Log.w(tag, "speak failed: ${err.message ?: err::class.simpleName}") } _isSpeaking.value = false @@ -480,11 +516,13 @@ class TalkModeManager( player.prepareAsync() } + Log.d(tag, "play start") try { finished.await() } finally { cleanupPlayer() } + Log.d(tag, "play done") } private fun stopSpeaking(resetInterrupt: Boolean = true) { diff --git a/apps/ios/Sources/Voice/TalkModeManager.swift b/apps/ios/Sources/Voice/TalkModeManager.swift index c11b890a9..6e2e5440d 100644 --- a/apps/ios/Sources/Voice/TalkModeManager.swift +++ b/apps/ios/Sources/Voice/TalkModeManager.swift @@ -3,6 +3,7 @@ import ClawdisKit import Foundation import Observation import Speech +import OSLog @MainActor @Observable @@ -36,6 +37,9 @@ final class TalkModeManager: NSObject { private let silenceWindow: TimeInterval = 0.7 private var player: AVAudioPlayer? + private var chatSubscribedSessionKeys = Set() + + private let logger = Logger(subsystem: "com.steipete.clawdis", category: "TalkMode") func attachBridge(_ bridge: BridgeSession) { self.bridge = bridge @@ -44,8 +48,10 @@ final class TalkModeManager: NSObject { func setEnabled(_ enabled: Bool) { self.isEnabled = enabled if enabled { + self.logger.info("enabled") Task { await self.start() } } else { + self.logger.info("disabled") self.stop() } } @@ -54,14 +60,17 @@ final class TalkModeManager: NSObject { guard self.isEnabled else { return } if self.isListening { return } + self.logger.info("start") self.statusText = "Requesting permissions…" let micOk = await Self.requestMicrophonePermission() guard micOk else { + self.logger.warning("start blocked: microphone permission denied") self.statusText = "Microphone permission denied" return } let speechOk = await Self.requestSpeechPermission() guard speechOk else { + self.logger.warning("start blocked: speech permission denied") self.statusText = "Speech recognition permission denied" return } @@ -73,9 +82,12 @@ final class TalkModeManager: NSObject { self.isListening = true self.statusText = "Listening" self.startSilenceMonitor() + await self.subscribeChatIfNeeded(sessionKey: "main") + self.logger.info("listening") } catch { self.isListening = false self.statusText = "Start failed: \(error.localizedDescription)" + self.logger.error("start failed: \(error.localizedDescription, privacy: .public)") } } @@ -89,6 +101,11 @@ final class TalkModeManager: NSObject { self.silenceTask = nil self.stopRecognition() self.stopSpeaking() + Task { await self.unsubscribeAllChats() } + } + + func userTappedOrb() { + self.stopSpeaking() } private func startRecognition() throws { @@ -191,16 +208,21 @@ final class TalkModeManager: NSObject { let prompt = self.buildPrompt(transcript: transcript) guard let bridge else { self.statusText = "Bridge not connected" + self.logger.warning("finalize: bridge not connected") await self.start() return } do { let startedAt = Date().timeIntervalSince1970 + await self.subscribeChatIfNeeded(sessionKey: "main") + self.logger.info("chat.send start chars=\(prompt.count, privacy: .public)") let runId = try await self.sendChat(prompt, bridge: bridge) + self.logger.info("chat.send ok runId=\(runId, privacy: .public)") let ok = await self.waitForChatFinal(runId: runId, bridge: bridge) if !ok { self.statusText = "No reply" + self.logger.warning("chat final timeout runId=\(runId, privacy: .public)") await self.start() return } @@ -211,17 +233,50 @@ final class TalkModeManager: NSObject { timeoutSeconds: 12) else { self.statusText = "No reply" + self.logger.warning("assistant text timeout runId=\(runId, privacy: .public)") await self.start() return } + self.logger.info("assistant text ok chars=\(assistantText.count, privacy: .public)") await self.playAssistant(text: assistantText) } catch { self.statusText = "Talk failed: \(error.localizedDescription)" + self.logger.error("finalize failed: \(error.localizedDescription, privacy: .public)") } await self.start() } + private func subscribeChatIfNeeded(sessionKey: String) async { + let key = sessionKey.trimmingCharacters(in: .whitespacesAndNewlines) + guard !key.isEmpty else { return } + guard let bridge else { return } + guard !self.chatSubscribedSessionKeys.contains(key) else { return } + + do { + let payload = "{\"sessionKey\":\"\(key)\"}" + try await bridge.sendEvent(event: "chat.subscribe", payloadJSON: payload) + self.chatSubscribedSessionKeys.insert(key) + self.logger.info("chat.subscribe ok sessionKey=\(key, privacy: .public)") + } catch { + self.logger.warning("chat.subscribe failed sessionKey=\(key, privacy: .public) err=\(error.localizedDescription, privacy: .public)") + } + } + + private func unsubscribeAllChats() async { + guard let bridge else { return } + let keys = self.chatSubscribedSessionKeys + self.chatSubscribedSessionKeys.removeAll() + for key in keys { + do { + let payload = "{\"sessionKey\":\"\(key)\"}" + try await bridge.sendEvent(event: "chat.unsubscribe", payloadJSON: payload) + } catch { + // ignore + } + } + } + private func buildPrompt(transcript: String) -> String { var lines: [String] = [ "Talk Mode active. Reply in a concise, spoken tone.", @@ -326,6 +381,7 @@ final class TalkModeManager: NSObject { let voiceId = directive?.voiceId ?? self.currentVoiceId ?? self.defaultVoiceId guard let voiceId, !voiceId.isEmpty else { self.statusText = "Missing voice ID" + self.logger.error("missing voiceId") return } @@ -334,6 +390,7 @@ final class TalkModeManager: NSObject { ProcessInfo.processInfo.environment["ELEVENLABS_API_KEY"] guard let apiKey = resolvedKey, !apiKey.isEmpty else { self.statusText = "Missing ELEVENLABS_API_KEY" + self.logger.error("missing ELEVENLABS_API_KEY") return } @@ -342,6 +399,7 @@ final class TalkModeManager: NSObject { self.lastSpokenText = cleaned do { + let started = Date() let request = ElevenLabsRequest( text: cleaned, modelId: directive?.modelId ?? self.currentModelId ?? self.defaultModelId, @@ -359,9 +417,11 @@ final class TalkModeManager: NSObject { let audio = try await ElevenLabsClient(apiKey: apiKey).synthesize( voiceId: voiceId, request: request) + self.logger.info("elevenlabs ok bytes=\(audio.count, privacy: .public) dur=\(Date().timeIntervalSince(started), privacy: .public)s") try await self.playAudio(data: audio) } catch { self.statusText = "Speak failed: \(error.localizedDescription)" + self.logger.error("speak failed: \(error.localizedDescription, privacy: .public)") } self.isSpeaking = false @@ -372,10 +432,12 @@ final class TalkModeManager: NSObject { let player = try AVAudioPlayer(data: data) self.player = player player.prepareToPlay() + self.logger.info("play start") player.play() while player.isPlaying { try? await Task.sleep(nanoseconds: 120_000_000) } + self.logger.info("play done") } private func stopSpeaking() { diff --git a/apps/macos/Sources/Clawdis/GatewayConnection.swift b/apps/macos/Sources/Clawdis/GatewayConnection.swift index 289507cc5..cd87eea2c 100644 --- a/apps/macos/Sources/Clawdis/GatewayConnection.swift +++ b/apps/macos/Sources/Clawdis/GatewayConnection.swift @@ -473,7 +473,10 @@ extension GatewayConnection { params["attachments"] = AnyCodable(encoded) } - return try await self.requestDecoded(method: .chatSend, params: params) + return try await self.requestDecoded( + method: .chatSend, + params: params, + timeoutMs: Double(timeoutMs)) } func chatAbort(sessionKey: String, runId: String) async throws -> Bool { diff --git a/apps/shared/ClawdisKit/Sources/ClawdisChatUI/ChatViewModel.swift b/apps/shared/ClawdisKit/Sources/ClawdisChatUI/ChatViewModel.swift index 4c96b8075..4936d4438 100644 --- a/apps/shared/ClawdisKit/Sources/ClawdisChatUI/ChatViewModel.swift +++ b/apps/shared/ClawdisKit/Sources/ClawdisChatUI/ChatViewModel.swift @@ -293,8 +293,15 @@ public final class ClawdisChatViewModel { return } - if let runId = chat.runId, !self.pendingRuns.contains(runId) { - // Ignore events for other runs. + let isOurRun = chat.runId.flatMap { self.pendingRuns.contains($0) } ?? false + if !isOurRun { + // Keep multiple clients in sync: if another client finishes a run for our session, refresh history. + switch chat.state { + case "final", "aborted", "error": + Task { await self.refreshHistoryAfterRun() } + default: + break + } return } diff --git a/src/gateway/server.test.ts b/src/gateway/server.test.ts index dec83329d..22852c049 100644 --- a/src/gateway/server.test.ts +++ b/src/gateway/server.test.ts @@ -435,7 +435,10 @@ async function waitForSystemEvent(timeoutMs = 2000) { } describe("gateway server", () => { - test("voicewake.get returns defaults and voicewake.set broadcasts", async () => { + test( + "voicewake.get returns defaults and voicewake.set broadcasts", + { timeout: 15_000 }, + async () => { const homeDir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdis-home-")); const prevHome = process.env.HOME; process.env.HOME = homeDir; @@ -486,7 +489,8 @@ describe("gateway server", () => { } else { process.env.HOME = prevHome; } - }); + }, + ); test("models.list returns model catalog", async () => { piSdkMock.enabled = true; @@ -3328,6 +3332,90 @@ describe("gateway server", () => { await server.close(); }); + test("bridge voice transcript triggers chat events for webchat clients", async () => { + const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdis-gw-")); + testSessionStorePath = path.join(dir, "sessions.json"); + await fs.writeFile( + testSessionStorePath, + JSON.stringify( + { + main: { + sessionId: "sess-main", + updatedAt: Date.now(), + }, + }, + null, + 2, + ), + "utf-8", + ); + + const { server, ws } = await startServerWithClient(); + await connectOk(ws, { + client: { + name: "webchat", + version: "1.0.0", + platform: "test", + mode: "webchat", + }, + }); + + const bridgeCall = bridgeStartCalls.at(-1); + expect(bridgeCall?.onEvent).toBeDefined(); + + const isVoiceFinalChatEvent = (o: unknown) => { + if (!o || typeof o !== "object") return false; + const rec = o as Record; + if (rec.type !== "event" || rec.event !== "chat") return false; + if (!rec.payload || typeof rec.payload !== "object") return false; + const payload = rec.payload as Record; + const runId = typeof payload.runId === "string" ? payload.runId : ""; + const state = typeof payload.state === "string" ? payload.state : ""; + return runId.startsWith("voice-") && state === "final"; + }; + + const finalChatP = onceMessage<{ + type: "event"; + event: string; + payload?: unknown; + }>(ws, isVoiceFinalChatEvent, 8000); + + await bridgeCall?.onEvent?.("ios-node", { + event: "voice.transcript", + payloadJSON: JSON.stringify({ text: "hello", sessionKey: "main" }), + }); + + emitAgentEvent({ + runId: "sess-main", + seq: 1, + ts: Date.now(), + stream: "assistant", + data: { text: "hi from agent" }, + }); + emitAgentEvent({ + runId: "sess-main", + seq: 2, + ts: Date.now(), + stream: "job", + data: { state: "done" }, + }); + + const evt = await finalChatP; + const payload = + evt.payload && typeof evt.payload === "object" + ? (evt.payload as Record) + : {}; + expect(payload.sessionKey).toBe("main"); + const message = + payload.message && typeof payload.message === "object" + ? (payload.message as Record) + : {}; + expect(message.role).toBe("assistant"); + + ws.close(); + await server.close(); + }); + test("bridge chat.abort cancels while saving the session store", async () => { const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdis-gw-")); testSessionStorePath = path.join(dir, "sessions.json"); diff --git a/src/gateway/server.ts b/src/gateway/server.ts index db6e506f8..c6f259b96 100644 --- a/src/gateway/server.ts +++ b/src/gateway/server.ts @@ -3064,6 +3064,13 @@ export async function startGatewayServer( await saveSessionStore(storePath, store); } + // Ensure chat UI clients refresh when this run completes (even though it wasn't started via chat.send). + // This maps agent bus events (keyed by sessionId) to chat events (keyed by clientRunId). + chatRunSessions.set(sessionId, { + sessionKey, + clientRunId: `voice-${randomUUID()}`, + }); + void agentCommand( { message: text,