diff --git a/CHANGELOG.md b/CHANGELOG.md index 7b8b4f870..079a12308 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ - macOS menu: add a Talk Mode action alongside the Open Dashboard/Chat/Canvas entries. - macOS Debug: hide “Restart Gateway” when the app won’t start a local gateway (remote mode / attach-only). - macOS Talk Mode: orb overlay refresh, ElevenLabs request logging, API key status in settings, and auto-select first voice when none is configured. +- macOS Talk Mode: add hard timeout around ElevenLabs TTS synthesis to avoid getting stuck “speaking” forever on hung requests. - macOS Talk Mode: avoid stuck playback when the audio player never starts (fail-fast + watchdog). - macOS Talk Mode: increase overlay window size so wave rings don’t clip; close button is hover-only and closer to the orb. - Talk Mode: wait for chat history to surface the assistant reply before starting TTS (macOS/iOS/Android). diff --git a/apps/macos/Sources/Clawdis/TalkModeRuntime.swift b/apps/macos/Sources/Clawdis/TalkModeRuntime.swift index 73c26a9e5..7f5280805 100644 --- a/apps/macos/Sources/Clawdis/TalkModeRuntime.swift +++ b/apps/macos/Sources/Clawdis/TalkModeRuntime.swift @@ -245,6 +245,7 @@ actor TalkModeRuntime { let prompt = self.buildPrompt(transcript: transcript) let runId = UUID().uuidString let startedAt = Date().timeIntervalSince1970 + self.logger.info("talk send start runId=\(runId, privacy: .public) chars=\(prompt.count, privacy: .public)") do { let response = try await GatewayConnection.shared.chatSend( @@ -253,9 +254,11 @@ actor TalkModeRuntime { thinking: "low", idempotencyKey: runId, attachments: []) + self.logger.info("talk chat.send ok runId=\(response.runId, privacy: .public)") let completion = await self.waitForChatCompletion( runId: response.runId, timeoutSeconds: 120) + self.logger.info("talk chat completion runId=\(response.runId, privacy: .public) state=\(String(describing: completion), privacy: .public)") guard completion == .final else { await self.startListening() await self.startRecognition() @@ -267,11 +270,13 @@ actor TalkModeRuntime { since: startedAt, timeoutSeconds: 12) else { + self.logger.warning("talk assistant text missing after completion") await self.startListening() await self.startRecognition() return } + self.logger.info("talk assistant text len=\(assistantText.count, privacy: .public)") await self.playAssistant(text: assistantText) await self.startListening() await self.startRecognition() @@ -301,11 +306,20 @@ actor TalkModeRuntime { return lines.joined(separator: "\n") } - private enum ChatCompletionState { + private enum ChatCompletionState: CustomStringConvertible { case final case aborted case error case timeout + + var description: String { + switch self { + case .final: return "final" + case .aborted: return "aborted" + case .error: return "error" + case .timeout: return "timeout" + } + } } private func waitForChatCompletion(runId: String, timeoutSeconds: Int) async -> ChatCompletionState { @@ -421,6 +435,7 @@ actor TalkModeRuntime { self.logger.error("talk missing voiceId; set talk.voiceId or ELEVENLABS_VOICE_ID") return } + self.ttsLogger.info("talk TTS request voiceId=\(voiceId, privacy: .public) chars=\(cleaned.count, privacy: .public)") await self.startRecognition() await MainActor.run { TalkModeController.shared.updatePhase(.speaking) } @@ -445,11 +460,28 @@ actor TalkModeRuntime { normalize: Self.validatedNormalize(directive?.normalize, logger: self.logger), language: Self.validatedLanguage(directive?.language, logger: self.logger)) + let synthTimeoutSeconds = max(20.0, min(90.0, Double(cleaned.count) * 0.12)) + self.ttsLogger.info("talk TTS synth timeout=\(synthTimeoutSeconds, privacy: .public)s") + do { - let audio = try await ElevenLabsClient(apiKey: apiKey, logger: self.ttsLogger).synthesize( - voiceId: voiceId, - request: request) + let client = ElevenLabsClient(apiKey: apiKey, logger: self.ttsLogger) + let audio = try await withThrowingTaskGroup(of: Data.self) { group in + group.addTask { + try await client.synthesize(voiceId: voiceId, request: request) + } + group.addTask { + try await Task.sleep(nanoseconds: UInt64(synthTimeoutSeconds * 1_000_000_000)) + throw NSError(domain: "TalkTTS", code: 408, userInfo: [ + NSLocalizedDescriptionKey: "ElevenLabs TTS timed out after \(synthTimeoutSeconds)s", + ]) + } + let data = try await group.next()! + group.cancelAll() + return data + } + self.ttsLogger.info("talk TTS response bytes=\(audio.count, privacy: .public)") let result = await TalkAudioPlayer.shared.play(data: audio) + self.ttsLogger.info("talk audio result finished=\(result.finished, privacy: .public) interruptedAt=\(String(describing: result.interruptedAt), privacy: .public)") if !result.finished, let interruptedAt = result.interruptedAt, self.phase == .speaking { if self.interruptOnSpeech { self.lastInterruptedAtSeconds = interruptedAt @@ -515,6 +547,10 @@ actor TalkModeRuntime { self.defaultOutputFormat = cfg.outputFormat self.interruptOnSpeech = cfg.interruptOnSpeech self.apiKey = cfg.apiKey + let hasApiKey = (cfg.apiKey?.isEmpty == false) + let voiceLabel = (cfg.voiceId?.isEmpty == false) ? cfg.voiceId! : "none" + let modelLabel = (cfg.modelId?.isEmpty == false) ? cfg.modelId! : "none" + self.logger.info("talk config voiceId=\(voiceLabel, privacy: .public) modelId=\(modelLabel, privacy: .public) apiKey=\(hasApiKey, privacy: .public) interrupt=\(cfg.interruptOnSpeech, privacy: .public)") } private struct TalkRuntimeConfig { @@ -702,6 +738,8 @@ private struct ElevenLabsClient { let apiKey: String let logger: Logger let baseUrl: URL = URL(string: "https://api.elevenlabs.io")! + let ttsTimeoutSeconds: TimeInterval = 45 + let listVoicesTimeoutSeconds: TimeInterval = 15 func synthesize(voiceId: String, request: ElevenLabsRequest) async throws -> Data { var url = self.baseUrl @@ -746,6 +784,7 @@ private struct ElevenLabsClient { var req = URLRequest(url: url) req.httpMethod = "POST" req.httpBody = body + req.timeoutInterval = self.ttsTimeoutSeconds req.setValue("application/json", forHTTPHeaderField: "Content-Type") req.setValue("audio/mpeg", forHTTPHeaderField: "Accept") req.setValue(self.apiKey, forHTTPHeaderField: "xi-api-key") @@ -772,6 +811,7 @@ private struct ElevenLabsClient { self.logger.info("elevenlabs voices list request") var req = URLRequest(url: url) req.httpMethod = "GET" + req.timeoutInterval = self.listVoicesTimeoutSeconds req.setValue(self.apiKey, forHTTPHeaderField: "xi-api-key") let (data, response) = try await URLSession.shared.data(for: req) diff --git a/apps/macos/Sources/Clawdis/TalkOverlay.swift b/apps/macos/Sources/Clawdis/TalkOverlay.swift index 9ed8f2cb0..3031cb97e 100644 --- a/apps/macos/Sources/Clawdis/TalkOverlay.swift +++ b/apps/macos/Sources/Clawdis/TalkOverlay.swift @@ -7,7 +7,7 @@ import SwiftUI @Observable final class TalkOverlayController { static let shared = TalkOverlayController() - static let overlaySize: CGFloat = 260 + static let overlaySize: CGFloat = 320 private let logger = Logger(subsystem: "com.steipete.clawdis", category: "talk.overlay") diff --git a/apps/macos/Sources/Clawdis/TalkOverlayView.swift b/apps/macos/Sources/Clawdis/TalkOverlayView.swift index 1dbc85277..5611f3d1f 100644 --- a/apps/macos/Sources/Clawdis/TalkOverlayView.swift +++ b/apps/macos/Sources/Clawdis/TalkOverlayView.swift @@ -6,12 +6,14 @@ struct TalkOverlayView: View { @State private var hoveringWindow = false var body: some View { - ZStack { + ZStack(alignment: .topTrailing) { TalkOrbView( phase: self.controller.model.phase, level: self.controller.model.level, accent: self.seamColor) .frame(width: 96, height: 96) + .padding(.top, 6) + .padding(.trailing, 6) .contentShape(Circle()) .onTapGesture { TalkModeController.shared.stopSpeaking(reason: .userTap)