fix(talk): hard-timeout ElevenLabs synthesis

2025-12-30 05:46:47 +01:00
parent dcee8beb99
commit be2bc61d38
4 changed files with 49 additions and 6 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,6 +11,7 @@
 - macOS menu: add a Talk Mode action alongside the Open Dashboard/Chat/Canvas entries.
 - macOS Debug: hide “Restart Gateway” when the app won’t start a local gateway (remote mode / attach-only).
 - macOS Talk Mode: orb overlay refresh, ElevenLabs request logging, API key status in settings, and auto-select first voice when none is configured.
+- macOS Talk Mode: add a hard timeout around ElevenLabs TTS synthesis to avoid getting stuck “speaking” forever on hung requests.
 - macOS Talk Mode: avoid stuck playback when the audio player never starts (fail-fast + watchdog).
 - macOS Talk Mode: increase overlay window size so wave rings don’t clip; close button is hover-only and closer to the orb.
 - Talk Mode: wait for chat history to surface the assistant reply before starting TTS (macOS/iOS/Android).
--- a/apps/macos/Sources/Clawdis/TalkModeRuntime.swift
+++ b/apps/macos/Sources/Clawdis/TalkModeRuntime.swift
@@ -245,6 +245,7 @@ actor TalkModeRuntime {
        let prompt = self.buildPrompt(transcript: transcript)
        let runId = UUID().uuidString
        let startedAt = Date().timeIntervalSince1970
+        self.logger.info("talk send start runId=\(runId, privacy: .public) chars=\(prompt.count, privacy: .public)")

        do {
            let response = try await GatewayConnection.shared.chatSend(
@@ -253,9 +254,11 @@ actor TalkModeRuntime {
                thinking: "low",
                idempotencyKey: runId,
                attachments: [])
+            self.logger.info("talk chat.send ok runId=\(response.runId, privacy: .public)")
            let completion = await self.waitForChatCompletion(
                runId: response.runId,
                timeoutSeconds: 120)
+            self.logger.info("talk chat completion runId=\(response.runId, privacy: .public) state=\(String(describing: completion), privacy: .public)")
            guard completion == .final else {
                await self.startListening()
                await self.startRecognition()
@@ -267,11 +270,13 @@ actor TalkModeRuntime {
                since: startedAt,
                timeoutSeconds: 12)
            else {
+                self.logger.warning("talk assistant text missing after completion")
                await self.startListening()
                await self.startRecognition()
                return
            }

+            self.logger.info("talk assistant text len=\(assistantText.count, privacy: .public)")
            await self.playAssistant(text: assistantText)
            await self.startListening()
            await self.startRecognition()
@@ -301,11 +306,20 @@ actor TalkModeRuntime {
        return lines.joined(separator: "\n")
    }

-    private enum ChatCompletionState {
+    private enum ChatCompletionState: CustomStringConvertible {
        case final
        case aborted
        case error
        case timeout
+
+        var description: String {
+            switch self {
+            case .final: return "final"
+            case .aborted: return "aborted"
+            case .error: return "error"
+            case .timeout: return "timeout"
+            }
+        }
    }

    private func waitForChatCompletion(runId: String, timeoutSeconds: Int) async -> ChatCompletionState {
@@ -421,6 +435,7 @@ actor TalkModeRuntime {
            self.logger.error("talk missing voiceId; set talk.voiceId or ELEVENLABS_VOICE_ID")
            return
        }
+        self.ttsLogger.info("talk TTS request voiceId=\(voiceId, privacy: .public) chars=\(cleaned.count, privacy: .public)")

        await self.startRecognition()
        await MainActor.run { TalkModeController.shared.updatePhase(.speaking) }
@@ -445,11 +460,28 @@ actor TalkModeRuntime {
            normalize: Self.validatedNormalize(directive?.normalize, logger: self.logger),
            language: Self.validatedLanguage(directive?.language, logger: self.logger))

+        let synthTimeoutSeconds = max(20.0, min(90.0, Double(cleaned.count) * 0.12))
+        self.ttsLogger.info("talk TTS synth timeout=\(synthTimeoutSeconds, privacy: .public)s")
+
        do {
-            let audio = try await ElevenLabsClient(apiKey: apiKey, logger: self.ttsLogger).synthesize(
-                voiceId: voiceId,
-                request: request)
+            let client = ElevenLabsClient(apiKey: apiKey, logger: self.ttsLogger)
+            let audio = try await withThrowingTaskGroup(of: Data.self) { group in
+                group.addTask {
+                    try await client.synthesize(voiceId: voiceId, request: request)
+                }
+                group.addTask {
+                    try await Task.sleep(nanoseconds: UInt64(synthTimeoutSeconds * 1_000_000_000))
+                    throw NSError(domain: "TalkTTS", code: 408, userInfo: [
+                        NSLocalizedDescriptionKey: "ElevenLabs TTS timed out after \(synthTimeoutSeconds)s",
+                    ])
+                }
+                let data = try await group.next()!
+                group.cancelAll()
+                return data
+            }
+            self.ttsLogger.info("talk TTS response bytes=\(audio.count, privacy: .public)")
            let result = await TalkAudioPlayer.shared.play(data: audio)
+            self.ttsLogger.info("talk audio result finished=\(result.finished, privacy: .public) interruptedAt=\(String(describing: result.interruptedAt), privacy: .public)")
            if !result.finished, let interruptedAt = result.interruptedAt, self.phase == .speaking {
                if self.interruptOnSpeech {
                    self.lastInterruptedAtSeconds = interruptedAt
@@ -515,6 +547,10 @@ actor TalkModeRuntime {
        self.defaultOutputFormat = cfg.outputFormat
        self.interruptOnSpeech = cfg.interruptOnSpeech
        self.apiKey = cfg.apiKey
+        let hasApiKey = (cfg.apiKey?.isEmpty == false)
+        let voiceLabel = (cfg.voiceId?.isEmpty == false) ? cfg.voiceId! : "none"
+        let modelLabel = (cfg.modelId?.isEmpty == false) ? cfg.modelId! : "none"
+        self.logger.info("talk config voiceId=\(voiceLabel, privacy: .public) modelId=\(modelLabel, privacy: .public) apiKey=\(hasApiKey, privacy: .public) interrupt=\(cfg.interruptOnSpeech, privacy: .public)")
    }

    private struct TalkRuntimeConfig {
@@ -702,6 +738,8 @@ private struct ElevenLabsClient {
    let apiKey: String
    let logger: Logger
    let baseUrl: URL = URL(string: "https://api.elevenlabs.io")!
+    let ttsTimeoutSeconds: TimeInterval = 45
+    let listVoicesTimeoutSeconds: TimeInterval = 15

    func synthesize(voiceId: String, request: ElevenLabsRequest) async throws -> Data {
        var url = self.baseUrl
@@ -746,6 +784,7 @@ private struct ElevenLabsClient {
        var req = URLRequest(url: url)
        req.httpMethod = "POST"
        req.httpBody = body
+        req.timeoutInterval = self.ttsTimeoutSeconds
        req.setValue("application/json", forHTTPHeaderField: "Content-Type")
        req.setValue("audio/mpeg", forHTTPHeaderField: "Accept")
        req.setValue(self.apiKey, forHTTPHeaderField: "xi-api-key")
@@ -772,6 +811,7 @@ private struct ElevenLabsClient {
        self.logger.info("elevenlabs voices list request")
        var req = URLRequest(url: url)
        req.httpMethod = "GET"
+        req.timeoutInterval = self.listVoicesTimeoutSeconds
        req.setValue(self.apiKey, forHTTPHeaderField: "xi-api-key")

        let (data, response) = try await URLSession.shared.data(for: req)
--- a/apps/macos/Sources/Clawdis/TalkOverlay.swift
+++ b/apps/macos/Sources/Clawdis/TalkOverlay.swift
@@ -7,7 +7,7 @@ import SwiftUI
@Observable
 final class TalkOverlayController {
    static let shared = TalkOverlayController()
-    static let overlaySize: CGFloat = 260
+    static let overlaySize: CGFloat = 320

    private let logger = Logger(subsystem: "com.steipete.clawdis", category: "talk.overlay")

--- a/apps/macos/Sources/Clawdis/TalkOverlayView.swift
+++ b/apps/macos/Sources/Clawdis/TalkOverlayView.swift
@@ -6,12 +6,14 @@ struct TalkOverlayView: View {
    @State private var hoveringWindow = false

    var body: some View {
-        ZStack {
+        ZStack(alignment: .topTrailing) {
            TalkOrbView(
                phase: self.controller.model.phase,
                level: self.controller.model.level,
                accent: self.seamColor)
                .frame(width: 96, height: 96)
+                .padding(.top, 6)
+                .padding(.trailing, 6)
                .contentShape(Circle())
                .onTapGesture {
                    TalkModeController.shared.stopSpeaking(reason: .userTap)