fix(talk): hard-timeout ElevenLabs synthesis

This commit is contained in:
Peter Steinberger
2025-12-30 05:46:47 +01:00
parent dcee8beb99
commit be2bc61d38
4 changed files with 49 additions and 6 deletions

View File

@@ -11,6 +11,7 @@
- macOS menu: add a Talk Mode action alongside the Open Dashboard/Chat/Canvas entries.
- macOS Debug: hide “Restart Gateway” when the app won’t start a local gateway (remote mode / attach-only).
- macOS Talk Mode: orb overlay refresh, ElevenLabs request logging, API key status in settings, and auto-select first voice when none is configured.
- macOS Talk Mode: add hard timeout around ElevenLabs TTS synthesis to avoid getting stuck “speaking” forever on hung requests.
- macOS Talk Mode: avoid stuck playback when the audio player never starts (fail-fast + watchdog).
- macOS Talk Mode: increase overlay window size so wave rings don’t clip; close button is hover-only and closer to the orb.
- Talk Mode: wait for chat history to surface the assistant reply before starting TTS (macOS/iOS/Android).

View File

@@ -245,6 +245,7 @@ actor TalkModeRuntime {
let prompt = self.buildPrompt(transcript: transcript)
let runId = UUID().uuidString
let startedAt = Date().timeIntervalSince1970
self.logger.info("talk send start runId=\(runId, privacy: .public) chars=\(prompt.count, privacy: .public)")
do {
let response = try await GatewayConnection.shared.chatSend(
@@ -253,9 +254,11 @@ actor TalkModeRuntime {
thinking: "low",
idempotencyKey: runId,
attachments: [])
self.logger.info("talk chat.send ok runId=\(response.runId, privacy: .public)")
let completion = await self.waitForChatCompletion(
runId: response.runId,
timeoutSeconds: 120)
self.logger.info("talk chat completion runId=\(response.runId, privacy: .public) state=\(String(describing: completion), privacy: .public)")
guard completion == .final else {
await self.startListening()
await self.startRecognition()
@@ -267,11 +270,13 @@ actor TalkModeRuntime {
since: startedAt,
timeoutSeconds: 12)
else {
self.logger.warning("talk assistant text missing after completion")
await self.startListening()
await self.startRecognition()
return
}
self.logger.info("talk assistant text len=\(assistantText.count, privacy: .public)")
await self.playAssistant(text: assistantText)
await self.startListening()
await self.startRecognition()
@@ -301,11 +306,20 @@ actor TalkModeRuntime {
return lines.joined(separator: "\n")
}
private enum ChatCompletionState {
/// Outcome of waiting on a chat run, as returned by
/// `waitForChatCompletion(runId:timeoutSeconds:)`.
///
/// Conforms to `CustomStringConvertible` so that `String(describing:)`
/// yields a fixed lowercase label for each case (used when logging the state).
private enum ChatCompletionState: CustomStringConvertible {
// NOTE(review): callers appear to treat only `.final` as success — confirm.
case final
case aborted
case error
// Presumably produced when the wait exceeds `timeoutSeconds` — verify in waitForChatCompletion.
case timeout
/// Lowercase label matching the case name.
var description: String {
// Exhaustive switch without `default`, so adding a case forces an update here.
switch self {
case .final: return "final"
case .aborted: return "aborted"
case .error: return "error"
case .timeout: return "timeout"
}
}
}
private func waitForChatCompletion(runId: String, timeoutSeconds: Int) async -> ChatCompletionState {
@@ -421,6 +435,7 @@ actor TalkModeRuntime {
self.logger.error("talk missing voiceId; set talk.voiceId or ELEVENLABS_VOICE_ID")
return
}
self.ttsLogger.info("talk TTS request voiceId=\(voiceId, privacy: .public) chars=\(cleaned.count, privacy: .public)")
await self.startRecognition()
await MainActor.run { TalkModeController.shared.updatePhase(.speaking) }
@@ -445,11 +460,28 @@ actor TalkModeRuntime {
normalize: Self.validatedNormalize(directive?.normalize, logger: self.logger),
language: Self.validatedLanguage(directive?.language, logger: self.logger))
let synthTimeoutSeconds = max(20.0, min(90.0, Double(cleaned.count) * 0.12))
self.ttsLogger.info("talk TTS synth timeout=\(synthTimeoutSeconds, privacy: .public)s")
do {
let audio = try await ElevenLabsClient(apiKey: apiKey, logger: self.ttsLogger).synthesize(
voiceId: voiceId,
request: request)
let client = ElevenLabsClient(apiKey: apiKey, logger: self.ttsLogger)
let audio = try await withThrowingTaskGroup(of: Data.self) { group in
group.addTask {
try await client.synthesize(voiceId: voiceId, request: request)
}
group.addTask {
try await Task.sleep(nanoseconds: UInt64(synthTimeoutSeconds * 1_000_000_000))
throw NSError(domain: "TalkTTS", code: 408, userInfo: [
NSLocalizedDescriptionKey: "ElevenLabs TTS timed out after \(synthTimeoutSeconds)s",
])
}
let data = try await group.next()!
group.cancelAll()
return data
}
self.ttsLogger.info("talk TTS response bytes=\(audio.count, privacy: .public)")
let result = await TalkAudioPlayer.shared.play(data: audio)
self.ttsLogger.info("talk audio result finished=\(result.finished, privacy: .public) interruptedAt=\(String(describing: result.interruptedAt), privacy: .public)")
if !result.finished, let interruptedAt = result.interruptedAt, self.phase == .speaking {
if self.interruptOnSpeech {
self.lastInterruptedAtSeconds = interruptedAt
@@ -515,6 +547,10 @@ actor TalkModeRuntime {
self.defaultOutputFormat = cfg.outputFormat
self.interruptOnSpeech = cfg.interruptOnSpeech
self.apiKey = cfg.apiKey
let hasApiKey = (cfg.apiKey?.isEmpty == false)
let voiceLabel = (cfg.voiceId?.isEmpty == false) ? cfg.voiceId! : "none"
let modelLabel = (cfg.modelId?.isEmpty == false) ? cfg.modelId! : "none"
self.logger.info("talk config voiceId=\(voiceLabel, privacy: .public) modelId=\(modelLabel, privacy: .public) apiKey=\(hasApiKey, privacy: .public) interrupt=\(cfg.interruptOnSpeech, privacy: .public)")
}
private struct TalkRuntimeConfig {
@@ -702,6 +738,8 @@ private struct ElevenLabsClient {
let apiKey: String
let logger: Logger
let baseUrl: URL = URL(string: "https://api.elevenlabs.io")!
let ttsTimeoutSeconds: TimeInterval = 45
let listVoicesTimeoutSeconds: TimeInterval = 15
func synthesize(voiceId: String, request: ElevenLabsRequest) async throws -> Data {
var url = self.baseUrl
@@ -746,6 +784,7 @@ private struct ElevenLabsClient {
var req = URLRequest(url: url)
req.httpMethod = "POST"
req.httpBody = body
req.timeoutInterval = self.ttsTimeoutSeconds
req.setValue("application/json", forHTTPHeaderField: "Content-Type")
req.setValue("audio/mpeg", forHTTPHeaderField: "Accept")
req.setValue(self.apiKey, forHTTPHeaderField: "xi-api-key")
@@ -772,6 +811,7 @@ private struct ElevenLabsClient {
self.logger.info("elevenlabs voices list request")
var req = URLRequest(url: url)
req.httpMethod = "GET"
req.timeoutInterval = self.listVoicesTimeoutSeconds
req.setValue(self.apiKey, forHTTPHeaderField: "xi-api-key")
let (data, response) = try await URLSession.shared.data(for: req)

View File

@@ -7,7 +7,7 @@ import SwiftUI
@Observable
final class TalkOverlayController {
static let shared = TalkOverlayController()
static let overlaySize: CGFloat = 260
static let overlaySize: CGFloat = 320
private let logger = Logger(subsystem: "com.steipete.clawdis", category: "talk.overlay")

View File

@@ -6,12 +6,14 @@ struct TalkOverlayView: View {
@State private var hoveringWindow = false
var body: some View {
ZStack {
ZStack(alignment: .topTrailing) {
TalkOrbView(
phase: self.controller.model.phase,
level: self.controller.model.level,
accent: self.seamColor)
.frame(width: 96, height: 96)
.padding(.top, 6)
.padding(.trailing, 6)
.contentShape(Circle())
.onTapGesture {
TalkModeController.shared.stopSpeaking(reason: .userTap)