fix(talk): harden TTS + add system fallback

Peter Steinberger
2025-12-30 07:40:02 +01:00
parent a7617e4d79
commit f86772f26c
22 changed files with 839 additions and 468 deletions

View File

@@ -16,6 +16,8 @@
- macOS Talk Mode: fix audio stop ordering so disabling Talk Mode always stops in-flight playback.
- macOS Talk Mode: throttle audio-level updates (avoid per-buffer task creation) to reduce CPU/task churn.
- macOS Talk Mode: increase overlay window size so wave rings don't clip; close button is hover-only and closer to the orb.
- Talk Mode: fall back to system TTS when ElevenLabs is unavailable, returns non-audio, or playback fails (macOS/iOS/Android); see the sketch after this list.
- ElevenLabs: add retry/backoff for 429/5xx and include content-type in errors for debugging.
- Talk Mode: align to the gateway's main session key and fall back to history polling when chat events drop (prevents stuck “thinking” / missing messages).
- Talk Mode: treat history timestamps as seconds or milliseconds to avoid stale assistant picks (macOS/iOS/Android).
- Chat UI: dedupe identical history messages to avoid duplicate bubbles.
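
A minimal sketch of the fallback order described in the bullets above, with the ElevenLabs client, audio player, and system-voice paths abstracted as closures (these names are illustrative, not the names used in the app):

import Foundation

func speakWithFallback(
    text: String,
    apiKey: String?,
    voiceId: String?,
    synthesize: (_ text: String, _ voiceId: String, _ apiKey: String) async throws -> Data,
    playAudio: (_ audio: Data) async throws -> Void,
    speakWithSystemVoice: (_ text: String) async throws -> Void
) async throws {
    // ElevenLabs is only attempted when both a voice ID and an API key are present.
    if let apiKey, !apiKey.isEmpty, let voiceId, !voiceId.isEmpty {
        do {
            let audio = try await synthesize(text, voiceId, apiKey)
            try await playAudio(audio)
            return
        } catch {
            // Unavailable, non-audio response, or playback failure: fall through.
        }
    }
    try await speakWithSystemVoice(text)
}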

View File

@@ -930,7 +930,7 @@ class NodeRuntime(context: Context) {
private data class Quad<A, B, C, D>(val first: A, val second: B, val third: C, val fourth: D)
private const val DEFAULT_SEAM_COLOR_ARGB: Long = 0xFF7FB8D4
private const val DEFAULT_SEAM_COLOR_ARGB: Long = 0xFF4F7A9A
private const val a2uiReadyCheckJS: String =
"""

View File

@@ -62,9 +62,9 @@ fun TalkOrbOverlay(
verticalArrangement = Arrangement.spacedBy(12.dp),
) {
Box(contentAlignment = Alignment.Center) {
Canvas(modifier = Modifier.size(300.dp)) {
Canvas(modifier = Modifier.size(360.dp)) {
val center = this.center
val baseRadius = size.minDimension * 0.27f
val baseRadius = size.minDimension * 0.30f
val ring1 = 1.05f + (t * 0.25f)
val ring2 = 1.20f + (t * 0.55f)

View File

@@ -13,6 +13,8 @@ import android.os.SystemClock
import android.speech.RecognitionListener
import android.speech.RecognizerIntent
import android.speech.SpeechRecognizer
import android.speech.tts.TextToSpeech
import android.speech.tts.UtteranceProgressListener
import android.util.Log
import androidx.core.content.ContextCompat
import com.steipete.clawdis.node.bridge.BridgeSession
@@ -89,6 +91,9 @@ class TalkModeManager(
private var player: MediaPlayer? = null
private var currentAudioFile: File? = null
private var systemTts: TextToSpeech? = null
private var systemTtsPending: CompletableDeferred<Unit>? = null
private var systemTtsPendingId: String? = null
fun attachSession(session: BridgeSession) {
this.session = session
@@ -181,6 +186,10 @@ class TalkModeManager(
recognizer?.destroy()
recognizer = null
}
systemTts?.stop()
systemTtsPending?.cancel()
systemTtsPending = null
systemTtsPendingId = null
}
private fun startListeningInternal(markListening: Boolean) {
@@ -441,16 +450,6 @@ class TalkModeManager(
apiKey?.trim()?.takeIf { it.isNotEmpty() }
?: System.getenv("ELEVENLABS_API_KEY")?.trim()
val voiceId = directive?.voiceId ?: currentVoiceId ?: defaultVoiceId
if (voiceId.isNullOrBlank()) {
_statusText.value = "Missing voice ID"
Log.w(tag, "missing voiceId")
return
}
if (apiKey.isNullOrEmpty()) {
_statusText.value = "Missing ELEVENLABS_API_KEY"
Log.w(tag, "missing ELEVENLABS_API_KEY")
return
}
_statusText.value = "Speaking…"
_isSpeaking.value = true
@@ -458,28 +457,46 @@ class TalkModeManager(
ensureInterruptListener()
try {
val ttsStarted = SystemClock.elapsedRealtime()
val request =
ElevenLabsRequest(
text = cleaned,
modelId = directive?.modelId ?: currentModelId ?: defaultModelId,
outputFormat =
TalkModeRuntime.validatedOutputFormat(directive?.outputFormat ?: defaultOutputFormat),
speed = TalkModeRuntime.resolveSpeed(directive?.speed, directive?.rateWpm),
stability = TalkModeRuntime.validatedUnit(directive?.stability),
similarity = TalkModeRuntime.validatedUnit(directive?.similarity),
style = TalkModeRuntime.validatedUnit(directive?.style),
speakerBoost = directive?.speakerBoost,
seed = TalkModeRuntime.validatedSeed(directive?.seed),
normalize = TalkModeRuntime.validatedNormalize(directive?.normalize),
language = TalkModeRuntime.validatedLanguage(directive?.language),
)
val audio = synthesize(voiceId = voiceId, apiKey = apiKey, request = request)
Log.d(tag, "elevenlabs ok bytes=${audio.size} durMs=${SystemClock.elapsedRealtime() - ttsStarted}")
playAudio(audio)
val canUseElevenLabs = !voiceId.isNullOrBlank() && !apiKey.isNullOrEmpty()
if (!canUseElevenLabs) {
if (voiceId.isNullOrBlank()) {
Log.w(tag, "missing voiceId; falling back to system voice")
}
if (apiKey.isNullOrEmpty()) {
Log.w(tag, "missing ELEVENLABS_API_KEY; falling back to system voice")
}
_statusText.value = "Speaking (System)…"
speakWithSystemTts(cleaned)
} else {
val ttsStarted = SystemClock.elapsedRealtime()
val request =
ElevenLabsRequest(
text = cleaned,
modelId = directive?.modelId ?: currentModelId ?: defaultModelId,
outputFormat =
TalkModeRuntime.validatedOutputFormat(directive?.outputFormat ?: defaultOutputFormat),
speed = TalkModeRuntime.resolveSpeed(directive?.speed, directive?.rateWpm),
stability = TalkModeRuntime.validatedUnit(directive?.stability),
similarity = TalkModeRuntime.validatedUnit(directive?.similarity),
style = TalkModeRuntime.validatedUnit(directive?.style),
speakerBoost = directive?.speakerBoost,
seed = TalkModeRuntime.validatedSeed(directive?.seed),
normalize = TalkModeRuntime.validatedNormalize(directive?.normalize),
language = TalkModeRuntime.validatedLanguage(directive?.language),
)
val audio = synthesize(voiceId = voiceId!!, apiKey = apiKey!!, request = request)
Log.d(tag, "elevenlabs ok bytes=${audio.size} durMs=${SystemClock.elapsedRealtime() - ttsStarted}")
playAudio(audio)
}
} catch (err: Throwable) {
_statusText.value = "Speak failed: ${err.message ?: err::class.simpleName}"
Log.w(tag, "speak failed: ${err.message ?: err::class.simpleName}")
Log.w(tag, "speak failed: ${err.message ?: err::class.simpleName}; falling back to system voice")
try {
_statusText.value = "Speaking (System)…"
speakWithSystemTts(cleaned)
} catch (fallbackErr: Throwable) {
_statusText.value = "Speak failed: ${fallbackErr.message ?: fallbackErr::class.simpleName}"
Log.w(tag, "system voice failed: ${fallbackErr.message ?: fallbackErr::class.simpleName}")
}
}
_isSpeaking.value = false
@@ -524,9 +541,103 @@ class TalkModeManager(
Log.d(tag, "play done")
}
private suspend fun speakWithSystemTts(text: String) {
val trimmed = text.trim()
if (trimmed.isEmpty()) return
val ok = ensureSystemTts()
if (!ok) {
throw IllegalStateException("system TTS unavailable")
}
val tts = systemTts ?: throw IllegalStateException("system TTS unavailable")
val utteranceId = "talk-${UUID.randomUUID()}"
val deferred = CompletableDeferred<Unit>()
systemTtsPending?.cancel()
systemTtsPending = deferred
systemTtsPendingId = utteranceId
withContext(Dispatchers.Main) {
val params = Bundle()
tts.speak(trimmed, TextToSpeech.QUEUE_FLUSH, params, utteranceId)
}
withContext(Dispatchers.IO) {
kotlinx.coroutines.withTimeout(180_000) { deferred.await() }
}
}
private suspend fun ensureSystemTts(): Boolean {
if (systemTts != null) return true
return withContext(Dispatchers.Main) {
val deferred = CompletableDeferred<Boolean>()
val tts =
try {
TextToSpeech(context) { status ->
deferred.complete(status == TextToSpeech.SUCCESS)
}
} catch (_: Throwable) {
deferred.complete(false)
null
}
if (tts == null) return@withContext false
tts.setOnUtteranceProgressListener(
object : UtteranceProgressListener() {
override fun onStart(utteranceId: String?) {}
override fun onDone(utteranceId: String?) {
if (utteranceId == null) return
if (utteranceId != systemTtsPendingId) return
systemTtsPending?.complete(Unit)
systemTtsPending = null
systemTtsPendingId = null
}
@Deprecated("Deprecated in Java")
override fun onError(utteranceId: String?) {
if (utteranceId == null) return
if (utteranceId != systemTtsPendingId) return
systemTtsPending?.completeExceptionally(IllegalStateException("system TTS error"))
systemTtsPending = null
systemTtsPendingId = null
}
override fun onError(utteranceId: String?, errorCode: Int) {
if (utteranceId == null) return
if (utteranceId != systemTtsPendingId) return
systemTtsPending?.completeExceptionally(IllegalStateException("system TTS error $errorCode"))
systemTtsPending = null
systemTtsPendingId = null
}
},
)
val ok =
try {
deferred.await()
} catch (_: Throwable) {
false
}
if (ok) {
systemTts = tts
} else {
tts.shutdown()
}
ok
}
}
private fun stopSpeaking(resetInterrupt: Boolean = true) {
if (!_isSpeaking.value) {
cleanupPlayer()
systemTts?.stop()
systemTtsPending?.cancel()
systemTtsPending = null
systemTtsPendingId = null
return
}
if (resetInterrupt) {
@@ -534,6 +645,10 @@ class TalkModeManager(
lastInterruptedAtSeconds = currentMs / 1000.0
}
cleanupPlayer()
systemTts?.stop()
systemTtsPending?.cancel()
systemTtsPending = null
systemTtsPendingId = null
_isSpeaking.value = false
}

View File

@@ -293,7 +293,7 @@ final class NodeAppModel {
Self.color(fromHex: self.seamColorHex) ?? Self.defaultSeamColor
}
private static let defaultSeamColor = Color(red: 127 / 255.0, green: 184 / 255.0, blue: 212 / 255.0)
private static let defaultSeamColor = Color(red: 79 / 255.0, green: 122 / 255.0, blue: 154 / 255.0)
private static func color(fromHex raw: String?) -> Color? {
let trimmed = (raw ?? "").trimmingCharacters(in: .whitespacesAndNewlines)

View File

@@ -105,6 +105,7 @@ final class TalkModeManager: NSObject {
self.stopRecognition()
self.stopSpeaking()
self.lastInterruptedAtSeconds = nil
TalkSystemSpeechSynthesizer.shared.stop()
do {
try AVAudioSession.sharedInstance().setActive(false, options: [.notifyOthersOnDeactivation])
} catch {
@@ -301,20 +302,9 @@ final class TalkModeManager: NSObject {
}
private func buildPrompt(transcript: String) -> String {
var lines: [String] = [
"Talk Mode active. Reply in a concise, spoken tone.",
"You may optionally prefix the response with JSON (first line) to set ElevenLabs voice, e.g. {\"voice\":\"<id>\",\"once\":true}.",
]
if let interrupted = self.lastInterruptedAtSeconds {
let formatted = String(format: "%.1f", interrupted)
lines.append("Assistant speech interrupted at \(formatted)s.")
self.lastInterruptedAtSeconds = nil
}
lines.append("")
lines.append(transcript)
return lines.joined(separator: "\n")
let interrupted = self.lastInterruptedAtSeconds
self.lastInterruptedAtSeconds = nil
return TalkPromptBuilder.build(transcript: transcript, interruptedAtSeconds: interrupted)
}
private enum ChatCompletionState: CustomStringConvertible {
@@ -409,7 +399,7 @@ final class TalkModeManager: NSObject {
for msg in messages.reversed() {
guard (msg["role"] as? String) == "assistant" else { continue }
if let since, let timestamp = msg["timestamp"] as? Double,
TalkModeRuntime.isMessageTimestampAfter(timestamp, sinceSeconds: since) == false
TalkHistoryTimestamp.isAfter(timestamp, sinceSeconds: since) == false
{
continue
}
@@ -440,81 +430,91 @@ final class TalkModeManager: NSObject {
}
}
let voiceId = directive?.voiceId ?? self.currentVoiceId ?? self.defaultVoiceId
guard let voiceId, !voiceId.isEmpty else {
self.statusText = "Missing voice ID"
self.logger.error("missing voiceId")
return
}
let resolvedKey =
(self.apiKey?.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty == false ? self.apiKey : nil) ??
ProcessInfo.processInfo.environment["ELEVENLABS_API_KEY"]
guard let apiKey = resolvedKey, !apiKey.isEmpty else {
self.statusText = "Missing ELEVENLABS_API_KEY"
self.logger.error("missing ELEVENLABS_API_KEY")
return
}
self.statusText = "Generating voice…"
self.isSpeaking = true
self.lastSpokenText = cleaned
do {
let started = Date()
let desiredOutputFormat = directive?.outputFormat ?? self.defaultOutputFormat
let outputFormat = TalkModeRuntime.validatedOutputFormat(desiredOutputFormat)
if outputFormat == nil, let desiredOutputFormat, !desiredOutputFormat.isEmpty {
self.logger.warning("talk output_format unsupported for local playback: \(desiredOutputFormat, privacy: .public)")
}
let request = ElevenLabsRequest(
text: cleaned,
modelId: directive?.modelId ?? self.currentModelId ?? self.defaultModelId,
outputFormat: outputFormat,
speed: TalkModeRuntime.resolveSpeed(
speed: directive?.speed,
rateWPM: directive?.rateWPM),
stability: TalkModeRuntime.validatedUnit(directive?.stability),
similarity: TalkModeRuntime.validatedUnit(directive?.similarity),
style: TalkModeRuntime.validatedUnit(directive?.style),
speakerBoost: directive?.speakerBoost,
seed: TalkModeRuntime.validatedSeed(directive?.seed),
normalize: TalkModeRuntime.validatedNormalize(directive?.normalize),
language: TalkModeRuntime.validatedLanguage(directive?.language))
let language = ElevenLabsTTSClient.validatedLanguage(directive?.language)
let synthTimeoutSeconds = max(20.0, min(90.0, Double(cleaned.count) * 0.12))
let client = ElevenLabsClient(apiKey: apiKey)
let audio = try await withThrowingTaskGroup(of: Data.self) { group in
group.addTask {
try await client.synthesize(voiceId: voiceId, request: request)
}
group.addTask {
try await Task.sleep(nanoseconds: UInt64(synthTimeoutSeconds * 1_000_000_000))
throw NSError(domain: "TalkTTS", code: 408, userInfo: [
NSLocalizedDescriptionKey: "ElevenLabs TTS timed out after \(synthTimeoutSeconds)s",
])
}
let data = try await group.next()!
group.cancelAll()
return data
}
self.logger
.info(
"elevenlabs ok bytes=\(audio.count, privacy: .public) dur=\(Date().timeIntervalSince(started), privacy: .public)s")
let voiceId = (directive?.voiceId ?? self.currentVoiceId ?? self.defaultVoiceId)?
.trimmingCharacters(in: .whitespacesAndNewlines)
let resolvedKey =
(self.apiKey?.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty == false ? self.apiKey : nil) ??
ProcessInfo.processInfo.environment["ELEVENLABS_API_KEY"]
let apiKey = resolvedKey?.trimmingCharacters(in: .whitespacesAndNewlines)
let canUseElevenLabs = (voiceId?.isEmpty == false) && (apiKey?.isEmpty == false)
if self.interruptOnSpeech {
do {
try self.startRecognition()
} catch {
self.logger.warning("startRecognition during speak failed: \(error.localizedDescription, privacy: .public)")
if canUseElevenLabs, let voiceId, let apiKey {
let desiredOutputFormat = directive?.outputFormat ?? self.defaultOutputFormat
let outputFormat = ElevenLabsTTSClient.validatedOutputFormat(desiredOutputFormat)
if outputFormat == nil, let desiredOutputFormat, !desiredOutputFormat.isEmpty {
self.logger.warning("talk output_format unsupported for local playback: \(desiredOutputFormat, privacy: .public)")
}
}
self.statusText = "Speaking…"
try await self.playAudio(data: audio)
let request = ElevenLabsTTSRequest(
text: cleaned,
modelId: directive?.modelId ?? self.currentModelId ?? self.defaultModelId,
outputFormat: outputFormat,
speed: TalkTTSValidation.resolveSpeed(speed: directive?.speed, rateWPM: directive?.rateWPM),
stability: TalkTTSValidation.validatedUnit(directive?.stability),
similarity: TalkTTSValidation.validatedUnit(directive?.similarity),
style: TalkTTSValidation.validatedUnit(directive?.style),
speakerBoost: directive?.speakerBoost,
seed: TalkTTSValidation.validatedSeed(directive?.seed),
normalize: ElevenLabsTTSClient.validatedNormalize(directive?.normalize),
language: language)
let synthTimeoutSeconds = max(20.0, min(90.0, Double(cleaned.count) * 0.12))
let client = ElevenLabsTTSClient(apiKey: apiKey)
let audio = try await client.synthesizeWithHardTimeout(
voiceId: voiceId,
request: request,
hardTimeoutSeconds: synthTimeoutSeconds)
self.logger
.info(
"elevenlabs ok bytes=\(audio.count, privacy: .public) dur=\(Date().timeIntervalSince(started), privacy: .public)s")
if self.interruptOnSpeech {
do {
try self.startRecognition()
} catch {
self.logger.warning("startRecognition during speak failed: \(error.localizedDescription, privacy: .public)")
}
}
self.statusText = "Speaking…"
try await self.playAudio(data: audio)
} else {
self.logger.warning("tts unavailable; falling back to system voice (missing key or voiceId)")
if self.interruptOnSpeech {
do {
try self.startRecognition()
} catch {
self.logger.warning("startRecognition during speak failed: \(error.localizedDescription, privacy: .public)")
}
}
self.statusText = "Speaking (System)…"
try await TalkSystemSpeechSynthesizer.shared.speak(text: cleaned, language: language)
}
} catch {
self.statusText = "Speak failed: \(error.localizedDescription)"
self.logger.error("speak failed: \(error.localizedDescription, privacy: .public)")
self.logger.error("tts failed: \(error.localizedDescription, privacy: .public); falling back to system voice")
do {
if self.interruptOnSpeech {
do {
try self.startRecognition()
} catch {
self.logger.warning("startRecognition during speak failed: \(error.localizedDescription, privacy: .public)")
}
}
self.statusText = "Speaking (System)…"
let language = ElevenLabsTTSClient.validatedLanguage(directive?.language)
try await TalkSystemSpeechSynthesizer.shared.speak(text: cleaned, language: language)
} catch {
self.statusText = "Speak failed: \(error.localizedDescription)"
self.logger.error("system voice failed: \(error.localizedDescription, privacy: .public)")
}
}
self.stopRecognition()
@@ -527,7 +527,11 @@ final class TalkModeManager: NSObject {
self.player = player
player.prepareToPlay()
self.logger.info("play start")
player.play()
guard player.play() else {
throw NSError(domain: "TalkMode", code: 2, userInfo: [
NSLocalizedDescriptionKey: "audio player refused to play",
])
}
while player.isPlaying {
try? await Task.sleep(nanoseconds: 120_000_000)
}
@@ -541,6 +545,7 @@ final class TalkModeManager: NSObject {
}
self.player?.stop()
self.player = nil
TalkSystemSpeechSynthesizer.shared.stop()
self.isSpeaking = false
}
@@ -584,7 +589,7 @@ final class TalkModeManager: NSObject {
private static func configureAudioSession() throws {
let session = AVAudioSession.sharedInstance()
try session.setCategory(.playAndRecord, mode: .measurement, options: [
try session.setCategory(.playAndRecord, mode: .voiceChat, options: [
.duckOthers,
.mixWithOthers,
.allowBluetoothHFP,
@@ -609,127 +614,3 @@ final class TalkModeManager: NSObject {
}
}
}
private struct ElevenLabsRequest {
let text: String
let modelId: String?
let outputFormat: String?
let speed: Double?
let stability: Double?
let similarity: Double?
let style: Double?
let speakerBoost: Bool?
let seed: UInt32?
let normalize: String?
let language: String?
}
private struct ElevenLabsClient {
let apiKey: String
let baseUrl = URL(string: "https://api.elevenlabs.io")!
func synthesize(voiceId: String, request: ElevenLabsRequest) async throws -> Data {
var url = self.baseUrl
url.appendPathComponent("v1")
url.appendPathComponent("text-to-speech")
url.appendPathComponent(voiceId)
var payload: [String: Any] = [
"text": request.text,
]
if let modelId = request.modelId, !modelId.isEmpty {
payload["model_id"] = modelId
}
if let outputFormat = request.outputFormat, !outputFormat.isEmpty {
payload["output_format"] = outputFormat
}
if let seed = request.seed {
payload["seed"] = seed
}
if let normalize = request.normalize {
payload["apply_text_normalization"] = normalize
}
if let language = request.language {
payload["language_code"] = language
}
var voiceSettings: [String: Any] = [:]
if let speed = request.speed { voiceSettings["speed"] = speed }
if let stability = request.stability { voiceSettings["stability"] = stability }
if let similarity = request.similarity { voiceSettings["similarity_boost"] = similarity }
if let style = request.style { voiceSettings["style"] = style }
if let speakerBoost = request.speakerBoost { voiceSettings["use_speaker_boost"] = speakerBoost }
if !voiceSettings.isEmpty { payload["voice_settings"] = voiceSettings }
let body = try JSONSerialization.data(withJSONObject: payload, options: [])
var req = URLRequest(url: url)
req.httpMethod = "POST"
req.httpBody = body
req.timeoutInterval = 45
req.setValue("application/json", forHTTPHeaderField: "Content-Type")
req.setValue("audio/mpeg", forHTTPHeaderField: "Accept")
req.setValue(self.apiKey, forHTTPHeaderField: "xi-api-key")
let (data, response) = try await URLSession.shared.data(for: req)
if let http = response as? HTTPURLResponse, http.statusCode >= 400 {
let message = String(data: data, encoding: .utf8) ?? "unknown"
throw NSError(domain: "TalkTTS", code: http.statusCode, userInfo: [
NSLocalizedDescriptionKey: "ElevenLabs failed: \(http.statusCode) \(message)",
])
}
return data
}
}
private enum TalkModeRuntime {
static func resolveSpeed(speed: Double?, rateWPM: Int?) -> Double? {
if let rateWPM, rateWPM > 0 {
let resolved = Double(rateWPM) / 175.0
if resolved <= 0.5 || resolved >= 2.0 { return nil }
return resolved
}
if let speed {
if speed <= 0.5 || speed >= 2.0 { return nil }
return speed
}
return nil
}
static func validatedUnit(_ value: Double?) -> Double? {
guard let value else { return nil }
if value < 0 || value > 1 { return nil }
return value
}
static func validatedSeed(_ value: Int?) -> UInt32? {
guard let value else { return nil }
if value < 0 || value > 4_294_967_295 { return nil }
return UInt32(value)
}
static func validatedNormalize(_ value: String?) -> String? {
guard let value else { return nil }
let normalized = value.trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
return ["auto", "on", "off"].contains(normalized) ? normalized : nil
}
static func validatedLanguage(_ value: String?) -> String? {
guard let value else { return nil }
let normalized = value.trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
guard normalized.count == 2, normalized.allSatisfy({ $0 >= "a" && $0 <= "z" }) else { return nil }
return normalized
}
static func validatedOutputFormat(_ value: String?) -> String? {
let trimmed = value?.trimmingCharacters(in: .whitespacesAndNewlines) ?? ""
guard !trimmed.isEmpty else { return nil }
return trimmed.hasPrefix("mp3_") ? trimmed : nil
}
static func isMessageTimestampAfter(_ timestamp: Double, sinceSeconds: Double) -> Bool {
let sinceMs = sinceSeconds * 1000
if timestamp > 10_000_000_000 {
return timestamp >= sinceMs - 500
}
return timestamp >= sinceSeconds - 0.5
}
}

View File

@@ -12,14 +12,14 @@ struct TalkOrbOverlay: View {
ZStack {
Circle()
.stroke(seam.opacity(0.26), lineWidth: 2)
.frame(width: 280, height: 280)
.frame(width: 320, height: 320)
.scaleEffect(self.pulse ? 1.15 : 0.96)
.opacity(self.pulse ? 0.0 : 1.0)
.animation(.easeOut(duration: 1.3).repeatForever(autoreverses: false), value: self.pulse)
Circle()
.stroke(seam.opacity(0.18), lineWidth: 2)
.frame(width: 280, height: 280)
.frame(width: 320, height: 320)
.scaleEffect(self.pulse ? 1.45 : 1.02)
.opacity(self.pulse ? 0.0 : 0.9)
.animation(.easeOut(duration: 1.9).repeatForever(autoreverses: false).delay(0.2), value: self.pulse)
@@ -35,7 +35,7 @@ struct TalkOrbOverlay: View {
center: .center,
startRadius: 1,
endRadius: 112))
.frame(width: 168, height: 168)
.frame(width: 190, height: 190)
.overlay(
Circle()
.stroke(seam.opacity(0.35), lineWidth: 1))

View File

@@ -291,7 +291,9 @@ actor TalkModeRuntime {
await self.reloadConfig()
guard self.isCurrent(gen) else { return }
let prompt = self.buildPrompt(transcript: transcript)
let sessionKey = await GatewayConnection.shared.mainSessionKey()
let sessionKey =
await MainActor.run { WebChatManager.shared.activeSessionKey } ??
await GatewayConnection.shared.mainSessionKey()
let runId = UUID().uuidString
let startedAt = Date().timeIntervalSince1970
self.logger.info(
@@ -335,20 +337,9 @@ actor TalkModeRuntime {
}
private func buildPrompt(transcript: String) -> String {
var lines: [String] = [
"Talk Mode active. Reply in a concise, spoken tone.",
"You may optionally prefix the response with JSON (first line) to set ElevenLabs voice, e.g. {\"voice\":\"<id>\",\"once\":true}.",
]
if let interrupted = self.lastInterruptedAtSeconds {
let formatted = String(format: "%.1f", interrupted)
lines.append("Assistant speech interrupted at \(formatted)s.")
self.lastInterruptedAtSeconds = nil
}
lines.append("")
lines.append(transcript)
return lines.joined(separator: "\n")
let interrupted = self.lastInterruptedAtSeconds
self.lastInterruptedAtSeconds = nil
return TalkPromptBuilder.build(transcript: transcript, interruptedAtSeconds: interrupted)
}
private func waitForAssistantText(
@@ -378,7 +369,7 @@ actor TalkModeRuntime {
guard message.role == "assistant" else { return false }
guard let since else { return true }
guard let timestamp = message.timestamp else { return false }
return Self.isMessageTimestampAfter(timestamp, sinceSeconds: since)
return TalkHistoryTimestamp.isAfter(timestamp, sinceSeconds: since)
}
guard let assistant else { return nil }
let text = assistant.content.compactMap { $0.text }.joined(separator: "\n")
@@ -421,76 +412,108 @@ actor TalkModeRuntime {
}
}
guard let apiKey = self.apiKey, !apiKey.isEmpty else {
self.logger.error("talk missing ELEVENLABS_API_KEY")
return
}
let apiKey = self.apiKey?.trimmingCharacters(in: .whitespacesAndNewlines)
let requestedVoice =
directive?.voiceId ??
self.currentVoiceId ??
self.defaultVoiceId
guard let voiceId = await self.resolveVoiceId(preferred: requestedVoice, apiKey: apiKey) else {
self.logger.error("talk missing voiceId; set talk.voiceId or ELEVENLABS_VOICE_ID")
return
}
guard self.isCurrent(gen) else { return }
self.ttsLogger.info("talk TTS request voiceId=\(voiceId, privacy: .public) chars=\(cleaned.count, privacy: .public)")
await self.startRecognition()
guard self.isCurrent(gen) else { return }
await MainActor.run { TalkModeController.shared.updatePhase(.speaking) }
self.phase = .speaking
let language = ElevenLabsTTSClient.validatedLanguage(directive?.language)
let voiceId: String?
if let apiKey, !apiKey.isEmpty {
voiceId = await self.resolveVoiceId(preferred: requestedVoice, apiKey: apiKey)
} else {
voiceId = nil
}
if apiKey?.isEmpty != false {
self.ttsLogger.warning("talk missing ELEVENLABS_API_KEY; falling back to system voice")
} else if voiceId == nil {
self.ttsLogger.warning("talk missing voiceId; falling back to system voice")
} else if let voiceId {
self.ttsLogger.info("talk TTS request voiceId=\(voiceId, privacy: .public) chars=\(cleaned.count, privacy: .public)")
}
self.lastSpokenText = cleaned
let resolvedSpeed = Self.resolveSpeed(
speed: directive?.speed,
rateWPM: directive?.rateWPM,
logger: self.logger)
let request = ElevenLabsRequest(
text: cleaned,
modelId: directive?.modelId ?? self.currentModelId ?? self.defaultModelId,
outputFormat: Self.validatedOutputFormat(directive?.outputFormat ?? self.defaultOutputFormat, logger: self.logger),
speed: resolvedSpeed,
stability: Self.validatedUnit(directive?.stability, name: "stability", logger: self.logger),
similarity: Self.validatedUnit(directive?.similarity, name: "similarity", logger: self.logger),
style: Self.validatedUnit(directive?.style, name: "style", logger: self.logger),
speakerBoost: directive?.speakerBoost,
seed: Self.validatedSeed(directive?.seed, logger: self.logger),
normalize: Self.validatedNormalize(directive?.normalize, logger: self.logger),
language: Self.validatedLanguage(directive?.language, logger: self.logger))
let synthTimeoutSeconds = max(20.0, min(90.0, Double(cleaned.count) * 0.12))
self.ttsLogger.info("talk TTS synth timeout=\(synthTimeoutSeconds, privacy: .public)s")
do {
let client = ElevenLabsClient(apiKey: apiKey, logger: self.ttsLogger)
let audio = try await withThrowingTaskGroup(of: Data.self) { group in
group.addTask {
try await client.synthesize(voiceId: voiceId, request: request)
if let apiKey, !apiKey.isEmpty, let voiceId {
let desiredOutputFormat = directive?.outputFormat ?? self.defaultOutputFormat
let outputFormat = ElevenLabsTTSClient.validatedOutputFormat(desiredOutputFormat)
if outputFormat == nil, let desiredOutputFormat, !desiredOutputFormat.isEmpty {
self.logger.warning("talk output_format unsupported for local playback: \(desiredOutputFormat, privacy: .public)")
}
group.addTask {
try await Task.sleep(nanoseconds: UInt64(synthTimeoutSeconds * 1_000_000_000))
throw NSError(domain: "TalkTTS", code: 408, userInfo: [
NSLocalizedDescriptionKey: "ElevenLabs TTS timed out after \(synthTimeoutSeconds)s",
let request = ElevenLabsTTSRequest(
text: cleaned,
modelId: directive?.modelId ?? self.currentModelId ?? self.defaultModelId,
outputFormat: outputFormat,
speed: TalkTTSValidation.resolveSpeed(speed: directive?.speed, rateWPM: directive?.rateWPM),
stability: TalkTTSValidation.validatedUnit(directive?.stability),
similarity: TalkTTSValidation.validatedUnit(directive?.similarity),
style: TalkTTSValidation.validatedUnit(directive?.style),
speakerBoost: directive?.speakerBoost,
seed: TalkTTSValidation.validatedSeed(directive?.seed),
normalize: ElevenLabsTTSClient.validatedNormalize(directive?.normalize),
language: language)
self.ttsLogger.info("talk TTS synth timeout=\(synthTimeoutSeconds, privacy: .public)s")
let client = ElevenLabsTTSClient(apiKey: apiKey)
let audio = try await client.synthesizeWithHardTimeout(
voiceId: voiceId,
request: request,
hardTimeoutSeconds: synthTimeoutSeconds)
guard self.isCurrent(gen) else { return }
self.ttsLogger.info("talk TTS response bytes=\(audio.count, privacy: .public)")
if self.interruptOnSpeech {
await self.startRecognition()
guard self.isCurrent(gen) else { return }
}
await MainActor.run { TalkModeController.shared.updatePhase(.speaking) }
self.phase = .speaking
let result = await TalkAudioPlayer.shared.play(data: audio)
self.ttsLogger.info("talk audio result finished=\(result.finished, privacy: .public) interruptedAt=\(String(describing: result.interruptedAt), privacy: .public)")
if !result.finished, result.interruptedAt == nil {
throw NSError(domain: "TalkAudioPlayer", code: 1, userInfo: [
NSLocalizedDescriptionKey: "audio playback failed",
])
}
let data = try await group.next()!
group.cancelAll()
return data
}
guard self.isCurrent(gen) else { return }
self.ttsLogger.info("talk TTS response bytes=\(audio.count, privacy: .public)")
let result = await TalkAudioPlayer.shared.play(data: audio)
self.ttsLogger.info("talk audio result finished=\(result.finished, privacy: .public) interruptedAt=\(String(describing: result.interruptedAt), privacy: .public)")
if !result.finished, let interruptedAt = result.interruptedAt, self.phase == .speaking {
if self.interruptOnSpeech {
self.lastInterruptedAtSeconds = interruptedAt
if !result.finished, let interruptedAt = result.interruptedAt, self.phase == .speaking {
if self.interruptOnSpeech {
self.lastInterruptedAtSeconds = interruptedAt
}
}
} else {
self.ttsLogger.info("talk system voice start chars=\(cleaned.count, privacy: .public)")
if self.interruptOnSpeech {
await self.startRecognition()
guard self.isCurrent(gen) else { return }
}
await MainActor.run { TalkModeController.shared.updatePhase(.speaking) }
self.phase = .speaking
await TalkSystemSpeechSynthesizer.shared.stop()
try await TalkSystemSpeechSynthesizer.shared.speak(text: cleaned, language: language)
self.ttsLogger.info("talk system voice done")
}
} catch {
self.logger.error("talk TTS failed: \(error.localizedDescription, privacy: .public)")
self.ttsLogger.error("talk TTS failed: \(error.localizedDescription, privacy: .public); falling back to system voice")
do {
if self.interruptOnSpeech {
await self.startRecognition()
guard self.isCurrent(gen) else { return }
}
await MainActor.run { TalkModeController.shared.updatePhase(.speaking) }
self.phase = .speaking
await TalkSystemSpeechSynthesizer.shared.stop()
try await TalkSystemSpeechSynthesizer.shared.speak(text: cleaned, language: language)
} catch {
self.ttsLogger.error("talk system voice failed: \(error.localizedDescription, privacy: .public)")
}
}
if self.phase == .speaking {
@@ -505,7 +528,7 @@ actor TalkModeRuntime {
if let fallbackVoiceId { return fallbackVoiceId }
do {
let voices = try await ElevenLabsClient(apiKey: apiKey, logger: self.ttsLogger).listVoices()
let voices = try await ElevenLabsTTSClient(apiKey: apiKey).listVoices()
guard let first = voices.first else {
self.ttsLogger.error("elevenlabs voices list empty")
return nil
@@ -528,6 +551,7 @@ actor TalkModeRuntime {
func stopSpeaking(reason: TalkStopReason) async {
let interruptedAt = await MainActor.run { TalkAudioPlayer.shared.stop() }
await TalkSystemSpeechSynthesizer.shared.stop()
guard self.phase == .speaking else { return }
if reason == .speech, let interruptedAt {
self.lastInterruptedAtSeconds = interruptedAt
@@ -720,154 +744,4 @@ actor TalkModeRuntime {
return normalized
}
private static func validatedLanguage(_ value: String?, logger: Logger) -> String? {
guard let value else { return nil }
let normalized = value.trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
guard normalized.count == 2, normalized.allSatisfy({ $0 >= "a" && $0 <= "z" }) else {
logger.warning("talk language invalid: \(normalized, privacy: .public)")
return nil
}
return normalized
}
private static func validatedOutputFormat(_ value: String?, logger: Logger) -> String? {
let trimmed = value?.trimmingCharacters(in: .whitespacesAndNewlines) ?? ""
guard !trimmed.isEmpty else { return nil }
guard trimmed.hasPrefix("mp3_") else {
logger.warning("talk output_format unsupported for local playback: \(trimmed, privacy: .public)")
return nil
}
return trimmed
}
private static func isMessageTimestampAfter(_ timestamp: Double, sinceSeconds: Double) -> Bool {
let sinceMs = sinceSeconds * 1000
if timestamp > 10_000_000_000 {
return timestamp >= sinceMs - 500
}
return timestamp >= sinceSeconds - 0.5
}
}
private struct ElevenLabsRequest {
let text: String
let modelId: String?
let outputFormat: String?
let speed: Double?
let stability: Double?
let similarity: Double?
let style: Double?
let speakerBoost: Bool?
let seed: UInt32?
let normalize: String?
let language: String?
}
private struct ElevenLabsClient {
let apiKey: String
let logger: Logger
let baseUrl: URL = URL(string: "https://api.elevenlabs.io")!
let ttsTimeoutSeconds: TimeInterval = 45
let listVoicesTimeoutSeconds: TimeInterval = 15
func synthesize(voiceId: String, request: ElevenLabsRequest) async throws -> Data {
var url = self.baseUrl
url.appendPathComponent("v1")
url.appendPathComponent("text-to-speech")
url.appendPathComponent(voiceId)
let charCount = request.text.count
self.logger.info(
"elevenlabs tts request voice=\(voiceId, privacy: .public) model=\(request.modelId ?? "default", privacy: .public) chars=\(charCount, privacy: .public)")
let startedAt = Date()
var payload: [String: Any] = [
"text": request.text,
]
if let modelId = request.modelId, !modelId.isEmpty {
payload["model_id"] = modelId
}
if let outputFormat = request.outputFormat, !outputFormat.isEmpty {
payload["output_format"] = outputFormat
}
if let seed = request.seed {
payload["seed"] = seed
}
if let normalize = request.normalize {
payload["apply_text_normalization"] = normalize
}
if let language = request.language {
payload["language_code"] = language
}
var voiceSettings: [String: Any] = [:]
if let speed = request.speed { voiceSettings["speed"] = speed }
if let stability = request.stability { voiceSettings["stability"] = stability }
if let similarity = request.similarity { voiceSettings["similarity_boost"] = similarity }
if let style = request.style { voiceSettings["style"] = style }
if let speakerBoost = request.speakerBoost { voiceSettings["use_speaker_boost"] = speakerBoost }
if !voiceSettings.isEmpty {
payload["voice_settings"] = voiceSettings
}
let body = try JSONSerialization.data(withJSONObject: payload, options: [])
var req = URLRequest(url: url)
req.httpMethod = "POST"
req.httpBody = body
req.timeoutInterval = self.ttsTimeoutSeconds
req.setValue("application/json", forHTTPHeaderField: "Content-Type")
req.setValue("audio/mpeg", forHTTPHeaderField: "Accept")
req.setValue(self.apiKey, forHTTPHeaderField: "xi-api-key")
let (data, response) = try await URLSession.shared.data(for: req)
if let http = response as? HTTPURLResponse, http.statusCode >= 400 {
let message = String(data: data, encoding: .utf8) ?? "unknown"
self.logger.error(
"elevenlabs tts failed status=\(http.statusCode, privacy: .public) message=\(message, privacy: .public)")
throw NSError(domain: "TalkTTS", code: http.statusCode, userInfo: [
NSLocalizedDescriptionKey: "ElevenLabs failed: \(http.statusCode) \(message)",
])
}
let elapsed = Date().timeIntervalSince(startedAt)
self.logger.info("elevenlabs tts ok bytes=\(data.count, privacy: .public) dur=\(elapsed, privacy: .public)s")
return data
}
func listVoices() async throws -> [ElevenLabsVoice] {
var url = self.baseUrl
url.appendPathComponent("v1")
url.appendPathComponent("voices")
self.logger.info("elevenlabs voices list request")
var req = URLRequest(url: url)
req.httpMethod = "GET"
req.timeoutInterval = self.listVoicesTimeoutSeconds
req.setValue(self.apiKey, forHTTPHeaderField: "xi-api-key")
let (data, response) = try await URLSession.shared.data(for: req)
if let http = response as? HTTPURLResponse, http.statusCode >= 400 {
let message = String(data: data, encoding: .utf8) ?? "unknown"
self.logger.error(
"elevenlabs voices list failed status=\(http.statusCode, privacy: .public) message=\(message, privacy: .public)")
throw NSError(domain: "TalkTTS", code: http.statusCode, userInfo: [
NSLocalizedDescriptionKey: "ElevenLabs voices failed: \(http.statusCode) \(message)",
])
}
let decoded = try JSONDecoder().decode(ElevenLabsVoicesResponse.self, from: data)
return decoded.voices
}
}
private struct ElevenLabsVoice: Decodable {
let voiceId: String
let name: String?
enum CodingKeys: String, CodingKey {
case voiceId = "voice_id"
case name
}
}
private struct ElevenLabsVoicesResponse: Decodable {
let voices: [ElevenLabsVoice]
}

View File

@@ -7,7 +7,7 @@ import SwiftUI
@Observable
final class TalkOverlayController {
static let shared = TalkOverlayController()
static let overlaySize: CGFloat = 360
static let overlaySize: CGFloat = 440
static let windowInset: CGFloat = 88
private let logger = Logger(subsystem: "com.steipete.clawdis", category: "talk.overlay")

View File

@@ -31,7 +31,7 @@ struct TalkOverlayView: View {
}
.buttonStyle(.plain)
.contentShape(Circle())
.offset(x: -5, y: -5)
.offset(x: -2, y: -2)
.opacity(self.hoveringWindow ? 1 : 0)
.animation(.easeOut(duration: 0.12), value: self.hoveringWindow)
.allowsHitTesting(self.hoveringWindow)
@@ -42,7 +42,7 @@ struct TalkOverlayView: View {
.onHover { self.hoveringWindow = $0 }
}
private static let defaultSeamColor = Color(red: 127 / 255.0, green: 184 / 255.0, blue: 212 / 255.0)
private static let defaultSeamColor = Color(red: 79 / 255.0, green: 122 / 255.0, blue: 154 / 255.0)
private var seamColor: Color {
Self.color(fromHex: self.appState.seamColorHex) ?? Self.defaultSeamColor

View File

@@ -29,6 +29,10 @@ final class WebChatManager {
var onPanelVisibilityChanged: ((Bool) -> Void)?
var activeSessionKey: String? {
self.panelSessionKey ?? self.windowSessionKey
}
func show(sessionKey: String) {
self.closePanel()
if let controller = self.windowController {

View File

@@ -0,0 +1,233 @@
import Foundation
public struct ElevenLabsVoice: Decodable, Sendable {
public let voiceId: String
public let name: String?
enum CodingKeys: String, CodingKey {
case voiceId = "voice_id"
case name
}
}
public struct ElevenLabsTTSRequest: Sendable {
public var text: String
public var modelId: String?
public var outputFormat: String?
public var speed: Double?
public var stability: Double?
public var similarity: Double?
public var style: Double?
public var speakerBoost: Bool?
public var seed: UInt32?
public var normalize: String?
public var language: String?
public init(
text: String,
modelId: String? = nil,
outputFormat: String? = nil,
speed: Double? = nil,
stability: Double? = nil,
similarity: Double? = nil,
style: Double? = nil,
speakerBoost: Bool? = nil,
seed: UInt32? = nil,
normalize: String? = nil,
language: String? = nil)
{
self.text = text
self.modelId = modelId
self.outputFormat = outputFormat
self.speed = speed
self.stability = stability
self.similarity = similarity
self.style = style
self.speakerBoost = speakerBoost
self.seed = seed
self.normalize = normalize
self.language = language
}
}
public struct ElevenLabsTTSClient: Sendable {
public var apiKey: String
public var requestTimeoutSeconds: TimeInterval
public var listVoicesTimeoutSeconds: TimeInterval
public var baseUrl: URL
public init(
apiKey: String,
requestTimeoutSeconds: TimeInterval = 45,
listVoicesTimeoutSeconds: TimeInterval = 15,
baseUrl: URL = URL(string: "https://api.elevenlabs.io")!)
{
self.apiKey = apiKey
self.requestTimeoutSeconds = requestTimeoutSeconds
self.listVoicesTimeoutSeconds = listVoicesTimeoutSeconds
self.baseUrl = baseUrl
}
public func synthesizeWithHardTimeout(
voiceId: String,
request: ElevenLabsTTSRequest,
hardTimeoutSeconds: TimeInterval) async throws -> Data
{
try await withThrowingTaskGroup(of: Data.self) { group in
group.addTask {
try await self.synthesize(voiceId: voiceId, request: request)
}
group.addTask {
try await Task.sleep(nanoseconds: UInt64(hardTimeoutSeconds * 1_000_000_000))
throw NSError(domain: "ElevenLabsTTS", code: 408, userInfo: [
NSLocalizedDescriptionKey: "ElevenLabs TTS timed out after \(hardTimeoutSeconds)s",
])
}
let data = try await group.next()!
group.cancelAll()
return data
}
}
public func synthesize(voiceId: String, request: ElevenLabsTTSRequest) async throws -> Data {
var url = self.baseUrl
url.appendPathComponent("v1")
url.appendPathComponent("text-to-speech")
url.appendPathComponent(voiceId)
let body = try JSONSerialization.data(withJSONObject: Self.buildPayload(request), options: [])
var lastError: Error?
for attempt in 0..<3 {
var req = URLRequest(url: url)
req.httpMethod = "POST"
req.httpBody = body
req.timeoutInterval = self.requestTimeoutSeconds
req.setValue("application/json", forHTTPHeaderField: "Content-Type")
req.setValue("audio/mpeg", forHTTPHeaderField: "Accept")
req.setValue(self.apiKey, forHTTPHeaderField: "xi-api-key")
do {
let (data, response) = try await URLSession.shared.data(for: req)
if let http = response as? HTTPURLResponse {
let contentType = (http.value(forHTTPHeaderField: "Content-Type") ?? "unknown").lowercased()
if http.statusCode == 429 || http.statusCode >= 500 {
let message = Self.truncatedErrorBody(data)
lastError = NSError(domain: "ElevenLabsTTS", code: http.statusCode, userInfo: [
NSLocalizedDescriptionKey: "ElevenLabs retryable failure: \(http.statusCode) ct=\(contentType) \(message)",
])
if attempt < 2 {
let retryAfter = Double(http.value(forHTTPHeaderField: "Retry-After") ?? "")
let baseDelay = [0.25, 0.75, 1.5][attempt]
let delaySeconds = max(baseDelay, retryAfter ?? 0)
try? await Task.sleep(nanoseconds: UInt64(delaySeconds * 1_000_000_000))
continue
}
throw lastError!
}
if http.statusCode >= 400 {
let message = Self.truncatedErrorBody(data)
throw NSError(domain: "ElevenLabsTTS", code: http.statusCode, userInfo: [
NSLocalizedDescriptionKey: "ElevenLabs failed: \(http.statusCode) ct=\(contentType) \(message)",
])
}
if !contentType.contains("audio") {
let message = Self.truncatedErrorBody(data)
throw NSError(domain: "ElevenLabsTTS", code: 415, userInfo: [
NSLocalizedDescriptionKey: "ElevenLabs returned non-audio ct=\(contentType) \(message)",
])
}
}
return data
} catch {
lastError = error
if attempt < 2 {
try? await Task.sleep(nanoseconds: UInt64([0.25, 0.75, 1.5][attempt] * 1_000_000_000))
continue
}
throw error
}
}
throw lastError ?? NSError(domain: "ElevenLabsTTS", code: 1, userInfo: [
NSLocalizedDescriptionKey: "ElevenLabs failed",
])
}
public func listVoices() async throws -> [ElevenLabsVoice] {
var url = self.baseUrl
url.appendPathComponent("v1")
url.appendPathComponent("voices")
var req = URLRequest(url: url)
req.httpMethod = "GET"
req.timeoutInterval = self.listVoicesTimeoutSeconds
req.setValue(self.apiKey, forHTTPHeaderField: "xi-api-key")
let (data, response) = try await URLSession.shared.data(for: req)
if let http = response as? HTTPURLResponse, http.statusCode >= 400 {
let message = Self.truncatedErrorBody(data)
throw NSError(domain: "ElevenLabsTTS", code: http.statusCode, userInfo: [
NSLocalizedDescriptionKey: "ElevenLabs voices failed: \(http.statusCode) \(message)",
])
}
struct VoicesResponse: Decodable { let voices: [ElevenLabsVoice] }
return try JSONDecoder().decode(VoicesResponse.self, from: data).voices
}
public static func validatedOutputFormat(_ value: String?) -> String? {
let trimmed = (value ?? "").trimmingCharacters(in: .whitespacesAndNewlines)
guard !trimmed.isEmpty else { return nil }
guard trimmed.hasPrefix("mp3_") else { return nil }
return trimmed
}
public static func validatedLanguage(_ value: String?) -> String? {
let normalized = (value ?? "").trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
guard normalized.count == 2, normalized.allSatisfy({ $0 >= "a" && $0 <= "z" }) else { return nil }
return normalized
}
public static func validatedNormalize(_ value: String?) -> String? {
let normalized = (value ?? "").trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
guard ["auto", "on", "off"].contains(normalized) else { return nil }
return normalized
}
private static func buildPayload(_ request: ElevenLabsTTSRequest) -> [String: Any] {
var payload: [String: Any] = ["text": request.text]
if let modelId = request.modelId?.trimmingCharacters(in: .whitespacesAndNewlines), !modelId.isEmpty {
payload["model_id"] = modelId
}
if let outputFormat = request.outputFormat?.trimmingCharacters(in: .whitespacesAndNewlines), !outputFormat.isEmpty {
payload["output_format"] = outputFormat
}
if let seed = request.seed {
payload["seed"] = seed
}
if let normalize = request.normalize {
payload["apply_text_normalization"] = normalize
}
if let language = request.language {
payload["language_code"] = language
}
var voiceSettings: [String: Any] = [:]
if let speed = request.speed { voiceSettings["speed"] = speed }
if let stability = request.stability { voiceSettings["stability"] = stability }
if let similarity = request.similarity { voiceSettings["similarity_boost"] = similarity }
if let style = request.style { voiceSettings["style"] = style }
if let speakerBoost = request.speakerBoost { voiceSettings["use_speaker_boost"] = speakerBoost }
if !voiceSettings.isEmpty {
payload["voice_settings"] = voiceSettings
}
return payload
}
private static func truncatedErrorBody(_ data: Data) -> String {
let raw = String(data: data.prefix(4096), encoding: .utf8) ?? "unknown"
return raw.replacingOccurrences(of: "\n", with: " ").replacingOccurrences(of: "\r", with: " ")
}
}
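
A hedged usage sketch for the client above; the voice ID is a placeholder and the greeting text is illustrative:

import Foundation

func synthesizeGreeting() async throws -> Data {
    // Assumes ELEVENLABS_API_KEY is set in the environment.
    let apiKey = ProcessInfo.processInfo.environment["ELEVENLABS_API_KEY"] ?? ""
    let client = ElevenLabsTTSClient(apiKey: apiKey)
    let request = ElevenLabsTTSRequest(text: "Hello from Talk Mode.", outputFormat: "mp3_44100_128")
    // synthesize() retries 429/5xx with backoff (honoring Retry-After);
    // the hard timeout caps total wall time across those retries.
    return try await client.synthesizeWithHardTimeout(
        voiceId: "YOUR_VOICE_ID",
        request: request,
        hardTimeoutSeconds: 30)
}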

View File

@@ -67,12 +67,18 @@ public enum TalkDirectiveParser {
var lines = normalized.split(separator: "\n", omittingEmptySubsequences: false)
guard !lines.isEmpty else { return TalkDirectiveParseResult(directive: nil, stripped: text, unknownKeys: []) }
guard let firstNonEmpty =
guard let firstNonEmptyIndex =
lines.firstIndex(where: { !$0.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty })
else {
return TalkDirectiveParseResult(directive: nil, stripped: text, unknownKeys: [])
}
var firstNonEmpty = firstNonEmptyIndex
if firstNonEmpty > 0 {
lines.removeSubrange(0..<firstNonEmpty)
firstNonEmpty = 0
}
let head = lines[firstNonEmpty].trimmingCharacters(in: .whitespacesAndNewlines)
guard head.hasPrefix("{"), head.hasSuffix("}") else {
return TalkDirectiveParseResult(directive: nil, stripped: text, unknownKeys: [])
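
With this change, a reply with leading blank lines still parses its directive; a small sketch mirroring the test added below (values are hypothetical):

let reply = "\n\n{\"voice\":\"abc123\"}\nHello there."
let parsed = TalkDirectiveParser.parse(reply)
// parsed.directive?.voiceId == "abc123"
// parsed.stripped == "Hello there."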

View File

@@ -0,0 +1,13 @@
public enum TalkHistoryTimestamp: Sendable {
/// Gateway history timestamps have historically been emitted as either seconds (Double, epoch seconds)
/// or milliseconds (Double, epoch ms). This helper accepts either.
public static func isAfter(_ timestamp: Double, sinceSeconds: Double) -> Bool {
let sinceMs = sinceSeconds * 1000
// ~2286-11-20 in epoch seconds. Anything bigger is almost certainly epoch milliseconds.
if timestamp > 10_000_000_000 {
return timestamp >= sinceMs - 500
}
return timestamp >= sinceSeconds - 0.5
}
}
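
Both encodings compare against the same "since" expressed in seconds; a quick sketch (the epoch values are arbitrary):

let since = 1_700_000_000.0
_ = TalkHistoryTimestamp.isAfter(1_700_000_100, sinceSeconds: since)     // epoch seconds → true
_ = TalkHistoryTimestamp.isAfter(1_700_000_100_000, sinceSeconds: since) // epoch milliseconds → true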

View File

@@ -0,0 +1,18 @@
public enum TalkPromptBuilder: Sendable {
public static func build(transcript: String, interruptedAtSeconds: Double?) -> String {
var lines: [String] = [
"Talk Mode active. Reply in a concise, spoken tone.",
"You may optionally prefix the response with JSON (first line) to set ElevenLabs voice, e.g. {\"voice\":\"<id>\",\"once\":true}.",
]
if let interruptedAtSeconds {
let formatted = String(format: "%.1f", interruptedAtSeconds)
lines.append("Assistant speech interrupted at \(formatted)s.")
}
lines.append("")
lines.append(transcript)
return lines.joined(separator: "\n")
}
}
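
Example output shape (transcript and timing are illustrative):

let prompt = TalkPromptBuilder.build(transcript: "What changed?", interruptedAtSeconds: 3.5)
// Header lines ("Talk Mode active. ..."), then
// "Assistant speech interrupted at 3.5s.", a blank line, and "What changed?"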

View File

@@ -0,0 +1,110 @@
import AVFoundation
import Foundation
@MainActor
public final class TalkSystemSpeechSynthesizer: NSObject {
public enum SpeakError: Error {
case canceled
}
public static let shared = TalkSystemSpeechSynthesizer()
private let synth = AVSpeechSynthesizer()
private var speakContinuation: CheckedContinuation<Void, Error>?
private var currentUtterance: AVSpeechUtterance?
private var currentToken = UUID()
private var watchdog: Task<Void, Never>?
public var isSpeaking: Bool { self.synth.isSpeaking }
private override init() {
super.init()
self.synth.delegate = self
}
public func stop() {
self.currentToken = UUID()
self.watchdog?.cancel()
self.watchdog = nil
self.synth.stopSpeaking(at: .immediate)
self.finishCurrent(with: SpeakError.canceled)
}
public func speak(text: String, language: String? = nil) async throws {
let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines)
guard !trimmed.isEmpty else { return }
self.stop()
let token = UUID()
self.currentToken = token
let utterance = AVSpeechUtterance(string: trimmed)
if let language, let voice = AVSpeechSynthesisVoice(language: language) {
utterance.voice = voice
}
self.currentUtterance = utterance
let estimatedSeconds = max(3.0, min(180.0, Double(trimmed.count) * 0.08))
self.watchdog?.cancel()
self.watchdog = Task { @MainActor [weak self] in
guard let self else { return }
try? await Task.sleep(nanoseconds: UInt64(estimatedSeconds * 1_000_000_000))
if Task.isCancelled { return }
guard self.currentToken == token else { return }
if self.synth.isSpeaking {
self.synth.stopSpeaking(at: .immediate)
}
self.finishCurrent(
with: NSError(domain: "TalkSystemSpeechSynthesizer", code: 408, userInfo: [
NSLocalizedDescriptionKey: "system TTS timed out after \(estimatedSeconds)s",
]))
}
try await withTaskCancellationHandler(operation: {
try await withCheckedThrowingContinuation { cont in
self.speakContinuation = cont
self.synth.speak(utterance)
}
}, onCancel: {
Task { @MainActor in
self.stop()
}
})
if self.currentToken != token {
throw SpeakError.canceled
}
}
private func handleFinish(error: Error?) {
guard self.currentUtterance != nil else { return }
self.watchdog?.cancel()
self.watchdog = nil
self.finishCurrent(with: error)
}
private func finishCurrent(with error: Error?) {
self.currentUtterance = nil
let cont = self.speakContinuation
self.speakContinuation = nil
if let error {
cont?.resume(throwing: error)
} else {
cont?.resume(returning: ())
}
}
}
extension TalkSystemSpeechSynthesizer: AVSpeechSynthesizerDelegate {
public nonisolated func speechSynthesizer(_ synthesizer: AVSpeechSynthesizer, didFinish utterance: AVSpeechUtterance) {
Task { @MainActor in
self.handleFinish(error: nil)
}
}
public nonisolated func speechSynthesizer(_ synthesizer: AVSpeechSynthesizer, didCancel utterance: AVSpeechUtterance) {
Task { @MainActor in
self.handleFinish(error: SpeakError.canceled)
}
}
}
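
Callers are expected to be on the main actor; a minimal usage sketch (the function name is illustrative):

@MainActor
func speakFallbackCheck() async {
    do {
        try await TalkSystemSpeechSynthesizer.shared.speak(text: "Fallback voice check.", language: "en")
    } catch TalkSystemSpeechSynthesizer.SpeakError.canceled {
        // stop() was called or a newer utterance superseded this one.
    } catch {
        // Watchdog timeout or synthesizer failure.
    }
}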

View File

@@ -0,0 +1,27 @@
public enum TalkTTSValidation: Sendable {
public static func resolveSpeed(speed: Double?, rateWPM: Int?) -> Double? {
if let rateWPM, rateWPM > 0 {
let resolved = Double(rateWPM) / 175.0
if resolved <= 0.5 || resolved >= 2.0 { return nil }
return resolved
}
if let speed {
if speed <= 0.5 || speed >= 2.0 { return nil }
return speed
}
return nil
}
public static func validatedUnit(_ value: Double?) -> Double? {
guard let value else { return nil }
if value < 0 || value > 1 { return nil }
return value
}
public static func validatedSeed(_ value: Int?) -> UInt32? {
guard let value else { return nil }
if value < 0 || value > 4294967295 { return nil }
return UInt32(value)
}
}
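
A couple of boundary examples (values are illustrative): rateWPM is converted against a 175 WPM baseline and takes precedence over speed, and values at or beyond the 0.5 and 2.0 bounds are rejected.

_ = TalkTTSValidation.resolveSpeed(speed: nil, rateWPM: 140) // 0.8
_ = TalkTTSValidation.resolveSpeed(speed: 2.5, rateWPM: nil) // nil: outside the open (0.5, 2.0) range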

View File

@@ -0,0 +1,20 @@
import XCTest
@testable import ClawdisKit
final class ElevenLabsTTSValidationTests: XCTestCase {
func testValidatedOutputFormatAllowsOnlyMp3Presets() {
XCTAssertEqual(ElevenLabsTTSClient.validatedOutputFormat("mp3_44100_128"), "mp3_44100_128")
XCTAssertNil(ElevenLabsTTSClient.validatedOutputFormat("pcm_16000"))
}
func testValidatedLanguageAcceptsTwoLetterCodes() {
XCTAssertEqual(ElevenLabsTTSClient.validatedLanguage("EN"), "en")
XCTAssertNil(ElevenLabsTTSClient.validatedLanguage("eng"))
}
func testValidatedNormalizeAcceptsKnownValues() {
XCTAssertEqual(ElevenLabsTTSClient.validatedNormalize("AUTO"), "auto")
XCTAssertNil(ElevenLabsTTSClient.validatedNormalize("maybe"))
}
}

View File

@@ -50,6 +50,18 @@ final class TalkDirectiveTests: XCTestCase {
XCTAssertEqual(result.stripped, "Hello.")
}
func testSkipsLeadingEmptyLinesWhenParsingDirective() {
let text = """
{"voice":"abc123"}
Hello there.
"""
let result = TalkDirectiveParser.parse(text)
XCTAssertEqual(result.directive?.voiceId, "abc123")
XCTAssertEqual(result.stripped, "Hello there.")
}
func testTracksUnknownKeys() {
let text = """
{"voice":"abc","mystery":"value","extra":1}

View File

@@ -0,0 +1,16 @@
import XCTest
@testable import ClawdisKit
final class TalkHistoryTimestampTests: XCTestCase {
func testSecondsTimestampsAreAcceptedWithSmallTolerance() {
XCTAssertTrue(TalkHistoryTimestamp.isAfter(999.6, sinceSeconds: 1000))
XCTAssertFalse(TalkHistoryTimestamp.isAfter(999.4, sinceSeconds: 1000))
}
func testMillisecondsTimestampsAreAcceptedWithSmallTolerance() {
let sinceSeconds = 1_700_000_000.0
let sinceMs = sinceSeconds * 1000
XCTAssertTrue(TalkHistoryTimestamp.isAfter(sinceMs - 500, sinceSeconds: sinceSeconds))
XCTAssertFalse(TalkHistoryTimestamp.isAfter(sinceMs - 501, sinceSeconds: sinceSeconds))
}
}

View File

@@ -0,0 +1,16 @@
import XCTest
@testable import ClawdisKit
final class TalkPromptBuilderTests: XCTestCase {
func testBuildIncludesTranscript() {
let prompt = TalkPromptBuilder.build(transcript: "Hello", interruptedAtSeconds: nil)
XCTAssertTrue(prompt.contains("Talk Mode active."))
XCTAssertTrue(prompt.hasSuffix("\n\nHello"))
}
func testBuildIncludesInterruptionLineWhenProvided() {
let prompt = TalkPromptBuilder.build(transcript: "Hi", interruptedAtSeconds: 1.234)
XCTAssertTrue(prompt.contains("Assistant speech interrupted at 1.2s."))
}
}

View File

@@ -0,0 +1,24 @@
import XCTest
@testable import ClawdisKit
final class TalkTTSValidationTests: XCTestCase {
func testResolveSpeedUsesRateWPMWhenProvided() {
let resolved = TalkTTSValidation.resolveSpeed(speed: nil, rateWPM: 175)
XCTAssertNotNil(resolved)
XCTAssertEqual(resolved ?? 0, 1.0, accuracy: 0.0001)
XCTAssertNil(TalkTTSValidation.resolveSpeed(speed: nil, rateWPM: 400))
}
func testValidatedUnitBounds() {
XCTAssertEqual(TalkTTSValidation.validatedUnit(0), 0)
XCTAssertEqual(TalkTTSValidation.validatedUnit(1), 1)
XCTAssertNil(TalkTTSValidation.validatedUnit(-0.01))
XCTAssertNil(TalkTTSValidation.validatedUnit(1.01))
}
func testValidatedSeedBounds() {
XCTAssertEqual(TalkTTSValidation.validatedSeed(0), 0)
XCTAssertEqual(TalkTTSValidation.validatedSeed(1234), 1234)
XCTAssertNil(TalkTTSValidation.validatedSeed(-1))
}
}