fix: stream elevenlabs tts playback

This commit is contained in:
Peter Steinberger
2025-12-30 12:17:40 +01:00
parent 9c532eac07
commit 27adfb76fa
11 changed files with 1091 additions and 91 deletions

View File

@@ -17,6 +17,9 @@
- macOS Talk Mode: throttle audio-level updates (avoid per-buffer task creation) to reduce CPU/task churn. - macOS Talk Mode: throttle audio-level updates (avoid per-buffer task creation) to reduce CPU/task churn.
- macOS Talk Mode: increase overlay window size so wave rings don't clip; close button is hover-only and closer to the orb. - macOS Talk Mode: increase overlay window size so wave rings don't clip; close button is hover-only and closer to the orb.
- Talk Mode: fall back to system TTS when ElevenLabs is unavailable, returns non-audio, or playback fails (macOS/iOS/Android). - Talk Mode: fall back to system TTS when ElevenLabs is unavailable, returns non-audio, or playback fails (macOS/iOS/Android).
- Talk Mode: stream PCM on macOS/iOS for lower latency (incremental playback); Android continues MP3 streaming.
- Talk Mode: validate ElevenLabs v3 stability and latency tier directives before sending requests.
- iOS/Android Talk Mode: auto-select the first ElevenLabs voice when none is configured.
- ElevenLabs: add retry/backoff for 429/5xx and include content-type in errors for debugging. - ElevenLabs: add retry/backoff for 429/5xx and include content-type in errors for debugging.
- Talk Mode: align to the gateway's main session key and fall back to history polling when chat events drop (prevents stuck “thinking” / missing messages). - Talk Mode: align to the gateway's main session key and fall back to history polling when chat events drop (prevents stuck “thinking” / missing messages).
- Talk Mode: treat history timestamps as seconds or milliseconds to avoid stale assistant picks (macOS/iOS/Android). - Talk Mode: treat history timestamps as seconds or milliseconds to avoid stale assistant picks (macOS/iOS/Android).

View File

@@ -0,0 +1,98 @@
package com.steipete.clawdis.node.voice
import android.media.MediaDataSource
import kotlin.math.min
/**
 * A [MediaDataSource] that can be fed encoded audio incrementally while a
 * [android.media.MediaPlayer] is already reading from it.
 *
 * Producer calls [append] as network chunks arrive, then [finish] on success
 * or [fail] on error. The player's reader thread blocks inside [readAt] until
 * the requested range is buffered, the stream is finished, or the source is
 * failed/closed.
 *
 * Thread-safety: all shared state is guarded by [lock]; readers wait on it and
 * writers notify.
 */
internal class StreamingMediaDataSource : MediaDataSource() {
    // One appended network chunk; `start` is its absolute byte offset in the stream.
    private data class Chunk(val start: Long, val data: ByteArray)

    private val lock = Object()
    private val chunks = ArrayList<Chunk>()
    private var totalSize: Long = 0
    private var closed = false
    private var finished = false
    // Hint: chunk index where the previous read ended. Reads at earlier
    // positions must reset it — MediaPlayer may seek backwards while probing.
    private var lastReadIndex = 0

    /** Appends one chunk of audio bytes. No-op after [finish], [fail] or [close]. */
    fun append(data: ByteArray) {
        if (data.isEmpty()) return
        synchronized(lock) {
            if (closed || finished) return
            chunks.add(Chunk(totalSize, data))
            totalSize += data.size.toLong()
            lock.notifyAll()
        }
    }

    /** Marks the stream complete; blocked readers past the end then see EOF (-1). */
    fun finish() {
        synchronized(lock) {
            if (closed) return
            finished = true
            lock.notifyAll()
        }
    }

    /** Marks the stream failed; all current and future reads return -1. */
    fun fail() {
        synchronized(lock) {
            closed = true
            lock.notifyAll()
        }
    }

    override fun readAt(position: Long, buffer: ByteArray, offset: Int, size: Int): Int {
        if (position < 0) return -1
        // Guard: a non-positive size previously produced a negative return value.
        if (size <= 0) return 0
        synchronized(lock) {
            // Block until the requested offset is buffered or the stream ends.
            while (!closed && !finished && position >= totalSize) {
                try {
                    lock.wait()
                } catch (_: InterruptedException) {
                    // Preserve interrupt status and report end-of-stream to the player.
                    Thread.currentThread().interrupt()
                    return -1
                }
            }
            if (closed) return -1
            if (position >= totalSize && finished) return -1
            // Clamp in Long math first: (totalSize - position) may exceed Int range.
            val toRead = min(size.toLong(), totalSize - position).toInt()
            var remaining = toRead
            var destOffset = offset
            var pos = position
            var index = findChunkIndex(pos)
            while (remaining > 0 && index < chunks.size) {
                val chunk = chunks[index]
                val inChunkOffset = (pos - chunk.start).toInt()
                if (inChunkOffset >= chunk.data.size) {
                    index++
                    continue
                }
                val copyLen = min(remaining, chunk.data.size - inChunkOffset)
                System.arraycopy(chunk.data, inChunkOffset, buffer, destOffset, copyLen)
                remaining -= copyLen
                destOffset += copyLen
                pos += copyLen
                if (inChunkOffset + copyLen >= chunk.data.size) {
                    index++
                }
            }
            return toRead - remaining
        }
    }

    /** Total size is unknown while streaming; -1 tells MediaPlayer so. */
    override fun getSize(): Long = -1

    override fun close() {
        synchronized(lock) {
            closed = true
            lock.notifyAll()
        }
    }

    /**
     * Returns the index of the chunk containing [position] (chunks are sorted
     * by start offset). Scans forward from the cached hint.
     *
     * Bug fix: the previous forward-only scan returned a too-late chunk on
     * backward seeks, yielding a negative in-chunk offset and an
     * IndexOutOfBoundsException in System.arraycopy; reset the hint to 0 when
     * [position] lies before the hinted chunk.
     */
    private fun findChunkIndex(position: Long): Int {
        var index = lastReadIndex
        if (index >= chunks.size || position < chunks[index].start) {
            index = 0
        }
        while (index < chunks.size) {
            val chunk = chunks[index]
            if (position < chunk.start + chunk.data.size) break
            index++
        }
        lastReadIndex = index
        return index
    }
}

View File

@@ -18,7 +18,6 @@ import android.speech.tts.UtteranceProgressListener
import android.util.Log import android.util.Log
import androidx.core.content.ContextCompat import androidx.core.content.ContextCompat
import com.steipete.clawdis.node.bridge.BridgeSession import com.steipete.clawdis.node.bridge.BridgeSession
import java.io.File
import java.net.HttpURLConnection import java.net.HttpURLConnection
import java.net.URL import java.net.URL
import java.util.UUID import java.util.UUID
@@ -44,6 +43,7 @@ class TalkModeManager(
) { ) {
companion object { companion object {
private const val tag = "TalkMode" private const val tag = "TalkMode"
private const val defaultModelIdFallback = "eleven_v3"
} }
private val mainHandler = Handler(Looper.getMainLooper()) private val mainHandler = Handler(Looper.getMainLooper())
@@ -81,6 +81,7 @@ class TalkModeManager(
private var defaultVoiceId: String? = null private var defaultVoiceId: String? = null
private var currentVoiceId: String? = null private var currentVoiceId: String? = null
private var fallbackVoiceId: String? = null
private var defaultModelId: String? = null private var defaultModelId: String? = null
private var currentModelId: String? = null private var currentModelId: String? = null
private var defaultOutputFormat: String? = null private var defaultOutputFormat: String? = null
@@ -97,7 +98,7 @@ class TalkModeManager(
private var chatSubscribedSessionKey: String? = null private var chatSubscribedSessionKey: String? = null
private var player: MediaPlayer? = null private var player: MediaPlayer? = null
private var currentAudioFile: File? = null private var streamingSource: StreamingMediaDataSource? = null
private var systemTts: TextToSpeech? = null private var systemTts: TextToSpeech? = null
private var systemTtsPending: CompletableDeferred<Unit>? = null private var systemTtsPending: CompletableDeferred<Unit>? = null
private var systemTtsPendingId: String? = null private var systemTtsPendingId: String? = null
@@ -464,7 +465,13 @@ class TalkModeManager(
val apiKey = val apiKey =
apiKey?.trim()?.takeIf { it.isNotEmpty() } apiKey?.trim()?.takeIf { it.isNotEmpty() }
?: System.getenv("ELEVENLABS_API_KEY")?.trim() ?: System.getenv("ELEVENLABS_API_KEY")?.trim()
val voiceId = resolvedVoice ?: currentVoiceId ?: defaultVoiceId val preferredVoice = resolvedVoice ?: currentVoiceId ?: defaultVoiceId
val voiceId =
if (!apiKey.isNullOrEmpty()) {
resolveVoiceId(preferredVoice, apiKey)
} else {
null
}
_statusText.value = "Speaking…" _statusText.value = "Speaking…"
_isSpeaking.value = true _isSpeaking.value = true
@@ -486,24 +493,25 @@ class TalkModeManager(
} else { } else {
_usingFallbackTts.value = false _usingFallbackTts.value = false
val ttsStarted = SystemClock.elapsedRealtime() val ttsStarted = SystemClock.elapsedRealtime()
val modelId = directive?.modelId ?: currentModelId ?: defaultModelId
val request = val request =
ElevenLabsRequest( ElevenLabsRequest(
text = cleaned, text = cleaned,
modelId = directive?.modelId ?: currentModelId ?: defaultModelId, modelId = modelId,
outputFormat = outputFormat =
TalkModeRuntime.validatedOutputFormat(directive?.outputFormat ?: defaultOutputFormat), TalkModeRuntime.validatedOutputFormat(directive?.outputFormat ?: defaultOutputFormat),
speed = TalkModeRuntime.resolveSpeed(directive?.speed, directive?.rateWpm), speed = TalkModeRuntime.resolveSpeed(directive?.speed, directive?.rateWpm),
stability = TalkModeRuntime.validatedUnit(directive?.stability), stability = TalkModeRuntime.validatedStability(directive?.stability, modelId),
similarity = TalkModeRuntime.validatedUnit(directive?.similarity), similarity = TalkModeRuntime.validatedUnit(directive?.similarity),
style = TalkModeRuntime.validatedUnit(directive?.style), style = TalkModeRuntime.validatedUnit(directive?.style),
speakerBoost = directive?.speakerBoost, speakerBoost = directive?.speakerBoost,
seed = TalkModeRuntime.validatedSeed(directive?.seed), seed = TalkModeRuntime.validatedSeed(directive?.seed),
normalize = TalkModeRuntime.validatedNormalize(directive?.normalize), normalize = TalkModeRuntime.validatedNormalize(directive?.normalize),
language = TalkModeRuntime.validatedLanguage(directive?.language), language = TalkModeRuntime.validatedLanguage(directive?.language),
latencyTier = TalkModeRuntime.validatedLatencyTier(directive?.latencyTier),
) )
val audio = synthesize(voiceId = voiceId!!, apiKey = apiKey!!, request = request) streamAndPlay(voiceId = voiceId!!, apiKey = apiKey!!, request = request)
Log.d(tag, "elevenlabs ok bytes=${audio.size} durMs=${SystemClock.elapsedRealtime() - ttsStarted}") Log.d(tag, "elevenlabs stream ok durMs=${SystemClock.elapsedRealtime() - ttsStarted}")
playAudio(audio)
} }
} catch (err: Throwable) { } catch (err: Throwable) {
Log.w(tag, "speak failed: ${err.message ?: err::class.simpleName}; falling back to system voice") Log.w(tag, "speak failed: ${err.message ?: err::class.simpleName}; falling back to system voice")
@@ -520,22 +528,28 @@ class TalkModeManager(
_isSpeaking.value = false _isSpeaking.value = false
} }
private suspend fun playAudio(data: ByteArray) { private suspend fun streamAndPlay(voiceId: String, apiKey: String, request: ElevenLabsRequest) {
stopSpeaking(resetInterrupt = false) stopSpeaking(resetInterrupt = false)
val file = File.createTempFile("talk-", ".mp3", context.cacheDir)
file.writeBytes(data) val dataSource = StreamingMediaDataSource()
currentAudioFile = file streamingSource = dataSource
val player = MediaPlayer() val player = MediaPlayer()
this.player = player this.player = player
val prepared = CompletableDeferred<Unit>()
val finished = CompletableDeferred<Unit>() val finished = CompletableDeferred<Unit>()
player.setAudioAttributes( player.setAudioAttributes(
AudioAttributes.Builder() AudioAttributes.Builder()
.setContentType(AudioAttributes.CONTENT_TYPE_SPEECH) .setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
.setUsage(AudioAttributes.USAGE_ASSISTANT) .setUsage(AudioAttributes.USAGE_ASSISTANT)
.build(), .build(),
) )
player.setOnPreparedListener {
it.start()
prepared.complete(Unit)
}
player.setOnCompletionListener { player.setOnCompletionListener {
finished.complete(Unit) finished.complete(Unit)
} }
@@ -544,16 +558,30 @@ class TalkModeManager(
true true
} }
player.setDataSource(file.absolutePath) player.setDataSource(dataSource)
withContext(Dispatchers.Main) { withContext(Dispatchers.Main) {
player.setOnPreparedListener { it.start() }
player.prepareAsync() player.prepareAsync()
} }
val fetchError = CompletableDeferred<Throwable?>()
val fetchJob =
scope.launch(Dispatchers.IO) {
try {
streamTts(voiceId = voiceId, apiKey = apiKey, request = request, sink = dataSource)
fetchError.complete(null)
} catch (err: Throwable) {
dataSource.fail()
fetchError.complete(err)
}
}
Log.d(tag, "play start") Log.d(tag, "play start")
try { try {
prepared.await()
finished.await() finished.await()
fetchError.await()?.let { throw it }
} finally { } finally {
fetchJob.cancel()
cleanupPlayer() cleanupPlayer()
} }
Log.d(tag, "play done") Log.d(tag, "play done")
@@ -674,8 +702,8 @@ class TalkModeManager(
player?.stop() player?.stop()
player?.release() player?.release()
player = null player = null
currentAudioFile?.delete() streamingSource?.close()
currentAudioFile = null streamingSource = null
} }
private fun shouldInterrupt(transcript: String): Boolean { private fun shouldInterrupt(transcript: String): Boolean {
@@ -713,13 +741,15 @@ class TalkModeManager(
defaultVoiceId = voice ?: envVoice?.takeIf { it.isNotEmpty() } ?: sagVoice?.takeIf { it.isNotEmpty() } defaultVoiceId = voice ?: envVoice?.takeIf { it.isNotEmpty() } ?: sagVoice?.takeIf { it.isNotEmpty() }
voiceAliases = aliases voiceAliases = aliases
if (!voiceOverrideActive) currentVoiceId = defaultVoiceId if (!voiceOverrideActive) currentVoiceId = defaultVoiceId
defaultModelId = model defaultModelId = model ?: defaultModelIdFallback
if (!modelOverrideActive) currentModelId = defaultModelId if (!modelOverrideActive) currentModelId = defaultModelId
defaultOutputFormat = outputFormat defaultOutputFormat = outputFormat
apiKey = key ?: envKey?.takeIf { it.isNotEmpty() } apiKey = key ?: envKey?.takeIf { it.isNotEmpty() }
if (interrupt != null) interruptOnSpeech = interrupt if (interrupt != null) interruptOnSpeech = interrupt
} catch (_: Throwable) { } catch (_: Throwable) {
defaultVoiceId = envVoice?.takeIf { it.isNotEmpty() } ?: sagVoice?.takeIf { it.isNotEmpty() } defaultVoiceId = envVoice?.takeIf { it.isNotEmpty() } ?: sagVoice?.takeIf { it.isNotEmpty() }
defaultModelId = defaultModelIdFallback
if (!modelOverrideActive) currentModelId = defaultModelId
apiKey = envKey?.takeIf { it.isNotEmpty() } apiKey = envKey?.takeIf { it.isNotEmpty() }
voiceAliases = emptyMap() voiceAliases = emptyMap()
} }
@@ -730,9 +760,21 @@ class TalkModeManager(
return obj["runId"].asStringOrNull() return obj["runId"].asStringOrNull()
} }
private suspend fun synthesize(voiceId: String, apiKey: String, request: ElevenLabsRequest): ByteArray { private suspend fun streamTts(
return withContext(Dispatchers.IO) { voiceId: String,
val url = URL("https://api.elevenlabs.io/v1/text-to-speech/$voiceId") apiKey: String,
request: ElevenLabsRequest,
sink: StreamingMediaDataSource,
) {
withContext(Dispatchers.IO) {
val baseUrl = "https://api.elevenlabs.io/v1/text-to-speech/$voiceId/stream"
val latencyTier = request.latencyTier
val url =
if (latencyTier != null) {
URL("$baseUrl?optimize_streaming_latency=$latencyTier")
} else {
URL(baseUrl)
}
val conn = url.openConnection() as HttpURLConnection val conn = url.openConnection() as HttpURLConnection
conn.requestMethod = "POST" conn.requestMethod = "POST"
conn.connectTimeout = 30_000 conn.connectTimeout = 30_000
@@ -746,13 +788,21 @@ class TalkModeManager(
conn.outputStream.use { it.write(payload.toByteArray()) } conn.outputStream.use { it.write(payload.toByteArray()) }
val code = conn.responseCode val code = conn.responseCode
val stream = if (code >= 400) conn.errorStream else conn.inputStream
val data = stream.readBytes()
if (code >= 400) { if (code >= 400) {
val message = String(data) val message = conn.errorStream?.readBytes()?.toString(Charsets.UTF_8) ?: ""
sink.fail()
throw IllegalStateException("ElevenLabs failed: $code $message") throw IllegalStateException("ElevenLabs failed: $code $message")
} }
data
val buffer = ByteArray(8 * 1024)
conn.inputStream.use { input ->
while (true) {
val read = input.read(buffer)
if (read <= 0) break
sink.append(buffer.copyOf(read))
}
}
sink.finish()
} }
} }
@@ -794,6 +844,7 @@ class TalkModeManager(
val seed: Long?, val seed: Long?,
val normalize: String?, val normalize: String?,
val language: String?, val language: String?,
val latencyTier: Int?,
) )
private object TalkModeRuntime { private object TalkModeRuntime {
@@ -816,6 +867,15 @@ class TalkModeManager(
return value return value
} }
fun validatedStability(value: Double?, modelId: String?): Double? {
if (value == null) return null
val normalized = modelId?.trim()?.lowercase()
if (normalized == "eleven_v3") {
return if (value == 0.0 || value == 0.5 || value == 1.0) value else null
}
return validatedUnit(value)
}
fun validatedSeed(value: Long?): Long? { fun validatedSeed(value: Long?): Long? {
if (value == null) return null if (value == null) return null
if (value < 0 || value > 4294967295L) return null if (value < 0 || value > 4294967295L) return null
@@ -840,6 +900,12 @@ class TalkModeManager(
return if (trimmed.startsWith("mp3_")) trimmed else null return if (trimmed.startsWith("mp3_")) trimmed else null
} }
fun validatedLatencyTier(value: Int?): Int? {
if (value == null) return null
if (value < 0 || value > 4) return null
return value
}
fun isMessageTimestampAfter(timestamp: Double, sinceSeconds: Double): Boolean { fun isMessageTimestampAfter(timestamp: Double, sinceSeconds: Double): Boolean {
val sinceMs = sinceSeconds * 1000 val sinceMs = sinceSeconds * 1000
return if (timestamp > 10_000_000_000) { return if (timestamp > 10_000_000_000) {
@@ -876,6 +942,62 @@ class TalkModeManager(
return if (isLikelyVoiceId(trimmed)) trimmed else null return if (isLikelyVoiceId(trimmed)) trimmed else null
} }
private suspend fun resolveVoiceId(preferred: String?, apiKey: String): String? {
val trimmed = preferred?.trim().orEmpty()
if (trimmed.isNotEmpty()) {
val resolved = resolveVoiceAlias(trimmed)
if (resolved != null) return resolved
Log.w(tag, "unknown voice alias $trimmed")
}
fallbackVoiceId?.let { return it }
return try {
val voices = listVoices(apiKey)
val first = voices.firstOrNull() ?: return null
fallbackVoiceId = first.voiceId
if (defaultVoiceId.isNullOrBlank()) {
defaultVoiceId = first.voiceId
}
if (!voiceOverrideActive) {
currentVoiceId = first.voiceId
}
val name = first.name ?: "unknown"
Log.d(tag, "default voice selected $name (${first.voiceId})")
first.voiceId
} catch (err: Throwable) {
Log.w(tag, "list voices failed: ${err.message ?: err::class.simpleName}")
null
}
}
private suspend fun listVoices(apiKey: String): List<ElevenLabsVoice> {
return withContext(Dispatchers.IO) {
val url = URL("https://api.elevenlabs.io/v1/voices")
val conn = url.openConnection() as HttpURLConnection
conn.requestMethod = "GET"
conn.connectTimeout = 15_000
conn.readTimeout = 15_000
conn.setRequestProperty("xi-api-key", apiKey)
val code = conn.responseCode
val stream = if (code >= 400) conn.errorStream else conn.inputStream
val data = stream.readBytes()
if (code >= 400) {
val message = data.toString(Charsets.UTF_8)
throw IllegalStateException("ElevenLabs voices failed: $code $message")
}
val root = json.parseToJsonElement(data.toString(Charsets.UTF_8)).asObjectOrNull()
val voices = (root?.get("voices") as? JsonArray) ?: JsonArray(emptyList())
voices.mapNotNull { entry ->
val obj = entry.asObjectOrNull() ?: return@mapNotNull null
val voiceId = obj["voice_id"].asStringOrNull() ?: return@mapNotNull null
val name = obj["name"].asStringOrNull()
ElevenLabsVoice(voiceId, name)
}
}
}
private fun isLikelyVoiceId(value: String): Boolean { private fun isLikelyVoiceId(value: String): Boolean {
if (value.length < 10) return false if (value.length < 10) return false
return value.all { it.isLetterOrDigit() || it == '-' || it == '_' } return value.all { it.isLetterOrDigit() || it == '-' || it == '_' }
@@ -884,6 +1006,8 @@ class TalkModeManager(
private fun normalizeAliasKey(value: String): String = private fun normalizeAliasKey(value: String): String =
value.trim().lowercase() value.trim().lowercase()
private data class ElevenLabsVoice(val voiceId: String, val name: String?)
private val listener = private val listener =
object : RecognitionListener { object : RecognitionListener {
override fun onReadyForSpeech(params: Bundle?) { override fun onReadyForSpeech(params: Bundle?) {

View File

@@ -9,6 +9,7 @@ import Speech
@Observable @Observable
final class TalkModeManager: NSObject { final class TalkModeManager: NSObject {
private typealias SpeechRequest = SFSpeechAudioBufferRecognitionRequest private typealias SpeechRequest = SFSpeechAudioBufferRecognitionRequest
private static let defaultModelIdFallback = "eleven_v3"
var isEnabled: Bool = false var isEnabled: Bool = false
var isListening: Bool = false var isListening: Bool = false
var isSpeaking: Bool = false var isSpeaking: Bool = false
@@ -36,11 +37,12 @@ final class TalkModeManager: NSObject {
private var voiceAliases: [String: String] = [:] private var voiceAliases: [String: String] = [:]
private var interruptOnSpeech: Bool = true private var interruptOnSpeech: Bool = true
private var mainSessionKey: String = "main" private var mainSessionKey: String = "main"
private var fallbackVoiceId: String?
private var lastPlaybackWasPCM: Bool = false
private var bridge: BridgeSession? private var bridge: BridgeSession?
private let silenceWindow: TimeInterval = 0.7 private let silenceWindow: TimeInterval = 0.7
private var player: AVAudioPlayer?
private var chatSubscribedSessionKeys = Set<String>() private var chatSubscribedSessionKeys = Set<String>()
private let logger = Logger(subsystem: "com.steipete.clawdis", category: "TalkMode") private let logger = Logger(subsystem: "com.steipete.clawdis", category: "TalkMode")
@@ -446,43 +448,43 @@ final class TalkModeManager: NSObject {
let started = Date() let started = Date()
let language = ElevenLabsTTSClient.validatedLanguage(directive?.language) let language = ElevenLabsTTSClient.validatedLanguage(directive?.language)
let voiceId = resolvedVoice ?? self.currentVoiceId ?? self.defaultVoiceId
let resolvedKey = let resolvedKey =
(self.apiKey?.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty == false ? self.apiKey : nil) ?? (self.apiKey?.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty == false ? self.apiKey : nil) ??
ProcessInfo.processInfo.environment["ELEVENLABS_API_KEY"] ProcessInfo.processInfo.environment["ELEVENLABS_API_KEY"]
let apiKey = resolvedKey?.trimmingCharacters(in: .whitespacesAndNewlines) let apiKey = resolvedKey?.trimmingCharacters(in: .whitespacesAndNewlines)
let preferredVoice = resolvedVoice ?? self.currentVoiceId ?? self.defaultVoiceId
let voiceId: String? = if let apiKey, !apiKey.isEmpty {
await self.resolveVoiceId(preferred: preferredVoice, apiKey: apiKey)
} else {
nil
}
let canUseElevenLabs = (voiceId?.isEmpty == false) && (apiKey?.isEmpty == false) let canUseElevenLabs = (voiceId?.isEmpty == false) && (apiKey?.isEmpty == false)
if canUseElevenLabs, let voiceId, let apiKey { if canUseElevenLabs, let voiceId, let apiKey {
let desiredOutputFormat = directive?.outputFormat ?? self.defaultOutputFormat let desiredOutputFormat = directive?.outputFormat ?? self.defaultOutputFormat ?? "pcm_44100"
let outputFormat = ElevenLabsTTSClient.validatedOutputFormat(desiredOutputFormat) let outputFormat = ElevenLabsTTSClient.validatedOutputFormat(desiredOutputFormat)
if outputFormat == nil, let desiredOutputFormat, !desiredOutputFormat.isEmpty { if outputFormat == nil, let desiredOutputFormat, !desiredOutputFormat.isEmpty {
self.logger.warning( self.logger.warning(
"talk output_format unsupported for local playback: \(desiredOutputFormat, privacy: .public)") "talk output_format unsupported for local playback: \(desiredOutputFormat, privacy: .public)")
} }
let modelId = directive?.modelId ?? self.currentModelId ?? self.defaultModelId
let request = ElevenLabsTTSRequest( let request = ElevenLabsTTSRequest(
text: cleaned, text: cleaned,
modelId: directive?.modelId ?? self.currentModelId ?? self.defaultModelId, modelId: modelId,
outputFormat: outputFormat, outputFormat: outputFormat,
speed: TalkTTSValidation.resolveSpeed(speed: directive?.speed, rateWPM: directive?.rateWPM), speed: TalkTTSValidation.resolveSpeed(speed: directive?.speed, rateWPM: directive?.rateWPM),
stability: TalkTTSValidation.validatedUnit(directive?.stability), stability: TalkTTSValidation.validatedStability(directive?.stability, modelId: modelId),
similarity: TalkTTSValidation.validatedUnit(directive?.similarity), similarity: TalkTTSValidation.validatedUnit(directive?.similarity),
style: TalkTTSValidation.validatedUnit(directive?.style), style: TalkTTSValidation.validatedUnit(directive?.style),
speakerBoost: directive?.speakerBoost, speakerBoost: directive?.speakerBoost,
seed: TalkTTSValidation.validatedSeed(directive?.seed), seed: TalkTTSValidation.validatedSeed(directive?.seed),
normalize: ElevenLabsTTSClient.validatedNormalize(directive?.normalize), normalize: ElevenLabsTTSClient.validatedNormalize(directive?.normalize),
language: language) language: language,
latencyTier: TalkTTSValidation.validatedLatencyTier(directive?.latencyTier))
let synthTimeoutSeconds = max(20.0, min(90.0, Double(cleaned.count) * 0.12))
let client = ElevenLabsTTSClient(apiKey: apiKey) let client = ElevenLabsTTSClient(apiKey: apiKey)
let audio = try await client.synthesizeWithHardTimeout( let stream = client.streamSynthesize(voiceId: voiceId, request: request)
voiceId: voiceId,
request: request,
hardTimeoutSeconds: synthTimeoutSeconds)
self.logger
.info(
"elevenlabs ok bytes=\(audio.count, privacy: .public) dur=\(Date().timeIntervalSince(started), privacy: .public)s")
if self.interruptOnSpeech { if self.interruptOnSpeech {
do { do {
@@ -494,7 +496,21 @@ final class TalkModeManager: NSObject {
} }
self.statusText = "Speaking…" self.statusText = "Speaking…"
try await self.playAudio(data: audio) let sampleRate = TalkTTSValidation.pcmSampleRate(from: outputFormat)
let result: StreamingPlaybackResult
if let sampleRate {
self.lastPlaybackWasPCM = true
result = await PCMStreamingAudioPlayer.shared.play(stream: stream, sampleRate: sampleRate)
} else {
self.lastPlaybackWasPCM = false
result = await StreamingAudioPlayer.shared.play(stream: stream)
}
self.logger
.info(
"elevenlabs stream finished=\(result.finished, privacy: .public) dur=\(Date().timeIntervalSince(started), privacy: .public)s")
if !result.finished, let interruptedAt = result.interruptedAt {
self.lastInterruptedAtSeconds = interruptedAt
}
} else { } else {
self.logger.warning("tts unavailable; falling back to system voice (missing key or voiceId)") self.logger.warning("tts unavailable; falling back to system voice (missing key or voiceId)")
if self.interruptOnSpeech { if self.interruptOnSpeech {
@@ -533,30 +549,17 @@ final class TalkModeManager: NSObject {
self.isSpeaking = false self.isSpeaking = false
} }
private func playAudio(data: Data) async throws {
self.player?.stop()
let player = try AVAudioPlayer(data: data)
self.player = player
player.prepareToPlay()
self.logger.info("play start")
guard player.play() else {
throw NSError(domain: "TalkMode", code: 2, userInfo: [
NSLocalizedDescriptionKey: "audio player refused to play",
])
}
while player.isPlaying {
try? await Task.sleep(nanoseconds: 120_000_000)
}
self.logger.info("play done")
}
private func stopSpeaking(storeInterruption: Bool = true) { private func stopSpeaking(storeInterruption: Bool = true) {
guard self.isSpeaking else { return } guard self.isSpeaking else { return }
let interruptedAt = self.lastPlaybackWasPCM
? PCMStreamingAudioPlayer.shared.stop()
: StreamingAudioPlayer.shared.stop()
if storeInterruption { if storeInterruption {
self.lastInterruptedAtSeconds = self.player?.currentTime self.lastInterruptedAtSeconds = interruptedAt
} }
self.player?.stop() _ = self.lastPlaybackWasPCM
self.player = nil ? StreamingAudioPlayer.shared.stop()
: PCMStreamingAudioPlayer.shared.stop()
TalkSystemSpeechSynthesizer.shared.stop() TalkSystemSpeechSynthesizer.shared.stop()
self.isSpeaking = false self.isSpeaking = false
} }
@@ -581,6 +584,37 @@ final class TalkModeManager: NSObject {
return Self.isLikelyVoiceId(trimmed) ? trimmed : nil return Self.isLikelyVoiceId(trimmed) ? trimmed : nil
} }
private func resolveVoiceId(preferred: String?, apiKey: String) async -> String? {
let trimmed = preferred?.trimmingCharacters(in: .whitespacesAndNewlines) ?? ""
if !trimmed.isEmpty {
if let resolved = self.resolveVoiceAlias(trimmed) { return resolved }
self.logger.warning("unknown voice alias \(trimmed, privacy: .public)")
}
if let fallbackVoiceId { return fallbackVoiceId }
do {
let voices = try await ElevenLabsTTSClient(apiKey: apiKey).listVoices()
guard let first = voices.first else {
self.logger.warning("elevenlabs voices list empty")
return nil
}
self.fallbackVoiceId = first.voiceId
if self.defaultVoiceId == nil {
self.defaultVoiceId = first.voiceId
}
if !self.voiceOverrideActive {
self.currentVoiceId = first.voiceId
}
let name = first.name ?? "unknown"
self.logger
.info("default voice selected \(name, privacy: .public) (\(first.voiceId, privacy: .public))")
return first.voiceId
} catch {
self.logger.error("elevenlabs list voices failed: \(error.localizedDescription, privacy: .public)")
return nil
}
}
private static func isLikelyVoiceId(_ value: String) -> Bool { private static func isLikelyVoiceId(_ value: String) -> Bool {
guard value.count >= 10 else { return false } guard value.count >= 10 else { return false }
return value.allSatisfy { $0.isLetter || $0.isNumber || $0 == "-" || $0 == "_" } return value.allSatisfy { $0.isLetter || $0.isNumber || $0 == "-" || $0 == "_" }
@@ -598,22 +632,23 @@ final class TalkModeManager: NSObject {
self.mainSessionKey = rawMainKey.isEmpty ? "main" : rawMainKey self.mainSessionKey = rawMainKey.isEmpty ? "main" : rawMainKey
self.defaultVoiceId = (talk?["voiceId"] as? String)?.trimmingCharacters(in: .whitespacesAndNewlines) self.defaultVoiceId = (talk?["voiceId"] as? String)?.trimmingCharacters(in: .whitespacesAndNewlines)
if let aliases = talk?["voiceAliases"] as? [String: Any] { if let aliases = talk?["voiceAliases"] as? [String: Any] {
self.voiceAliases = var resolved: [String: String] = [:]
aliases.compactMap { key, value in for (key, value) in aliases {
guard let id = value as? String else { return nil } guard let id = value as? String else { continue }
let normalizedKey = key.trimmingCharacters(in: .whitespacesAndNewlines).lowercased() let normalizedKey = key.trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
let trimmedId = id.trimmingCharacters(in: .whitespacesAndNewlines) let trimmedId = id.trimmingCharacters(in: .whitespacesAndNewlines)
guard !normalizedKey.isEmpty, !trimmedId.isEmpty else { return nil } guard !normalizedKey.isEmpty, !trimmedId.isEmpty else { continue }
return (normalizedKey, trimmedId) resolved[normalizedKey] = trimmedId
} }
.reduce(into: [:]) { $0[$1.0] = $1.1 } self.voiceAliases = resolved
} else { } else {
self.voiceAliases = [:] self.voiceAliases = [:]
} }
if !self.voiceOverrideActive { if !self.voiceOverrideActive {
self.currentVoiceId = self.defaultVoiceId self.currentVoiceId = self.defaultVoiceId
} }
self.defaultModelId = (talk?["modelId"] as? String)?.trimmingCharacters(in: .whitespacesAndNewlines) let model = (talk?["modelId"] as? String)?.trimmingCharacters(in: .whitespacesAndNewlines)
self.defaultModelId = (model?.isEmpty == false) ? model : Self.defaultModelIdFallback
if !self.modelOverrideActive { if !self.modelOverrideActive {
self.currentModelId = self.defaultModelId self.currentModelId = self.defaultModelId
} }
@@ -624,7 +659,10 @@ final class TalkModeManager: NSObject {
self.interruptOnSpeech = interrupt self.interruptOnSpeech = interrupt
} }
} catch { } catch {
// ignore self.defaultModelId = Self.defaultModelIdFallback
if !self.modelOverrideActive {
self.currentModelId = self.defaultModelId
}
} }
} }

View File

@@ -10,6 +10,7 @@ actor TalkModeRuntime {
private let logger = Logger(subsystem: "com.steipete.clawdis", category: "talk.runtime") private let logger = Logger(subsystem: "com.steipete.clawdis", category: "talk.runtime")
private let ttsLogger = Logger(subsystem: "com.steipete.clawdis", category: "talk.tts") private let ttsLogger = Logger(subsystem: "com.steipete.clawdis", category: "talk.tts")
private static let defaultModelIdFallback = "eleven_v3"
private final class RMSMeter: @unchecked Sendable { private final class RMSMeter: @unchecked Sendable {
private let lock = NSLock() private let lock = NSLock()
@@ -62,6 +63,7 @@ actor TalkModeRuntime {
private var lastSpokenText: String? private var lastSpokenText: String?
private var apiKey: String? private var apiKey: String?
private var fallbackVoiceId: String? private var fallbackVoiceId: String?
private var lastPlaybackWasPCM: Bool = false
private let silenceWindow: TimeInterval = 0.7 private let silenceWindow: TimeInterval = 0.7
private let minSpeechRMS: Double = 1e-3 private let minSpeechRMS: Double = 1e-3
@@ -496,7 +498,7 @@ actor TalkModeRuntime {
do { do {
if let apiKey, !apiKey.isEmpty, let voiceId { if let apiKey, !apiKey.isEmpty, let voiceId {
let desiredOutputFormat = directive?.outputFormat ?? self.defaultOutputFormat let desiredOutputFormat = directive?.outputFormat ?? self.defaultOutputFormat ?? "pcm_44100"
let outputFormat = ElevenLabsTTSClient.validatedOutputFormat(desiredOutputFormat) let outputFormat = ElevenLabsTTSClient.validatedOutputFormat(desiredOutputFormat)
if outputFormat == nil, let desiredOutputFormat, !desiredOutputFormat.isEmpty { if outputFormat == nil, let desiredOutputFormat, !desiredOutputFormat.isEmpty {
self.logger self.logger
@@ -504,27 +506,25 @@ actor TalkModeRuntime {
"talk output_format unsupported for local playback: \(desiredOutputFormat, privacy: .public)") "talk output_format unsupported for local playback: \(desiredOutputFormat, privacy: .public)")
} }
let modelId = directive?.modelId ?? self.currentModelId ?? self.defaultModelId
let request = ElevenLabsTTSRequest( let request = ElevenLabsTTSRequest(
text: cleaned, text: cleaned,
modelId: directive?.modelId ?? self.currentModelId ?? self.defaultModelId, modelId: modelId,
outputFormat: outputFormat, outputFormat: outputFormat,
speed: TalkTTSValidation.resolveSpeed(speed: directive?.speed, rateWPM: directive?.rateWPM), speed: TalkTTSValidation.resolveSpeed(speed: directive?.speed, rateWPM: directive?.rateWPM),
stability: TalkTTSValidation.validatedUnit(directive?.stability), stability: TalkTTSValidation.validatedStability(directive?.stability, modelId: modelId),
similarity: TalkTTSValidation.validatedUnit(directive?.similarity), similarity: TalkTTSValidation.validatedUnit(directive?.similarity),
style: TalkTTSValidation.validatedUnit(directive?.style), style: TalkTTSValidation.validatedUnit(directive?.style),
speakerBoost: directive?.speakerBoost, speakerBoost: directive?.speakerBoost,
seed: TalkTTSValidation.validatedSeed(directive?.seed), seed: TalkTTSValidation.validatedSeed(directive?.seed),
normalize: ElevenLabsTTSClient.validatedNormalize(directive?.normalize), normalize: ElevenLabsTTSClient.validatedNormalize(directive?.normalize),
language: language) language: language,
latencyTier: TalkTTSValidation.validatedLatencyTier(directive?.latencyTier))
self.ttsLogger.info("talk TTS synth timeout=\(synthTimeoutSeconds, privacy: .public)s") self.ttsLogger.info("talk TTS synth timeout=\(synthTimeoutSeconds, privacy: .public)s")
let client = ElevenLabsTTSClient(apiKey: apiKey) let client = ElevenLabsTTSClient(apiKey: apiKey)
let audio = try await client.synthesizeWithHardTimeout( let stream = client.streamSynthesize(voiceId: voiceId, request: request)
voiceId: voiceId,
request: request,
hardTimeoutSeconds: synthTimeoutSeconds)
guard self.isCurrent(gen) else { return } guard self.isCurrent(gen) else { return }
self.ttsLogger.info("talk TTS response bytes=\(audio.count, privacy: .public)")
if self.interruptOnSpeech { if self.interruptOnSpeech {
await self.startRecognition() await self.startRecognition()
@@ -534,12 +534,20 @@ actor TalkModeRuntime {
await MainActor.run { TalkModeController.shared.updatePhase(.speaking) } await MainActor.run { TalkModeController.shared.updatePhase(.speaking) }
self.phase = .speaking self.phase = .speaking
let result = await TalkAudioPlayer.shared.play(data: audio) let sampleRate = TalkTTSValidation.pcmSampleRate(from: outputFormat)
let result: StreamingPlaybackResult
if let sampleRate {
self.lastPlaybackWasPCM = true
result = await PCMStreamingAudioPlayer.shared.play(stream: stream, sampleRate: sampleRate)
} else {
self.lastPlaybackWasPCM = false
result = await StreamingAudioPlayer.shared.play(stream: stream)
}
self.ttsLogger self.ttsLogger
.info( .info(
"talk audio result finished=\(result.finished, privacy: .public) interruptedAt=\(String(describing: result.interruptedAt), privacy: .public)") "talk audio result finished=\(result.finished, privacy: .public) interruptedAt=\(String(describing: result.interruptedAt), privacy: .public)")
if !result.finished, result.interruptedAt == nil { if !result.finished, result.interruptedAt == nil {
throw NSError(domain: "TalkAudioPlayer", code: 1, userInfo: [ throw NSError(domain: "StreamingAudioPlayer", code: 1, userInfo: [
NSLocalizedDescriptionKey: "audio playback failed", NSLocalizedDescriptionKey: "audio playback failed",
]) ])
} }
@@ -631,7 +639,15 @@ actor TalkModeRuntime {
} }
func stopSpeaking(reason: TalkStopReason) async { func stopSpeaking(reason: TalkStopReason) async {
let interruptedAt = await MainActor.run { TalkAudioPlayer.shared.stop() } let interruptedAt = await MainActor.run {
let primary = self.lastPlaybackWasPCM
? PCMStreamingAudioPlayer.shared.stop()
: StreamingAudioPlayer.shared.stop()
_ = self.lastPlaybackWasPCM
? StreamingAudioPlayer.shared.stop()
: PCMStreamingAudioPlayer.shared.stop()
return primary
}
await TalkSystemSpeechSynthesizer.shared.stop() await TalkSystemSpeechSynthesizer.shared.stop()
guard self.phase == .speaking else { return } guard self.phase == .speaking else { return }
if reason == .speech, let interruptedAt { if reason == .speech, let interruptedAt {
@@ -707,7 +723,8 @@ actor TalkModeRuntime {
guard !key.isEmpty, !value.isEmpty else { return } guard !key.isEmpty, !value.isEmpty else { return }
acc[key] = value acc[key] = value
} ?? [:] } ?? [:]
let model = talk?["modelId"]?.stringValue let model = talk?["modelId"]?.stringValue?.trimmingCharacters(in: .whitespacesAndNewlines)
let resolvedModel = (model?.isEmpty == false) ? model! : Self.defaultModelIdFallback
let outputFormat = talk?["outputFormat"]?.stringValue let outputFormat = talk?["outputFormat"]?.stringValue
let interrupt = talk?["interruptOnSpeech"]?.boolValue let interrupt = talk?["interruptOnSpeech"]?.boolValue
let apiKey = talk?["apiKey"]?.stringValue let apiKey = talk?["apiKey"]?.stringValue
@@ -721,7 +738,7 @@ actor TalkModeRuntime {
return TalkRuntimeConfig( return TalkRuntimeConfig(
voiceId: resolvedVoice, voiceId: resolvedVoice,
voiceAliases: resolvedAliases, voiceAliases: resolvedAliases,
modelId: model, modelId: resolvedModel,
outputFormat: outputFormat, outputFormat: outputFormat,
interruptOnSpeech: interrupt ?? true, interruptOnSpeech: interrupt ?? true,
apiKey: resolvedApiKey) apiKey: resolvedApiKey)
@@ -733,7 +750,7 @@ actor TalkModeRuntime {
return TalkRuntimeConfig( return TalkRuntimeConfig(
voiceId: resolvedVoice, voiceId: resolvedVoice,
voiceAliases: [:], voiceAliases: [:],
modelId: nil, modelId: Self.defaultModelIdFallback,
outputFormat: nil, outputFormat: nil,
interruptOnSpeech: true, interruptOnSpeech: true,
apiKey: resolvedApiKey) apiKey: resolvedApiKey)

View File

@@ -22,6 +22,7 @@ public struct ElevenLabsTTSRequest: Sendable {
public var seed: UInt32? public var seed: UInt32?
public var normalize: String? public var normalize: String?
public var language: String? public var language: String?
public var latencyTier: Int?
public init( public init(
text: String, text: String,
@@ -34,7 +35,8 @@ public struct ElevenLabsTTSRequest: Sendable {
speakerBoost: Bool? = nil, speakerBoost: Bool? = nil,
seed: UInt32? = nil, seed: UInt32? = nil,
normalize: String? = nil, normalize: String? = nil,
language: String? = nil) language: String? = nil,
latencyTier: Int? = nil)
{ {
self.text = text self.text = text
self.modelId = modelId self.modelId = modelId
@@ -47,6 +49,7 @@ public struct ElevenLabsTTSRequest: Sendable {
self.seed = seed self.seed = seed
self.normalize = normalize self.normalize = normalize
self.language = language self.language = language
self.latencyTier = latencyTier
} }
} }
@@ -155,6 +158,72 @@ public struct ElevenLabsTTSClient: Sendable {
]) ])
} }
/// Streams synthesized speech for `request` from the ElevenLabs streaming TTS endpoint.
///
/// Returns an `AsyncThrowingStream` of raw audio chunks (roughly 8 KiB each) so the
/// caller can begin playback before the full response has arrived. The underlying
/// URLSession task is cancelled when the stream's consumer terminates early.
///
/// - Parameters:
///   - voiceId: ElevenLabs voice identifier, interpolated into the endpoint path.
///   - request: Synthesis parameters; `latencyTier` is forwarded as a query item.
/// - Throws (via the stream): `NSError` in domain "ElevenLabsTTS" for non-HTTP
///   responses (code 1), HTTP errors >= 400 (code = status), or non-audio
///   content types (code 415).
public func streamSynthesize(
voiceId: String,
request: ElevenLabsTTSRequest) -> AsyncThrowingStream<Data, Error>
{
AsyncThrowingStream { continuation in
let task = Task {
do {
let url = Self.streamingURL(
baseUrl: self.baseUrl,
voiceId: voiceId,
latencyTier: request.latencyTier)
let body = try JSONSerialization.data(withJSONObject: Self.buildPayload(request), options: [])
var req = URLRequest(url: url)
req.httpMethod = "POST"
req.httpBody = body
req.timeoutInterval = self.requestTimeoutSeconds
req.setValue("application/json", forHTTPHeaderField: "Content-Type")
// NOTE(review): Accept is hardcoded to audio/mpeg even though PCM output
// formats are now requested upstream — confirm the API keys off
// output_format in the body and ignores Accept for PCM responses.
req.setValue("audio/mpeg", forHTTPHeaderField: "Accept")
req.setValue(self.apiKey, forHTTPHeaderField: "xi-api-key")
let (bytes, response) = try await URLSession.shared.bytes(for: req)
guard let http = response as? HTTPURLResponse else {
throw NSError(domain: "ElevenLabsTTS", code: 1, userInfo: [
NSLocalizedDescriptionKey: "ElevenLabs invalid response",
])
}
// Content type is included in error messages to aid debugging.
let contentType = (http.value(forHTTPHeaderField: "Content-Type") ?? "unknown").lowercased()
if http.statusCode >= 400 {
// Drain a bounded amount of the error body for the message.
let message = try await Self.readErrorBody(bytes: bytes)
throw NSError(domain: "ElevenLabsTTS", code: http.statusCode, userInfo: [
NSLocalizedDescriptionKey: "ElevenLabs failed: \(http.statusCode) ct=\(contentType) \(message)",
])
}
if !contentType.contains("audio") {
// NOTE(review): verify the streaming endpoint labels PCM responses with an
// "audio/*" content type; otherwise valid PCM would be rejected here.
let message = try await Self.readErrorBody(bytes: bytes)
throw NSError(domain: "ElevenLabsTTS", code: 415, userInfo: [
NSLocalizedDescriptionKey: "ElevenLabs returned non-audio ct=\(contentType) \(message)",
])
}
// Accumulate bytes and hand them to the consumer in ~8 KiB chunks so the
// player can start before the download completes.
var buffer = Data()
buffer.reserveCapacity(16_384)
for try await byte in bytes {
buffer.append(byte)
if buffer.count >= 8_192 {
continuation.yield(buffer)
buffer.removeAll(keepingCapacity: true)
}
}
// Flush whatever remains after the response body ends.
if !buffer.isEmpty {
continuation.yield(buffer)
}
continuation.finish()
} catch {
continuation.finish(throwing: error)
}
}
// Cancel the network task if the consumer stops iterating the stream.
continuation.onTermination = { _ in
task.cancel()
}
}
}
public func listVoices() async throws -> [ElevenLabsVoice] { public func listVoices() async throws -> [ElevenLabsVoice] {
var url = self.baseUrl var url = self.baseUrl
url.appendPathComponent("v1") url.appendPathComponent("v1")
@@ -180,7 +249,7 @@ public struct ElevenLabsTTSClient: Sendable {
public static func validatedOutputFormat(_ value: String?) -> String? { public static func validatedOutputFormat(_ value: String?) -> String? {
let trimmed = (value ?? "").trimmingCharacters(in: .whitespacesAndNewlines) let trimmed = (value ?? "").trimmingCharacters(in: .whitespacesAndNewlines)
guard !trimmed.isEmpty else { return nil } guard !trimmed.isEmpty else { return nil }
guard trimmed.hasPrefix("mp3_") else { return nil } guard trimmed.hasPrefix("mp3_") || trimmed.hasPrefix("pcm_") else { return nil }
return trimmed return trimmed
} }
@@ -230,4 +299,33 @@ public struct ElevenLabsTTSClient: Sendable {
let raw = String(data: data.prefix(4096), encoding: .utf8) ?? "unknown" let raw = String(data: data.prefix(4096), encoding: .utf8) ?? "unknown"
return raw.replacingOccurrences(of: "\n", with: " ").replacingOccurrences(of: "\r", with: " ") return raw.replacingOccurrences(of: "\n", with: " ").replacingOccurrences(of: "\r", with: " ")
} }
/// Builds the streaming TTS endpoint URL:
/// `{base}/v1/text-to-speech/{voiceId}/stream`, with an
/// `optimize_streaming_latency` query item appended when a tier is provided.
/// Falls back to the plain path URL whenever query assembly fails.
private static func streamingURL(baseUrl: URL, voiceId: String, latencyTier: Int?) -> URL {
    var endpoint = baseUrl
    for piece in ["v1", "text-to-speech", voiceId, "stream"] {
        endpoint.appendPathComponent(piece)
    }
    guard let latencyTier,
          var parts = URLComponents(url: endpoint, resolvingAgainstBaseURL: false)
    else { return endpoint }
    parts.queryItems = (parts.queryItems ?? []) + [
        URLQueryItem(name: "optimize_streaming_latency", value: String(latencyTier)),
    ]
    return parts.url ?? endpoint
}
/// Drains at most 4 KiB of an error response body and returns it formatted
/// through `truncatedErrorBody` for inclusion in thrown error messages.
private static func readErrorBody(bytes: URLSession.AsyncBytes) async throws -> String {
    let limit = 4096
    var collected = Data()
    for try await byte in bytes {
        collected.append(byte)
        guard collected.count < limit else { break }
    }
    return truncatedErrorBody(collected)
}
} }

View File

@@ -0,0 +1,144 @@
import AVFoundation
import Foundation
import OSLog
/// Plays a stream of raw 16-bit mono PCM chunks through AVAudioEngine as they
/// arrive, for low-latency ElevenLabs `pcm_*` playback. All state is confined
/// to the main actor; the decode stream is consumed on a detached task that
/// hops back via `await`.
@MainActor
public final class PCMStreamingAudioPlayer {
public static let shared = PCMStreamingAudioPlayer()
private let logger = Logger(subsystem: "com.steipete.clawdis", category: "talk.tts.pcm")
private var engine = AVAudioEngine()
private var player = AVAudioPlayerNode()
// Format of the current/most recent session; used to detect when the
// engine graph must be rebuilt for a different sample rate.
private var format: AVAudioFormat?
// Count of scheduled-but-not-yet-completed AVAudioPCMBuffers.
private var pendingBuffers: Int = 0
// Set once the input stream has ended; playback finishes when this is true
// and pendingBuffers drains to zero.
private var inputFinished = false
// Resumed exactly once per play() call via finish(_:).
private var continuation: CheckedContinuation<StreamingPlaybackResult, Never>?
public init() {
self.engine.attach(self.player)
}
/// Plays `stream` as interleaved Int16 mono PCM at `sampleRate`.
/// Returns when playback finishes, fails, or is stopped.
public func play(stream: AsyncThrowingStream<Data, Error>, sampleRate: Double) async -> StreamingPlaybackResult {
// Tear down any previous session before starting a new one.
self.stopInternal()
let format = AVAudioFormat(
commonFormat: .pcmFormatInt16,
sampleRate: sampleRate,
channels: 1,
interleaved: true)
guard let format else {
return StreamingPlaybackResult(finished: false, interruptedAt: nil)
}
self.configure(format: format)
return await withCheckedContinuation { continuation in
self.continuation = continuation
self.pendingBuffers = 0
self.inputFinished = false
// Consume the network stream off the main actor; each enqueue hops back.
Task.detached { [weak self] in
guard let self else { return }
do {
for try await chunk in stream {
await self.enqueuePCM(chunk, format: format)
}
await self.finishInput()
} catch {
await self.fail(error)
}
}
}
}
/// Stops playback immediately and resolves the pending play() with
/// finished=false. Returns the interruption offset in seconds, if known.
public func stop() -> Double? {
// Capture the position before stopping, since stopping resets the node.
let interruptedAt = self.currentTimeSeconds()
self.stopInternal()
self.finish(StreamingPlaybackResult(finished: false, interruptedAt: interruptedAt))
return interruptedAt
}
/// (Re)connects the player node for `format`, rebuilding the engine when the
/// sample rate or sample format differs from the previous session.
private func configure(format: AVAudioFormat) {
if self.format?.sampleRate != format.sampleRate || self.format?.commonFormat != format.commonFormat {
self.engine.stop()
self.engine = AVAudioEngine()
self.player = AVAudioPlayerNode()
self.engine.attach(self.player)
}
self.format = format
// Defensive: only connect when the node is attached (always true after
// init or the rebuild above).
if self.engine.attachedNodes.contains(self.player) {
self.engine.connect(self.player, to: self.engine.mainMixerNode, format: format)
}
}
/// Copies one chunk of interleaved Int16 samples into an AVAudioPCMBuffer
/// and schedules it; starts the engine on the first buffer.
private func enqueuePCM(_ data: Data, format: AVAudioFormat) async {
guard !data.isEmpty else { return }
// NOTE(review): integer division drops a trailing odd byte; if a sample can
// straddle chunk boundaries upstream, a carry-over byte buffer is needed —
// confirm chunk sizes are always even.
let frameCount = data.count / MemoryLayout<Int16>.size
guard frameCount > 0 else { return }
guard let buffer = AVAudioPCMBuffer(pcmFormat: format, frameCapacity: AVAudioFrameCount(frameCount)) else {
return
}
buffer.frameLength = AVAudioFrameCount(frameCount)
data.withUnsafeBytes { raw in
guard let src = raw.baseAddress else { return }
let audioBuffer = buffer.audioBufferList.pointee.mBuffers
if let dst = audioBuffer.mData {
memcpy(dst, src, frameCount * MemoryLayout<Int16>.size)
}
}
self.pendingBuffers += 1
// Completion fires on an arbitrary thread; hop to the main actor to
// mutate state and detect end-of-playback.
self.player.scheduleBuffer(buffer) { [weak self] in
Task { @MainActor in
guard let self else { return }
// NOTE(review): a completion from a superseded session can decrement the
// new session's pendingBuffers — confirm stopInternal()'s reset makes
// this benign in practice.
self.pendingBuffers = max(0, self.pendingBuffers - 1)
if self.inputFinished && self.pendingBuffers == 0 {
self.finish(StreamingPlaybackResult(finished: true, interruptedAt: nil))
}
}
}
if !self.player.isPlaying {
do {
try self.engine.start()
self.player.play()
} catch {
self.logger.error("pcm engine start failed: \(error.localizedDescription, privacy: .public)")
self.fail(error)
}
}
}
/// Marks the input stream complete; finishes immediately if nothing is queued.
private func finishInput() {
self.inputFinished = true
if self.pendingBuffers == 0 {
self.finish(StreamingPlaybackResult(finished: true, interruptedAt: nil))
}
}
/// Logs the failure and resolves the pending play() as unfinished.
private func fail(_ error: Error) {
self.logger.error("pcm stream failed: \(error.localizedDescription, privacy: .public)")
self.finish(StreamingPlaybackResult(finished: false, interruptedAt: nil))
}
/// Stops node and engine and resets per-session counters.
private func stopInternal() {
self.player.stop()
self.engine.stop()
self.pendingBuffers = 0
self.inputFinished = false
}
/// Resumes the stored continuation at most once (idempotent: subsequent
/// calls see nil and do nothing).
private func finish(_ result: StreamingPlaybackResult) {
let continuation = self.continuation
self.continuation = nil
continuation?.resume(returning: result)
}
/// Current playback position in seconds, derived from the player node's
/// render time; nil when no render time is available.
private func currentTimeSeconds() -> Double? {
guard let nodeTime = self.player.lastRenderTime,
let playerTime = self.player.playerTime(forNodeTime: nodeTime)
else { return nil }
return Double(playerTime.sampleTime) / playerTime.sampleRate
}
}

View File

@@ -0,0 +1,429 @@
import AudioToolbox
import Foundation
import OSLog
/// Outcome of one streaming playback attempt.
public struct StreamingPlaybackResult: Sendable {
/// True when audio played to completion; false when playback failed or was stopped.
public let finished: Bool
/// Playback position (seconds) at the moment of interruption, when known.
public let interruptedAt: Double?
public init(finished: Bool, interruptedAt: Double?) {
self.finished = finished
self.interruptedAt = interruptedAt
}
}
/// Main-actor front end for MP3 streaming playback. Owns at most one
/// `Playback` session at a time; starting a new stream cancels the previous
/// one, and `stop()` reports the interruption offset when available.
@MainActor
public final class StreamingAudioPlayer: NSObject {
    public static let shared = StreamingAudioPlayer()

    private let logger = Logger(subsystem: "com.steipete.clawdis", category: "talk.tts.stream")

    /// The in-flight session, if any.
    private var playback: Playback?

    /// Plays an MP3 byte stream, resolving when playback completes, fails,
    /// or is stopped.
    public func play(stream: AsyncThrowingStream<Data, Error>) async -> StreamingPlaybackResult {
        self.stopInternal()
        let session = Playback(logger: self.logger)
        self.playback = session
        return await withCheckedContinuation { continuation in
            session.setContinuation(continuation)
            session.start()
            // Feed chunks off the main actor; Playback synchronizes internally.
            Task.detached {
                do {
                    for try await chunk in stream { session.append(chunk) }
                    session.finishInput()
                } catch {
                    session.fail(error)
                }
            }
        }
    }

    /// Immediately stops the current session; returns the interruption offset
    /// in seconds, or nil when nothing was playing.
    public func stop() -> Double? {
        guard let session = self.playback else { return nil }
        let interruptedAt = session.stop(immediate: true)
        self.finish(playback: session, result: StreamingPlaybackResult(finished: false, interruptedAt: interruptedAt))
        return interruptedAt
    }

    private func stopInternal() {
        guard let session = self.playback else { return }
        self.finish(
            playback: session,
            result: StreamingPlaybackResult(finished: false, interruptedAt: session.stop(immediate: true)))
    }

    /// Resolves the session's continuation and clears the reference, but only
    /// when it is still the active session (identity check guards races with
    /// a newer play()).
    private func finish(playback session: Playback, result: StreamingPlaybackResult) {
        session.finish(result)
        if self.playback === session { self.playback = nil }
    }
}
/// One MP3 streaming playback session built on AudioFileStream (parsing) and
/// AudioQueue (output). Bytes are parsed on `parseQueue`; parsed packets are
/// copied into a small pool of reusable AudioQueue buffers. `bufferSemaphore`
/// provides backpressure: parsing blocks when all buffers are in flight.
/// Marked @unchecked Sendable because synchronization is manual (locks,
/// serial queue, semaphore) rather than compiler-checked.
private final class Playback: @unchecked Sendable {
private static let bufferCount: Int = 3
private static let bufferSize: Int = 32 * 1024
private let logger: Logger
// Guards `continuation` / `finished` (single-resume invariant).
private let lock = NSLock()
// Serializes all AudioFileStreamParseBytes calls and end-of-input handling.
private let parseQueue = DispatchQueue(label: "talk.stream.parse")
// Guards `availableBuffers`; fileprivate because the C callbacks below use it.
fileprivate let bufferLock = NSLock()
// Counts free buffers; wait() blocks the parser until the queue returns one.
fileprivate let bufferSemaphore = DispatchSemaphore(value: bufferCount)
private var continuation: CheckedContinuation<StreamingPlaybackResult, Never>?
private var finished = false
private var audioFileStream: AudioFileStreamID?
private var audioQueue: AudioQueueRef?
fileprivate var audioFormat: AudioStreamBasicDescription?
fileprivate var maxPacketSize: UInt32 = 0
fileprivate var availableBuffers: [AudioQueueBufferRef] = []
// Buffer currently being filled with packets, plus its fill state.
private var currentBuffer: AudioQueueBufferRef?
private var currentBufferSize: Int = 0
private var currentPacketDescs: [AudioStreamPacketDescription] = []
private var isRunning = false
fileprivate var inputFinished = false
// Ensures AudioQueueStart is issued exactly once, on the first enqueue.
private var startRequested = false
// Captured from the parsed format; used to convert sample time to seconds.
private var sampleRate: Double = 0
init(logger: Logger) {
self.logger = logger
}
/// Stores the continuation that finish(_:) will resume exactly once.
func setContinuation(_ continuation: CheckedContinuation<StreamingPlaybackResult, Never>) {
self.lock.lock()
self.continuation = continuation
self.lock.unlock()
}
/// Opens the MP3 file-stream parser, registering `self` (unretained) as the
/// client for the C callbacks. Fails the session if the parser can't open.
func start() {
// Unretained is safe only while this object outlives the parser; it is
// closed in teardown() before the last reference can go away.
let selfPtr = Unmanaged.passUnretained(self).toOpaque()
let status = AudioFileStreamOpen(
selfPtr,
propertyListenerProc,
packetsProc,
kAudioFileMP3Type,
&self.audioFileStream)
if status != noErr {
self.logger.error("talk stream open failed: \(status)")
self.finish(StreamingPlaybackResult(finished: false, interruptedAt: nil))
}
}
/// Feeds one network chunk to the parser (async on parseQueue; may block
/// there on buffer backpressure inside the packet callback).
func append(_ data: Data) {
guard !data.isEmpty else { return }
self.parseQueue.async { [weak self] in
guard let self else { return }
guard let audioFileStream = self.audioFileStream else { return }
let status = data.withUnsafeBytes { bytes in
AudioFileStreamParseBytes(
audioFileStream,
UInt32(bytes.count),
bytes.baseAddress,
[])
}
if status != noErr {
self.logger.error("talk stream parse failed: \(status)")
self.fail(NSError(domain: "StreamingAudio", code: Int(status)))
}
}
}
/// Signals end of input: flushes the partially filled buffer and asks the
/// queue to stop after draining (non-immediate stop). If no queue was ever
/// created (e.g. no parsable audio), resolves as unfinished.
func finishInput() {
self.parseQueue.async { [weak self] in
guard let self else { return }
self.inputFinished = true
if self.audioQueue == nil {
self.finish(StreamingPlaybackResult(finished: false, interruptedAt: nil))
return
}
self.enqueueCurrentBuffer(flushOnly: true)
self.stop(immediate: false)
}
}
/// Logs, stops immediately, and resolves the session as unfinished.
func fail(_ error: Error) {
self.logger.error("talk stream failed: \(error.localizedDescription, privacy: .public)")
_ = self.stop(immediate: true)
self.finish(StreamingPlaybackResult(finished: false, interruptedAt: nil))
}
/// Stops the audio queue. Returns the playback offset (seconds) captured
/// just before stopping, or nil when no queue exists / time is unavailable.
func stop(immediate: Bool) -> Double? {
guard let audioQueue else { return nil }
let interruptedAt = self.currentTimeSeconds()
AudioQueueStop(audioQueue, immediate)
return interruptedAt
}
/// Resumes the continuation at most once (guarded by `lock`/`finished`) and
/// tears down all Core Audio resources.
fileprivate func finish(_ result: StreamingPlaybackResult) {
let continuation: CheckedContinuation<StreamingPlaybackResult, Never>?
self.lock.lock()
if self.finished {
continuation = nil
} else {
self.finished = true
continuation = self.continuation
self.continuation = nil
}
self.lock.unlock()
continuation?.resume(returning: result)
self.teardown()
}
/// Disposes the queue (which frees its buffers), closes the parser, and
/// clears buffer bookkeeping.
private func teardown() {
if let audioQueue {
AudioQueueDispose(audioQueue, true)
self.audioQueue = nil
}
if let audioFileStream {
AudioFileStreamClose(audioFileStream)
self.audioFileStream = nil
}
self.bufferLock.lock()
self.availableBuffers.removeAll()
self.bufferLock.unlock()
self.currentBuffer = nil
self.currentPacketDescs.removeAll()
}
/// Lazily creates the output AudioQueue from the parsed stream format:
/// registers the IsRunning listener, copies the magic cookie (if any) from
/// parser to queue, and allocates the reusable buffer pool.
fileprivate func setupQueueIfNeeded(_ asbd: AudioStreamBasicDescription) {
guard self.audioQueue == nil else { return }
var format = asbd
self.audioFormat = format
self.sampleRate = format.mSampleRate
let selfPtr = Unmanaged.passUnretained(self).toOpaque()
let status = AudioQueueNewOutput(
&format,
outputCallbackProc,
selfPtr,
nil,
nil,
0,
&self.audioQueue)
if status != noErr {
self.logger.error("talk queue create failed: \(status)")
self.finish(StreamingPlaybackResult(finished: false, interruptedAt: nil))
return
}
if let audioQueue {
AudioQueueAddPropertyListener(audioQueue, kAudioQueueProperty_IsRunning, isRunningCallbackProc, selfPtr)
}
if let audioFileStream {
// Forward codec private data ("magic cookie") when the stream has one.
var cookieSize: UInt32 = 0
var writable: DarwinBoolean = false
let cookieStatus = AudioFileStreamGetPropertyInfo(
audioFileStream,
kAudioFileStreamProperty_MagicCookieData,
&cookieSize,
&writable)
if cookieStatus == noErr, cookieSize > 0, let audioQueue {
var cookie = [UInt8](repeating: 0, count: Int(cookieSize))
let readStatus = AudioFileStreamGetProperty(
audioFileStream,
kAudioFileStreamProperty_MagicCookieData,
&cookieSize,
&cookie)
if readStatus == noErr {
AudioQueueSetProperty(audioQueue, kAudioQueueProperty_MagicCookie, cookie, cookieSize)
}
}
}
if let audioQueue {
for _ in 0..<Self.bufferCount {
var buffer: AudioQueueBufferRef?
let allocStatus = AudioQueueAllocateBuffer(audioQueue, UInt32(Self.bufferSize), &buffer)
if allocStatus == noErr, let buffer {
self.bufferLock.lock()
self.availableBuffers.append(buffer)
self.bufferLock.unlock()
}
}
}
}
/// Submits the currently filled buffer to the queue (starting playback on
/// the first enqueue), then — unless flushing — blocks until a free buffer
/// is available and makes it current.
private func enqueueCurrentBuffer(flushOnly: Bool = false) {
guard let audioQueue, let buffer = self.currentBuffer else { return }
guard self.currentBufferSize > 0 else { return }
buffer.pointee.mAudioDataByteSize = UInt32(self.currentBufferSize)
let packetCount = UInt32(self.currentPacketDescs.count)
let status = self.currentPacketDescs.withUnsafeBufferPointer { descPtr in
AudioQueueEnqueueBuffer(audioQueue, buffer, packetCount, descPtr.baseAddress)
}
if status != noErr {
self.logger.error("talk queue enqueue failed: \(status)")
} else {
if !self.startRequested {
self.startRequested = true
let startStatus = AudioQueueStart(audioQueue, nil)
if startStatus != noErr {
self.logger.error("talk queue start failed: \(startStatus)")
}
}
}
self.currentBuffer = nil
self.currentBufferSize = 0
self.currentPacketDescs.removeAll(keepingCapacity: true)
if !flushOnly {
// Backpressure: wait for the output callback to return a buffer.
// NOTE(review): if the queue is stopped while the parser waits here, the
// wait relies on teardown's dispose returning buffers — confirm no
// deadlock path on early stop.
self.bufferSemaphore.wait()
self.bufferLock.lock()
let next = self.availableBuffers.popLast()
self.bufferLock.unlock()
if let next { self.currentBuffer = next }
}
}
/// Packet callback body (invoked by packetsProc on the parse queue):
/// copies parsed packets into the current buffer, enqueueing and rotating
/// buffers as they fill. Packets larger than a whole buffer are skipped.
fileprivate func handlePackets(
numberBytes: UInt32,
numberPackets: UInt32,
inputData: UnsafeRawPointer,
packetDescriptions: UnsafeMutablePointer<AudioStreamPacketDescription>?)
{
// First packets may arrive before the property listener created the queue.
if self.audioQueue == nil, let format = self.audioFormat {
self.setupQueueIfNeeded(format)
}
if self.audioQueue == nil {
return
}
if self.currentBuffer == nil {
self.bufferSemaphore.wait()
self.bufferLock.lock()
self.currentBuffer = self.availableBuffers.popLast()
self.bufferLock.unlock()
self.currentBufferSize = 0
self.currentPacketDescs.removeAll(keepingCapacity: true)
}
let bytes = inputData.assumingMemoryBound(to: UInt8.self)
let packetCount = Int(numberPackets)
for index in 0..<packetCount {
let packetOffset: Int
let packetSize: Int
if let packetDescriptions {
packetOffset = Int(packetDescriptions[index].mStartOffset)
packetSize = Int(packetDescriptions[index].mDataByteSize)
} else {
// No descriptions: assume constant-size packets laid out contiguously.
let size = Int(numberBytes) / packetCount
packetOffset = index * size
packetSize = size
}
if packetSize > Self.bufferSize {
continue
}
if self.currentBufferSize + packetSize > Self.bufferSize {
self.enqueueCurrentBuffer()
}
guard let buffer = self.currentBuffer else { continue }
let dest = buffer.pointee.mAudioData.advanced(by: self.currentBufferSize)
memcpy(dest, bytes.advanced(by: packetOffset), packetSize)
// Offsets in the description are relative to the destination buffer.
let desc = AudioStreamPacketDescription(
mStartOffset: Int64(self.currentBufferSize),
mVariableFramesInPacket: 0,
mDataByteSize: UInt32(packetSize))
self.currentPacketDescs.append(desc)
self.currentBufferSize += packetSize
}
}
/// Current queue playback position in seconds (sample time / sample rate);
/// nil before the format is known or when the queue can't report time.
private func currentTimeSeconds() -> Double? {
guard let audioQueue, sampleRate > 0 else { return nil }
var timeStamp = AudioTimeStamp()
let status = AudioQueueGetCurrentTime(audioQueue, nil, &timeStamp, nil)
if status != noErr { return nil }
if timeStamp.mSampleTime.isNaN { return nil }
return timeStamp.mSampleTime / sampleRate
}
}
/// AudioFileStream property callback: captures the parsed data format (and
/// creates the output queue) and the packet-size upper bound.
/// `inClientData` round-trips the unretained Playback pointer registered in
/// Playback.start().
private func propertyListenerProc(
inClientData: UnsafeMutableRawPointer,
inAudioFileStream: AudioFileStreamID,
inPropertyID: AudioFileStreamPropertyID,
ioFlags: UnsafeMutablePointer<AudioFileStreamPropertyFlags>)
{
let playback = Unmanaged<Playback>.fromOpaque(inClientData).takeUnretainedValue()
if inPropertyID == kAudioFileStreamProperty_DataFormat {
var format = AudioStreamBasicDescription()
var size = UInt32(MemoryLayout<AudioStreamBasicDescription>.size)
let status = AudioFileStreamGetProperty(inAudioFileStream, inPropertyID, &size, &format)
if status == noErr {
playback.audioFormat = format
playback.setupQueueIfNeeded(format)
}
} else if inPropertyID == kAudioFileStreamProperty_PacketSizeUpperBound {
var maxPacketSize: UInt32 = 0
var size = UInt32(MemoryLayout<UInt32>.size)
let status = AudioFileStreamGetProperty(inAudioFileStream, inPropertyID, &size, &maxPacketSize)
if status == noErr {
playback.maxPacketSize = maxPacketSize
}
}
}
/// AudioFileStream packets callback: forwards parsed audio packets to the
/// owning Playback's handlePackets for buffering and enqueueing.
private func packetsProc(
inClientData: UnsafeMutableRawPointer,
inNumberBytes: UInt32,
inNumberPackets: UInt32,
inInputData: UnsafeRawPointer,
inPacketDescriptions: UnsafeMutablePointer<AudioStreamPacketDescription>?)
{
let playback = Unmanaged<Playback>.fromOpaque(inClientData).takeUnretainedValue()
playback.handlePackets(
numberBytes: inNumberBytes,
numberPackets: inNumberPackets,
inputData: inInputData,
packetDescriptions: inPacketDescriptions)
}
/// AudioQueue output callback: the queue has finished with `inBuffer`, so
/// return it to the free pool and signal the semaphore to unblock the parser
/// waiting for a buffer.
private func outputCallbackProc(
inUserData: UnsafeMutableRawPointer?,
inAQ: AudioQueueRef,
inBuffer: AudioQueueBufferRef)
{
guard let inUserData else { return }
let playback = Unmanaged<Playback>.fromOpaque(inUserData).takeUnretainedValue()
playback.bufferLock.lock()
playback.availableBuffers.append(inBuffer)
playback.bufferLock.unlock()
playback.bufferSemaphore.signal()
}
/// AudioQueue IsRunning listener: when the queue transitions to not-running
/// after all input has been parsed, resolve the playback as finished.
private func isRunningCallbackProc(
inUserData: UnsafeMutableRawPointer?,
inAQ: AudioQueueRef,
inID: AudioQueuePropertyID)
{
guard let inUserData else { return }
guard inID == kAudioQueueProperty_IsRunning else { return }
let playback = Unmanaged<Playback>.fromOpaque(inUserData).takeUnretainedValue()
var running: UInt32 = 0
var size = UInt32(MemoryLayout<UInt32>.size)
let status = AudioQueueGetProperty(inAQ, kAudioQueueProperty_IsRunning, &running, &size)
if status != noErr { return }
// A not-running queue before input finished is a start/stop transient, not
// completion, so it is ignored.
if running == 0, playback.inputFinished {
playback.finish(StreamingPlaybackResult(finished: true, interruptedAt: nil))
}
}

View File

@@ -1,4 +1,6 @@
public enum TalkTTSValidation: Sendable { public enum TalkTTSValidation: Sendable {
private static let v3StabilityValues: Set<Double> = [0.0, 0.5, 1.0]
public static func resolveSpeed(speed: Double?, rateWPM: Int?) -> Double? { public static func resolveSpeed(speed: Double?, rateWPM: Int?) -> Double? {
if let rateWPM, rateWPM > 0 { if let rateWPM, rateWPM > 0 {
let resolved = Double(rateWPM) / 175.0 let resolved = Double(rateWPM) / 175.0
@@ -18,10 +20,32 @@ public enum TalkTTSValidation: Sendable {
return value return value
} }
/// Validates a stability directive against the target model's constraints.
/// `eleven_v3` only accepts the discrete values 0.0 / 0.5 / 1.0; every other
/// model accepts any value in the closed unit interval. Returns nil when the
/// value is absent or invalid for the model.
public static func validatedStability(_ value: Double?, modelId: String?) -> Double? {
    guard let stability = value else { return nil }
    let model = (modelId ?? "")
        .trimmingCharacters(in: .whitespacesAndNewlines)
        .lowercased()
    guard model == "eleven_v3" else { return validatedUnit(stability) }
    return v3StabilityValues.contains(stability) ? stability : nil
}
public static func validatedSeed(_ value: Int?) -> UInt32? { public static func validatedSeed(_ value: Int?) -> UInt32? {
guard let value else { return nil } guard let value else { return nil }
if value < 0 || value > 4294967295 { return nil } if value < 0 || value > 4294967295 { return nil }
return UInt32(value) return UInt32(value)
} }
}
/// Validates the ElevenLabs `optimize_streaming_latency` tier: only integers
/// 0 through 4 are accepted; anything else (including nil) yields nil.
public static func validatedLatencyTier(_ value: Int?) -> Int? {
    guard let tier = value, (0...4).contains(tier) else { return nil }
    return tier
}
/// Extracts the sample rate from an ElevenLabs PCM output-format token
/// (e.g. "pcm_44100" -> 44100). Returns nil for non-PCM formats or when the
/// suffix is not a positive number.
public static func pcmSampleRate(from outputFormat: String?) -> Double? {
    let token = (outputFormat ?? "")
        .trimmingCharacters(in: .whitespacesAndNewlines)
        .lowercased()
    guard token.hasPrefix("pcm_") else { return nil }
    let suffix = token.dropFirst("pcm_".count)
    guard let rate = Double(suffix), rate > 0 else { return nil }
    return rate
}
}

View File

@@ -16,9 +16,30 @@ final class TalkTTSValidationTests: XCTestCase {
XCTAssertNil(TalkTTSValidation.validatedUnit(1.01)) XCTAssertNil(TalkTTSValidation.validatedUnit(1.01))
} }
// eleven_v3 accepts only the discrete stability values 0.0 / 0.5 / 1.0;
// other models accept any value in 0...1.
func testValidatedStability() {
XCTAssertEqual(TalkTTSValidation.validatedStability(0, modelId: "eleven_v3"), 0)
XCTAssertEqual(TalkTTSValidation.validatedStability(0.5, modelId: "eleven_v3"), 0.5)
XCTAssertEqual(TalkTTSValidation.validatedStability(1, modelId: "eleven_v3"), 1)
// 0.7 is rejected for v3 but passes the generic unit-interval check.
XCTAssertNil(TalkTTSValidation.validatedStability(0.7, modelId: "eleven_v3"))
XCTAssertEqual(TalkTTSValidation.validatedStability(0.7, modelId: "eleven_multilingual_v2"), 0.7)
}
func testValidatedSeedBounds() { func testValidatedSeedBounds() {
XCTAssertEqual(TalkTTSValidation.validatedSeed(0), 0) XCTAssertEqual(TalkTTSValidation.validatedSeed(0), 0)
XCTAssertEqual(TalkTTSValidation.validatedSeed(1234), 1234) XCTAssertEqual(TalkTTSValidation.validatedSeed(1234), 1234)
XCTAssertNil(TalkTTSValidation.validatedSeed(-1)) XCTAssertNil(TalkTTSValidation.validatedSeed(-1))
} }
// Latency tier is valid only in the inclusive range 0...4.
func testValidatedLatencyTier() {
XCTAssertEqual(TalkTTSValidation.validatedLatencyTier(0), 0)
XCTAssertEqual(TalkTTSValidation.validatedLatencyTier(4), 4)
XCTAssertNil(TalkTTSValidation.validatedLatencyTier(-1))
XCTAssertNil(TalkTTSValidation.validatedLatencyTier(5))
}
// Only "pcm_<rate>" tokens yield a sample rate; MP3 formats and malformed
// suffixes return nil.
func testPcmSampleRateParse() {
XCTAssertEqual(TalkTTSValidation.pcmSampleRate(from: "pcm_44100"), 44100)
XCTAssertNil(TalkTTSValidation.pcmSampleRate(from: "mp3_44100_128"))
XCTAssertNil(TalkTTSValidation.pcmSampleRate(from: "pcm_bad"))
}
} }

View File

@@ -10,7 +10,7 @@ Talk mode is a continuous voice conversation loop:
1) Listen for speech 1) Listen for speech
2) Send transcript to the model (main session, chat.send) 2) Send transcript to the model (main session, chat.send)
3) Wait for the response 3) Wait for the response
4) Speak it via ElevenLabs 4) Speak it via ElevenLabs (streaming playback)
## Behavior (macOS) ## Behavior (macOS)
- **Always-on overlay** while Talk mode is enabled. - **Always-on overlay** while Talk mode is enabled.
@@ -55,8 +55,10 @@ Supported keys:
Defaults: Defaults:
- `interruptOnSpeech`: true - `interruptOnSpeech`: true
- `voiceId`: falls back to `ELEVENLABS_VOICE_ID` / `SAG_VOICE_ID` - `voiceId`: falls back to `ELEVENLABS_VOICE_ID` / `SAG_VOICE_ID` (or first ElevenLabs voice when API key is available)
- `modelId`: defaults to `eleven_v3` when unset
- `apiKey`: falls back to `ELEVENLABS_API_KEY` (or gateway shell profile if available) - `apiKey`: falls back to `ELEVENLABS_API_KEY` (or gateway shell profile if available)
- `outputFormat`: defaults to `pcm_44100` on macOS/iOS for faster streaming playback (Android stays on MP3)
## macOS UI ## macOS UI
- Menu bar toggle: **Talk** - Menu bar toggle: **Talk**
@@ -71,4 +73,6 @@ Defaults:
## Notes ## Notes
- Requires Speech + Microphone permissions. - Requires Speech + Microphone permissions.
- Uses `chat.send` against session key `main`. - Uses `chat.send` against session key `main`.
- TTS uses ElevenLabs API with `ELEVENLABS_API_KEY`. - TTS uses ElevenLabs streaming API with `ELEVENLABS_API_KEY` and incremental playback on macOS/iOS/Android for lower latency.
- `stability` for `eleven_v3` is validated to `0.0`, `0.5`, or `1.0`; other models accept `0..1`.
- `latency_tier` is validated to `0..4` when set.