fix: stream elevenlabs tts playback

Peter Steinberger
2025-12-30 12:17:40 +01:00
parent 9c532eac07
commit 27adfb76fa
11 changed files with 1091 additions and 91 deletions

View File

@@ -17,6 +17,9 @@
- macOS Talk Mode: throttle audio-level updates (avoid per-buffer task creation) to reduce CPU/task churn.
- macOS Talk Mode: increase overlay window size so wave rings don't clip; the close button is hover-only and closer to the orb.
- Talk Mode: fall back to system TTS when ElevenLabs is unavailable, returns non-audio, or playback fails (macOS/iOS/Android).
- Talk Mode: stream PCM on macOS/iOS for lower latency (incremental playback); Android continues MP3 streaming.
- Talk Mode: validate ElevenLabs v3 stability and latency tier directives before sending requests.
- iOS/Android Talk Mode: auto-select the first ElevenLabs voice when none is configured.
- ElevenLabs: add retry/backoff for 429/5xx and include content-type in errors for debugging.
- Talk Mode: align to the gateway's main session key and fall back to history polling when chat events drop (prevents stuck “thinking” / missing messages).
- Talk Mode: treat history timestamps as seconds or milliseconds to avoid stale assistant picks (macOS/iOS/Android).
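
The seconds-vs-milliseconds handling in the last item is a magnitude check; a minimal sketch of the heuristic (the threshold matches the `isMessageTimestampAfter` helper visible in the Android diff below — the seconds branch is inferred from context, since the Kotlin hunk is truncated):

// Sketch of the timestamp-normalization heuristic. Values above
// 10_000_000_000 cannot be Unix seconds (≈ year 2286), so they are
// treated as milliseconds; smaller values are scaled up first.
// The else branch is an assumption based on the truncated hunk below.
func isMessageTimestampAfter(_ timestamp: Double, sinceSeconds: Double) -> Bool {
    let sinceMs = sinceSeconds * 1000
    if timestamp > 10_000_000_000 {
        return timestamp > sinceMs        // already milliseconds
    }
    return timestamp * 1000 > sinceMs     // seconds: normalize before comparing
}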

View File

@@ -0,0 +1,98 @@
package com.steipete.clawdis.node.voice
import android.media.MediaDataSource
import kotlin.math.min
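/**
 * Append-only data source for MediaPlayer: readAt blocks until the requested
 * range has arrived, the producer calls finish(), or the source is closed/failed.
 */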
internal class StreamingMediaDataSource : MediaDataSource() {
private data class Chunk(val start: Long, val data: ByteArray)
private val lock = Object()
private val chunks = ArrayList<Chunk>()
private var totalSize: Long = 0
private var closed = false
private var finished = false
private var lastReadIndex = 0
fun append(data: ByteArray) {
if (data.isEmpty()) return
synchronized(lock) {
if (closed || finished) return
val chunk = Chunk(totalSize, data)
chunks.add(chunk)
totalSize += data.size.toLong()
lock.notifyAll()
}
}
fun finish() {
synchronized(lock) {
if (closed) return
finished = true
lock.notifyAll()
}
}
fun fail() {
synchronized(lock) {
closed = true
lock.notifyAll()
}
}
override fun readAt(position: Long, buffer: ByteArray, offset: Int, size: Int): Int {
if (position < 0) return -1
synchronized(lock) {
while (!closed && !finished && position >= totalSize) {
lock.wait()
}
if (closed) return -1
if (position >= totalSize && finished) return -1
val available = (totalSize - position).toInt()
val toRead = min(size, available)
var remaining = toRead
var destOffset = offset
var pos = position
var index = findChunkIndex(pos)
while (remaining > 0 && index < chunks.size) {
val chunk = chunks[index]
val inChunkOffset = (pos - chunk.start).toInt()
if (inChunkOffset >= chunk.data.size) {
index++
continue
}
val copyLen = min(remaining, chunk.data.size - inChunkOffset)
System.arraycopy(chunk.data, inChunkOffset, buffer, destOffset, copyLen)
remaining -= copyLen
destOffset += copyLen
pos += copyLen
if (inChunkOffset + copyLen >= chunk.data.size) {
index++
}
}
return toRead - remaining
}
}
override fun getSize(): Long = -1
override fun close() {
synchronized(lock) {
closed = true
lock.notifyAll()
}
}
private fun findChunkIndex(position: Long): Int {
var index = lastReadIndex
// MediaPlayer may seek backwards (e.g. to re-read headers); restart the scan
// so we never return a chunk that starts after the requested position.
if (index >= chunks.size || position < chunks[index].start) {
index = 0
}
while (index < chunks.size) {
val chunk = chunks[index]
if (position < chunk.start + chunk.data.size) break
index++
}
lastReadIndex = index
return index
}
}

View File

@@ -18,7 +18,6 @@ import android.speech.tts.UtteranceProgressListener
import android.util.Log
import androidx.core.content.ContextCompat
import com.steipete.clawdis.node.bridge.BridgeSession
import java.io.File
import java.net.HttpURLConnection
import java.net.URL
import java.util.UUID
@@ -44,6 +43,7 @@ class TalkModeManager(
) {
companion object {
private const val tag = "TalkMode"
private const val defaultModelIdFallback = "eleven_v3"
}
private val mainHandler = Handler(Looper.getMainLooper())
@@ -81,6 +81,7 @@ class TalkModeManager(
private var defaultVoiceId: String? = null
private var currentVoiceId: String? = null
private var fallbackVoiceId: String? = null
private var defaultModelId: String? = null
private var currentModelId: String? = null
private var defaultOutputFormat: String? = null
@@ -97,7 +98,7 @@ class TalkModeManager(
private var chatSubscribedSessionKey: String? = null
private var player: MediaPlayer? = null
private var currentAudioFile: File? = null
private var streamingSource: StreamingMediaDataSource? = null
private var systemTts: TextToSpeech? = null
private var systemTtsPending: CompletableDeferred<Unit>? = null
private var systemTtsPendingId: String? = null
@@ -464,7 +465,13 @@ class TalkModeManager(
val apiKey =
apiKey?.trim()?.takeIf { it.isNotEmpty() }
?: System.getenv("ELEVENLABS_API_KEY")?.trim()
val voiceId = resolvedVoice ?: currentVoiceId ?: defaultVoiceId
val preferredVoice = resolvedVoice ?: currentVoiceId ?: defaultVoiceId
val voiceId =
if (!apiKey.isNullOrEmpty()) {
resolveVoiceId(preferredVoice, apiKey)
} else {
null
}
_statusText.value = "Speaking…"
_isSpeaking.value = true
@@ -486,24 +493,25 @@ class TalkModeManager(
} else {
_usingFallbackTts.value = false
val ttsStarted = SystemClock.elapsedRealtime()
val modelId = directive?.modelId ?: currentModelId ?: defaultModelId
val request =
ElevenLabsRequest(
text = cleaned,
modelId = directive?.modelId ?: currentModelId ?: defaultModelId,
modelId = modelId,
outputFormat =
TalkModeRuntime.validatedOutputFormat(directive?.outputFormat ?: defaultOutputFormat),
speed = TalkModeRuntime.resolveSpeed(directive?.speed, directive?.rateWpm),
stability = TalkModeRuntime.validatedUnit(directive?.stability),
stability = TalkModeRuntime.validatedStability(directive?.stability, modelId),
similarity = TalkModeRuntime.validatedUnit(directive?.similarity),
style = TalkModeRuntime.validatedUnit(directive?.style),
speakerBoost = directive?.speakerBoost,
seed = TalkModeRuntime.validatedSeed(directive?.seed),
normalize = TalkModeRuntime.validatedNormalize(directive?.normalize),
language = TalkModeRuntime.validatedLanguage(directive?.language),
latencyTier = TalkModeRuntime.validatedLatencyTier(directive?.latencyTier),
)
val audio = synthesize(voiceId = voiceId!!, apiKey = apiKey!!, request = request)
Log.d(tag, "elevenlabs ok bytes=${audio.size} durMs=${SystemClock.elapsedRealtime() - ttsStarted}")
playAudio(audio)
streamAndPlay(voiceId = voiceId!!, apiKey = apiKey!!, request = request)
Log.d(tag, "elevenlabs stream ok durMs=${SystemClock.elapsedRealtime() - ttsStarted}")
}
} catch (err: Throwable) {
Log.w(tag, "speak failed: ${err.message ?: err::class.simpleName}; falling back to system voice")
@@ -520,22 +528,28 @@ class TalkModeManager(
_isSpeaking.value = false
}
private suspend fun playAudio(data: ByteArray) {
private suspend fun streamAndPlay(voiceId: String, apiKey: String, request: ElevenLabsRequest) {
stopSpeaking(resetInterrupt = false)
val file = File.createTempFile("talk-", ".mp3", context.cacheDir)
file.writeBytes(data)
currentAudioFile = file
val dataSource = StreamingMediaDataSource()
streamingSource = dataSource
val player = MediaPlayer()
this.player = player
val prepared = CompletableDeferred<Unit>()
val finished = CompletableDeferred<Unit>()
player.setAudioAttributes(
AudioAttributes.Builder()
.setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
.setUsage(AudioAttributes.USAGE_ASSISTANT)
.build(),
)
player.setOnPreparedListener {
it.start()
prepared.complete(Unit)
}
player.setOnCompletionListener {
finished.complete(Unit)
}
@@ -544,16 +558,30 @@ class TalkModeManager(
true
}
player.setDataSource(file.absolutePath)
player.setDataSource(dataSource)
withContext(Dispatchers.Main) {
player.setOnPreparedListener { it.start() }
player.prepareAsync()
}
val fetchError = CompletableDeferred<Throwable?>()
val fetchJob =
scope.launch(Dispatchers.IO) {
try {
streamTts(voiceId = voiceId, apiKey = apiKey, request = request, sink = dataSource)
fetchError.complete(null)
} catch (err: Throwable) {
dataSource.fail()
fetchError.complete(err)
}
}
Log.d(tag, "play start")
try {
prepared.await()
finished.await()
fetchError.await()?.let { throw it }
} finally {
fetchJob.cancel()
cleanupPlayer()
}
Log.d(tag, "play done")
@@ -674,8 +702,8 @@ class TalkModeManager(
player?.stop()
player?.release()
player = null
currentAudioFile?.delete()
currentAudioFile = null
streamingSource?.close()
streamingSource = null
}
private fun shouldInterrupt(transcript: String): Boolean {
@@ -713,13 +741,15 @@ class TalkModeManager(
defaultVoiceId = voice ?: envVoice?.takeIf { it.isNotEmpty() } ?: sagVoice?.takeIf { it.isNotEmpty() }
voiceAliases = aliases
if (!voiceOverrideActive) currentVoiceId = defaultVoiceId
defaultModelId = model
defaultModelId = model ?: defaultModelIdFallback
if (!modelOverrideActive) currentModelId = defaultModelId
defaultOutputFormat = outputFormat
apiKey = key ?: envKey?.takeIf { it.isNotEmpty() }
if (interrupt != null) interruptOnSpeech = interrupt
} catch (_: Throwable) {
defaultVoiceId = envVoice?.takeIf { it.isNotEmpty() } ?: sagVoice?.takeIf { it.isNotEmpty() }
defaultModelId = defaultModelIdFallback
if (!modelOverrideActive) currentModelId = defaultModelId
apiKey = envKey?.takeIf { it.isNotEmpty() }
voiceAliases = emptyMap()
}
@@ -730,9 +760,21 @@ class TalkModeManager(
return obj["runId"].asStringOrNull()
}
private suspend fun synthesize(voiceId: String, apiKey: String, request: ElevenLabsRequest): ByteArray {
return withContext(Dispatchers.IO) {
val url = URL("https://api.elevenlabs.io/v1/text-to-speech/$voiceId")
private suspend fun streamTts(
voiceId: String,
apiKey: String,
request: ElevenLabsRequest,
sink: StreamingMediaDataSource,
) {
withContext(Dispatchers.IO) {
val baseUrl = "https://api.elevenlabs.io/v1/text-to-speech/$voiceId/stream"
val latencyTier = request.latencyTier
val url =
if (latencyTier != null) {
URL("$baseUrl?optimize_streaming_latency=$latencyTier")
} else {
URL(baseUrl)
}
val conn = url.openConnection() as HttpURLConnection
conn.requestMethod = "POST"
conn.connectTimeout = 30_000
@@ -746,13 +788,21 @@ class TalkModeManager(
conn.outputStream.use { it.write(payload.toByteArray()) }
val code = conn.responseCode
val stream = if (code >= 400) conn.errorStream else conn.inputStream
val data = stream.readBytes()
if (code >= 400) {
val message = String(data)
val message = conn.errorStream?.readBytes()?.toString(Charsets.UTF_8) ?: ""
sink.fail()
throw IllegalStateException("ElevenLabs failed: $code $message")
}
data
val buffer = ByteArray(8 * 1024)
conn.inputStream.use { input ->
while (true) {
val read = input.read(buffer)
if (read <= 0) break
sink.append(buffer.copyOf(read))
}
}
sink.finish()
}
}
@@ -794,6 +844,7 @@ class TalkModeManager(
val seed: Long?,
val normalize: String?,
val language: String?,
val latencyTier: Int?,
)
private object TalkModeRuntime {
@@ -816,6 +867,15 @@ class TalkModeManager(
return value
}
fun validatedStability(value: Double?, modelId: String?): Double? {
if (value == null) return null
val normalized = modelId?.trim()?.lowercase()
if (normalized == "eleven_v3") {
return if (value == 0.0 || value == 0.5 || value == 1.0) value else null
}
return validatedUnit(value)
}
fun validatedSeed(value: Long?): Long? {
if (value == null) return null
if (value < 0 || value > 4294967295L) return null
@@ -840,6 +900,12 @@ class TalkModeManager(
return if (trimmed.startsWith("mp3_")) trimmed else null
}
fun validatedLatencyTier(value: Int?): Int? {
if (value == null) return null
if (value < 0 || value > 4) return null
return value
}
fun isMessageTimestampAfter(timestamp: Double, sinceSeconds: Double): Boolean {
val sinceMs = sinceSeconds * 1000
return if (timestamp > 10_000_000_000) {
@@ -876,6 +942,62 @@ class TalkModeManager(
return if (isLikelyVoiceId(trimmed)) trimmed else null
}
private suspend fun resolveVoiceId(preferred: String?, apiKey: String): String? {
val trimmed = preferred?.trim().orEmpty()
if (trimmed.isNotEmpty()) {
val resolved = resolveVoiceAlias(trimmed)
if (resolved != null) return resolved
Log.w(tag, "unknown voice alias $trimmed")
}
fallbackVoiceId?.let { return it }
return try {
val voices = listVoices(apiKey)
val first = voices.firstOrNull() ?: return null
fallbackVoiceId = first.voiceId
if (defaultVoiceId.isNullOrBlank()) {
defaultVoiceId = first.voiceId
}
if (!voiceOverrideActive) {
currentVoiceId = first.voiceId
}
val name = first.name ?: "unknown"
Log.d(tag, "default voice selected $name (${first.voiceId})")
first.voiceId
} catch (err: Throwable) {
Log.w(tag, "list voices failed: ${err.message ?: err::class.simpleName}")
null
}
}
private suspend fun listVoices(apiKey: String): List<ElevenLabsVoice> {
return withContext(Dispatchers.IO) {
val url = URL("https://api.elevenlabs.io/v1/voices")
val conn = url.openConnection() as HttpURLConnection
conn.requestMethod = "GET"
conn.connectTimeout = 15_000
conn.readTimeout = 15_000
conn.setRequestProperty("xi-api-key", apiKey)
val code = conn.responseCode
val stream = if (code >= 400) conn.errorStream else conn.inputStream
val data = stream?.readBytes() ?: ByteArray(0) // errorStream can be null when the server sends no body
if (code >= 400) {
val message = data.toString(Charsets.UTF_8)
throw IllegalStateException("ElevenLabs voices failed: $code $message")
}
val root = json.parseToJsonElement(data.toString(Charsets.UTF_8)).asObjectOrNull()
val voices = (root?.get("voices") as? JsonArray) ?: JsonArray(emptyList())
voices.mapNotNull { entry ->
val obj = entry.asObjectOrNull() ?: return@mapNotNull null
val voiceId = obj["voice_id"].asStringOrNull() ?: return@mapNotNull null
val name = obj["name"].asStringOrNull()
ElevenLabsVoice(voiceId, name)
}
}
}
private fun isLikelyVoiceId(value: String): Boolean {
if (value.length < 10) return false
return value.all { it.isLetterOrDigit() || it == '-' || it == '_' }
@@ -884,6 +1006,8 @@ class TalkModeManager(
private fun normalizeAliasKey(value: String): String =
value.trim().lowercase()
private data class ElevenLabsVoice(val voiceId: String, val name: String?)
private val listener =
object : RecognitionListener {
override fun onReadyForSpeech(params: Bundle?) {

View File

@@ -9,6 +9,7 @@ import Speech
@Observable
final class TalkModeManager: NSObject {
private typealias SpeechRequest = SFSpeechAudioBufferRecognitionRequest
private static let defaultModelIdFallback = "eleven_v3"
var isEnabled: Bool = false
var isListening: Bool = false
var isSpeaking: Bool = false
@@ -36,11 +37,12 @@ final class TalkModeManager: NSObject {
private var voiceAliases: [String: String] = [:]
private var interruptOnSpeech: Bool = true
private var mainSessionKey: String = "main"
private var fallbackVoiceId: String?
private var lastPlaybackWasPCM: Bool = false
private var bridge: BridgeSession?
private let silenceWindow: TimeInterval = 0.7
private var player: AVAudioPlayer?
private var chatSubscribedSessionKeys = Set<String>()
private let logger = Logger(subsystem: "com.steipete.clawdis", category: "TalkMode")
@@ -446,43 +448,43 @@ final class TalkModeManager: NSObject {
let started = Date()
let language = ElevenLabsTTSClient.validatedLanguage(directive?.language)
let voiceId = resolvedVoice ?? self.currentVoiceId ?? self.defaultVoiceId
let resolvedKey =
(self.apiKey?.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty == false ? self.apiKey : nil) ??
ProcessInfo.processInfo.environment["ELEVENLABS_API_KEY"]
let apiKey = resolvedKey?.trimmingCharacters(in: .whitespacesAndNewlines)
let preferredVoice = resolvedVoice ?? self.currentVoiceId ?? self.defaultVoiceId
let voiceId: String? = if let apiKey, !apiKey.isEmpty {
await self.resolveVoiceId(preferred: preferredVoice, apiKey: apiKey)
} else {
nil
}
let canUseElevenLabs = (voiceId?.isEmpty == false) && (apiKey?.isEmpty == false)
if canUseElevenLabs, let voiceId, let apiKey {
let desiredOutputFormat = directive?.outputFormat ?? self.defaultOutputFormat
let desiredOutputFormat = directive?.outputFormat ?? self.defaultOutputFormat ?? "pcm_44100"
let outputFormat = ElevenLabsTTSClient.validatedOutputFormat(desiredOutputFormat)
if outputFormat == nil, let desiredOutputFormat, !desiredOutputFormat.isEmpty {
self.logger.warning(
"talk output_format unsupported for local playback: \(desiredOutputFormat, privacy: .public)")
}
let modelId = directive?.modelId ?? self.currentModelId ?? self.defaultModelId
let request = ElevenLabsTTSRequest(
text: cleaned,
modelId: directive?.modelId ?? self.currentModelId ?? self.defaultModelId,
modelId: modelId,
outputFormat: outputFormat,
speed: TalkTTSValidation.resolveSpeed(speed: directive?.speed, rateWPM: directive?.rateWPM),
stability: TalkTTSValidation.validatedUnit(directive?.stability),
stability: TalkTTSValidation.validatedStability(directive?.stability, modelId: modelId),
similarity: TalkTTSValidation.validatedUnit(directive?.similarity),
style: TalkTTSValidation.validatedUnit(directive?.style),
speakerBoost: directive?.speakerBoost,
seed: TalkTTSValidation.validatedSeed(directive?.seed),
normalize: ElevenLabsTTSClient.validatedNormalize(directive?.normalize),
language: language)
language: language,
latencyTier: TalkTTSValidation.validatedLatencyTier(directive?.latencyTier))
let synthTimeoutSeconds = max(20.0, min(90.0, Double(cleaned.count) * 0.12))
let client = ElevenLabsTTSClient(apiKey: apiKey)
let audio = try await client.synthesizeWithHardTimeout(
voiceId: voiceId,
request: request,
hardTimeoutSeconds: synthTimeoutSeconds)
self.logger
.info(
"elevenlabs ok bytes=\(audio.count, privacy: .public) dur=\(Date().timeIntervalSince(started), privacy: .public)s")
let stream = client.streamSynthesize(voiceId: voiceId, request: request)
if self.interruptOnSpeech {
do {
@@ -494,7 +496,21 @@ final class TalkModeManager: NSObject {
}
self.statusText = "Speaking…"
try await self.playAudio(data: audio)
let sampleRate = TalkTTSValidation.pcmSampleRate(from: outputFormat)
let result: StreamingPlaybackResult
if let sampleRate {
self.lastPlaybackWasPCM = true
result = await PCMStreamingAudioPlayer.shared.play(stream: stream, sampleRate: sampleRate)
} else {
self.lastPlaybackWasPCM = false
result = await StreamingAudioPlayer.shared.play(stream: stream)
}
self.logger
.info(
"elevenlabs stream finished=\(result.finished, privacy: .public) dur=\(Date().timeIntervalSince(started), privacy: .public)s")
if !result.finished, let interruptedAt = result.interruptedAt {
self.lastInterruptedAtSeconds = interruptedAt
}
} else {
self.logger.warning("tts unavailable; falling back to system voice (missing key or voiceId)")
if self.interruptOnSpeech {
@@ -533,30 +549,17 @@ final class TalkModeManager: NSObject {
self.isSpeaking = false
}
private func playAudio(data: Data) async throws {
self.player?.stop()
let player = try AVAudioPlayer(data: data)
self.player = player
player.prepareToPlay()
self.logger.info("play start")
guard player.play() else {
throw NSError(domain: "TalkMode", code: 2, userInfo: [
NSLocalizedDescriptionKey: "audio player refused to play",
])
}
while player.isPlaying {
try? await Task.sleep(nanoseconds: 120_000_000)
}
self.logger.info("play done")
}
private func stopSpeaking(storeInterruption: Bool = true) {
guard self.isSpeaking else { return }
let interruptedAt = self.lastPlaybackWasPCM
? PCMStreamingAudioPlayer.shared.stop()
: StreamingAudioPlayer.shared.stop()
if storeInterruption {
self.lastInterruptedAtSeconds = self.player?.currentTime
self.lastInterruptedAtSeconds = interruptedAt
}
self.player?.stop()
self.player = nil
_ = self.lastPlaybackWasPCM
? StreamingAudioPlayer.shared.stop()
: PCMStreamingAudioPlayer.shared.stop()
TalkSystemSpeechSynthesizer.shared.stop()
self.isSpeaking = false
}
@@ -581,6 +584,37 @@ final class TalkModeManager: NSObject {
return Self.isLikelyVoiceId(trimmed) ? trimmed : nil
}
private func resolveVoiceId(preferred: String?, apiKey: String) async -> String? {
let trimmed = preferred?.trimmingCharacters(in: .whitespacesAndNewlines) ?? ""
if !trimmed.isEmpty {
if let resolved = self.resolveVoiceAlias(trimmed) { return resolved }
self.logger.warning("unknown voice alias \(trimmed, privacy: .public)")
}
if let fallbackVoiceId { return fallbackVoiceId }
do {
let voices = try await ElevenLabsTTSClient(apiKey: apiKey).listVoices()
guard let first = voices.first else {
self.logger.warning("elevenlabs voices list empty")
return nil
}
self.fallbackVoiceId = first.voiceId
if self.defaultVoiceId == nil {
self.defaultVoiceId = first.voiceId
}
if !self.voiceOverrideActive {
self.currentVoiceId = first.voiceId
}
let name = first.name ?? "unknown"
self.logger
.info("default voice selected \(name, privacy: .public) (\(first.voiceId, privacy: .public))")
return first.voiceId
} catch {
self.logger.error("elevenlabs list voices failed: \(error.localizedDescription, privacy: .public)")
return nil
}
}
private static func isLikelyVoiceId(_ value: String) -> Bool {
guard value.count >= 10 else { return false }
return value.allSatisfy { $0.isLetter || $0.isNumber || $0 == "-" || $0 == "_" }
@@ -598,22 +632,23 @@ final class TalkModeManager: NSObject {
self.mainSessionKey = rawMainKey.isEmpty ? "main" : rawMainKey
self.defaultVoiceId = (talk?["voiceId"] as? String)?.trimmingCharacters(in: .whitespacesAndNewlines)
if let aliases = talk?["voiceAliases"] as? [String: Any] {
self.voiceAliases =
aliases.compactMap { key, value in
guard let id = value as? String else { return nil }
let normalizedKey = key.trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
let trimmedId = id.trimmingCharacters(in: .whitespacesAndNewlines)
guard !normalizedKey.isEmpty, !trimmedId.isEmpty else { return nil }
return (normalizedKey, trimmedId)
}
.reduce(into: [:]) { $0[$1.0] = $1.1 }
var resolved: [String: String] = [:]
for (key, value) in aliases {
guard let id = value as? String else { continue }
let normalizedKey = key.trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
let trimmedId = id.trimmingCharacters(in: .whitespacesAndNewlines)
guard !normalizedKey.isEmpty, !trimmedId.isEmpty else { continue }
resolved[normalizedKey] = trimmedId
}
self.voiceAliases = resolved
} else {
self.voiceAliases = [:]
}
if !self.voiceOverrideActive {
self.currentVoiceId = self.defaultVoiceId
}
self.defaultModelId = (talk?["modelId"] as? String)?.trimmingCharacters(in: .whitespacesAndNewlines)
let model = (talk?["modelId"] as? String)?.trimmingCharacters(in: .whitespacesAndNewlines)
self.defaultModelId = (model?.isEmpty == false) ? model : Self.defaultModelIdFallback
if !self.modelOverrideActive {
self.currentModelId = self.defaultModelId
}
@@ -624,7 +659,10 @@ final class TalkModeManager: NSObject {
self.interruptOnSpeech = interrupt
}
} catch {
// ignore
self.defaultModelId = Self.defaultModelIdFallback
if !self.modelOverrideActive {
self.currentModelId = self.defaultModelId
}
}
}

View File

@@ -10,6 +10,7 @@ actor TalkModeRuntime {
private let logger = Logger(subsystem: "com.steipete.clawdis", category: "talk.runtime")
private let ttsLogger = Logger(subsystem: "com.steipete.clawdis", category: "talk.tts")
private static let defaultModelIdFallback = "eleven_v3"
private final class RMSMeter: @unchecked Sendable {
private let lock = NSLock()
@@ -62,6 +63,7 @@ actor TalkModeRuntime {
private var lastSpokenText: String?
private var apiKey: String?
private var fallbackVoiceId: String?
private var lastPlaybackWasPCM: Bool = false
private let silenceWindow: TimeInterval = 0.7
private let minSpeechRMS: Double = 1e-3
@@ -496,7 +498,7 @@ actor TalkModeRuntime {
do {
if let apiKey, !apiKey.isEmpty, let voiceId {
let desiredOutputFormat = directive?.outputFormat ?? self.defaultOutputFormat
let desiredOutputFormat = directive?.outputFormat ?? self.defaultOutputFormat ?? "pcm_44100"
let outputFormat = ElevenLabsTTSClient.validatedOutputFormat(desiredOutputFormat)
if outputFormat == nil, let desiredOutputFormat, !desiredOutputFormat.isEmpty {
self.logger
@@ -504,27 +506,25 @@ actor TalkModeRuntime {
"talk output_format unsupported for local playback: \(desiredOutputFormat, privacy: .public)")
}
let modelId = directive?.modelId ?? self.currentModelId ?? self.defaultModelId
let request = ElevenLabsTTSRequest(
text: cleaned,
modelId: directive?.modelId ?? self.currentModelId ?? self.defaultModelId,
modelId: modelId,
outputFormat: outputFormat,
speed: TalkTTSValidation.resolveSpeed(speed: directive?.speed, rateWPM: directive?.rateWPM),
stability: TalkTTSValidation.validatedUnit(directive?.stability),
stability: TalkTTSValidation.validatedStability(directive?.stability, modelId: modelId),
similarity: TalkTTSValidation.validatedUnit(directive?.similarity),
style: TalkTTSValidation.validatedUnit(directive?.style),
speakerBoost: directive?.speakerBoost,
seed: TalkTTSValidation.validatedSeed(directive?.seed),
normalize: ElevenLabsTTSClient.validatedNormalize(directive?.normalize),
language: language)
language: language,
latencyTier: TalkTTSValidation.validatedLatencyTier(directive?.latencyTier))
self.ttsLogger.info("talk TTS synth timeout=\(synthTimeoutSeconds, privacy: .public)s")
let client = ElevenLabsTTSClient(apiKey: apiKey)
let audio = try await client.synthesizeWithHardTimeout(
voiceId: voiceId,
request: request,
hardTimeoutSeconds: synthTimeoutSeconds)
let stream = client.streamSynthesize(voiceId: voiceId, request: request)
guard self.isCurrent(gen) else { return }
self.ttsLogger.info("talk TTS response bytes=\(audio.count, privacy: .public)")
if self.interruptOnSpeech {
await self.startRecognition()
@@ -534,12 +534,20 @@ actor TalkModeRuntime {
await MainActor.run { TalkModeController.shared.updatePhase(.speaking) }
self.phase = .speaking
let result = await TalkAudioPlayer.shared.play(data: audio)
let sampleRate = TalkTTSValidation.pcmSampleRate(from: outputFormat)
let result: StreamingPlaybackResult
if let sampleRate {
self.lastPlaybackWasPCM = true
result = await PCMStreamingAudioPlayer.shared.play(stream: stream, sampleRate: sampleRate)
} else {
self.lastPlaybackWasPCM = false
result = await StreamingAudioPlayer.shared.play(stream: stream)
}
self.ttsLogger
.info(
"talk audio result finished=\(result.finished, privacy: .public) interruptedAt=\(String(describing: result.interruptedAt), privacy: .public)")
if !result.finished, result.interruptedAt == nil {
throw NSError(domain: "TalkAudioPlayer", code: 1, userInfo: [
throw NSError(domain: "StreamingAudioPlayer", code: 1, userInfo: [
NSLocalizedDescriptionKey: "audio playback failed",
])
}
@@ -631,7 +639,15 @@ actor TalkModeRuntime {
}
func stopSpeaking(reason: TalkStopReason) async {
let interruptedAt = await MainActor.run { TalkAudioPlayer.shared.stop() }
let interruptedAt = await MainActor.run {
let primary = self.lastPlaybackWasPCM
? PCMStreamingAudioPlayer.shared.stop()
: StreamingAudioPlayer.shared.stop()
_ = self.lastPlaybackWasPCM
? StreamingAudioPlayer.shared.stop()
: PCMStreamingAudioPlayer.shared.stop()
return primary
}
await TalkSystemSpeechSynthesizer.shared.stop()
guard self.phase == .speaking else { return }
if reason == .speech, let interruptedAt {
@@ -707,7 +723,8 @@ actor TalkModeRuntime {
guard !key.isEmpty, !value.isEmpty else { return }
acc[key] = value
} ?? [:]
let model = talk?["modelId"]?.stringValue
let model = talk?["modelId"]?.stringValue?.trimmingCharacters(in: .whitespacesAndNewlines)
let resolvedModel = (model?.isEmpty == false) ? model! : Self.defaultModelIdFallback
let outputFormat = talk?["outputFormat"]?.stringValue
let interrupt = talk?["interruptOnSpeech"]?.boolValue
let apiKey = talk?["apiKey"]?.stringValue
@@ -721,7 +738,7 @@ actor TalkModeRuntime {
return TalkRuntimeConfig(
voiceId: resolvedVoice,
voiceAliases: resolvedAliases,
modelId: model,
modelId: resolvedModel,
outputFormat: outputFormat,
interruptOnSpeech: interrupt ?? true,
apiKey: resolvedApiKey)
@@ -733,7 +750,7 @@ actor TalkModeRuntime {
return TalkRuntimeConfig(
voiceId: resolvedVoice,
voiceAliases: [:],
modelId: nil,
modelId: Self.defaultModelIdFallback,
outputFormat: nil,
interruptOnSpeech: true,
apiKey: resolvedApiKey)

View File

@@ -22,6 +22,7 @@ public struct ElevenLabsTTSRequest: Sendable {
public var seed: UInt32?
public var normalize: String?
public var language: String?
public var latencyTier: Int?
public init(
text: String,
@@ -34,7 +35,8 @@ public struct ElevenLabsTTSRequest: Sendable {
speakerBoost: Bool? = nil,
seed: UInt32? = nil,
normalize: String? = nil,
language: String? = nil)
language: String? = nil,
latencyTier: Int? = nil)
{
self.text = text
self.modelId = modelId
@@ -47,6 +49,7 @@ public struct ElevenLabsTTSRequest: Sendable {
self.seed = seed
self.normalize = normalize
self.language = language
self.latencyTier = latencyTier
}
}
@@ -155,6 +158,72 @@ public struct ElevenLabsTTSClient: Sendable {
])
}
public func streamSynthesize(
voiceId: String,
request: ElevenLabsTTSRequest) -> AsyncThrowingStream<Data, Error>
{
AsyncThrowingStream { continuation in
let task = Task {
do {
let url = Self.streamingURL(
baseUrl: self.baseUrl,
voiceId: voiceId,
latencyTier: request.latencyTier)
let body = try JSONSerialization.data(withJSONObject: Self.buildPayload(request), options: [])
var req = URLRequest(url: url)
req.httpMethod = "POST"
req.httpBody = body
req.timeoutInterval = self.requestTimeoutSeconds
req.setValue("application/json", forHTTPHeaderField: "Content-Type")
req.setValue("audio/mpeg", forHTTPHeaderField: "Accept")
req.setValue(self.apiKey, forHTTPHeaderField: "xi-api-key")
let (bytes, response) = try await URLSession.shared.bytes(for: req)
guard let http = response as? HTTPURLResponse else {
throw NSError(domain: "ElevenLabsTTS", code: 1, userInfo: [
NSLocalizedDescriptionKey: "ElevenLabs invalid response",
])
}
let contentType = (http.value(forHTTPHeaderField: "Content-Type") ?? "unknown").lowercased()
if http.statusCode >= 400 {
let message = try await Self.readErrorBody(bytes: bytes)
throw NSError(domain: "ElevenLabsTTS", code: http.statusCode, userInfo: [
NSLocalizedDescriptionKey: "ElevenLabs failed: \(http.statusCode) ct=\(contentType) \(message)",
])
}
if !contentType.contains("audio") {
let message = try await Self.readErrorBody(bytes: bytes)
throw NSError(domain: "ElevenLabsTTS", code: 415, userInfo: [
NSLocalizedDescriptionKey: "ElevenLabs returned non-audio ct=\(contentType) \(message)",
])
}
var buffer = Data()
buffer.reserveCapacity(16_384)
for try await byte in bytes {
buffer.append(byte)
if buffer.count >= 8_192 {
continuation.yield(buffer)
buffer.removeAll(keepingCapacity: true)
}
}
if !buffer.isEmpty {
continuation.yield(buffer)
}
continuation.finish()
} catch {
continuation.finish(throwing: error)
}
}
continuation.onTermination = { _ in
task.cancel()
}
}
}
public func listVoices() async throws -> [ElevenLabsVoice] {
var url = self.baseUrl
url.appendPathComponent("v1")
@@ -180,7 +249,7 @@ public struct ElevenLabsTTSClient: Sendable {
public static func validatedOutputFormat(_ value: String?) -> String? {
let trimmed = (value ?? "").trimmingCharacters(in: .whitespacesAndNewlines)
guard !trimmed.isEmpty else { return nil }
guard trimmed.hasPrefix("mp3_") else { return nil }
guard trimmed.hasPrefix("mp3_") || trimmed.hasPrefix("pcm_") else { return nil }
return trimmed
}
@@ -230,4 +299,33 @@ public struct ElevenLabsTTSClient: Sendable {
let raw = String(data: data.prefix(4096), encoding: .utf8) ?? "unknown"
return raw.replacingOccurrences(of: "\n", with: " ").replacingOccurrences(of: "\r", with: " ")
}
private static func streamingURL(baseUrl: URL, voiceId: String, latencyTier: Int?) -> URL {
var url = baseUrl
url.appendPathComponent("v1")
url.appendPathComponent("text-to-speech")
url.appendPathComponent(voiceId)
url.appendPathComponent("stream")
guard let latencyTier else { return url }
let latencyItem = URLQueryItem(
name: "optimize_streaming_latency",
value: "\(latencyTier)")
guard var components = URLComponents(url: url, resolvingAgainstBaseURL: false) else {
return url
}
var items = components.queryItems ?? []
items.append(latencyItem)
components.queryItems = items
return components.url ?? url
}
private static func readErrorBody(bytes: URLSession.AsyncBytes) async throws -> String {
var data = Data()
for try await byte in bytes {
data.append(byte)
if data.count >= 4096 { break }
}
return truncatedErrorBody(data)
}
}

View File

@@ -0,0 +1,144 @@
import AVFoundation
import Foundation
import OSLog
@MainActor
public final class PCMStreamingAudioPlayer {
public static let shared = PCMStreamingAudioPlayer()
private let logger = Logger(subsystem: "com.steipete.clawdis", category: "talk.tts.pcm")
private var engine = AVAudioEngine()
private var player = AVAudioPlayerNode()
private var format: AVAudioFormat?
private var pendingBuffers: Int = 0
private var inputFinished = false
private var continuation: CheckedContinuation<StreamingPlaybackResult, Never>?
public init() {
self.engine.attach(self.player)
}
public func play(stream: AsyncThrowingStream<Data, Error>, sampleRate: Double) async -> StreamingPlaybackResult {
self.stopInternal()
// Resolve any continuation left over from a previous play() so that caller never hangs.
self.finish(StreamingPlaybackResult(finished: false, interruptedAt: nil))
let format = AVAudioFormat(
commonFormat: .pcmFormatInt16,
sampleRate: sampleRate,
channels: 1,
interleaved: true)
guard let format else {
return StreamingPlaybackResult(finished: false, interruptedAt: nil)
}
self.configure(format: format)
return await withCheckedContinuation { continuation in
self.continuation = continuation
self.pendingBuffers = 0
self.inputFinished = false
Task.detached { [weak self] in
guard let self else { return }
do {
for try await chunk in stream {
await self.enqueuePCM(chunk, format: format)
}
await self.finishInput()
} catch {
await self.fail(error)
}
}
}
}
public func stop() -> Double? {
let interruptedAt = self.currentTimeSeconds()
self.stopInternal()
self.finish(StreamingPlaybackResult(finished: false, interruptedAt: interruptedAt))
return interruptedAt
}
private func configure(format: AVAudioFormat) {
if self.format?.sampleRate != format.sampleRate || self.format?.commonFormat != format.commonFormat {
self.engine.stop()
self.engine = AVAudioEngine()
self.player = AVAudioPlayerNode()
self.engine.attach(self.player)
}
self.format = format
if self.engine.attachedNodes.contains(self.player) {
self.engine.connect(self.player, to: self.engine.mainMixerNode, format: format)
}
}
private func enqueuePCM(_ data: Data, format: AVAudioFormat) async {
guard !data.isEmpty else { return }
let frameCount = data.count / MemoryLayout<Int16>.size
guard frameCount > 0 else { return }
guard let buffer = AVAudioPCMBuffer(pcmFormat: format, frameCapacity: AVAudioFrameCount(frameCount)) else {
return
}
buffer.frameLength = AVAudioFrameCount(frameCount)
data.withUnsafeBytes { raw in
guard let src = raw.baseAddress else { return }
let audioBuffer = buffer.audioBufferList.pointee.mBuffers
if let dst = audioBuffer.mData {
memcpy(dst, src, frameCount * MemoryLayout<Int16>.size)
}
}
self.pendingBuffers += 1
self.player.scheduleBuffer(buffer) { [weak self] in
Task { @MainActor in
guard let self else { return }
self.pendingBuffers = max(0, self.pendingBuffers - 1)
if self.inputFinished && self.pendingBuffers == 0 {
self.finish(StreamingPlaybackResult(finished: true, interruptedAt: nil))
}
}
}
if !self.player.isPlaying {
do {
try self.engine.start()
self.player.play()
} catch {
self.logger.error("pcm engine start failed: \(error.localizedDescription, privacy: .public)")
self.fail(error)
}
}
}
private func finishInput() {
self.inputFinished = true
if self.pendingBuffers == 0 {
self.finish(StreamingPlaybackResult(finished: true, interruptedAt: nil))
}
}
private func fail(_ error: Error) {
self.logger.error("pcm stream failed: \(error.localizedDescription, privacy: .public)")
self.finish(StreamingPlaybackResult(finished: false, interruptedAt: nil))
}
private func stopInternal() {
self.player.stop()
self.engine.stop()
self.pendingBuffers = 0
self.inputFinished = false
}
private func finish(_ result: StreamingPlaybackResult) {
let continuation = self.continuation
self.continuation = nil
continuation?.resume(returning: result)
}
private func currentTimeSeconds() -> Double? {
guard let nodeTime = self.player.lastRenderTime,
let playerTime = self.player.playerTime(forNodeTime: nodeTime)
else { return nil }
return Double(playerTime.sampleTime) / playerTime.sampleRate
}
}

View File

@@ -0,0 +1,429 @@
import AudioToolbox
import Foundation
import OSLog
public struct StreamingPlaybackResult: Sendable {
public let finished: Bool
public let interruptedAt: Double?
public init(finished: Bool, interruptedAt: Double?) {
self.finished = finished
self.interruptedAt = interruptedAt
}
}
@MainActor
public final class StreamingAudioPlayer: NSObject {
public static let shared = StreamingAudioPlayer()
private let logger = Logger(subsystem: "com.steipete.clawdis", category: "talk.tts.stream")
private var playback: Playback?
public func play(stream: AsyncThrowingStream<Data, Error>) async -> StreamingPlaybackResult {
self.stopInternal()
let playback = Playback(logger: self.logger)
self.playback = playback
return await withCheckedContinuation { continuation in
playback.setContinuation(continuation)
playback.start()
Task.detached {
do {
for try await chunk in stream {
playback.append(chunk)
}
playback.finishInput()
} catch {
playback.fail(error)
}
}
}
}
public func stop() -> Double? {
guard let playback else { return nil }
let interruptedAt = playback.stop(immediate: true)
self.finish(playback: playback, result: StreamingPlaybackResult(finished: false, interruptedAt: interruptedAt))
return interruptedAt
}
private func stopInternal() {
guard let playback else { return }
let interruptedAt = playback.stop(immediate: true)
self.finish(playback: playback, result: StreamingPlaybackResult(finished: false, interruptedAt: interruptedAt))
}
private func finish(playback: Playback, result: StreamingPlaybackResult) {
playback.finish(result)
guard self.playback === playback else { return }
self.playback = nil
}
}
private final class Playback: @unchecked Sendable {
private static let bufferCount: Int = 3
private static let bufferSize: Int = 32 * 1024
private let logger: Logger
private let lock = NSLock()
private let parseQueue = DispatchQueue(label: "talk.stream.parse")
fileprivate let bufferLock = NSLock()
fileprivate let bufferSemaphore = DispatchSemaphore(value: bufferCount)
private var continuation: CheckedContinuation<StreamingPlaybackResult, Never>?
private var finished = false
private var audioFileStream: AudioFileStreamID?
private var audioQueue: AudioQueueRef?
fileprivate var audioFormat: AudioStreamBasicDescription?
fileprivate var maxPacketSize: UInt32 = 0
fileprivate var availableBuffers: [AudioQueueBufferRef] = []
private var currentBuffer: AudioQueueBufferRef?
private var currentBufferSize: Int = 0
private var currentPacketDescs: [AudioStreamPacketDescription] = []
private var isRunning = false
fileprivate var inputFinished = false
private var startRequested = false
private var sampleRate: Double = 0
init(logger: Logger) {
self.logger = logger
}
func setContinuation(_ continuation: CheckedContinuation<StreamingPlaybackResult, Never>) {
self.lock.lock()
self.continuation = continuation
self.lock.unlock()
}
func start() {
let selfPtr = Unmanaged.passUnretained(self).toOpaque()
let status = AudioFileStreamOpen(
selfPtr,
propertyListenerProc,
packetsProc,
kAudioFileMP3Type,
&self.audioFileStream)
if status != noErr {
self.logger.error("talk stream open failed: \(status)")
self.finish(StreamingPlaybackResult(finished: false, interruptedAt: nil))
}
}
func append(_ data: Data) {
guard !data.isEmpty else { return }
self.parseQueue.async { [weak self] in
guard let self else { return }
guard let audioFileStream = self.audioFileStream else { return }
let status = data.withUnsafeBytes { bytes in
AudioFileStreamParseBytes(
audioFileStream,
UInt32(bytes.count),
bytes.baseAddress,
[])
}
if status != noErr {
self.logger.error("talk stream parse failed: \(status)")
self.fail(NSError(domain: "StreamingAudio", code: Int(status)))
}
}
}
func finishInput() {
self.parseQueue.async { [weak self] in
guard let self else { return }
self.inputFinished = true
if self.audioQueue == nil {
self.finish(StreamingPlaybackResult(finished: false, interruptedAt: nil))
return
}
self.enqueueCurrentBuffer(flushOnly: true)
_ = self.stop(immediate: false)
}
}
func fail(_ error: Error) {
self.logger.error("talk stream failed: \(error.localizedDescription, privacy: .public)")
_ = self.stop(immediate: true)
self.finish(StreamingPlaybackResult(finished: false, interruptedAt: nil))
}
func stop(immediate: Bool) -> Double? {
guard let audioQueue else { return nil }
let interruptedAt = self.currentTimeSeconds()
AudioQueueStop(audioQueue, immediate)
return interruptedAt
}
fileprivate func finish(_ result: StreamingPlaybackResult) {
let continuation: CheckedContinuation<StreamingPlaybackResult, Never>?
self.lock.lock()
if self.finished {
continuation = nil
} else {
self.finished = true
continuation = self.continuation
self.continuation = nil
}
self.lock.unlock()
continuation?.resume(returning: result)
self.teardown()
}
private func teardown() {
if let audioQueue {
AudioQueueDispose(audioQueue, true)
self.audioQueue = nil
}
if let audioFileStream {
AudioFileStreamClose(audioFileStream)
self.audioFileStream = nil
}
self.bufferLock.lock()
self.availableBuffers.removeAll()
self.bufferLock.unlock()
self.currentBuffer = nil
self.currentPacketDescs.removeAll()
}
fileprivate func setupQueueIfNeeded(_ asbd: AudioStreamBasicDescription) {
guard self.audioQueue == nil else { return }
var format = asbd
self.audioFormat = format
self.sampleRate = format.mSampleRate
let selfPtr = Unmanaged.passUnretained(self).toOpaque()
let status = AudioQueueNewOutput(
&format,
outputCallbackProc,
selfPtr,
nil,
nil,
0,
&self.audioQueue)
if status != noErr {
self.logger.error("talk queue create failed: \(status)")
self.finish(StreamingPlaybackResult(finished: false, interruptedAt: nil))
return
}
if let audioQueue {
AudioQueueAddPropertyListener(audioQueue, kAudioQueueProperty_IsRunning, isRunningCallbackProc, selfPtr)
}
if let audioFileStream {
var cookieSize: UInt32 = 0
var writable: DarwinBoolean = false
let cookieStatus = AudioFileStreamGetPropertyInfo(
audioFileStream,
kAudioFileStreamProperty_MagicCookieData,
&cookieSize,
&writable)
if cookieStatus == noErr, cookieSize > 0, let audioQueue {
var cookie = [UInt8](repeating: 0, count: Int(cookieSize))
let readStatus = AudioFileStreamGetProperty(
audioFileStream,
kAudioFileStreamProperty_MagicCookieData,
&cookieSize,
&cookie)
if readStatus == noErr {
AudioQueueSetProperty(audioQueue, kAudioQueueProperty_MagicCookie, cookie, cookieSize)
}
}
}
if let audioQueue {
for _ in 0..<Self.bufferCount {
var buffer: AudioQueueBufferRef?
let allocStatus = AudioQueueAllocateBuffer(audioQueue, UInt32(Self.bufferSize), &buffer)
if allocStatus == noErr, let buffer {
self.bufferLock.lock()
self.availableBuffers.append(buffer)
self.bufferLock.unlock()
}
}
}
}
private func enqueueCurrentBuffer(flushOnly: Bool = false) {
guard let audioQueue, let buffer = self.currentBuffer else { return }
guard self.currentBufferSize > 0 else { return }
buffer.pointee.mAudioDataByteSize = UInt32(self.currentBufferSize)
let packetCount = UInt32(self.currentPacketDescs.count)
let status = self.currentPacketDescs.withUnsafeBufferPointer { descPtr in
AudioQueueEnqueueBuffer(audioQueue, buffer, packetCount, descPtr.baseAddress)
}
if status != noErr {
self.logger.error("talk queue enqueue failed: \(status)")
} else {
if !self.startRequested {
self.startRequested = true
let startStatus = AudioQueueStart(audioQueue, nil)
if startStatus != noErr {
self.logger.error("talk queue start failed: \(startStatus)")
}
}
}
self.currentBuffer = nil
self.currentBufferSize = 0
self.currentPacketDescs.removeAll(keepingCapacity: true)
if !flushOnly {
self.bufferSemaphore.wait()
self.bufferLock.lock()
let next = self.availableBuffers.popLast()
self.bufferLock.unlock()
if let next { self.currentBuffer = next }
}
}
fileprivate func handlePackets(
numberBytes: UInt32,
numberPackets: UInt32,
inputData: UnsafeRawPointer,
packetDescriptions: UnsafeMutablePointer<AudioStreamPacketDescription>?)
{
if self.audioQueue == nil, let format = self.audioFormat {
self.setupQueueIfNeeded(format)
}
if self.audioQueue == nil {
return
}
if self.currentBuffer == nil {
self.bufferSemaphore.wait()
self.bufferLock.lock()
self.currentBuffer = self.availableBuffers.popLast()
self.bufferLock.unlock()
self.currentBufferSize = 0
self.currentPacketDescs.removeAll(keepingCapacity: true)
}
let bytes = inputData.assumingMemoryBound(to: UInt8.self)
let packetCount = Int(numberPackets)
for index in 0..<packetCount {
let packetOffset: Int
let packetSize: Int
if let packetDescriptions {
packetOffset = Int(packetDescriptions[index].mStartOffset)
packetSize = Int(packetDescriptions[index].mDataByteSize)
} else {
let size = Int(numberBytes) / packetCount
packetOffset = index * size
packetSize = size
}
if packetSize > Self.bufferSize {
continue
}
if self.currentBufferSize + packetSize > Self.bufferSize {
self.enqueueCurrentBuffer()
}
guard let buffer = self.currentBuffer else { continue }
let dest = buffer.pointee.mAudioData.advanced(by: self.currentBufferSize)
memcpy(dest, bytes.advanced(by: packetOffset), packetSize)
let desc = AudioStreamPacketDescription(
mStartOffset: Int64(self.currentBufferSize),
mVariableFramesInPacket: 0,
mDataByteSize: UInt32(packetSize))
self.currentPacketDescs.append(desc)
self.currentBufferSize += packetSize
}
}
private func currentTimeSeconds() -> Double? {
guard let audioQueue, sampleRate > 0 else { return nil }
var timeStamp = AudioTimeStamp()
let status = AudioQueueGetCurrentTime(audioQueue, nil, &timeStamp, nil)
if status != noErr { return nil }
if timeStamp.mSampleTime.isNaN { return nil }
return timeStamp.mSampleTime / sampleRate
}
}
private func propertyListenerProc(
inClientData: UnsafeMutableRawPointer,
inAudioFileStream: AudioFileStreamID,
inPropertyID: AudioFileStreamPropertyID,
ioFlags: UnsafeMutablePointer<AudioFileStreamPropertyFlags>)
{
let playback = Unmanaged<Playback>.fromOpaque(inClientData).takeUnretainedValue()
if inPropertyID == kAudioFileStreamProperty_DataFormat {
var format = AudioStreamBasicDescription()
var size = UInt32(MemoryLayout<AudioStreamBasicDescription>.size)
let status = AudioFileStreamGetProperty(inAudioFileStream, inPropertyID, &size, &format)
if status == noErr {
playback.audioFormat = format
playback.setupQueueIfNeeded(format)
}
} else if inPropertyID == kAudioFileStreamProperty_PacketSizeUpperBound {
var maxPacketSize: UInt32 = 0
var size = UInt32(MemoryLayout<UInt32>.size)
let status = AudioFileStreamGetProperty(inAudioFileStream, inPropertyID, &size, &maxPacketSize)
if status == noErr {
playback.maxPacketSize = maxPacketSize
}
}
}
private func packetsProc(
inClientData: UnsafeMutableRawPointer,
inNumberBytes: UInt32,
inNumberPackets: UInt32,
inInputData: UnsafeRawPointer,
inPacketDescriptions: UnsafeMutablePointer<AudioStreamPacketDescription>?)
{
let playback = Unmanaged<Playback>.fromOpaque(inClientData).takeUnretainedValue()
playback.handlePackets(
numberBytes: inNumberBytes,
numberPackets: inNumberPackets,
inputData: inInputData,
packetDescriptions: inPacketDescriptions)
}
private func outputCallbackProc(
inUserData: UnsafeMutableRawPointer?,
inAQ: AudioQueueRef,
inBuffer: AudioQueueBufferRef)
{
guard let inUserData else { return }
let playback = Unmanaged<Playback>.fromOpaque(inUserData).takeUnretainedValue()
playback.bufferLock.lock()
playback.availableBuffers.append(inBuffer)
playback.bufferLock.unlock()
playback.bufferSemaphore.signal()
}
private func isRunningCallbackProc(
inUserData: UnsafeMutableRawPointer?,
inAQ: AudioQueueRef,
inID: AudioQueuePropertyID)
{
guard let inUserData else { return }
guard inID == kAudioQueueProperty_IsRunning else { return }
let playback = Unmanaged<Playback>.fromOpaque(inUserData).takeUnretainedValue()
var running: UInt32 = 0
var size = UInt32(MemoryLayout<UInt32>.size)
let status = AudioQueueGetProperty(inAQ, kAudioQueueProperty_IsRunning, &running, &size)
if status != noErr { return }
if running == 0, playback.inputFinished {
playback.finish(StreamingPlaybackResult(finished: true, interruptedAt: nil))
}
}

View File

@@ -1,4 +1,6 @@
public enum TalkTTSValidation: Sendable {
private static let v3StabilityValues: Set<Double> = [0.0, 0.5, 1.0]
public static func resolveSpeed(speed: Double?, rateWPM: Int?) -> Double? {
if let rateWPM, rateWPM > 0 {
let resolved = Double(rateWPM) / 175.0
@@ -18,10 +20,32 @@ public enum TalkTTSValidation: Sendable {
return value
}
public static func validatedStability(_ value: Double?, modelId: String?) -> Double? {
guard let value else { return nil }
let normalizedModel = (modelId ?? "").trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
if normalizedModel == "eleven_v3" {
return v3StabilityValues.contains(value) ? value : nil
}
return validatedUnit(value)
}
public static func validatedSeed(_ value: Int?) -> UInt32? {
guard let value else { return nil }
if value < 0 || value > 4294967295 { return nil }
return UInt32(value)
}
}
public static func validatedLatencyTier(_ value: Int?) -> Int? {
guard let value else { return nil }
if value < 0 || value > 4 { return nil }
return value
}
public static func pcmSampleRate(from outputFormat: String?) -> Double? {
let trimmed = (outputFormat ?? "").trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
guard trimmed.hasPrefix("pcm_") else { return nil }
let parts = trimmed.split(separator: "_", maxSplits: 1)
guard parts.count == 2, let rate = Double(parts[1]), rate > 0 else { return nil }
return rate
}
}

View File

@@ -16,9 +16,30 @@ final class TalkTTSValidationTests: XCTestCase {
XCTAssertNil(TalkTTSValidation.validatedUnit(1.01))
}
func testValidatedStability() {
XCTAssertEqual(TalkTTSValidation.validatedStability(0, modelId: "eleven_v3"), 0)
XCTAssertEqual(TalkTTSValidation.validatedStability(0.5, modelId: "eleven_v3"), 0.5)
XCTAssertEqual(TalkTTSValidation.validatedStability(1, modelId: "eleven_v3"), 1)
XCTAssertNil(TalkTTSValidation.validatedStability(0.7, modelId: "eleven_v3"))
XCTAssertEqual(TalkTTSValidation.validatedStability(0.7, modelId: "eleven_multilingual_v2"), 0.7)
}
func testValidatedSeedBounds() {
XCTAssertEqual(TalkTTSValidation.validatedSeed(0), 0)
XCTAssertEqual(TalkTTSValidation.validatedSeed(1234), 1234)
XCTAssertNil(TalkTTSValidation.validatedSeed(-1))
}
func testValidatedLatencyTier() {
XCTAssertEqual(TalkTTSValidation.validatedLatencyTier(0), 0)
XCTAssertEqual(TalkTTSValidation.validatedLatencyTier(4), 4)
XCTAssertNil(TalkTTSValidation.validatedLatencyTier(-1))
XCTAssertNil(TalkTTSValidation.validatedLatencyTier(5))
}
func testPcmSampleRateParse() {
XCTAssertEqual(TalkTTSValidation.pcmSampleRate(from: "pcm_44100"), 44100)
XCTAssertNil(TalkTTSValidation.pcmSampleRate(from: "mp3_44100_128"))
XCTAssertNil(TalkTTSValidation.pcmSampleRate(from: "pcm_bad"))
}
}

View File

@@ -10,7 +10,7 @@ Talk mode is a continuous voice conversation loop:
1) Listen for speech
2) Send transcript to the model (main session, chat.send)
3) Wait for the response
4) Speak it via ElevenLabs
4) Speak it via ElevenLabs (streaming playback)
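
Step 4 goes through the streaming client added in this commit; a condensed sketch of how the pieces connect on macOS/iOS (types and methods are from this commit, the surrounding values are illustrative):

// Sketch: stream ElevenLabs TTS and play it incrementally (macOS/iOS).
// voiceId/apiKey/text are placeholders; the types exist in this commit.
let client = ElevenLabsTTSClient(apiKey: apiKey)
let request = ElevenLabsTTSRequest(text: "Hello there", outputFormat: "pcm_44100")
let stream = client.streamSynthesize(voiceId: voiceId, request: request)
let result: StreamingPlaybackResult
if let sampleRate = TalkTTSValidation.pcmSampleRate(from: request.outputFormat) {
    // PCM path: raw Int16 frames are scheduled on AVAudioEngine as they arrive.
    result = await PCMStreamingAudioPlayer.shared.play(stream: stream, sampleRate: sampleRate)
} else {
    // MP3 path: AudioFileStream parses packets and feeds an AudioQueue.
    result = await StreamingAudioPlayer.shared.play(stream: stream)
}
// result.finished is false when playback was interrupted or failed;
// result.interruptedAt carries the playback position in seconds if known.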
## Behavior (macOS)
- **Always-on overlay** while Talk mode is enabled.
@@ -55,8 +55,10 @@ Supported keys:
Defaults:
- `interruptOnSpeech`: true
- `voiceId`: falls back to `ELEVENLABS_VOICE_ID` / `SAG_VOICE_ID`
- `voiceId`: falls back to `ELEVENLABS_VOICE_ID` / `SAG_VOICE_ID` (or the first ElevenLabs voice when an API key is available; see the sketch after this list)
- `modelId`: defaults to `eleven_v3` when unset
- `apiKey`: falls back to `ELEVENLABS_API_KEY` (or gateway shell profile if available)
- `outputFormat`: defaults to `pcm_44100` on macOS/iOS for faster streaming playback (Android stays on MP3)
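
The `voiceId` fallback is handled by a `resolveVoiceId` helper on each platform; a condensed sketch of the resolution order (simplified from the Swift implementation in this commit):

// Resolution order: explicit alias / raw id → cached fallback → first account voice.
func resolveVoiceId(preferred: String?, apiKey: String) async -> String? {
    if let preferred, let resolved = resolveVoiceAlias(preferred) {
        return resolved                               // configured alias or literal voice id
    }
    if let fallbackVoiceId { return fallbackVoiceId } // remembered from an earlier lookup
    // Auto-select: ask the API for the account's voices and take the first one.
    let voices = (try? await ElevenLabsTTSClient(apiKey: apiKey).listVoices()) ?? []
    fallbackVoiceId = voices.first?.voiceId
    return fallbackVoiceId
}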
## macOS UI
- Menu bar toggle: **Talk**
@@ -71,4 +73,6 @@ Defaults:
## Notes
- Requires Speech + Microphone permissions.
- Uses `chat.send` against session key `main`.
- TTS uses ElevenLabs API with `ELEVENLABS_API_KEY`.
- TTS uses the ElevenLabs streaming API with `ELEVENLABS_API_KEY`; playback is incremental on macOS/iOS/Android for lower latency.
- `stability` for `eleven_v3` is validated to `0.0`, `0.5`, or `1.0`; other models accept `0..1`.
- `latency_tier` is validated to `0..4` when set.
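
Concretely, the two validation rules above behave like this (mirroring the `TalkTTSValidation` tests in this commit):

// v3 stability accepts only the discrete values 0.0 / 0.5 / 1.0.
assert(TalkTTSValidation.validatedStability(0.5, modelId: "eleven_v3") == 0.5)
assert(TalkTTSValidation.validatedStability(0.7, modelId: "eleven_v3") == nil)
// Other models accept any value in 0...1.
assert(TalkTTSValidation.validatedStability(0.7, modelId: "eleven_multilingual_v2") == 0.7)
// latency_tier must be an integer in 0...4; anything else is dropped.
assert(TalkTTSValidation.validatedLatencyTier(4) == 4)
assert(TalkTTSValidation.validatedLatencyTier(5) == nil)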