fix: stream elevenlabs tts playback

2025-12-30 12:17:40 +01:00
parent 9c532eac07
commit 27adfb76fa
11 changed files with 1091 additions and 91 deletions
--- a/apps/android/app/src/main/java/com/steipete/clawdis/node/voice/StreamingMediaDataSource.kt
+++ b/apps/android/app/src/main/java/com/steipete/clawdis/node/voice/StreamingMediaDataSource.kt
@@ -0,0 +1,98 @@
+package com.steipete.clawdis.node.voice
+
+import android.media.MediaDataSource
+import kotlin.math.min
+
+internal class StreamingMediaDataSource : MediaDataSource() {
+  private data class Chunk(val start: Long, val data: ByteArray)
+
+  private val lock = Object()
+  private val chunks = ArrayList<Chunk>()
+  private var totalSize: Long = 0
+  private var closed = false
+  private var finished = false
+  private var lastReadIndex = 0
+
+  fun append(data: ByteArray) {
+    if (data.isEmpty()) return
+    synchronized(lock) {
+      if (closed || finished) return
+      val chunk = Chunk(totalSize, data)
+      chunks.add(chunk)
+      totalSize += data.size.toLong()
+      lock.notifyAll()
+    }
+  }
+
+  fun finish() {
+    synchronized(lock) {
+      if (closed) return
+      finished = true
+      lock.notifyAll()
+    }
+  }
+
+  fun fail() {
+    synchronized(lock) {
+      closed = true
+      lock.notifyAll()
+    }
+  }
+
+  override fun readAt(position: Long, buffer: ByteArray, offset: Int, size: Int): Int {
+    if (position < 0) return -1
+    synchronized(lock) {
+      while (!closed && !finished && position >= totalSize) {
+        lock.wait()
+      }
+      if (closed) return -1
+      if (position >= totalSize && finished) return -1
+
+      val available = (totalSize - position).toInt()
+      val toRead = min(size, available)
+      var remaining = toRead
+      var destOffset = offset
+      var pos = position
+
+      var index = findChunkIndex(pos)
+      while (remaining > 0 && index < chunks.size) {
+        val chunk = chunks[index]
+        val inChunkOffset = (pos - chunk.start).toInt()
+        if (inChunkOffset >= chunk.data.size) {
+          index++
+          continue
+        }
+        val copyLen = min(remaining, chunk.data.size - inChunkOffset)
+        System.arraycopy(chunk.data, inChunkOffset, buffer, destOffset, copyLen)
+        remaining -= copyLen
+        destOffset += copyLen
+        pos += copyLen
+        if (inChunkOffset + copyLen >= chunk.data.size) {
+          index++
+        }
+      }
+
+      return toRead - remaining
+    }
+  }
+
+  override fun getSize(): Long = -1
+
+  override fun close() {
+    synchronized(lock) {
+      closed = true
+      lock.notifyAll()
+    }
+  }
+
+  private fun findChunkIndex(position: Long): Int {
+    var index = lastReadIndex
+    while (index < chunks.size) {
+      val chunk = chunks[index]
+      if (position < chunk.start + chunk.data.size) break
+      index++
+    }
+    lastReadIndex = index
+    return index
+  }
+}
--- a/apps/android/app/src/main/java/com/steipete/clawdis/node/voice/TalkModeManager.kt
+++ b/apps/android/app/src/main/java/com/steipete/clawdis/node/voice/TalkModeManager.kt
@@ -18,7 +18,6 @@ import android.speech.tts.UtteranceProgressListener
 import android.util.Log
 import androidx.core.content.ContextCompat
 import com.steipete.clawdis.node.bridge.BridgeSession
-import java.io.File
 import java.net.HttpURLConnection
 import java.net.URL
 import java.util.UUID
@@ -44,6 +43,7 @@ class TalkModeManager(
 ) {
  companion object {
    private const val tag = "TalkMode"
+    private const val defaultModelIdFallback = "eleven_v3"
  }

  private val mainHandler = Handler(Looper.getMainLooper())
@@ -81,6 +81,7 @@ class TalkModeManager(

  private var defaultVoiceId: String? = null
  private var currentVoiceId: String? = null
+  private var fallbackVoiceId: String? = null
  private var defaultModelId: String? = null
  private var currentModelId: String? = null
  private var defaultOutputFormat: String? = null
@@ -97,7 +98,7 @@ class TalkModeManager(
  private var chatSubscribedSessionKey: String? = null

  private var player: MediaPlayer? = null
-  private var currentAudioFile: File? = null
+  private var streamingSource: StreamingMediaDataSource? = null
  private var systemTts: TextToSpeech? = null
  private var systemTtsPending: CompletableDeferred<Unit>? = null
  private var systemTtsPendingId: String? = null
@@ -464,7 +465,13 @@ class TalkModeManager(
    val apiKey =
      apiKey?.trim()?.takeIf { it.isNotEmpty() }
        ?: System.getenv("ELEVENLABS_API_KEY")?.trim()
-    val voiceId = resolvedVoice ?: currentVoiceId ?: defaultVoiceId
+    val preferredVoice = resolvedVoice ?: currentVoiceId ?: defaultVoiceId
+    val voiceId =
+      if (!apiKey.isNullOrEmpty()) {
+        resolveVoiceId(preferredVoice, apiKey)
+      } else {
+        null
+      }

    _statusText.value = "Speaking…"
    _isSpeaking.value = true
@@ -486,24 +493,25 @@ class TalkModeManager(
      } else {
        _usingFallbackTts.value = false
        val ttsStarted = SystemClock.elapsedRealtime()
+        val modelId = directive?.modelId ?: currentModelId ?: defaultModelId
        val request =
          ElevenLabsRequest(
            text = cleaned,
-            modelId = directive?.modelId ?: currentModelId ?: defaultModelId,
+            modelId = modelId,
            outputFormat =
              TalkModeRuntime.validatedOutputFormat(directive?.outputFormat ?: defaultOutputFormat),
            speed = TalkModeRuntime.resolveSpeed(directive?.speed, directive?.rateWpm),
-            stability = TalkModeRuntime.validatedUnit(directive?.stability),
+            stability = TalkModeRuntime.validatedStability(directive?.stability, modelId),
            similarity = TalkModeRuntime.validatedUnit(directive?.similarity),
            style = TalkModeRuntime.validatedUnit(directive?.style),
            speakerBoost = directive?.speakerBoost,
            seed = TalkModeRuntime.validatedSeed(directive?.seed),
            normalize = TalkModeRuntime.validatedNormalize(directive?.normalize),
            language = TalkModeRuntime.validatedLanguage(directive?.language),
+            latencyTier = TalkModeRuntime.validatedLatencyTier(directive?.latencyTier),
          )
-        val audio = synthesize(voiceId = voiceId!!, apiKey = apiKey!!, request = request)
-        Log.d(tag, "elevenlabs ok bytes=${audio.size} durMs=${SystemClock.elapsedRealtime() - ttsStarted}")
-        playAudio(audio)
+        streamAndPlay(voiceId = voiceId!!, apiKey = apiKey!!, request = request)
+        Log.d(tag, "elevenlabs stream ok durMs=${SystemClock.elapsedRealtime() - ttsStarted}")
      }
    } catch (err: Throwable) {
      Log.w(tag, "speak failed: ${err.message ?: err::class.simpleName}; falling back to system voice")
@@ -520,22 +528,28 @@ class TalkModeManager(
    _isSpeaking.value = false
  }

-  private suspend fun playAudio(data: ByteArray) {
+  private suspend fun streamAndPlay(voiceId: String, apiKey: String, request: ElevenLabsRequest) {
    stopSpeaking(resetInterrupt = false)
-    val file = File.createTempFile("talk-", ".mp3", context.cacheDir)
-    file.writeBytes(data)
-    currentAudioFile = file
+
+    val dataSource = StreamingMediaDataSource()
+    streamingSource = dataSource

    val player = MediaPlayer()
    this.player = player

+    val prepared = CompletableDeferred<Unit>()
    val finished = CompletableDeferred<Unit>()
+
    player.setAudioAttributes(
      AudioAttributes.Builder()
        .setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
        .setUsage(AudioAttributes.USAGE_ASSISTANT)
        .build(),
    )
+    player.setOnPreparedListener {
+      it.start()
+      prepared.complete(Unit)
+    }
    player.setOnCompletionListener {
      finished.complete(Unit)
    }
@@ -544,16 +558,30 @@ class TalkModeManager(
      true
    }

-    player.setDataSource(file.absolutePath)
+    player.setDataSource(dataSource)
    withContext(Dispatchers.Main) {
-      player.setOnPreparedListener { it.start() }
      player.prepareAsync()
    }

+    val fetchError = CompletableDeferred<Throwable?>()
+    val fetchJob =
+      scope.launch(Dispatchers.IO) {
+        try {
+          streamTts(voiceId = voiceId, apiKey = apiKey, request = request, sink = dataSource)
+          fetchError.complete(null)
+        } catch (err: Throwable) {
+          dataSource.fail()
+          fetchError.complete(err)
+        }
+      }
+
    Log.d(tag, "play start")
    try {
+      prepared.await()
      finished.await()
+      fetchError.await()?.let { throw it }
    } finally {
+      fetchJob.cancel()
      cleanupPlayer()
    }
    Log.d(tag, "play done")
@@ -674,8 +702,8 @@ class TalkModeManager(
    player?.stop()
    player?.release()
    player = null
-    currentAudioFile?.delete()
-    currentAudioFile = null
+    streamingSource?.close()
+    streamingSource = null
  }

  private fun shouldInterrupt(transcript: String): Boolean {
@@ -713,13 +741,15 @@ class TalkModeManager(
      defaultVoiceId = voice ?: envVoice?.takeIf { it.isNotEmpty() } ?: sagVoice?.takeIf { it.isNotEmpty() }
      voiceAliases = aliases
      if (!voiceOverrideActive) currentVoiceId = defaultVoiceId
-      defaultModelId = model
+      defaultModelId = model ?: defaultModelIdFallback
      if (!modelOverrideActive) currentModelId = defaultModelId
      defaultOutputFormat = outputFormat
      apiKey = key ?: envKey?.takeIf { it.isNotEmpty() }
      if (interrupt != null) interruptOnSpeech = interrupt
    } catch (_: Throwable) {
      defaultVoiceId = envVoice?.takeIf { it.isNotEmpty() } ?: sagVoice?.takeIf { it.isNotEmpty() }
+      defaultModelId = defaultModelIdFallback
+      if (!modelOverrideActive) currentModelId = defaultModelId
      apiKey = envKey?.takeIf { it.isNotEmpty() }
      voiceAliases = emptyMap()
    }
@@ -730,9 +760,21 @@ class TalkModeManager(
    return obj["runId"].asStringOrNull()
  }

-  private suspend fun synthesize(voiceId: String, apiKey: String, request: ElevenLabsRequest): ByteArray {
-    return withContext(Dispatchers.IO) {
-      val url = URL("https://api.elevenlabs.io/v1/text-to-speech/$voiceId")
+  private suspend fun streamTts(
+    voiceId: String,
+    apiKey: String,
+    request: ElevenLabsRequest,
+    sink: StreamingMediaDataSource,
+  ) {
+    withContext(Dispatchers.IO) {
+      val baseUrl = "https://api.elevenlabs.io/v1/text-to-speech/$voiceId/stream"
+      val latencyTier = request.latencyTier
+      val url =
+        if (latencyTier != null) {
+          URL("$baseUrl?optimize_streaming_latency=$latencyTier")
+        } else {
+          URL(baseUrl)
+        }
      val conn = url.openConnection() as HttpURLConnection
      conn.requestMethod = "POST"
      conn.connectTimeout = 30_000
@@ -746,13 +788,21 @@ class TalkModeManager(
      conn.outputStream.use { it.write(payload.toByteArray()) }

      val code = conn.responseCode
-      val stream = if (code >= 400) conn.errorStream else conn.inputStream
-      val data = stream.readBytes()
      if (code >= 400) {
-        val message = String(data)
+        val message = conn.errorStream?.readBytes()?.toString(Charsets.UTF_8) ?: ""
+        sink.fail()
        throw IllegalStateException("ElevenLabs failed: $code $message")
      }
-      data
+
+      val buffer = ByteArray(8 * 1024)
+      conn.inputStream.use { input ->
+        while (true) {
+          val read = input.read(buffer)
+          if (read <= 0) break
+          sink.append(buffer.copyOf(read))
+        }
+      }
+      sink.finish()
    }
  }

@@ -794,6 +844,7 @@ class TalkModeManager(
    val seed: Long?,
    val normalize: String?,
    val language: String?,
+    val latencyTier: Int?,
  )

  private object TalkModeRuntime {
@@ -816,6 +867,15 @@ class TalkModeManager(
      return value
    }

+    fun validatedStability(value: Double?, modelId: String?): Double? {
+      if (value == null) return null
+      val normalized = modelId?.trim()?.lowercase()
+      if (normalized == "eleven_v3") {
+        return if (value == 0.0 || value == 0.5 || value == 1.0) value else null
+      }
+      return validatedUnit(value)
+    }
+
    fun validatedSeed(value: Long?): Long? {
      if (value == null) return null
      if (value < 0 || value > 4294967295L) return null
@@ -840,6 +900,12 @@ class TalkModeManager(
      return if (trimmed.startsWith("mp3_")) trimmed else null
    }

+    fun validatedLatencyTier(value: Int?): Int? {
+      if (value == null) return null
+      if (value < 0 || value > 4) return null
+      return value
+    }
+
    fun isMessageTimestampAfter(timestamp: Double, sinceSeconds: Double): Boolean {
      val sinceMs = sinceSeconds * 1000
      return if (timestamp > 10_000_000_000) {
@@ -876,6 +942,62 @@ class TalkModeManager(
    return if (isLikelyVoiceId(trimmed)) trimmed else null
  }

+  private suspend fun resolveVoiceId(preferred: String?, apiKey: String): String? {
+    val trimmed = preferred?.trim().orEmpty()
+    if (trimmed.isNotEmpty()) {
+      val resolved = resolveVoiceAlias(trimmed)
+      if (resolved != null) return resolved
+      Log.w(tag, "unknown voice alias $trimmed")
+    }
+    fallbackVoiceId?.let { return it }
+
+    return try {
+      val voices = listVoices(apiKey)
+      val first = voices.firstOrNull() ?: return null
+      fallbackVoiceId = first.voiceId
+      if (defaultVoiceId.isNullOrBlank()) {
+        defaultVoiceId = first.voiceId
+      }
+      if (!voiceOverrideActive) {
+        currentVoiceId = first.voiceId
+      }
+      val name = first.name ?: "unknown"
+      Log.d(tag, "default voice selected $name (${first.voiceId})")
+      first.voiceId
+    } catch (err: Throwable) {
+      Log.w(tag, "list voices failed: ${err.message ?: err::class.simpleName}")
+      null
+    }
+  }
+
+  private suspend fun listVoices(apiKey: String): List<ElevenLabsVoice> {
+    return withContext(Dispatchers.IO) {
+      val url = URL("https://api.elevenlabs.io/v1/voices")
+      val conn = url.openConnection() as HttpURLConnection
+      conn.requestMethod = "GET"
+      conn.connectTimeout = 15_000
+      conn.readTimeout = 15_000
+      conn.setRequestProperty("xi-api-key", apiKey)
+
+      val code = conn.responseCode
+      val stream = if (code >= 400) conn.errorStream else conn.inputStream
+      val data = stream.readBytes()
+      if (code >= 400) {
+        val message = data.toString(Charsets.UTF_8)
+        throw IllegalStateException("ElevenLabs voices failed: $code $message")
+      }
+
+      val root = json.parseToJsonElement(data.toString(Charsets.UTF_8)).asObjectOrNull()
+      val voices = (root?.get("voices") as? JsonArray) ?: JsonArray(emptyList())
+      voices.mapNotNull { entry ->
+        val obj = entry.asObjectOrNull() ?: return@mapNotNull null
+        val voiceId = obj["voice_id"].asStringOrNull() ?: return@mapNotNull null
+        val name = obj["name"].asStringOrNull()
+        ElevenLabsVoice(voiceId, name)
+      }
+    }
+  }
+
  private fun isLikelyVoiceId(value: String): Boolean {
    if (value.length < 10) return false
    return value.all { it.isLetterOrDigit() || it == '-' || it == '_' }
@@ -884,6 +1006,8 @@ class TalkModeManager(
  private fun normalizeAliasKey(value: String): String =
    value.trim().lowercase()

+  private data class ElevenLabsVoice(val voiceId: String, val name: String?)
+
  private val listener =
    object : RecognitionListener {
      override fun onReadyForSpeech(params: Bundle?) {