fix: stream elevenlabs tts playback
@@ -0,0 +1,98 @@
package com.steipete.clawdis.node.voice

import android.media.MediaDataSource
import kotlin.math.min

internal class StreamingMediaDataSource : MediaDataSource() {
    private data class Chunk(val start: Long, val data: ByteArray)

    private val lock = Object()
    private val chunks = ArrayList<Chunk>()
    private var totalSize: Long = 0
    private var closed = false
    private var finished = false
    private var lastReadIndex = 0

    fun append(data: ByteArray) {
        if (data.isEmpty()) return
        synchronized(lock) {
            if (closed || finished) return
            val chunk = Chunk(totalSize, data)
            chunks.add(chunk)
            totalSize += data.size.toLong()
            lock.notifyAll()
        }
    }

    fun finish() {
        synchronized(lock) {
            if (closed) return
            finished = true
            lock.notifyAll()
        }
    }

    fun fail() {
        synchronized(lock) {
            closed = true
            lock.notifyAll()
        }
    }

    override fun readAt(position: Long, buffer: ByteArray, offset: Int, size: Int): Int {
        if (position < 0) return -1
        synchronized(lock) {
            while (!closed && !finished && position >= totalSize) {
                lock.wait()
            }
            if (closed) return -1
            if (position >= totalSize && finished) return -1

            val available = (totalSize - position).toInt()
            val toRead = min(size, available)
            var remaining = toRead
            var destOffset = offset
            var pos = position

            var index = findChunkIndex(pos)
            while (remaining > 0 && index < chunks.size) {
                val chunk = chunks[index]
                val inChunkOffset = (pos - chunk.start).toInt()
                if (inChunkOffset >= chunk.data.size) {
                    index++
                    continue
                }
                val copyLen = min(remaining, chunk.data.size - inChunkOffset)
                System.arraycopy(chunk.data, inChunkOffset, buffer, destOffset, copyLen)
                remaining -= copyLen
                destOffset += copyLen
                pos += copyLen
                if (inChunkOffset + copyLen >= chunk.data.size) {
                    index++
                }
            }

            return toRead - remaining
        }
    }

    override fun getSize(): Long = -1

    override fun close() {
        synchronized(lock) {
            closed = true
            lock.notifyAll()
        }
    }

    private fun findChunkIndex(position: Long): Int {
        var index = lastReadIndex
        while (index < chunks.size) {
            val chunk = chunks[index]
            if (position < chunk.start + chunk.data.size) break
            index++
        }
        lastReadIndex = index
        return index
    }
}
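Note (not part of the diff): StreamingMediaDataSource gives MediaPlayer a random-access view over bytes that are still arriving — readAt blocks until the requested offset is covered, append feeds chunks from the network thread, and finish/fail/close unblock the reader. A minimal sketch of that handshake, with made-up byte values and a hypothetical demo function name (MediaDataSource is an Android class, so this only runs on-device):

```kotlin
import kotlin.concurrent.thread

fun demoStreamingSource() {
    val source = StreamingMediaDataSource()

    // Producer side: feed two chunks with a delay, then mark the stream complete.
    thread {
        source.append(byteArrayOf(1, 2, 3))
        Thread.sleep(100)
        source.append(byteArrayOf(4, 5))
        source.finish()
    }

    // Consumer side: readAt blocks until bytes at `position` exist, or finish()/fail()/close() wakes it.
    val buffer = ByteArray(8)
    val first = source.readAt(0, buffer, 0, 4)                        // returns 3 once the first chunk lands
    val second = source.readAt(first.toLong(), buffer, first, 4)      // waits for the second chunk, returns 2
    val end = source.readAt((first + second).toLong(), buffer, 0, 4)  // -1: stream finished, no more data
    println("read $first + $second bytes, then $end")
    source.close()
}
```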
@@ -18,7 +18,6 @@ import android.speech.tts.UtteranceProgressListener
import android.util.Log
import androidx.core.content.ContextCompat
import com.steipete.clawdis.node.bridge.BridgeSession
import java.io.File
import java.net.HttpURLConnection
import java.net.URL
import java.util.UUID
@@ -44,6 +43,7 @@ class TalkModeManager(
) {
    companion object {
        private const val tag = "TalkMode"
        private const val defaultModelIdFallback = "eleven_v3"
    }

    private val mainHandler = Handler(Looper.getMainLooper())
@@ -81,6 +81,7 @@ class TalkModeManager(

    private var defaultVoiceId: String? = null
    private var currentVoiceId: String? = null
    private var fallbackVoiceId: String? = null
    private var defaultModelId: String? = null
    private var currentModelId: String? = null
    private var defaultOutputFormat: String? = null
@@ -97,7 +98,7 @@ class TalkModeManager(
    private var chatSubscribedSessionKey: String? = null

    private var player: MediaPlayer? = null
    private var currentAudioFile: File? = null
    private var streamingSource: StreamingMediaDataSource? = null
    private var systemTts: TextToSpeech? = null
    private var systemTtsPending: CompletableDeferred<Unit>? = null
    private var systemTtsPendingId: String? = null
@@ -464,7 +465,13 @@ class TalkModeManager(
        val apiKey =
            apiKey?.trim()?.takeIf { it.isNotEmpty() }
                ?: System.getenv("ELEVENLABS_API_KEY")?.trim()
        val voiceId = resolvedVoice ?: currentVoiceId ?: defaultVoiceId
        val preferredVoice = resolvedVoice ?: currentVoiceId ?: defaultVoiceId
        val voiceId =
            if (!apiKey.isNullOrEmpty()) {
                resolveVoiceId(preferredVoice, apiKey)
            } else {
                null
            }

        _statusText.value = "Speaking…"
        _isSpeaking.value = true
@@ -486,24 +493,25 @@ class TalkModeManager(
            } else {
                _usingFallbackTts.value = false
                val ttsStarted = SystemClock.elapsedRealtime()
                val modelId = directive?.modelId ?: currentModelId ?: defaultModelId
                val request =
                    ElevenLabsRequest(
                        text = cleaned,
                        modelId = directive?.modelId ?: currentModelId ?: defaultModelId,
                        modelId = modelId,
                        outputFormat =
                            TalkModeRuntime.validatedOutputFormat(directive?.outputFormat ?: defaultOutputFormat),
                        speed = TalkModeRuntime.resolveSpeed(directive?.speed, directive?.rateWpm),
                        stability = TalkModeRuntime.validatedUnit(directive?.stability),
                        stability = TalkModeRuntime.validatedStability(directive?.stability, modelId),
                        similarity = TalkModeRuntime.validatedUnit(directive?.similarity),
                        style = TalkModeRuntime.validatedUnit(directive?.style),
                        speakerBoost = directive?.speakerBoost,
                        seed = TalkModeRuntime.validatedSeed(directive?.seed),
                        normalize = TalkModeRuntime.validatedNormalize(directive?.normalize),
                        language = TalkModeRuntime.validatedLanguage(directive?.language),
                        latencyTier = TalkModeRuntime.validatedLatencyTier(directive?.latencyTier),
                    )
                val audio = synthesize(voiceId = voiceId!!, apiKey = apiKey!!, request = request)
                Log.d(tag, "elevenlabs ok bytes=${audio.size} durMs=${SystemClock.elapsedRealtime() - ttsStarted}")
                playAudio(audio)
                streamAndPlay(voiceId = voiceId!!, apiKey = apiKey!!, request = request)
                Log.d(tag, "elevenlabs stream ok durMs=${SystemClock.elapsedRealtime() - ttsStarted}")
            }
        } catch (err: Throwable) {
            Log.w(tag, "speak failed: ${err.message ?: err::class.simpleName}; falling back to system voice")
@@ -520,22 +528,28 @@ class TalkModeManager(
        _isSpeaking.value = false
    }

    private suspend fun playAudio(data: ByteArray) {
    private suspend fun streamAndPlay(voiceId: String, apiKey: String, request: ElevenLabsRequest) {
        stopSpeaking(resetInterrupt = false)
        val file = File.createTempFile("talk-", ".mp3", context.cacheDir)
        file.writeBytes(data)
        currentAudioFile = file

        val dataSource = StreamingMediaDataSource()
        streamingSource = dataSource

        val player = MediaPlayer()
        this.player = player

        val prepared = CompletableDeferred<Unit>()
        val finished = CompletableDeferred<Unit>()

        player.setAudioAttributes(
            AudioAttributes.Builder()
                .setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
                .setUsage(AudioAttributes.USAGE_ASSISTANT)
                .build(),
        )
        player.setOnPreparedListener {
            it.start()
            prepared.complete(Unit)
        }
        player.setOnCompletionListener {
            finished.complete(Unit)
        }
@@ -544,16 +558,30 @@ class TalkModeManager(
            true
        }

        player.setDataSource(file.absolutePath)
        player.setDataSource(dataSource)
        withContext(Dispatchers.Main) {
            player.setOnPreparedListener { it.start() }
            player.prepareAsync()
        }

        val fetchError = CompletableDeferred<Throwable?>()
        val fetchJob =
            scope.launch(Dispatchers.IO) {
                try {
                    streamTts(voiceId = voiceId, apiKey = apiKey, request = request, sink = dataSource)
                    fetchError.complete(null)
                } catch (err: Throwable) {
                    dataSource.fail()
                    fetchError.complete(err)
                }
            }

        Log.d(tag, "play start")
        try {
            prepared.await()
            finished.await()
            fetchError.await()?.let { throw it }
        } finally {
            fetchJob.cancel()
            cleanupPlayer()
        }
        Log.d(tag, "play done")
@@ -674,8 +702,8 @@ class TalkModeManager(
        player?.stop()
        player?.release()
        player = null
        currentAudioFile?.delete()
        currentAudioFile = null
        streamingSource?.close()
        streamingSource = null
    }

    private fun shouldInterrupt(transcript: String): Boolean {
@@ -713,13 +741,15 @@ class TalkModeManager(
            defaultVoiceId = voice ?: envVoice?.takeIf { it.isNotEmpty() } ?: sagVoice?.takeIf { it.isNotEmpty() }
            voiceAliases = aliases
            if (!voiceOverrideActive) currentVoiceId = defaultVoiceId
            defaultModelId = model
            defaultModelId = model ?: defaultModelIdFallback
            if (!modelOverrideActive) currentModelId = defaultModelId
            defaultOutputFormat = outputFormat
            apiKey = key ?: envKey?.takeIf { it.isNotEmpty() }
            if (interrupt != null) interruptOnSpeech = interrupt
        } catch (_: Throwable) {
            defaultVoiceId = envVoice?.takeIf { it.isNotEmpty() } ?: sagVoice?.takeIf { it.isNotEmpty() }
            defaultModelId = defaultModelIdFallback
            if (!modelOverrideActive) currentModelId = defaultModelId
            apiKey = envKey?.takeIf { it.isNotEmpty() }
            voiceAliases = emptyMap()
        }
@@ -730,9 +760,21 @@ class TalkModeManager(
        return obj["runId"].asStringOrNull()
    }

    private suspend fun synthesize(voiceId: String, apiKey: String, request: ElevenLabsRequest): ByteArray {
        return withContext(Dispatchers.IO) {
            val url = URL("https://api.elevenlabs.io/v1/text-to-speech/$voiceId")
    private suspend fun streamTts(
        voiceId: String,
        apiKey: String,
        request: ElevenLabsRequest,
        sink: StreamingMediaDataSource,
    ) {
        withContext(Dispatchers.IO) {
            val baseUrl = "https://api.elevenlabs.io/v1/text-to-speech/$voiceId/stream"
            val latencyTier = request.latencyTier
            val url =
                if (latencyTier != null) {
                    URL("$baseUrl?optimize_streaming_latency=$latencyTier")
                } else {
                    URL(baseUrl)
                }
            val conn = url.openConnection() as HttpURLConnection
            conn.requestMethod = "POST"
            conn.connectTimeout = 30_000
@@ -746,13 +788,21 @@ class TalkModeManager(
            conn.outputStream.use { it.write(payload.toByteArray()) }

            val code = conn.responseCode
            val stream = if (code >= 400) conn.errorStream else conn.inputStream
            val data = stream.readBytes()
            if (code >= 400) {
                val message = String(data)
                val message = conn.errorStream?.readBytes()?.toString(Charsets.UTF_8) ?: ""
                sink.fail()
                throw IllegalStateException("ElevenLabs failed: $code $message")
            }
            data

            val buffer = ByteArray(8 * 1024)
            conn.inputStream.use { input ->
                while (true) {
                    val read = input.read(buffer)
                    if (read <= 0) break
                    sink.append(buffer.copyOf(read))
                }
            }
            sink.finish()
        }
    }
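Note (not part of the diff): the commit swaps the buffered synthesize() + playAudio() path for a producer/consumer pair — streamTts pushes HTTP response chunks into the data source while MediaPlayer is already playing from it. A condensed sketch of that wiring against a generic chunked HTTP source (the function name, URL handling, and error handling here are placeholders, not the exact ElevenLabs request the commit builds):

```kotlin
import android.media.MediaPlayer
import java.net.HttpURLConnection
import java.net.URL
import kotlin.concurrent.thread

// Illustrative only: stream any HTTP audio response into MediaPlayer via StreamingMediaDataSource.
fun playStreamed(player: MediaPlayer, audioUrl: String) {
    val source = StreamingMediaDataSource()
    player.setDataSource(source)               // MediaPlayer pulls via readAt while bytes are still arriving
    player.setOnPreparedListener { it.start() }
    player.prepareAsync()

    thread {
        val conn = URL(audioUrl).openConnection() as HttpURLConnection
        try {
            val buffer = ByteArray(8 * 1024)
            conn.inputStream.use { input ->
                while (true) {
                    val read = input.read(buffer)
                    if (read <= 0) break
                    source.append(buffer.copyOf(read))  // wakes any readAt call blocked on this range
                }
            }
            source.finish()                             // lets readAt return -1 at end of stream
        } catch (err: Throwable) {
            source.fail()                               // unblocks the player so playback errors out
        } finally {
            conn.disconnect()
        }
    }
}
```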
@@ -794,6 +844,7 @@ class TalkModeManager(
        val seed: Long?,
        val normalize: String?,
        val language: String?,
        val latencyTier: Int?,
    )

    private object TalkModeRuntime {
@@ -816,6 +867,15 @@ class TalkModeManager(
            return value
        }

        fun validatedStability(value: Double?, modelId: String?): Double? {
            if (value == null) return null
            val normalized = modelId?.trim()?.lowercase()
            if (normalized == "eleven_v3") {
                return if (value == 0.0 || value == 0.5 || value == 1.0) value else null
            }
            return validatedUnit(value)
        }

        fun validatedSeed(value: Long?): Long? {
            if (value == null) return null
            if (value < 0 || value > 4294967295L) return null
@@ -840,6 +900,12 @@ class TalkModeManager(
            return if (trimmed.startsWith("mp3_")) trimmed else null
        }

        fun validatedLatencyTier(value: Int?): Int? {
            if (value == null) return null
            if (value < 0 || value > 4) return null
            return value
        }

        fun isMessageTimestampAfter(timestamp: Double, sinceSeconds: Double): Boolean {
            val sinceMs = sinceSeconds * 1000
            return if (timestamp > 10_000_000_000) {
@@ -876,6 +942,62 @@ class TalkModeManager(
        return if (isLikelyVoiceId(trimmed)) trimmed else null
    }

    private suspend fun resolveVoiceId(preferred: String?, apiKey: String): String? {
        val trimmed = preferred?.trim().orEmpty()
        if (trimmed.isNotEmpty()) {
            val resolved = resolveVoiceAlias(trimmed)
            if (resolved != null) return resolved
            Log.w(tag, "unknown voice alias $trimmed")
        }
        fallbackVoiceId?.let { return it }

        return try {
            val voices = listVoices(apiKey)
            val first = voices.firstOrNull() ?: return null
            fallbackVoiceId = first.voiceId
            if (defaultVoiceId.isNullOrBlank()) {
                defaultVoiceId = first.voiceId
            }
            if (!voiceOverrideActive) {
                currentVoiceId = first.voiceId
            }
            val name = first.name ?: "unknown"
            Log.d(tag, "default voice selected $name (${first.voiceId})")
            first.voiceId
        } catch (err: Throwable) {
            Log.w(tag, "list voices failed: ${err.message ?: err::class.simpleName}")
            null
        }
    }

    private suspend fun listVoices(apiKey: String): List<ElevenLabsVoice> {
        return withContext(Dispatchers.IO) {
            val url = URL("https://api.elevenlabs.io/v1/voices")
            val conn = url.openConnection() as HttpURLConnection
            conn.requestMethod = "GET"
            conn.connectTimeout = 15_000
            conn.readTimeout = 15_000
            conn.setRequestProperty("xi-api-key", apiKey)

            val code = conn.responseCode
            val stream = if (code >= 400) conn.errorStream else conn.inputStream
            val data = stream.readBytes()
            if (code >= 400) {
                val message = data.toString(Charsets.UTF_8)
                throw IllegalStateException("ElevenLabs voices failed: $code $message")
            }

            val root = json.parseToJsonElement(data.toString(Charsets.UTF_8)).asObjectOrNull()
            val voices = (root?.get("voices") as? JsonArray) ?: JsonArray(emptyList())
            voices.mapNotNull { entry ->
                val obj = entry.asObjectOrNull() ?: return@mapNotNull null
                val voiceId = obj["voice_id"].asStringOrNull() ?: return@mapNotNull null
                val name = obj["name"].asStringOrNull()
                ElevenLabsVoice(voiceId, name)
            }
        }
    }

    private fun isLikelyVoiceId(value: String): Boolean {
        if (value.length < 10) return false
        return value.all { it.isLetterOrDigit() || it == '-' || it == '_' }
@@ -884,6 +1006,8 @@ class TalkModeManager(
    private fun normalizeAliasKey(value: String): String =
        value.trim().lowercase()

    private data class ElevenLabsVoice(val voiceId: String, val name: String?)

    private val listener =
        object : RecognitionListener {
            override fun onReadyForSpeech(params: Bundle?) {