fix: stream elevenlabs tts playback

This commit is contained in:
Peter Steinberger
2025-12-30 12:17:40 +01:00
parent 9c532eac07
commit 27adfb76fa
11 changed files with 1091 additions and 91 deletions

View File

@@ -17,6 +17,9 @@
- macOS Talk Mode: throttle audio-level updates (avoid per-buffer task creation) to reduce CPU/task churn. - macOS Talk Mode: throttle audio-level updates (avoid per-buffer task creation) to reduce CPU/task churn.
- macOS Talk Mode: increase overlay window size so wave rings don't clip; close button is hover-only and closer to the orb. - macOS Talk Mode: increase overlay window size so wave rings don't clip; close button is hover-only and closer to the orb.
- Talk Mode: fall back to system TTS when ElevenLabs is unavailable, returns non-audio, or playback fails (macOS/iOS/Android). - Talk Mode: fall back to system TTS when ElevenLabs is unavailable, returns non-audio, or playback fails (macOS/iOS/Android).
- Talk Mode: stream PCM on macOS/iOS for lower latency (incremental playback); Android continues MP3 streaming.
- Talk Mode: validate ElevenLabs v3 stability and latency tier directives before sending requests.
- iOS/Android Talk Mode: auto-select the first ElevenLabs voice when none is configured.
- ElevenLabs: add retry/backoff for 429/5xx and include content-type in errors for debugging. - ElevenLabs: add retry/backoff for 429/5xx and include content-type in errors for debugging.
- Talk Mode: align to the gateway's main session key and fall back to history polling when chat events drop (prevents stuck “thinking” / missing messages). - Talk Mode: align to the gateway's main session key and fall back to history polling when chat events drop (prevents stuck “thinking” / missing messages).
- Talk Mode: treat history timestamps as seconds or milliseconds to avoid stale assistant picks (macOS/iOS/Android). - Talk Mode: treat history timestamps as seconds or milliseconds to avoid stale assistant picks (macOS/iOS/Android).

View File

@@ -0,0 +1,98 @@
package com.steipete.clawdis.node.voice
import android.media.MediaDataSource
import kotlin.math.min
/**
 * A [MediaDataSource] that can be fed encoded audio incrementally while a
 * [android.media.MediaPlayer] is already reading from it.
 *
 * Producer calls [append] as network chunks arrive, then [finish] on success
 * or [fail] on error. The player's reader thread blocks inside [readAt] until
 * the requested range is buffered, the stream is finished, or the source is
 * failed/closed.
 *
 * Thread-safety: all shared state is guarded by [lock]; readers wait on it and
 * writers notify.
 */
internal class StreamingMediaDataSource : MediaDataSource() {
    // One appended network chunk; `start` is its absolute byte offset in the stream.
    private data class Chunk(val start: Long, val data: ByteArray)

    private val lock = Object()
    private val chunks = ArrayList<Chunk>()
    private var totalSize: Long = 0
    private var closed = false
    private var finished = false
    // Hint: chunk index where the previous read ended. Reads at earlier
    // positions must reset it — MediaPlayer may seek backwards while probing.
    private var lastReadIndex = 0

    /** Appends one chunk of audio bytes. No-op after [finish], [fail] or [close]. */
    fun append(data: ByteArray) {
        if (data.isEmpty()) return
        synchronized(lock) {
            if (closed || finished) return
            chunks.add(Chunk(totalSize, data))
            totalSize += data.size.toLong()
            lock.notifyAll()
        }
    }

    /** Marks the stream complete; blocked readers past the end then see EOF (-1). */
    fun finish() {
        synchronized(lock) {
            if (closed) return
            finished = true
            lock.notifyAll()
        }
    }

    /** Marks the stream failed; all current and future reads return -1. */
    fun fail() {
        synchronized(lock) {
            closed = true
            lock.notifyAll()
        }
    }

    override fun readAt(position: Long, buffer: ByteArray, offset: Int, size: Int): Int {
        if (position < 0) return -1
        // Guard: a non-positive size previously produced a negative return value.
        if (size <= 0) return 0
        synchronized(lock) {
            // Block until the requested offset is buffered or the stream ends.
            while (!closed && !finished && position >= totalSize) {
                try {
                    lock.wait()
                } catch (_: InterruptedException) {
                    // Preserve interrupt status and report end-of-stream to the player.
                    Thread.currentThread().interrupt()
                    return -1
                }
            }
            if (closed) return -1
            if (position >= totalSize && finished) return -1
            // Clamp in Long math first: (totalSize - position) may exceed Int range.
            val toRead = min(size.toLong(), totalSize - position).toInt()
            var remaining = toRead
            var destOffset = offset
            var pos = position
            var index = findChunkIndex(pos)
            while (remaining > 0 && index < chunks.size) {
                val chunk = chunks[index]
                val inChunkOffset = (pos - chunk.start).toInt()
                if (inChunkOffset >= chunk.data.size) {
                    index++
                    continue
                }
                val copyLen = min(remaining, chunk.data.size - inChunkOffset)
                System.arraycopy(chunk.data, inChunkOffset, buffer, destOffset, copyLen)
                remaining -= copyLen
                destOffset += copyLen
                pos += copyLen
                if (inChunkOffset + copyLen >= chunk.data.size) {
                    index++
                }
            }
            return toRead - remaining
        }
    }

    /** Total size is unknown while streaming; -1 tells MediaPlayer so. */
    override fun getSize(): Long = -1

    override fun close() {
        synchronized(lock) {
            closed = true
            lock.notifyAll()
        }
    }

    /**
     * Returns the index of the chunk containing [position] (chunks are sorted
     * by start offset). Scans forward from the cached hint.
     *
     * Bug fix: the previous forward-only scan returned a too-late chunk on
     * backward seeks, yielding a negative in-chunk offset and an
     * IndexOutOfBoundsException in System.arraycopy; reset the hint to 0 when
     * [position] lies before the hinted chunk.
     */
    private fun findChunkIndex(position: Long): Int {
        var index = lastReadIndex
        if (index >= chunks.size || position < chunks[index].start) {
            index = 0
        }
        while (index < chunks.size) {
            val chunk = chunks[index]
            if (position < chunk.start + chunk.data.size) break
            index++
        }
        lastReadIndex = index
        return index
    }
}

View File

@@ -18,7 +18,6 @@ import android.speech.tts.UtteranceProgressListener
import android.util.Log import android.util.Log
import androidx.core.content.ContextCompat import androidx.core.content.ContextCompat
import com.steipete.clawdis.node.bridge.BridgeSession import com.steipete.clawdis.node.bridge.BridgeSession
import java.io.File
import java.net.HttpURLConnection import java.net.HttpURLConnection
import java.net.URL import java.net.URL
import java.util.UUID import java.util.UUID
@@ -44,6 +43,7 @@ class TalkModeManager(
) { ) {
companion object { companion object {
private const val tag = "TalkMode" private const val tag = "TalkMode"
private const val defaultModelIdFallback = "eleven_v3"
} }
private val mainHandler = Handler(Looper.getMainLooper()) private val mainHandler = Handler(Looper.getMainLooper())
@@ -81,6 +81,7 @@ class TalkModeManager(
private var defaultVoiceId: String? = null private var defaultVoiceId: String? = null
private var currentVoiceId: String? = null private var currentVoiceId: String? = null
private var fallbackVoiceId: String? = null
private var defaultModelId: String? = null private var defaultModelId: String? = null
private var currentModelId: String? = null private var currentModelId: String? = null
private var defaultOutputFormat: String? = null private var defaultOutputFormat: String? = null
@@ -97,7 +98,7 @@ class TalkModeManager(
private var chatSubscribedSessionKey: String? = null private var chatSubscribedSessionKey: String? = null
private var player: MediaPlayer? = null private var player: MediaPlayer? = null
private var currentAudioFile: File? = null private var streamingSource: StreamingMediaDataSource? = null
private var systemTts: TextToSpeech? = null private var systemTts: TextToSpeech? = null
private var systemTtsPending: CompletableDeferred<Unit>? = null private var systemTtsPending: CompletableDeferred<Unit>? = null
private var systemTtsPendingId: String? = null private var systemTtsPendingId: String? = null
@@ -464,7 +465,13 @@ class TalkModeManager(
val apiKey = val apiKey =
apiKey?.trim()?.takeIf { it.isNotEmpty() } apiKey?.trim()?.takeIf { it.isNotEmpty() }
?: System.getenv("ELEVENLABS_API_KEY")?.trim() ?: System.getenv("ELEVENLABS_API_KEY")?.trim()
val voiceId = resolvedVoice ?: currentVoiceId ?: defaultVoiceId val preferredVoice = resolvedVoice ?: currentVoiceId ?: defaultVoiceId
val voiceId =
if (!apiKey.isNullOrEmpty()) {
resolveVoiceId(preferredVoice, apiKey)
} else {
null
}
_statusText.value = "Speaking…" _statusText.value = "Speaking…"
_isSpeaking.value = true _isSpeaking.value = true
@@ -486,24 +493,25 @@ class TalkModeManager(
} else { } else {
_usingFallbackTts.value = false _usingFallbackTts.value = false
val ttsStarted = SystemClock.elapsedRealtime() val ttsStarted = SystemClock.elapsedRealtime()
val modelId = directive?.modelId ?: currentModelId ?: defaultModelId
val request = val request =
ElevenLabsRequest( ElevenLabsRequest(
text = cleaned, text = cleaned,
modelId = directive?.modelId ?: currentModelId ?: defaultModelId, modelId = modelId,
outputFormat = outputFormat =
TalkModeRuntime.validatedOutputFormat(directive?.outputFormat ?: defaultOutputFormat), TalkModeRuntime.validatedOutputFormat(directive?.outputFormat ?: defaultOutputFormat),
speed = TalkModeRuntime.resolveSpeed(directive?.speed, directive?.rateWpm), speed = TalkModeRuntime.resolveSpeed(directive?.speed, directive?.rateWpm),
stability = TalkModeRuntime.validatedUnit(directive?.stability), stability = TalkModeRuntime.validatedStability(directive?.stability, modelId),
similarity = TalkModeRuntime.validatedUnit(directive?.similarity), similarity = TalkModeRuntime.validatedUnit(directive?.similarity),
style = TalkModeRuntime.validatedUnit(directive?.style), style = TalkModeRuntime.validatedUnit(directive?.style),
speakerBoost = directive?.speakerBoost, speakerBoost = directive?.speakerBoost,
seed = TalkModeRuntime.validatedSeed(directive?.seed), seed = TalkModeRuntime.validatedSeed(directive?.seed),
normalize = TalkModeRuntime.validatedNormalize(directive?.normalize), normalize = TalkModeRuntime.validatedNormalize(directive?.normalize),
language = TalkModeRuntime.validatedLanguage(directive?.language), language = TalkModeRuntime.validatedLanguage(directive?.language),
latencyTier = TalkModeRuntime.validatedLatencyTier(directive?.latencyTier),
) )
val audio = synthesize(voiceId = voiceId!!, apiKey = apiKey!!, request = request) streamAndPlay(voiceId = voiceId!!, apiKey = apiKey!!, request = request)
Log.d(tag, "elevenlabs ok bytes=${audio.size} durMs=${SystemClock.elapsedRealtime() - ttsStarted}") Log.d(tag, "elevenlabs stream ok durMs=${SystemClock.elapsedRealtime() - ttsStarted}")
playAudio(audio)
} }
} catch (err: Throwable) { } catch (err: Throwable) {
Log.w(tag, "speak failed: ${err.message ?: err::class.simpleName}; falling back to system voice") Log.w(tag, "speak failed: ${err.message ?: err::class.simpleName}; falling back to system voice")
@@ -520,22 +528,28 @@ class TalkModeManager(
_isSpeaking.value = false _isSpeaking.value = false
} }
private suspend fun playAudio(data: ByteArray) { private suspend fun streamAndPlay(voiceId: String, apiKey: String, request: ElevenLabsRequest) {
stopSpeaking(resetInterrupt = false) stopSpeaking(resetInterrupt = false)
val file = File.createTempFile("talk-", ".mp3", context.cacheDir)
file.writeBytes(data) val dataSource = StreamingMediaDataSource()
currentAudioFile = file streamingSource = dataSource
val player = MediaPlayer() val player = MediaPlayer()
this.player = player this.player = player
val prepared = CompletableDeferred<Unit>()
val finished = CompletableDeferred<Unit>() val finished = CompletableDeferred<Unit>()
player.setAudioAttributes( player.setAudioAttributes(
AudioAttributes.Builder() AudioAttributes.Builder()
.setContentType(AudioAttributes.CONTENT_TYPE_SPEECH) .setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
.setUsage(AudioAttributes.USAGE_ASSISTANT) .setUsage(AudioAttributes.USAGE_ASSISTANT)
.build(), .build(),
) )
player.setOnPreparedListener {
it.start()
prepared.complete(Unit)
}
player.setOnCompletionListener { player.setOnCompletionListener {
finished.complete(Unit) finished.complete(Unit)
} }
@@ -544,16 +558,30 @@ class TalkModeManager(
true true
} }
player.setDataSource(file.absolutePath) player.setDataSource(dataSource)
withContext(Dispatchers.Main) { withContext(Dispatchers.Main) {
player.setOnPreparedListener { it.start() }
player.prepareAsync() player.prepareAsync()
} }
val fetchError = CompletableDeferred<Throwable?>()
val fetchJob =
scope.launch(Dispatchers.IO) {
try {
streamTts(voiceId = voiceId, apiKey = apiKey, request = request, sink = dataSource)
fetchError.complete(null)
} catch (err: Throwable) {
dataSource.fail()
fetchError.complete(err)
}
}
Log.d(tag, "play start") Log.d(tag, "play start")
try { try {
prepared.await()
finished.await() finished.await()
fetchError.await()?.let { throw it }
} finally { } finally {
fetchJob.cancel()
cleanupPlayer() cleanupPlayer()
} }
Log.d(tag, "play done") Log.d(tag, "play done")
@@ -674,8 +702,8 @@ class TalkModeManager(
player?.stop() player?.stop()
player?.release() player?.release()
player = null player = null
currentAudioFile?.delete() streamingSource?.close()
currentAudioFile = null streamingSource = null
} }
private fun shouldInterrupt(transcript: String): Boolean { private fun shouldInterrupt(transcript: String): Boolean {
@@ -713,13 +741,15 @@ class TalkModeManager(
defaultVoiceId = voice ?: envVoice?.takeIf { it.isNotEmpty() } ?: sagVoice?.takeIf { it.isNotEmpty() } defaultVoiceId = voice ?: envVoice?.takeIf { it.isNotEmpty() } ?: sagVoice?.takeIf { it.isNotEmpty() }
voiceAliases = aliases voiceAliases = aliases
if (!voiceOverrideActive) currentVoiceId = defaultVoiceId if (!voiceOverrideActive) currentVoiceId = defaultVoiceId
defaultModelId = model defaultModelId = model ?: defaultModelIdFallback
if (!modelOverrideActive) currentModelId = defaultModelId if (!modelOverrideActive) currentModelId = defaultModelId
defaultOutputFormat = outputFormat defaultOutputFormat = outputFormat
apiKey = key ?: envKey?.takeIf { it.isNotEmpty() } apiKey = key ?: envKey?.takeIf { it.isNotEmpty() }
if (interrupt != null) interruptOnSpeech = interrupt if (interrupt != null) interruptOnSpeech = interrupt
} catch (_: Throwable) { } catch (_: Throwable) {
defaultVoiceId = envVoice?.takeIf { it.isNotEmpty() } ?: sagVoice?.takeIf { it.isNotEmpty() } defaultVoiceId = envVoice?.takeIf { it.isNotEmpty() } ?: sagVoice?.takeIf { it.isNotEmpty() }
defaultModelId = defaultModelIdFallback
if (!modelOverrideActive) currentModelId = defaultModelId
apiKey = envKey?.takeIf { it.isNotEmpty() } apiKey = envKey?.takeIf { it.isNotEmpty() }
voiceAliases = emptyMap() voiceAliases = emptyMap()
} }
@@ -730,9 +760,21 @@ class TalkModeManager(
return obj["runId"].asStringOrNull() return obj["runId"].asStringOrNull()
} }
private suspend fun synthesize(voiceId: String, apiKey: String, request: ElevenLabsRequest): ByteArray { private suspend fun streamTts(
return withContext(Dispatchers.IO) { voiceId: String,
val url = URL("https://api.elevenlabs.io/v1/text-to-speech/$voiceId") apiKey: String,
request: ElevenLabsRequest,
sink: StreamingMediaDataSource,
) {
withContext(Dispatchers.IO) {
val baseUrl = "https://api.elevenlabs.io/v1/text-to-speech/$voiceId/stream"
val latencyTier = request.latencyTier
val url =
if (latencyTier != null) {
URL("$baseUrl?optimize_streaming_latency=$latencyTier")
} else {
URL(baseUrl)
}
val conn = url.openConnection() as HttpURLConnection val conn = url.openConnection() as HttpURLConnection
conn.requestMethod = "POST" conn.requestMethod = "POST"
conn.connectTimeout = 30_000 conn.connectTimeout = 30_000
@@ -746,13 +788,21 @@ class TalkModeManager(
conn.outputStream.use { it.write(payload.toByteArray()) } conn.outputStream.use { it.write(payload.toByteArray()) }
val code = conn.responseCode val code = conn.responseCode
val stream = if (code >= 400) conn.errorStream else conn.inputStream
val data = stream.readBytes()
if (code >= 400) { if (code >= 400) {
val message = String(data) val message = conn.errorStream?.readBytes()?.toString(Charsets.UTF_8) ?: ""
sink.fail()
throw IllegalStateException("ElevenLabs failed: $code $message") throw IllegalStateException("ElevenLabs failed: $code $message")
} }
data
val buffer = ByteArray(8 * 1024)
conn.inputStream.use { input ->
while (true) {
val read = input.read(buffer)
if (read <= 0) break
sink.append(buffer.copyOf(read))
}
}
sink.finish()
} }
} }
@@ -794,6 +844,7 @@ class TalkModeManager(
val seed: Long?, val seed: Long?,
val normalize: String?, val normalize: String?,
val language: String?, val language: String?,
val latencyTier: Int?,
) )
private object TalkModeRuntime { private object TalkModeRuntime {
@@ -816,6 +867,15 @@ class TalkModeManager(
return value return value
} }
fun validatedStability(value: Double?, modelId: String?): Double? {
if (value == null) return null
val normalized = modelId?.trim()?.lowercase()
if (normalized == "eleven_v3") {
return if (value == 0.0 || value == 0.5 || value == 1.0) value else null
}
return validatedUnit(value)
}
fun validatedSeed(value: Long?): Long? { fun validatedSeed(value: Long?): Long? {
if (value == null) return null if (value == null) return null
if (value < 0 || value > 4294967295L) return null if (value < 0 || value > 4294967295L) return null
@@ -840,6 +900,12 @@ class TalkModeManager(
return if (trimmed.startsWith("mp3_")) trimmed else null return if (trimmed.startsWith("mp3_")) trimmed else null
} }
fun validatedLatencyTier(value: Int?): Int? {
if (value == null) return null
if (value < 0 || value > 4) return null
return value
}
fun isMessageTimestampAfter(timestamp: Double, sinceSeconds: Double): Boolean { fun isMessageTimestampAfter(timestamp: Double, sinceSeconds: Double): Boolean {
val sinceMs = sinceSeconds * 1000 val sinceMs = sinceSeconds * 1000
return if (timestamp > 10_000_000_000) { return if (timestamp > 10_000_000_000) {
@@ -876,6 +942,62 @@ class TalkModeManager(
return if (isLikelyVoiceId(trimmed)) trimmed else null return if (isLikelyVoiceId(trimmed)) trimmed else null
} }
private suspend fun resolveVoiceId(preferred: String?, apiKey: String): String? {
val trimmed = preferred?.trim().orEmpty()
if (trimmed.isNotEmpty()) {
val resolved = resolveVoiceAlias(trimmed)
if (resolved != null) return resolved
Log.w(tag, "unknown voice alias $trimmed")
}
fallbackVoiceId?.let { return it }
return try {
val voices = listVoices(apiKey)
val first = voices.firstOrNull() ?: return null
fallbackVoiceId = first.voiceId
if (defaultVoiceId.isNullOrBlank()) {
defaultVoiceId = first.voiceId
}
if (!voiceOverrideActive) {
currentVoiceId = first.voiceId
}
val name = first.name ?: "unknown"
Log.d(tag, "default voice selected $name (${first.voiceId})")
first.voiceId
} catch (err: Throwable) {
Log.w(tag, "list voices failed: ${err.message ?: err::class.simpleName}")
null
}
}
private suspend fun listVoices(apiKey: String): List<ElevenLabsVoice> {
return withContext(Dispatchers.IO) {
val url = URL("https://api.elevenlabs.io/v1/voices")
val conn = url.openConnection() as HttpURLConnection
conn.requestMethod = "GET"
conn.connectTimeout = 15_000
conn.readTimeout = 15_000
conn.setRequestProperty("xi-api-key", apiKey)
val code = conn.responseCode
val stream = if (code >= 400) conn.errorStream else conn.inputStream
val data = stream.readBytes()
if (code >= 400) {
val message = data.toString(Charsets.UTF_8)
throw IllegalStateException("ElevenLabs voices failed: $code $message")
}
val root = json.parseToJsonElement(data.toString(Charsets.UTF_8)).asObjectOrNull()
val voices = (root?.get("voices") as? JsonArray) ?: JsonArray(emptyList())
voices.mapNotNull { entry ->
val obj = entry.asObjectOrNull() ?: return@mapNotNull null
val voiceId = obj["voice_id"].asStringOrNull() ?: return@mapNotNull null
val name = obj["name"].asStringOrNull()
ElevenLabsVoice(voiceId, name)
}
}
}
private fun isLikelyVoiceId(value: String): Boolean { private fun isLikelyVoiceId(value: String): Boolean {
if (value.length < 10) return false if (value.length < 10) return false
return value.all { it.isLetterOrDigit() || it == '-' || it == '_' } return value.all { it.isLetterOrDigit() || it == '-' || it == '_' }
@@ -884,6 +1006,8 @@ class TalkModeManager(
private fun normalizeAliasKey(value: String): String = private fun normalizeAliasKey(value: String): String =
value.trim().lowercase() value.trim().lowercase()
private data class ElevenLabsVoice(val voiceId: String, val name: String?)
private val listener = private val listener =
object : RecognitionListener { object : RecognitionListener {
override fun onReadyForSpeech(params: Bundle?) { override fun onReadyForSpeech(params: Bundle?) {

View File

@@ -9,6 +9,7 @@ import Speech
@Observable @Observable
final class TalkModeManager: NSObject { final class TalkModeManager: NSObject {
private typealias SpeechRequest = SFSpeechAudioBufferRecognitionRequest private typealias SpeechRequest = SFSpeechAudioBufferRecognitionRequest
private static let defaultModelIdFallback = "eleven_v3"
var isEnabled: Bool = false var isEnabled: Bool = false
var isListening: Bool = false var isListening: Bool = false
var isSpeaking: Bool = false var isSpeaking: Bool = false
@@ -36,11 +37,12 @@ final class TalkModeManager: NSObject {
private var voiceAliases: [String: String] = [:] private var voiceAliases: [String: String] = [:]
private var interruptOnSpeech: Bool = true private var interruptOnSpeech: Bool = true
private var mainSessionKey: String = "main" private var mainSessionKey: String = "main"
private var fallbackVoiceId: String?
private var lastPlaybackWasPCM: Bool = false
private var bridge: BridgeSession? private var bridge: BridgeSession?
private let silenceWindow: TimeInterval = 0.7 private let silenceWindow: TimeInterval = 0.7
private var player: AVAudioPlayer?
private var chatSubscribedSessionKeys = Set<String>() private var chatSubscribedSessionKeys = Set<String>()
private let logger = Logger(subsystem: "com.steipete.clawdis", category: "TalkMode") private let logger = Logger(subsystem: "com.steipete.clawdis", category: "TalkMode")
@@ -446,43 +448,43 @@ final class TalkModeManager: NSObject {
let started = Date() let started = Date()
let language = ElevenLabsTTSClient.validatedLanguage(directive?.language) let language = ElevenLabsTTSClient.validatedLanguage(directive?.language)
let voiceId = resolvedVoice ?? self.currentVoiceId ?? self.defaultVoiceId
let resolvedKey = let resolvedKey =
(self.apiKey?.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty == false ? self.apiKey : nil) ?? (self.apiKey?.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty == false ? self.apiKey : nil) ??
ProcessInfo.processInfo.environment["ELEVENLABS_API_KEY"] ProcessInfo.processInfo.environment["ELEVENLABS_API_KEY"]
let apiKey = resolvedKey?.trimmingCharacters(in: .whitespacesAndNewlines) let apiKey = resolvedKey?.trimmingCharacters(in: .whitespacesAndNewlines)
let preferredVoice = resolvedVoice ?? self.currentVoiceId ?? self.defaultVoiceId
let voiceId: String? = if let apiKey, !apiKey.isEmpty {
await self.resolveVoiceId(preferred: preferredVoice, apiKey: apiKey)
} else {
nil
}
let canUseElevenLabs = (voiceId?.isEmpty == false) && (apiKey?.isEmpty == false) let canUseElevenLabs = (voiceId?.isEmpty == false) && (apiKey?.isEmpty == false)
if canUseElevenLabs, let voiceId, let apiKey { if canUseElevenLabs, let voiceId, let apiKey {
let desiredOutputFormat = directive?.outputFormat ?? self.defaultOutputFormat let desiredOutputFormat = directive?.outputFormat ?? self.defaultOutputFormat ?? "pcm_44100"
let outputFormat = ElevenLabsTTSClient.validatedOutputFormat(desiredOutputFormat) let outputFormat = ElevenLabsTTSClient.validatedOutputFormat(desiredOutputFormat)
if outputFormat == nil, let desiredOutputFormat, !desiredOutputFormat.isEmpty { if outputFormat == nil, let desiredOutputFormat, !desiredOutputFormat.isEmpty {
self.logger.warning( self.logger.warning(
"talk output_format unsupported for local playback: \(desiredOutputFormat, privacy: .public)") "talk output_format unsupported for local playback: \(desiredOutputFormat, privacy: .public)")
} }
let modelId = directive?.modelId ?? self.currentModelId ?? self.defaultModelId
let request = ElevenLabsTTSRequest( let request = ElevenLabsTTSRequest(
text: cleaned, text: cleaned,
modelId: directive?.modelId ?? self.currentModelId ?? self.defaultModelId, modelId: modelId,
outputFormat: outputFormat, outputFormat: outputFormat,
speed: TalkTTSValidation.resolveSpeed(speed: directive?.speed, rateWPM: directive?.rateWPM), speed: TalkTTSValidation.resolveSpeed(speed: directive?.speed, rateWPM: directive?.rateWPM),
stability: TalkTTSValidation.validatedUnit(directive?.stability), stability: TalkTTSValidation.validatedStability(directive?.stability, modelId: modelId),
similarity: TalkTTSValidation.validatedUnit(directive?.similarity), similarity: TalkTTSValidation.validatedUnit(directive?.similarity),
style: TalkTTSValidation.validatedUnit(directive?.style), style: TalkTTSValidation.validatedUnit(directive?.style),
speakerBoost: directive?.speakerBoost, speakerBoost: directive?.speakerBoost,
seed: TalkTTSValidation.validatedSeed(directive?.seed), seed: TalkTTSValidation.validatedSeed(directive?.seed),
normalize: ElevenLabsTTSClient.validatedNormalize(directive?.normalize), normalize: ElevenLabsTTSClient.validatedNormalize(directive?.normalize),
language: language) language: language,
latencyTier: TalkTTSValidation.validatedLatencyTier(directive?.latencyTier))
let synthTimeoutSeconds = max(20.0, min(90.0, Double(cleaned.count) * 0.12))
let client = ElevenLabsTTSClient(apiKey: apiKey) let client = ElevenLabsTTSClient(apiKey: apiKey)
let audio = try await client.synthesizeWithHardTimeout( let stream = client.streamSynthesize(voiceId: voiceId, request: request)
voiceId: voiceId,
request: request,
hardTimeoutSeconds: synthTimeoutSeconds)
self.logger
.info(
"elevenlabs ok bytes=\(audio.count, privacy: .public) dur=\(Date().timeIntervalSince(started), privacy: .public)s")
if self.interruptOnSpeech { if self.interruptOnSpeech {
do { do {
@@ -494,7 +496,21 @@ final class TalkModeManager: NSObject {
} }
self.statusText = "Speaking…" self.statusText = "Speaking…"
try await self.playAudio(data: audio) let sampleRate = TalkTTSValidation.pcmSampleRate(from: outputFormat)
let result: StreamingPlaybackResult
if let sampleRate {
self.lastPlaybackWasPCM = true
result = await PCMStreamingAudioPlayer.shared.play(stream: stream, sampleRate: sampleRate)
} else {
self.lastPlaybackWasPCM = false
result = await StreamingAudioPlayer.shared.play(stream: stream)
}
self.logger
.info(
"elevenlabs stream finished=\(result.finished, privacy: .public) dur=\(Date().timeIntervalSince(started), privacy: .public)s")
if !result.finished, let interruptedAt = result.interruptedAt {
self.lastInterruptedAtSeconds = interruptedAt
}
} else { } else {
self.logger.warning("tts unavailable; falling back to system voice (missing key or voiceId)") self.logger.warning("tts unavailable; falling back to system voice (missing key or voiceId)")
if self.interruptOnSpeech { if self.interruptOnSpeech {
@@ -533,30 +549,17 @@ final class TalkModeManager: NSObject {
self.isSpeaking = false self.isSpeaking = false
} }
private func playAudio(data: Data) async throws {
self.player?.stop()
let player = try AVAudioPlayer(data: data)
self.player = player
player.prepareToPlay()
self.logger.info("play start")
guard player.play() else {
throw NSError(domain: "TalkMode", code: 2, userInfo: [
NSLocalizedDescriptionKey: "audio player refused to play",
])
}
while player.isPlaying {
try? await Task.sleep(nanoseconds: 120_000_000)
}
self.logger.info("play done")
}
private func stopSpeaking(storeInterruption: Bool = true) { private func stopSpeaking(storeInterruption: Bool = true) {
guard self.isSpeaking else { return } guard self.isSpeaking else { return }
let interruptedAt = self.lastPlaybackWasPCM
? PCMStreamingAudioPlayer.shared.stop()
: StreamingAudioPlayer.shared.stop()
if storeInterruption { if storeInterruption {
self.lastInterruptedAtSeconds = self.player?.currentTime self.lastInterruptedAtSeconds = interruptedAt
} }
self.player?.stop() _ = self.lastPlaybackWasPCM
self.player = nil ? StreamingAudioPlayer.shared.stop()
: PCMStreamingAudioPlayer.shared.stop()
TalkSystemSpeechSynthesizer.shared.stop() TalkSystemSpeechSynthesizer.shared.stop()
self.isSpeaking = false self.isSpeaking = false
} }
@@ -581,6 +584,37 @@ final class TalkModeManager: NSObject {
return Self.isLikelyVoiceId(trimmed) ? trimmed : nil return Self.isLikelyVoiceId(trimmed) ? trimmed : nil
} }
private func resolveVoiceId(preferred: String?, apiKey: String) async -> String? {
let trimmed = preferred?.trimmingCharacters(in: .whitespacesAndNewlines) ?? ""
if !trimmed.isEmpty {
if let resolved = self.resolveVoiceAlias(trimmed) { return resolved }
self.logger.warning("unknown voice alias \(trimmed, privacy: .public)")
}
if let fallbackVoiceId { return fallbackVoiceId }
do {
let voices = try await ElevenLabsTTSClient(apiKey: apiKey).listVoices()
guard let first = voices.first else {
self.logger.warning("elevenlabs voices list empty")
return nil
}
self.fallbackVoiceId = first.voiceId
if self.defaultVoiceId == nil {
self.defaultVoiceId = first.voiceId
}
if !self.voiceOverrideActive {
self.currentVoiceId = first.voiceId
}
let name = first.name ?? "unknown"
self.logger
.info("default voice selected \(name, privacy: .public) (\(first.voiceId, privacy: .public))")
return first.voiceId
} catch {
self.logger.error("elevenlabs list voices failed: \(error.localizedDescription, privacy: .public)")
return nil
}
}
private static func isLikelyVoiceId(_ value: String) -> Bool { private static func isLikelyVoiceId(_ value: String) -> Bool {
guard value.count >= 10 else { return false } guard value.count >= 10 else { return false }
return value.allSatisfy { $0.isLetter || $0.isNumber || $0 == "-" || $0 == "_" } return value.allSatisfy { $0.isLetter || $0.isNumber || $0 == "-" || $0 == "_" }
@@ -598,22 +632,23 @@ final class TalkModeManager: NSObject {
self.mainSessionKey = rawMainKey.isEmpty ? "main" : rawMainKey self.mainSessionKey = rawMainKey.isEmpty ? "main" : rawMainKey
self.defaultVoiceId = (talk?["voiceId"] as? String)?.trimmingCharacters(in: .whitespacesAndNewlines) self.defaultVoiceId = (talk?["voiceId"] as? String)?.trimmingCharacters(in: .whitespacesAndNewlines)
if let aliases = talk?["voiceAliases"] as? [String: Any] { if let aliases = talk?["voiceAliases"] as? [String: Any] {
self.voiceAliases = var resolved: [String: String] = [:]
aliases.compactMap { key, value in for (key, value) in aliases {
guard let id = value as? String else { return nil } guard let id = value as? String else { continue }
let normalizedKey = key.trimmingCharacters(in: .whitespacesAndNewlines).lowercased() let normalizedKey = key.trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
let trimmedId = id.trimmingCharacters(in: .whitespacesAndNewlines) let trimmedId = id.trimmingCharacters(in: .whitespacesAndNewlines)
guard !normalizedKey.isEmpty, !trimmedId.isEmpty else { return nil } guard !normalizedKey.isEmpty, !trimmedId.isEmpty else { continue }
return (normalizedKey, trimmedId) resolved[normalizedKey] = trimmedId
} }
.reduce(into: [:]) { $0[$1.0] = $1.1 } self.voiceAliases = resolved
} else { } else {
self.voiceAliases = [:] self.voiceAliases = [:]
} }
if !self.voiceOverrideActive { if !self.voiceOverrideActive {
self.currentVoiceId = self.defaultVoiceId self.currentVoiceId = self.defaultVoiceId
} }
self.defaultModelId = (talk?["modelId"] as? String)?.trimmingCharacters(in: .whitespacesAndNewlines) let model = (talk?["modelId"] as? String)?.trimmingCharacters(in: .whitespacesAndNewlines)
self.defaultModelId = (model?.isEmpty == false) ? model : Self.defaultModelIdFallback
if !self.modelOverrideActive { if !self.modelOverrideActive {
self.currentModelId = self.defaultModelId self.currentModelId = self.defaultModelId
} }
@@ -624,7 +659,10 @@ final class TalkModeManager: NSObject {
self.interruptOnSpeech = interrupt self.interruptOnSpeech = interrupt
} }
} catch { } catch {
// ignore self.defaultModelId = Self.defaultModelIdFallback
if !self.modelOverrideActive {
self.currentModelId = self.defaultModelId
}
} }
} }

View File

@@ -10,6 +10,7 @@ actor TalkModeRuntime {
private let logger = Logger(subsystem: "com.steipete.clawdis", category: "talk.runtime") private let logger = Logger(subsystem: "com.steipete.clawdis", category: "talk.runtime")
private let ttsLogger = Logger(subsystem: "com.steipete.clawdis", category: "talk.tts") private let ttsLogger = Logger(subsystem: "com.steipete.clawdis", category: "talk.tts")
private static let defaultModelIdFallback = "eleven_v3"
private final class RMSMeter: @unchecked Sendable { private final class RMSMeter: @unchecked Sendable {
private let lock = NSLock() private let lock = NSLock()
@@ -62,6 +63,7 @@ actor TalkModeRuntime {
private var lastSpokenText: String? private var lastSpokenText: String?
private var apiKey: String? private var apiKey: String?
private var fallbackVoiceId: String? private var fallbackVoiceId: String?
private var lastPlaybackWasPCM: Bool = false
private let silenceWindow: TimeInterval = 0.7 private let silenceWindow: TimeInterval = 0.7
private let minSpeechRMS: Double = 1e-3 private let minSpeechRMS: Double = 1e-3
@@ -496,7 +498,7 @@ actor TalkModeRuntime {
do { do {
if let apiKey, !apiKey.isEmpty, let voiceId { if let apiKey, !apiKey.isEmpty, let voiceId {
let desiredOutputFormat = directive?.outputFormat ?? self.defaultOutputFormat let desiredOutputFormat = directive?.outputFormat ?? self.defaultOutputFormat ?? "pcm_44100"
let outputFormat = ElevenLabsTTSClient.validatedOutputFormat(desiredOutputFormat) let outputFormat = ElevenLabsTTSClient.validatedOutputFormat(desiredOutputFormat)
if outputFormat == nil, let desiredOutputFormat, !desiredOutputFormat.isEmpty { if outputFormat == nil, let desiredOutputFormat, !desiredOutputFormat.isEmpty {
self.logger self.logger
@@ -504,27 +506,25 @@ actor TalkModeRuntime {
"talk output_format unsupported for local playback: \(desiredOutputFormat, privacy: .public)") "talk output_format unsupported for local playback: \(desiredOutputFormat, privacy: .public)")
} }
let modelId = directive?.modelId ?? self.currentModelId ?? self.defaultModelId
let request = ElevenLabsTTSRequest( let request = ElevenLabsTTSRequest(
text: cleaned, text: cleaned,
modelId: directive?.modelId ?? self.currentModelId ?? self.defaultModelId, modelId: modelId,
outputFormat: outputFormat, outputFormat: outputFormat,
speed: TalkTTSValidation.resolveSpeed(speed: directive?.speed, rateWPM: directive?.rateWPM), speed: TalkTTSValidation.resolveSpeed(speed: directive?.speed, rateWPM: directive?.rateWPM),
stability: TalkTTSValidation.validatedUnit(directive?.stability), stability: TalkTTSValidation.validatedStability(directive?.stability, modelId: modelId),
similarity: TalkTTSValidation.validatedUnit(directive?.similarity), similarity: TalkTTSValidation.validatedUnit(directive?.similarity),
style: TalkTTSValidation.validatedUnit(directive?.style), style: TalkTTSValidation.validatedUnit(directive?.style),
speakerBoost: directive?.speakerBoost, speakerBoost: directive?.speakerBoost,
seed: TalkTTSValidation.validatedSeed(directive?.seed), seed: TalkTTSValidation.validatedSeed(directive?.seed),
normalize: ElevenLabsTTSClient.validatedNormalize(directive?.normalize), normalize: ElevenLabsTTSClient.validatedNormalize(directive?.normalize),
language: language) language: language,
latencyTier: TalkTTSValidation.validatedLatencyTier(directive?.latencyTier))
self.ttsLogger.info("talk TTS synth timeout=\(synthTimeoutSeconds, privacy: .public)s") self.ttsLogger.info("talk TTS synth timeout=\(synthTimeoutSeconds, privacy: .public)s")
let client = ElevenLabsTTSClient(apiKey: apiKey) let client = ElevenLabsTTSClient(apiKey: apiKey)
let audio = try await client.synthesizeWithHardTimeout( let stream = client.streamSynthesize(voiceId: voiceId, request: request)
voiceId: voiceId,
request: request,
hardTimeoutSeconds: synthTimeoutSeconds)
guard self.isCurrent(gen) else { return } guard self.isCurrent(gen) else { return }
self.ttsLogger.info("talk TTS response bytes=\(audio.count, privacy: .public)")
if self.interruptOnSpeech { if self.interruptOnSpeech {
await self.startRecognition() await self.startRecognition()
@@ -534,12 +534,20 @@ actor TalkModeRuntime {
await MainActor.run { TalkModeController.shared.updatePhase(.speaking) } await MainActor.run { TalkModeController.shared.updatePhase(.speaking) }
self.phase = .speaking self.phase = .speaking
let result = await TalkAudioPlayer.shared.play(data: audio) let sampleRate = TalkTTSValidation.pcmSampleRate(from: outputFormat)
let result: StreamingPlaybackResult
if let sampleRate {
self.lastPlaybackWasPCM = true
result = await PCMStreamingAudioPlayer.shared.play(stream: stream, sampleRate: sampleRate)
} else {
self.lastPlaybackWasPCM = false
result = await StreamingAudioPlayer.shared.play(stream: stream)
}
self.ttsLogger self.ttsLogger
.info( .info(
"talk audio result finished=\(result.finished, privacy: .public) interruptedAt=\(String(describing: result.interruptedAt), privacy: .public)") "talk audio result finished=\(result.finished, privacy: .public) interruptedAt=\(String(describing: result.interruptedAt), privacy: .public)")
if !result.finished, result.interruptedAt == nil { if !result.finished, result.interruptedAt == nil {
throw NSError(domain: "TalkAudioPlayer", code: 1, userInfo: [ throw NSError(domain: "StreamingAudioPlayer", code: 1, userInfo: [
NSLocalizedDescriptionKey: "audio playback failed", NSLocalizedDescriptionKey: "audio playback failed",
]) ])
} }
@@ -631,7 +639,15 @@ actor TalkModeRuntime {
} }
func stopSpeaking(reason: TalkStopReason) async { func stopSpeaking(reason: TalkStopReason) async {
let interruptedAt = await MainActor.run { TalkAudioPlayer.shared.stop() } let interruptedAt = await MainActor.run {
let primary = self.lastPlaybackWasPCM
? PCMStreamingAudioPlayer.shared.stop()
: StreamingAudioPlayer.shared.stop()
_ = self.lastPlaybackWasPCM
? StreamingAudioPlayer.shared.stop()
: PCMStreamingAudioPlayer.shared.stop()
return primary
}
await TalkSystemSpeechSynthesizer.shared.stop() await TalkSystemSpeechSynthesizer.shared.stop()
guard self.phase == .speaking else { return } guard self.phase == .speaking else { return }
if reason == .speech, let interruptedAt { if reason == .speech, let interruptedAt {
@@ -707,7 +723,8 @@ actor TalkModeRuntime {
guard !key.isEmpty, !value.isEmpty else { return } guard !key.isEmpty, !value.isEmpty else { return }
acc[key] = value acc[key] = value
} ?? [:] } ?? [:]
let model = talk?["modelId"]?.stringValue let model = talk?["modelId"]?.stringValue?.trimmingCharacters(in: .whitespacesAndNewlines)
let resolvedModel = (model?.isEmpty == false) ? model! : Self.defaultModelIdFallback
let outputFormat = talk?["outputFormat"]?.stringValue let outputFormat = talk?["outputFormat"]?.stringValue
let interrupt = talk?["interruptOnSpeech"]?.boolValue let interrupt = talk?["interruptOnSpeech"]?.boolValue
let apiKey = talk?["apiKey"]?.stringValue let apiKey = talk?["apiKey"]?.stringValue
@@ -721,7 +738,7 @@ actor TalkModeRuntime {
return TalkRuntimeConfig( return TalkRuntimeConfig(
voiceId: resolvedVoice, voiceId: resolvedVoice,
voiceAliases: resolvedAliases, voiceAliases: resolvedAliases,
modelId: model, modelId: resolvedModel,
outputFormat: outputFormat, outputFormat: outputFormat,
interruptOnSpeech: interrupt ?? true, interruptOnSpeech: interrupt ?? true,
apiKey: resolvedApiKey) apiKey: resolvedApiKey)
@@ -733,7 +750,7 @@ actor TalkModeRuntime {
return TalkRuntimeConfig( return TalkRuntimeConfig(
voiceId: resolvedVoice, voiceId: resolvedVoice,
voiceAliases: [:], voiceAliases: [:],
modelId: nil, modelId: Self.defaultModelIdFallback,
outputFormat: nil, outputFormat: nil,
interruptOnSpeech: true, interruptOnSpeech: true,
apiKey: resolvedApiKey) apiKey: resolvedApiKey)

View File

@@ -22,6 +22,7 @@ public struct ElevenLabsTTSRequest: Sendable {
public var seed: UInt32? public var seed: UInt32?
public var normalize: String? public var normalize: String?
public var language: String? public var language: String?
public var latencyTier: Int?
public init( public init(
text: String, text: String,
@@ -34,7 +35,8 @@ public struct ElevenLabsTTSRequest: Sendable {
speakerBoost: Bool? = nil, speakerBoost: Bool? = nil,
seed: UInt32? = nil, seed: UInt32? = nil,
normalize: String? = nil, normalize: String? = nil,
language: String? = nil) language: String? = nil,
latencyTier: Int? = nil)
{ {
self.text = text self.text = text
self.modelId = modelId self.modelId = modelId
@@ -47,6 +49,7 @@ public struct ElevenLabsTTSRequest: Sendable {
self.seed = seed self.seed = seed
self.normalize = normalize self.normalize = normalize
self.language = language self.language = language
self.latencyTier = latencyTier
} }
} }
@@ -155,6 +158,72 @@ public struct ElevenLabsTTSClient: Sendable {
]) ])
} }
/// Streams synthesized speech for `request` from the ElevenLabs streaming TTS endpoint.
///
/// Returns an `AsyncThrowingStream` of raw audio chunks (roughly 8 KiB each) so the
/// caller can begin playback before the full response has arrived. The underlying
/// URLSession task is cancelled when the stream's consumer terminates early.
///
/// - Parameters:
///   - voiceId: ElevenLabs voice identifier, interpolated into the endpoint path.
///   - request: Synthesis parameters; `latencyTier` is forwarded as a query item.
/// - Throws (via the stream): `NSError` in domain "ElevenLabsTTS" for non-HTTP
///   responses (code 1), HTTP errors >= 400 (code = status), or non-audio
///   content types (code 415).
public func streamSynthesize(
voiceId: String,
request: ElevenLabsTTSRequest) -> AsyncThrowingStream<Data, Error>
{
AsyncThrowingStream { continuation in
let task = Task {
do {
let url = Self.streamingURL(
baseUrl: self.baseUrl,
voiceId: voiceId,
latencyTier: request.latencyTier)
let body = try JSONSerialization.data(withJSONObject: Self.buildPayload(request), options: [])
var req = URLRequest(url: url)
req.httpMethod = "POST"
req.httpBody = body
req.timeoutInterval = self.requestTimeoutSeconds
req.setValue("application/json", forHTTPHeaderField: "Content-Type")
// NOTE(review): Accept is hardcoded to audio/mpeg even though PCM output
// formats are now requested upstream — confirm the API keys off
// output_format in the body and ignores Accept for PCM responses.
req.setValue("audio/mpeg", forHTTPHeaderField: "Accept")
req.setValue(self.apiKey, forHTTPHeaderField: "xi-api-key")
let (bytes, response) = try await URLSession.shared.bytes(for: req)
guard let http = response as? HTTPURLResponse else {
throw NSError(domain: "ElevenLabsTTS", code: 1, userInfo: [
NSLocalizedDescriptionKey: "ElevenLabs invalid response",
])
}
// Content type is included in error messages to aid debugging.
let contentType = (http.value(forHTTPHeaderField: "Content-Type") ?? "unknown").lowercased()
if http.statusCode >= 400 {
// Drain a bounded amount of the error body for the message.
let message = try await Self.readErrorBody(bytes: bytes)
throw NSError(domain: "ElevenLabsTTS", code: http.statusCode, userInfo: [
NSLocalizedDescriptionKey: "ElevenLabs failed: \(http.statusCode) ct=\(contentType) \(message)",
])
}
if !contentType.contains("audio") {
// NOTE(review): verify the streaming endpoint labels PCM responses with an
// "audio/*" content type; otherwise valid PCM would be rejected here.
let message = try await Self.readErrorBody(bytes: bytes)
throw NSError(domain: "ElevenLabsTTS", code: 415, userInfo: [
NSLocalizedDescriptionKey: "ElevenLabs returned non-audio ct=\(contentType) \(message)",
])
}
// Accumulate bytes and hand them to the consumer in ~8 KiB chunks so the
// player can start before the download completes.
var buffer = Data()
buffer.reserveCapacity(16_384)
for try await byte in bytes {
buffer.append(byte)
if buffer.count >= 8_192 {
continuation.yield(buffer)
buffer.removeAll(keepingCapacity: true)
}
}
// Flush whatever remains after the response body ends.
if !buffer.isEmpty {
continuation.yield(buffer)
}
continuation.finish()
} catch {
continuation.finish(throwing: error)
}
}
// Cancel the network task if the consumer stops iterating the stream.
continuation.onTermination = { _ in
task.cancel()
}
}
}
public func listVoices() async throws -> [ElevenLabsVoice] { public func listVoices() async throws -> [ElevenLabsVoice] {
var url = self.baseUrl var url = self.baseUrl
url.appendPathComponent("v1") url.appendPathComponent("v1")
@@ -180,7 +249,7 @@ public struct ElevenLabsTTSClient: Sendable {
public static func validatedOutputFormat(_ value: String?) -> String? { public static func validatedOutputFormat(_ value: String?) -> String? {
let trimmed = (value ?? "").trimmingCharacters(in: .whitespacesAndNewlines) let trimmed = (value ?? "").trimmingCharacters(in: .whitespacesAndNewlines)
guard !trimmed.isEmpty else { return nil } guard !trimmed.isEmpty else { return nil }
guard trimmed.hasPrefix("mp3_") else { return nil } guard trimmed.hasPrefix("mp3_") || trimmed.hasPrefix("pcm_") else { return nil }
return trimmed return trimmed
} }
@@ -230,4 +299,33 @@ public struct ElevenLabsTTSClient: Sendable {
let raw = String(data: data.prefix(4096), encoding: .utf8) ?? "unknown" let raw = String(data: data.prefix(4096), encoding: .utf8) ?? "unknown"
return raw.replacingOccurrences(of: "\n", with: " ").replacingOccurrences(of: "\r", with: " ") return raw.replacingOccurrences(of: "\n", with: " ").replacingOccurrences(of: "\r", with: " ")
} }
/// Builds the streaming TTS endpoint URL:
/// `{base}/v1/text-to-speech/{voiceId}/stream`, with an
/// `optimize_streaming_latency` query item appended when a tier is provided.
/// Falls back to the plain path URL whenever query assembly fails.
private static func streamingURL(baseUrl: URL, voiceId: String, latencyTier: Int?) -> URL {
    var endpoint = baseUrl
    for piece in ["v1", "text-to-speech", voiceId, "stream"] {
        endpoint.appendPathComponent(piece)
    }
    guard let latencyTier,
          var parts = URLComponents(url: endpoint, resolvingAgainstBaseURL: false)
    else { return endpoint }
    parts.queryItems = (parts.queryItems ?? []) + [
        URLQueryItem(name: "optimize_streaming_latency", value: String(latencyTier)),
    ]
    return parts.url ?? endpoint
}
/// Drains at most 4 KiB of an error response body and returns it formatted
/// through `truncatedErrorBody` for inclusion in thrown error messages.
private static func readErrorBody(bytes: URLSession.AsyncBytes) async throws -> String {
    let limit = 4096
    var collected = Data()
    for try await byte in bytes {
        collected.append(byte)
        guard collected.count < limit else { break }
    }
    return truncatedErrorBody(collected)
}
} }

View File

@@ -0,0 +1,144 @@
import AVFoundation
import Foundation
import OSLog
/// Plays a stream of raw 16-bit mono PCM chunks through AVAudioEngine as they
/// arrive, for low-latency ElevenLabs `pcm_*` playback. All state is confined
/// to the main actor; the decode stream is consumed on a detached task that
/// hops back via `await`.
@MainActor
public final class PCMStreamingAudioPlayer {
public static let shared = PCMStreamingAudioPlayer()
private let logger = Logger(subsystem: "com.steipete.clawdis", category: "talk.tts.pcm")
private var engine = AVAudioEngine()
private var player = AVAudioPlayerNode()
// Format of the current/most recent session; used to detect when the
// engine graph must be rebuilt for a different sample rate.
private var format: AVAudioFormat?
// Count of scheduled-but-not-yet-completed AVAudioPCMBuffers.
private var pendingBuffers: Int = 0
// Set once the input stream has ended; playback finishes when this is true
// and pendingBuffers drains to zero.
private var inputFinished = false
// Resumed exactly once per play() call via finish(_:).
private var continuation: CheckedContinuation<StreamingPlaybackResult, Never>?
public init() {
self.engine.attach(self.player)
}
/// Plays `stream` as interleaved Int16 mono PCM at `sampleRate`.
/// Returns when playback finishes, fails, or is stopped.
public func play(stream: AsyncThrowingStream<Data, Error>, sampleRate: Double) async -> StreamingPlaybackResult {
// Tear down any previous session before starting a new one.
self.stopInternal()
let format = AVAudioFormat(
commonFormat: .pcmFormatInt16,
sampleRate: sampleRate,
channels: 1,
interleaved: true)
guard let format else {
return StreamingPlaybackResult(finished: false, interruptedAt: nil)
}
self.configure(format: format)
return await withCheckedContinuation { continuation in
self.continuation = continuation
self.pendingBuffers = 0
self.inputFinished = false
// Consume the network stream off the main actor; each enqueue hops back.
Task.detached { [weak self] in
guard let self else { return }
do {
for try await chunk in stream {
await self.enqueuePCM(chunk, format: format)
}
await self.finishInput()
} catch {
await self.fail(error)
}
}
}
}
/// Stops playback immediately and resolves the pending play() with
/// finished=false. Returns the interruption offset in seconds, if known.
public func stop() -> Double? {
// Capture the position before stopping, since stopping resets the node.
let interruptedAt = self.currentTimeSeconds()
self.stopInternal()
self.finish(StreamingPlaybackResult(finished: false, interruptedAt: interruptedAt))
return interruptedAt
}
/// (Re)connects the player node for `format`, rebuilding the engine when the
/// sample rate or sample format differs from the previous session.
private func configure(format: AVAudioFormat) {
if self.format?.sampleRate != format.sampleRate || self.format?.commonFormat != format.commonFormat {
self.engine.stop()
self.engine = AVAudioEngine()
self.player = AVAudioPlayerNode()
self.engine.attach(self.player)
}
self.format = format
// Defensive: only connect when the node is attached (always true after
// init or the rebuild above).
if self.engine.attachedNodes.contains(self.player) {
self.engine.connect(self.player, to: self.engine.mainMixerNode, format: format)
}
}
/// Copies one chunk of interleaved Int16 samples into an AVAudioPCMBuffer
/// and schedules it; starts the engine on the first buffer.
private func enqueuePCM(_ data: Data, format: AVAudioFormat) async {
guard !data.isEmpty else { return }
// NOTE(review): integer division drops a trailing odd byte; if a sample can
// straddle chunk boundaries upstream, a carry-over byte buffer is needed —
// confirm chunk sizes are always even.
let frameCount = data.count / MemoryLayout<Int16>.size
guard frameCount > 0 else { return }
guard let buffer = AVAudioPCMBuffer(pcmFormat: format, frameCapacity: AVAudioFrameCount(frameCount)) else {
return
}
buffer.frameLength = AVAudioFrameCount(frameCount)
data.withUnsafeBytes { raw in
guard let src = raw.baseAddress else { return }
let audioBuffer = buffer.audioBufferList.pointee.mBuffers
if let dst = audioBuffer.mData {
memcpy(dst, src, frameCount * MemoryLayout<Int16>.size)
}
}
self.pendingBuffers += 1
// Completion fires on an arbitrary thread; hop to the main actor to
// mutate state and detect end-of-playback.
self.player.scheduleBuffer(buffer) { [weak self] in
Task { @MainActor in
guard let self else { return }
// NOTE(review): a completion from a superseded session can decrement the
// new session's pendingBuffers — confirm stopInternal()'s reset makes
// this benign in practice.
self.pendingBuffers = max(0, self.pendingBuffers - 1)
if self.inputFinished && self.pendingBuffers == 0 {
self.finish(StreamingPlaybackResult(finished: true, interruptedAt: nil))
}
}
}
if !self.player.isPlaying {
do {
try self.engine.start()
self.player.play()
} catch {
self.logger.error("pcm engine start failed: \(error.localizedDescription, privacy: .public)")
self.fail(error)
}
}
}
/// Marks the input stream complete; finishes immediately if nothing is queued.
private func finishInput() {
self.inputFinished = true
if self.pendingBuffers == 0 {
self.finish(StreamingPlaybackResult(finished: true, interruptedAt: nil))
}
}
/// Logs the failure and resolves the pending play() as unfinished.
private func fail(_ error: Error) {
self.logger.error("pcm stream failed: \(error.localizedDescription, privacy: .public)")
self.finish(StreamingPlaybackResult(finished: false, interruptedAt: nil))
}
/// Stops node and engine and resets per-session counters.
private func stopInternal() {
self.player.stop()
self.engine.stop()
self.pendingBuffers = 0
self.inputFinished = false
}
/// Resumes the stored continuation at most once (idempotent: subsequent
/// calls see nil and do nothing).
private func finish(_ result: StreamingPlaybackResult) {
let continuation = self.continuation
self.continuation = nil
continuation?.resume(returning: result)
}
/// Current playback position in seconds, derived from the player node's
/// render time; nil when no render time is available.
private func currentTimeSeconds() -> Double? {
guard let nodeTime = self.player.lastRenderTime,
let playerTime = self.player.playerTime(forNodeTime: nodeTime)
else { return nil }
return Double(playerTime.sampleTime) / playerTime.sampleRate
}
}

View File

@@ -0,0 +1,429 @@
import AudioToolbox
import Foundation
import OSLog
/// Outcome of one streaming playback attempt.
public struct StreamingPlaybackResult: Sendable {
/// True when audio played to completion; false when playback failed or was stopped.
public let finished: Bool
/// Playback position (seconds) at the moment of interruption, when known.
public let interruptedAt: Double?
public init(finished: Bool, interruptedAt: Double?) {
self.finished = finished
self.interruptedAt = interruptedAt
}
}
/// Main-actor front end for MP3 streaming playback. Owns at most one
/// `Playback` session at a time; starting a new stream cancels the previous
/// one, and `stop()` reports the interruption offset when available.
@MainActor
public final class StreamingAudioPlayer: NSObject {
    public static let shared = StreamingAudioPlayer()

    private let logger = Logger(subsystem: "com.steipete.clawdis", category: "talk.tts.stream")

    /// The in-flight session, if any.
    private var playback: Playback?

    /// Plays an MP3 byte stream, resolving when playback completes, fails,
    /// or is stopped.
    public func play(stream: AsyncThrowingStream<Data, Error>) async -> StreamingPlaybackResult {
        self.stopInternal()
        let session = Playback(logger: self.logger)
        self.playback = session
        return await withCheckedContinuation { continuation in
            session.setContinuation(continuation)
            session.start()
            // Feed chunks off the main actor; Playback synchronizes internally.
            Task.detached {
                do {
                    for try await chunk in stream { session.append(chunk) }
                    session.finishInput()
                } catch {
                    session.fail(error)
                }
            }
        }
    }

    /// Immediately stops the current session; returns the interruption offset
    /// in seconds, or nil when nothing was playing.
    public func stop() -> Double? {
        guard let session = self.playback else { return nil }
        let interruptedAt = session.stop(immediate: true)
        self.finish(playback: session, result: StreamingPlaybackResult(finished: false, interruptedAt: interruptedAt))
        return interruptedAt
    }

    private func stopInternal() {
        guard let session = self.playback else { return }
        self.finish(
            playback: session,
            result: StreamingPlaybackResult(finished: false, interruptedAt: session.stop(immediate: true)))
    }

    /// Resolves the session's continuation and clears the reference, but only
    /// when it is still the active session (identity check guards races with
    /// a newer play()).
    private func finish(playback session: Playback, result: StreamingPlaybackResult) {
        session.finish(result)
        if self.playback === session { self.playback = nil }
    }
}
/// One MP3 streaming playback session built on AudioFileStream (parsing) and
/// AudioQueue (output). Bytes are parsed on `parseQueue`; parsed packets are
/// copied into a small pool of reusable AudioQueue buffers. `bufferSemaphore`
/// provides backpressure: parsing blocks when all buffers are in flight.
/// Marked @unchecked Sendable because synchronization is manual (locks,
/// serial queue, semaphore) rather than compiler-checked.
private final class Playback: @unchecked Sendable {
private static let bufferCount: Int = 3
private static let bufferSize: Int = 32 * 1024
private let logger: Logger
// Guards `continuation` / `finished` (single-resume invariant).
private let lock = NSLock()
// Serializes all AudioFileStreamParseBytes calls and end-of-input handling.
private let parseQueue = DispatchQueue(label: "talk.stream.parse")
// Guards `availableBuffers`; fileprivate because the C callbacks below use it.
fileprivate let bufferLock = NSLock()
// Counts free buffers; wait() blocks the parser until the queue returns one.
fileprivate let bufferSemaphore = DispatchSemaphore(value: bufferCount)
private var continuation: CheckedContinuation<StreamingPlaybackResult, Never>?
private var finished = false
private var audioFileStream: AudioFileStreamID?
private var audioQueue: AudioQueueRef?
fileprivate var audioFormat: AudioStreamBasicDescription?
fileprivate var maxPacketSize: UInt32 = 0
fileprivate var availableBuffers: [AudioQueueBufferRef] = []
// Buffer currently being filled with packets, plus its fill state.
private var currentBuffer: AudioQueueBufferRef?
private var currentBufferSize: Int = 0
private var currentPacketDescs: [AudioStreamPacketDescription] = []
private var isRunning = false
fileprivate var inputFinished = false
// Ensures AudioQueueStart is issued exactly once, on the first enqueue.
private var startRequested = false
// Captured from the parsed format; used to convert sample time to seconds.
private var sampleRate: Double = 0
init(logger: Logger) {
self.logger = logger
}
/// Stores the continuation that finish(_:) will resume exactly once.
func setContinuation(_ continuation: CheckedContinuation<StreamingPlaybackResult, Never>) {
self.lock.lock()
self.continuation = continuation
self.lock.unlock()
}
/// Opens the MP3 file-stream parser, registering `self` (unretained) as the
/// client for the C callbacks. Fails the session if the parser can't open.
func start() {
// Unretained is safe only while this object outlives the parser; it is
// closed in teardown() before the last reference can go away.
let selfPtr = Unmanaged.passUnretained(self).toOpaque()
let status = AudioFileStreamOpen(
selfPtr,
propertyListenerProc,
packetsProc,
kAudioFileMP3Type,
&self.audioFileStream)
if status != noErr {
self.logger.error("talk stream open failed: \(status)")
self.finish(StreamingPlaybackResult(finished: false, interruptedAt: nil))
}
}
/// Feeds one network chunk to the parser (async on parseQueue; may block
/// there on buffer backpressure inside the packet callback).
func append(_ data: Data) {
guard !data.isEmpty else { return }
self.parseQueue.async { [weak self] in
guard let self else { return }
guard let audioFileStream = self.audioFileStream else { return }
let status = data.withUnsafeBytes { bytes in
AudioFileStreamParseBytes(
audioFileStream,
UInt32(bytes.count),
bytes.baseAddress,
[])
}
if status != noErr {
self.logger.error("talk stream parse failed: \(status)")
self.fail(NSError(domain: "StreamingAudio", code: Int(status)))
}
}
}
/// Signals end of input: flushes the partially filled buffer and asks the
/// queue to stop after draining (non-immediate stop). If no queue was ever
/// created (e.g. no parsable audio), resolves as unfinished.
func finishInput() {
self.parseQueue.async { [weak self] in
guard let self else { return }
self.inputFinished = true
if self.audioQueue == nil {
self.finish(StreamingPlaybackResult(finished: false, interruptedAt: nil))
return
}
self.enqueueCurrentBuffer(flushOnly: true)
self.stop(immediate: false)
}
}
/// Logs, stops immediately, and resolves the session as unfinished.
func fail(_ error: Error) {
self.logger.error("talk stream failed: \(error.localizedDescription, privacy: .public)")
_ = self.stop(immediate: true)
self.finish(StreamingPlaybackResult(finished: false, interruptedAt: nil))
}
/// Stops the audio queue. Returns the playback offset (seconds) captured
/// just before stopping, or nil when no queue exists / time is unavailable.
func stop(immediate: Bool) -> Double? {
guard let audioQueue else { return nil }
let interruptedAt = self.currentTimeSeconds()
AudioQueueStop(audioQueue, immediate)
return interruptedAt
}
/// Resumes the continuation at most once (guarded by `lock`/`finished`) and
/// tears down all Core Audio resources.
fileprivate func finish(_ result: StreamingPlaybackResult) {
let continuation: CheckedContinuation<StreamingPlaybackResult, Never>?
self.lock.lock()
if self.finished {
continuation = nil
} else {
self.finished = true
continuation = self.continuation
self.continuation = nil
}
self.lock.unlock()
continuation?.resume(returning: result)
self.teardown()
}
/// Disposes the queue (which frees its buffers), closes the parser, and
/// clears buffer bookkeeping.
private func teardown() {
if let audioQueue {
AudioQueueDispose(audioQueue, true)
self.audioQueue = nil
}
if let audioFileStream {
AudioFileStreamClose(audioFileStream)
self.audioFileStream = nil
}
self.bufferLock.lock()
self.availableBuffers.removeAll()
self.bufferLock.unlock()
self.currentBuffer = nil
self.currentPacketDescs.removeAll()
}
/// Lazily creates the output AudioQueue from the parsed stream format:
/// registers the IsRunning listener, copies the magic cookie (if any) from
/// parser to queue, and allocates the reusable buffer pool.
fileprivate func setupQueueIfNeeded(_ asbd: AudioStreamBasicDescription) {
guard self.audioQueue == nil else { return }
var format = asbd
self.audioFormat = format
self.sampleRate = format.mSampleRate
let selfPtr = Unmanaged.passUnretained(self).toOpaque()
let status = AudioQueueNewOutput(
&format,
outputCallbackProc,
selfPtr,
nil,
nil,
0,
&self.audioQueue)
if status != noErr {
self.logger.error("talk queue create failed: \(status)")
self.finish(StreamingPlaybackResult(finished: false, interruptedAt: nil))
return
}
if let audioQueue {
AudioQueueAddPropertyListener(audioQueue, kAudioQueueProperty_IsRunning, isRunningCallbackProc, selfPtr)
}
if let audioFileStream {
// Forward codec private data ("magic cookie") when the stream has one.
var cookieSize: UInt32 = 0
var writable: DarwinBoolean = false
let cookieStatus = AudioFileStreamGetPropertyInfo(
audioFileStream,
kAudioFileStreamProperty_MagicCookieData,
&cookieSize,
&writable)
if cookieStatus == noErr, cookieSize > 0, let audioQueue {
var cookie = [UInt8](repeating: 0, count: Int(cookieSize))
let readStatus = AudioFileStreamGetProperty(
audioFileStream,
kAudioFileStreamProperty_MagicCookieData,
&cookieSize,
&cookie)
if readStatus == noErr {
AudioQueueSetProperty(audioQueue, kAudioQueueProperty_MagicCookie, cookie, cookieSize)
}
}
}
if let audioQueue {
for _ in 0..<Self.bufferCount {
var buffer: AudioQueueBufferRef?
let allocStatus = AudioQueueAllocateBuffer(audioQueue, UInt32(Self.bufferSize), &buffer)
if allocStatus == noErr, let buffer {
self.bufferLock.lock()
self.availableBuffers.append(buffer)
self.bufferLock.unlock()
}
}
}
}
/// Submits the currently filled buffer to the queue (starting playback on
/// the first enqueue), then — unless flushing — blocks until a free buffer
/// is available and makes it current.
private func enqueueCurrentBuffer(flushOnly: Bool = false) {
guard let audioQueue, let buffer = self.currentBuffer else { return }
guard self.currentBufferSize > 0 else { return }
buffer.pointee.mAudioDataByteSize = UInt32(self.currentBufferSize)
let packetCount = UInt32(self.currentPacketDescs.count)
let status = self.currentPacketDescs.withUnsafeBufferPointer { descPtr in
AudioQueueEnqueueBuffer(audioQueue, buffer, packetCount, descPtr.baseAddress)
}
if status != noErr {
self.logger.error("talk queue enqueue failed: \(status)")
} else {
if !self.startRequested {
self.startRequested = true
let startStatus = AudioQueueStart(audioQueue, nil)
if startStatus != noErr {
self.logger.error("talk queue start failed: \(startStatus)")
}
}
}
self.currentBuffer = nil
self.currentBufferSize = 0
self.currentPacketDescs.removeAll(keepingCapacity: true)
if !flushOnly {
// Backpressure: wait for the output callback to return a buffer.
// NOTE(review): if the queue is stopped while the parser waits here, the
// wait relies on teardown's dispose returning buffers — confirm no
// deadlock path on early stop.
self.bufferSemaphore.wait()
self.bufferLock.lock()
let next = self.availableBuffers.popLast()
self.bufferLock.unlock()
if let next { self.currentBuffer = next }
}
}
/// Packet callback body (invoked by packetsProc on the parse queue):
/// copies parsed packets into the current buffer, enqueueing and rotating
/// buffers as they fill. Packets larger than a whole buffer are skipped.
fileprivate func handlePackets(
numberBytes: UInt32,
numberPackets: UInt32,
inputData: UnsafeRawPointer,
packetDescriptions: UnsafeMutablePointer<AudioStreamPacketDescription>?)
{
// First packets may arrive before the property listener created the queue.
if self.audioQueue == nil, let format = self.audioFormat {
self.setupQueueIfNeeded(format)
}
if self.audioQueue == nil {
return
}
if self.currentBuffer == nil {
self.bufferSemaphore.wait()
self.bufferLock.lock()
self.currentBuffer = self.availableBuffers.popLast()
self.bufferLock.unlock()
self.currentBufferSize = 0
self.currentPacketDescs.removeAll(keepingCapacity: true)
}
let bytes = inputData.assumingMemoryBound(to: UInt8.self)
let packetCount = Int(numberPackets)
for index in 0..<packetCount {
let packetOffset: Int
let packetSize: Int
if let packetDescriptions {
packetOffset = Int(packetDescriptions[index].mStartOffset)
packetSize = Int(packetDescriptions[index].mDataByteSize)
} else {
// No descriptions: assume constant-size packets laid out contiguously.
let size = Int(numberBytes) / packetCount
packetOffset = index * size
packetSize = size
}
if packetSize > Self.bufferSize {
continue
}
if self.currentBufferSize + packetSize > Self.bufferSize {
self.enqueueCurrentBuffer()
}
guard let buffer = self.currentBuffer else { continue }
let dest = buffer.pointee.mAudioData.advanced(by: self.currentBufferSize)
memcpy(dest, bytes.advanced(by: packetOffset), packetSize)
// Offsets in the description are relative to the destination buffer.
let desc = AudioStreamPacketDescription(
mStartOffset: Int64(self.currentBufferSize),
mVariableFramesInPacket: 0,
mDataByteSize: UInt32(packetSize))
self.currentPacketDescs.append(desc)
self.currentBufferSize += packetSize
}
}
/// Current queue playback position in seconds (sample time / sample rate);
/// nil before the format is known or when the queue can't report time.
private func currentTimeSeconds() -> Double? {
guard let audioQueue, sampleRate > 0 else { return nil }
var timeStamp = AudioTimeStamp()
let status = AudioQueueGetCurrentTime(audioQueue, nil, &timeStamp, nil)
if status != noErr { return nil }
if timeStamp.mSampleTime.isNaN { return nil }
return timeStamp.mSampleTime / sampleRate
}
}
/// AudioFileStream property callback: captures the parsed data format (and
/// creates the output queue) and the packet-size upper bound.
/// `inClientData` round-trips the unretained Playback pointer registered in
/// Playback.start().
private func propertyListenerProc(
inClientData: UnsafeMutableRawPointer,
inAudioFileStream: AudioFileStreamID,
inPropertyID: AudioFileStreamPropertyID,
ioFlags: UnsafeMutablePointer<AudioFileStreamPropertyFlags>)
{
let playback = Unmanaged<Playback>.fromOpaque(inClientData).takeUnretainedValue()
if inPropertyID == kAudioFileStreamProperty_DataFormat {
var format = AudioStreamBasicDescription()
var size = UInt32(MemoryLayout<AudioStreamBasicDescription>.size)
let status = AudioFileStreamGetProperty(inAudioFileStream, inPropertyID, &size, &format)
if status == noErr {
playback.audioFormat = format
playback.setupQueueIfNeeded(format)
}
} else if inPropertyID == kAudioFileStreamProperty_PacketSizeUpperBound {
var maxPacketSize: UInt32 = 0
var size = UInt32(MemoryLayout<UInt32>.size)
let status = AudioFileStreamGetProperty(inAudioFileStream, inPropertyID, &size, &maxPacketSize)
if status == noErr {
playback.maxPacketSize = maxPacketSize
}
}
}
/// AudioFileStream packets callback: forwards parsed audio packets to the
/// owning Playback's handlePackets for buffering and enqueueing.
private func packetsProc(
inClientData: UnsafeMutableRawPointer,
inNumberBytes: UInt32,
inNumberPackets: UInt32,
inInputData: UnsafeRawPointer,
inPacketDescriptions: UnsafeMutablePointer<AudioStreamPacketDescription>?)
{
let playback = Unmanaged<Playback>.fromOpaque(inClientData).takeUnretainedValue()
playback.handlePackets(
numberBytes: inNumberBytes,
numberPackets: inNumberPackets,
inputData: inInputData,
packetDescriptions: inPacketDescriptions)
}
/// AudioQueue output callback: the queue has finished with `inBuffer`, so
/// return it to the free pool and signal the semaphore to unblock the parser
/// waiting for a buffer.
private func outputCallbackProc(
inUserData: UnsafeMutableRawPointer?,
inAQ: AudioQueueRef,
inBuffer: AudioQueueBufferRef)
{
guard let inUserData else { return }
let playback = Unmanaged<Playback>.fromOpaque(inUserData).takeUnretainedValue()
playback.bufferLock.lock()
playback.availableBuffers.append(inBuffer)
playback.bufferLock.unlock()
playback.bufferSemaphore.signal()
}
/// AudioQueue IsRunning listener: when the queue transitions to not-running
/// after all input has been parsed, resolve the playback as finished.
private func isRunningCallbackProc(
inUserData: UnsafeMutableRawPointer?,
inAQ: AudioQueueRef,
inID: AudioQueuePropertyID)
{
guard let inUserData else { return }
guard inID == kAudioQueueProperty_IsRunning else { return }
let playback = Unmanaged<Playback>.fromOpaque(inUserData).takeUnretainedValue()
var running: UInt32 = 0
var size = UInt32(MemoryLayout<UInt32>.size)
let status = AudioQueueGetProperty(inAQ, kAudioQueueProperty_IsRunning, &running, &size)
if status != noErr { return }
// A not-running queue before input finished is a start/stop transient, not
// completion, so it is ignored.
if running == 0, playback.inputFinished {
playback.finish(StreamingPlaybackResult(finished: true, interruptedAt: nil))
}
}

View File

@@ -1,4 +1,6 @@
public enum TalkTTSValidation: Sendable { public enum TalkTTSValidation: Sendable {
private static let v3StabilityValues: Set<Double> = [0.0, 0.5, 1.0]
public static func resolveSpeed(speed: Double?, rateWPM: Int?) -> Double? { public static func resolveSpeed(speed: Double?, rateWPM: Int?) -> Double? {
if let rateWPM, rateWPM > 0 { if let rateWPM, rateWPM > 0 {
let resolved = Double(rateWPM) / 175.0 let resolved = Double(rateWPM) / 175.0
@@ -18,10 +20,32 @@ public enum TalkTTSValidation: Sendable {
return value return value
} }
/// Validates a stability directive against the target model's constraints.
/// `eleven_v3` only accepts the discrete values 0.0 / 0.5 / 1.0; every other
/// model accepts any value in the closed unit interval. Returns nil when the
/// value is absent or invalid for the model.
public static func validatedStability(_ value: Double?, modelId: String?) -> Double? {
    guard let stability = value else { return nil }
    let model = (modelId ?? "")
        .trimmingCharacters(in: .whitespacesAndNewlines)
        .lowercased()
    guard model == "eleven_v3" else { return validatedUnit(stability) }
    return v3StabilityValues.contains(stability) ? stability : nil
}
public static func validatedSeed(_ value: Int?) -> UInt32? { public static func validatedSeed(_ value: Int?) -> UInt32? {
guard let value else { return nil } guard let value else { return nil }
if value < 0 || value > 4294967295 { return nil } if value < 0 || value > 4294967295 { return nil }
return UInt32(value) return UInt32(value)
} }
}
/// Validates the ElevenLabs `optimize_streaming_latency` tier: only integers
/// 0 through 4 are accepted; anything else (including nil) yields nil.
public static func validatedLatencyTier(_ value: Int?) -> Int? {
    guard let tier = value, (0...4).contains(tier) else { return nil }
    return tier
}
/// Extracts the sample rate from an ElevenLabs PCM output-format token
/// (e.g. "pcm_44100" -> 44100). Returns nil for non-PCM formats or when the
/// suffix is not a positive number.
public static func pcmSampleRate(from outputFormat: String?) -> Double? {
    let token = (outputFormat ?? "")
        .trimmingCharacters(in: .whitespacesAndNewlines)
        .lowercased()
    guard token.hasPrefix("pcm_") else { return nil }
    let suffix = token.dropFirst("pcm_".count)
    guard let rate = Double(suffix), rate > 0 else { return nil }
    return rate
}
}

View File

@@ -16,9 +16,30 @@ final class TalkTTSValidationTests: XCTestCase {
XCTAssertNil(TalkTTSValidation.validatedUnit(1.01)) XCTAssertNil(TalkTTSValidation.validatedUnit(1.01))
} }
// eleven_v3 accepts only the discrete stability values 0.0 / 0.5 / 1.0;
// other models accept any value in 0...1.
func testValidatedStability() {
XCTAssertEqual(TalkTTSValidation.validatedStability(0, modelId: "eleven_v3"), 0)
XCTAssertEqual(TalkTTSValidation.validatedStability(0.5, modelId: "eleven_v3"), 0.5)
XCTAssertEqual(TalkTTSValidation.validatedStability(1, modelId: "eleven_v3"), 1)
// 0.7 is rejected for v3 but passes the generic unit-interval check.
XCTAssertNil(TalkTTSValidation.validatedStability(0.7, modelId: "eleven_v3"))
XCTAssertEqual(TalkTTSValidation.validatedStability(0.7, modelId: "eleven_multilingual_v2"), 0.7)
}
func testValidatedSeedBounds() { func testValidatedSeedBounds() {
XCTAssertEqual(TalkTTSValidation.validatedSeed(0), 0) XCTAssertEqual(TalkTTSValidation.validatedSeed(0), 0)
XCTAssertEqual(TalkTTSValidation.validatedSeed(1234), 1234) XCTAssertEqual(TalkTTSValidation.validatedSeed(1234), 1234)
XCTAssertNil(TalkTTSValidation.validatedSeed(-1)) XCTAssertNil(TalkTTSValidation.validatedSeed(-1))
} }
// Latency tier is valid only in the inclusive range 0...4.
func testValidatedLatencyTier() {
XCTAssertEqual(TalkTTSValidation.validatedLatencyTier(0), 0)
XCTAssertEqual(TalkTTSValidation.validatedLatencyTier(4), 4)
XCTAssertNil(TalkTTSValidation.validatedLatencyTier(-1))
XCTAssertNil(TalkTTSValidation.validatedLatencyTier(5))
}
// Only "pcm_<rate>" tokens yield a sample rate; MP3 formats and malformed
// suffixes return nil.
func testPcmSampleRateParse() {
XCTAssertEqual(TalkTTSValidation.pcmSampleRate(from: "pcm_44100"), 44100)
XCTAssertNil(TalkTTSValidation.pcmSampleRate(from: "mp3_44100_128"))
XCTAssertNil(TalkTTSValidation.pcmSampleRate(from: "pcm_bad"))
}
} }

View File

@@ -10,7 +10,7 @@ Talk mode is a continuous voice conversation loop:
1) Listen for speech 1) Listen for speech
2) Send transcript to the model (main session, chat.send) 2) Send transcript to the model (main session, chat.send)
3) Wait for the response 3) Wait for the response
4) Speak it via ElevenLabs 4) Speak it via ElevenLabs (streaming playback)
## Behavior (macOS) ## Behavior (macOS)
- **Always-on overlay** while Talk mode is enabled. - **Always-on overlay** while Talk mode is enabled.
@@ -55,8 +55,10 @@ Supported keys:
Defaults: Defaults:
- `interruptOnSpeech`: true - `interruptOnSpeech`: true
- `voiceId`: falls back to `ELEVENLABS_VOICE_ID` / `SAG_VOICE_ID` - `voiceId`: falls back to `ELEVENLABS_VOICE_ID` / `SAG_VOICE_ID` (or first ElevenLabs voice when API key is available)
- `modelId`: defaults to `eleven_v3` when unset
- `apiKey`: falls back to `ELEVENLABS_API_KEY` (or gateway shell profile if available) - `apiKey`: falls back to `ELEVENLABS_API_KEY` (or gateway shell profile if available)
- `outputFormat`: defaults to `pcm_44100` on macOS/iOS for faster streaming playback (Android stays on MP3)
## macOS UI ## macOS UI
- Menu bar toggle: **Talk** - Menu bar toggle: **Talk**
@@ -71,4 +73,6 @@ Defaults:
## Notes ## Notes
- Requires Speech + Microphone permissions. - Requires Speech + Microphone permissions.
- Uses `chat.send` against session key `main`. - Uses `chat.send` against session key `main`.
- TTS uses ElevenLabs API with `ELEVENLABS_API_KEY`. - TTS uses ElevenLabs streaming API with `ELEVENLABS_API_KEY` and incremental playback on macOS/iOS/Android for lower latency.
- `stability` for `eleven_v3` is validated to `0.0`, `0.5`, or `1.0`; other models accept `0..1`.
- `latency_tier` is validated to `0..4` when set.