fix: stream elevenlabs tts playback

Peter Steinberger
2025-12-30 12:17:40 +01:00
parent 9c532eac07
commit 27adfb76fa
11 changed files with 1091 additions and 91 deletions

View File

@@ -17,6 +17,9 @@
- macOS Talk Mode: throttle audio-level updates (avoid per-buffer task creation) to reduce CPU/task churn.
- macOS Talk Mode: increase overlay window size so wave rings don't clip; the close button is hover-only and closer to the orb.
- Talk Mode: fall back to system TTS when ElevenLabs is unavailable, returns non-audio, or playback fails (macOS/iOS/Android).
- Talk Mode: stream PCM on macOS/iOS for lower latency (incremental playback); Android continues MP3 streaming.
- Talk Mode: validate ElevenLabs v3 stability and latency tier directives before sending requests.
- iOS/Android Talk Mode: auto-select the first ElevenLabs voice when none is configured.
- ElevenLabs: add retry/backoff for 429/5xx and include content-type in errors for debugging.
- Talk Mode: align to the gateway's main session key and fall back to history polling when chat events drop (prevents stuck “thinking” / missing messages).
- Talk Mode: treat history timestamps as seconds or milliseconds to avoid stale assistant picks (macOS/iOS/Android).
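
The seconds-vs-milliseconds handling in the last item is a magnitude check; a minimal sketch of the heuristic (the threshold matches the `isMessageTimestampAfter` helper visible in the Android diff below — the seconds branch is inferred from context, since the Kotlin hunk is truncated):

// Sketch of the timestamp-normalization heuristic. Values above
// 10_000_000_000 cannot be Unix seconds (≈ year 2286), so they are
// treated as milliseconds; smaller values are scaled up first.
// The else branch is an assumption based on the truncated hunk below.
func isMessageTimestampAfter(_ timestamp: Double, sinceSeconds: Double) -> Bool {
    let sinceMs = sinceSeconds * 1000
    if timestamp > 10_000_000_000 {
        return timestamp > sinceMs        // already milliseconds
    }
    return timestamp * 1000 > sinceMs     // seconds: normalize before comparing
}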

View File

@@ -0,0 +1,98 @@
package com.steipete.clawdis.node.voice
import android.media.MediaDataSource
import kotlin.math.min
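/**
 * Append-only data source for MediaPlayer: readAt blocks until the requested
 * range has arrived, the producer calls finish(), or the source is closed/failed.
 */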
internal class StreamingMediaDataSource : MediaDataSource() {
private data class Chunk(val start: Long, val data: ByteArray)
private val lock = Object()
private val chunks = ArrayList<Chunk>()
private var totalSize: Long = 0
private var closed = false
private var finished = false
private var lastReadIndex = 0
fun append(data: ByteArray) {
if (data.isEmpty()) return
synchronized(lock) {
if (closed || finished) return
val chunk = Chunk(totalSize, data)
chunks.add(chunk)
totalSize += data.size.toLong()
lock.notifyAll()
}
}
fun finish() {
synchronized(lock) {
if (closed) return
finished = true
lock.notifyAll()
}
}
fun fail() {
synchronized(lock) {
closed = true
lock.notifyAll()
}
}
override fun readAt(position: Long, buffer: ByteArray, offset: Int, size: Int): Int {
if (position < 0) return -1
synchronized(lock) {
while (!closed && !finished && position >= totalSize) {
lock.wait()
}
if (closed) return -1
if (position >= totalSize && finished) return -1
val available = (totalSize - position).toInt()
val toRead = min(size, available)
var remaining = toRead
var destOffset = offset
var pos = position
var index = findChunkIndex(pos)
while (remaining > 0 && index < chunks.size) {
val chunk = chunks[index]
val inChunkOffset = (pos - chunk.start).toInt()
if (inChunkOffset >= chunk.data.size) {
index++
continue
}
val copyLen = min(remaining, chunk.data.size - inChunkOffset)
System.arraycopy(chunk.data, inChunkOffset, buffer, destOffset, copyLen)
remaining -= copyLen
destOffset += copyLen
pos += copyLen
if (inChunkOffset + copyLen >= chunk.data.size) {
index++
}
}
return toRead - remaining
}
}
override fun getSize(): Long = -1
override fun close() {
synchronized(lock) {
closed = true
lock.notifyAll()
}
}
private fun findChunkIndex(position: Long): Int {
var index = lastReadIndex
// MediaPlayer may seek backwards (e.g. to re-read headers); restart the scan
// so we never return a chunk that starts after the requested position.
if (index >= chunks.size || position < chunks[index].start) {
index = 0
}
while (index < chunks.size) {
val chunk = chunks[index]
if (position < chunk.start + chunk.data.size) break
index++
}
lastReadIndex = index
return index
}
}

View File

@@ -18,7 +18,6 @@ import android.speech.tts.UtteranceProgressListener
import android.util.Log
import androidx.core.content.ContextCompat
import com.steipete.clawdis.node.bridge.BridgeSession
import java.io.File
import java.net.HttpURLConnection
import java.net.URL
import java.util.UUID
@@ -44,6 +43,7 @@ class TalkModeManager(
) {
companion object {
private const val tag = "TalkMode"
private const val defaultModelIdFallback = "eleven_v3"
}
private val mainHandler = Handler(Looper.getMainLooper())
@@ -81,6 +81,7 @@ class TalkModeManager(
private var defaultVoiceId: String? = null
private var currentVoiceId: String? = null
private var fallbackVoiceId: String? = null
private var defaultModelId: String? = null
private var currentModelId: String? = null
private var defaultOutputFormat: String? = null
@@ -97,7 +98,7 @@ class TalkModeManager(
private var chatSubscribedSessionKey: String? = null
private var player: MediaPlayer? = null
private var currentAudioFile: File? = null
private var streamingSource: StreamingMediaDataSource? = null
private var systemTts: TextToSpeech? = null
private var systemTtsPending: CompletableDeferred<Unit>? = null
private var systemTtsPendingId: String? = null
@@ -464,7 +465,13 @@ class TalkModeManager(
val apiKey =
apiKey?.trim()?.takeIf { it.isNotEmpty() }
?: System.getenv("ELEVENLABS_API_KEY")?.trim()
val voiceId = resolvedVoice ?: currentVoiceId ?: defaultVoiceId
val preferredVoice = resolvedVoice ?: currentVoiceId ?: defaultVoiceId
val voiceId =
if (!apiKey.isNullOrEmpty()) {
resolveVoiceId(preferredVoice, apiKey)
} else {
null
}
_statusText.value = "Speaking…"
_isSpeaking.value = true
@@ -486,24 +493,25 @@ class TalkModeManager(
} else {
_usingFallbackTts.value = false
val ttsStarted = SystemClock.elapsedRealtime()
val modelId = directive?.modelId ?: currentModelId ?: defaultModelId
val request =
ElevenLabsRequest(
text = cleaned,
modelId = directive?.modelId ?: currentModelId ?: defaultModelId,
modelId = modelId,
outputFormat =
TalkModeRuntime.validatedOutputFormat(directive?.outputFormat ?: defaultOutputFormat),
speed = TalkModeRuntime.resolveSpeed(directive?.speed, directive?.rateWpm),
stability = TalkModeRuntime.validatedUnit(directive?.stability),
stability = TalkModeRuntime.validatedStability(directive?.stability, modelId),
similarity = TalkModeRuntime.validatedUnit(directive?.similarity),
style = TalkModeRuntime.validatedUnit(directive?.style),
speakerBoost = directive?.speakerBoost,
seed = TalkModeRuntime.validatedSeed(directive?.seed),
normalize = TalkModeRuntime.validatedNormalize(directive?.normalize),
language = TalkModeRuntime.validatedLanguage(directive?.language),
latencyTier = TalkModeRuntime.validatedLatencyTier(directive?.latencyTier),
)
val audio = synthesize(voiceId = voiceId!!, apiKey = apiKey!!, request = request)
Log.d(tag, "elevenlabs ok bytes=${audio.size} durMs=${SystemClock.elapsedRealtime() - ttsStarted}")
playAudio(audio)
streamAndPlay(voiceId = voiceId!!, apiKey = apiKey!!, request = request)
Log.d(tag, "elevenlabs stream ok durMs=${SystemClock.elapsedRealtime() - ttsStarted}")
}
} catch (err: Throwable) {
Log.w(tag, "speak failed: ${err.message ?: err::class.simpleName}; falling back to system voice")
@@ -520,22 +528,28 @@ class TalkModeManager(
_isSpeaking.value = false
}
private suspend fun playAudio(data: ByteArray) {
private suspend fun streamAndPlay(voiceId: String, apiKey: String, request: ElevenLabsRequest) {
stopSpeaking(resetInterrupt = false)
val file = File.createTempFile("talk-", ".mp3", context.cacheDir)
file.writeBytes(data)
currentAudioFile = file
val dataSource = StreamingMediaDataSource()
streamingSource = dataSource
val player = MediaPlayer()
this.player = player
val prepared = CompletableDeferred<Unit>()
val finished = CompletableDeferred<Unit>()
player.setAudioAttributes(
AudioAttributes.Builder()
.setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
.setUsage(AudioAttributes.USAGE_ASSISTANT)
.build(),
)
player.setOnPreparedListener {
it.start()
prepared.complete(Unit)
}
player.setOnCompletionListener {
finished.complete(Unit)
}
@@ -544,16 +558,30 @@ class TalkModeManager(
true
}
player.setDataSource(file.absolutePath)
player.setDataSource(dataSource)
withContext(Dispatchers.Main) {
player.setOnPreparedListener { it.start() }
player.prepareAsync()
}
val fetchError = CompletableDeferred<Throwable?>()
val fetchJob =
scope.launch(Dispatchers.IO) {
try {
streamTts(voiceId = voiceId, apiKey = apiKey, request = request, sink = dataSource)
fetchError.complete(null)
} catch (err: Throwable) {
dataSource.fail()
fetchError.complete(err)
}
}
Log.d(tag, "play start")
try {
prepared.await()
finished.await()
fetchError.await()?.let { throw it }
} finally {
fetchJob.cancel()
cleanupPlayer()
}
Log.d(tag, "play done")
@@ -674,8 +702,8 @@ class TalkModeManager(
player?.stop()
player?.release()
player = null
currentAudioFile?.delete()
currentAudioFile = null
streamingSource?.close()
streamingSource = null
}
private fun shouldInterrupt(transcript: String): Boolean {
@@ -713,13 +741,15 @@ class TalkModeManager(
defaultVoiceId = voice ?: envVoice?.takeIf { it.isNotEmpty() } ?: sagVoice?.takeIf { it.isNotEmpty() }
voiceAliases = aliases
if (!voiceOverrideActive) currentVoiceId = defaultVoiceId
defaultModelId = model
defaultModelId = model ?: defaultModelIdFallback
if (!modelOverrideActive) currentModelId = defaultModelId
defaultOutputFormat = outputFormat
apiKey = key ?: envKey?.takeIf { it.isNotEmpty() }
if (interrupt != null) interruptOnSpeech = interrupt
} catch (_: Throwable) {
defaultVoiceId = envVoice?.takeIf { it.isNotEmpty() } ?: sagVoice?.takeIf { it.isNotEmpty() }
defaultModelId = defaultModelIdFallback
if (!modelOverrideActive) currentModelId = defaultModelId
apiKey = envKey?.takeIf { it.isNotEmpty() }
voiceAliases = emptyMap()
}
@@ -730,9 +760,21 @@ class TalkModeManager(
return obj["runId"].asStringOrNull()
}
private suspend fun synthesize(voiceId: String, apiKey: String, request: ElevenLabsRequest): ByteArray {
return withContext(Dispatchers.IO) {
val url = URL("https://api.elevenlabs.io/v1/text-to-speech/$voiceId")
private suspend fun streamTts(
voiceId: String,
apiKey: String,
request: ElevenLabsRequest,
sink: StreamingMediaDataSource,
) {
withContext(Dispatchers.IO) {
val baseUrl = "https://api.elevenlabs.io/v1/text-to-speech/$voiceId/stream"
val latencyTier = request.latencyTier
val url =
if (latencyTier != null) {
URL("$baseUrl?optimize_streaming_latency=$latencyTier")
} else {
URL(baseUrl)
}
val conn = url.openConnection() as HttpURLConnection
conn.requestMethod = "POST"
conn.connectTimeout = 30_000
@@ -746,13 +788,21 @@ class TalkModeManager(
conn.outputStream.use { it.write(payload.toByteArray()) }
val code = conn.responseCode
val stream = if (code >= 400) conn.errorStream else conn.inputStream
val data = stream.readBytes()
if (code >= 400) {
val message = String(data)
val message = conn.errorStream?.readBytes()?.toString(Charsets.UTF_8) ?: ""
sink.fail()
throw IllegalStateException("ElevenLabs failed: $code $message")
}
data
val buffer = ByteArray(8 * 1024)
conn.inputStream.use { input ->
while (true) {
val read = input.read(buffer)
if (read <= 0) break
sink.append(buffer.copyOf(read))
}
}
sink.finish()
}
}
@@ -794,6 +844,7 @@ class TalkModeManager(
val seed: Long?,
val normalize: String?,
val language: String?,
val latencyTier: Int?,
)
private object TalkModeRuntime {
@@ -816,6 +867,15 @@ class TalkModeManager(
return value
}
fun validatedStability(value: Double?, modelId: String?): Double? {
if (value == null) return null
val normalized = modelId?.trim()?.lowercase()
if (normalized == "eleven_v3") {
return if (value == 0.0 || value == 0.5 || value == 1.0) value else null
}
return validatedUnit(value)
}
fun validatedSeed(value: Long?): Long? {
if (value == null) return null
if (value < 0 || value > 4294967295L) return null
@@ -840,6 +900,12 @@ class TalkModeManager(
return if (trimmed.startsWith("mp3_")) trimmed else null
}
fun validatedLatencyTier(value: Int?): Int? {
if (value == null) return null
if (value < 0 || value > 4) return null
return value
}
fun isMessageTimestampAfter(timestamp: Double, sinceSeconds: Double): Boolean {
val sinceMs = sinceSeconds * 1000
return if (timestamp > 10_000_000_000) {
@@ -876,6 +942,62 @@ class TalkModeManager(
return if (isLikelyVoiceId(trimmed)) trimmed else null
}
private suspend fun resolveVoiceId(preferred: String?, apiKey: String): String? {
val trimmed = preferred?.trim().orEmpty()
if (trimmed.isNotEmpty()) {
val resolved = resolveVoiceAlias(trimmed)
if (resolved != null) return resolved
Log.w(tag, "unknown voice alias $trimmed")
}
fallbackVoiceId?.let { return it }
return try {
val voices = listVoices(apiKey)
val first = voices.firstOrNull() ?: return null
fallbackVoiceId = first.voiceId
if (defaultVoiceId.isNullOrBlank()) {
defaultVoiceId = first.voiceId
}
if (!voiceOverrideActive) {
currentVoiceId = first.voiceId
}
val name = first.name ?: "unknown"
Log.d(tag, "default voice selected $name (${first.voiceId})")
first.voiceId
} catch (err: Throwable) {
Log.w(tag, "list voices failed: ${err.message ?: err::class.simpleName}")
null
}
}
private suspend fun listVoices(apiKey: String): List<ElevenLabsVoice> {
return withContext(Dispatchers.IO) {
val url = URL("https://api.elevenlabs.io/v1/voices")
val conn = url.openConnection() as HttpURLConnection
conn.requestMethod = "GET"
conn.connectTimeout = 15_000
conn.readTimeout = 15_000
conn.setRequestProperty("xi-api-key", apiKey)
val code = conn.responseCode
val stream = if (code >= 400) conn.errorStream else conn.inputStream
val data = stream?.readBytes() ?: ByteArray(0) // errorStream can be null when the server sends no body
if (code >= 400) {
val message = data.toString(Charsets.UTF_8)
throw IllegalStateException("ElevenLabs voices failed: $code $message")
}
val root = json.parseToJsonElement(data.toString(Charsets.UTF_8)).asObjectOrNull()
val voices = (root?.get("voices") as? JsonArray) ?: JsonArray(emptyList())
voices.mapNotNull { entry ->
val obj = entry.asObjectOrNull() ?: return@mapNotNull null
val voiceId = obj["voice_id"].asStringOrNull() ?: return@mapNotNull null
val name = obj["name"].asStringOrNull()
ElevenLabsVoice(voiceId, name)
}
}
}
private fun isLikelyVoiceId(value: String): Boolean {
if (value.length < 10) return false
return value.all { it.isLetterOrDigit() || it == '-' || it == '_' }
@@ -884,6 +1006,8 @@ class TalkModeManager(
private fun normalizeAliasKey(value: String): String =
value.trim().lowercase()
private data class ElevenLabsVoice(val voiceId: String, val name: String?)
private val listener =
object : RecognitionListener {
override fun onReadyForSpeech(params: Bundle?) {

View File

@@ -9,6 +9,7 @@ import Speech
@Observable
final class TalkModeManager: NSObject {
private typealias SpeechRequest = SFSpeechAudioBufferRecognitionRequest
private static let defaultModelIdFallback = "eleven_v3"
var isEnabled: Bool = false
var isListening: Bool = false
var isSpeaking: Bool = false
@@ -36,11 +37,12 @@ final class TalkModeManager: NSObject {
private var voiceAliases: [String: String] = [:]
private var interruptOnSpeech: Bool = true
private var mainSessionKey: String = "main"
private var fallbackVoiceId: String?
private var lastPlaybackWasPCM: Bool = false
private var bridge: BridgeSession?
private let silenceWindow: TimeInterval = 0.7
private var player: AVAudioPlayer?
private var chatSubscribedSessionKeys = Set<String>()
private let logger = Logger(subsystem: "com.steipete.clawdis", category: "TalkMode")
@@ -446,43 +448,43 @@ final class TalkModeManager: NSObject {
let started = Date()
let language = ElevenLabsTTSClient.validatedLanguage(directive?.language)
let voiceId = resolvedVoice ?? self.currentVoiceId ?? self.defaultVoiceId
let resolvedKey =
(self.apiKey?.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty == false ? self.apiKey : nil) ??
ProcessInfo.processInfo.environment["ELEVENLABS_API_KEY"]
let apiKey = resolvedKey?.trimmingCharacters(in: .whitespacesAndNewlines)
let preferredVoice = resolvedVoice ?? self.currentVoiceId ?? self.defaultVoiceId
let voiceId: String? = if let apiKey, !apiKey.isEmpty {
await self.resolveVoiceId(preferred: preferredVoice, apiKey: apiKey)
} else {
nil
}
let canUseElevenLabs = (voiceId?.isEmpty == false) && (apiKey?.isEmpty == false)
if canUseElevenLabs, let voiceId, let apiKey {
let desiredOutputFormat = directive?.outputFormat ?? self.defaultOutputFormat
let desiredOutputFormat = directive?.outputFormat ?? self.defaultOutputFormat ?? "pcm_44100"
let outputFormat = ElevenLabsTTSClient.validatedOutputFormat(desiredOutputFormat)
if outputFormat == nil, let desiredOutputFormat, !desiredOutputFormat.isEmpty {
self.logger.warning(
"talk output_format unsupported for local playback: \(desiredOutputFormat, privacy: .public)")
}
let modelId = directive?.modelId ?? self.currentModelId ?? self.defaultModelId
let request = ElevenLabsTTSRequest(
text: cleaned,
modelId: directive?.modelId ?? self.currentModelId ?? self.defaultModelId,
modelId: modelId,
outputFormat: outputFormat,
speed: TalkTTSValidation.resolveSpeed(speed: directive?.speed, rateWPM: directive?.rateWPM),
stability: TalkTTSValidation.validatedUnit(directive?.stability),
stability: TalkTTSValidation.validatedStability(directive?.stability, modelId: modelId),
similarity: TalkTTSValidation.validatedUnit(directive?.similarity),
style: TalkTTSValidation.validatedUnit(directive?.style),
speakerBoost: directive?.speakerBoost,
seed: TalkTTSValidation.validatedSeed(directive?.seed),
normalize: ElevenLabsTTSClient.validatedNormalize(directive?.normalize),
language: language)
language: language,
latencyTier: TalkTTSValidation.validatedLatencyTier(directive?.latencyTier))
let synthTimeoutSeconds = max(20.0, min(90.0, Double(cleaned.count) * 0.12))
let client = ElevenLabsTTSClient(apiKey: apiKey)
let audio = try await client.synthesizeWithHardTimeout(
voiceId: voiceId,
request: request,
hardTimeoutSeconds: synthTimeoutSeconds)
self.logger
.info(
"elevenlabs ok bytes=\(audio.count, privacy: .public) dur=\(Date().timeIntervalSince(started), privacy: .public)s")
let stream = client.streamSynthesize(voiceId: voiceId, request: request)
if self.interruptOnSpeech {
do {
@@ -494,7 +496,21 @@ final class TalkModeManager: NSObject {
}
self.statusText = "Speaking…"
try await self.playAudio(data: audio)
let sampleRate = TalkTTSValidation.pcmSampleRate(from: outputFormat)
let result: StreamingPlaybackResult
if let sampleRate {
self.lastPlaybackWasPCM = true
result = await PCMStreamingAudioPlayer.shared.play(stream: stream, sampleRate: sampleRate)
} else {
self.lastPlaybackWasPCM = false
result = await StreamingAudioPlayer.shared.play(stream: stream)
}
self.logger
.info(
"elevenlabs stream finished=\(result.finished, privacy: .public) dur=\(Date().timeIntervalSince(started), privacy: .public)s")
if !result.finished, let interruptedAt = result.interruptedAt {
self.lastInterruptedAtSeconds = interruptedAt
}
} else {
self.logger.warning("tts unavailable; falling back to system voice (missing key or voiceId)")
if self.interruptOnSpeech {
@@ -533,30 +549,17 @@ final class TalkModeManager: NSObject {
self.isSpeaking = false
}
private func playAudio(data: Data) async throws {
self.player?.stop()
let player = try AVAudioPlayer(data: data)
self.player = player
player.prepareToPlay()
self.logger.info("play start")
guard player.play() else {
throw NSError(domain: "TalkMode", code: 2, userInfo: [
NSLocalizedDescriptionKey: "audio player refused to play",
])
}
while player.isPlaying {
try? await Task.sleep(nanoseconds: 120_000_000)
}
self.logger.info("play done")
}
private func stopSpeaking(storeInterruption: Bool = true) {
guard self.isSpeaking else { return }
let interruptedAt = self.lastPlaybackWasPCM
? PCMStreamingAudioPlayer.shared.stop()
: StreamingAudioPlayer.shared.stop()
if storeInterruption {
self.lastInterruptedAtSeconds = self.player?.currentTime
self.lastInterruptedAtSeconds = interruptedAt
}
self.player?.stop()
self.player = nil
_ = self.lastPlaybackWasPCM
? StreamingAudioPlayer.shared.stop()
: PCMStreamingAudioPlayer.shared.stop()
TalkSystemSpeechSynthesizer.shared.stop()
self.isSpeaking = false
}
@@ -581,6 +584,37 @@ final class TalkModeManager: NSObject {
return Self.isLikelyVoiceId(trimmed) ? trimmed : nil
}
private func resolveVoiceId(preferred: String?, apiKey: String) async -> String? {
let trimmed = preferred?.trimmingCharacters(in: .whitespacesAndNewlines) ?? ""
if !trimmed.isEmpty {
if let resolved = self.resolveVoiceAlias(trimmed) { return resolved }
self.logger.warning("unknown voice alias \(trimmed, privacy: .public)")
}
if let fallbackVoiceId { return fallbackVoiceId }
do {
let voices = try await ElevenLabsTTSClient(apiKey: apiKey).listVoices()
guard let first = voices.first else {
self.logger.warning("elevenlabs voices list empty")
return nil
}
self.fallbackVoiceId = first.voiceId
if self.defaultVoiceId == nil {
self.defaultVoiceId = first.voiceId
}
if !self.voiceOverrideActive {
self.currentVoiceId = first.voiceId
}
let name = first.name ?? "unknown"
self.logger
.info("default voice selected \(name, privacy: .public) (\(first.voiceId, privacy: .public))")
return first.voiceId
} catch {
self.logger.error("elevenlabs list voices failed: \(error.localizedDescription, privacy: .public)")
return nil
}
}
private static func isLikelyVoiceId(_ value: String) -> Bool {
guard value.count >= 10 else { return false }
return value.allSatisfy { $0.isLetter || $0.isNumber || $0 == "-" || $0 == "_" }
@@ -598,22 +632,23 @@ final class TalkModeManager: NSObject {
self.mainSessionKey = rawMainKey.isEmpty ? "main" : rawMainKey
self.defaultVoiceId = (talk?["voiceId"] as? String)?.trimmingCharacters(in: .whitespacesAndNewlines)
if let aliases = talk?["voiceAliases"] as? [String: Any] {
self.voiceAliases =
aliases.compactMap { key, value in
guard let id = value as? String else { return nil }
let normalizedKey = key.trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
let trimmedId = id.trimmingCharacters(in: .whitespacesAndNewlines)
guard !normalizedKey.isEmpty, !trimmedId.isEmpty else { return nil }
return (normalizedKey, trimmedId)
}
.reduce(into: [:]) { $0[$1.0] = $1.1 }
var resolved: [String: String] = [:]
for (key, value) in aliases {
guard let id = value as? String else { continue }
let normalizedKey = key.trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
let trimmedId = id.trimmingCharacters(in: .whitespacesAndNewlines)
guard !normalizedKey.isEmpty, !trimmedId.isEmpty else { continue }
resolved[normalizedKey] = trimmedId
}
self.voiceAliases = resolved
} else {
self.voiceAliases = [:]
}
if !self.voiceOverrideActive {
self.currentVoiceId = self.defaultVoiceId
}
self.defaultModelId = (talk?["modelId"] as? String)?.trimmingCharacters(in: .whitespacesAndNewlines)
let model = (talk?["modelId"] as? String)?.trimmingCharacters(in: .whitespacesAndNewlines)
self.defaultModelId = (model?.isEmpty == false) ? model : Self.defaultModelIdFallback
if !self.modelOverrideActive {
self.currentModelId = self.defaultModelId
}
@@ -624,7 +659,10 @@ final class TalkModeManager: NSObject {
self.interruptOnSpeech = interrupt
}
} catch {
// ignore
self.defaultModelId = Self.defaultModelIdFallback
if !self.modelOverrideActive {
self.currentModelId = self.defaultModelId
}
}
}

View File

@@ -10,6 +10,7 @@ actor TalkModeRuntime {
private let logger = Logger(subsystem: "com.steipete.clawdis", category: "talk.runtime")
private let ttsLogger = Logger(subsystem: "com.steipete.clawdis", category: "talk.tts")
private static let defaultModelIdFallback = "eleven_v3"
private final class RMSMeter: @unchecked Sendable {
private let lock = NSLock()
@@ -62,6 +63,7 @@ actor TalkModeRuntime {
private var lastSpokenText: String?
private var apiKey: String?
private var fallbackVoiceId: String?
private var lastPlaybackWasPCM: Bool = false
private let silenceWindow: TimeInterval = 0.7
private let minSpeechRMS: Double = 1e-3
@@ -496,7 +498,7 @@ actor TalkModeRuntime {
do {
if let apiKey, !apiKey.isEmpty, let voiceId {
let desiredOutputFormat = directive?.outputFormat ?? self.defaultOutputFormat
let desiredOutputFormat = directive?.outputFormat ?? self.defaultOutputFormat ?? "pcm_44100"
let outputFormat = ElevenLabsTTSClient.validatedOutputFormat(desiredOutputFormat)
if outputFormat == nil, let desiredOutputFormat, !desiredOutputFormat.isEmpty {
self.logger
@@ -504,27 +506,25 @@ actor TalkModeRuntime {
"talk output_format unsupported for local playback: \(desiredOutputFormat, privacy: .public)")
}
let modelId = directive?.modelId ?? self.currentModelId ?? self.defaultModelId
let request = ElevenLabsTTSRequest(
text: cleaned,
modelId: directive?.modelId ?? self.currentModelId ?? self.defaultModelId,
modelId: modelId,
outputFormat: outputFormat,
speed: TalkTTSValidation.resolveSpeed(speed: directive?.speed, rateWPM: directive?.rateWPM),
stability: TalkTTSValidation.validatedUnit(directive?.stability),
stability: TalkTTSValidation.validatedStability(directive?.stability, modelId: modelId),
similarity: TalkTTSValidation.validatedUnit(directive?.similarity),
style: TalkTTSValidation.validatedUnit(directive?.style),
speakerBoost: directive?.speakerBoost,
seed: TalkTTSValidation.validatedSeed(directive?.seed),
normalize: ElevenLabsTTSClient.validatedNormalize(directive?.normalize),
language: language)
language: language,
latencyTier: TalkTTSValidation.validatedLatencyTier(directive?.latencyTier))
self.ttsLogger.info("talk TTS synth timeout=\(synthTimeoutSeconds, privacy: .public)s")
let client = ElevenLabsTTSClient(apiKey: apiKey)
let audio = try await client.synthesizeWithHardTimeout(
voiceId: voiceId,
request: request,
hardTimeoutSeconds: synthTimeoutSeconds)
let stream = client.streamSynthesize(voiceId: voiceId, request: request)
guard self.isCurrent(gen) else { return }
self.ttsLogger.info("talk TTS response bytes=\(audio.count, privacy: .public)")
if self.interruptOnSpeech {
await self.startRecognition()
@@ -534,12 +534,20 @@ actor TalkModeRuntime {
await MainActor.run { TalkModeController.shared.updatePhase(.speaking) }
self.phase = .speaking
let result = await TalkAudioPlayer.shared.play(data: audio)
let sampleRate = TalkTTSValidation.pcmSampleRate(from: outputFormat)
let result: StreamingPlaybackResult
if let sampleRate {
self.lastPlaybackWasPCM = true
result = await PCMStreamingAudioPlayer.shared.play(stream: stream, sampleRate: sampleRate)
} else {
self.lastPlaybackWasPCM = false
result = await StreamingAudioPlayer.shared.play(stream: stream)
}
self.ttsLogger
.info(
"talk audio result finished=\(result.finished, privacy: .public) interruptedAt=\(String(describing: result.interruptedAt), privacy: .public)")
if !result.finished, result.interruptedAt == nil {
throw NSError(domain: "TalkAudioPlayer", code: 1, userInfo: [
throw NSError(domain: "StreamingAudioPlayer", code: 1, userInfo: [
NSLocalizedDescriptionKey: "audio playback failed",
])
}
@@ -631,7 +639,15 @@ actor TalkModeRuntime {
}
func stopSpeaking(reason: TalkStopReason) async {
let interruptedAt = await MainActor.run { TalkAudioPlayer.shared.stop() }
let interruptedAt = await MainActor.run {
let primary = self.lastPlaybackWasPCM
? PCMStreamingAudioPlayer.shared.stop()
: StreamingAudioPlayer.shared.stop()
_ = self.lastPlaybackWasPCM
? StreamingAudioPlayer.shared.stop()
: PCMStreamingAudioPlayer.shared.stop()
return primary
}
await TalkSystemSpeechSynthesizer.shared.stop()
guard self.phase == .speaking else { return }
if reason == .speech, let interruptedAt {
@@ -707,7 +723,8 @@ actor TalkModeRuntime {
guard !key.isEmpty, !value.isEmpty else { return }
acc[key] = value
} ?? [:]
let model = talk?["modelId"]?.stringValue
let model = talk?["modelId"]?.stringValue?.trimmingCharacters(in: .whitespacesAndNewlines)
let resolvedModel = (model?.isEmpty == false) ? model! : Self.defaultModelIdFallback
let outputFormat = talk?["outputFormat"]?.stringValue
let interrupt = talk?["interruptOnSpeech"]?.boolValue
let apiKey = talk?["apiKey"]?.stringValue
@@ -721,7 +738,7 @@ actor TalkModeRuntime {
return TalkRuntimeConfig(
voiceId: resolvedVoice,
voiceAliases: resolvedAliases,
modelId: model,
modelId: resolvedModel,
outputFormat: outputFormat,
interruptOnSpeech: interrupt ?? true,
apiKey: resolvedApiKey)
@@ -733,7 +750,7 @@ actor TalkModeRuntime {
return TalkRuntimeConfig(
voiceId: resolvedVoice,
voiceAliases: [:],
modelId: nil,
modelId: Self.defaultModelIdFallback,
outputFormat: nil,
interruptOnSpeech: true,
apiKey: resolvedApiKey)

View File

@@ -22,6 +22,7 @@ public struct ElevenLabsTTSRequest: Sendable {
public var seed: UInt32?
public var normalize: String?
public var language: String?
public var latencyTier: Int?
public init(
text: String,
@@ -34,7 +35,8 @@ public struct ElevenLabsTTSRequest: Sendable {
speakerBoost: Bool? = nil,
seed: UInt32? = nil,
normalize: String? = nil,
language: String? = nil)
language: String? = nil,
latencyTier: Int? = nil)
{
self.text = text
self.modelId = modelId
@@ -47,6 +49,7 @@ public struct ElevenLabsTTSRequest: Sendable {
self.seed = seed
self.normalize = normalize
self.language = language
self.latencyTier = latencyTier
}
}
@@ -155,6 +158,72 @@ public struct ElevenLabsTTSClient: Sendable {
])
}
public func streamSynthesize(
voiceId: String,
request: ElevenLabsTTSRequest) -> AsyncThrowingStream<Data, Error>
{
AsyncThrowingStream { continuation in
let task = Task {
do {
let url = Self.streamingURL(
baseUrl: self.baseUrl,
voiceId: voiceId,
latencyTier: request.latencyTier)
let body = try JSONSerialization.data(withJSONObject: Self.buildPayload(request), options: [])
var req = URLRequest(url: url)
req.httpMethod = "POST"
req.httpBody = body
req.timeoutInterval = self.requestTimeoutSeconds
req.setValue("application/json", forHTTPHeaderField: "Content-Type")
req.setValue("audio/mpeg", forHTTPHeaderField: "Accept")
req.setValue(self.apiKey, forHTTPHeaderField: "xi-api-key")
let (bytes, response) = try await URLSession.shared.bytes(for: req)
guard let http = response as? HTTPURLResponse else {
throw NSError(domain: "ElevenLabsTTS", code: 1, userInfo: [
NSLocalizedDescriptionKey: "ElevenLabs invalid response",
])
}
let contentType = (http.value(forHTTPHeaderField: "Content-Type") ?? "unknown").lowercased()
if http.statusCode >= 400 {
let message = try await Self.readErrorBody(bytes: bytes)
throw NSError(domain: "ElevenLabsTTS", code: http.statusCode, userInfo: [
NSLocalizedDescriptionKey: "ElevenLabs failed: \(http.statusCode) ct=\(contentType) \(message)",
])
}
if !contentType.contains("audio") {
let message = try await Self.readErrorBody(bytes: bytes)
throw NSError(domain: "ElevenLabsTTS", code: 415, userInfo: [
NSLocalizedDescriptionKey: "ElevenLabs returned non-audio ct=\(contentType) \(message)",
])
}
var buffer = Data()
buffer.reserveCapacity(16_384)
for try await byte in bytes {
buffer.append(byte)
if buffer.count >= 8_192 {
continuation.yield(buffer)
buffer.removeAll(keepingCapacity: true)
}
}
if !buffer.isEmpty {
continuation.yield(buffer)
}
continuation.finish()
} catch {
continuation.finish(throwing: error)
}
}
continuation.onTermination = { _ in
task.cancel()
}
}
}
public func listVoices() async throws -> [ElevenLabsVoice] {
var url = self.baseUrl
url.appendPathComponent("v1")
@@ -180,7 +249,7 @@ public struct ElevenLabsTTSClient: Sendable {
public static func validatedOutputFormat(_ value: String?) -> String? {
let trimmed = (value ?? "").trimmingCharacters(in: .whitespacesAndNewlines)
guard !trimmed.isEmpty else { return nil }
guard trimmed.hasPrefix("mp3_") else { return nil }
guard trimmed.hasPrefix("mp3_") || trimmed.hasPrefix("pcm_") else { return nil }
return trimmed
}
@@ -230,4 +299,33 @@ public struct ElevenLabsTTSClient: Sendable {
let raw = String(data: data.prefix(4096), encoding: .utf8) ?? "unknown"
return raw.replacingOccurrences(of: "\n", with: " ").replacingOccurrences(of: "\r", with: " ")
}
private static func streamingURL(baseUrl: URL, voiceId: String, latencyTier: Int?) -> URL {
var url = baseUrl
url.appendPathComponent("v1")
url.appendPathComponent("text-to-speech")
url.appendPathComponent(voiceId)
url.appendPathComponent("stream")
guard let latencyTier else { return url }
let latencyItem = URLQueryItem(
name: "optimize_streaming_latency",
value: "\(latencyTier)")
guard var components = URLComponents(url: url, resolvingAgainstBaseURL: false) else {
return url
}
var items = components.queryItems ?? []
items.append(latencyItem)
components.queryItems = items
return components.url ?? url
}
private static func readErrorBody(bytes: URLSession.AsyncBytes) async throws -> String {
var data = Data()
for try await byte in bytes {
data.append(byte)
if data.count >= 4096 { break }
}
return truncatedErrorBody(data)
}
}

View File

@@ -0,0 +1,144 @@
import AVFoundation
import Foundation
import OSLog
@MainActor
public final class PCMStreamingAudioPlayer {
public static let shared = PCMStreamingAudioPlayer()
private let logger = Logger(subsystem: "com.steipete.clawdis", category: "talk.tts.pcm")
private var engine = AVAudioEngine()
private var player = AVAudioPlayerNode()
private var format: AVAudioFormat?
private var pendingBuffers: Int = 0
private var inputFinished = false
private var continuation: CheckedContinuation<StreamingPlaybackResult, Never>?
public init() {
self.engine.attach(self.player)
}
public func play(stream: AsyncThrowingStream<Data, Error>, sampleRate: Double) async -> StreamingPlaybackResult {
self.stopInternal()
// Resolve any continuation left over from a previous play() so that caller never hangs.
self.finish(StreamingPlaybackResult(finished: false, interruptedAt: nil))
let format = AVAudioFormat(
commonFormat: .pcmFormatInt16,
sampleRate: sampleRate,
channels: 1,
interleaved: true)
guard let format else {
return StreamingPlaybackResult(finished: false, interruptedAt: nil)
}
self.configure(format: format)
return await withCheckedContinuation { continuation in
self.continuation = continuation
self.pendingBuffers = 0
self.inputFinished = false
Task.detached { [weak self] in
guard let self else { return }
do {
for try await chunk in stream {
await self.enqueuePCM(chunk, format: format)
}
await self.finishInput()
} catch {
await self.fail(error)
}
}
}
}
public func stop() -> Double? {
let interruptedAt = self.currentTimeSeconds()
self.stopInternal()
self.finish(StreamingPlaybackResult(finished: false, interruptedAt: interruptedAt))
return interruptedAt
}
private func configure(format: AVAudioFormat) {
if self.format?.sampleRate != format.sampleRate || self.format?.commonFormat != format.commonFormat {
self.engine.stop()
self.engine = AVAudioEngine()
self.player = AVAudioPlayerNode()
self.engine.attach(self.player)
}
self.format = format
if self.engine.attachedNodes.contains(self.player) {
self.engine.connect(self.player, to: self.engine.mainMixerNode, format: format)
}
}
private func enqueuePCM(_ data: Data, format: AVAudioFormat) async {
guard !data.isEmpty else { return }
let frameCount = data.count / MemoryLayout<Int16>.size
guard frameCount > 0 else { return }
guard let buffer = AVAudioPCMBuffer(pcmFormat: format, frameCapacity: AVAudioFrameCount(frameCount)) else {
return
}
buffer.frameLength = AVAudioFrameCount(frameCount)
data.withUnsafeBytes { raw in
guard let src = raw.baseAddress else { return }
let audioBuffer = buffer.audioBufferList.pointee.mBuffers
if let dst = audioBuffer.mData {
memcpy(dst, src, frameCount * MemoryLayout<Int16>.size)
}
}
self.pendingBuffers += 1
self.player.scheduleBuffer(buffer) { [weak self] in
Task { @MainActor in
guard let self else { return }
self.pendingBuffers = max(0, self.pendingBuffers - 1)
if self.inputFinished && self.pendingBuffers == 0 {
self.finish(StreamingPlaybackResult(finished: true, interruptedAt: nil))
}
}
}
if !self.player.isPlaying {
do {
try self.engine.start()
self.player.play()
} catch {
self.logger.error("pcm engine start failed: \(error.localizedDescription, privacy: .public)")
self.fail(error)
}
}
}
private func finishInput() {
self.inputFinished = true
if self.pendingBuffers == 0 {
self.finish(StreamingPlaybackResult(finished: true, interruptedAt: nil))
}
}
private func fail(_ error: Error) {
self.logger.error("pcm stream failed: \(error.localizedDescription, privacy: .public)")
self.finish(StreamingPlaybackResult(finished: false, interruptedAt: nil))
}
private func stopInternal() {
self.player.stop()
self.engine.stop()
self.pendingBuffers = 0
self.inputFinished = false
}
private func finish(_ result: StreamingPlaybackResult) {
let continuation = self.continuation
self.continuation = nil
continuation?.resume(returning: result)
}
private func currentTimeSeconds() -> Double? {
guard let nodeTime = self.player.lastRenderTime,
let playerTime = self.player.playerTime(forNodeTime: nodeTime)
else { return nil }
return Double(playerTime.sampleTime) / playerTime.sampleRate
}
}

View File

@@ -0,0 +1,429 @@
import AudioToolbox
import Foundation
import OSLog
public struct StreamingPlaybackResult: Sendable {
public let finished: Bool
public let interruptedAt: Double?
public init(finished: Bool, interruptedAt: Double?) {
self.finished = finished
self.interruptedAt = interruptedAt
}
}
@MainActor
public final class StreamingAudioPlayer: NSObject {
public static let shared = StreamingAudioPlayer()
private let logger = Logger(subsystem: "com.steipete.clawdis", category: "talk.tts.stream")
private var playback: Playback?
public func play(stream: AsyncThrowingStream<Data, Error>) async -> StreamingPlaybackResult {
self.stopInternal()
let playback = Playback(logger: self.logger)
self.playback = playback
return await withCheckedContinuation { continuation in
playback.setContinuation(continuation)
playback.start()
Task.detached {
do {
for try await chunk in stream {
playback.append(chunk)
}
playback.finishInput()
} catch {
playback.fail(error)
}
}
}
}
public func stop() -> Double? {
guard let playback else { return nil }
let interruptedAt = playback.stop(immediate: true)
self.finish(playback: playback, result: StreamingPlaybackResult(finished: false, interruptedAt: interruptedAt))
return interruptedAt
}
private func stopInternal() {
guard let playback else { return }
let interruptedAt = playback.stop(immediate: true)
self.finish(playback: playback, result: StreamingPlaybackResult(finished: false, interruptedAt: interruptedAt))
}
private func finish(playback: Playback, result: StreamingPlaybackResult) {
playback.finish(result)
guard self.playback === playback else { return }
self.playback = nil
}
}
private final class Playback: @unchecked Sendable {
private static let bufferCount: Int = 3
private static let bufferSize: Int = 32 * 1024
private let logger: Logger
private let lock = NSLock()
private let parseQueue = DispatchQueue(label: "talk.stream.parse")
fileprivate let bufferLock = NSLock()
fileprivate let bufferSemaphore = DispatchSemaphore(value: bufferCount)
private var continuation: CheckedContinuation<StreamingPlaybackResult, Never>?
private var finished = false
private var audioFileStream: AudioFileStreamID?
private var audioQueue: AudioQueueRef?
fileprivate var audioFormat: AudioStreamBasicDescription?
fileprivate var maxPacketSize: UInt32 = 0
fileprivate var availableBuffers: [AudioQueueBufferRef] = []
private var currentBuffer: AudioQueueBufferRef?
private var currentBufferSize: Int = 0
private var currentPacketDescs: [AudioStreamPacketDescription] = []
private var isRunning = false
fileprivate var inputFinished = false
private var startRequested = false
private var sampleRate: Double = 0
init(logger: Logger) {
self.logger = logger
}
func setContinuation(_ continuation: CheckedContinuation<StreamingPlaybackResult, Never>) {
self.lock.lock()
self.continuation = continuation
self.lock.unlock()
}
func start() {
let selfPtr = Unmanaged.passUnretained(self).toOpaque()
let status = AudioFileStreamOpen(
selfPtr,
propertyListenerProc,
packetsProc,
kAudioFileMP3Type,
&self.audioFileStream)
if status != noErr {
self.logger.error("talk stream open failed: \(status)")
self.finish(StreamingPlaybackResult(finished: false, interruptedAt: nil))
}
}
func append(_ data: Data) {
guard !data.isEmpty else { return }
self.parseQueue.async { [weak self] in
guard let self else { return }
guard let audioFileStream = self.audioFileStream else { return }
let status = data.withUnsafeBytes { bytes in
AudioFileStreamParseBytes(
audioFileStream,
UInt32(bytes.count),
bytes.baseAddress,
[])
}
if status != noErr {
self.logger.error("talk stream parse failed: \(status)")
self.fail(NSError(domain: "StreamingAudio", code: Int(status)))
}
}
}
func finishInput() {
self.parseQueue.async { [weak self] in
guard let self else { return }
self.inputFinished = true
if self.audioQueue == nil {
self.finish(StreamingPlaybackResult(finished: false, interruptedAt: nil))
return
}
self.enqueueCurrentBuffer(flushOnly: true)
_ = self.stop(immediate: false)
}
}
func fail(_ error: Error) {
self.logger.error("talk stream failed: \(error.localizedDescription, privacy: .public)")
_ = self.stop(immediate: true)
self.finish(StreamingPlaybackResult(finished: false, interruptedAt: nil))
}
func stop(immediate: Bool) -> Double? {
guard let audioQueue else { return nil }
let interruptedAt = self.currentTimeSeconds()
AudioQueueStop(audioQueue, immediate)
return interruptedAt
}
fileprivate func finish(_ result: StreamingPlaybackResult) {
let continuation: CheckedContinuation<StreamingPlaybackResult, Never>?
self.lock.lock()
if self.finished {
continuation = nil
} else {
self.finished = true
continuation = self.continuation
self.continuation = nil
}
self.lock.unlock()
continuation?.resume(returning: result)
self.teardown()
}
private func teardown() {
if let audioQueue {
AudioQueueDispose(audioQueue, true)
self.audioQueue = nil
}
if let audioFileStream {
AudioFileStreamClose(audioFileStream)
self.audioFileStream = nil
}
self.bufferLock.lock()
self.availableBuffers.removeAll()
self.bufferLock.unlock()
self.currentBuffer = nil
self.currentPacketDescs.removeAll()
}
fileprivate func setupQueueIfNeeded(_ asbd: AudioStreamBasicDescription) {
guard self.audioQueue == nil else { return }
var format = asbd
self.audioFormat = format
self.sampleRate = format.mSampleRate
let selfPtr = Unmanaged.passUnretained(self).toOpaque()
let status = AudioQueueNewOutput(
&format,
outputCallbackProc,
selfPtr,
nil,
nil,
0,
&self.audioQueue)
if status != noErr {
self.logger.error("talk queue create failed: \(status)")
self.finish(StreamingPlaybackResult(finished: false, interruptedAt: nil))
return
}
if let audioQueue {
AudioQueueAddPropertyListener(audioQueue, kAudioQueueProperty_IsRunning, isRunningCallbackProc, selfPtr)
}
if let audioFileStream {
var cookieSize: UInt32 = 0
var writable: DarwinBoolean = false
let cookieStatus = AudioFileStreamGetPropertyInfo(
audioFileStream,
kAudioFileStreamProperty_MagicCookieData,
&cookieSize,
&writable)
if cookieStatus == noErr, cookieSize > 0, let audioQueue {
var cookie = [UInt8](repeating: 0, count: Int(cookieSize))
let readStatus = AudioFileStreamGetProperty(
audioFileStream,
kAudioFileStreamProperty_MagicCookieData,
&cookieSize,
&cookie)
if readStatus == noErr {
AudioQueueSetProperty(audioQueue, kAudioQueueProperty_MagicCookie, cookie, cookieSize)
}
}
}
if let audioQueue {
for _ in 0..<Self.bufferCount {
var buffer: AudioQueueBufferRef?
let allocStatus = AudioQueueAllocateBuffer(audioQueue, UInt32(Self.bufferSize), &buffer)
if allocStatus == noErr, let buffer {
self.bufferLock.lock()
self.availableBuffers.append(buffer)
self.bufferLock.unlock()
}
}
}
}
private func enqueueCurrentBuffer(flushOnly: Bool = false) {
guard let audioQueue, let buffer = self.currentBuffer else { return }
guard self.currentBufferSize > 0 else { return }
buffer.pointee.mAudioDataByteSize = UInt32(self.currentBufferSize)
let packetCount = UInt32(self.currentPacketDescs.count)
let status = self.currentPacketDescs.withUnsafeBufferPointer { descPtr in
AudioQueueEnqueueBuffer(audioQueue, buffer, packetCount, descPtr.baseAddress)
}
if status != noErr {
self.logger.error("talk queue enqueue failed: \(status)")
} else {
if !self.startRequested {
self.startRequested = true
let startStatus = AudioQueueStart(audioQueue, nil)
if startStatus != noErr {
self.logger.error("talk queue start failed: \(startStatus)")
}
}
}
self.currentBuffer = nil
self.currentBufferSize = 0
self.currentPacketDescs.removeAll(keepingCapacity: true)
if !flushOnly {
self.bufferSemaphore.wait()
self.bufferLock.lock()
let next = self.availableBuffers.popLast()
self.bufferLock.unlock()
if let next { self.currentBuffer = next }
}
}
fileprivate func handlePackets(
numberBytes: UInt32,
numberPackets: UInt32,
inputData: UnsafeRawPointer,
packetDescriptions: UnsafeMutablePointer<AudioStreamPacketDescription>?)
{
if self.audioQueue == nil, let format = self.audioFormat {
self.setupQueueIfNeeded(format)
}
if self.audioQueue == nil {
return
}
if self.currentBuffer == nil {
self.bufferSemaphore.wait()
self.bufferLock.lock()
self.currentBuffer = self.availableBuffers.popLast()
self.bufferLock.unlock()
self.currentBufferSize = 0
self.currentPacketDescs.removeAll(keepingCapacity: true)
}
let bytes = inputData.assumingMemoryBound(to: UInt8.self)
let packetCount = Int(numberPackets)
for index in 0..<packetCount {
let packetOffset: Int
let packetSize: Int
if let packetDescriptions {
packetOffset = Int(packetDescriptions[index].mStartOffset)
packetSize = Int(packetDescriptions[index].mDataByteSize)
} else {
let size = Int(numberBytes) / packetCount
packetOffset = index * size
packetSize = size
}
if packetSize > Self.bufferSize {
continue
}
if self.currentBufferSize + packetSize > Self.bufferSize {
self.enqueueCurrentBuffer()
}
guard let buffer = self.currentBuffer else { continue }
let dest = buffer.pointee.mAudioData.advanced(by: self.currentBufferSize)
memcpy(dest, bytes.advanced(by: packetOffset), packetSize)
let desc = AudioStreamPacketDescription(
mStartOffset: Int64(self.currentBufferSize),
mVariableFramesInPacket: 0,
mDataByteSize: UInt32(packetSize))
self.currentPacketDescs.append(desc)
self.currentBufferSize += packetSize
}
}
private func currentTimeSeconds() -> Double? {
guard let audioQueue, sampleRate > 0 else { return nil }
var timeStamp = AudioTimeStamp()
let status = AudioQueueGetCurrentTime(audioQueue, nil, &timeStamp, nil)
if status != noErr { return nil }
if timeStamp.mSampleTime.isNaN { return nil }
return timeStamp.mSampleTime / sampleRate
}
}
private func propertyListenerProc(
inClientData: UnsafeMutableRawPointer,
inAudioFileStream: AudioFileStreamID,
inPropertyID: AudioFileStreamPropertyID,
ioFlags: UnsafeMutablePointer<AudioFileStreamPropertyFlags>)
{
let playback = Unmanaged<Playback>.fromOpaque(inClientData).takeUnretainedValue()
if inPropertyID == kAudioFileStreamProperty_DataFormat {
var format = AudioStreamBasicDescription()
var size = UInt32(MemoryLayout<AudioStreamBasicDescription>.size)
let status = AudioFileStreamGetProperty(inAudioFileStream, inPropertyID, &size, &format)
if status == noErr {
playback.audioFormat = format
playback.setupQueueIfNeeded(format)
}
} else if inPropertyID == kAudioFileStreamProperty_PacketSizeUpperBound {
var maxPacketSize: UInt32 = 0
var size = UInt32(MemoryLayout<UInt32>.size)
let status = AudioFileStreamGetProperty(inAudioFileStream, inPropertyID, &size, &maxPacketSize)
if status == noErr {
playback.maxPacketSize = maxPacketSize
}
}
}
private func packetsProc(
inClientData: UnsafeMutableRawPointer,
inNumberBytes: UInt32,
inNumberPackets: UInt32,
inInputData: UnsafeRawPointer,
inPacketDescriptions: UnsafeMutablePointer<AudioStreamPacketDescription>?)
{
let playback = Unmanaged<Playback>.fromOpaque(inClientData).takeUnretainedValue()
playback.handlePackets(
numberBytes: inNumberBytes,
numberPackets: inNumberPackets,
inputData: inInputData,
packetDescriptions: inPacketDescriptions)
}
private func outputCallbackProc(
inUserData: UnsafeMutableRawPointer?,
inAQ: AudioQueueRef,
inBuffer: AudioQueueBufferRef)
{
guard let inUserData else { return }
let playback = Unmanaged<Playback>.fromOpaque(inUserData).takeUnretainedValue()
playback.bufferLock.lock()
playback.availableBuffers.append(inBuffer)
playback.bufferLock.unlock()
playback.bufferSemaphore.signal()
}
private func isRunningCallbackProc(
inUserData: UnsafeMutableRawPointer?,
inAQ: AudioQueueRef,
inID: AudioQueuePropertyID)
{
guard let inUserData else { return }
guard inID == kAudioQueueProperty_IsRunning else { return }
let playback = Unmanaged<Playback>.fromOpaque(inUserData).takeUnretainedValue()
var running: UInt32 = 0
var size = UInt32(MemoryLayout<UInt32>.size)
let status = AudioQueueGetProperty(inAQ, kAudioQueueProperty_IsRunning, &running, &size)
if status != noErr { return }
if running == 0, playback.inputFinished {
playback.finish(StreamingPlaybackResult(finished: true, interruptedAt: nil))
}
}

View File

@@ -1,4 +1,6 @@
public enum TalkTTSValidation: Sendable {
private static let v3StabilityValues: Set<Double> = [0.0, 0.5, 1.0]
public static func resolveSpeed(speed: Double?, rateWPM: Int?) -> Double? {
if let rateWPM, rateWPM > 0 {
let resolved = Double(rateWPM) / 175.0
@@ -18,10 +20,32 @@ public enum TalkTTSValidation: Sendable {
return value
}
public static func validatedStability(_ value: Double?, modelId: String?) -> Double? {
guard let value else { return nil }
let normalizedModel = (modelId ?? "").trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
if normalizedModel == "eleven_v3" {
return v3StabilityValues.contains(value) ? value : nil
}
return validatedUnit(value)
}
public static func validatedSeed(_ value: Int?) -> UInt32? {
guard let value else { return nil }
if value < 0 || value > 4294967295 { return nil }
return UInt32(value)
}
}
public static func validatedLatencyTier(_ value: Int?) -> Int? {
guard let value else { return nil }
if value < 0 || value > 4 { return nil }
return value
}
public static func pcmSampleRate(from outputFormat: String?) -> Double? {
let trimmed = (outputFormat ?? "").trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
guard trimmed.hasPrefix("pcm_") else { return nil }
let parts = trimmed.split(separator: "_", maxSplits: 1)
guard parts.count == 2, let rate = Double(parts[1]), rate > 0 else { return nil }
return rate
}
}

View File

@@ -16,9 +16,30 @@ final class TalkTTSValidationTests: XCTestCase {
XCTAssertNil(TalkTTSValidation.validatedUnit(1.01))
}
func testValidatedStability() {
XCTAssertEqual(TalkTTSValidation.validatedStability(0, modelId: "eleven_v3"), 0)
XCTAssertEqual(TalkTTSValidation.validatedStability(0.5, modelId: "eleven_v3"), 0.5)
XCTAssertEqual(TalkTTSValidation.validatedStability(1, modelId: "eleven_v3"), 1)
XCTAssertNil(TalkTTSValidation.validatedStability(0.7, modelId: "eleven_v3"))
XCTAssertEqual(TalkTTSValidation.validatedStability(0.7, modelId: "eleven_multilingual_v2"), 0.7)
}
func testValidatedSeedBounds() {
XCTAssertEqual(TalkTTSValidation.validatedSeed(0), 0)
XCTAssertEqual(TalkTTSValidation.validatedSeed(1234), 1234)
XCTAssertNil(TalkTTSValidation.validatedSeed(-1))
}
func testValidatedLatencyTier() {
XCTAssertEqual(TalkTTSValidation.validatedLatencyTier(0), 0)
XCTAssertEqual(TalkTTSValidation.validatedLatencyTier(4), 4)
XCTAssertNil(TalkTTSValidation.validatedLatencyTier(-1))
XCTAssertNil(TalkTTSValidation.validatedLatencyTier(5))
}
func testPcmSampleRateParse() {
XCTAssertEqual(TalkTTSValidation.pcmSampleRate(from: "pcm_44100"), 44100)
XCTAssertNil(TalkTTSValidation.pcmSampleRate(from: "mp3_44100_128"))
XCTAssertNil(TalkTTSValidation.pcmSampleRate(from: "pcm_bad"))
}
}

View File

@@ -10,7 +10,7 @@ Talk mode is a continuous voice conversation loop:
1) Listen for speech
2) Send transcript to the model (main session, chat.send)
3) Wait for the response
4) Speak it via ElevenLabs
4) Speak it via ElevenLabs (streaming playback)
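
Step 4 goes through the streaming client added in this commit; a condensed sketch of how the pieces connect on macOS/iOS (types and methods are from this commit, the surrounding values are illustrative):

// Sketch: stream ElevenLabs TTS and play it incrementally (macOS/iOS).
// voiceId/apiKey/text are placeholders; the types exist in this commit.
let client = ElevenLabsTTSClient(apiKey: apiKey)
let request = ElevenLabsTTSRequest(text: "Hello there", outputFormat: "pcm_44100")
let stream = client.streamSynthesize(voiceId: voiceId, request: request)
let result: StreamingPlaybackResult
if let sampleRate = TalkTTSValidation.pcmSampleRate(from: request.outputFormat) {
    // PCM path: raw Int16 frames are scheduled on AVAudioEngine as they arrive.
    result = await PCMStreamingAudioPlayer.shared.play(stream: stream, sampleRate: sampleRate)
} else {
    // MP3 path: AudioFileStream parses packets and feeds an AudioQueue.
    result = await StreamingAudioPlayer.shared.play(stream: stream)
}
// result.finished is false when playback was interrupted or failed;
// result.interruptedAt carries the playback position in seconds if known.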
## Behavior (macOS)
- **Always-on overlay** while Talk mode is enabled.
@@ -55,8 +55,10 @@ Supported keys:
Defaults:
- `interruptOnSpeech`: true
- `voiceId`: falls back to `ELEVENLABS_VOICE_ID` / `SAG_VOICE_ID`
- `voiceId`: falls back to `ELEVENLABS_VOICE_ID` / `SAG_VOICE_ID` (or the first ElevenLabs voice when an API key is available; see the sketch after this list)
- `modelId`: defaults to `eleven_v3` when unset
- `apiKey`: falls back to `ELEVENLABS_API_KEY` (or gateway shell profile if available)
- `outputFormat`: defaults to `pcm_44100` on macOS/iOS for faster streaming playback (Android stays on MP3)
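
The `voiceId` fallback is handled by a `resolveVoiceId` helper on each platform; a condensed sketch of the resolution order (simplified from the Swift implementation in this commit):

// Resolution order: explicit alias / raw id → cached fallback → first account voice.
func resolveVoiceId(preferred: String?, apiKey: String) async -> String? {
    if let preferred, let resolved = resolveVoiceAlias(preferred) {
        return resolved                               // configured alias or literal voice id
    }
    if let fallbackVoiceId { return fallbackVoiceId } // remembered from an earlier lookup
    // Auto-select: ask the API for the account's voices and take the first one.
    let voices = (try? await ElevenLabsTTSClient(apiKey: apiKey).listVoices()) ?? []
    fallbackVoiceId = voices.first?.voiceId
    return fallbackVoiceId
}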
## macOS UI
- Menu bar toggle: **Talk**
@@ -71,4 +73,6 @@ Defaults:
## Notes
- Requires Speech + Microphone permissions.
- Uses `chat.send` against session key `main`.
- TTS uses ElevenLabs API with `ELEVENLABS_API_KEY`.
- TTS uses the ElevenLabs streaming API with `ELEVENLABS_API_KEY`; playback is incremental on macOS/iOS/Android for lower latency.
- `stability` for `eleven_v3` is validated to `0.0`, `0.5`, or `1.0`; other models accept `0..1`.
- `latency_tier` is validated to `0..4` when set.
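
Concretely, the two validation rules above behave like this (mirroring the `TalkTTSValidation` tests in this commit):

// v3 stability accepts only the discrete values 0.0 / 0.5 / 1.0.
assert(TalkTTSValidation.validatedStability(0.5, modelId: "eleven_v3") == 0.5)
assert(TalkTTSValidation.validatedStability(0.7, modelId: "eleven_v3") == nil)
// Other models accept any value in 0...1.
assert(TalkTTSValidation.validatedStability(0.7, modelId: "eleven_multilingual_v2") == 0.7)
// latency_tier must be an integer in 0...4; anything else is dropped.
assert(TalkTTSValidation.validatedLatencyTier(4) == 4)
assert(TalkTTSValidation.validatedLatencyTier(5) == nil)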