fix: default android talk pcm_24000

This commit is contained in:
Peter Steinberger
2025-12-30 12:52:56 +01:00
parent 83262a67b1
commit 3bf8b9ccf4
2 changed files with 200 additions and 35 deletions

View File

@@ -5,6 +5,9 @@ import android.content.Context
import android.content.Intent import android.content.Intent
import android.content.pm.PackageManager import android.content.pm.PackageManager
import android.media.AudioAttributes import android.media.AudioAttributes
import android.media.AudioFormat
import android.media.AudioManager
import android.media.AudioTrack
import android.media.MediaPlayer import android.media.MediaPlayer
import android.os.Bundle import android.os.Bundle
import android.os.Handler import android.os.Handler
@@ -36,6 +39,7 @@ import kotlinx.serialization.json.JsonElement
import kotlinx.serialization.json.JsonObject import kotlinx.serialization.json.JsonObject
import kotlinx.serialization.json.JsonPrimitive import kotlinx.serialization.json.JsonPrimitive
import kotlinx.serialization.json.buildJsonObject import kotlinx.serialization.json.buildJsonObject
import kotlin.math.max
class TalkModeManager( class TalkModeManager(
private val context: Context, private val context: Context,
@@ -44,6 +48,7 @@ class TalkModeManager(
companion object { companion object {
private const val tag = "TalkMode" private const val tag = "TalkMode"
private const val defaultModelIdFallback = "eleven_v3" private const val defaultModelIdFallback = "eleven_v3"
private const val defaultOutputFormatFallback = "pcm_24000"
} }
private val mainHandler = Handler(Looper.getMainLooper()) private val mainHandler = Handler(Looper.getMainLooper())
@@ -99,6 +104,8 @@ class TalkModeManager(
private var player: MediaPlayer? = null private var player: MediaPlayer? = null
private var streamingSource: StreamingMediaDataSource? = null private var streamingSource: StreamingMediaDataSource? = null
private var pcmTrack: AudioTrack? = null
@Volatile private var pcmStopRequested = false
private var systemTts: TextToSpeech? = null private var systemTts: TextToSpeech? = null
private var systemTtsPending: CompletableDeferred<Unit>? = null private var systemTtsPending: CompletableDeferred<Unit>? = null
private var systemTtsPendingId: String? = null private var systemTtsPendingId: String? = null
@@ -531,6 +538,22 @@ class TalkModeManager(
/**
 * Streams ElevenLabs TTS audio for [request] and plays it.
 *
 * Any playback already in progress is stopped first. When `request.outputFormat` parses as a
 * supported `pcm_*` format, audio is played through [streamAndPlayPcm]. Otherwise, or when the
 * PCM path fails for a reason other than a stop request or coroutine cancellation, playback goes
 * through [streamAndPlayMp3].
 */
private suspend fun streamAndPlay(voiceId: String, apiKey: String, request: ElevenLabsRequest) {
  stopSpeaking(resetInterrupt = false)

  // stopSpeaking() raises the stop flag; clear it so the stream started below is allowed to run.
  pcmStopRequested = false
  val pcmSampleRate = TalkModeRuntime.parsePcmSampleRate(request.outputFormat)
  if (pcmSampleRate != null) {
    try {
      streamAndPlayPcm(voiceId = voiceId, apiKey = apiKey, request = request, sampleRate = pcmSampleRate)
      return
    } catch (err: kotlinx.coroutines.CancellationException) {
      // Cancellation is not a playback failure: propagate it instead of starting the MP3 fallback.
      throw err
    } catch (err: Throwable) {
      // A failure caused by an explicit stop (e.g. the track was released mid-write) is expected.
      if (pcmStopRequested) return
      Log.w(tag, "pcm playback failed; falling back to mp3: ${err.message ?: err::class.simpleName}")
    }
  }

  // NOTE(review): after a PCM failure this still passes the original request, whose outputFormat
  // is pcm_*; confirm the MP3 path is expected to cope with that.
  streamAndPlayMp3(voiceId = voiceId, apiKey = apiKey, request = request)
}
private suspend fun streamAndPlayMp3(voiceId: String, apiKey: String, request: ElevenLabsRequest) {
val dataSource = StreamingMediaDataSource() val dataSource = StreamingMediaDataSource()
streamingSource = dataSource streamingSource = dataSource
@@ -587,6 +610,54 @@ class TalkModeManager(
Log.d(tag, "play done") Log.d(tag, "play done")
} }
/**
 * Plays a raw 16-bit mono PCM ElevenLabs stream through a streaming [AudioTrack].
 *
 * The track is stored in `pcmTrack` while it is active and is always torn down through
 * [cleanupPcmTrack] before this function returns or throws.
 *
 * @param sampleRate sample rate in Hz used to configure the track.
 * @throws IllegalStateException if the minimum buffer size is invalid or the track fails to
 *   initialise; failures from [streamPcm] propagate unchanged.
 */
private suspend fun streamAndPlayPcm(
  voiceId: String,
  apiKey: String,
  request: ElevenLabsRequest,
  sampleRate: Int,
) {
  val minBuffer =
    AudioTrack.getMinBufferSize(
      sampleRate,
      AudioFormat.CHANNEL_OUT_MONO,
      AudioFormat.ENCODING_PCM_16BIT,
    )
  if (minBuffer <= 0) {
    throw IllegalStateException("AudioTrack buffer size invalid: $minBuffer")
  }

  // At least twice the platform minimum, and never below 8 KiB.
  val bufferSize = max(minBuffer * 2, 8 * 1024)
  val track =
    AudioTrack(
      AudioAttributes.Builder()
        .setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
        .setUsage(AudioAttributes.USAGE_ASSISTANT)
        .build(),
      AudioFormat.Builder()
        .setSampleRate(sampleRate)
        .setChannelMask(AudioFormat.CHANNEL_OUT_MONO)
        .setEncoding(AudioFormat.ENCODING_PCM_16BIT)
        .build(),
      bufferSize,
      AudioTrack.MODE_STREAM,
      AudioManager.AUDIO_SESSION_ID_GENERATE,
    )
  if (track.state != AudioTrack.STATE_INITIALIZED) {
    track.release()
    throw IllegalStateException("AudioTrack init failed")
  }
  pcmTrack = track

  try {
    // play() is inside the try so that a failure here still releases the track.
    track.play()
    Log.d(tag, "pcm play start sampleRate=$sampleRate bufferSize=$bufferSize")
    streamPcm(voiceId = voiceId, apiKey = apiKey, request = request, track = track)
    // All bytes have been handed to the track, but some are still queued for playback.
    // cleanupPcmTrack() pauses and flushes, which would drop them, so let them play out first.
    drainPcmTrack(track = track, sampleRate = sampleRate, bufferSize = bufferSize)
  } finally {
    cleanupPcmTrack()
  }
  Log.d(tag, "pcm play done")
}

/**
 * Lets audio already written to [track] finish playing, unless a stop has been requested.
 *
 * Calls `stop()` without a preceding flush and then waits for roughly the time one buffer of
 * 16-bit mono audio takes to play. The wait is sliced so a stop request ends it early.
 */
private suspend fun drainPcmTrack(track: AudioTrack, sampleRate: Int, bufferSize: Int) {
  if (pcmStopRequested) return
  try {
    track.stop()
  } catch (_: IllegalStateException) {
    // The track is no longer usable (e.g. released by a concurrent stop); nothing left to drain.
    return
  }
  // Upper bound on queued audio: bufferSize bytes at 2 bytes per frame, plus slack because the
  // platform may allocate a somewhat larger buffer than requested.
  val maxDrainMs = bufferSize.toLong() * 1000L / (sampleRate.toLong() * 2L) + 100L
  val sliceMs = 20L
  var waitedMs = 0L
  while (waitedMs < maxDrainMs && !pcmStopRequested) {
    kotlinx.coroutines.delay(sliceMs)
    waitedMs += sliceMs
  }
}
private suspend fun speakWithSystemTts(text: String) { private suspend fun speakWithSystemTts(text: String) {
val trimmed = text.trim() val trimmed = text.trim()
if (trimmed.isEmpty()) return if (trimmed.isEmpty()) return
@@ -678,8 +749,10 @@ class TalkModeManager(
} }
private fun stopSpeaking(resetInterrupt: Boolean = true) { private fun stopSpeaking(resetInterrupt: Boolean = true) {
pcmStopRequested = true
if (!_isSpeaking.value) { if (!_isSpeaking.value) {
cleanupPlayer() cleanupPlayer()
cleanupPcmTrack()
systemTts?.stop() systemTts?.stop()
systemTtsPending?.cancel() systemTtsPending?.cancel()
systemTtsPending = null systemTtsPending = null
@@ -691,6 +764,7 @@ class TalkModeManager(
lastInterruptedAtSeconds = currentMs / 1000.0 lastInterruptedAtSeconds = currentMs / 1000.0
} }
cleanupPlayer() cleanupPlayer()
cleanupPcmTrack()
systemTts?.stop() systemTts?.stop()
systemTtsPending?.cancel() systemTtsPending?.cancel()
systemTtsPending = null systemTtsPending = null
@@ -706,6 +780,20 @@ class TalkModeManager(
streamingSource = null streamingSource = null
} }
/**
 * Tears down the active PCM track, if there is one.
 *
 * Pause, flush and stop are attempted in that order and any error from them is ignored; the
 * track is released regardless, and `pcmTrack` is cleared afterwards.
 */
private fun cleanupPcmTrack() {
  pcmTrack?.let { active ->
    try {
      active.pause()
      active.flush()
      active.stop()
    } catch (_: Throwable) {
      // best effort: teardown errors are deliberately ignored
    } finally {
      active.release()
    }
    pcmTrack = null
  }
}
private fun shouldInterrupt(transcript: String): Boolean { private fun shouldInterrupt(transcript: String): Boolean {
val trimmed = transcript.trim() val trimmed = transcript.trim()
if (trimmed.length < 3) return false if (trimmed.length < 3) return false
@@ -743,7 +831,7 @@ class TalkModeManager(
if (!voiceOverrideActive) currentVoiceId = defaultVoiceId if (!voiceOverrideActive) currentVoiceId = defaultVoiceId
defaultModelId = model ?: defaultModelIdFallback defaultModelId = model ?: defaultModelIdFallback
if (!modelOverrideActive) currentModelId = defaultModelId if (!modelOverrideActive) currentModelId = defaultModelId
defaultOutputFormat = outputFormat defaultOutputFormat = outputFormat ?: defaultOutputFormatFallback
apiKey = key ?: envKey?.takeIf { it.isNotEmpty() } apiKey = key ?: envKey?.takeIf { it.isNotEmpty() }
if (interrupt != null) interruptOnSpeech = interrupt if (interrupt != null) interruptOnSpeech = interrupt
} catch (_: Throwable) { } catch (_: Throwable) {
@@ -752,6 +840,7 @@ class TalkModeManager(
if (!modelOverrideActive) currentModelId = defaultModelId if (!modelOverrideActive) currentModelId = defaultModelId
apiKey = envKey?.takeIf { it.isNotEmpty() } apiKey = envKey?.takeIf { it.isNotEmpty() }
voiceAliases = emptyMap() voiceAliases = emptyMap()
defaultOutputFormat = defaultOutputFormatFallback
} }
} }
@@ -767,45 +856,110 @@ class TalkModeManager(
sink: StreamingMediaDataSource, sink: StreamingMediaDataSource,
) { ) {
withContext(Dispatchers.IO) { withContext(Dispatchers.IO) {
val baseUrl = "https://api.elevenlabs.io/v1/text-to-speech/$voiceId/stream" val conn = openTtsConnection(voiceId = voiceId, apiKey = apiKey, request = request)
val latencyTier = request.latencyTier try {
val url = val payload = buildRequestPayload(request)
if (latencyTier != null) { conn.outputStream.use { it.write(payload.toByteArray()) }
URL("$baseUrl?optimize_streaming_latency=$latencyTier")
} else { val code = conn.responseCode
URL(baseUrl) if (code >= 400) {
val message = conn.errorStream?.readBytes()?.toString(Charsets.UTF_8) ?: ""
sink.fail()
throw IllegalStateException("ElevenLabs failed: $code $message")
} }
val conn = url.openConnection() as HttpURLConnection
conn.requestMethod = "POST"
conn.connectTimeout = 30_000
conn.readTimeout = 30_000
conn.setRequestProperty("Content-Type", "application/json")
conn.setRequestProperty("Accept", "audio/mpeg")
conn.setRequestProperty("xi-api-key", apiKey)
conn.doOutput = true
val payload = buildRequestPayload(request) val buffer = ByteArray(8 * 1024)
conn.outputStream.use { it.write(payload.toByteArray()) } conn.inputStream.use { input ->
while (true) {
val code = conn.responseCode val read = input.read(buffer)
if (code >= 400) { if (read <= 0) break
val message = conn.errorStream?.readBytes()?.toString(Charsets.UTF_8) ?: "" sink.append(buffer.copyOf(read))
sink.fail() }
throw IllegalStateException("ElevenLabs failed: $code $message")
}
val buffer = ByteArray(8 * 1024)
conn.inputStream.use { input ->
while (true) {
val read = input.read(buffer)
if (read <= 0) break
sink.append(buffer.copyOf(read))
} }
sink.finish()
} finally {
conn.disconnect()
} }
sink.finish()
} }
} }
/**
 * Requests TTS audio from ElevenLabs and writes the response body into [track] as it arrives.
 *
 * Runs on [Dispatchers.IO]. Returns early, without error, as soon as `pcmStopRequested` is
 * observed. The connection is always disconnected on exit.
 *
 * Only whole audio frames are passed to [AudioTrack.write]. A network read can end in the middle
 * of a frame; those trailing bytes are kept at the front of the buffer and completed by the next
 * read, so a sub-frame remainder is never handed to the track on its own.
 *
 * @throws IllegalStateException when the HTTP status is 400 or above, or when the track reports
 *   no progress on a write and no stop was requested.
 */
private suspend fun streamPcm(
  voiceId: String,
  apiKey: String,
  request: ElevenLabsRequest,
  track: AudioTrack,
) {
  withContext(Dispatchers.IO) {
    val conn = openTtsConnection(voiceId = voiceId, apiKey = apiKey, request = request)
    try {
      val payload = buildRequestPayload(request)
      conn.outputStream.use { it.write(payload.toByteArray()) }

      val code = conn.responseCode
      if (code >= 400) {
        val message = conn.errorStream?.readBytes()?.toString(Charsets.UTF_8) ?: ""
        throw IllegalStateException("ElevenLabs failed: $code $message")
      }

      // Assumes 16-bit PCM (two bytes per sample per channel), which is how streamAndPlayPcm
      // configures the track.
      val frameBytes = maxOf(1, track.channelCount) * 2
      val buffer = ByteArray(8 * 1024)
      // Number of bytes of an incomplete frame carried over at the start of `buffer`.
      var carried = 0
      conn.inputStream.use { input ->
        while (true) {
          if (pcmStopRequested) return@withContext
          val read = input.read(buffer, carried, buffer.size - carried)
          if (read <= 0) break
          val available = carried + read
          val writable = available - available % frameBytes
          var offset = 0
          while (offset < writable) {
            if (pcmStopRequested) return@withContext
            val wrote =
              try {
                track.write(buffer, offset, writable - offset)
              } catch (err: Throwable) {
                // A write that fails because playback was stopped is not an error.
                if (pcmStopRequested) return@withContext
                throw err
              }
            if (wrote <= 0) {
              if (pcmStopRequested) return@withContext
              throw IllegalStateException("AudioTrack write failed: $wrote")
            }
            offset += wrote
          }
          // Keep the incomplete trailing frame for the next read.
          carried = available - writable
          if (carried > 0) {
            buffer.copyInto(buffer, destinationOffset = 0, startIndex = writable, endIndex = available)
          }
        }
      }
    } finally {
      conn.disconnect()
    }
  }
}
/**
 * Builds a POST connection to the ElevenLabs streaming text-to-speech endpoint for [voiceId].
 *
 * `optimize_streaming_latency` is added to the URL only when the request has a latency tier.
 * The `Accept` header is derived from the request's output format. Connect and read timeouts
 * are both 30 seconds. The returned connection has output enabled; nothing has been written to
 * it yet.
 */
private fun openTtsConnection(
  voiceId: String,
  apiKey: String,
  request: ElevenLabsRequest,
): HttpURLConnection {
  val endpoint = "https://api.elevenlabs.io/v1/text-to-speech/$voiceId/stream"
  val query = request.latencyTier?.let { tier -> "?optimize_streaming_latency=$tier" }.orEmpty()
  val acceptHeader = resolveAcceptHeader(request.outputFormat)

  return (URL(endpoint + query).openConnection() as HttpURLConnection).apply {
    requestMethod = "POST"
    connectTimeout = 30_000
    readTimeout = 30_000
    setRequestProperty("Content-Type", "application/json")
    setRequestProperty("Accept", acceptHeader)
    setRequestProperty("xi-api-key", apiKey)
    doOutput = true
  }
}
/**
 * Picks the HTTP `Accept` value for an output format.
 *
 * Formats starting with `pcm_` (ignoring case and surrounding whitespace) map to `audio/pcm`;
 * everything else, including `null`, maps to `audio/mpeg`.
 */
private fun resolveAcceptHeader(outputFormat: String?): String {
  val format = (outputFormat ?: "").trim().lowercase()
  return when {
    format.startsWith("pcm_") -> "audio/pcm"
    else -> "audio/mpeg"
  }
}
private fun buildRequestPayload(request: ElevenLabsRequest): String { private fun buildRequestPayload(request: ElevenLabsRequest): String {
val voiceSettingsEntries = val voiceSettingsEntries =
buildJsonObject { buildJsonObject {
@@ -897,7 +1051,8 @@ class TalkModeManager(
/**
 * Normalises an output format and returns it only when it is accepted.
 *
 * The value is trimmed and lower-cased. It is accepted when it starts with `mp3_` or when
 * [parsePcmSampleRate] recognises it; `null`, blank and all other values yield `null`.
 */
fun validatedOutputFormat(value: String?): String? {
  val normalized = value?.trim()?.lowercase() ?: return null
  return when {
    normalized.isEmpty() -> null
    normalized.startsWith("mp3_") -> normalized
    parsePcmSampleRate(normalized) != null -> normalized
    else -> null
  }
}
fun validatedLatencyTier(value: Int?): Int? { fun validatedLatencyTier(value: Int?): Int? {
@@ -906,6 +1061,15 @@ class TalkModeManager(
return value return value
} }
/**
 * Extracts the sample rate from a `pcm_<rate>` output format.
 *
 * Matching ignores case and surrounding whitespace. The part after `pcm_` must consist solely of
 * ASCII digits, so values with trailing characters such as `pcm_24000abc` are rejected rather
 * than being read as 24000.
 *
 * @return the rate in Hz when it is one of 16000, 22050, 24000 or 44100; otherwise `null`.
 */
fun parsePcmSampleRate(value: String?): Int? {
  val normalized = value?.trim()?.lowercase() ?: return null
  if (!normalized.startsWith("pcm_")) return null
  val suffix = normalized.removePrefix("pcm_")
  if (suffix.isEmpty() || !suffix.all { it in '0'..'9' }) return null
  return when (val rate = suffix.toIntOrNull()) {
    16000, 22050, 24000, 44100 -> rate
    else -> null
  }
}
fun isMessageTimestampAfter(timestamp: Double, sinceSeconds: Double): Boolean { fun isMessageTimestampAfter(timestamp: Double, sinceSeconds: Double): Boolean {
val sinceMs = sinceSeconds * 1000 val sinceMs = sinceSeconds * 1000
return if (timestamp > 10_000_000_000) { return if (timestamp > 10_000_000_000) {

View File

@@ -58,7 +58,7 @@ Defaults:
- `voiceId`: falls back to `ELEVENLABS_VOICE_ID` / `SAG_VOICE_ID` (or first ElevenLabs voice when API key is available) - `voiceId`: falls back to `ELEVENLABS_VOICE_ID` / `SAG_VOICE_ID` (or first ElevenLabs voice when API key is available)
- `modelId`: defaults to `eleven_v3` when unset - `modelId`: defaults to `eleven_v3` when unset
- `apiKey`: falls back to `ELEVENLABS_API_KEY` (or gateway shell profile if available) - `apiKey`: falls back to `ELEVENLABS_API_KEY` (or gateway shell profile if available)
- `outputFormat`: defaults to `pcm_44100` on macOS/iOS for faster streaming playback (Android stays on MP3) - `outputFormat`: defaults to `pcm_44100` on macOS/iOS and `pcm_24000` on Android (set `mp3_*` to force MP3 streaming)
## macOS UI ## macOS UI
- Menu bar toggle: **Talk** - Menu bar toggle: **Talk**
@@ -76,3 +76,4 @@ Defaults:
- TTS uses ElevenLabs streaming API with `ELEVENLABS_API_KEY` and incremental playback on macOS/iOS/Android for lower latency. - TTS uses ElevenLabs streaming API with `ELEVENLABS_API_KEY` and incremental playback on macOS/iOS/Android for lower latency.
- `stability` for `eleven_v3` is validated to `0.0`, `0.5`, or `1.0`; other models accept `0..1`. - `stability` for `eleven_v3` is validated to `0.0`, `0.5`, or `1.0`; other models accept `0..1`.
- `latency_tier` is validated to `0..4` when set. - `latency_tier` is validated to `0..4` when set.
- Android supports `pcm_16000`, `pcm_22050`, `pcm_24000`, and `pcm_44100` output formats for low-latency AudioTrack streaming.