fix: default android talk pcm_24000
This commit is contained in:
@@ -5,6 +5,9 @@ import android.content.Context
|
|||||||
import android.content.Intent
|
import android.content.Intent
|
||||||
import android.content.pm.PackageManager
|
import android.content.pm.PackageManager
|
||||||
import android.media.AudioAttributes
|
import android.media.AudioAttributes
|
||||||
|
import android.media.AudioFormat
|
||||||
|
import android.media.AudioManager
|
||||||
|
import android.media.AudioTrack
|
||||||
import android.media.MediaPlayer
|
import android.media.MediaPlayer
|
||||||
import android.os.Bundle
|
import android.os.Bundle
|
||||||
import android.os.Handler
|
import android.os.Handler
|
||||||
@@ -36,6 +39,7 @@ import kotlinx.serialization.json.JsonElement
|
|||||||
import kotlinx.serialization.json.JsonObject
|
import kotlinx.serialization.json.JsonObject
|
||||||
import kotlinx.serialization.json.JsonPrimitive
|
import kotlinx.serialization.json.JsonPrimitive
|
||||||
import kotlinx.serialization.json.buildJsonObject
|
import kotlinx.serialization.json.buildJsonObject
|
||||||
|
import kotlin.math.max
|
||||||
|
|
||||||
class TalkModeManager(
|
class TalkModeManager(
|
||||||
private val context: Context,
|
private val context: Context,
|
||||||
@@ -44,6 +48,7 @@ class TalkModeManager(
|
|||||||
companion object {
|
companion object {
|
||||||
private const val tag = "TalkMode"
|
private const val tag = "TalkMode"
|
||||||
private const val defaultModelIdFallback = "eleven_v3"
|
private const val defaultModelIdFallback = "eleven_v3"
|
||||||
|
private const val defaultOutputFormatFallback = "pcm_24000"
|
||||||
}
|
}
|
||||||
|
|
||||||
private val mainHandler = Handler(Looper.getMainLooper())
|
private val mainHandler = Handler(Looper.getMainLooper())
|
||||||
@@ -99,6 +104,8 @@ class TalkModeManager(
|
|||||||
|
|
||||||
private var player: MediaPlayer? = null
|
private var player: MediaPlayer? = null
|
||||||
private var streamingSource: StreamingMediaDataSource? = null
|
private var streamingSource: StreamingMediaDataSource? = null
|
||||||
|
private var pcmTrack: AudioTrack? = null
|
||||||
|
@Volatile private var pcmStopRequested = false
|
||||||
private var systemTts: TextToSpeech? = null
|
private var systemTts: TextToSpeech? = null
|
||||||
private var systemTtsPending: CompletableDeferred<Unit>? = null
|
private var systemTtsPending: CompletableDeferred<Unit>? = null
|
||||||
private var systemTtsPendingId: String? = null
|
private var systemTtsPendingId: String? = null
|
||||||
@@ -531,6 +538,22 @@ class TalkModeManager(
|
|||||||
/**
 * Speaks [request] through ElevenLabs, preferring raw PCM playback and falling back to MP3.
 *
 * Any playback already in progress is stopped first. When the request's output format parses as
 * a supported PCM sample rate, audio is streamed via [streamAndPlayPcm]. If that attempt fails
 * for a reason other than an explicit stop or coroutine cancellation, the failure is logged and
 * playback is retried via [streamAndPlayMp3].
 *
 * NOTE(review): the MP3 fallback reuses the same [request], whose output format is still
 * `pcm_*` at that point. Confirm the MP3 path copes with that; it cannot be verified from this
 * function alone.
 */
private suspend fun streamAndPlay(voiceId: String, apiKey: String, request: ElevenLabsRequest) {
  stopSpeaking(resetInterrupt = false)

  // stopSpeaking() raises the stop flag; clear it so the playback started here is not aborted.
  pcmStopRequested = false
  val pcmSampleRate = TalkModeRuntime.parsePcmSampleRate(request.outputFormat)
  if (pcmSampleRate != null) {
    try {
      streamAndPlayPcm(voiceId = voiceId, apiKey = apiKey, request = request, sampleRate = pcmSampleRate)
      return
    } catch (err: kotlin.coroutines.cancellation.CancellationException) {
      // Cancellation must propagate. Treating it as a playback error would either swallow it
      // or start the MP3 fallback inside a coroutine that has already been cancelled.
      throw err
    } catch (err: Throwable) {
      // A failure caused by a deliberate stop is not an error worth a fallback.
      if (pcmStopRequested) return
      Log.w(tag, "pcm playback failed; falling back to mp3: ${err.message ?: err::class.simpleName}")
    }
  }

  streamAndPlayMp3(voiceId = voiceId, apiKey = apiKey, request = request)
}
|
||||||
|
|
||||||
|
private suspend fun streamAndPlayMp3(voiceId: String, apiKey: String, request: ElevenLabsRequest) {
|
||||||
val dataSource = StreamingMediaDataSource()
|
val dataSource = StreamingMediaDataSource()
|
||||||
streamingSource = dataSource
|
streamingSource = dataSource
|
||||||
|
|
||||||
@@ -587,6 +610,54 @@ class TalkModeManager(
|
|||||||
Log.d(tag, "play done")
|
Log.d(tag, "play done")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Streams ElevenLabs PCM audio for [request] into a newly created [AudioTrack] and plays it.
 *
 * The track is mono, 16-bit PCM at [sampleRate], in streaming mode, with a buffer that is the
 * larger of twice the platform minimum and 8 KiB. It is stored in [pcmTrack] for the duration of
 * playback and is always torn down through [cleanupPcmTrack] before this function returns,
 * whether streaming succeeds or throws.
 *
 * @throws IllegalStateException if the platform reports a non-positive minimum buffer size or
 *   the track does not reach the initialized state.
 */
private suspend fun streamAndPlayPcm(
  voiceId: String,
  apiKey: String,
  request: ElevenLabsRequest,
  sampleRate: Int,
) {
  val minBuffer =
    AudioTrack.getMinBufferSize(
      sampleRate,
      AudioFormat.CHANNEL_OUT_MONO,
      AudioFormat.ENCODING_PCM_16BIT,
    )
  check(minBuffer > 0) { "AudioTrack buffer size invalid: $minBuffer" }

  val bufferSize = max(minBuffer * 2, 8 * 1024)
  val track =
    AudioTrack(
      AudioAttributes.Builder()
        .setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
        .setUsage(AudioAttributes.USAGE_ASSISTANT)
        .build(),
      AudioFormat.Builder()
        .setSampleRate(sampleRate)
        .setChannelMask(AudioFormat.CHANNEL_OUT_MONO)
        .setEncoding(AudioFormat.ENCODING_PCM_16BIT)
        .build(),
      bufferSize,
      AudioTrack.MODE_STREAM,
      AudioManager.AUDIO_SESSION_ID_GENERATE,
    )
  if (track.state != AudioTrack.STATE_INITIALIZED) {
    // Not yet published in pcmTrack, so release it here directly.
    track.release()
    throw IllegalStateException("AudioTrack init failed")
  }
  pcmTrack = track

  try {
    // play() sits inside the try so that a failure to start still releases the track below.
    track.play()
    Log.d(tag, "pcm play start sampleRate=$sampleRate bufferSize=$bufferSize")
    streamPcm(voiceId = voiceId, apiKey = apiKey, request = request, track = track)
  } finally {
    cleanupPcmTrack()
  }
  Log.d(tag, "pcm play done")
}
|
||||||
|
|
||||||
private suspend fun speakWithSystemTts(text: String) {
|
private suspend fun speakWithSystemTts(text: String) {
|
||||||
val trimmed = text.trim()
|
val trimmed = text.trim()
|
||||||
if (trimmed.isEmpty()) return
|
if (trimmed.isEmpty()) return
|
||||||
@@ -678,8 +749,10 @@ class TalkModeManager(
|
|||||||
}
|
}
|
||||||
|
|
||||||
private fun stopSpeaking(resetInterrupt: Boolean = true) {
|
private fun stopSpeaking(resetInterrupt: Boolean = true) {
|
||||||
|
pcmStopRequested = true
|
||||||
if (!_isSpeaking.value) {
|
if (!_isSpeaking.value) {
|
||||||
cleanupPlayer()
|
cleanupPlayer()
|
||||||
|
cleanupPcmTrack()
|
||||||
systemTts?.stop()
|
systemTts?.stop()
|
||||||
systemTtsPending?.cancel()
|
systemTtsPending?.cancel()
|
||||||
systemTtsPending = null
|
systemTtsPending = null
|
||||||
@@ -691,6 +764,7 @@ class TalkModeManager(
|
|||||||
lastInterruptedAtSeconds = currentMs / 1000.0
|
lastInterruptedAtSeconds = currentMs / 1000.0
|
||||||
}
|
}
|
||||||
cleanupPlayer()
|
cleanupPlayer()
|
||||||
|
cleanupPcmTrack()
|
||||||
systemTts?.stop()
|
systemTts?.stop()
|
||||||
systemTtsPending?.cancel()
|
systemTtsPending?.cancel()
|
||||||
systemTtsPending = null
|
systemTtsPending = null
|
||||||
@@ -706,6 +780,20 @@ class TalkModeManager(
|
|||||||
streamingSource = null
|
streamingSource = null
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Halts and releases the active PCM [AudioTrack], if there is one.
 *
 * The track is paused, flushed and stopped, then released. Errors from the first three calls are
 * ignored because this is best-effort teardown; release is attempted regardless.
 */
private fun cleanupPcmTrack() {
  val track = pcmTrack ?: return
  // Detach the field before touching the track: a repeated or overlapping call then sees null
  // instead of tearing down the same instance again, and the reference is cleared even if
  // release() throws.
  pcmTrack = null
  try {
    track.pause()
    track.flush()
    track.stop()
  } catch (_: Throwable) {
    // ignore cleanup errors
  } finally {
    track.release()
  }
}
|
||||||
|
|
||||||
private fun shouldInterrupt(transcript: String): Boolean {
|
private fun shouldInterrupt(transcript: String): Boolean {
|
||||||
val trimmed = transcript.trim()
|
val trimmed = transcript.trim()
|
||||||
if (trimmed.length < 3) return false
|
if (trimmed.length < 3) return false
|
||||||
@@ -743,7 +831,7 @@ class TalkModeManager(
|
|||||||
if (!voiceOverrideActive) currentVoiceId = defaultVoiceId
|
if (!voiceOverrideActive) currentVoiceId = defaultVoiceId
|
||||||
defaultModelId = model ?: defaultModelIdFallback
|
defaultModelId = model ?: defaultModelIdFallback
|
||||||
if (!modelOverrideActive) currentModelId = defaultModelId
|
if (!modelOverrideActive) currentModelId = defaultModelId
|
||||||
defaultOutputFormat = outputFormat
|
defaultOutputFormat = outputFormat ?: defaultOutputFormatFallback
|
||||||
apiKey = key ?: envKey?.takeIf { it.isNotEmpty() }
|
apiKey = key ?: envKey?.takeIf { it.isNotEmpty() }
|
||||||
if (interrupt != null) interruptOnSpeech = interrupt
|
if (interrupt != null) interruptOnSpeech = interrupt
|
||||||
} catch (_: Throwable) {
|
} catch (_: Throwable) {
|
||||||
@@ -752,6 +840,7 @@ class TalkModeManager(
|
|||||||
if (!modelOverrideActive) currentModelId = defaultModelId
|
if (!modelOverrideActive) currentModelId = defaultModelId
|
||||||
apiKey = envKey?.takeIf { it.isNotEmpty() }
|
apiKey = envKey?.takeIf { it.isNotEmpty() }
|
||||||
voiceAliases = emptyMap()
|
voiceAliases = emptyMap()
|
||||||
|
defaultOutputFormat = defaultOutputFormatFallback
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -767,45 +856,110 @@ class TalkModeManager(
|
|||||||
sink: StreamingMediaDataSource,
|
sink: StreamingMediaDataSource,
|
||||||
) {
|
) {
|
||||||
withContext(Dispatchers.IO) {
|
withContext(Dispatchers.IO) {
|
||||||
val baseUrl = "https://api.elevenlabs.io/v1/text-to-speech/$voiceId/stream"
|
val conn = openTtsConnection(voiceId = voiceId, apiKey = apiKey, request = request)
|
||||||
val latencyTier = request.latencyTier
|
try {
|
||||||
val url =
|
val payload = buildRequestPayload(request)
|
||||||
if (latencyTier != null) {
|
conn.outputStream.use { it.write(payload.toByteArray()) }
|
||||||
URL("$baseUrl?optimize_streaming_latency=$latencyTier")
|
|
||||||
} else {
|
val code = conn.responseCode
|
||||||
URL(baseUrl)
|
if (code >= 400) {
|
||||||
|
val message = conn.errorStream?.readBytes()?.toString(Charsets.UTF_8) ?: ""
|
||||||
|
sink.fail()
|
||||||
|
throw IllegalStateException("ElevenLabs failed: $code $message")
|
||||||
}
|
}
|
||||||
val conn = url.openConnection() as HttpURLConnection
|
|
||||||
conn.requestMethod = "POST"
|
|
||||||
conn.connectTimeout = 30_000
|
|
||||||
conn.readTimeout = 30_000
|
|
||||||
conn.setRequestProperty("Content-Type", "application/json")
|
|
||||||
conn.setRequestProperty("Accept", "audio/mpeg")
|
|
||||||
conn.setRequestProperty("xi-api-key", apiKey)
|
|
||||||
conn.doOutput = true
|
|
||||||
|
|
||||||
val payload = buildRequestPayload(request)
|
val buffer = ByteArray(8 * 1024)
|
||||||
conn.outputStream.use { it.write(payload.toByteArray()) }
|
conn.inputStream.use { input ->
|
||||||
|
while (true) {
|
||||||
val code = conn.responseCode
|
val read = input.read(buffer)
|
||||||
if (code >= 400) {
|
if (read <= 0) break
|
||||||
val message = conn.errorStream?.readBytes()?.toString(Charsets.UTF_8) ?: ""
|
sink.append(buffer.copyOf(read))
|
||||||
sink.fail()
|
}
|
||||||
throw IllegalStateException("ElevenLabs failed: $code $message")
|
|
||||||
}
|
|
||||||
|
|
||||||
val buffer = ByteArray(8 * 1024)
|
|
||||||
conn.inputStream.use { input ->
|
|
||||||
while (true) {
|
|
||||||
val read = input.read(buffer)
|
|
||||||
if (read <= 0) break
|
|
||||||
sink.append(buffer.copyOf(read))
|
|
||||||
}
|
}
|
||||||
|
sink.finish()
|
||||||
|
} finally {
|
||||||
|
conn.disconnect()
|
||||||
}
|
}
|
||||||
sink.finish()
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Streams the ElevenLabs response for [request] into [track] as raw 16-bit mono PCM.
 *
 * Runs inside `withContext(Dispatchers.IO)`. Returns early, without error, whenever
 * [pcmStopRequested] is observed. Two details matter for correct playback:
 *
 * - Only whole sample frames are handed to the track. A network read may end in the middle of
 *   a 16-bit sample; the dangling byte is kept and completed by the next read instead of being
 *   offered to the track as a partial frame.
 * - Once the last byte is queued, the function waits (bounded) for the track to play it. The
 *   caller tears the track down with pause + flush, which would otherwise drop whatever is
 *   still sitting in the track buffer.
 *
 * @throws IllegalStateException if the server answers with an HTTP status of 400 or above, or
 *   if the track rejects a write while no stop was requested.
 */
private suspend fun streamPcm(
  voiceId: String,
  apiKey: String,
  request: ElevenLabsRequest,
  track: AudioTrack,
) {
  withContext(Dispatchers.IO) {
    val frameBytes = 2 // one 16-bit mono sample
    var framesQueued = 0L

    val conn = openTtsConnection(voiceId = voiceId, apiKey = apiKey, request = request)
    try {
      val payload = buildRequestPayload(request)
      conn.outputStream.use { it.write(payload.toByteArray()) }

      val code = conn.responseCode
      if (code >= 400) {
        val message = conn.errorStream?.readBytes()?.toString(Charsets.UTF_8) ?: ""
        throw IllegalStateException("ElevenLabs failed: $code $message")
      }

      val buffer = ByteArray(8 * 1024)
      var carried = 0 // bytes of an incomplete frame parked at the start of the buffer
      conn.inputStream.use { input ->
        while (true) {
          if (pcmStopRequested) return@withContext
          val read = input.read(buffer, carried, buffer.size - carried)
          if (read <= 0) break

          val available = carried + read
          val writable = available - available % frameBytes
          var offset = 0
          while (offset < writable) {
            if (pcmStopRequested) return@withContext
            val wrote =
              try {
                track.write(buffer, offset, writable - offset)
              } catch (err: Throwable) {
                // A write that fails because playback was stopped is expected, not an error.
                if (pcmStopRequested) return@withContext
                throw err
              }
            if (wrote <= 0) {
              if (pcmStopRequested) return@withContext
              throw IllegalStateException("AudioTrack write failed: $wrote")
            }
            offset += wrote
          }
          framesQueued += writable / frameBytes

          // Move the trailing partial frame (if any) to the front for the next read.
          carried = available - writable
          if (carried > 0) buffer.copyInto(buffer, 0, writable, available)
        }
      }
    } finally {
      conn.disconnect()
    }

    // Everything is queued; give the track time to actually play it before the caller flushes.
    val sampleRate = track.sampleRate
    if (framesQueued == 0L || sampleRate <= 0) return@withContext
    val waitStartedAt = System.nanoTime()
    var budgetNanos = Long.MAX_VALUE
    while (!pcmStopRequested) {
      val played =
        try {
          // The head position is a 32-bit counter that has to be read as unsigned.
          track.playbackHeadPosition.toLong() and 0xFFFF_FFFFL
        } catch (_: Throwable) {
          break // most likely the track was released elsewhere; nothing left to wait for
        }
      if (played >= framesQueued) break
      if (budgetNanos == Long.MAX_VALUE) {
        // Allow the duration of the audio still unplayed plus a little slack, so a head
        // position that never quite catches up cannot stall the caller.
        val remainingMs = (framesQueued - played) * 1000L / sampleRate
        budgetNanos = (remainingMs + 250L) * 1_000_000L
      }
      if (System.nanoTime() - waitStartedAt >= budgetNanos) break
      kotlinx.coroutines.delay(10)
    }
  }
}
|
||||||
|
|
||||||
|
/**
 * Builds the configured POST connection to the ElevenLabs streaming text-to-speech endpoint
 * for [voiceId].
 *
 * The request's latency tier, when set, is appended as the `optimize_streaming_latency` query
 * parameter. Connect and read timeouts are 30 seconds each, the body is declared as JSON, the
 * `Accept` header follows the request's output format, and [apiKey] is sent as `xi-api-key`.
 * The caller is responsible for writing the body and disconnecting.
 *
 * NOTE(review): the output format is not part of the URL built here. Confirm it reaches the
 * service some other way (for example through the request payload).
 */
private fun openTtsConnection(
  voiceId: String,
  apiKey: String,
  request: ElevenLabsRequest,
): HttpURLConnection {
  val endpoint = "https://api.elevenlabs.io/v1/text-to-speech/$voiceId/stream"
  val target =
    request.latencyTier
      ?.let { tier -> URL("$endpoint?optimize_streaming_latency=$tier") }
      ?: URL(endpoint)
  val acceptHeader = resolveAcceptHeader(request.outputFormat)

  val connection = target.openConnection() as HttpURLConnection
  return connection.apply {
    requestMethod = "POST"
    connectTimeout = 30_000
    readTimeout = 30_000
    setRequestProperty("Content-Type", "application/json")
    setRequestProperty("Accept", acceptHeader)
    setRequestProperty("xi-api-key", apiKey)
    doOutput = true
  }
}
|
||||||
|
|
||||||
|
/**
 * Chooses the HTTP `Accept` value for [outputFormat].
 *
 * The format is compared case-insensitively with surrounding whitespace ignored: anything
 * starting with `pcm_` yields `audio/pcm`; everything else, including `null`, yields
 * `audio/mpeg`.
 */
private fun resolveAcceptHeader(outputFormat: String?): String {
  val format = outputFormat?.trim()?.lowercase() ?: ""
  return when {
    format.startsWith("pcm_") -> "audio/pcm"
    else -> "audio/mpeg"
  }
}
|
||||||
|
|
||||||
private fun buildRequestPayload(request: ElevenLabsRequest): String {
|
private fun buildRequestPayload(request: ElevenLabsRequest): String {
|
||||||
val voiceSettingsEntries =
|
val voiceSettingsEntries =
|
||||||
buildJsonObject {
|
buildJsonObject {
|
||||||
@@ -897,7 +1051,8 @@ class TalkModeManager(
|
|||||||
/**
 * Normalizes an output-format string and returns it only when it is an accepted format.
 *
 * The value is trimmed and lower-cased. It is accepted when it starts with `mp3_`, or when
 * [parsePcmSampleRate] recognizes it as a supported PCM format.
 *
 * @return the normalized format, or `null` for `null`, blank or unsupported input.
 */
fun validatedOutputFormat(value: String?): String? {
  val normalized = value?.trim()?.lowercase()?.takeIf { it.isNotEmpty() } ?: return null
  return when {
    normalized.startsWith("mp3_") -> normalized
    parsePcmSampleRate(normalized) != null -> normalized
    else -> null
  }
}
|
||||||
|
|
||||||
fun validatedLatencyTier(value: Int?): Int? {
|
fun validatedLatencyTier(value: Int?): Int? {
|
||||||
@@ -906,6 +1061,15 @@ class TalkModeManager(
|
|||||||
return value
|
return value
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Extracts the sample rate from a `pcm_<rate>` output-format string.
 *
 * Matching is case-insensitive and ignores surrounding whitespace. Only the leading run of
 * digits after the prefix is read, so any characters following the digits are disregarded.
 *
 * @return the rate when it is one of 16000, 22050, 24000 or 44100; otherwise `null`.
 */
fun parsePcmSampleRate(value: String?): Int? {
  val normalized = value?.trim()?.lowercase() ?: return null
  if (!normalized.startsWith("pcm_")) return null

  val rate = normalized.substring("pcm_".length).takeWhile(Char::isDigit).toIntOrNull()
  return when (rate) {
    16000, 22050, 24000, 44100 -> rate
    else -> null
  }
}
|
||||||
|
|
||||||
fun isMessageTimestampAfter(timestamp: Double, sinceSeconds: Double): Boolean {
|
fun isMessageTimestampAfter(timestamp: Double, sinceSeconds: Double): Boolean {
|
||||||
val sinceMs = sinceSeconds * 1000
|
val sinceMs = sinceSeconds * 1000
|
||||||
return if (timestamp > 10_000_000_000) {
|
return if (timestamp > 10_000_000_000) {
|
||||||
|
|||||||
@@ -58,7 +58,7 @@ Defaults:
|
|||||||
- `voiceId`: falls back to `ELEVENLABS_VOICE_ID` / `SAG_VOICE_ID` (or first ElevenLabs voice when API key is available)
|
- `voiceId`: falls back to `ELEVENLABS_VOICE_ID` / `SAG_VOICE_ID` (or first ElevenLabs voice when API key is available)
|
||||||
- `modelId`: defaults to `eleven_v3` when unset
|
- `modelId`: defaults to `eleven_v3` when unset
|
||||||
- `apiKey`: falls back to `ELEVENLABS_API_KEY` (or gateway shell profile if available)
|
- `apiKey`: falls back to `ELEVENLABS_API_KEY` (or gateway shell profile if available)
|
||||||
- `outputFormat`: defaults to `pcm_44100` on macOS/iOS for faster streaming playback (Android stays on MP3)
|
- `outputFormat`: defaults to `pcm_44100` on macOS/iOS and `pcm_24000` on Android (set `mp3_*` to force MP3 streaming)
|
||||||
|
|
||||||
## macOS UI
|
## macOS UI
|
||||||
- Menu bar toggle: **Talk**
|
- Menu bar toggle: **Talk**
|
||||||
@@ -76,3 +76,4 @@ Defaults:
|
|||||||
- TTS uses ElevenLabs streaming API with `ELEVENLABS_API_KEY` and incremental playback on macOS/iOS/Android for lower latency.
|
- TTS uses ElevenLabs streaming API with `ELEVENLABS_API_KEY` and incremental playback on macOS/iOS/Android for lower latency.
|
||||||
- `stability` for `eleven_v3` is validated to `0.0`, `0.5`, or `1.0`; other models accept `0..1`.
|
- `stability` for `eleven_v3` is validated to `0.0`, `0.5`, or `1.0`; other models accept `0..1`.
|
||||||
- `latency_tier` is validated to `0..4` when set.
|
- `latency_tier` is validated to `0..4` when set.
|
||||||
|
- Android supports `pcm_16000`, `pcm_22050`, `pcm_24000`, and `pcm_44100` output formats for low-latency AudioTrack streaming.
|
||||||
|
|||||||
Reference in New Issue
Block a user