feat: add talk mode across nodes

Peter Steinberger
2025-12-29 23:21:05 +01:00
parent 6927b0fb8d
commit 20d7882033
26 changed files with 3087 additions and 0 deletions

View File

@@ -35,6 +35,10 @@ class MainViewModel(app: Application) : AndroidViewModel(app) {
val voiceWakeMode: StateFlow<VoiceWakeMode> = runtime.voiceWakeMode
val voiceWakeStatusText: StateFlow<String> = runtime.voiceWakeStatusText
val voiceWakeIsListening: StateFlow<Boolean> = runtime.voiceWakeIsListening
val talkEnabled: StateFlow<Boolean> = runtime.talkEnabled
val talkStatusText: StateFlow<String> = runtime.talkStatusText
val talkIsListening: StateFlow<Boolean> = runtime.talkIsListening
val talkIsSpeaking: StateFlow<Boolean> = runtime.talkIsSpeaking
val manualEnabled: StateFlow<Boolean> = runtime.manualEnabled
val manualHost: StateFlow<String> = runtime.manualHost
val manualPort: StateFlow<Int> = runtime.manualPort
@@ -95,6 +99,10 @@ class MainViewModel(app: Application) : AndroidViewModel(app) {
runtime.setVoiceWakeMode(mode)
}
fun setTalkEnabled(enabled: Boolean) {
runtime.setTalkEnabled(enabled)
}
fun connect(endpoint: BridgeEndpoint) {
runtime.connect(endpoint)
}

View File

@@ -25,6 +25,7 @@ import com.steipete.clawdis.node.protocol.ClawdisCanvasA2UIAction
import com.steipete.clawdis.node.protocol.ClawdisCanvasA2UICommand
import com.steipete.clawdis.node.protocol.ClawdisCanvasCommand
import com.steipete.clawdis.node.protocol.ClawdisScreenCommand
import com.steipete.clawdis.node.voice.TalkModeManager
import com.steipete.clawdis.node.voice.VoiceWakeManager
import kotlinx.coroutines.CoroutineScope
import kotlinx.coroutines.Dispatchers
@@ -84,6 +85,15 @@ class NodeRuntime(context: Context) {
val voiceWakeStatusText: StateFlow<String>
get() = voiceWake.statusText
val talkStatusText: StateFlow<String>
get() = talkMode.statusText
val talkIsListening: StateFlow<Boolean>
get() = talkMode.isListening
val talkIsSpeaking: StateFlow<Boolean>
get() = talkMode.isSpeaking
private val discovery = BridgeDiscovery(appContext, scope = scope)
val bridges: StateFlow<List<BridgeEndpoint>> = discovery.bridges
val discoveryStatusText: StateFlow<String> = discovery.statusText
@@ -133,6 +143,9 @@ class NodeRuntime(context: Context) {
)
private val chat = ChatController(scope = scope, session = session, json = json)
private val talkMode: TalkModeManager by lazy {
TalkModeManager(context = appContext, scope = scope).also { it.attachSession(session) }
}
private fun handleSessionDisconnected(message: String) {
_statusText.value = message
@@ -163,6 +176,7 @@ class NodeRuntime(context: Context) {
val preventSleep: StateFlow<Boolean> = prefs.preventSleep
val wakeWords: StateFlow<List<String>> = prefs.wakeWords
val voiceWakeMode: StateFlow<VoiceWakeMode> = prefs.voiceWakeMode
val talkEnabled: StateFlow<Boolean> = prefs.talkEnabled
val manualEnabled: StateFlow<Boolean> = prefs.manualEnabled
val manualHost: StateFlow<String> = prefs.manualHost
val manualPort: StateFlow<Int> = prefs.manualPort
@@ -218,6 +232,13 @@ class NodeRuntime(context: Context) {
}
}
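// Keep TalkModeManager in sync with the persisted preference; also flag external
// audio capture while talk mode is enabled.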
scope.launch {
talkEnabled.collect { enabled ->
talkMode.setEnabled(enabled)
externalAudioCaptureActive.value = enabled
}
}
scope.launch(Dispatchers.Default) {
bridges.collect { list ->
if (list.isNotEmpty()) {
@@ -311,6 +332,10 @@ class NodeRuntime(context: Context) {
prefs.setVoiceWakeMode(mode)
}
fun setTalkEnabled(value: Boolean) {
prefs.setTalkEnabled(value)
}
fun connect(endpoint: BridgeEndpoint) {
scope.launch {
_statusText.value = "Connecting…"
@@ -548,6 +573,7 @@ class NodeRuntime(context: Context) {
return
}
talkMode.handleBridgeEvent(event, payloadJson)
chat.handleBridgeEvent(event, payloadJson)
}

View File

@@ -73,6 +73,9 @@ class SecurePrefs(context: Context) {
private val _voiceWakeMode = MutableStateFlow(loadVoiceWakeMode())
val voiceWakeMode: StateFlow<VoiceWakeMode> = _voiceWakeMode
private val _talkEnabled = MutableStateFlow(prefs.getBoolean("talk.enabled", false))
val talkEnabled: StateFlow<Boolean> = _talkEnabled
fun setLastDiscoveredStableId(value: String) {
val trimmed = value.trim()
prefs.edit { putString("bridge.lastDiscoveredStableId", trimmed) }
@@ -158,6 +161,11 @@ class SecurePrefs(context: Context) {
_voiceWakeMode.value = mode
}
fun setTalkEnabled(value: Boolean) {
prefs.edit { putBoolean("talk.enabled", value) }
_talkEnabled.value = value
}
private fun loadVoiceWakeMode(): VoiceWakeMode {
val raw = prefs.getString(voiceWakeModeKey, null)
val resolved = VoiceWakeMode.fromRawValue(raw)

View File

@@ -62,6 +62,8 @@ fun SettingsSheet(viewModel: MainViewModel) {
val wakeWords by viewModel.wakeWords.collectAsState()
val voiceWakeMode by viewModel.voiceWakeMode.collectAsState()
val voiceWakeStatusText by viewModel.voiceWakeStatusText.collectAsState()
val talkEnabled by viewModel.talkEnabled.collectAsState()
val talkStatusText by viewModel.talkStatusText.collectAsState()
val isConnected by viewModel.isConnected.collectAsState()
val manualEnabled by viewModel.manualEnabled.collectAsState()
val manualHost by viewModel.manualHost.collectAsState()
@@ -307,6 +309,28 @@ fun SettingsSheet(viewModel: MainViewModel) {
// Voice
item { Text("Voice", style = MaterialTheme.typography.titleSmall) }
item {
ListItem(
headlineContent = { Text("Talk Mode") },
supportingContent = { Text(talkStatusText) },
trailingContent = {
Switch(
checked = talkEnabled,
onCheckedChange = { on ->
if (on) {
val micOk =
ContextCompat.checkSelfPermission(context, Manifest.permission.RECORD_AUDIO) ==
PackageManager.PERMISSION_GRANTED
if (!micOk) audioPermissionLauncher.launch(Manifest.permission.RECORD_AUDIO)
viewModel.setTalkEnabled(true)
} else {
viewModel.setTalkEnabled(false)
}
},
)
},
)
}
item {
val enabled = voiceWakeMode != VoiceWakeMode.Off
ListItem(

View File

@@ -0,0 +1,194 @@
package com.steipete.clawdis.node.voice
import kotlinx.serialization.json.Json
import kotlinx.serialization.json.JsonElement
import kotlinx.serialization.json.JsonObject
import kotlinx.serialization.json.JsonPrimitive
import kotlinx.serialization.json.booleanOrNull
import kotlinx.serialization.json.contentOrNull
import kotlinx.serialization.json.doubleOrNull
import kotlinx.serialization.json.intOrNull
import kotlinx.serialization.json.longOrNull
private val directiveJson = Json { ignoreUnknownKeys = true }
data class TalkDirective(
val voiceId: String? = null,
val modelId: String? = null,
val speed: Double? = null,
val rateWpm: Int? = null,
val stability: Double? = null,
val similarity: Double? = null,
val style: Double? = null,
val speakerBoost: Boolean? = null,
val seed: Long? = null,
val normalize: String? = null,
val language: String? = null,
val outputFormat: String? = null,
val latencyTier: Int? = null,
val once: Boolean? = null,
)
data class TalkDirectiveParseResult(
val directive: TalkDirective?,
val stripped: String,
val unknownKeys: List<String>,
)
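// Assistant replies may begin with a single-line JSON object that tunes TTS playback
// (voice, model, speed, etc.). The parser extracts that header and returns the remaining
// text to be spoken, plus any keys it did not recognize.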
object TalkDirectiveParser {
fun parse(text: String): TalkDirectiveParseResult {
val normalized = text.replace("\r\n", "\n")
val lines = normalized.split("\n").toMutableList()
if (lines.isEmpty()) return TalkDirectiveParseResult(null, text, emptyList())
val firstNonEmpty = lines.indexOfFirst { it.trim().isNotEmpty() }
if (firstNonEmpty == -1) return TalkDirectiveParseResult(null, text, emptyList())
val head = lines[firstNonEmpty].trim()
if (!head.startsWith("{") || !head.endsWith("}")) {
return TalkDirectiveParseResult(null, text, emptyList())
}
val obj = parseJsonObject(head) ?: return TalkDirectiveParseResult(null, text, emptyList())
val speakerBoost =
boolValue(obj, listOf("speaker_boost", "speakerBoost"))
?: boolValue(obj, listOf("no_speaker_boost", "noSpeakerBoost"))?.not()
val directive = TalkDirective(
voiceId = stringValue(obj, listOf("voice", "voice_id", "voiceId")),
modelId = stringValue(obj, listOf("model", "model_id", "modelId")),
speed = doubleValue(obj, listOf("speed")),
rateWpm = intValue(obj, listOf("rate", "wpm")),
stability = doubleValue(obj, listOf("stability")),
similarity = doubleValue(obj, listOf("similarity", "similarity_boost", "similarityBoost")),
style = doubleValue(obj, listOf("style")),
speakerBoost = speakerBoost,
seed = longValue(obj, listOf("seed")),
normalize = stringValue(obj, listOf("normalize", "apply_text_normalization")),
language = stringValue(obj, listOf("lang", "language_code", "language")),
outputFormat = stringValue(obj, listOf("output_format", "format")),
latencyTier = intValue(obj, listOf("latency", "latency_tier", "latencyTier")),
once = boolValue(obj, listOf("once")),
)
val hasDirective = listOf(
directive.voiceId,
directive.modelId,
directive.speed,
directive.rateWpm,
directive.stability,
directive.similarity,
directive.style,
directive.speakerBoost,
directive.seed,
directive.normalize,
directive.language,
directive.outputFormat,
directive.latencyTier,
directive.once,
).any { it != null }
if (!hasDirective) return TalkDirectiveParseResult(null, text, emptyList())
val knownKeys = setOf(
"voice", "voice_id", "voiceid",
"model", "model_id", "modelid",
"speed", "rate", "wpm",
"stability", "similarity", "similarity_boost", "similarityboost",
"style",
"speaker_boost", "speakerboost",
"no_speaker_boost", "nospeakerboost",
"seed",
"normalize", "apply_text_normalization",
"lang", "language_code", "language",
"output_format", "format",
"latency", "latency_tier", "latencytier",
"once",
)
val unknownKeys = obj.keys.filter { !knownKeys.contains(it.lowercase()) }.sorted()
lines.removeAt(firstNonEmpty)
if (firstNonEmpty < lines.size) {
if (lines[firstNonEmpty].trim().isEmpty()) {
lines.removeAt(firstNonEmpty)
}
}
return TalkDirectiveParseResult(directive, lines.joinToString("\n"), unknownKeys)
}
private fun parseJsonObject(line: String): JsonObject? {
return try {
directiveJson.parseToJsonElement(line) as? JsonObject
} catch (_: Throwable) {
null
}
}
private fun stringValue(obj: JsonObject, keys: List<String>): String? {
for (key in keys) {
val value = obj[key].asStringOrNull()?.trim()
if (!value.isNullOrEmpty()) return value
}
return null
}
private fun doubleValue(obj: JsonObject, keys: List<String>): Double? {
for (key in keys) {
val value = obj[key].asDoubleOrNull()
if (value != null) return value
}
return null
}
private fun intValue(obj: JsonObject, keys: List<String>): Int? {
for (key in keys) {
val value = obj[key].asIntOrNull()
if (value != null) return value
}
return null
}
private fun longValue(obj: JsonObject, keys: List<String>): Long? {
for (key in keys) {
val value = obj[key].asLongOrNull()
if (value != null) return value
}
return null
}
private fun boolValue(obj: JsonObject, keys: List<String>): Boolean? {
for (key in keys) {
val value = obj[key].asBooleanOrNull()
if (value != null) return value
}
return null
}
}
private fun JsonElement?.asStringOrNull(): String? = (this as? JsonPrimitive)?.contentOrNull
private fun JsonElement?.asDoubleOrNull(): Double? {
val primitive = this as? JsonPrimitive ?: return null
if (primitive.isString) return primitive.content.toDoubleOrNull()
return primitive.doubleOrNull
}
private fun JsonElement?.asIntOrNull(): Int? {
val primitive = this as? JsonPrimitive ?: return null
if (primitive.isString) return primitive.content.toIntOrNull()
return primitive.intOrNull
}
private fun JsonElement?.asLongOrNull(): Long? {
val primitive = this as? JsonPrimitive ?: return null
if (primitive.isString) return primitive.content.toLongOrNull()
return primitive.longOrNull
}
private fun JsonElement?.asBooleanOrNull(): Boolean? {
val primitive = this as? JsonPrimitive ?: return null
if (primitive.booleanOrNull != null) return primitive.booleanOrNull
val content = primitive.contentOrNull?.trim()?.lowercase() ?: return null
return when (content) {
"true", "yes", "1" -> true
"false", "no", "0" -> false
else -> null
}
}

View File

@@ -0,0 +1,713 @@
package com.steipete.clawdis.node.voice
import android.Manifest
import android.content.Context
import android.content.Intent
import android.content.pm.PackageManager
import android.media.AudioAttributes
import android.media.MediaPlayer
import android.os.Bundle
import android.os.Handler
import android.os.Looper
import android.os.SystemClock
import android.speech.RecognitionListener
import android.speech.RecognizerIntent
import android.speech.SpeechRecognizer
import android.util.Log
import androidx.core.content.ContextCompat
import com.steipete.clawdis.node.bridge.BridgeSession
import java.io.File
import java.net.HttpURLConnection
import java.net.URL
import java.util.UUID
import kotlinx.coroutines.CompletableDeferred
import kotlinx.coroutines.CoroutineScope
import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.Job
import kotlinx.coroutines.delay
import kotlinx.coroutines.flow.MutableStateFlow
import kotlinx.coroutines.flow.StateFlow
import kotlinx.coroutines.launch
import kotlinx.coroutines.withContext
import kotlinx.serialization.json.Json
import kotlinx.serialization.json.JsonArray
import kotlinx.serialization.json.JsonElement
import kotlinx.serialization.json.JsonObject
import kotlinx.serialization.json.JsonPrimitive
import kotlinx.serialization.json.booleanOrNull
import kotlinx.serialization.json.buildJsonObject
import kotlinx.serialization.json.contentOrNull
class TalkModeManager(
private val context: Context,
private val scope: CoroutineScope,
) {
companion object {
private const val tag = "TalkMode"
}
private val mainHandler = Handler(Looper.getMainLooper())
private val json = Json { ignoreUnknownKeys = true }
private val _isEnabled = MutableStateFlow(false)
val isEnabled: StateFlow<Boolean> = _isEnabled
private val _isListening = MutableStateFlow(false)
val isListening: StateFlow<Boolean> = _isListening
private val _isSpeaking = MutableStateFlow(false)
val isSpeaking: StateFlow<Boolean> = _isSpeaking
private val _statusText = MutableStateFlow("Off")
val statusText: StateFlow<String> = _statusText
private var recognizer: SpeechRecognizer? = null
private var restartJob: Job? = null
private var stopRequested = false
private var listeningMode = false
private var silenceJob: Job? = null
private val silenceWindowMs = 700L
private var lastTranscript: String = ""
private var lastHeardAtMs: Long? = null
private var lastSpokenText: String? = null
private var lastInterruptedAtSeconds: Double? = null
private var defaultVoiceId: String? = null
private var currentVoiceId: String? = null
private var defaultModelId: String? = null
private var currentModelId: String? = null
private var defaultOutputFormat: String? = null
private var interruptOnSpeech: Boolean = true
private var voiceOverrideActive = false
private var modelOverrideActive = false
private var session: BridgeSession? = null
private var pendingRunId: String? = null
private var pendingFinal: CompletableDeferred<Boolean>? = null
private var player: MediaPlayer? = null
private var currentAudioFile: File? = null
fun attachSession(session: BridgeSession) {
this.session = session
}
fun setEnabled(enabled: Boolean) {
if (_isEnabled.value == enabled) return
_isEnabled.value = enabled
if (enabled) {
start()
} else {
stop()
}
}
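// The bridge streams chat lifecycle events; we only care about the "final" state
// of the run we started, which unblocks waitForChatFinal().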
fun handleBridgeEvent(event: String, payloadJson: String?) {
if (event != "chat") return
if (payloadJson.isNullOrBlank()) return
val pending = pendingRunId ?: return
val obj =
try {
json.parseToJsonElement(payloadJson).asObjectOrNull()
} catch (_: Throwable) {
null
} ?: return
val runId = obj["runId"].asStringOrNull() ?: return
if (runId != pending) return
val state = obj["state"].asStringOrNull() ?: return
if (state == "final") {
pendingFinal?.complete(true)
pendingFinal = null
pendingRunId = null
}
}
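// Start a continuous SpeechRecognizer session on the main thread; requires the
// RECORD_AUDIO permission and an available on-device recognizer.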
private fun start() {
mainHandler.post {
if (_isListening.value) return@post
stopRequested = false
listeningMode = true
if (!SpeechRecognizer.isRecognitionAvailable(context)) {
_statusText.value = "Speech recognizer unavailable"
return@post
}
val micOk =
ContextCompat.checkSelfPermission(context, Manifest.permission.RECORD_AUDIO) ==
PackageManager.PERMISSION_GRANTED
if (!micOk) {
_statusText.value = "Microphone permission required"
return@post
}
try {
recognizer?.destroy()
recognizer = SpeechRecognizer.createSpeechRecognizer(context).also { it.setRecognitionListener(listener) }
startListeningInternal(markListening = true)
startSilenceMonitor()
} catch (err: Throwable) {
_statusText.value = "Start failed: ${err.message ?: err::class.simpleName}"
}
}
}
private fun stop() {
stopRequested = true
listeningMode = false
restartJob?.cancel()
restartJob = null
silenceJob?.cancel()
silenceJob = null
lastTranscript = ""
lastHeardAtMs = null
_isListening.value = false
_statusText.value = "Off"
stopSpeaking()
mainHandler.post {
recognizer?.cancel()
recognizer?.destroy()
recognizer = null
}
}
private fun startListeningInternal(markListening: Boolean) {
val r = recognizer ?: return
val intent =
Intent(RecognizerIntent.ACTION_RECOGNIZE_SPEECH).apply {
putExtra(RecognizerIntent.EXTRA_LANGUAGE_MODEL, RecognizerIntent.LANGUAGE_MODEL_FREE_FORM)
putExtra(RecognizerIntent.EXTRA_PARTIAL_RESULTS, true)
putExtra(RecognizerIntent.EXTRA_MAX_RESULTS, 3)
putExtra(RecognizerIntent.EXTRA_CALLING_PACKAGE, context.packageName)
}
if (markListening) {
_statusText.value = "Listening"
_isListening.value = true
}
r.startListening(intent)
}
private fun scheduleRestart(delayMs: Long = 350) {
if (stopRequested) return
restartJob?.cancel()
restartJob =
scope.launch {
delay(delayMs)
mainHandler.post {
if (stopRequested) return@post
try {
recognizer?.cancel()
val shouldListen = listeningMode
val shouldInterrupt = _isSpeaking.value && interruptOnSpeech
if (!shouldListen && !shouldInterrupt) return@post
startListeningInternal(markListening = shouldListen)
} catch (_: Throwable) {
// handled by onError
}
}
}
}
private fun handleTranscript(text: String, isFinal: Boolean) {
val trimmed = text.trim()
if (_isSpeaking.value && interruptOnSpeech) {
if (shouldInterrupt(trimmed)) {
stopSpeaking()
}
return
}
if (!_isListening.value) return
if (trimmed.isNotEmpty()) {
lastTranscript = trimmed
lastHeardAtMs = SystemClock.elapsedRealtime()
}
if (isFinal) {
lastTranscript = trimmed
}
}
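// Poll the transcript every 200 ms; once something has been heard and the
// silence window (700 ms) elapses, hand the transcript off for a chat round trip.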
private fun startSilenceMonitor() {
silenceJob?.cancel()
silenceJob =
scope.launch {
while (_isEnabled.value) {
delay(200)
checkSilence()
}
}
}
private fun checkSilence() {
if (!_isListening.value) return
val transcript = lastTranscript.trim()
if (transcript.isEmpty()) return
val lastHeard = lastHeardAtMs ?: return
val elapsed = SystemClock.elapsedRealtime() - lastHeard
if (elapsed < silenceWindowMs) return
scope.launch { finalizeTranscript(transcript) }
}
private suspend fun finalizeTranscript(transcript: String) {
listeningMode = false
_isListening.value = false
_statusText.value = "Thinking…"
lastTranscript = ""
lastHeardAtMs = null
reloadConfig()
val prompt = buildPrompt(transcript)
val bridge = session
if (bridge == null) {
_statusText.value = "Bridge not connected"
start()
return
}
try {
val runId = sendChat(prompt, bridge)
val ok = waitForChatFinal(runId)
if (!ok) {
_statusText.value = "No reply"
start()
return
}
val assistant = fetchLatestAssistantText(bridge)
if (assistant.isNullOrBlank()) {
_statusText.value = "No reply"
start()
return
}
playAssistant(assistant)
} catch (err: Throwable) {
_statusText.value = "Talk failed: ${err.message ?: err::class.simpleName}"
}
if (_isEnabled.value) {
start()
}
}
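// Wrap the transcript with talk-mode instructions; if playback was interrupted,
// tell the model where the previous reply was cut off.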
private fun buildPrompt(transcript: String): String {
val lines = mutableListOf(
"Talk Mode active. Reply in a concise, spoken tone.",
"You may optionally prefix the response with JSON (first line) to set ElevenLabs voice, e.g. {\"voice\":\"<id>\",\"once\":true}.",
)
lastInterruptedAtSeconds?.let {
lines.add("Assistant speech interrupted at ${"%.1f".format(it)}s.")
lastInterruptedAtSeconds = null
}
lines.add("")
lines.add(transcript)
return lines.joinToString("\n")
}
private suspend fun sendChat(message: String, bridge: BridgeSession): String {
val runId = UUID.randomUUID().toString()
val params =
buildJsonObject {
put("sessionKey", JsonPrimitive("main"))
put("message", JsonPrimitive(message))
put("thinking", JsonPrimitive("low"))
put("timeoutMs", JsonPrimitive(30_000))
put("idempotencyKey", JsonPrimitive(runId))
}
val res = bridge.request("chat.send", params.toString())
val parsed = parseRunId(res) ?: runId
if (parsed != runId) {
pendingRunId = parsed
}
return parsed
}
private suspend fun waitForChatFinal(runId: String): Boolean {
pendingFinal?.cancel()
val deferred = CompletableDeferred<Boolean>()
pendingRunId = runId
pendingFinal = deferred
val result =
withContext(Dispatchers.IO) {
try {
kotlinx.coroutines.withTimeout(120_000) { deferred.await() }
} catch (_: Throwable) {
false
}
}
if (!result) {
pendingFinal = null
pendingRunId = null
}
return result
}
private suspend fun fetchLatestAssistantText(bridge: BridgeSession): String? {
val res = bridge.request("chat.history", "{\"sessionKey\":\"main\"}")
val root = json.parseToJsonElement(res).asObjectOrNull() ?: return null
val messages = root["messages"] as? JsonArray ?: return null
for (item in messages.reversed()) {
val obj = item.asObjectOrNull() ?: continue
if (obj["role"].asStringOrNull() != "assistant") continue
val content = obj["content"] as? JsonArray ?: continue
val text =
content.mapNotNull { entry ->
entry.asObjectOrNull()?.get("text")?.asStringOrNull()?.trim()
}.filter { it.isNotEmpty() }
if (text.isNotEmpty()) return text.joinToString("\n")
}
return null
}
private suspend fun playAssistant(text: String) {
val parsed = TalkDirectiveParser.parse(text)
if (parsed.unknownKeys.isNotEmpty()) {
Log.w(tag, "Unknown talk directive keys: ${parsed.unknownKeys}")
}
val directive = parsed.directive
val cleaned = parsed.stripped.trim()
if (cleaned.isEmpty()) return
if (directive?.voiceId != null) {
if (directive.once != true) {
currentVoiceId = directive.voiceId
voiceOverrideActive = true
}
}
if (directive?.modelId != null) {
if (directive.once != true) {
currentModelId = directive.modelId
modelOverrideActive = true
}
}
val voiceId = directive?.voiceId ?: currentVoiceId ?: defaultVoiceId
if (voiceId.isNullOrBlank()) {
_statusText.value = "Missing voice ID"
return
}
val apiKey = System.getenv("ELEVENLABS_API_KEY")?.trim()
if (apiKey.isNullOrEmpty()) {
_statusText.value = "Missing ELEVENLABS_API_KEY"
return
}
_statusText.value = "Speaking…"
_isSpeaking.value = true
lastSpokenText = cleaned
ensureInterruptListener()
try {
val request =
ElevenLabsRequest(
text = cleaned,
modelId = directive?.modelId ?: currentModelId ?: defaultModelId,
outputFormat = directive?.outputFormat ?: defaultOutputFormat,
speed = TalkModeRuntime.resolveSpeed(directive?.speed, directive?.rateWpm),
stability = TalkModeRuntime.validatedUnit(directive?.stability),
similarity = TalkModeRuntime.validatedUnit(directive?.similarity),
style = TalkModeRuntime.validatedUnit(directive?.style),
speakerBoost = directive?.speakerBoost,
seed = TalkModeRuntime.validatedSeed(directive?.seed),
normalize = TalkModeRuntime.validatedNormalize(directive?.normalize),
language = TalkModeRuntime.validatedLanguage(directive?.language),
)
val audio = synthesize(voiceId = voiceId, apiKey = apiKey, request = request)
playAudio(audio)
} catch (err: Throwable) {
_statusText.value = "Speak failed: ${err.message ?: err::class.simpleName}"
}
_isSpeaking.value = false
}
private suspend fun playAudio(data: ByteArray) {
stopSpeaking(resetInterrupt = false)
val file = File.createTempFile("talk-", ".mp3", context.cacheDir)
file.writeBytes(data)
currentAudioFile = file
val player = MediaPlayer()
this.player = player
val finished = CompletableDeferred<Unit>()
player.setAudioAttributes(
AudioAttributes.Builder()
.setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
.setUsage(AudioAttributes.USAGE_ASSISTANT)
.build(),
)
player.setOnCompletionListener {
finished.complete(Unit)
}
player.setOnErrorListener { _, _, _ ->
finished.completeExceptionally(IllegalStateException("MediaPlayer error"))
true
}
player.setDataSource(file.absolutePath)
withContext(Dispatchers.Main) {
player.setOnPreparedListener { it.start() }
player.prepareAsync()
}
try {
finished.await()
} finally {
cleanupPlayer()
}
}
private fun stopSpeaking(resetInterrupt: Boolean = true) {
if (!_isSpeaking.value) {
cleanupPlayer()
return
}
if (resetInterrupt) {
val currentMs = player?.currentPosition?.toDouble() ?: 0.0
lastInterruptedAtSeconds = currentMs / 1000.0
}
cleanupPlayer()
_isSpeaking.value = false
}
private fun cleanupPlayer() {
try {
player?.stop()
} catch (_: IllegalStateException) {
// stop() is only valid once playback has started; release regardless.
}
player?.release()
player = null
currentAudioFile?.delete()
currentAudioFile = null
}
private fun shouldInterrupt(transcript: String): Boolean {
val trimmed = transcript.trim()
if (trimmed.length < 3) return false
val spoken = lastSpokenText?.lowercase()
if (spoken != null && spoken.contains(trimmed.lowercase())) return false
return true
}
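// Refresh voice/model defaults from the bridge's talk config, falling back to the
// ELEVENLABS_VOICE_ID / SAG_VOICE_ID environment variables.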
private suspend fun reloadConfig() {
val bridge = session ?: return
val envVoice = System.getenv("ELEVENLABS_VOICE_ID")?.trim()
val sagVoice = System.getenv("SAG_VOICE_ID")?.trim()
try {
val res = bridge.request("config.get", "{}")
val root = json.parseToJsonElement(res).asObjectOrNull()
val config = root?.get("config").asObjectOrNull()
val talk = config?.get("talk").asObjectOrNull()
val voice = talk?.get("voiceId")?.asStringOrNull()?.trim()?.takeIf { it.isNotEmpty() }
val model = talk?.get("modelId")?.asStringOrNull()?.trim()?.takeIf { it.isNotEmpty() }
val outputFormat = talk?.get("outputFormat")?.asStringOrNull()?.trim()?.takeIf { it.isNotEmpty() }
val interrupt = talk?.get("interruptOnSpeech")?.asBooleanOrNull()
defaultVoiceId = voice ?: envVoice?.takeIf { it.isNotEmpty() } ?: sagVoice?.takeIf { it.isNotEmpty() }
if (!voiceOverrideActive) currentVoiceId = defaultVoiceId
defaultModelId = model
if (!modelOverrideActive) currentModelId = defaultModelId
defaultOutputFormat = outputFormat
if (interrupt != null) interruptOnSpeech = interrupt
} catch (_: Throwable) {
defaultVoiceId = envVoice?.takeIf { it.isNotEmpty() } ?: sagVoice?.takeIf { it.isNotEmpty() }
}
}
private fun parseRunId(jsonString: String): String? {
val obj = json.parseToJsonElement(jsonString).asObjectOrNull() ?: return null
return obj["runId"].asStringOrNull()
}
private suspend fun synthesize(voiceId: String, apiKey: String, request: ElevenLabsRequest): ByteArray {
return withContext(Dispatchers.IO) {
val url = URL("https://api.elevenlabs.io/v1/text-to-speech/$voiceId")
val conn = url.openConnection() as HttpURLConnection
conn.requestMethod = "POST"
conn.setRequestProperty("Content-Type", "application/json")
conn.setRequestProperty("Accept", "audio/mpeg")
conn.setRequestProperty("xi-api-key", apiKey)
conn.doOutput = true
val payload = buildRequestPayload(request)
conn.outputStream.use { it.write(payload.toByteArray()) }
val code = conn.responseCode
val stream = if (code >= 400) conn.errorStream else conn.inputStream
val data = stream?.readBytes() ?: ByteArray(0)
if (code >= 400) {
val message = String(data)
throw IllegalStateException("ElevenLabs failed: $code $message")
}
data
}
}
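// Build the ElevenLabs JSON body; voice_settings is only included when at least
// one tuning field was supplied.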
private fun buildRequestPayload(request: ElevenLabsRequest): String {
val voiceSettingsEntries =
buildJsonObject {
request.speed?.let { put("speed", JsonPrimitive(it)) }
request.stability?.let { put("stability", JsonPrimitive(it)) }
request.similarity?.let { put("similarity_boost", JsonPrimitive(it)) }
request.style?.let { put("style", JsonPrimitive(it)) }
request.speakerBoost?.let { put("use_speaker_boost", JsonPrimitive(it)) }
}
val payload =
buildJsonObject {
put("text", JsonPrimitive(request.text))
request.modelId?.takeIf { it.isNotEmpty() }?.let { put("model_id", JsonPrimitive(it)) }
request.outputFormat?.takeIf { it.isNotEmpty() }?.let { put("output_format", JsonPrimitive(it)) }
request.seed?.let { put("seed", JsonPrimitive(it)) }
request.normalize?.let { put("apply_text_normalization", JsonPrimitive(it)) }
request.language?.let { put("language_code", JsonPrimitive(it)) }
if (voiceSettingsEntries.isNotEmpty()) {
put("voice_settings", voiceSettingsEntries)
}
}
return payload.toString()
}
private data class ElevenLabsRequest(
val text: String,
val modelId: String?,
val outputFormat: String?,
val speed: Double?,
val stability: Double?,
val similarity: Double?,
val style: Double?,
val speakerBoost: Boolean?,
val seed: Long?,
val normalize: String?,
val language: String?,
)
private object TalkModeRuntime {
fun resolveSpeed(speed: Double?, rateWpm: Int?): Double? {
if (rateWpm != null && rateWpm > 0) {
val resolved = rateWpm.toDouble() / 175.0
if (resolved <= 0.5 || resolved >= 2.0) return null
return resolved
}
if (speed != null) {
if (speed <= 0.5 || speed >= 2.0) return null
return speed
}
return null
}
fun validatedUnit(value: Double?): Double? {
if (value == null) return null
if (value < 0 || value > 1) return null
return value
}
fun validatedSeed(value: Long?): Long? {
if (value == null) return null
if (value < 0 || value > 4294967295L) return null
return value
}
fun validatedNormalize(value: String?): String? {
val normalized = value?.trim()?.lowercase() ?: return null
return if (normalized in listOf("auto", "on", "off")) normalized else null
}
fun validatedLanguage(value: String?): String? {
val normalized = value?.trim()?.lowercase() ?: return null
if (normalized.length != 2) return null
if (!normalized.all { it in 'a'..'z' }) return null
return normalized
}
}
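// While audio is playing, keep a recognizer session alive (without marking the UI
// as listening) so the user can interrupt playback by speaking.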
private fun ensureInterruptListener() {
if (!interruptOnSpeech || !_isEnabled.value) return
mainHandler.post {
if (stopRequested) return@post
if (!SpeechRecognizer.isRecognitionAvailable(context)) return@post
try {
if (recognizer == null) {
recognizer = SpeechRecognizer.createSpeechRecognizer(context).also { it.setRecognitionListener(listener) }
}
recognizer?.cancel()
startListeningInternal(markListening = false)
} catch (_: Throwable) {
// ignore
}
}
}
private val listener =
object : RecognitionListener {
override fun onReadyForSpeech(params: Bundle?) {
if (_isEnabled.value && _isListening.value) {
_statusText.value = "Listening"
}
}
override fun onBeginningOfSpeech() {}
override fun onRmsChanged(rmsdB: Float) {}
override fun onBufferReceived(buffer: ByteArray?) {}
override fun onEndOfSpeech() {
scheduleRestart()
}
override fun onError(error: Int) {
if (stopRequested) return
_isListening.value = false
if (error == SpeechRecognizer.ERROR_INSUFFICIENT_PERMISSIONS) {
_statusText.value = "Microphone permission required"
return
}
_statusText.value =
when (error) {
SpeechRecognizer.ERROR_AUDIO -> "Audio error"
SpeechRecognizer.ERROR_CLIENT -> "Client error"
SpeechRecognizer.ERROR_NETWORK -> "Network error"
SpeechRecognizer.ERROR_NETWORK_TIMEOUT -> "Network timeout"
SpeechRecognizer.ERROR_NO_MATCH -> "Listening"
SpeechRecognizer.ERROR_RECOGNIZER_BUSY -> "Recognizer busy"
SpeechRecognizer.ERROR_SERVER -> "Server error"
SpeechRecognizer.ERROR_SPEECH_TIMEOUT -> "Listening"
else -> "Speech error ($error)"
}
scheduleRestart(delayMs = 600)
}
override fun onResults(results: Bundle?) {
val list = results?.getStringArrayList(SpeechRecognizer.RESULTS_RECOGNITION).orEmpty()
list.firstOrNull()?.let { handleTranscript(it, isFinal = true) }
scheduleRestart()
}
override fun onPartialResults(partialResults: Bundle?) {
val list = partialResults?.getStringArrayList(SpeechRecognizer.RESULTS_RECOGNITION).orEmpty()
list.firstOrNull()?.let { handleTranscript(it, isFinal = false) }
}
override fun onEvent(eventType: Int, params: Bundle?) {}
}
}
private fun JsonElement?.asObjectOrNull(): JsonObject? = this as? JsonObject
private fun JsonElement?.asStringOrNull(): String? = (this as? JsonPrimitive)?.contentOrNull
private fun JsonElement?.asBooleanOrNull(): Boolean? {
val primitive = this as? JsonPrimitive ?: return null
if (primitive.booleanOrNull != null) return primitive.booleanOrNull
val content = primitive.contentOrNull?.trim()?.lowercase() ?: return null
return when (content) {
"true", "yes", "1" -> true
"false", "no", "0" -> false
else -> null
}
}

View File

@@ -0,0 +1,55 @@
package com.steipete.clawdis.node.voice
import org.junit.Assert.assertEquals
import org.junit.Assert.assertNull
import org.junit.Assert.assertTrue
import org.junit.Test
class TalkDirectiveParserTest {
@Test
fun parsesDirectiveAndStripsHeader() {
val input = """
{"voice":"voice-123","once":true}
Hello from talk mode.
""".trimIndent()
val result = TalkDirectiveParser.parse(input)
assertEquals("voice-123", result.directive?.voiceId)
assertEquals(true, result.directive?.once)
assertEquals("Hello from talk mode.", result.stripped.trim())
}
@Test
fun ignoresUnknownKeysButReportsThem() {
val input = """
{"voice":"abc","foo":1,"bar":"baz"}
Hi there.
""".trimIndent()
val result = TalkDirectiveParser.parse(input)
assertEquals("abc", result.directive?.voiceId)
assertTrue(result.unknownKeys.containsAll(listOf("bar", "foo")))
}
@Test
fun parsesAlternateKeys() {
val input = """
{"model_id":"eleven_v3","similarity_boost":0.4,"no_speaker_boost":true,"rate":200}
Speak.
""".trimIndent()
val result = TalkDirectiveParser.parse(input)
assertEquals("eleven_v3", result.directive?.modelId)
assertEquals(0.4, result.directive?.similarity)
assertEquals(false, result.directive?.speakerBoost)
assertEquals(200, result.directive?.rateWpm)
}
@Test
fun returnsNullWhenNoDirectivePresent() {
val input = """
{}
Hello.
""".trimIndent()
val result = TalkDirectiveParser.parse(input)
assertNull(result.directive)
assertEquals(input, result.stripped)
}
}