feat: add talk mode across nodes

Peter Steinberger
2025-12-29 23:21:05 +01:00
parent 6927b0fb8d
commit 20d7882033
26 changed files with 3087 additions and 0 deletions

View File

@@ -35,6 +35,10 @@ class MainViewModel(app: Application) : AndroidViewModel(app) {
val voiceWakeMode: StateFlow<VoiceWakeMode> = runtime.voiceWakeMode
val voiceWakeStatusText: StateFlow<String> = runtime.voiceWakeStatusText
val voiceWakeIsListening: StateFlow<Boolean> = runtime.voiceWakeIsListening
val talkEnabled: StateFlow<Boolean> = runtime.talkEnabled
val talkStatusText: StateFlow<String> = runtime.talkStatusText
val talkIsListening: StateFlow<Boolean> = runtime.talkIsListening
val talkIsSpeaking: StateFlow<Boolean> = runtime.talkIsSpeaking
val manualEnabled: StateFlow<Boolean> = runtime.manualEnabled
val manualHost: StateFlow<String> = runtime.manualHost
val manualPort: StateFlow<Int> = runtime.manualPort
@@ -95,6 +99,10 @@ class MainViewModel(app: Application) : AndroidViewModel(app) {
runtime.setVoiceWakeMode(mode)
}
fun setTalkEnabled(enabled: Boolean) {
runtime.setTalkEnabled(enabled)
}
fun connect(endpoint: BridgeEndpoint) {
runtime.connect(endpoint)
}

View File

@@ -25,6 +25,7 @@ import com.steipete.clawdis.node.protocol.ClawdisCanvasA2UIAction
import com.steipete.clawdis.node.protocol.ClawdisCanvasA2UICommand
import com.steipete.clawdis.node.protocol.ClawdisCanvasCommand
import com.steipete.clawdis.node.protocol.ClawdisScreenCommand
import com.steipete.clawdis.node.voice.TalkModeManager
import com.steipete.clawdis.node.voice.VoiceWakeManager
import kotlinx.coroutines.CoroutineScope
import kotlinx.coroutines.Dispatchers
@@ -84,6 +85,15 @@ class NodeRuntime(context: Context) {
val voiceWakeStatusText: StateFlow<String>
get() = voiceWake.statusText
val talkStatusText: StateFlow<String>
get() = talkMode.statusText
val talkIsListening: StateFlow<Boolean>
get() = talkMode.isListening
val talkIsSpeaking: StateFlow<Boolean>
get() = talkMode.isSpeaking
private val discovery = BridgeDiscovery(appContext, scope = scope)
val bridges: StateFlow<List<BridgeEndpoint>> = discovery.bridges
val discoveryStatusText: StateFlow<String> = discovery.statusText
@@ -133,6 +143,9 @@ class NodeRuntime(context: Context) {
)
private val chat = ChatController(scope = scope, session = session, json = json)
private val talkMode: TalkModeManager by lazy {
TalkModeManager(context = appContext, scope = scope).also { it.attachSession(session) }
}
private fun handleSessionDisconnected(message: String) {
_statusText.value = message
@@ -163,6 +176,7 @@ class NodeRuntime(context: Context) {
val preventSleep: StateFlow<Boolean> = prefs.preventSleep
val wakeWords: StateFlow<List<String>> = prefs.wakeWords
val voiceWakeMode: StateFlow<VoiceWakeMode> = prefs.voiceWakeMode
val talkEnabled: StateFlow<Boolean> = prefs.talkEnabled
val manualEnabled: StateFlow<Boolean> = prefs.manualEnabled
val manualHost: StateFlow<String> = prefs.manualHost
val manualPort: StateFlow<Int> = prefs.manualPort
@@ -218,6 +232,13 @@ class NodeRuntime(context: Context) {
}
}
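// Keep TalkModeManager in sync with the persisted preference; also flag external
// audio capture while talk mode is enabled.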
scope.launch {
talkEnabled.collect { enabled ->
talkMode.setEnabled(enabled)
externalAudioCaptureActive.value = enabled
}
}
scope.launch(Dispatchers.Default) {
bridges.collect { list ->
if (list.isNotEmpty()) {
@@ -311,6 +332,10 @@ class NodeRuntime(context: Context) {
prefs.setVoiceWakeMode(mode)
}
fun setTalkEnabled(value: Boolean) {
prefs.setTalkEnabled(value)
}
fun connect(endpoint: BridgeEndpoint) {
scope.launch {
_statusText.value = "Connecting…"
@@ -548,6 +573,7 @@ class NodeRuntime(context: Context) {
return
}
talkMode.handleBridgeEvent(event, payloadJson)
chat.handleBridgeEvent(event, payloadJson)
}

View File

@@ -73,6 +73,9 @@ class SecurePrefs(context: Context) {
private val _voiceWakeMode = MutableStateFlow(loadVoiceWakeMode())
val voiceWakeMode: StateFlow<VoiceWakeMode> = _voiceWakeMode
private val _talkEnabled = MutableStateFlow(prefs.getBoolean("talk.enabled", false))
val talkEnabled: StateFlow<Boolean> = _talkEnabled
fun setLastDiscoveredStableId(value: String) {
val trimmed = value.trim()
prefs.edit { putString("bridge.lastDiscoveredStableId", trimmed) }
@@ -158,6 +161,11 @@ class SecurePrefs(context: Context) {
_voiceWakeMode.value = mode
}
fun setTalkEnabled(value: Boolean) {
prefs.edit { putBoolean("talk.enabled", value) }
_talkEnabled.value = value
}
private fun loadVoiceWakeMode(): VoiceWakeMode {
val raw = prefs.getString(voiceWakeModeKey, null)
val resolved = VoiceWakeMode.fromRawValue(raw)

View File

@@ -62,6 +62,8 @@ fun SettingsSheet(viewModel: MainViewModel) {
val wakeWords by viewModel.wakeWords.collectAsState()
val voiceWakeMode by viewModel.voiceWakeMode.collectAsState()
val voiceWakeStatusText by viewModel.voiceWakeStatusText.collectAsState()
val talkEnabled by viewModel.talkEnabled.collectAsState()
val talkStatusText by viewModel.talkStatusText.collectAsState()
val isConnected by viewModel.isConnected.collectAsState()
val manualEnabled by viewModel.manualEnabled.collectAsState()
val manualHost by viewModel.manualHost.collectAsState()
@@ -307,6 +309,28 @@ fun SettingsSheet(viewModel: MainViewModel) {
// Voice
item { Text("Voice", style = MaterialTheme.typography.titleSmall) }
item {
ListItem(
headlineContent = { Text("Talk Mode") },
supportingContent = { Text(talkStatusText) },
trailingContent = {
Switch(
checked = talkEnabled,
onCheckedChange = { on ->
if (on) {
val micOk =
ContextCompat.checkSelfPermission(context, Manifest.permission.RECORD_AUDIO) ==
PackageManager.PERMISSION_GRANTED
if (!micOk) audioPermissionLauncher.launch(Manifest.permission.RECORD_AUDIO)
viewModel.setTalkEnabled(true)
} else {
viewModel.setTalkEnabled(false)
}
},
)
},
)
}
item {
val enabled = voiceWakeMode != VoiceWakeMode.Off
ListItem(

View File

@@ -0,0 +1,194 @@
package com.steipete.clawdis.node.voice
import kotlinx.serialization.json.Json
import kotlinx.serialization.json.JsonElement
import kotlinx.serialization.json.JsonObject
import kotlinx.serialization.json.JsonPrimitive
import kotlinx.serialization.json.booleanOrNull
import kotlinx.serialization.json.contentOrNull
import kotlinx.serialization.json.doubleOrNull
import kotlinx.serialization.json.intOrNull
import kotlinx.serialization.json.longOrNull
private val directiveJson = Json { ignoreUnknownKeys = true }
data class TalkDirective(
val voiceId: String? = null,
val modelId: String? = null,
val speed: Double? = null,
val rateWpm: Int? = null,
val stability: Double? = null,
val similarity: Double? = null,
val style: Double? = null,
val speakerBoost: Boolean? = null,
val seed: Long? = null,
val normalize: String? = null,
val language: String? = null,
val outputFormat: String? = null,
val latencyTier: Int? = null,
val once: Boolean? = null,
)
data class TalkDirectiveParseResult(
val directive: TalkDirective?,
val stripped: String,
val unknownKeys: List<String>,
)
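// Assistant replies may begin with a single-line JSON object that tunes TTS playback
// (voice, model, speed, etc.). The parser extracts that header and returns the remaining
// text to be spoken, plus any keys it did not recognize.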
object TalkDirectiveParser {
fun parse(text: String): TalkDirectiveParseResult {
val normalized = text.replace("\r\n", "\n")
val lines = normalized.split("\n").toMutableList()
if (lines.isEmpty()) return TalkDirectiveParseResult(null, text, emptyList())
val firstNonEmpty = lines.indexOfFirst { it.trim().isNotEmpty() }
if (firstNonEmpty == -1) return TalkDirectiveParseResult(null, text, emptyList())
val head = lines[firstNonEmpty].trim()
if (!head.startsWith("{") || !head.endsWith("}")) {
return TalkDirectiveParseResult(null, text, emptyList())
}
val obj = parseJsonObject(head) ?: return TalkDirectiveParseResult(null, text, emptyList())
val speakerBoost =
boolValue(obj, listOf("speaker_boost", "speakerBoost"))
?: boolValue(obj, listOf("no_speaker_boost", "noSpeakerBoost"))?.not()
val directive = TalkDirective(
voiceId = stringValue(obj, listOf("voice", "voice_id", "voiceId")),
modelId = stringValue(obj, listOf("model", "model_id", "modelId")),
speed = doubleValue(obj, listOf("speed")),
rateWpm = intValue(obj, listOf("rate", "wpm")),
stability = doubleValue(obj, listOf("stability")),
similarity = doubleValue(obj, listOf("similarity", "similarity_boost", "similarityBoost")),
style = doubleValue(obj, listOf("style")),
speakerBoost = speakerBoost,
seed = longValue(obj, listOf("seed")),
normalize = stringValue(obj, listOf("normalize", "apply_text_normalization")),
language = stringValue(obj, listOf("lang", "language_code", "language")),
outputFormat = stringValue(obj, listOf("output_format", "format")),
latencyTier = intValue(obj, listOf("latency", "latency_tier", "latencyTier")),
once = boolValue(obj, listOf("once")),
)
val hasDirective = listOf(
directive.voiceId,
directive.modelId,
directive.speed,
directive.rateWpm,
directive.stability,
directive.similarity,
directive.style,
directive.speakerBoost,
directive.seed,
directive.normalize,
directive.language,
directive.outputFormat,
directive.latencyTier,
directive.once,
).any { it != null }
if (!hasDirective) return TalkDirectiveParseResult(null, text, emptyList())
val knownKeys = setOf(
"voice", "voice_id", "voiceid",
"model", "model_id", "modelid",
"speed", "rate", "wpm",
"stability", "similarity", "similarity_boost", "similarityboost",
"style",
"speaker_boost", "speakerboost",
"no_speaker_boost", "nospeakerboost",
"seed",
"normalize", "apply_text_normalization",
"lang", "language_code", "language",
"output_format", "format",
"latency", "latency_tier", "latencytier",
"once",
)
val unknownKeys = obj.keys.filter { !knownKeys.contains(it.lowercase()) }.sorted()
lines.removeAt(firstNonEmpty)
if (firstNonEmpty < lines.size) {
if (lines[firstNonEmpty].trim().isEmpty()) {
lines.removeAt(firstNonEmpty)
}
}
return TalkDirectiveParseResult(directive, lines.joinToString("\n"), unknownKeys)
}
private fun parseJsonObject(line: String): JsonObject? {
return try {
directiveJson.parseToJsonElement(line) as? JsonObject
} catch (_: Throwable) {
null
}
}
private fun stringValue(obj: JsonObject, keys: List<String>): String? {
for (key in keys) {
val value = obj[key].asStringOrNull()?.trim()
if (!value.isNullOrEmpty()) return value
}
return null
}
private fun doubleValue(obj: JsonObject, keys: List<String>): Double? {
for (key in keys) {
val value = obj[key].asDoubleOrNull()
if (value != null) return value
}
return null
}
private fun intValue(obj: JsonObject, keys: List<String>): Int? {
for (key in keys) {
val value = obj[key].asIntOrNull()
if (value != null) return value
}
return null
}
private fun longValue(obj: JsonObject, keys: List<String>): Long? {
for (key in keys) {
val value = obj[key].asLongOrNull()
if (value != null) return value
}
return null
}
private fun boolValue(obj: JsonObject, keys: List<String>): Boolean? {
for (key in keys) {
val value = obj[key].asBooleanOrNull()
if (value != null) return value
}
return null
}
}
private fun JsonElement?.asStringOrNull(): String? = (this as? JsonPrimitive)?.contentOrNull
private fun JsonElement?.asDoubleOrNull(): Double? {
val primitive = this as? JsonPrimitive ?: return null
if (primitive.isString) return primitive.content.toDoubleOrNull()
return primitive.doubleOrNull
}
private fun JsonElement?.asIntOrNull(): Int? {
val primitive = this as? JsonPrimitive ?: return null
if (primitive.isString) return primitive.content.toIntOrNull()
return primitive.intOrNull
}
private fun JsonElement?.asLongOrNull(): Long? {
val primitive = this as? JsonPrimitive ?: return null
if (primitive.isString) return primitive.content.toLongOrNull()
return primitive.longOrNull
}
private fun JsonElement?.asBooleanOrNull(): Boolean? {
val primitive = this as? JsonPrimitive ?: return null
if (primitive.booleanOrNull != null) return primitive.booleanOrNull
val content = primitive.contentOrNull?.trim()?.lowercase() ?: return null
return when (content) {
"true", "yes", "1" -> true
"false", "no", "0" -> false
else -> null
}
}

View File

@@ -0,0 +1,713 @@
package com.steipete.clawdis.node.voice
import android.Manifest
import android.content.Context
import android.content.Intent
import android.content.pm.PackageManager
import android.media.AudioAttributes
import android.media.MediaPlayer
import android.os.Bundle
import android.os.Handler
import android.os.Looper
import android.os.SystemClock
import android.speech.RecognitionListener
import android.speech.RecognizerIntent
import android.speech.SpeechRecognizer
import android.util.Log
import androidx.core.content.ContextCompat
import com.steipete.clawdis.node.bridge.BridgeSession
import java.io.File
import java.net.HttpURLConnection
import java.net.URL
import java.util.UUID
import kotlinx.coroutines.CompletableDeferred
import kotlinx.coroutines.CoroutineScope
import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.Job
import kotlinx.coroutines.delay
import kotlinx.coroutines.flow.MutableStateFlow
import kotlinx.coroutines.flow.StateFlow
import kotlinx.coroutines.launch
import kotlinx.coroutines.withContext
import kotlinx.serialization.json.Json
import kotlinx.serialization.json.JsonArray
import kotlinx.serialization.json.JsonElement
import kotlinx.serialization.json.JsonObject
import kotlinx.serialization.json.JsonPrimitive
import kotlinx.serialization.json.booleanOrNull
import kotlinx.serialization.json.buildJsonObject
import kotlinx.serialization.json.contentOrNull
class TalkModeManager(
private val context: Context,
private val scope: CoroutineScope,
) {
companion object {
private const val tag = "TalkMode"
}
private val mainHandler = Handler(Looper.getMainLooper())
private val json = Json { ignoreUnknownKeys = true }
private val _isEnabled = MutableStateFlow(false)
val isEnabled: StateFlow<Boolean> = _isEnabled
private val _isListening = MutableStateFlow(false)
val isListening: StateFlow<Boolean> = _isListening
private val _isSpeaking = MutableStateFlow(false)
val isSpeaking: StateFlow<Boolean> = _isSpeaking
private val _statusText = MutableStateFlow("Off")
val statusText: StateFlow<String> = _statusText
private var recognizer: SpeechRecognizer? = null
private var restartJob: Job? = null
private var stopRequested = false
private var listeningMode = false
private var silenceJob: Job? = null
private val silenceWindowMs = 700L
private var lastTranscript: String = ""
private var lastHeardAtMs: Long? = null
private var lastSpokenText: String? = null
private var lastInterruptedAtSeconds: Double? = null
private var defaultVoiceId: String? = null
private var currentVoiceId: String? = null
private var defaultModelId: String? = null
private var currentModelId: String? = null
private var defaultOutputFormat: String? = null
private var interruptOnSpeech: Boolean = true
private var voiceOverrideActive = false
private var modelOverrideActive = false
private var session: BridgeSession? = null
private var pendingRunId: String? = null
private var pendingFinal: CompletableDeferred<Boolean>? = null
private var player: MediaPlayer? = null
private var currentAudioFile: File? = null
fun attachSession(session: BridgeSession) {
this.session = session
}
fun setEnabled(enabled: Boolean) {
if (_isEnabled.value == enabled) return
_isEnabled.value = enabled
if (enabled) {
start()
} else {
stop()
}
}
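// The bridge streams chat lifecycle events; we only care about the "final" state
// of the run we started, which unblocks waitForChatFinal().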
fun handleBridgeEvent(event: String, payloadJson: String?) {
if (event != "chat") return
if (payloadJson.isNullOrBlank()) return
val pending = pendingRunId ?: return
val obj =
try {
json.parseToJsonElement(payloadJson).asObjectOrNull()
} catch (_: Throwable) {
null
} ?: return
val runId = obj["runId"].asStringOrNull() ?: return
if (runId != pending) return
val state = obj["state"].asStringOrNull() ?: return
if (state == "final") {
pendingFinal?.complete(true)
pendingFinal = null
pendingRunId = null
}
}
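// Start a continuous SpeechRecognizer session on the main thread; requires the
// RECORD_AUDIO permission and an available on-device recognizer.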
private fun start() {
mainHandler.post {
if (_isListening.value) return@post
stopRequested = false
listeningMode = true
if (!SpeechRecognizer.isRecognitionAvailable(context)) {
_statusText.value = "Speech recognizer unavailable"
return@post
}
val micOk =
ContextCompat.checkSelfPermission(context, Manifest.permission.RECORD_AUDIO) ==
PackageManager.PERMISSION_GRANTED
if (!micOk) {
_statusText.value = "Microphone permission required"
return@post
}
try {
recognizer?.destroy()
recognizer = SpeechRecognizer.createSpeechRecognizer(context).also { it.setRecognitionListener(listener) }
startListeningInternal(markListening = true)
startSilenceMonitor()
} catch (err: Throwable) {
_statusText.value = "Start failed: ${err.message ?: err::class.simpleName}"
}
}
}
private fun stop() {
stopRequested = true
listeningMode = false
restartJob?.cancel()
restartJob = null
silenceJob?.cancel()
silenceJob = null
lastTranscript = ""
lastHeardAtMs = null
_isListening.value = false
_statusText.value = "Off"
stopSpeaking()
mainHandler.post {
recognizer?.cancel()
recognizer?.destroy()
recognizer = null
}
}
private fun startListeningInternal(markListening: Boolean) {
val r = recognizer ?: return
val intent =
Intent(RecognizerIntent.ACTION_RECOGNIZE_SPEECH).apply {
putExtra(RecognizerIntent.EXTRA_LANGUAGE_MODEL, RecognizerIntent.LANGUAGE_MODEL_FREE_FORM)
putExtra(RecognizerIntent.EXTRA_PARTIAL_RESULTS, true)
putExtra(RecognizerIntent.EXTRA_MAX_RESULTS, 3)
putExtra(RecognizerIntent.EXTRA_CALLING_PACKAGE, context.packageName)
}
if (markListening) {
_statusText.value = "Listening"
_isListening.value = true
}
r.startListening(intent)
}
private fun scheduleRestart(delayMs: Long = 350) {
if (stopRequested) return
restartJob?.cancel()
restartJob =
scope.launch {
delay(delayMs)
mainHandler.post {
if (stopRequested) return@post
try {
recognizer?.cancel()
val shouldListen = listeningMode
val shouldInterrupt = _isSpeaking.value && interruptOnSpeech
if (!shouldListen && !shouldInterrupt) return@post
startListeningInternal(markListening = shouldListen)
} catch (_: Throwable) {
// handled by onError
}
}
}
}
private fun handleTranscript(text: String, isFinal: Boolean) {
val trimmed = text.trim()
if (_isSpeaking.value && interruptOnSpeech) {
if (shouldInterrupt(trimmed)) {
stopSpeaking()
}
return
}
if (!_isListening.value) return
if (trimmed.isNotEmpty()) {
lastTranscript = trimmed
lastHeardAtMs = SystemClock.elapsedRealtime()
}
if (isFinal) {
lastTranscript = trimmed
}
}
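// Poll the transcript every 200 ms; once something has been heard and the
// silence window (700 ms) elapses, hand the transcript off for a chat round trip.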
private fun startSilenceMonitor() {
silenceJob?.cancel()
silenceJob =
scope.launch {
while (_isEnabled.value) {
delay(200)
checkSilence()
}
}
}
private fun checkSilence() {
if (!_isListening.value) return
val transcript = lastTranscript.trim()
if (transcript.isEmpty()) return
val lastHeard = lastHeardAtMs ?: return
val elapsed = SystemClock.elapsedRealtime() - lastHeard
if (elapsed < silenceWindowMs) return
scope.launch { finalizeTranscript(transcript) }
}
private suspend fun finalizeTranscript(transcript: String) {
listeningMode = false
_isListening.value = false
_statusText.value = "Thinking…"
lastTranscript = ""
lastHeardAtMs = null
reloadConfig()
val prompt = buildPrompt(transcript)
val bridge = session
if (bridge == null) {
_statusText.value = "Bridge not connected"
start()
return
}
try {
val runId = sendChat(prompt, bridge)
val ok = waitForChatFinal(runId)
if (!ok) {
_statusText.value = "No reply"
start()
return
}
val assistant = fetchLatestAssistantText(bridge)
if (assistant.isNullOrBlank()) {
_statusText.value = "No reply"
start()
return
}
playAssistant(assistant)
} catch (err: Throwable) {
_statusText.value = "Talk failed: ${err.message ?: err::class.simpleName}"
}
if (_isEnabled.value) {
start()
}
}
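// Wrap the transcript with talk-mode instructions; if playback was interrupted,
// tell the model where the previous reply was cut off.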
private fun buildPrompt(transcript: String): String {
val lines = mutableListOf(
"Talk Mode active. Reply in a concise, spoken tone.",
"You may optionally prefix the response with JSON (first line) to set ElevenLabs voice, e.g. {\"voice\":\"<id>\",\"once\":true}.",
)
lastInterruptedAtSeconds?.let {
lines.add("Assistant speech interrupted at ${"%.1f".format(it)}s.")
lastInterruptedAtSeconds = null
}
lines.add("")
lines.add(transcript)
return lines.joinToString("\n")
}
private suspend fun sendChat(message: String, bridge: BridgeSession): String {
val runId = UUID.randomUUID().toString()
val params =
buildJsonObject {
put("sessionKey", JsonPrimitive("main"))
put("message", JsonPrimitive(message))
put("thinking", JsonPrimitive("low"))
put("timeoutMs", JsonPrimitive(30_000))
put("idempotencyKey", JsonPrimitive(runId))
}
val res = bridge.request("chat.send", params.toString())
val parsed = parseRunId(res) ?: runId
if (parsed != runId) {
pendingRunId = parsed
}
return parsed
}
private suspend fun waitForChatFinal(runId: String): Boolean {
pendingFinal?.cancel()
val deferred = CompletableDeferred<Boolean>()
pendingRunId = runId
pendingFinal = deferred
val result =
withContext(Dispatchers.IO) {
try {
kotlinx.coroutines.withTimeout(120_000) { deferred.await() }
} catch (_: Throwable) {
false
}
}
if (!result) {
pendingFinal = null
pendingRunId = null
}
return result
}
private suspend fun fetchLatestAssistantText(bridge: BridgeSession): String? {
val res = bridge.request("chat.history", "{\"sessionKey\":\"main\"}")
val root = json.parseToJsonElement(res).asObjectOrNull() ?: return null
val messages = root["messages"] as? JsonArray ?: return null
for (item in messages.reversed()) {
val obj = item.asObjectOrNull() ?: continue
if (obj["role"].asStringOrNull() != "assistant") continue
val content = obj["content"] as? JsonArray ?: continue
val text =
content.mapNotNull { entry ->
entry.asObjectOrNull()?.get("text")?.asStringOrNull()?.trim()
}.filter { it.isNotEmpty() }
if (text.isNotEmpty()) return text.joinToString("\n")
}
return null
}
private suspend fun playAssistant(text: String) {
val parsed = TalkDirectiveParser.parse(text)
if (parsed.unknownKeys.isNotEmpty()) {
Log.w(tag, "Unknown talk directive keys: ${parsed.unknownKeys}")
}
val directive = parsed.directive
val cleaned = parsed.stripped.trim()
if (cleaned.isEmpty()) return
if (directive?.voiceId != null) {
if (directive.once != true) {
currentVoiceId = directive.voiceId
voiceOverrideActive = true
}
}
if (directive?.modelId != null) {
if (directive.once != true) {
currentModelId = directive.modelId
modelOverrideActive = true
}
}
val voiceId = directive?.voiceId ?: currentVoiceId ?: defaultVoiceId
if (voiceId.isNullOrBlank()) {
_statusText.value = "Missing voice ID"
return
}
val apiKey = System.getenv("ELEVENLABS_API_KEY")?.trim()
if (apiKey.isNullOrEmpty()) {
_statusText.value = "Missing ELEVENLABS_API_KEY"
return
}
_statusText.value = "Speaking…"
_isSpeaking.value = true
lastSpokenText = cleaned
ensureInterruptListener()
try {
val request =
ElevenLabsRequest(
text = cleaned,
modelId = directive?.modelId ?: currentModelId ?: defaultModelId,
outputFormat = directive?.outputFormat ?: defaultOutputFormat,
speed = TalkModeRuntime.resolveSpeed(directive?.speed, directive?.rateWpm),
stability = TalkModeRuntime.validatedUnit(directive?.stability),
similarity = TalkModeRuntime.validatedUnit(directive?.similarity),
style = TalkModeRuntime.validatedUnit(directive?.style),
speakerBoost = directive?.speakerBoost,
seed = TalkModeRuntime.validatedSeed(directive?.seed),
normalize = TalkModeRuntime.validatedNormalize(directive?.normalize),
language = TalkModeRuntime.validatedLanguage(directive?.language),
)
val audio = synthesize(voiceId = voiceId, apiKey = apiKey, request = request)
playAudio(audio)
} catch (err: Throwable) {
_statusText.value = "Speak failed: ${err.message ?: err::class.simpleName}"
}
_isSpeaking.value = false
}
private suspend fun playAudio(data: ByteArray) {
stopSpeaking(resetInterrupt = false)
val file = File.createTempFile("talk-", ".mp3", context.cacheDir)
file.writeBytes(data)
currentAudioFile = file
val player = MediaPlayer()
this.player = player
val finished = CompletableDeferred<Unit>()
player.setAudioAttributes(
AudioAttributes.Builder()
.setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
.setUsage(AudioAttributes.USAGE_ASSISTANT)
.build(),
)
player.setOnCompletionListener {
finished.complete(Unit)
}
player.setOnErrorListener { _, _, _ ->
finished.completeExceptionally(IllegalStateException("MediaPlayer error"))
true
}
player.setDataSource(file.absolutePath)
withContext(Dispatchers.Main) {
player.setOnPreparedListener { it.start() }
player.prepareAsync()
}
try {
finished.await()
} finally {
cleanupPlayer()
}
}
private fun stopSpeaking(resetInterrupt: Boolean = true) {
if (!_isSpeaking.value) {
cleanupPlayer()
return
}
if (resetInterrupt) {
val currentMs = player?.currentPosition?.toDouble() ?: 0.0
lastInterruptedAtSeconds = currentMs / 1000.0
}
cleanupPlayer()
_isSpeaking.value = false
}
private fun cleanupPlayer() {
try {
player?.stop()
} catch (_: IllegalStateException) {
// stop() is only valid once playback has started; release regardless.
}
player?.release()
player = null
currentAudioFile?.delete()
currentAudioFile = null
}
private fun shouldInterrupt(transcript: String): Boolean {
val trimmed = transcript.trim()
if (trimmed.length < 3) return false
val spoken = lastSpokenText?.lowercase()
if (spoken != null && spoken.contains(trimmed.lowercase())) return false
return true
}
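// Refresh voice/model defaults from the bridge's talk config, falling back to the
// ELEVENLABS_VOICE_ID / SAG_VOICE_ID environment variables.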
private suspend fun reloadConfig() {
val bridge = session ?: return
val envVoice = System.getenv("ELEVENLABS_VOICE_ID")?.trim()
val sagVoice = System.getenv("SAG_VOICE_ID")?.trim()
try {
val res = bridge.request("config.get", "{}")
val root = json.parseToJsonElement(res).asObjectOrNull()
val config = root?.get("config").asObjectOrNull()
val talk = config?.get("talk").asObjectOrNull()
val voice = talk?.get("voiceId")?.asStringOrNull()?.trim()?.takeIf { it.isNotEmpty() }
val model = talk?.get("modelId")?.asStringOrNull()?.trim()?.takeIf { it.isNotEmpty() }
val outputFormat = talk?.get("outputFormat")?.asStringOrNull()?.trim()?.takeIf { it.isNotEmpty() }
val interrupt = talk?.get("interruptOnSpeech")?.asBooleanOrNull()
defaultVoiceId = voice ?: envVoice?.takeIf { it.isNotEmpty() } ?: sagVoice?.takeIf { it.isNotEmpty() }
if (!voiceOverrideActive) currentVoiceId = defaultVoiceId
defaultModelId = model
if (!modelOverrideActive) currentModelId = defaultModelId
defaultOutputFormat = outputFormat
if (interrupt != null) interruptOnSpeech = interrupt
} catch (_: Throwable) {
defaultVoiceId = envVoice?.takeIf { it.isNotEmpty() } ?: sagVoice?.takeIf { it.isNotEmpty() }
}
}
private fun parseRunId(jsonString: String): String? {
val obj = json.parseToJsonElement(jsonString).asObjectOrNull() ?: return null
return obj["runId"].asStringOrNull()
}
private suspend fun synthesize(voiceId: String, apiKey: String, request: ElevenLabsRequest): ByteArray {
return withContext(Dispatchers.IO) {
val url = URL("https://api.elevenlabs.io/v1/text-to-speech/$voiceId")
val conn = url.openConnection() as HttpURLConnection
conn.requestMethod = "POST"
conn.setRequestProperty("Content-Type", "application/json")
conn.setRequestProperty("Accept", "audio/mpeg")
conn.setRequestProperty("xi-api-key", apiKey)
conn.doOutput = true
val payload = buildRequestPayload(request)
conn.outputStream.use { it.write(payload.toByteArray()) }
val code = conn.responseCode
val stream = if (code >= 400) conn.errorStream else conn.inputStream
val data = stream?.readBytes() ?: ByteArray(0)
if (code >= 400) {
val message = String(data)
throw IllegalStateException("ElevenLabs failed: $code $message")
}
data
}
}
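// Build the ElevenLabs JSON body; voice_settings is only included when at least
// one tuning field was supplied.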
private fun buildRequestPayload(request: ElevenLabsRequest): String {
val voiceSettingsEntries =
buildJsonObject {
request.speed?.let { put("speed", JsonPrimitive(it)) }
request.stability?.let { put("stability", JsonPrimitive(it)) }
request.similarity?.let { put("similarity_boost", JsonPrimitive(it)) }
request.style?.let { put("style", JsonPrimitive(it)) }
request.speakerBoost?.let { put("use_speaker_boost", JsonPrimitive(it)) }
}
val payload =
buildJsonObject {
put("text", JsonPrimitive(request.text))
request.modelId?.takeIf { it.isNotEmpty() }?.let { put("model_id", JsonPrimitive(it)) }
request.outputFormat?.takeIf { it.isNotEmpty() }?.let { put("output_format", JsonPrimitive(it)) }
request.seed?.let { put("seed", JsonPrimitive(it)) }
request.normalize?.let { put("apply_text_normalization", JsonPrimitive(it)) }
request.language?.let { put("language_code", JsonPrimitive(it)) }
if (voiceSettingsEntries.isNotEmpty()) {
put("voice_settings", voiceSettingsEntries)
}
}
return payload.toString()
}
private data class ElevenLabsRequest(
val text: String,
val modelId: String?,
val outputFormat: String?,
val speed: Double?,
val stability: Double?,
val similarity: Double?,
val style: Double?,
val speakerBoost: Boolean?,
val seed: Long?,
val normalize: String?,
val language: String?,
)
private object TalkModeRuntime {
fun resolveSpeed(speed: Double?, rateWpm: Int?): Double? {
if (rateWpm != null && rateWpm > 0) {
val resolved = rateWpm.toDouble() / 175.0
if (resolved <= 0.5 || resolved >= 2.0) return null
return resolved
}
if (speed != null) {
if (speed <= 0.5 || speed >= 2.0) return null
return speed
}
return null
}
fun validatedUnit(value: Double?): Double? {
if (value == null) return null
if (value < 0 || value > 1) return null
return value
}
fun validatedSeed(value: Long?): Long? {
if (value == null) return null
if (value < 0 || value > 4294967295L) return null
return value
}
fun validatedNormalize(value: String?): String? {
val normalized = value?.trim()?.lowercase() ?: return null
return if (normalized in listOf("auto", "on", "off")) normalized else null
}
fun validatedLanguage(value: String?): String? {
val normalized = value?.trim()?.lowercase() ?: return null
if (normalized.length != 2) return null
if (!normalized.all { it in 'a'..'z' }) return null
return normalized
}
}
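// While audio is playing, keep a recognizer session alive (without marking the UI
// as listening) so the user can interrupt playback by speaking.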
private fun ensureInterruptListener() {
if (!interruptOnSpeech || !_isEnabled.value) return
mainHandler.post {
if (stopRequested) return@post
if (!SpeechRecognizer.isRecognitionAvailable(context)) return@post
try {
if (recognizer == null) {
recognizer = SpeechRecognizer.createSpeechRecognizer(context).also { it.setRecognitionListener(listener) }
}
recognizer?.cancel()
startListeningInternal(markListening = false)
} catch (_: Throwable) {
// ignore
}
}
}
private val listener =
object : RecognitionListener {
override fun onReadyForSpeech(params: Bundle?) {
if (_isEnabled.value && _isListening.value) {
_statusText.value = "Listening"
}
}
override fun onBeginningOfSpeech() {}
override fun onRmsChanged(rmsdB: Float) {}
override fun onBufferReceived(buffer: ByteArray?) {}
override fun onEndOfSpeech() {
scheduleRestart()
}
override fun onError(error: Int) {
if (stopRequested) return
_isListening.value = false
if (error == SpeechRecognizer.ERROR_INSUFFICIENT_PERMISSIONS) {
_statusText.value = "Microphone permission required"
return
}
_statusText.value =
when (error) {
SpeechRecognizer.ERROR_AUDIO -> "Audio error"
SpeechRecognizer.ERROR_CLIENT -> "Client error"
SpeechRecognizer.ERROR_NETWORK -> "Network error"
SpeechRecognizer.ERROR_NETWORK_TIMEOUT -> "Network timeout"
SpeechRecognizer.ERROR_NO_MATCH -> "Listening"
SpeechRecognizer.ERROR_RECOGNIZER_BUSY -> "Recognizer busy"
SpeechRecognizer.ERROR_SERVER -> "Server error"
SpeechRecognizer.ERROR_SPEECH_TIMEOUT -> "Listening"
else -> "Speech error ($error)"
}
scheduleRestart(delayMs = 600)
}
override fun onResults(results: Bundle?) {
val list = results?.getStringArrayList(SpeechRecognizer.RESULTS_RECOGNITION).orEmpty()
list.firstOrNull()?.let { handleTranscript(it, isFinal = true) }
scheduleRestart()
}
override fun onPartialResults(partialResults: Bundle?) {
val list = partialResults?.getStringArrayList(SpeechRecognizer.RESULTS_RECOGNITION).orEmpty()
list.firstOrNull()?.let { handleTranscript(it, isFinal = false) }
}
override fun onEvent(eventType: Int, params: Bundle?) {}
}
}
private fun JsonElement?.asObjectOrNull(): JsonObject? = this as? JsonObject
private fun JsonElement?.asStringOrNull(): String? = (this as? JsonPrimitive)?.contentOrNull
private fun JsonElement?.asBooleanOrNull(): Boolean? {
val primitive = this as? JsonPrimitive ?: return null
if (primitive.booleanOrNull != null) return primitive.booleanOrNull
val content = primitive.contentOrNull?.trim()?.lowercase() ?: return null
return when (content) {
"true", "yes", "1" -> true
"false", "no", "0" -> false
else -> null
}
}

View File

@@ -0,0 +1,55 @@
package com.steipete.clawdis.node.voice
import org.junit.Assert.assertEquals
import org.junit.Assert.assertNull
import org.junit.Assert.assertTrue
import org.junit.Test
class TalkDirectiveParserTest {
@Test
fun parsesDirectiveAndStripsHeader() {
val input = """
{"voice":"voice-123","once":true}
Hello from talk mode.
""".trimIndent()
val result = TalkDirectiveParser.parse(input)
assertEquals("voice-123", result.directive?.voiceId)
assertEquals(true, result.directive?.once)
assertEquals("Hello from talk mode.", result.stripped.trim())
}
@Test
fun ignoresUnknownKeysButReportsThem() {
val input = """
{"voice":"abc","foo":1,"bar":"baz"}
Hi there.
""".trimIndent()
val result = TalkDirectiveParser.parse(input)
assertEquals("abc", result.directive?.voiceId)
assertTrue(result.unknownKeys.containsAll(listOf("bar", "foo")))
}
@Test
fun parsesAlternateKeys() {
val input = """
{"model_id":"eleven_v3","similarity_boost":0.4,"no_speaker_boost":true,"rate":200}
Speak.
""".trimIndent()
val result = TalkDirectiveParser.parse(input)
assertEquals("eleven_v3", result.directive?.modelId)
assertEquals(0.4, result.directive?.similarity)
assertEquals(false, result.directive?.speakerBoost)
assertEquals(200, result.directive?.rateWpm)
}
@Test
fun returnsNullWhenNoDirectivePresent() {
val input = """
{}
Hello.
""".trimIndent()
val result = TalkDirectiveParser.parse(input)
assertNull(result.directive)
assertEquals(input, result.stripped)
}
}