feat: add talk mode across nodes

Peter Steinberger
2025-12-29 23:21:05 +01:00
parent 6927b0fb8d
commit 20d7882033
26 changed files with 3087 additions and 0 deletions

View File

@@ -35,6 +35,10 @@ class MainViewModel(app: Application) : AndroidViewModel(app) {
val voiceWakeMode: StateFlow<VoiceWakeMode> = runtime.voiceWakeMode
val voiceWakeStatusText: StateFlow<String> = runtime.voiceWakeStatusText
val voiceWakeIsListening: StateFlow<Boolean> = runtime.voiceWakeIsListening
val talkEnabled: StateFlow<Boolean> = runtime.talkEnabled
val talkStatusText: StateFlow<String> = runtime.talkStatusText
val talkIsListening: StateFlow<Boolean> = runtime.talkIsListening
val talkIsSpeaking: StateFlow<Boolean> = runtime.talkIsSpeaking
val manualEnabled: StateFlow<Boolean> = runtime.manualEnabled
val manualHost: StateFlow<String> = runtime.manualHost
val manualPort: StateFlow<Int> = runtime.manualPort
@@ -95,6 +99,10 @@ class MainViewModel(app: Application) : AndroidViewModel(app) {
runtime.setVoiceWakeMode(mode)
}
fun setTalkEnabled(enabled: Boolean) {
runtime.setTalkEnabled(enabled)
}
fun connect(endpoint: BridgeEndpoint) {
runtime.connect(endpoint)
}

View File

@@ -25,6 +25,7 @@ import com.steipete.clawdis.node.protocol.ClawdisCanvasA2UIAction
import com.steipete.clawdis.node.protocol.ClawdisCanvasA2UICommand
import com.steipete.clawdis.node.protocol.ClawdisCanvasCommand
import com.steipete.clawdis.node.protocol.ClawdisScreenCommand
import com.steipete.clawdis.node.voice.TalkModeManager
import com.steipete.clawdis.node.voice.VoiceWakeManager
import kotlinx.coroutines.CoroutineScope
import kotlinx.coroutines.Dispatchers
@@ -84,6 +85,15 @@ class NodeRuntime(context: Context) {
val voiceWakeStatusText: StateFlow<String>
get() = voiceWake.statusText
val talkStatusText: StateFlow<String>
get() = talkMode.statusText
val talkIsListening: StateFlow<Boolean>
get() = talkMode.isListening
val talkIsSpeaking: StateFlow<Boolean>
get() = talkMode.isSpeaking
private val discovery = BridgeDiscovery(appContext, scope = scope)
val bridges: StateFlow<List<BridgeEndpoint>> = discovery.bridges
val discoveryStatusText: StateFlow<String> = discovery.statusText
@@ -133,6 +143,9 @@ class NodeRuntime(context: Context) {
)
private val chat = ChatController(scope = scope, session = session, json = json)
private val talkMode: TalkModeManager by lazy {
TalkModeManager(context = appContext, scope = scope).also { it.attachSession(session) }
}
private fun handleSessionDisconnected(message: String) {
_statusText.value = message
@@ -163,6 +176,7 @@ class NodeRuntime(context: Context) {
val preventSleep: StateFlow<Boolean> = prefs.preventSleep
val wakeWords: StateFlow<List<String>> = prefs.wakeWords
val voiceWakeMode: StateFlow<VoiceWakeMode> = prefs.voiceWakeMode
val talkEnabled: StateFlow<Boolean> = prefs.talkEnabled
val manualEnabled: StateFlow<Boolean> = prefs.manualEnabled
val manualHost: StateFlow<String> = prefs.manualHost
val manualPort: StateFlow<Int> = prefs.manualPort
@@ -218,6 +232,13 @@ class NodeRuntime(context: Context) {
}
}
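// Keep TalkModeManager in sync with the persisted preference and mark external
// audio capture active while talk mode owns the microphone.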
scope.launch {
talkEnabled.collect { enabled ->
talkMode.setEnabled(enabled)
externalAudioCaptureActive.value = enabled
}
}
scope.launch(Dispatchers.Default) {
bridges.collect { list ->
if (list.isNotEmpty()) {
@@ -311,6 +332,10 @@ class NodeRuntime(context: Context) {
prefs.setVoiceWakeMode(mode)
}
fun setTalkEnabled(value: Boolean) {
prefs.setTalkEnabled(value)
}
fun connect(endpoint: BridgeEndpoint) {
scope.launch {
_statusText.value = "Connecting…"
@@ -548,6 +573,7 @@ class NodeRuntime(context: Context) {
return
}
talkMode.handleBridgeEvent(event, payloadJson)
chat.handleBridgeEvent(event, payloadJson)
}

View File

@@ -73,6 +73,9 @@ class SecurePrefs(context: Context) {
private val _voiceWakeMode = MutableStateFlow(loadVoiceWakeMode())
val voiceWakeMode: StateFlow<VoiceWakeMode> = _voiceWakeMode
private val _talkEnabled = MutableStateFlow(prefs.getBoolean("talk.enabled", false))
val talkEnabled: StateFlow<Boolean> = _talkEnabled
fun setLastDiscoveredStableId(value: String) {
val trimmed = value.trim()
prefs.edit { putString("bridge.lastDiscoveredStableId", trimmed) }
@@ -158,6 +161,11 @@ class SecurePrefs(context: Context) {
_voiceWakeMode.value = mode
}
fun setTalkEnabled(value: Boolean) {
prefs.edit { putBoolean("talk.enabled", value) }
_talkEnabled.value = value
}
private fun loadVoiceWakeMode(): VoiceWakeMode {
val raw = prefs.getString(voiceWakeModeKey, null)
val resolved = VoiceWakeMode.fromRawValue(raw)

View File

@@ -62,6 +62,8 @@ fun SettingsSheet(viewModel: MainViewModel) {
val wakeWords by viewModel.wakeWords.collectAsState()
val voiceWakeMode by viewModel.voiceWakeMode.collectAsState()
val voiceWakeStatusText by viewModel.voiceWakeStatusText.collectAsState()
val talkEnabled by viewModel.talkEnabled.collectAsState()
val talkStatusText by viewModel.talkStatusText.collectAsState()
val isConnected by viewModel.isConnected.collectAsState()
val manualEnabled by viewModel.manualEnabled.collectAsState()
val manualHost by viewModel.manualHost.collectAsState()
@@ -307,6 +309,28 @@ fun SettingsSheet(viewModel: MainViewModel) {
// Voice
item { Text("Voice", style = MaterialTheme.typography.titleSmall) }
item {
ListItem(
headlineContent = { Text("Talk Mode") },
supportingContent = { Text(talkStatusText) },
trailingContent = {
Switch(
checked = talkEnabled,
onCheckedChange = { on ->
if (on) {
val micOk =
ContextCompat.checkSelfPermission(context, Manifest.permission.RECORD_AUDIO) ==
PackageManager.PERMISSION_GRANTED
if (!micOk) audioPermissionLauncher.launch(Manifest.permission.RECORD_AUDIO)
viewModel.setTalkEnabled(true)
} else {
viewModel.setTalkEnabled(false)
}
},
)
},
)
}
item {
val enabled = voiceWakeMode != VoiceWakeMode.Off
ListItem(

View File

@@ -0,0 +1,194 @@
package com.steipete.clawdis.node.voice
import kotlinx.serialization.json.Json
import kotlinx.serialization.json.JsonElement
import kotlinx.serialization.json.JsonObject
import kotlinx.serialization.json.JsonPrimitive
import kotlinx.serialization.json.booleanOrNull
import kotlinx.serialization.json.contentOrNull
import kotlinx.serialization.json.doubleOrNull
import kotlinx.serialization.json.intOrNull
import kotlinx.serialization.json.longOrNull
private val directiveJson = Json { ignoreUnknownKeys = true }
data class TalkDirective(
val voiceId: String? = null,
val modelId: String? = null,
val speed: Double? = null,
val rateWpm: Int? = null,
val stability: Double? = null,
val similarity: Double? = null,
val style: Double? = null,
val speakerBoost: Boolean? = null,
val seed: Long? = null,
val normalize: String? = null,
val language: String? = null,
val outputFormat: String? = null,
val latencyTier: Int? = null,
val once: Boolean? = null,
)
data class TalkDirectiveParseResult(
val directive: TalkDirective?,
val stripped: String,
val unknownKeys: List<String>,
)
object TalkDirectiveParser {
fun parse(text: String): TalkDirectiveParseResult {
val normalized = text.replace("\r\n", "\n")
val lines = normalized.split("\n").toMutableList()
if (lines.isEmpty()) return TalkDirectiveParseResult(null, text, emptyList())
val firstNonEmpty = lines.indexOfFirst { it.trim().isNotEmpty() }
if (firstNonEmpty == -1) return TalkDirectiveParseResult(null, text, emptyList())
val head = lines[firstNonEmpty].trim()
if (!head.startsWith("{") || !head.endsWith("}")) {
return TalkDirectiveParseResult(null, text, emptyList())
}
val obj = parseJsonObject(head) ?: return TalkDirectiveParseResult(null, text, emptyList())
val speakerBoost =
boolValue(obj, listOf("speaker_boost", "speakerBoost"))
?: boolValue(obj, listOf("no_speaker_boost", "noSpeakerBoost"))?.not()
val directive = TalkDirective(
voiceId = stringValue(obj, listOf("voice", "voice_id", "voiceId")),
modelId = stringValue(obj, listOf("model", "model_id", "modelId")),
speed = doubleValue(obj, listOf("speed")),
rateWpm = intValue(obj, listOf("rate", "wpm")),
stability = doubleValue(obj, listOf("stability")),
similarity = doubleValue(obj, listOf("similarity", "similarity_boost", "similarityBoost")),
style = doubleValue(obj, listOf("style")),
speakerBoost = speakerBoost,
seed = longValue(obj, listOf("seed")),
normalize = stringValue(obj, listOf("normalize", "apply_text_normalization")),
language = stringValue(obj, listOf("lang", "language_code", "language")),
outputFormat = stringValue(obj, listOf("output_format", "format")),
latencyTier = intValue(obj, listOf("latency", "latency_tier", "latencyTier")),
once = boolValue(obj, listOf("once")),
)
val hasDirective = listOf(
directive.voiceId,
directive.modelId,
directive.speed,
directive.rateWpm,
directive.stability,
directive.similarity,
directive.style,
directive.speakerBoost,
directive.seed,
directive.normalize,
directive.language,
directive.outputFormat,
directive.latencyTier,
directive.once,
).any { it != null }
if (!hasDirective) return TalkDirectiveParseResult(null, text, emptyList())
val knownKeys = setOf(
"voice", "voice_id", "voiceid",
"model", "model_id", "modelid",
"speed", "rate", "wpm",
"stability", "similarity", "similarity_boost", "similarityboost",
"style",
"speaker_boost", "speakerboost",
"no_speaker_boost", "nospeakerboost",
"seed",
"normalize", "apply_text_normalization",
"lang", "language_code", "language",
"output_format", "format",
"latency", "latency_tier", "latencytier",
"once",
)
val unknownKeys = obj.keys.filter { !knownKeys.contains(it.lowercase()) }.sorted()
lines.removeAt(firstNonEmpty)
if (firstNonEmpty < lines.size) {
if (lines[firstNonEmpty].trim().isEmpty()) {
lines.removeAt(firstNonEmpty)
}
}
return TalkDirectiveParseResult(directive, lines.joinToString("\n"), unknownKeys)
}
private fun parseJsonObject(line: String): JsonObject? {
return try {
directiveJson.parseToJsonElement(line) as? JsonObject
} catch (_: Throwable) {
null
}
}
private fun stringValue(obj: JsonObject, keys: List<String>): String? {
for (key in keys) {
val value = obj[key].asStringOrNull()?.trim()
if (!value.isNullOrEmpty()) return value
}
return null
}
private fun doubleValue(obj: JsonObject, keys: List<String>): Double? {
for (key in keys) {
val value = obj[key].asDoubleOrNull()
if (value != null) return value
}
return null
}
private fun intValue(obj: JsonObject, keys: List<String>): Int? {
for (key in keys) {
val value = obj[key].asIntOrNull()
if (value != null) return value
}
return null
}
private fun longValue(obj: JsonObject, keys: List<String>): Long? {
for (key in keys) {
val value = obj[key].asLongOrNull()
if (value != null) return value
}
return null
}
private fun boolValue(obj: JsonObject, keys: List<String>): Boolean? {
for (key in keys) {
val value = obj[key].asBooleanOrNull()
if (value != null) return value
}
return null
}
}
private fun JsonElement?.asStringOrNull(): String? = (this as? JsonPrimitive)?.contentOrNull
private fun JsonElement?.asDoubleOrNull(): Double? {
val primitive = this as? JsonPrimitive ?: return null
if (primitive.isString) return primitive.content.toDoubleOrNull()
return primitive.doubleOrNull
}
private fun JsonElement?.asIntOrNull(): Int? {
val primitive = this as? JsonPrimitive ?: return null
if (primitive.isString) return primitive.content.toIntOrNull()
return primitive.intOrNull
}
private fun JsonElement?.asLongOrNull(): Long? {
val primitive = this as? JsonPrimitive ?: return null
if (primitive.isString) return primitive.content.toLongOrNull()
return primitive.longOrNull
}
private fun JsonElement?.asBooleanOrNull(): Boolean? {
val primitive = this as? JsonPrimitive ?: return null
if (primitive.booleanOrNull != null) return primitive.booleanOrNull
val content = primitive.contentOrNull?.trim()?.lowercase() ?: return null
return when (content) {
"true", "yes", "1" -> true
"false", "no", "0" -> false
else -> null
}
}
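// Usage sketch (hypothetical values): parse an assistant reply whose first line
// is a directive header; the header is stripped from the spoken text.
val exampleReply = "{\"voice\":\"voice-123\",\"rate\":210,\"once\":true}\nDone - bridge restarted."
val exampleResult = TalkDirectiveParser.parse(exampleReply)
// exampleResult.directive: voiceId="voice-123", rateWpm=210 ("rate" is an accepted alias), once=true
// exampleResult.stripped:  "Done - bridge restarted."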

View File

@@ -0,0 +1,713 @@
package com.steipete.clawdis.node.voice
import android.Manifest
import android.content.Context
import android.content.Intent
import android.content.pm.PackageManager
import android.media.AudioAttributes
import android.media.MediaPlayer
import android.os.Bundle
import android.os.Handler
import android.os.Looper
import android.os.SystemClock
import android.speech.RecognitionListener
import android.speech.RecognizerIntent
import android.speech.SpeechRecognizer
import android.util.Log
import androidx.core.content.ContextCompat
import com.steipete.clawdis.node.bridge.BridgeSession
import java.io.File
import java.net.HttpURLConnection
import java.net.URL
import java.util.UUID
import kotlinx.coroutines.CompletableDeferred
import kotlinx.coroutines.CoroutineScope
import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.Job
import kotlinx.coroutines.delay
import kotlinx.coroutines.flow.MutableStateFlow
import kotlinx.coroutines.flow.StateFlow
import kotlinx.coroutines.launch
import kotlinx.coroutines.withContext
import kotlinx.serialization.json.Json
import kotlinx.serialization.json.JsonArray
import kotlinx.serialization.json.JsonElement
import kotlinx.serialization.json.JsonObject
import kotlinx.serialization.json.JsonPrimitive
import kotlinx.serialization.json.booleanOrNull
import kotlinx.serialization.json.buildJsonObject
import kotlinx.serialization.json.contentOrNull
class TalkModeManager(
private val context: Context,
private val scope: CoroutineScope,
) {
companion object {
private const val tag = "TalkMode"
}
private val mainHandler = Handler(Looper.getMainLooper())
private val json = Json { ignoreUnknownKeys = true }
private val _isEnabled = MutableStateFlow(false)
val isEnabled: StateFlow<Boolean> = _isEnabled
private val _isListening = MutableStateFlow(false)
val isListening: StateFlow<Boolean> = _isListening
private val _isSpeaking = MutableStateFlow(false)
val isSpeaking: StateFlow<Boolean> = _isSpeaking
private val _statusText = MutableStateFlow("Off")
val statusText: StateFlow<String> = _statusText
private var recognizer: SpeechRecognizer? = null
private var restartJob: Job? = null
private var stopRequested = false
private var listeningMode = false
private var silenceJob: Job? = null
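// An utterance is treated as complete after 700 ms with no new speech; the
// silence monitor polls every 200 ms (see startSilenceMonitor/checkSilence).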
private val silenceWindowMs = 700L
private var lastTranscript: String = ""
private var lastHeardAtMs: Long? = null
private var lastSpokenText: String? = null
private var lastInterruptedAtSeconds: Double? = null
private var defaultVoiceId: String? = null
private var currentVoiceId: String? = null
private var defaultModelId: String? = null
private var currentModelId: String? = null
private var defaultOutputFormat: String? = null
private var interruptOnSpeech: Boolean = true
private var voiceOverrideActive = false
private var modelOverrideActive = false
private var session: BridgeSession? = null
private var pendingRunId: String? = null
private var pendingFinal: CompletableDeferred<Boolean>? = null
private var player: MediaPlayer? = null
private var currentAudioFile: File? = null
fun attachSession(session: BridgeSession) {
this.session = session
}
fun setEnabled(enabled: Boolean) {
if (_isEnabled.value == enabled) return
_isEnabled.value = enabled
if (enabled) {
start()
} else {
stop()
}
}
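// Completes the pending chat run when the bridge's "chat" event reaches its
// terminal state, e.g. {"runId":"<pending>","state":"final"}.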
fun handleBridgeEvent(event: String, payloadJson: String?) {
if (event != "chat") return
if (payloadJson.isNullOrBlank()) return
val pending = pendingRunId ?: return
val obj =
try {
json.parseToJsonElement(payloadJson).asObjectOrNull()
} catch (_: Throwable) {
null
} ?: return
val runId = obj["runId"].asStringOrNull() ?: return
if (runId != pending) return
val state = obj["state"].asStringOrNull() ?: return
if (state == "final") {
pendingFinal?.complete(true)
pendingFinal = null
pendingRunId = null
}
}
private fun start() {
mainHandler.post {
// Also guard on isEnabled: finalizeTranscript() restarts unconditionally and
// must not re-arm the microphone after the user turned Talk Mode off.
if (!_isEnabled.value || _isListening.value) return@post
stopRequested = false
listeningMode = true
if (!SpeechRecognizer.isRecognitionAvailable(context)) {
_statusText.value = "Speech recognizer unavailable"
return@post
}
val micOk =
ContextCompat.checkSelfPermission(context, Manifest.permission.RECORD_AUDIO) ==
PackageManager.PERMISSION_GRANTED
if (!micOk) {
_statusText.value = "Microphone permission required"
return@post
}
try {
recognizer?.destroy()
recognizer = SpeechRecognizer.createSpeechRecognizer(context).also { it.setRecognitionListener(listener) }
startListeningInternal(markListening = true)
startSilenceMonitor()
} catch (err: Throwable) {
_statusText.value = "Start failed: ${err.message ?: err::class.simpleName}"
}
}
}
private fun stop() {
stopRequested = true
listeningMode = false
restartJob?.cancel()
restartJob = null
silenceJob?.cancel()
silenceJob = null
lastTranscript = ""
lastHeardAtMs = null
_isListening.value = false
_statusText.value = "Off"
stopSpeaking()
mainHandler.post {
recognizer?.cancel()
recognizer?.destroy()
recognizer = null
}
}
private fun startListeningInternal(markListening: Boolean) {
val r = recognizer ?: return
val intent =
Intent(RecognizerIntent.ACTION_RECOGNIZE_SPEECH).apply {
putExtra(RecognizerIntent.EXTRA_LANGUAGE_MODEL, RecognizerIntent.LANGUAGE_MODEL_FREE_FORM)
putExtra(RecognizerIntent.EXTRA_PARTIAL_RESULTS, true)
putExtra(RecognizerIntent.EXTRA_MAX_RESULTS, 3)
putExtra(RecognizerIntent.EXTRA_CALLING_PACKAGE, context.packageName)
}
if (markListening) {
_statusText.value = "Listening"
_isListening.value = true
}
r.startListening(intent)
}
private fun scheduleRestart(delayMs: Long = 350) {
if (stopRequested) return
restartJob?.cancel()
restartJob =
scope.launch {
delay(delayMs)
mainHandler.post {
if (stopRequested) return@post
try {
recognizer?.cancel()
val shouldListen = listeningMode
val shouldInterrupt = _isSpeaking.value && interruptOnSpeech
if (!shouldListen && !shouldInterrupt) return@post
startListeningInternal(markListening = shouldListen)
} catch (_: Throwable) {
// handled by onError
}
}
}
}
private fun handleTranscript(text: String, isFinal: Boolean) {
val trimmed = text.trim()
if (_isSpeaking.value && interruptOnSpeech) {
if (shouldInterrupt(trimmed)) {
stopSpeaking()
}
return
}
if (!_isListening.value) return
if (trimmed.isNotEmpty()) {
lastTranscript = trimmed
lastHeardAtMs = SystemClock.elapsedRealtime()
}
if (isFinal) {
lastTranscript = trimmed
}
}
private fun startSilenceMonitor() {
silenceJob?.cancel()
silenceJob =
scope.launch {
while (_isEnabled.value) {
delay(200)
checkSilence()
}
}
}
private fun checkSilence() {
if (!_isListening.value) return
val transcript = lastTranscript.trim()
if (transcript.isEmpty()) return
val lastHeard = lastHeardAtMs ?: return
val elapsed = SystemClock.elapsedRealtime() - lastHeard
if (elapsed < silenceWindowMs) return
scope.launch { finalizeTranscript(transcript) }
}
private suspend fun finalizeTranscript(transcript: String) {
listeningMode = false
_isListening.value = false
_statusText.value = "Thinking…"
lastTranscript = ""
lastHeardAtMs = null
reloadConfig()
val prompt = buildPrompt(transcript)
val bridge = session
if (bridge == null) {
_statusText.value = "Bridge not connected"
start()
return
}
try {
val runId = sendChat(prompt, bridge)
val ok = waitForChatFinal(runId)
if (!ok) {
_statusText.value = "No reply"
start()
return
}
val assistant = fetchLatestAssistantText(bridge)
if (assistant.isNullOrBlank()) {
_statusText.value = "No reply"
start()
return
}
playAssistant(assistant)
} catch (err: Throwable) {
_statusText.value = "Talk failed: ${err.message ?: err::class.simpleName}"
}
if (_isEnabled.value) {
start()
}
}
private fun buildPrompt(transcript: String): String {
val lines = mutableListOf(
"Talk Mode active. Reply in a concise, spoken tone.",
"You may optionally prefix the response with JSON (first line) to set ElevenLabs voice, e.g. {\"voice\":\"<id>\",\"once\":true}.",
)
lastInterruptedAtSeconds?.let {
lines.add("Assistant speech interrupted at ${"%.1f".format(it)}s.")
lastInterruptedAtSeconds = null
}
lines.add("")
lines.add(transcript)
return lines.joinToString("\n")
}
private suspend fun sendChat(message: String, bridge: BridgeSession): String {
val runId = UUID.randomUUID().toString()
val params =
buildJsonObject {
put("sessionKey", JsonPrimitive("main"))
put("message", JsonPrimitive(message))
put("thinking", JsonPrimitive("low"))
put("timeoutMs", JsonPrimitive(30_000))
put("idempotencyKey", JsonPrimitive(runId))
}
val res = bridge.request("chat.send", params.toString())
// Prefer the server-assigned runId; fall back to our idempotency key. The
// pending run is registered in waitForChatFinal().
return parseRunId(res) ?: runId
}
private suspend fun waitForChatFinal(runId: String): Boolean {
pendingFinal?.cancel()
val deferred = CompletableDeferred<Boolean>()
pendingRunId = runId
pendingFinal = deferred
val result =
withContext(Dispatchers.IO) {
try {
kotlinx.coroutines.withTimeout(120_000) { deferred.await() }
} catch (_: Throwable) {
false
}
}
if (!result) {
pendingFinal = null
pendingRunId = null
}
return result
}
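// The final event does not carry the reply text, so fetch chat.history and read
// the newest assistant message's concatenated text blocks.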
private suspend fun fetchLatestAssistantText(bridge: BridgeSession): String? {
val res = bridge.request("chat.history", "{\"sessionKey\":\"main\"}")
val root = json.parseToJsonElement(res).asObjectOrNull() ?: return null
val messages = root["messages"] as? JsonArray ?: return null
for (item in messages.reversed()) {
val obj = item.asObjectOrNull() ?: continue
if (obj["role"].asStringOrNull() != "assistant") continue
val content = obj["content"] as? JsonArray ?: continue
val text =
content.mapNotNull { entry ->
entry.asObjectOrNull()?.get("text")?.asStringOrNull()?.trim()
}.filter { it.isNotEmpty() }
if (text.isNotEmpty()) return text.joinToString("\n")
}
return null
}
private suspend fun playAssistant(text: String) {
val parsed = TalkDirectiveParser.parse(text)
if (parsed.unknownKeys.isNotEmpty()) {
Log.w(tag, "Unknown talk directive keys: ${parsed.unknownKeys}")
}
val directive = parsed.directive
val cleaned = parsed.stripped.trim()
if (cleaned.isEmpty()) return
if (directive?.voiceId != null && directive.once != true) {
currentVoiceId = directive.voiceId
voiceOverrideActive = true
}
if (directive?.modelId != null && directive.once != true) {
currentModelId = directive.modelId
modelOverrideActive = true
}
val voiceId = directive?.voiceId ?: currentVoiceId ?: defaultVoiceId
if (voiceId.isNullOrBlank()) {
_statusText.value = "Missing voice ID"
return
}
val apiKey = System.getenv("ELEVENLABS_API_KEY")?.trim()
if (apiKey.isNullOrEmpty()) {
_statusText.value = "Missing ELEVENLABS_API_KEY"
return
}
_statusText.value = "Speaking…"
_isSpeaking.value = true
lastSpokenText = cleaned
ensureInterruptListener()
try {
val request =
ElevenLabsRequest(
text = cleaned,
modelId = directive?.modelId ?: currentModelId ?: defaultModelId,
outputFormat = directive?.outputFormat ?: defaultOutputFormat,
speed = TalkModeRuntime.resolveSpeed(directive?.speed, directive?.rateWpm),
stability = TalkModeRuntime.validatedUnit(directive?.stability),
similarity = TalkModeRuntime.validatedUnit(directive?.similarity),
style = TalkModeRuntime.validatedUnit(directive?.style),
speakerBoost = directive?.speakerBoost,
seed = TalkModeRuntime.validatedSeed(directive?.seed),
normalize = TalkModeRuntime.validatedNormalize(directive?.normalize),
language = TalkModeRuntime.validatedLanguage(directive?.language),
)
val audio = synthesize(voiceId = voiceId, apiKey = apiKey, request = request)
playAudio(audio)
} catch (err: Throwable) {
_statusText.value = "Speak failed: ${err.message ?: err::class.simpleName}"
}
_isSpeaking.value = false
}
private suspend fun playAudio(data: ByteArray) {
stopSpeaking(resetInterrupt = false)
val file = File.createTempFile("talk-", ".mp3", context.cacheDir)
file.writeBytes(data)
currentAudioFile = file
val player = MediaPlayer()
this.player = player
val finished = CompletableDeferred<Unit>()
player.setAudioAttributes(
AudioAttributes.Builder()
.setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
.setUsage(AudioAttributes.USAGE_ASSISTANT)
.build(),
)
player.setOnCompletionListener {
finished.complete(Unit)
}
player.setOnErrorListener { _, _, _ ->
finished.completeExceptionally(IllegalStateException("MediaPlayer error"))
true
}
player.setDataSource(file.absolutePath)
withContext(Dispatchers.Main) {
player.setOnPreparedListener { it.start() }
player.prepareAsync()
}
try {
finished.await()
} finally {
cleanupPlayer()
}
}
private fun stopSpeaking(resetInterrupt: Boolean = true) {
if (!_isSpeaking.value) {
cleanupPlayer()
return
}
if (resetInterrupt) {
val currentMs = player?.currentPosition?.toDouble() ?: 0.0
lastInterruptedAtSeconds = currentMs / 1000.0
}
cleanupPlayer()
_isSpeaking.value = false
}
private fun cleanupPlayer() {
try {
player?.stop()
} catch (_: IllegalStateException) {
// stop() throws if the player never reached the started state.
}
player?.release()
player = null
currentAudioFile?.delete()
currentAudioFile = null
}
private fun shouldInterrupt(transcript: String): Boolean {
val trimmed = transcript.trim()
if (trimmed.length < 3) return false
val spoken = lastSpokenText?.lowercase()
if (spoken != null && spoken.contains(trimmed.lowercase())) return false
return true
}
private suspend fun reloadConfig() {
val bridge = session ?: return
val envVoice = System.getenv("ELEVENLABS_VOICE_ID")?.trim()
val sagVoice = System.getenv("SAG_VOICE_ID")?.trim()
try {
val res = bridge.request("config.get", "{}")
val root = json.parseToJsonElement(res).asObjectOrNull()
val config = root?.get("config").asObjectOrNull()
val talk = config?.get("talk").asObjectOrNull()
val voice = talk?.get("voiceId")?.asStringOrNull()?.trim()?.takeIf { it.isNotEmpty() }
val model = talk?.get("modelId")?.asStringOrNull()?.trim()?.takeIf { it.isNotEmpty() }
val outputFormat = talk?.get("outputFormat")?.asStringOrNull()?.trim()?.takeIf { it.isNotEmpty() }
val interrupt = talk?.get("interruptOnSpeech")?.asBooleanOrNull()
defaultVoiceId = voice ?: envVoice?.takeIf { it.isNotEmpty() } ?: sagVoice?.takeIf { it.isNotEmpty() }
if (!voiceOverrideActive) currentVoiceId = defaultVoiceId
defaultModelId = model
if (!modelOverrideActive) currentModelId = defaultModelId
defaultOutputFormat = outputFormat
if (interrupt != null) interruptOnSpeech = interrupt
} catch (_: Throwable) {
defaultVoiceId = envVoice?.takeIf { it.isNotEmpty() } ?: sagVoice?.takeIf { it.isNotEmpty() }
}
}
private fun parseRunId(jsonString: String): String? {
val obj =
try {
json.parseToJsonElement(jsonString).asObjectOrNull()
} catch (_: Throwable) {
null
} ?: return null
return obj["runId"].asStringOrNull()
}
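// POSTs to the ElevenLabs text-to-speech endpoint; returns the synthesized audio
// bytes, or throws with the API's error body on HTTP >= 400.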
private suspend fun synthesize(voiceId: String, apiKey: String, request: ElevenLabsRequest): ByteArray {
return withContext(Dispatchers.IO) {
val url = URL("https://api.elevenlabs.io/v1/text-to-speech/$voiceId")
val conn = url.openConnection() as HttpURLConnection
conn.requestMethod = "POST"
conn.setRequestProperty("Content-Type", "application/json")
conn.setRequestProperty("Accept", "audio/mpeg")
conn.setRequestProperty("xi-api-key", apiKey)
conn.doOutput = true
val payload = buildRequestPayload(request)
conn.outputStream.use { it.write(payload.toByteArray()) }
val code = conn.responseCode
val stream = if (code >= 400) conn.errorStream else conn.inputStream
val data = stream.readBytes()
if (code >= 400) {
val message = String(data)
throw IllegalStateException("ElevenLabs failed: $code $message")
}
data
}
}
private fun buildRequestPayload(request: ElevenLabsRequest): String {
val voiceSettingsEntries =
buildJsonObject {
request.speed?.let { put("speed", JsonPrimitive(it)) }
request.stability?.let { put("stability", JsonPrimitive(it)) }
request.similarity?.let { put("similarity_boost", JsonPrimitive(it)) }
request.style?.let { put("style", JsonPrimitive(it)) }
request.speakerBoost?.let { put("use_speaker_boost", JsonPrimitive(it)) }
}
val payload =
buildJsonObject {
put("text", JsonPrimitive(request.text))
request.modelId?.takeIf { it.isNotEmpty() }?.let { put("model_id", JsonPrimitive(it)) }
request.outputFormat?.takeIf { it.isNotEmpty() }?.let { put("output_format", JsonPrimitive(it)) }
request.seed?.let { put("seed", JsonPrimitive(it)) }
request.normalize?.let { put("apply_text_normalization", JsonPrimitive(it)) }
request.language?.let { put("language_code", JsonPrimitive(it)) }
if (voiceSettingsEntries.isNotEmpty()) {
put("voice_settings", voiceSettingsEntries)
}
}
return payload.toString()
}
private data class ElevenLabsRequest(
val text: String,
val modelId: String?,
val outputFormat: String?,
val speed: Double?,
val stability: Double?,
val similarity: Double?,
val style: Double?,
val speakerBoost: Boolean?,
val seed: Long?,
val normalize: String?,
val language: String?,
)
private object TalkModeRuntime {
fun resolveSpeed(speed: Double?, rateWpm: Int?): Double? {
if (rateWpm != null && rateWpm > 0) {
val resolved = rateWpm.toDouble() / 175.0
if (resolved <= 0.5 || resolved >= 2.0) return null
return resolved
}
if (speed != null) {
if (speed <= 0.5 || speed >= 2.0) return null
return speed
}
return null
}
fun validatedUnit(value: Double?): Double? {
if (value == null) return null
if (value < 0 || value > 1) return null
return value
}
fun validatedSeed(value: Long?): Long? {
if (value == null) return null
if (value < 0 || value > 4294967295L) return null
return value
}
fun validatedNormalize(value: String?): String? {
val normalized = value?.trim()?.lowercase() ?: return null
return if (normalized in listOf("auto", "on", "off")) normalized else null
}
fun validatedLanguage(value: String?): String? {
val normalized = value?.trim()?.lowercase() ?: return null
if (normalized.length != 2) return null
if (!normalized.all { it in 'a'..'z' }) return null
return normalized
}
}
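// Keeps a recognizer active (without flipping isListening) while TTS plays so
// user speech can barge in; handleTranscript() then stops playback.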
private fun ensureInterruptListener() {
if (!interruptOnSpeech || !_isEnabled.value) return
mainHandler.post {
if (stopRequested) return@post
if (!SpeechRecognizer.isRecognitionAvailable(context)) return@post
try {
if (recognizer == null) {
recognizer = SpeechRecognizer.createSpeechRecognizer(context).also { it.setRecognitionListener(listener) }
}
recognizer?.cancel()
startListeningInternal(markListening = false)
} catch (_: Throwable) {
// ignore
}
}
}
private val listener =
object : RecognitionListener {
override fun onReadyForSpeech(params: Bundle?) {
if (_isEnabled.value) {
_statusText.value = if (_isListening.value) "Listening" else _statusText.value
}
}
override fun onBeginningOfSpeech() {}
override fun onRmsChanged(rmsdB: Float) {}
override fun onBufferReceived(buffer: ByteArray?) {}
override fun onEndOfSpeech() {
scheduleRestart()
}
override fun onError(error: Int) {
if (stopRequested) return
_isListening.value = false
if (error == SpeechRecognizer.ERROR_INSUFFICIENT_PERMISSIONS) {
_statusText.value = "Microphone permission required"
return
}
_statusText.value =
when (error) {
SpeechRecognizer.ERROR_AUDIO -> "Audio error"
SpeechRecognizer.ERROR_CLIENT -> "Client error"
SpeechRecognizer.ERROR_NETWORK -> "Network error"
SpeechRecognizer.ERROR_NETWORK_TIMEOUT -> "Network timeout"
SpeechRecognizer.ERROR_NO_MATCH -> "Listening"
SpeechRecognizer.ERROR_RECOGNIZER_BUSY -> "Recognizer busy"
SpeechRecognizer.ERROR_SERVER -> "Server error"
SpeechRecognizer.ERROR_SPEECH_TIMEOUT -> "Listening"
else -> "Speech error ($error)"
}
scheduleRestart(delayMs = 600)
}
override fun onResults(results: Bundle?) {
val list = results?.getStringArrayList(SpeechRecognizer.RESULTS_RECOGNITION).orEmpty()
list.firstOrNull()?.let { handleTranscript(it, isFinal = true) }
scheduleRestart()
}
override fun onPartialResults(partialResults: Bundle?) {
val list = partialResults?.getStringArrayList(SpeechRecognizer.RESULTS_RECOGNITION).orEmpty()
list.firstOrNull()?.let { handleTranscript(it, isFinal = false) }
}
override fun onEvent(eventType: Int, params: Bundle?) {}
}
}
private fun JsonElement?.asObjectOrNull(): JsonObject? = this as? JsonObject
private fun JsonElement?.asStringOrNull(): String? = (this as? JsonPrimitive)?.contentOrNull
private fun JsonElement?.asBooleanOrNull(): Boolean? {
val primitive = this as? JsonPrimitive ?: return null
if (primitive.booleanOrNull != null) return primitive.booleanOrNull
val content = primitive.contentOrNull?.trim()?.lowercase() ?: return null
return when (content) {
"true", "yes", "1" -> true
"false", "no", "0" -> false
else -> null
}
}
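// Sketch of the clamping rules above (hypothetical inputs):
// resolveSpeed(speed = null, rateWpm = 210) -> 1.2 (210 / 175-wpm baseline)
// resolveSpeed(speed = 3.0, rateWpm = null) -> null (must lie strictly inside 0.5..2.0)
// validatedUnit(1.4)                        -> null (stability/similarity/style live in 0..1)
// validatedNormalize("AUTO")                -> "auto" ("auto" | "on" | "off")
// validatedLanguage("en")                   -> "en" (two-letter codes only)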

View File

@@ -0,0 +1,55 @@
package com.steipete.clawdis.node.voice
import org.junit.Assert.assertEquals
import org.junit.Assert.assertNull
import org.junit.Assert.assertTrue
import org.junit.Test
class TalkDirectiveParserTest {
@Test
fun parsesDirectiveAndStripsHeader() {
val input = """
{"voice":"voice-123","once":true}
Hello from talk mode.
""".trimIndent()
val result = TalkDirectiveParser.parse(input)
assertEquals("voice-123", result.directive?.voiceId)
assertEquals(true, result.directive?.once)
assertEquals("Hello from talk mode.", result.stripped.trim())
}
@Test
fun ignoresUnknownKeysButReportsThem() {
val input = """
{"voice":"abc","foo":1,"bar":"baz"}
Hi there.
""".trimIndent()
val result = TalkDirectiveParser.parse(input)
assertEquals("abc", result.directive?.voiceId)
assertTrue(result.unknownKeys.containsAll(listOf("bar", "foo")))
}
@Test
fun parsesAlternateKeys() {
val input = """
{"model_id":"eleven_v3","similarity_boost":0.4,"no_speaker_boost":true,"rate":200}
Speak.
""".trimIndent()
val result = TalkDirectiveParser.parse(input)
assertEquals("eleven_v3", result.directive?.modelId)
assertEquals(0.4, result.directive?.similarity)
assertEquals(false, result.directive?.speakerBoost)
assertEquals(200, result.directive?.rateWpm)
}
@Test
fun returnsNullWhenNoDirectivePresent() {
val input = """
{}
Hello.
""".trimIndent()
val result = TalkDirectiveParser.parse(input)
assertNull(result.directive)
assertEquals(input, result.stripped)
}
}

View File

@@ -28,6 +28,7 @@ final class NodeAppModel {
private var voiceWakeSyncTask: Task<Void, Never>?
@ObservationIgnored private var cameraHUDDismissTask: Task<Void, Never>?
let voiceWake = VoiceWakeManager()
let talkMode = TalkModeManager()
private var lastAutoA2uiURL: String?
var bridgeSession: BridgeSession { self.bridge }
@@ -49,6 +50,9 @@ final class NodeAppModel {
let enabled = UserDefaults.standard.bool(forKey: "voiceWake.enabled")
self.voiceWake.setEnabled(enabled)
self.talkMode.attachBridge(self.bridge)
let talkEnabled = UserDefaults.standard.bool(forKey: "talk.enabled")
self.talkMode.setEnabled(talkEnabled)
// Wire up deep links from canvas taps
self.screen.onDeepLink = { [weak self] url in
@@ -177,6 +181,10 @@ final class NodeAppModel {
self.voiceWake.setEnabled(enabled)
}
func setTalkEnabled(_ enabled: Bool) {
self.talkMode.setEnabled(enabled)
}
func connectToBridge(
endpoint: NWEndpoint,
hello: BridgeHello)

View File

@@ -20,6 +20,7 @@ struct SettingsTab: View {
@AppStorage("node.displayName") private var displayName: String = "iOS Node"
@AppStorage("node.instanceId") private var instanceId: String = UUID().uuidString
@AppStorage("voiceWake.enabled") private var voiceWakeEnabled: Bool = false
@AppStorage("talk.enabled") private var talkEnabled: Bool = false
@AppStorage("camera.enabled") private var cameraEnabled: Bool = true
@AppStorage("screen.preventSleep") private var preventSleep: Bool = true
@AppStorage("bridge.preferredStableID") private var preferredBridgeStableID: String = ""
@@ -156,6 +157,10 @@ struct SettingsTab: View {
.onChange(of: self.voiceWakeEnabled) { _, newValue in
self.appModel.setVoiceWakeEnabled(newValue)
}
Toggle("Talk Mode", isOn: self.$talkEnabled)
.onChange(of: self.talkEnabled) { _, newValue in
self.appModel.setTalkEnabled(newValue)
}
NavigationLink {
VoiceWakeWordsSettingsView()

View File

@@ -0,0 +1,518 @@
import AVFAudio
import ClawdisKit
import Foundation
import Observation
import Speech
@MainActor
@Observable
final class TalkModeManager: NSObject {
var isEnabled: Bool = false
var isListening: Bool = false
var isSpeaking: Bool = false
var statusText: String = "Off"
private let audioEngine = AVAudioEngine()
private var speechRecognizer: SFSpeechRecognizer?
private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest?
private var recognitionTask: SFSpeechRecognitionTask?
private var silenceTask: Task<Void, Never>?
private var lastHeard: Date?
private var lastTranscript: String = ""
private var lastSpokenText: String?
private var lastInterruptedAtSeconds: Double?
private var defaultVoiceId: String?
private var currentVoiceId: String?
private var defaultModelId: String?
private var currentModelId: String?
// Mirror the Android manager: remember when a directive set a sticky override
// so reloadConfig() does not clobber it.
private var voiceOverrideActive = false
private var modelOverrideActive = false
private var defaultOutputFormat: String?
private var interruptOnSpeech: Bool = true
private var bridge: BridgeSession?
private let silenceWindow: TimeInterval = 0.7
private var player: AVAudioPlayer?
func attachBridge(_ bridge: BridgeSession) {
self.bridge = bridge
}
func setEnabled(_ enabled: Bool) {
self.isEnabled = enabled
if enabled {
Task { await self.start() }
} else {
self.stop()
}
}
func start() async {
guard self.isEnabled else { return }
if self.isListening { return }
self.statusText = "Requesting permissions…"
let micOk = await Self.requestMicrophonePermission()
guard micOk else {
self.statusText = "Microphone permission denied"
return
}
let speechOk = await Self.requestSpeechPermission()
guard speechOk else {
self.statusText = "Speech recognition permission denied"
return
}
await self.reloadConfig()
do {
try Self.configureAudioSession()
try self.startRecognition()
self.isListening = true
self.statusText = "Listening"
self.startSilenceMonitor()
} catch {
self.isListening = false
self.statusText = "Start failed: \(error.localizedDescription)"
}
}
func stop() {
self.isEnabled = false
self.isListening = false
self.statusText = "Off"
self.lastTranscript = ""
self.lastHeard = nil
self.silenceTask?.cancel()
self.silenceTask = nil
self.stopRecognition()
self.stopSpeaking()
}
private func startRecognition() throws {
self.speechRecognizer = SFSpeechRecognizer()
guard let recognizer = self.speechRecognizer else {
throw NSError(domain: "TalkMode", code: 1, userInfo: [
NSLocalizedDescriptionKey: "Speech recognizer unavailable",
])
}
self.recognitionRequest = SFSpeechAudioBufferRecognitionRequest()
self.recognitionRequest?.shouldReportPartialResults = true
guard let request = self.recognitionRequest else { return }
let input = self.audioEngine.inputNode
let format = input.outputFormat(forBus: 0)
input.removeTap(onBus: 0)
input.installTap(onBus: 0, bufferSize: 2048, format: format) { [weak request] buffer, _ in
request?.append(buffer)
}
self.audioEngine.prepare()
try self.audioEngine.start()
self.recognitionTask = recognizer.recognitionTask(with: request) { [weak self] result, error in
guard let self else { return }
if let error {
self.statusText = "Speech error: \(error.localizedDescription)"
}
guard let result else { return }
let transcript = result.bestTranscription.formattedString
Task { @MainActor in
await self.handleTranscript(transcript: transcript, isFinal: result.isFinal)
}
}
}
private func stopRecognition() {
self.recognitionTask?.cancel()
self.recognitionTask = nil
self.recognitionRequest?.endAudio()
self.recognitionRequest = nil
self.audioEngine.inputNode.removeTap(onBus: 0)
self.audioEngine.stop()
self.speechRecognizer = nil
}
private func handleTranscript(transcript: String, isFinal: Bool) async {
let trimmed = transcript.trimmingCharacters(in: .whitespacesAndNewlines)
if self.isSpeaking, self.interruptOnSpeech {
if self.shouldInterrupt(with: trimmed) {
self.stopSpeaking()
}
return
}
guard self.isListening else { return }
if !trimmed.isEmpty {
self.lastTranscript = trimmed
self.lastHeard = Date()
}
if isFinal {
self.lastTranscript = trimmed
}
}
private func startSilenceMonitor() {
self.silenceTask?.cancel()
self.silenceTask = Task { [weak self] in
guard let self else { return }
while self.isEnabled {
try? await Task.sleep(nanoseconds: 200_000_000)
await self.checkSilence()
}
}
}
private func checkSilence() async {
guard self.isListening else { return }
let transcript = self.lastTranscript.trimmingCharacters(in: .whitespacesAndNewlines)
guard !transcript.isEmpty else { return }
guard let lastHeard else { return }
if Date().timeIntervalSince(lastHeard) < self.silenceWindow { return }
await self.finalizeTranscript(transcript)
}
private func finalizeTranscript(_ transcript: String) async {
self.isListening = false
self.statusText = "Thinking…"
self.lastTranscript = ""
self.lastHeard = nil
self.stopRecognition()
await self.reloadConfig()
let prompt = self.buildPrompt(transcript: transcript)
guard let bridge else {
self.statusText = "Bridge not connected"
await self.start()
return
}
do {
let runId = try await self.sendChat(prompt, bridge: bridge)
let ok = await self.waitForChatFinal(runId: runId, bridge: bridge)
if !ok {
self.statusText = "No reply"
await self.start()
return
}
guard let assistantText = try await self.fetchLatestAssistantText(bridge: bridge) else {
self.statusText = "No reply"
await self.start()
return
}
await self.playAssistant(text: assistantText)
} catch {
self.statusText = "Talk failed: \(error.localizedDescription)"
}
await self.start()
}
private func buildPrompt(transcript: String) -> String {
var lines: [String] = [
"Talk Mode active. Reply in a concise, spoken tone.",
"You may optionally prefix the response with JSON (first line) to set ElevenLabs voice, e.g. {\"voice\":\"<id>\",\"once\":true}.",
]
if let interrupted = self.lastInterruptedAtSeconds {
let formatted = String(format: "%.1f", interrupted)
lines.append("Assistant speech interrupted at \(formatted)s.")
self.lastInterruptedAtSeconds = nil
}
lines.append("")
lines.append(transcript)
return lines.joined(separator: "\n")
}
private func sendChat(_ message: String, bridge: BridgeSession) async throws -> String {
struct SendResponse: Decodable { let runId: String }
let payload: [String: Any] = [
"sessionKey": "main",
"message": message,
"thinking": "low",
"timeoutMs": 30_000,
"idempotencyKey": UUID().uuidString,
]
let data = try JSONSerialization.data(withJSONObject: payload)
let json = String(decoding: data, as: UTF8.self)
let res = try await bridge.request(method: "chat.send", paramsJSON: json, timeoutSeconds: 30)
let decoded = try JSONDecoder().decode(SendResponse.self, from: res)
return decoded.runId
}
private func waitForChatFinal(runId: String, bridge: BridgeSession) async -> Bool {
let stream = await bridge.subscribeServerEvents(bufferingNewest: 200)
// Race the event stream against a hard 120 s timeout so a silent bridge
// cannot hang the turn indefinitely.
return await withTaskGroup(of: Bool.self) { group in
group.addTask {
for await evt in stream {
guard evt.event == "chat", let payload = evt.payloadJSON else { continue }
guard let data = payload.data(using: .utf8) else { continue }
guard let json = try? JSONSerialization.jsonObject(with: data) as? [String: Any] else { continue }
if (json["runId"] as? String) != runId { continue }
if let state = json["state"] as? String, state == "final" {
return true
}
}
return false
}
group.addTask {
try? await Task.sleep(nanoseconds: 120_000_000_000)
return false
}
let result = await group.next() ?? false
group.cancelAll()
return result
}
}
private func fetchLatestAssistantText(bridge: BridgeSession) async throws -> String? {
let res = try await bridge.request(method: "chat.history", paramsJSON: "{\"sessionKey\":\"main\"}", timeoutSeconds: 15)
guard let json = try JSONSerialization.jsonObject(with: res) as? [String: Any] else { return nil }
guard let messages = json["messages"] as? [[String: Any]] else { return nil }
for msg in messages.reversed() {
guard (msg["role"] as? String) == "assistant" else { continue }
guard let content = msg["content"] as? [[String: Any]] else { continue }
let text = content.compactMap { $0["text"] as? String }.joined(separator: "\n")
let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines)
if !trimmed.isEmpty { return trimmed }
}
return nil
}
private func playAssistant(text: String) async {
let parsed = TalkDirectiveParser.parse(text)
let directive = parsed.directive
let cleaned = parsed.stripped.trimmingCharacters(in: .whitespacesAndNewlines)
guard !cleaned.isEmpty else { return }
if let voice = directive?.voiceId, directive?.once != true {
self.currentVoiceId = voice
self.voiceOverrideActive = true
}
if let model = directive?.modelId, directive?.once != true {
self.currentModelId = model
self.modelOverrideActive = true
}
let voiceId = directive?.voiceId ?? self.currentVoiceId ?? self.defaultVoiceId
guard let voiceId, !voiceId.isEmpty else {
self.statusText = "Missing voice ID"
return
}
guard let apiKey = ProcessInfo.processInfo.environment["ELEVENLABS_API_KEY"], !apiKey.isEmpty else {
self.statusText = "Missing ELEVENLABS_API_KEY"
return
}
self.statusText = "Speaking…"
self.isSpeaking = true
self.lastSpokenText = cleaned
do {
let request = ElevenLabsRequest(
text: cleaned,
modelId: directive?.modelId ?? self.currentModelId ?? self.defaultModelId,
outputFormat: directive?.outputFormat ?? self.defaultOutputFormat,
speed: TalkModeRuntime.resolveSpeed(
speed: directive?.speed,
rateWPM: directive?.rateWPM),
stability: TalkModeRuntime.validatedUnit(directive?.stability),
similarity: TalkModeRuntime.validatedUnit(directive?.similarity),
style: TalkModeRuntime.validatedUnit(directive?.style),
speakerBoost: directive?.speakerBoost,
seed: TalkModeRuntime.validatedSeed(directive?.seed),
normalize: TalkModeRuntime.validatedNormalize(directive?.normalize),
language: TalkModeRuntime.validatedLanguage(directive?.language))
let audio = try await ElevenLabsClient(apiKey: apiKey).synthesize(
voiceId: voiceId,
request: request)
try await self.playAudio(data: audio)
} catch {
self.statusText = "Speak failed: \(error.localizedDescription)"
}
self.isSpeaking = false
}
private func playAudio(data: Data) async throws {
self.player?.stop()
let player = try AVAudioPlayer(data: data)
self.player = player
player.prepareToPlay()
player.play()
while player.isPlaying {
try? await Task.sleep(nanoseconds: 120_000_000)
}
}
private func stopSpeaking() {
guard self.isSpeaking else { return }
self.lastInterruptedAtSeconds = self.player?.currentTime
self.player?.stop()
self.player = nil
self.isSpeaking = false
}
private func shouldInterrupt(with transcript: String) -> Bool {
let trimmed = transcript.trimmingCharacters(in: .whitespacesAndNewlines)
guard trimmed.count >= 3 else { return false }
if let spoken = self.lastSpokenText?.lowercased(), spoken.contains(trimmed.lowercased()) {
return false
}
return true
}
private func reloadConfig() async {
guard let bridge else { return }
do {
let res = try await bridge.request(method: "config.get", paramsJSON: "{}", timeoutSeconds: 8)
guard let json = try JSONSerialization.jsonObject(with: res) as? [String: Any] else { return }
guard let config = json["config"] as? [String: Any] else { return }
let talk = config["talk"] as? [String: Any]
self.defaultVoiceId = (talk?["voiceId"] as? String)?.trimmingCharacters(in: .whitespacesAndNewlines)
// Keep a directive's sticky voice/model override across config reloads.
if !self.voiceOverrideActive { self.currentVoiceId = self.defaultVoiceId }
self.defaultModelId = (talk?["modelId"] as? String)?.trimmingCharacters(in: .whitespacesAndNewlines)
if !self.modelOverrideActive { self.currentModelId = self.defaultModelId }
self.defaultOutputFormat = (talk?["outputFormat"] as? String)?
.trimmingCharacters(in: .whitespacesAndNewlines)
if let interrupt = talk?["interruptOnSpeech"] as? Bool {
self.interruptOnSpeech = interrupt
}
} catch {
// ignore
}
}
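// .playAndRecord with .duckOthers keeps recognition viable while TTS plays;
// .defaultToSpeaker routes playback to the speaker when no headset is attached.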
private static func configureAudioSession() throws {
let session = AVAudioSession.sharedInstance()
try session.setCategory(.playAndRecord, mode: .measurement, options: [
.duckOthers,
.mixWithOthers,
.allowBluetoothHFP,
.defaultToSpeaker,
])
try session.setActive(true, options: [])
}
private nonisolated static func requestMicrophonePermission() async -> Bool {
await withCheckedContinuation(isolation: nil) { cont in
AVAudioApplication.requestRecordPermission { ok in
cont.resume(returning: ok)
}
}
}
private nonisolated static func requestSpeechPermission() async -> Bool {
await withCheckedContinuation(isolation: nil) { cont in
SFSpeechRecognizer.requestAuthorization { status in
cont.resume(returning: status == .authorized)
}
}
}
}
private struct ElevenLabsRequest {
let text: String
let modelId: String?
let outputFormat: String?
let speed: Double?
let stability: Double?
let similarity: Double?
let style: Double?
let speakerBoost: Bool?
let seed: UInt32?
let normalize: String?
let language: String?
}
private struct ElevenLabsClient {
let apiKey: String
let baseUrl = URL(string: "https://api.elevenlabs.io")!
func synthesize(voiceId: String, request: ElevenLabsRequest) async throws -> Data {
var url = self.baseUrl
url.appendPathComponent("v1")
url.appendPathComponent("text-to-speech")
url.appendPathComponent(voiceId)
var payload: [String: Any] = [
"text": request.text,
]
if let modelId = request.modelId, !modelId.isEmpty {
payload["model_id"] = modelId
}
if let outputFormat = request.outputFormat, !outputFormat.isEmpty {
payload["output_format"] = outputFormat
}
if let seed = request.seed {
payload["seed"] = seed
}
if let normalize = request.normalize {
payload["apply_text_normalization"] = normalize
}
if let language = request.language {
payload["language_code"] = language
}
var voiceSettings: [String: Any] = [:]
if let speed = request.speed { voiceSettings["speed"] = speed }
if let stability = request.stability { voiceSettings["stability"] = stability }
if let similarity = request.similarity { voiceSettings["similarity_boost"] = similarity }
if let style = request.style { voiceSettings["style"] = style }
if let speakerBoost = request.speakerBoost { voiceSettings["use_speaker_boost"] = speakerBoost }
if !voiceSettings.isEmpty { payload["voice_settings"] = voiceSettings }
let body = try JSONSerialization.data(withJSONObject: payload, options: [])
var req = URLRequest(url: url)
req.httpMethod = "POST"
req.httpBody = body
req.setValue("application/json", forHTTPHeaderField: "Content-Type")
req.setValue("audio/mpeg", forHTTPHeaderField: "Accept")
req.setValue(self.apiKey, forHTTPHeaderField: "xi-api-key")
let (data, response) = try await URLSession.shared.data(for: req)
if let http = response as? HTTPURLResponse, http.statusCode >= 400 {
let message = String(data: data, encoding: .utf8) ?? "unknown"
throw NSError(domain: "TalkTTS", code: http.statusCode, userInfo: [
NSLocalizedDescriptionKey: "ElevenLabs failed: \(http.statusCode) \(message)",
])
}
return data
}
}
private enum TalkModeRuntime {
static func resolveSpeed(speed: Double?, rateWPM: Int?) -> Double? {
if let rateWPM, rateWPM > 0 {
let resolved = Double(rateWPM) / 175.0
if resolved <= 0.5 || resolved >= 2.0 { return nil }
return resolved
}
if let speed {
if speed <= 0.5 || speed >= 2.0 { return nil }
return speed
}
return nil
}
static func validatedUnit(_ value: Double?) -> Double? {
guard let value else { return nil }
if value < 0 || value > 1 { return nil }
return value
}
static func validatedSeed(_ value: Int?) -> UInt32? {
guard let value else { return nil }
if value < 0 || value > 4294967295 { return nil }
return UInt32(value)
}
static func validatedNormalize(_ value: String?) -> String? {
guard let value else { return nil }
let normalized = value.trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
return ["auto", "on", "off"].contains(normalized) ? normalized : nil
}
static func validatedLanguage(_ value: String?) -> String? {
guard let value else { return nil }
let normalized = value.trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
guard normalized.count == 2, normalized.allSatisfy({ $0 >= "a" && $0 <= "z" }) else { return nil }
return normalized
}
}

View File

@@ -4,6 +4,7 @@ struct VoiceTab: View {
@Environment(NodeAppModel.self) private var appModel
@Environment(VoiceWakeManager.self) private var voiceWake
@AppStorage("voiceWake.enabled") private var voiceWakeEnabled: Bool = false
@AppStorage("talk.enabled") private var talkEnabled: Bool = false
var body: some View {
NavigationStack {
@@ -14,6 +15,7 @@ struct VoiceTab: View {
Text(self.voiceWake.statusText)
.font(.footnote)
.foregroundStyle(.secondary)
LabeledContent("Talk Mode", value: self.talkEnabled ? "Enabled" : "Disabled")
}
Section("Notes") {
@@ -36,6 +38,9 @@ struct VoiceTab: View {
.onChange(of: self.voiceWakeEnabled) { _, newValue in
self.appModel.setVoiceWakeEnabled(newValue)
}
.onChange(of: self.talkEnabled) { _, newValue in
self.appModel.setTalkEnabled(newValue)
}
}
}
}

View File

@@ -121,6 +121,15 @@ final class AppState {
forKey: voicePushToTalkEnabledKey) } }
}
var talkEnabled: Bool {
didSet {
self.ifNotPreview {
UserDefaults.standard.set(self.talkEnabled, forKey: talkEnabledKey)
Task { await TalkModeController.shared.setEnabled(self.talkEnabled) }
}
}
}
var iconOverride: IconOverrideSelection {
didSet { self.ifNotPreview { UserDefaults.standard.set(self.iconOverride.rawValue, forKey: iconOverrideKey) } }
}
@@ -216,6 +225,7 @@ final class AppState {
.stringArray(forKey: voiceWakeAdditionalLocalesKey) ?? []
self.voicePushToTalkEnabled = UserDefaults.standard
.object(forKey: voicePushToTalkEnabledKey) as? Bool ?? false
self.talkEnabled = UserDefaults.standard.bool(forKey: talkEnabledKey)
if let storedHeartbeats = UserDefaults.standard.object(forKey: heartbeatsEnabledKey) as? Bool {
self.heartbeatsEnabled = storedHeartbeats
} else {
@@ -256,9 +266,13 @@ final class AppState {
if self.swabbleEnabled, !PermissionManager.voiceWakePermissionsGranted() {
self.swabbleEnabled = false
}
if self.talkEnabled, !PermissionManager.voiceWakePermissionsGranted() {
self.talkEnabled = false
}
if !self.isPreview {
Task { await VoiceWakeRuntime.shared.refresh(state: self) }
Task { await TalkModeController.shared.setEnabled(self.talkEnabled) }
}
}
@@ -312,6 +326,23 @@ final class AppState {
Task { await VoiceWakeRuntime.shared.refresh(state: self) }
}
func setTalkEnabled(_ enabled: Bool) async {
guard voiceWakeSupported else {
self.talkEnabled = false
return
}
self.talkEnabled = enabled
guard !self.isPreview else { return }
if !enabled { return }
if PermissionManager.voiceWakePermissionsGranted() { return }
let granted = await PermissionManager.ensureVoiceWakePermissions(interactive: true)
self.talkEnabled = granted
}
// MARK: - Global wake words sync (Gateway-owned)
func applyGlobalVoiceWakeTriggers(_ triggers: [String]) {
@@ -367,6 +398,7 @@ extension AppState {
state.voiceWakeLocaleID = Locale.current.identifier
state.voiceWakeAdditionalLocaleIDs = ["en-US", "de-DE"]
state.voicePushToTalkEnabled = false
state.talkEnabled = false
state.iconOverride = .system
state.heartbeatsEnabled = true
state.connectionMode = .local

View File

@@ -30,6 +30,10 @@ struct ConfigSettings: View {
@State private var browserColorHex: String = "#FF4500"
@State private var browserAttachOnly: Bool = false
// Talk mode settings (stored in ~/.clawdis/clawdis.json under "talk")
@State private var talkVoiceId: String = ""
@State private var talkInterruptOnSpeech: Bool = true
var body: some View {
ScrollView { self.content }
.onChange(of: self.modelCatalogPath) { _, _ in
@@ -53,6 +57,7 @@ struct ConfigSettings: View {
self.header
self.agentSection
self.heartbeatSection
self.talkSection
self.browserSection
Spacer(minLength: 0)
}
@@ -266,6 +271,37 @@ struct ConfigSettings: View {
.frame(maxWidth: .infinity, alignment: .leading)
}
private var talkSection: some View {
GroupBox("Talk Mode") {
Grid(alignment: .leadingFirstTextBaseline, horizontalSpacing: 14, verticalSpacing: 10) {
GridRow {
self.gridLabel("Voice ID")
VStack(alignment: .leading, spacing: 6) {
ComboBox("ElevenLabs voice ID", text: self.$talkVoiceId) {
ForEach(self.talkVoiceSuggestions, id: \.self) { value in
Text(value).tag(value)
}
}
.textFieldStyle(.roundedBorder)
.frame(maxWidth: .infinity)
.onChange(of: self.talkVoiceId) { _, _ in self.autosaveConfig() }
Text("Defaults to ELEVENLABS_VOICE_ID / SAG_VOICE_ID if unset.")
.font(.footnote)
.foregroundStyle(.secondary)
}
}
GridRow {
self.gridLabel("Interrupt")
Toggle("Stop speaking when you start talking", isOn: self.$talkInterruptOnSpeech)
.labelsHidden()
.toggleStyle(.checkbox)
.onChange(of: self.talkInterruptOnSpeech) { _, _ in self.autosaveConfig() }
}
}
}
.frame(maxWidth: .infinity, alignment: .leading)
}
private func gridLabel(_ text: String) -> some View {
Text(text)
.foregroundStyle(.secondary)
@@ -278,6 +314,7 @@ struct ConfigSettings: View {
let heartbeatMinutes = agent?["heartbeatMinutes"] as? Int
let heartbeatBody = agent?["heartbeatBody"] as? String
let browser = parsed["browser"] as? [String: Any]
let talk = parsed["talk"] as? [String: Any]
let loadedModel = (agent?["model"] as? String) ?? ""
if !loadedModel.isEmpty {
@@ -297,6 +334,13 @@ struct ConfigSettings: View {
if let color = browser["color"] as? String, !color.isEmpty { self.browserColorHex = color }
if let attachOnly = browser["attachOnly"] as? Bool { self.browserAttachOnly = attachOnly }
}
if let talk {
if let voice = talk["voiceId"] as? String { self.talkVoiceId = voice }
if let interrupt = talk["interruptOnSpeech"] as? Bool {
self.talkInterruptOnSpeech = interrupt
}
}
}
private func autosaveConfig() {
@@ -312,6 +356,7 @@ struct ConfigSettings: View {
var root = self.loadConfigDict()
var agent = root["agent"] as? [String: Any] ?? [:]
var browser = root["browser"] as? [String: Any] ?? [:]
var talk = root["talk"] as? [String: Any] ?? [:]
let chosenModel = (self.configModel == "__custom__" ? self.customModel : self.configModel)
.trimmingCharacters(in: .whitespacesAndNewlines)
@@ -337,6 +382,15 @@ struct ConfigSettings: View {
browser["attachOnly"] = self.browserAttachOnly
root["browser"] = browser
let trimmedVoice = self.talkVoiceId.trimmingCharacters(in: .whitespacesAndNewlines)
if trimmedVoice.isEmpty {
talk.removeValue(forKey: "voiceId")
} else {
talk["voiceId"] = trimmedVoice
}
talk["interruptOnSpeech"] = self.talkInterruptOnSpeech
root["talk"] = talk
ClawdisConfigFile.saveDict(root)
}
@@ -354,6 +408,20 @@ struct ConfigSettings: View {
return Color(red: r, green: g, blue: b)
}
private var talkVoiceSuggestions: [String] {
let env = ProcessInfo.processInfo.environment
let candidates = [
self.talkVoiceId,
env["ELEVENLABS_VOICE_ID"] ?? "",
env["SAG_VOICE_ID"] ?? "",
]
var seen = Set<String>()
return candidates
.map { $0.trimmingCharacters(in: .whitespacesAndNewlines) }
.filter { !$0.isEmpty }
.filter { seen.insert($0).inserted }
}
private var browserPathLabel: String? {
guard self.browserEnabled else { return nil }

View File

@@ -16,6 +16,7 @@ let voiceWakeMicKey = "clawdis.voiceWakeMicID"
let voiceWakeLocaleKey = "clawdis.voiceWakeLocaleID"
let voiceWakeAdditionalLocalesKey = "clawdis.voiceWakeAdditionalLocaleIDs"
let voicePushToTalkEnabledKey = "clawdis.voicePushToTalkEnabled"
let talkEnabledKey = "clawdis.talkEnabled"
let iconOverrideKey = "clawdis.iconOverride"
let connectionModeKey = "clawdis.connectionMode"
let remoteTargetKey = "clawdis.remoteTarget"

View File

@@ -72,6 +72,11 @@ struct MenuContent: View {
if self.showVoiceWakeMicPicker {
self.voiceWakeMicMenu
}
Toggle(isOn: self.talkBinding) {
Label("Talk", systemImage: "bubble.left.and.waveform")
}
.disabled(!voiceWakeSupported)
.opacity(voiceWakeSupported ? 1 : 0.5)
Divider()
Button {
Task { @MainActor in
@@ -331,6 +336,14 @@ struct MenuContent: View {
})
}
private var talkBinding: Binding<Bool> {
Binding(
get: { self.state.talkEnabled },
set: { newValue in
Task { await self.state.setTalkEnabled(newValue) }
})
}
private var showVoiceWakeMicPicker: Bool {
voiceWakeSupported && self.state.swabbleEnabled
}

View File

@@ -0,0 +1,54 @@
import AVFoundation
import Foundation
import OSLog
@MainActor
final class TalkAudioPlayer: NSObject, AVAudioPlayerDelegate {
static let shared = TalkAudioPlayer()
private let logger = Logger(subsystem: "com.steipete.clawdis", category: "talk.tts")
private var player: AVAudioPlayer?
private var continuation: CheckedContinuation<TalkPlaybackResult, Never>?
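// play(data:) suspends on this continuation; it is resumed exactly once,
// either by the delegate callback when playback finishes or by stopInternal
// when playback is interrupted.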
func play(data: Data) async -> TalkPlaybackResult {
self.stopInternal(interrupted: true)
do {
let player = try AVAudioPlayer(data: data)
self.player = player
player.delegate = self
player.prepareToPlay()
player.play()
return await withCheckedContinuation { continuation in
self.continuation = continuation
}
} catch {
self.logger.error("talk audio player failed: \(error.localizedDescription, privacy: .public)")
return TalkPlaybackResult(finished: false, interruptedAt: nil)
}
}
func stop() -> Double? {
guard let player else { return nil }
let time = player.currentTime
self.stopInternal(interrupted: true, interruptedAt: time)
return time
}
func audioPlayerDidFinishPlaying(_: AVAudioPlayer, successfully flag: Bool) {
self.stopInternal(interrupted: !flag)
}
private func stopInternal(interrupted: Bool, interruptedAt: Double? = nil) {
self.player?.stop()
self.player = nil
if let continuation {
self.continuation = nil
continuation.resume(returning: TalkPlaybackResult(finished: !interrupted, interruptedAt: interruptedAt))
}
}
}
struct TalkPlaybackResult: Sendable {
let finished: Bool
let interruptedAt: Double?
}

View File

@@ -0,0 +1,42 @@
import Observation
import OSLog
@MainActor
@Observable
final class TalkModeController {
static let shared = TalkModeController()
private let logger = Logger(subsystem: "com.steipete.clawdis", category: "talk.controller")
func setEnabled(_ enabled: Bool) async {
self.logger.info("talk enabled=\(enabled)")
if enabled {
TalkOverlayController.shared.present()
} else {
TalkOverlayController.shared.dismiss()
}
await TalkModeRuntime.shared.setEnabled(enabled)
}
func updatePhase(_ phase: TalkModePhase) {
TalkOverlayController.shared.updatePhase(phase)
}
func updateLevel(_ level: Double) {
TalkOverlayController.shared.updateLevel(level)
}
func stopSpeaking(reason: TalkStopReason = .userTap) {
Task { await TalkModeRuntime.shared.stopSpeaking(reason: reason) }
}
func exitTalkMode() {
Task { await AppStateStore.shared.setTalkEnabled(false) }
}
}
enum TalkStopReason {
case userTap
case speech
case manual
}

View File

@@ -0,0 +1,684 @@
import AVFoundation
import ClawdisChatUI
import ClawdisKit
import Foundation
import OSLog
import Speech
actor TalkModeRuntime {
static let shared = TalkModeRuntime()
private let logger = Logger(subsystem: "com.steipete.clawdis", category: "talk.runtime")
private var recognizer: SFSpeechRecognizer?
private var audioEngine: AVAudioEngine?
private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest?
private var recognitionTask: SFSpeechRecognitionTask?
private var recognitionGeneration: Int = 0
private var captureTask: Task<Void, Never>?
private var silenceTask: Task<Void, Never>?
private var phase: TalkModePhase = .idle
private var isEnabled = false
private var lastHeard: Date?
private var noiseFloorRMS: Double = 1e-4
private var lastTranscript: String = ""
private var lastSpeechEnergyAt: Date?
private var defaultVoiceId: String?
private var currentVoiceId: String?
private var defaultModelId: String?
private var currentModelId: String?
private var voiceOverrideActive = false
private var modelOverrideActive = false
private var defaultOutputFormat: String?
private var interruptOnSpeech: Bool = true
private var lastInterruptedAtSeconds: Double?
private var lastSpokenText: String?
private let silenceWindow: TimeInterval = 0.7
private let minSpeechRMS: Double = 1e-3
private let speechBoostFactor: Double = 6.0
// MARK: - Lifecycle
func setEnabled(_ enabled: Bool) async {
guard enabled != self.isEnabled else { return }
self.isEnabled = enabled
if enabled {
await self.start()
} else {
await self.stop()
}
}
private func start() async {
guard voiceWakeSupported else { return }
guard PermissionManager.voiceWakePermissionsGranted() else {
self.logger.debug("talk runtime not starting: permissions missing")
return
}
await self.reloadConfig()
await self.startRecognition()
self.phase = .listening
await MainActor.run { TalkModeController.shared.updatePhase(.listening) }
self.startSilenceMonitor()
}
private func stop() async {
self.captureTask?.cancel()
self.captureTask = nil
self.silenceTask?.cancel()
self.silenceTask = nil
self.lastTranscript = ""
self.lastHeard = nil
self.lastSpeechEnergyAt = nil
self.phase = .idle
await self.stopRecognition()
await self.stopSpeaking(reason: .manual)
await MainActor.run {
TalkModeController.shared.updateLevel(0)
TalkModeController.shared.updatePhase(.idle)
}
}
// MARK: - Speech recognition
private struct RecognitionUpdate {
let transcript: String?
let segments: [SFTranscriptionSegment]
let isFinal: Bool
let error: Error?
let generation: Int
}
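// recognitionGeneration guards against stale callbacks: every start/stop bumps
// the counter, and handleRecognition drops updates from older tasks.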
private func startRecognition() async {
await self.stopRecognition()
self.recognitionGeneration &+= 1
let generation = self.recognitionGeneration
let locale = await MainActor.run { AppStateStore.shared.voiceWakeLocaleID }
self.recognizer = SFSpeechRecognizer(locale: Locale(identifier: locale))
guard let recognizer, recognizer.isAvailable else {
self.logger.error("talk recognizer unavailable")
return
}
self.recognitionRequest = SFSpeechAudioBufferRecognitionRequest()
self.recognitionRequest?.shouldReportPartialResults = true
guard let request = self.recognitionRequest else { return }
if self.audioEngine == nil {
self.audioEngine = AVAudioEngine()
}
guard let audioEngine = self.audioEngine else { return }
let input = audioEngine.inputNode
let format = input.outputFormat(forBus: 0)
input.removeTap(onBus: 0)
input.installTap(onBus: 0, bufferSize: 2048, format: format) { [weak self, weak request] buffer, _ in
request?.append(buffer)
if let rms = Self.rmsLevel(buffer: buffer) {
Task.detached { [weak self] in
await self?.noteAudioLevel(rms: rms)
}
}
}
audioEngine.prepare()
do {
try audioEngine.start()
} catch {
self.logger.error("talk audio engine start failed: \(error.localizedDescription, privacy: .public)")
return
}
self.recognitionTask = recognizer.recognitionTask(with: request) { [weak self, generation] result, error in
guard let self else { return }
let transcript = result?.bestTranscription.formattedString
let update = RecognitionUpdate(
transcript: transcript,
segments: result?.bestTranscription.segments ?? [],
isFinal: result?.isFinal ?? false,
error: error,
generation: generation)
Task { await self.handleRecognition(update) }
}
}
private func stopRecognition() async {
self.recognitionGeneration &+= 1
self.recognitionTask?.cancel()
self.recognitionTask = nil
self.recognitionRequest?.endAudio()
self.recognitionRequest = nil
self.audioEngine?.inputNode.removeTap(onBus: 0)
self.audioEngine?.stop()
self.audioEngine = nil
self.recognizer = nil
}
private func handleRecognition(_ update: RecognitionUpdate) async {
guard update.generation == self.recognitionGeneration else { return }
if let error = update.error {
self.logger.debug("talk recognition error: \(error.localizedDescription, privacy: .public)")
}
guard let transcript = update.transcript else { return }
let trimmed = transcript.trimmingCharacters(in: .whitespacesAndNewlines)
if self.phase == .speaking, self.interruptOnSpeech {
if await self.shouldInterrupt(transcript: trimmed, segments: update.segments) {
await self.stopSpeaking(reason: .speech)
self.lastTranscript = ""
self.lastHeard = nil
await self.startListening()
}
return
}
guard self.phase == .listening else { return }
if !trimmed.isEmpty {
self.lastTranscript = trimmed
self.lastHeard = Date()
}
if update.isFinal {
self.lastTranscript = trimmed
}
}
// MARK: - Silence handling
private func startSilenceMonitor() {
self.silenceTask?.cancel()
self.silenceTask = Task { [weak self] in
guard let self else { return }
while self.isEnabled {
try? await Task.sleep(nanoseconds: 200_000_000)
await self.checkSilence()
}
}
}
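// Finalization flow (see checkSilence below): the monitor polls every 200ms;
// once a non-empty transcript has gone silenceWindow (0.7s) without fresh
// speech energy, it is finalized and sent to the agent.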
private func checkSilence() async {
guard self.phase == .listening else { return }
let transcript = self.lastTranscript.trimmingCharacters(in: .whitespacesAndNewlines)
guard !transcript.isEmpty else { return }
guard let lastHeard else { return }
let elapsed = Date().timeIntervalSince(lastHeard)
guard elapsed >= self.silenceWindow else { return }
await self.finalizeTranscript(transcript)
}
private func startListening() async {
self.phase = .listening
self.lastTranscript = ""
self.lastHeard = nil
await MainActor.run {
TalkModeController.shared.updatePhase(.listening)
TalkModeController.shared.updateLevel(0)
}
}
private func finalizeTranscript(_ text: String) async {
self.lastTranscript = ""
self.lastHeard = nil
self.phase = .thinking
await MainActor.run { TalkModeController.shared.updatePhase(.thinking) }
await self.stopRecognition()
await self.sendAndSpeak(text)
}
// MARK: - Gateway + TTS
private func sendAndSpeak(_ transcript: String) async {
await self.reloadConfig()
let prompt = self.buildPrompt(transcript: transcript)
let runId = UUID().uuidString
do {
let response = try await GatewayConnection.shared.chatSend(
sessionKey: "main",
message: prompt,
thinking: "low",
idempotencyKey: runId,
attachments: [])
let completion = await self.waitForChatCompletion(
runId: response.runId,
timeoutSeconds: 120)
guard completion == .final else {
await self.startListening()
await self.startRecognition()
return
}
guard let assistantText = await self.latestAssistantText(sessionKey: "main") else {
await self.startListening()
await self.startRecognition()
return
}
await self.playAssistant(text: assistantText)
await self.startListening()
await self.startRecognition()
return
} catch {
self.logger.error("talk chat.send failed: \(error.localizedDescription, privacy: .public)")
await self.startListening()
await self.startRecognition()
return
}
}
private func buildPrompt(transcript: String) -> String {
var lines: [String] = [
"Talk Mode active. Reply in a concise, spoken tone.",
"You may optionally prefix the response with JSON (first line) to set ElevenLabs voice, e.g. {\"voice\":\"<id>\",\"once\":true}.",
]
if let interrupted = self.lastInterruptedAtSeconds {
let formatted = String(format: "%.1f", interrupted)
lines.append("Assistant speech interrupted at \(formatted)s.")
self.lastInterruptedAtSeconds = nil
}
lines.append("")
lines.append(transcript)
return lines.joined(separator: "\n")
}
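// Example (illustrative): for transcript "What's the weather?" after an
// interruption at 3.2s, the prompt becomes:
// Talk Mode active. Reply in a concise, spoken tone.
// You may optionally prefix the response with JSON (first line) to set ElevenLabs voice, e.g. {"voice":"<id>","once":true}.
// Assistant speech interrupted at 3.2s.
// (empty line)
// What's the weather?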
private enum ChatCompletionState {
case final
case aborted
case error
case timeout
}
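// Completion is resolved by racing two tasks: one watches gateway "chat"
// events for this runId, the other sleeps for the timeout; the first result
// wins and the rest of the group is cancelled.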
private func waitForChatCompletion(runId: String, timeoutSeconds: Int) async -> ChatCompletionState {
await withTaskGroup(of: ChatCompletionState.self) { group in
group.addTask { [runId] in
let stream = GatewayConnection.shared.subscribe()
for await push in stream {
if case let .event(evt) = push, evt.event == "chat", let payload = evt.payload {
if let chat = try? JSONDecoder().decode(
ClawdisChatEventPayload.self,
from: JSONEncoder().encode(payload))
{
guard chat.runId == runId else { continue }
switch chat.state {
case .some("final"): return .final
case .some("aborted"): return .aborted
case .some("error"): return .error
default: break
}
}
}
}
return .timeout
}
group.addTask {
try? await Task.sleep(nanoseconds: UInt64(timeoutSeconds) * 1_000_000_000)
return .timeout
}
let result = await group.next() ?? .timeout
group.cancelAll()
return result
}
}
private func latestAssistantText(sessionKey: String) async -> String? {
do {
let history = try await GatewayConnection.shared.chatHistory(sessionKey: sessionKey)
let messages = history.messages ?? []
let decoded = messages.compactMap { item -> ClawdisChatMessage? in
guard let data = try? JSONEncoder().encode(item) else { return nil }
return try? JSONDecoder().decode(ClawdisChatMessage.self, from: data)
}
guard let assistant = decoded.last(where: { $0.role == "assistant" }) else { return nil }
let text = assistant.content.compactMap { $0.text }.joined(separator: "\n")
let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines)
return trimmed.isEmpty ? nil : trimmed
} catch {
self.logger.error("talk history fetch failed: \(error.localizedDescription, privacy: .public)")
return nil
}
}
private func playAssistant(text: String) async {
let parse = TalkDirectiveParser.parse(text)
let directive = parse.directive
let cleaned = parse.stripped.trimmingCharacters(in: .whitespacesAndNewlines)
guard !cleaned.isEmpty else { return }
if !parse.unknownKeys.isEmpty {
self.logger.warning("talk directive ignored keys: \(parse.unknownKeys.joined(separator: ","), privacy: .public)")
}
if let voice = directive?.voiceId {
if directive?.once == true {
self.logger.info("talk voice override (once) voiceId=\(voice, privacy: .public)")
} else {
self.currentVoiceId = voice
self.voiceOverrideActive = true
self.logger.info("talk voice override voiceId=\(voice, privacy: .public)")
}
}
if let model = directive?.modelId {
if directive?.once == true {
self.logger.info("talk model override (once) modelId=\(model, privacy: .public)")
} else {
self.currentModelId = model
self.modelOverrideActive = true
}
}
let voiceId =
directive?.voiceId ??
self.currentVoiceId ??
self.defaultVoiceId
guard let voiceId, !voiceId.isEmpty else {
self.logger.error("talk missing voiceId; set talk.voiceId or ELEVENLABS_VOICE_ID")
return
}
let apiKey = ProcessInfo.processInfo.environment["ELEVENLABS_API_KEY"] ?? ""
if apiKey.isEmpty {
self.logger.error("talk missing ELEVENLABS_API_KEY")
return
}
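// Keep recognition running while audio plays so the user can barge in;
// shouldInterrupt() filters out echo and low-confidence fragments.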
await self.startRecognition()
await MainActor.run { TalkModeController.shared.updatePhase(.speaking) }
self.phase = .speaking
self.lastSpokenText = cleaned
let resolvedSpeed = Self.resolveSpeed(
speed: directive?.speed,
rateWPM: directive?.rateWPM,
logger: self.logger)
let request = ElevenLabsRequest(
text: cleaned,
modelId: directive?.modelId ?? self.currentModelId ?? self.defaultModelId,
outputFormat: directive?.outputFormat ?? self.defaultOutputFormat,
speed: resolvedSpeed,
stability: Self.validatedUnit(directive?.stability, name: "stability", logger: self.logger),
similarity: Self.validatedUnit(directive?.similarity, name: "similarity", logger: self.logger),
style: Self.validatedUnit(directive?.style, name: "style", logger: self.logger),
speakerBoost: directive?.speakerBoost,
seed: Self.validatedSeed(directive?.seed, logger: self.logger),
normalize: Self.validatedNormalize(directive?.normalize, logger: self.logger),
language: Self.validatedLanguage(directive?.language, logger: self.logger))
do {
let audio = try await ElevenLabsClient(apiKey: apiKey).synthesize(
voiceId: voiceId,
request: request)
// TalkAudioPlayer is @MainActor, so awaiting it hops to the main actor;
// MainActor.run cannot wrap an async call.
let result = await TalkAudioPlayer.shared.play(data: audio)
if !result.finished, let interruptedAt = result.interruptedAt, self.phase == .speaking {
if self.interruptOnSpeech {
self.lastInterruptedAtSeconds = interruptedAt
}
}
} catch {
self.logger.error("talk TTS failed: \(error.localizedDescription, privacy: .public)")
}
self.phase = .thinking
await MainActor.run { TalkModeController.shared.updatePhase(.thinking) }
}
func stopSpeaking(reason: TalkStopReason) async {
guard self.phase == .speaking else { return }
let interruptedAt = await MainActor.run { TalkAudioPlayer.shared.stop() }
if reason == .speech, let interruptedAt {
self.lastInterruptedAtSeconds = interruptedAt
}
self.phase = .thinking
await MainActor.run { TalkModeController.shared.updatePhase(.thinking) }
}
// MARK: - Config
private func reloadConfig() async {
let cfg = await self.fetchTalkConfig()
self.defaultVoiceId = cfg.voiceId
if !self.voiceOverrideActive {
self.currentVoiceId = cfg.voiceId
}
self.defaultModelId = cfg.modelId
if !self.modelOverrideActive {
self.currentModelId = cfg.modelId
}
self.defaultOutputFormat = cfg.outputFormat
self.interruptOnSpeech = cfg.interruptOnSpeech
}
private struct TalkRuntimeConfig {
let voiceId: String?
let modelId: String?
let outputFormat: String?
let interruptOnSpeech: Bool
}
private func fetchTalkConfig() async -> TalkRuntimeConfig {
let env = ProcessInfo.processInfo.environment
let envVoice = env["ELEVENLABS_VOICE_ID"]?.trimmingCharacters(in: .whitespacesAndNewlines)
let sagVoice = env["SAG_VOICE_ID"]?.trimmingCharacters(in: .whitespacesAndNewlines)
do {
let snap: ConfigSnapshot = try await GatewayConnection.shared.requestDecoded(
method: .configGet,
params: nil,
timeoutMs: 8000)
let talk = snap.config?["talk"]?.dictionaryValue
let voice = talk?["voiceId"]?.stringValue
let model = talk?["modelId"]?.stringValue
let outputFormat = talk?["outputFormat"]?.stringValue
let interrupt = talk?["interruptOnSpeech"]?.boolValue
let resolvedVoice =
(voice?.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty == false ? voice : nil) ??
(envVoice?.isEmpty == false ? envVoice : nil) ??
(sagVoice?.isEmpty == false ? sagVoice : nil)
return TalkRuntimeConfig(
voiceId: resolvedVoice,
modelId: model,
outputFormat: outputFormat,
interruptOnSpeech: interrupt ?? true)
} catch {
let resolvedVoice =
(envVoice?.isEmpty == false ? envVoice : nil) ??
(sagVoice?.isEmpty == false ? sagVoice : nil)
return TalkRuntimeConfig(
voiceId: resolvedVoice,
modelId: nil,
outputFormat: nil,
interruptOnSpeech: true)
}
}
// MARK: - Audio level handling
private func noteAudioLevel(rms: Double) async {
if self.phase != .listening && self.phase != .speaking { return }
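// Adaptive noise floor: an EMA that tracks down fast (alpha 0.08) and up
// slowly (alpha 0.01), so loud speech does not inflate the floor. Speech is
// assumed once RMS exceeds the floor by speechBoostFactor (6x), clamped
// below by minSpeechRMS.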
let alpha: Double = rms < self.noiseFloorRMS ? 0.08 : 0.01
self.noiseFloorRMS = max(1e-7, self.noiseFloorRMS + (rms - self.noiseFloorRMS) * alpha)
let threshold = max(self.minSpeechRMS, self.noiseFloorRMS * self.speechBoostFactor)
if rms >= threshold {
let now = Date()
self.lastHeard = now
self.lastSpeechEnergyAt = now
}
if self.phase == .listening {
let clamped = min(1.0, max(0.0, rms / max(self.minSpeechRMS, threshold)))
await MainActor.run { TalkModeController.shared.updateLevel(clamped) }
}
}
private static func rmsLevel(buffer: AVAudioPCMBuffer) -> Double? {
guard let channelData = buffer.floatChannelData?.pointee else { return nil }
let frameCount = Int(buffer.frameLength)
guard frameCount > 0 else { return nil }
var sum: Double = 0
for i in 0..<frameCount {
let sample = Double(channelData[i])
sum += sample * sample
}
return sqrt(sum / Double(frameCount))
}
private func shouldInterrupt(transcript: String, segments: [SFTranscriptionSegment]) async -> Bool {
let trimmed = transcript.trimmingCharacters(in: .whitespacesAndNewlines)
guard trimmed.count >= 3 else { return false }
if self.isLikelyEcho(of: trimmed) { return false }
let now = Date()
if let lastSpeechEnergyAt, now.timeIntervalSince(lastSpeechEnergyAt) > 0.35 {
return false
}
let hasConfidence = segments.contains { $0.confidence > 0.6 }
return hasConfidence
}
private func isLikelyEcho(of transcript: String) -> Bool {
guard let spoken = self.lastSpokenText?.lowercased(), !spoken.isEmpty else { return false }
// Treat the transcript as echo when the assistant's own speech contains it.
return spoken.contains(transcript.lowercased())
}
private static func resolveSpeed(speed: Double?, rateWPM: Int?, logger: Logger) -> Double? {
if let rateWPM, rateWPM > 0 {
let resolved = Double(rateWPM) / 175.0
if resolved <= 0.5 || resolved >= 2.0 {
logger.warning("talk rateWPM out of range: \(rateWPM, privacy: .public)")
return nil
}
return resolved
}
if let speed {
if speed <= 0.5 || speed >= 2.0 {
logger.warning("talk speed out of range: \(speed, privacy: .public)")
return nil
}
return speed
}
return nil
}
private static func validatedUnit(_ value: Double?, name: String, logger: Logger) -> Double? {
guard let value else { return nil }
if value < 0 || value > 1 {
logger.warning("talk \(name, privacy: .public) out of range: \(value, privacy: .public)")
return nil
}
return value
}
private static func validatedSeed(_ value: Int?, logger: Logger) -> UInt32? {
guard let value else { return nil }
if value < 0 || value > 4294967295 {
logger.warning("talk seed out of range: \(value, privacy: .public)")
return nil
}
return UInt32(value)
}
private static func validatedNormalize(_ value: String?, logger: Logger) -> String? {
guard let value else { return nil }
let normalized = value.trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
guard ["auto", "on", "off"].contains(normalized) else {
logger.warning("talk normalize invalid: \(normalized, privacy: .public)")
return nil
}
return normalized
}
private static func validatedLanguage(_ value: String?, logger: Logger) -> String? {
guard let value else { return nil }
let normalized = value.trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
guard normalized.count == 2, normalized.allSatisfy({ $0 >= "a" && $0 <= "z" }) else {
logger.warning("talk language invalid: \(normalized, privacy: .public)")
return nil
}
return normalized
}
}
private struct ElevenLabsRequest {
let text: String
let modelId: String?
let outputFormat: String?
let speed: Double?
let stability: Double?
let similarity: Double?
let style: Double?
let speakerBoost: Bool?
let seed: UInt32?
let normalize: String?
let language: String?
}
private struct ElevenLabsClient {
let apiKey: String
let baseUrl: URL = URL(string: "https://api.elevenlabs.io")!
func synthesize(voiceId: String, request: ElevenLabsRequest) async throws -> Data {
var url = self.baseUrl
url.appendPathComponent("v1")
url.appendPathComponent("text-to-speech")
url.appendPathComponent(voiceId)
var payload: [String: Any] = [
"text": request.text,
]
if let modelId = request.modelId, !modelId.isEmpty {
payload["model_id"] = modelId
}
if let outputFormat = request.outputFormat, !outputFormat.isEmpty {
payload["output_format"] = outputFormat
}
if let seed = request.seed {
payload["seed"] = seed
}
if let normalize = request.normalize {
payload["apply_text_normalization"] = normalize
}
if let language = request.language {
payload["language_code"] = language
}
var voiceSettings: [String: Any] = [:]
if let speed = request.speed { voiceSettings["speed"] = speed }
if let stability = request.stability { voiceSettings["stability"] = stability }
if let similarity = request.similarity { voiceSettings["similarity_boost"] = similarity }
if let style = request.style { voiceSettings["style"] = style }
if let speakerBoost = request.speakerBoost { voiceSettings["use_speaker_boost"] = speakerBoost }
if !voiceSettings.isEmpty {
payload["voice_settings"] = voiceSettings
}
let body = try JSONSerialization.data(withJSONObject: payload, options: [])
var req = URLRequest(url: url)
req.httpMethod = "POST"
req.httpBody = body
req.setValue("application/json", forHTTPHeaderField: "Content-Type")
req.setValue("audio/mpeg", forHTTPHeaderField: "Accept")
req.setValue(self.apiKey, forHTTPHeaderField: "xi-api-key")
let (data, response) = try await URLSession.shared.data(for: req)
if let http = response as? HTTPURLResponse, http.statusCode >= 400 {
let message = String(data: data, encoding: .utf8) ?? "unknown"
throw NSError(domain: "TalkTTS", code: http.statusCode, userInfo: [
NSLocalizedDescriptionKey: "ElevenLabs failed: \(http.statusCode) \(message)",
])
}
return data
}
}
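// Illustrative usage sketch (assumptions: ELEVENLABS_API_KEY is set and
// "<voice-id>" is replaced with a real voice; not part of the runtime flow):
// let client = ElevenLabsClient(apiKey: apiKey)
// let audio = try await client.synthesize(
// voiceId: "<voice-id>",
// request: ElevenLabsRequest(text: "Hi!", modelId: nil, outputFormat: nil,
// speed: nil, stability: nil, similarity: nil, style: nil, speakerBoost: nil,
// seed: nil, normalize: nil, language: nil))
// _ = await TalkAudioPlayer.shared.play(data: audio)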

View File

@@ -0,0 +1,8 @@
import Foundation
enum TalkModePhase: String {
case idle
case listening
case thinking
case speaking
}

View File

@@ -0,0 +1,119 @@
import AppKit
import Observation
import OSLog
import SwiftUI
@MainActor
@Observable
final class TalkOverlayController {
static let shared = TalkOverlayController()
private let logger = Logger(subsystem: "com.steipete.clawdis", category: "talk.overlay")
struct Model {
var isVisible: Bool = false
var phase: TalkModePhase = .idle
var level: Double = 0
}
var model = Model()
private var window: NSPanel?
private var hostingView: NSHostingView<TalkOverlayView>?
private let width: CGFloat = 92
private let height: CGFloat = 92
private let padding: CGFloat = 8
func present() {
self.ensureWindow()
self.hostingView?.rootView = TalkOverlayView(controller: self)
let target = self.targetFrame()
guard let window else { return }
if !self.model.isVisible {
self.model.isVisible = true
let start = target.offsetBy(dx: 0, dy: -6)
window.setFrame(start, display: true)
window.alphaValue = 0
window.orderFrontRegardless()
NSAnimationContext.runAnimationGroup { context in
context.duration = 0.18
context.timingFunction = CAMediaTimingFunction(name: .easeOut)
window.animator().setFrame(target, display: true)
window.animator().alphaValue = 1
}
} else {
window.setFrame(target, display: true)
window.orderFrontRegardless()
}
}
func dismiss() {
guard let window else {
self.model.isVisible = false
return
}
let target = window.frame.offsetBy(dx: 6, dy: 6)
NSAnimationContext.runAnimationGroup { context in
context.duration = 0.16
context.timingFunction = CAMediaTimingFunction(name: .easeOut)
window.animator().setFrame(target, display: true)
window.animator().alphaValue = 0
} completionHandler: {
Task { @MainActor in
window.orderOut(nil)
self.model.isVisible = false
}
}
}
func updatePhase(_ phase: TalkModePhase) {
guard self.model.phase != phase else { return }
self.logger.info("talk overlay phase=\(phase.rawValue, privacy: .public)")
self.model.phase = phase
}
func updateLevel(_ level: Double) {
guard self.model.isVisible else { return }
self.model.level = max(0, min(1, level))
}
// MARK: - Private
private func ensureWindow() {
if self.window != nil { return }
let panel = NSPanel(
contentRect: NSRect(x: 0, y: 0, width: self.width, height: self.height),
styleMask: [.nonactivatingPanel, .borderless],
backing: .buffered,
defer: false)
panel.isOpaque = false
panel.backgroundColor = .clear
panel.hasShadow = false
panel.level = NSWindow.Level(rawValue: NSWindow.Level.popUpMenu.rawValue - 4)
panel.collectionBehavior = [.canJoinAllSpaces, .fullScreenAuxiliary, .transient]
panel.hidesOnDeactivate = false
panel.isMovable = false
panel.isFloatingPanel = true
panel.becomesKeyOnlyIfNeeded = true
panel.titleVisibility = .hidden
panel.titlebarAppearsTransparent = true
let host = NSHostingView(rootView: TalkOverlayView(controller: self))
host.translatesAutoresizingMaskIntoConstraints = false
panel.contentView = host
self.hostingView = host
self.window = panel
}
private func targetFrame() -> NSRect {
guard let screen = NSScreen.main else { return .zero }
let size = NSSize(width: self.width, height: self.height)
let visible = screen.visibleFrame
let origin = CGPoint(
x: visible.maxX - size.width - self.padding,
y: visible.maxY - size.height - self.padding)
return NSRect(origin: origin, size: size)
}
}

View File

@@ -0,0 +1,139 @@
import SwiftUI
struct TalkOverlayView: View {
var controller: TalkOverlayController
@State private var hovering = false
var body: some View {
ZStack(alignment: .topLeading) {
TalkCloudView(phase: self.controller.model.phase, level: self.controller.model.level)
.frame(width: 76, height: 64)
.contentShape(Rectangle())
.onTapGesture {
TalkModeController.shared.stopSpeaking(reason: .userTap)
}
.padding(8)
Button {
TalkModeController.shared.exitTalkMode()
} label: {
Image(systemName: "xmark")
.font(.system(size: 10, weight: .bold))
.foregroundStyle(Color.white.opacity(self.hovering ? 0.95 : 0.7))
.frame(width: 18, height: 18)
.background(Color.black.opacity(self.hovering ? 0.45 : 0.3))
.clipShape(Circle())
}
.buttonStyle(.plain)
.contentShape(Circle())
.padding(4)
.onHover { self.hovering = $0 }
}
.frame(width: 92, height: 92, alignment: .center)
}
}
private struct TalkCloudView: View {
let phase: TalkModePhase
let level: Double
var body: some View {
TimelineView(.animation) { context in
let t = context.date.timeIntervalSinceReferenceDate
let pulse = phase == .speaking ? (1 + 0.04 * sin(t * 6)) : 1
let sink = phase == .thinking ? (3 + 2 * sin(t * 2)) : 0
let listenScale = phase == .listening ? (1 + CGFloat(self.level) * 0.14) : 1
let baseScale = phase == .thinking ? 0.94 : 1
ZStack {
CloudShape()
.fill(self.cloudGradient)
.overlay(
CloudShape()
.stroke(Color.white.opacity(0.35), lineWidth: 0.8))
.shadow(color: Color.black.opacity(0.18), radius: 8, x: 0, y: 4)
.scaleEffect(baseScale * pulse * listenScale)
.offset(y: sink)
if phase == .listening {
Circle()
.stroke(self.ringGradient, lineWidth: 1)
.scaleEffect(1 + CGFloat(self.level) * 0.45)
.opacity(0.3 + CGFloat(self.level) * 0.4)
.animation(.easeOut(duration: 0.08), value: self.level)
}
if phase == .thinking {
TalkThinkingDots(time: t)
.offset(y: 18)
}
if phase == .speaking {
TalkSpeakingRings(time: t)
}
}
}
}
private var cloudGradient: LinearGradient {
LinearGradient(
colors: [Color(red: 0.95, green: 0.98, blue: 1.0), Color(red: 0.75, green: 0.88, blue: 1.0)],
startPoint: .topLeading,
endPoint: .bottomTrailing)
}
private var ringGradient: LinearGradient {
LinearGradient(
colors: [Color.white.opacity(0.6), Color.white.opacity(0.1)],
startPoint: .top,
endPoint: .bottom)
}
}
private struct TalkThinkingDots: View {
let time: TimeInterval
var body: some View {
HStack(spacing: 4) {
ForEach(0..<3, id: \.self) { idx in
let phase = (time * 2 + Double(idx) * 0.45).truncatingRemainder(dividingBy: 1)
Circle()
.fill(Color.white.opacity(0.75))
.frame(width: 5, height: 5)
.opacity(0.35 + 0.55 * phase)
}
}
}
}
private struct TalkSpeakingRings: View {
let time: TimeInterval
var body: some View {
ZStack {
ForEach(0..<3, id: \.self) { idx in
let phase = (time * 1.1 + Double(idx) / 3).truncatingRemainder(dividingBy: 1)
Circle()
.stroke(Color.white.opacity(0.6 - phase * 0.5), lineWidth: 1)
.scaleEffect(0.8 + phase * 0.7)
.opacity(0.6 - phase * 0.6)
}
}
}
}
private struct CloudShape: Shape {
func path(in rect: CGRect) -> Path {
let w = rect.width
let h = rect.height
let baseHeight = h * 0.44
let baseRect = CGRect(x: rect.minX, y: rect.minY + h * 0.46, width: w, height: baseHeight)
var path = Path()
path.addRoundedRect(in: baseRect, cornerSize: CGSize(width: baseHeight / 2, height: baseHeight / 2))
path.addEllipse(in: CGRect(x: rect.minX + w * 0.05, y: rect.minY + h * 0.28, width: w * 0.36, height: h * 0.36))
path.addEllipse(in: CGRect(x: rect.minX + w * 0.28, y: rect.minY + h * 0.05, width: w * 0.44, height: h * 0.44))
path.addEllipse(in: CGRect(x: rect.minX + w * 0.62, y: rect.minY + h * 0.3, width: w * 0.3, height: h * 0.3))
return path
}
}

View File

@@ -0,0 +1,194 @@
import Foundation
public struct TalkDirective: Equatable, Sendable {
public var voiceId: String?
public var modelId: String?
public var speed: Double?
public var rateWPM: Int?
public var stability: Double?
public var similarity: Double?
public var style: Double?
public var speakerBoost: Bool?
public var seed: Int?
public var normalize: String?
public var language: String?
public var outputFormat: String?
public var latencyTier: Int?
public var once: Bool?
public init(
voiceId: String? = nil,
modelId: String? = nil,
speed: Double? = nil,
rateWPM: Int? = nil,
stability: Double? = nil,
similarity: Double? = nil,
style: Double? = nil,
speakerBoost: Bool? = nil,
seed: Int? = nil,
normalize: String? = nil,
language: String? = nil,
outputFormat: String? = nil,
latencyTier: Int? = nil,
once: Bool? = nil)
{
self.voiceId = voiceId
self.modelId = modelId
self.speed = speed
self.rateWPM = rateWPM
self.stability = stability
self.similarity = similarity
self.style = style
self.speakerBoost = speakerBoost
self.seed = seed
self.normalize = normalize
self.language = language
self.outputFormat = outputFormat
self.latencyTier = latencyTier
self.once = once
}
}
public struct TalkDirectiveParseResult: Equatable, Sendable {
public let directive: TalkDirective?
public let stripped: String
public let unknownKeys: [String]
public init(directive: TalkDirective?, stripped: String, unknownKeys: [String]) {
self.directive = directive
self.stripped = stripped
self.unknownKeys = unknownKeys
}
}
public enum TalkDirectiveParser {
public static func parse(_ text: String) -> TalkDirectiveParseResult {
let normalized = text.replacingOccurrences(of: "\r\n", with: "\n")
var lines = normalized.split(separator: "\n", omittingEmptySubsequences: false)
guard !lines.isEmpty else { return TalkDirectiveParseResult(directive: nil, stripped: text, unknownKeys: []) }
guard let firstNonEmpty = lines.firstIndex(where: { !$0.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty })
else {
return TalkDirectiveParseResult(directive: nil, stripped: text, unknownKeys: [])
}
let head = lines[firstNonEmpty].trimmingCharacters(in: .whitespacesAndNewlines)
guard head.hasPrefix("{"), head.hasSuffix("}") else {
return TalkDirectiveParseResult(directive: nil, stripped: text, unknownKeys: [])
}
guard let data = head.data(using: .utf8),
let json = try? JSONSerialization.jsonObject(with: data) as? [String: Any]
else {
return TalkDirectiveParseResult(directive: nil, stripped: text, unknownKeys: [])
}
let speakerBoost = boolValue(json, keys: ["speaker_boost", "speakerBoost"])
?? boolValue(json, keys: ["no_speaker_boost", "noSpeakerBoost"]).map { !$0 }
let directive = TalkDirective(
voiceId: stringValue(json, keys: ["voice", "voice_id", "voiceId"]),
modelId: stringValue(json, keys: ["model", "model_id", "modelId"]),
speed: doubleValue(json, keys: ["speed"]),
rateWPM: intValue(json, keys: ["rate", "wpm"]),
stability: doubleValue(json, keys: ["stability"]),
similarity: doubleValue(json, keys: ["similarity", "similarity_boost", "similarityBoost"]),
style: doubleValue(json, keys: ["style"]),
speakerBoost: speakerBoost,
seed: intValue(json, keys: ["seed"]),
normalize: stringValue(json, keys: ["normalize", "apply_text_normalization"]),
language: stringValue(json, keys: ["lang", "language_code", "language"]),
outputFormat: stringValue(json, keys: ["output_format", "format"]),
latencyTier: intValue(json, keys: ["latency", "latency_tier", "latencyTier"]),
once: boolValue(json, keys: ["once"]))
// A directive exists only if at least one recognized field was set;
// TalkDirective() is the all-nil default and the type is Equatable.
let hasDirective = directive != TalkDirective()
guard hasDirective else {
return TalkDirectiveParseResult(directive: nil, stripped: text, unknownKeys: [])
}
let knownKeys = Set([
"voice", "voice_id", "voiceid",
"model", "model_id", "modelid",
"speed", "rate", "wpm",
"stability", "similarity", "similarity_boost", "similarityboost",
"style",
"speaker_boost", "speakerboost",
"no_speaker_boost", "nospeakerboost",
"seed",
"normalize", "apply_text_normalization",
"lang", "language_code", "language",
"output_format", "format",
"latency", "latency_tier", "latencytier",
"once",
])
let unknownKeys = json.keys.filter { !knownKeys.contains($0.lowercased()) }.sorted()
lines.remove(at: firstNonEmpty)
if firstNonEmpty < lines.count {
let next = lines[firstNonEmpty].trimmingCharacters(in: .whitespacesAndNewlines)
if next.isEmpty {
lines.remove(at: firstNonEmpty)
}
}
let stripped = lines.joined(separator: "\n")
return TalkDirectiveParseResult(directive: directive, stripped: stripped, unknownKeys: unknownKeys)
}
private static func stringValue(_ dict: [String: Any], keys: [String]) -> String? {
for key in keys {
if let value = dict[key] as? String {
let trimmed = value.trimmingCharacters(in: .whitespacesAndNewlines)
if !trimmed.isEmpty { return trimmed }
}
}
return nil
}
private static func doubleValue(_ dict: [String: Any], keys: [String]) -> Double? {
for key in keys {
if let value = dict[key] as? Double { return value }
if let value = dict[key] as? Int { return Double(value) }
if let value = dict[key] as? String, let parsed = Double(value) { return parsed }
}
return nil
}
private static func intValue(_ dict: [String: Any], keys: [String]) -> Int? {
for key in keys {
if let value = dict[key] as? Int { return value }
if let value = dict[key] as? Double { return Int(value) }
if let value = dict[key] as? String, let parsed = Int(value) { return parsed }
}
return nil
}
private static func boolValue(_ dict: [String: Any], keys: [String]) -> Bool? {
for key in keys {
if let value = dict[key] as? Bool { return value }
if let value = dict[key] as? String {
let trimmed = value.trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
if ["true", "yes", "1"].contains(trimmed) { return true }
if ["false", "no", "0"].contains(trimmed) { return false }
}
}
return nil
}
}
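// Example (illustrative): an assistant reply of
// {"voice":"<id>","speed":1.1}
// Sure, here you go.
// parses to TalkDirective(voiceId: "<id>", speed: 1.1); stripped becomes
// "Sure, here you go." and unrecognized keys surface in unknownKeys.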

View File

@@ -0,0 +1,62 @@
import XCTest
@testable import ClawdisKit
final class TalkDirectiveTests: XCTestCase {
func testParsesDirectiveAndStripsLine() {
let text = """
{"voice":"abc123","once":true}
Hello there.
"""
let result = TalkDirectiveParser.parse(text)
XCTAssertEqual(result.directive?.voiceId, "abc123")
XCTAssertEqual(result.directive?.once, true)
XCTAssertEqual(result.stripped, "Hello there.")
}
func testIgnoresNonDirective() {
let text = "Hello world."
let result = TalkDirectiveParser.parse(text)
XCTAssertNil(result.directive)
XCTAssertEqual(result.stripped, text)
}
func testKeepsDirectiveLineIfNoRecognizedFields() {
let text = """
{"unknown":"value"}
Hello.
"""
let result = TalkDirectiveParser.parse(text)
XCTAssertNil(result.directive)
XCTAssertEqual(result.stripped, text)
}
func testParsesExtendedOptions() {
let text = """
{"voice_id":"v1","model_id":"m1","rate":200,"stability":0.5,"similarity":0.8,"style":0.2,"speaker_boost":true,"seed":1234,"normalize":"auto","lang":"en","output_format":"mp3_44100_128"}
Hello.
"""
let result = TalkDirectiveParser.parse(text)
XCTAssertEqual(result.directive?.voiceId, "v1")
XCTAssertEqual(result.directive?.modelId, "m1")
XCTAssertEqual(result.directive?.rateWPM, 200)
XCTAssertEqual(result.directive?.stability, 0.5)
XCTAssertEqual(result.directive?.similarity, 0.8)
XCTAssertEqual(result.directive?.style, 0.2)
XCTAssertEqual(result.directive?.speakerBoost, true)
XCTAssertEqual(result.directive?.seed, 1234)
XCTAssertEqual(result.directive?.normalize, "auto")
XCTAssertEqual(result.directive?.language, "en")
XCTAssertEqual(result.directive?.outputFormat, "mp3_44100_128")
XCTAssertEqual(result.stripped, "Hello.")
}
func testTracksUnknownKeys() {
let text = """
{"voice":"abc","mystery":"value","extra":1}
Hi.
"""
let result = TalkDirectiveParser.parse(text)
XCTAssertEqual(result.directive?.voiceId, "abc")
XCTAssertEqual(result.unknownKeys, ["extra", "mystery"])
}
}