diff --git a/apps/android/app/src/main/java/com/steipete/clawdis/node/MainViewModel.kt b/apps/android/app/src/main/java/com/steipete/clawdis/node/MainViewModel.kt
index 28d702975..ee1c83c9b 100644
--- a/apps/android/app/src/main/java/com/steipete/clawdis/node/MainViewModel.kt
+++ b/apps/android/app/src/main/java/com/steipete/clawdis/node/MainViewModel.kt
@@ -35,6 +35,10 @@ class MainViewModel(app: Application) : AndroidViewModel(app) {
     val voiceWakeMode: StateFlow<VoiceWakeMode> = runtime.voiceWakeMode
     val voiceWakeStatusText: StateFlow<String> = runtime.voiceWakeStatusText
     val voiceWakeIsListening: StateFlow<Boolean> = runtime.voiceWakeIsListening
+    val talkEnabled: StateFlow<Boolean> = runtime.talkEnabled
+    val talkStatusText: StateFlow<String> = runtime.talkStatusText
+    val talkIsListening: StateFlow<Boolean> = runtime.talkIsListening
+    val talkIsSpeaking: StateFlow<Boolean> = runtime.talkIsSpeaking
     val manualEnabled: StateFlow<Boolean> = runtime.manualEnabled
     val manualHost: StateFlow<String> = runtime.manualHost
     val manualPort: StateFlow = runtime.manualPort
@@ -95,6 +99,10 @@ class MainViewModel(app: Application) : AndroidViewModel(app) {
         runtime.setVoiceWakeMode(mode)
     }
 
+    fun setTalkEnabled(enabled: Boolean) {
+        runtime.setTalkEnabled(enabled)
+    }
+
     fun connect(endpoint: BridgeEndpoint) {
         runtime.connect(endpoint)
     }
diff --git a/apps/android/app/src/main/java/com/steipete/clawdis/node/NodeRuntime.kt b/apps/android/app/src/main/java/com/steipete/clawdis/node/NodeRuntime.kt
index 0ade08e3b..4984f7e0f 100644
--- a/apps/android/app/src/main/java/com/steipete/clawdis/node/NodeRuntime.kt
+++ b/apps/android/app/src/main/java/com/steipete/clawdis/node/NodeRuntime.kt
@@ -25,6 +25,7 @@ import com.steipete.clawdis.node.protocol.ClawdisCanvasA2UIAction
 import com.steipete.clawdis.node.protocol.ClawdisCanvasA2UICommand
 import com.steipete.clawdis.node.protocol.ClawdisCanvasCommand
 import com.steipete.clawdis.node.protocol.ClawdisScreenCommand
+import com.steipete.clawdis.node.voice.TalkModeManager
 import com.steipete.clawdis.node.voice.VoiceWakeManager
 import kotlinx.coroutines.CoroutineScope
 import kotlinx.coroutines.Dispatchers
@@ -84,6 +85,15 @@ class NodeRuntime(context: Context) {
     val voiceWakeStatusText: StateFlow<String>
         get() = voiceWake.statusText
 
+    val talkStatusText: StateFlow<String>
+        get() = talkMode.statusText
+
+    val talkIsListening: StateFlow<Boolean>
+        get() = talkMode.isListening
+
+    val talkIsSpeaking: StateFlow<Boolean>
+        get() = talkMode.isSpeaking
+
     private val discovery = BridgeDiscovery(appContext, scope = scope)
     val bridges: StateFlow<List<BridgeEndpoint>> = discovery.bridges
     val discoveryStatusText: StateFlow<String> = discovery.statusText
@@ -133,6 +143,9 @@ class NodeRuntime(context: Context) {
     )
 
     private val chat = ChatController(scope = scope, session = session, json = json)
+    private val talkMode: TalkModeManager by lazy {
+        TalkModeManager(context = appContext, scope = scope).also { it.attachSession(session) }
+    }
 
     private fun handleSessionDisconnected(message: String) {
         _statusText.value = message
@@ -163,6 +176,7 @@ class NodeRuntime(context: Context) {
     val preventSleep: StateFlow<Boolean> = prefs.preventSleep
     val wakeWords: StateFlow<List<String>> = prefs.wakeWords
     val voiceWakeMode: StateFlow<VoiceWakeMode> = prefs.voiceWakeMode
+    val talkEnabled: StateFlow<Boolean> = prefs.talkEnabled
     val manualEnabled: StateFlow<Boolean> = prefs.manualEnabled
     val manualHost: StateFlow<String> = prefs.manualHost
     val manualPort: StateFlow = prefs.manualPort
@@ -218,6 +232,13 @@ class NodeRuntime(context: Context) {
             }
         }
 
+        scope.launch {
+            talkEnabled.collect { enabled ->
+                talkMode.setEnabled(enabled)
+                externalAudioCaptureActive.value = enabled
+            }
+        }
+
         scope.launch(Dispatchers.Default) {
             bridges.collect { list ->
                 if (list.isNotEmpty()) {
@@ -311,6 +332,10 @@ class NodeRuntime(context: Context) {
         prefs.setVoiceWakeMode(mode)
     }
 
+    fun setTalkEnabled(value: Boolean) {
+        prefs.setTalkEnabled(value)
+    }
+
     fun connect(endpoint: BridgeEndpoint) {
         scope.launch {
             _statusText.value = "Connecting…"
@@ -548,6 +573,7 @@ class NodeRuntime(context: Context) {
             return
         }
 
+        talkMode.handleBridgeEvent(event, payloadJson)
         chat.handleBridgeEvent(event, payloadJson)
     }
diff --git a/apps/android/app/src/main/java/com/steipete/clawdis/node/SecurePrefs.kt b/apps/android/app/src/main/java/com/steipete/clawdis/node/SecurePrefs.kt
index 8d7ceb0a2..b288ef29e 100644
--- a/apps/android/app/src/main/java/com/steipete/clawdis/node/SecurePrefs.kt
+++ b/apps/android/app/src/main/java/com/steipete/clawdis/node/SecurePrefs.kt
@@ -73,6 +73,9 @@ class SecurePrefs(context: Context) {
     private val _voiceWakeMode = MutableStateFlow(loadVoiceWakeMode())
     val voiceWakeMode: StateFlow<VoiceWakeMode> = _voiceWakeMode
 
+    private val _talkEnabled = MutableStateFlow(prefs.getBoolean("talk.enabled", false))
+    val talkEnabled: StateFlow<Boolean> = _talkEnabled
+
     fun setLastDiscoveredStableId(value: String) {
         val trimmed = value.trim()
         prefs.edit { putString("bridge.lastDiscoveredStableId", trimmed) }
@@ -158,6 +161,11 @@ class SecurePrefs(context: Context) {
         _voiceWakeMode.value = mode
     }
 
+    fun setTalkEnabled(value: Boolean) {
+        prefs.edit { putBoolean("talk.enabled", value) }
+        _talkEnabled.value = value
+    }
+
     private fun loadVoiceWakeMode(): VoiceWakeMode {
         val raw = prefs.getString(voiceWakeModeKey, null)
         val resolved = VoiceWakeMode.fromRawValue(raw)
diff --git a/apps/android/app/src/main/java/com/steipete/clawdis/node/ui/SettingsSheet.kt b/apps/android/app/src/main/java/com/steipete/clawdis/node/ui/SettingsSheet.kt
index c7d011892..2ec4a7119 100644
--- a/apps/android/app/src/main/java/com/steipete/clawdis/node/ui/SettingsSheet.kt
+++ b/apps/android/app/src/main/java/com/steipete/clawdis/node/ui/SettingsSheet.kt
@@ -62,6 +62,8 @@ fun SettingsSheet(viewModel: MainViewModel) {
     val wakeWords by viewModel.wakeWords.collectAsState()
     val voiceWakeMode by viewModel.voiceWakeMode.collectAsState()
     val voiceWakeStatusText by viewModel.voiceWakeStatusText.collectAsState()
+    val talkEnabled by viewModel.talkEnabled.collectAsState()
+    val talkStatusText by viewModel.talkStatusText.collectAsState()
     val isConnected by viewModel.isConnected.collectAsState()
     val manualEnabled by viewModel.manualEnabled.collectAsState()
     val manualHost by viewModel.manualHost.collectAsState()
@@ -307,6 +309,28 @@ fun SettingsSheet(viewModel: MainViewModel) {
 
         // Voice
         item { Text("Voice", style = MaterialTheme.typography.titleSmall) }
+        item {
+            ListItem(
+                headlineContent = { Text("Talk Mode") },
+                supportingContent = { Text(talkStatusText) },
+                trailingContent = {
+                    Switch(
+                        checked = talkEnabled,
+                        onCheckedChange = { on ->
+                            if (on) {
+                                val micOk =
+                                    ContextCompat.checkSelfPermission(context, Manifest.permission.RECORD_AUDIO) ==
+                                        PackageManager.PERMISSION_GRANTED
+                                if (!micOk) audioPermissionLauncher.launch(Manifest.permission.RECORD_AUDIO)
+                                viewModel.setTalkEnabled(true)
+                            } else {
+                                viewModel.setTalkEnabled(false)
+                            }
+                        },
+                    )
+                },
+            )
+        }
         item {
             val enabled = voiceWakeMode != VoiceWakeMode.Off
             ListItem(
diff --git a/apps/android/app/src/main/java/com/steipete/clawdis/node/voice/TalkDirectiveParser.kt b/apps/android/app/src/main/java/com/steipete/clawdis/node/voice/TalkDirectiveParser.kt
new file mode 100644
index 000000000..539f556ff
--- /dev/null
+++ b/apps/android/app/src/main/java/com/steipete/clawdis/node/voice/TalkDirectiveParser.kt
@@ -0,0 +1,194 @@
+package com.steipete.clawdis.node.voice
+
+import kotlinx.serialization.json.Json
+import kotlinx.serialization.json.JsonElement
+import kotlinx.serialization.json.JsonObject
+import kotlinx.serialization.json.JsonPrimitive
+
+private val directiveJson = Json { ignoreUnknownKeys = true }
+
+data class TalkDirective(
+    val voiceId: String? = null,
+    val modelId: String? = null,
+    val speed: Double? = null,
+    val rateWpm: Int? = null,
+    val stability: Double? = null,
+    val similarity: Double? = null,
+    val style: Double? = null,
+    val speakerBoost: Boolean? = null,
+    val seed: Long? = null,
+    val normalize: String? = null,
+    val language: String? = null,
+    val outputFormat: String? = null,
+    val latencyTier: Int? = null,
+    val once: Boolean? = null,
+)
+
+data class TalkDirectiveParseResult(
+    val directive: TalkDirective?,
+    val stripped: String,
+    val unknownKeys: List<String>,
+)
+
+object TalkDirectiveParser {
+    fun parse(text: String): TalkDirectiveParseResult {
+        val normalized = text.replace("\r\n", "\n")
+        val lines = normalized.split("\n").toMutableList()
+        if (lines.isEmpty()) return TalkDirectiveParseResult(null, text, emptyList())
+
+        val firstNonEmpty = lines.indexOfFirst { it.trim().isNotEmpty() }
+        if (firstNonEmpty == -1) return TalkDirectiveParseResult(null, text, emptyList())
+
+        val head = lines[firstNonEmpty].trim()
+        if (!head.startsWith("{") || !head.endsWith("}")) {
+            return TalkDirectiveParseResult(null, text, emptyList())
+        }
+
+        val obj = parseJsonObject(head) ?: return TalkDirectiveParseResult(null, text, emptyList())
+
+        val speakerBoost =
+            boolValue(obj, listOf("speaker_boost", "speakerBoost"))
+                ?: boolValue(obj, listOf("no_speaker_boost", "noSpeakerBoost"))?.not()
+
+        val directive = TalkDirective(
+            voiceId = stringValue(obj, listOf("voice", "voice_id", "voiceId")),
+            modelId = stringValue(obj, listOf("model", "model_id", "modelId")),
+            speed = doubleValue(obj, listOf("speed")),
+            rateWpm = intValue(obj, listOf("rate", "wpm")),
+            stability = doubleValue(obj, listOf("stability")),
+            similarity = doubleValue(obj, listOf("similarity", "similarity_boost", "similarityBoost")),
+            style = doubleValue(obj, listOf("style")),
+            speakerBoost = speakerBoost,
+            seed = longValue(obj, listOf("seed")),
+            normalize = stringValue(obj, listOf("normalize", "apply_text_normalization")),
+            language = stringValue(obj, listOf("lang", "language_code", "language")),
+            outputFormat = stringValue(obj, listOf("output_format", "format")),
+            latencyTier = intValue(obj, listOf("latency", "latency_tier", "latencyTier")),
+            once = boolValue(obj, listOf("once")),
+        )
+
+        val hasDirective = listOf(
+            directive.voiceId,
+            directive.modelId,
+            directive.speed,
+            directive.rateWpm,
+            directive.stability,
+            directive.similarity,
+            directive.style,
+            directive.speakerBoost,
+            directive.seed,
+            directive.normalize,
+            directive.language,
+            directive.outputFormat,
+            directive.latencyTier,
+            directive.once,
+        ).any { it != null }
+
+        if (!hasDirective) return TalkDirectiveParseResult(null, text, emptyList())
+
+        val knownKeys = setOf(
+            "voice", "voice_id", "voiceid",
+            "model", "model_id", "modelid",
+            "speed", "rate", "wpm",
+            "stability", "similarity", "similarity_boost", "similarityboost",
+            "style",
+            "speaker_boost", "speakerboost",
+            "no_speaker_boost", "nospeakerboost",
+            "seed",
+            "normalize", "apply_text_normalization",
+            "lang", "language_code", "language",
+            "output_format", "format",
+            "latency", "latency_tier", "latencytier",
+            "once",
+        )
+        val unknownKeys = obj.keys.filter { !knownKeys.contains(it.lowercase()) }.sorted()
+
+        lines.removeAt(firstNonEmpty)
+        if (firstNonEmpty < lines.size) {
+            if (lines[firstNonEmpty].trim().isEmpty()) {
+                lines.removeAt(firstNonEmpty)
+            }
+        }
+
+        return TalkDirectiveParseResult(directive, lines.joinToString("\n"), unknownKeys)
+    }
+
+    private fun parseJsonObject(line: String): JsonObject? {
+        return try {
+            directiveJson.parseToJsonElement(line) as? JsonObject
+        } catch (_: Throwable) {
+            null
+        }
+    }
+
+    private fun stringValue(obj: JsonObject, keys: List<String>): String? {
+        for (key in keys) {
+            val value = obj[key].asStringOrNull()?.trim()
+            if (!value.isNullOrEmpty()) return value
+        }
+        return null
+    }
+
+    private fun doubleValue(obj: JsonObject, keys: List<String>): Double? {
+        for (key in keys) {
+            val value = obj[key].asDoubleOrNull()
+            if (value != null) return value
+        }
+        return null
+    }
+
+    private fun intValue(obj: JsonObject, keys: List<String>): Int? {
+        for (key in keys) {
+            val value = obj[key].asIntOrNull()
+            if (value != null) return value
+        }
+        return null
+    }
+
+    private fun longValue(obj: JsonObject, keys: List<String>): Long? {
+        for (key in keys) {
+            val value = obj[key].asLongOrNull()
+            if (value != null) return value
+        }
+        return null
+    }
+
+    private fun boolValue(obj: JsonObject, keys: List<String>): Boolean? {
+        for (key in keys) {
+            val value = obj[key].asBooleanOrNull()
+            if (value != null) return value
+        }
+        return null
+    }
+}
+
+private fun JsonElement?.asStringOrNull(): String? = (this as? JsonPrimitive)?.contentOrNull
+
+private fun JsonElement?.asDoubleOrNull(): Double? {
+    val primitive = this as? JsonPrimitive ?: return null
+    if (primitive.isString) return primitive.content.toDoubleOrNull()
+    return primitive.doubleOrNull
+}
+
+private fun JsonElement?.asIntOrNull(): Int? {
+    val primitive = this as? JsonPrimitive ?: return null
+    if (primitive.isString) return primitive.content.toIntOrNull()
+    return primitive.intOrNull
+}
+
+private fun JsonElement?.asLongOrNull(): Long? {
+    val primitive = this as? JsonPrimitive ?: return null
+    if (primitive.isString) return primitive.content.toLongOrNull()
+    return primitive.longOrNull
+}
+
+private fun JsonElement?.asBooleanOrNull(): Boolean? {
+    val primitive = this as?
JsonPrimitive ?: return null + if (primitive.booleanOrNull != null) return primitive.booleanOrNull + val content = primitive.contentOrNull?.trim()?.lowercase() ?: return null + return when (content) { + "true", "yes", "1" -> true + "false", "no", "0" -> false + else -> null + } +} diff --git a/apps/android/app/src/main/java/com/steipete/clawdis/node/voice/TalkModeManager.kt b/apps/android/app/src/main/java/com/steipete/clawdis/node/voice/TalkModeManager.kt new file mode 100644 index 000000000..ecbc51869 --- /dev/null +++ b/apps/android/app/src/main/java/com/steipete/clawdis/node/voice/TalkModeManager.kt @@ -0,0 +1,713 @@ +package com.steipete.clawdis.node.voice + +import android.Manifest +import android.content.Context +import android.content.Intent +import android.content.pm.PackageManager +import android.media.AudioAttributes +import android.media.MediaPlayer +import android.os.Bundle +import android.os.Handler +import android.os.Looper +import android.os.SystemClock +import android.speech.RecognitionListener +import android.speech.RecognizerIntent +import android.speech.SpeechRecognizer +import android.util.Log +import androidx.core.content.ContextCompat +import com.steipete.clawdis.node.bridge.BridgeSession +import java.io.File +import java.net.HttpURLConnection +import java.net.URL +import java.util.UUID +import kotlinx.coroutines.CompletableDeferred +import kotlinx.coroutines.CoroutineScope +import kotlinx.coroutines.Dispatchers +import kotlinx.coroutines.Job +import kotlinx.coroutines.delay +import kotlinx.coroutines.flow.MutableStateFlow +import kotlinx.coroutines.flow.StateFlow +import kotlinx.coroutines.launch +import kotlinx.coroutines.withContext +import kotlinx.serialization.json.Json +import kotlinx.serialization.json.JsonArray +import kotlinx.serialization.json.JsonElement +import kotlinx.serialization.json.JsonObject +import kotlinx.serialization.json.JsonPrimitive +import kotlinx.serialization.json.buildJsonObject + +class TalkModeManager( + private val context: Context, + private val scope: CoroutineScope, +) { + companion object { + private const val tag = "TalkMode" + } + + private val mainHandler = Handler(Looper.getMainLooper()) + private val json = Json { ignoreUnknownKeys = true } + + private val _isEnabled = MutableStateFlow(false) + val isEnabled: StateFlow = _isEnabled + + private val _isListening = MutableStateFlow(false) + val isListening: StateFlow = _isListening + + private val _isSpeaking = MutableStateFlow(false) + val isSpeaking: StateFlow = _isSpeaking + + private val _statusText = MutableStateFlow("Off") + val statusText: StateFlow = _statusText + + private var recognizer: SpeechRecognizer? = null + private var restartJob: Job? = null + private var stopRequested = false + private var listeningMode = false + + private var silenceJob: Job? = null + private val silenceWindowMs = 700L + private var lastTranscript: String = "" + private var lastHeardAtMs: Long? = null + private var lastSpokenText: String? = null + private var lastInterruptedAtSeconds: Double? = null + + private var defaultVoiceId: String? = null + private var currentVoiceId: String? = null + private var defaultModelId: String? = null + private var currentModelId: String? = null + private var defaultOutputFormat: String? = null + private var interruptOnSpeech: Boolean = true + private var voiceOverrideActive = false + private var modelOverrideActive = false + + private var session: BridgeSession? = null + private var pendingRunId: String? = null + private var pendingFinal: CompletableDeferred? 
= null + + private var player: MediaPlayer? = null + private var currentAudioFile: File? = null + + fun attachSession(session: BridgeSession) { + this.session = session + } + + fun setEnabled(enabled: Boolean) { + if (_isEnabled.value == enabled) return + _isEnabled.value = enabled + if (enabled) { + start() + } else { + stop() + } + } + + fun handleBridgeEvent(event: String, payloadJson: String?) { + if (event != "chat") return + if (payloadJson.isNullOrBlank()) return + val pending = pendingRunId ?: return + val obj = + try { + json.parseToJsonElement(payloadJson).asObjectOrNull() + } catch (_: Throwable) { + null + } ?: return + val runId = obj["runId"].asStringOrNull() ?: return + if (runId != pending) return + val state = obj["state"].asStringOrNull() ?: return + if (state == "final") { + pendingFinal?.complete(true) + pendingFinal = null + pendingRunId = null + } + } + + private fun start() { + mainHandler.post { + if (_isListening.value) return@post + stopRequested = false + listeningMode = true + + if (!SpeechRecognizer.isRecognitionAvailable(context)) { + _statusText.value = "Speech recognizer unavailable" + return@post + } + + val micOk = + ContextCompat.checkSelfPermission(context, Manifest.permission.RECORD_AUDIO) == + PackageManager.PERMISSION_GRANTED + if (!micOk) { + _statusText.value = "Microphone permission required" + return@post + } + + try { + recognizer?.destroy() + recognizer = SpeechRecognizer.createSpeechRecognizer(context).also { it.setRecognitionListener(listener) } + startListeningInternal(markListening = true) + startSilenceMonitor() + } catch (err: Throwable) { + _statusText.value = "Start failed: ${err.message ?: err::class.simpleName}" + } + } + } + + private fun stop() { + stopRequested = true + listeningMode = false + restartJob?.cancel() + restartJob = null + silenceJob?.cancel() + silenceJob = null + lastTranscript = "" + lastHeardAtMs = null + _isListening.value = false + _statusText.value = "Off" + stopSpeaking() + + mainHandler.post { + recognizer?.cancel() + recognizer?.destroy() + recognizer = null + } + } + + private fun startListeningInternal(markListening: Boolean) { + val r = recognizer ?: return + val intent = + Intent(RecognizerIntent.ACTION_RECOGNIZE_SPEECH).apply { + putExtra(RecognizerIntent.EXTRA_LANGUAGE_MODEL, RecognizerIntent.LANGUAGE_MODEL_FREE_FORM) + putExtra(RecognizerIntent.EXTRA_PARTIAL_RESULTS, true) + putExtra(RecognizerIntent.EXTRA_MAX_RESULTS, 3) + putExtra(RecognizerIntent.EXTRA_CALLING_PACKAGE, context.packageName) + } + + if (markListening) { + _statusText.value = "Listening" + _isListening.value = true + } + r.startListening(intent) + } + + private fun scheduleRestart(delayMs: Long = 350) { + if (stopRequested) return + restartJob?.cancel() + restartJob = + scope.launch { + delay(delayMs) + mainHandler.post { + if (stopRequested) return@post + try { + recognizer?.cancel() + val shouldListen = listeningMode + val shouldInterrupt = _isSpeaking.value && interruptOnSpeech + if (!shouldListen && !shouldInterrupt) return@post + startListeningInternal(markListening = shouldListen) + } catch (_: Throwable) { + // handled by onError + } + } + } + } + + private fun handleTranscript(text: String, isFinal: Boolean) { + val trimmed = text.trim() + if (_isSpeaking.value && interruptOnSpeech) { + if (shouldInterrupt(trimmed)) { + stopSpeaking() + } + return + } + + if (!_isListening.value) return + + if (trimmed.isNotEmpty()) { + lastTranscript = trimmed + lastHeardAtMs = SystemClock.elapsedRealtime() + } + + if (isFinal) { + 
lastTranscript = trimmed + } + } + + private fun startSilenceMonitor() { + silenceJob?.cancel() + silenceJob = + scope.launch { + while (_isEnabled.value) { + delay(200) + checkSilence() + } + } + } + + private fun checkSilence() { + if (!_isListening.value) return + val transcript = lastTranscript.trim() + if (transcript.isEmpty()) return + val lastHeard = lastHeardAtMs ?: return + val elapsed = SystemClock.elapsedRealtime() - lastHeard + if (elapsed < silenceWindowMs) return + scope.launch { finalizeTranscript(transcript) } + } + + private suspend fun finalizeTranscript(transcript: String) { + listeningMode = false + _isListening.value = false + _statusText.value = "Thinking…" + lastTranscript = "" + lastHeardAtMs = null + + reloadConfig() + val prompt = buildPrompt(transcript) + val bridge = session + if (bridge == null) { + _statusText.value = "Bridge not connected" + start() + return + } + + try { + val runId = sendChat(prompt, bridge) + val ok = waitForChatFinal(runId) + if (!ok) { + _statusText.value = "No reply" + start() + return + } + val assistant = fetchLatestAssistantText(bridge) + if (assistant.isNullOrBlank()) { + _statusText.value = "No reply" + start() + return + } + playAssistant(assistant) + } catch (err: Throwable) { + _statusText.value = "Talk failed: ${err.message ?: err::class.simpleName}" + } + + if (_isEnabled.value) { + start() + } + } + + private fun buildPrompt(transcript: String): String { + val lines = mutableListOf( + "Talk Mode active. Reply in a concise, spoken tone.", + "You may optionally prefix the response with JSON (first line) to set ElevenLabs voice, e.g. {\"voice\":\"\",\"once\":true}.", + ) + lastInterruptedAtSeconds?.let { + lines.add("Assistant speech interrupted at ${"%.1f".format(it)}s.") + lastInterruptedAtSeconds = null + } + lines.add("") + lines.add(transcript) + return lines.joinToString("\n") + } + + private suspend fun sendChat(message: String, bridge: BridgeSession): String { + val runId = UUID.randomUUID().toString() + val params = + buildJsonObject { + put("sessionKey", JsonPrimitive("main")) + put("message", JsonPrimitive(message)) + put("thinking", JsonPrimitive("low")) + put("timeoutMs", JsonPrimitive(30_000)) + put("idempotencyKey", JsonPrimitive(runId)) + } + val res = bridge.request("chat.send", params.toString()) + val parsed = parseRunId(res) ?: runId + if (parsed != runId) { + pendingRunId = parsed + } + return parsed + } + + private suspend fun waitForChatFinal(runId: String): Boolean { + pendingFinal?.cancel() + val deferred = CompletableDeferred() + pendingRunId = runId + pendingFinal = deferred + + val result = + withContext(Dispatchers.IO) { + try { + kotlinx.coroutines.withTimeout(120_000) { deferred.await() } + } catch (_: Throwable) { + false + } + } + + if (!result) { + pendingFinal = null + pendingRunId = null + } + return result + } + + private suspend fun fetchLatestAssistantText(bridge: BridgeSession): String? { + val res = bridge.request("chat.history", "{\"sessionKey\":\"main\"}") + val root = json.parseToJsonElement(res).asObjectOrNull() ?: return null + val messages = root["messages"] as? JsonArray ?: return null + for (item in messages.reversed()) { + val obj = item.asObjectOrNull() ?: continue + if (obj["role"].asStringOrNull() != "assistant") continue + val content = obj["content"] as? 
JsonArray ?: continue + val text = + content.mapNotNull { entry -> + entry.asObjectOrNull()?.get("text")?.asStringOrNull()?.trim() + }.filter { it.isNotEmpty() } + if (text.isNotEmpty()) return text.joinToString("\n") + } + return null + } + + private suspend fun playAssistant(text: String) { + val parsed = TalkDirectiveParser.parse(text) + if (parsed.unknownKeys.isNotEmpty()) { + Log.w(tag, "Unknown talk directive keys: ${parsed.unknownKeys}") + } + val directive = parsed.directive + val cleaned = parsed.stripped.trim() + if (cleaned.isEmpty()) return + + if (directive?.voiceId != null) { + if (directive.once != true) { + currentVoiceId = directive.voiceId + voiceOverrideActive = true + } + } + if (directive?.modelId != null) { + if (directive.once != true) { + currentModelId = directive.modelId + modelOverrideActive = true + } + } + + val voiceId = directive?.voiceId ?: currentVoiceId ?: defaultVoiceId + if (voiceId.isNullOrBlank()) { + _statusText.value = "Missing voice ID" + return + } + + val apiKey = System.getenv("ELEVENLABS_API_KEY")?.trim() + if (apiKey.isNullOrEmpty()) { + _statusText.value = "Missing ELEVENLABS_API_KEY" + return + } + + _statusText.value = "Speaking…" + _isSpeaking.value = true + lastSpokenText = cleaned + ensureInterruptListener() + + try { + val request = + ElevenLabsRequest( + text = cleaned, + modelId = directive?.modelId ?: currentModelId ?: defaultModelId, + outputFormat = directive?.outputFormat ?: defaultOutputFormat, + speed = TalkModeRuntime.resolveSpeed(directive?.speed, directive?.rateWpm), + stability = TalkModeRuntime.validatedUnit(directive?.stability), + similarity = TalkModeRuntime.validatedUnit(directive?.similarity), + style = TalkModeRuntime.validatedUnit(directive?.style), + speakerBoost = directive?.speakerBoost, + seed = TalkModeRuntime.validatedSeed(directive?.seed), + normalize = TalkModeRuntime.validatedNormalize(directive?.normalize), + language = TalkModeRuntime.validatedLanguage(directive?.language), + ) + val audio = synthesize(voiceId = voiceId, apiKey = apiKey, request = request) + playAudio(audio) + } catch (err: Throwable) { + _statusText.value = "Speak failed: ${err.message ?: err::class.simpleName}" + } + + _isSpeaking.value = false + } + + private suspend fun playAudio(data: ByteArray) { + stopSpeaking(resetInterrupt = false) + val file = File.createTempFile("talk-", ".mp3", context.cacheDir) + file.writeBytes(data) + currentAudioFile = file + + val player = MediaPlayer() + this.player = player + + val finished = CompletableDeferred() + player.setAudioAttributes( + AudioAttributes.Builder() + .setContentType(AudioAttributes.CONTENT_TYPE_SPEECH) + .setUsage(AudioAttributes.USAGE_ASSISTANT) + .build(), + ) + player.setOnCompletionListener { + finished.complete(Unit) + } + player.setOnErrorListener { _, _, _ -> + finished.completeExceptionally(IllegalStateException("MediaPlayer error")) + true + } + + player.setDataSource(file.absolutePath) + withContext(Dispatchers.Main) { + player.setOnPreparedListener { it.start() } + player.prepareAsync() + } + + try { + finished.await() + } finally { + cleanupPlayer() + } + } + + private fun stopSpeaking(resetInterrupt: Boolean = true) { + if (!_isSpeaking.value) { + cleanupPlayer() + return + } + if (resetInterrupt) { + val currentMs = player?.currentPosition?.toDouble() ?: 0.0 + lastInterruptedAtSeconds = currentMs / 1000.0 + } + cleanupPlayer() + _isSpeaking.value = false + } + + private fun cleanupPlayer() { + player?.stop() + player?.release() + player = null + 
currentAudioFile?.delete() + currentAudioFile = null + } + + private fun shouldInterrupt(transcript: String): Boolean { + val trimmed = transcript.trim() + if (trimmed.length < 3) return false + val spoken = lastSpokenText?.lowercase() + if (spoken != null && spoken.contains(trimmed.lowercase())) return false + return true + } + + private suspend fun reloadConfig() { + val bridge = session ?: return + val envVoice = System.getenv("ELEVENLABS_VOICE_ID")?.trim() + val sagVoice = System.getenv("SAG_VOICE_ID")?.trim() + try { + val res = bridge.request("config.get", "{}") + val root = json.parseToJsonElement(res).asObjectOrNull() + val config = root?.get("config").asObjectOrNull() + val talk = config?.get("talk").asObjectOrNull() + val voice = talk?.get("voiceId")?.asStringOrNull()?.trim()?.takeIf { it.isNotEmpty() } + val model = talk?.get("modelId")?.asStringOrNull()?.trim()?.takeIf { it.isNotEmpty() } + val outputFormat = talk?.get("outputFormat")?.asStringOrNull()?.trim()?.takeIf { it.isNotEmpty() } + val interrupt = talk?.get("interruptOnSpeech")?.asBooleanOrNull() + + defaultVoiceId = voice ?: envVoice?.takeIf { it.isNotEmpty() } ?: sagVoice?.takeIf { it.isNotEmpty() } + if (!voiceOverrideActive) currentVoiceId = defaultVoiceId + defaultModelId = model + if (!modelOverrideActive) currentModelId = defaultModelId + defaultOutputFormat = outputFormat + if (interrupt != null) interruptOnSpeech = interrupt + } catch (_: Throwable) { + defaultVoiceId = envVoice?.takeIf { it.isNotEmpty() } ?: sagVoice?.takeIf { it.isNotEmpty() } + } + } + + private fun parseRunId(jsonString: String): String? { + val obj = json.parseToJsonElement(jsonString).asObjectOrNull() ?: return null + return obj["runId"].asStringOrNull() + } + + private suspend fun synthesize(voiceId: String, apiKey: String, request: ElevenLabsRequest): ByteArray { + return withContext(Dispatchers.IO) { + val url = URL("https://api.elevenlabs.io/v1/text-to-speech/$voiceId") + val conn = url.openConnection() as HttpURLConnection + conn.requestMethod = "POST" + conn.setRequestProperty("Content-Type", "application/json") + conn.setRequestProperty("Accept", "audio/mpeg") + conn.setRequestProperty("xi-api-key", apiKey) + conn.doOutput = true + + val payload = buildRequestPayload(request) + conn.outputStream.use { it.write(payload.toByteArray()) } + + val code = conn.responseCode + val stream = if (code >= 400) conn.errorStream else conn.inputStream + val data = stream.readBytes() + if (code >= 400) { + val message = String(data) + throw IllegalStateException("ElevenLabs failed: $code $message") + } + data + } + } + + private fun buildRequestPayload(request: ElevenLabsRequest): String { + val voiceSettingsEntries = + buildJsonObject { + request.speed?.let { put("speed", JsonPrimitive(it)) } + request.stability?.let { put("stability", JsonPrimitive(it)) } + request.similarity?.let { put("similarity_boost", JsonPrimitive(it)) } + request.style?.let { put("style", JsonPrimitive(it)) } + request.speakerBoost?.let { put("use_speaker_boost", JsonPrimitive(it)) } + } + + val payload = + buildJsonObject { + put("text", JsonPrimitive(request.text)) + request.modelId?.takeIf { it.isNotEmpty() }?.let { put("model_id", JsonPrimitive(it)) } + request.outputFormat?.takeIf { it.isNotEmpty() }?.let { put("output_format", JsonPrimitive(it)) } + request.seed?.let { put("seed", JsonPrimitive(it)) } + request.normalize?.let { put("apply_text_normalization", JsonPrimitive(it)) } + request.language?.let { put("language_code", JsonPrimitive(it)) } + if 
(voiceSettingsEntries.isNotEmpty()) { + put("voice_settings", voiceSettingsEntries) + } + } + + return payload.toString() + } + + private data class ElevenLabsRequest( + val text: String, + val modelId: String?, + val outputFormat: String?, + val speed: Double?, + val stability: Double?, + val similarity: Double?, + val style: Double?, + val speakerBoost: Boolean?, + val seed: Long?, + val normalize: String?, + val language: String?, + ) + + private object TalkModeRuntime { + fun resolveSpeed(speed: Double?, rateWpm: Int?): Double? { + if (rateWpm != null && rateWpm > 0) { + val resolved = rateWpm.toDouble() / 175.0 + if (resolved <= 0.5 || resolved >= 2.0) return null + return resolved + } + if (speed != null) { + if (speed <= 0.5 || speed >= 2.0) return null + return speed + } + return null + } + + fun validatedUnit(value: Double?): Double? { + if (value == null) return null + if (value < 0 || value > 1) return null + return value + } + + fun validatedSeed(value: Long?): Long? { + if (value == null) return null + if (value < 0 || value > 4294967295L) return null + return value + } + + fun validatedNormalize(value: String?): String? { + val normalized = value?.trim()?.lowercase() ?: return null + return if (normalized in listOf("auto", "on", "off")) normalized else null + } + + fun validatedLanguage(value: String?): String? { + val normalized = value?.trim()?.lowercase() ?: return null + if (normalized.length != 2) return null + if (!normalized.all { it in 'a'..'z' }) return null + return normalized + } + } + + private fun ensureInterruptListener() { + if (!interruptOnSpeech || !_isEnabled.value) return + mainHandler.post { + if (stopRequested) return@post + if (!SpeechRecognizer.isRecognitionAvailable(context)) return@post + try { + if (recognizer == null) { + recognizer = SpeechRecognizer.createSpeechRecognizer(context).also { it.setRecognitionListener(listener) } + } + recognizer?.cancel() + startListeningInternal(markListening = false) + } catch (_: Throwable) { + // ignore + } + } + } + + private val listener = + object : RecognitionListener { + override fun onReadyForSpeech(params: Bundle?) { + if (_isEnabled.value) { + _statusText.value = if (_isListening.value) "Listening" else _statusText.value + } + } + + override fun onBeginningOfSpeech() {} + + override fun onRmsChanged(rmsdB: Float) {} + + override fun onBufferReceived(buffer: ByteArray?) {} + + override fun onEndOfSpeech() { + scheduleRestart() + } + + override fun onError(error: Int) { + if (stopRequested) return + _isListening.value = false + if (error == SpeechRecognizer.ERROR_INSUFFICIENT_PERMISSIONS) { + _statusText.value = "Microphone permission required" + return + } + + _statusText.value = + when (error) { + SpeechRecognizer.ERROR_AUDIO -> "Audio error" + SpeechRecognizer.ERROR_CLIENT -> "Client error" + SpeechRecognizer.ERROR_NETWORK -> "Network error" + SpeechRecognizer.ERROR_NETWORK_TIMEOUT -> "Network timeout" + SpeechRecognizer.ERROR_NO_MATCH -> "Listening" + SpeechRecognizer.ERROR_RECOGNIZER_BUSY -> "Recognizer busy" + SpeechRecognizer.ERROR_SERVER -> "Server error" + SpeechRecognizer.ERROR_SPEECH_TIMEOUT -> "Listening" + else -> "Speech error ($error)" + } + scheduleRestart(delayMs = 600) + } + + override fun onResults(results: Bundle?) { + val list = results?.getStringArrayList(SpeechRecognizer.RESULTS_RECOGNITION).orEmpty() + list.firstOrNull()?.let { handleTranscript(it, isFinal = true) } + scheduleRestart() + } + + override fun onPartialResults(partialResults: Bundle?) 
{ + val list = partialResults?.getStringArrayList(SpeechRecognizer.RESULTS_RECOGNITION).orEmpty() + list.firstOrNull()?.let { handleTranscript(it, isFinal = false) } + } + + override fun onEvent(eventType: Int, params: Bundle?) {} + } +} + +private fun JsonElement?.asObjectOrNull(): JsonObject? = this as? JsonObject + +private fun JsonElement?.asStringOrNull(): String? = (this as? JsonPrimitive)?.contentOrNull + +private fun JsonElement?.asBooleanOrNull(): Boolean? { + val primitive = this as? JsonPrimitive ?: return null + if (primitive.booleanOrNull != null) return primitive.booleanOrNull + val content = primitive.contentOrNull?.trim()?.lowercase() ?: return null + return when (content) { + "true", "yes", "1" -> true + "false", "no", "0" -> false + else -> null + } +} diff --git a/apps/android/app/src/test/java/com/steipete/clawdis/node/voice/TalkDirectiveParserTest.kt b/apps/android/app/src/test/java/com/steipete/clawdis/node/voice/TalkDirectiveParserTest.kt new file mode 100644 index 000000000..d69d2008f --- /dev/null +++ b/apps/android/app/src/test/java/com/steipete/clawdis/node/voice/TalkDirectiveParserTest.kt @@ -0,0 +1,55 @@ +package com.steipete.clawdis.node.voice + +import org.junit.Assert.assertEquals +import org.junit.Assert.assertNull +import org.junit.Assert.assertTrue +import org.junit.Test + +class TalkDirectiveParserTest { + @Test + fun parsesDirectiveAndStripsHeader() { + val input = """ + {"voice":"voice-123","once":true} + Hello from talk mode. + """.trimIndent() + val result = TalkDirectiveParser.parse(input) + assertEquals("voice-123", result.directive?.voiceId) + assertEquals(true, result.directive?.once) + assertEquals("Hello from talk mode.", result.stripped.trim()) + } + + @Test + fun ignoresUnknownKeysButReportsThem() { + val input = """ + {"voice":"abc","foo":1,"bar":"baz"} + Hi there. + """.trimIndent() + val result = TalkDirectiveParser.parse(input) + assertEquals("abc", result.directive?.voiceId) + assertTrue(result.unknownKeys.containsAll(listOf("bar", "foo"))) + } + + @Test + fun parsesAlternateKeys() { + val input = """ + {"model_id":"eleven_v3","similarity_boost":0.4,"no_speaker_boost":true,"rate":200} + Speak. + """.trimIndent() + val result = TalkDirectiveParser.parse(input) + assertEquals("eleven_v3", result.directive?.modelId) + assertEquals(0.4, result.directive?.similarity) + assertEquals(false, result.directive?.speakerBoost) + assertEquals(200, result.directive?.rateWpm) + } + + @Test + fun returnsNullWhenNoDirectivePresent() { + val input = """ + {} + Hello. + """.trimIndent() + val result = TalkDirectiveParser.parse(input) + assertNull(result.directive) + assertEquals(input, result.stripped) + } +} diff --git a/apps/ios/Sources/Model/NodeAppModel.swift b/apps/ios/Sources/Model/NodeAppModel.swift index 36b9345e1..4c491ea55 100644 --- a/apps/ios/Sources/Model/NodeAppModel.swift +++ b/apps/ios/Sources/Model/NodeAppModel.swift @@ -28,6 +28,7 @@ final class NodeAppModel { private var voiceWakeSyncTask: Task? @ObservationIgnored private var cameraHUDDismissTask: Task? let voiceWake = VoiceWakeManager() + let talkMode = TalkModeManager() private var lastAutoA2uiURL: String? 
var bridgeSession: BridgeSession { self.bridge } @@ -49,6 +50,9 @@ final class NodeAppModel { let enabled = UserDefaults.standard.bool(forKey: "voiceWake.enabled") self.voiceWake.setEnabled(enabled) + self.talkMode.attachBridge(self.bridge) + let talkEnabled = UserDefaults.standard.bool(forKey: "talk.enabled") + self.talkMode.setEnabled(talkEnabled) // Wire up deep links from canvas taps self.screen.onDeepLink = { [weak self] url in @@ -177,6 +181,10 @@ final class NodeAppModel { self.voiceWake.setEnabled(enabled) } + func setTalkEnabled(_ enabled: Bool) { + self.talkMode.setEnabled(enabled) + } + func connectToBridge( endpoint: NWEndpoint, hello: BridgeHello) diff --git a/apps/ios/Sources/Settings/SettingsTab.swift b/apps/ios/Sources/Settings/SettingsTab.swift index 34feee23a..265b7069c 100644 --- a/apps/ios/Sources/Settings/SettingsTab.swift +++ b/apps/ios/Sources/Settings/SettingsTab.swift @@ -20,6 +20,7 @@ struct SettingsTab: View { @AppStorage("node.displayName") private var displayName: String = "iOS Node" @AppStorage("node.instanceId") private var instanceId: String = UUID().uuidString @AppStorage("voiceWake.enabled") private var voiceWakeEnabled: Bool = false + @AppStorage("talk.enabled") private var talkEnabled: Bool = false @AppStorage("camera.enabled") private var cameraEnabled: Bool = true @AppStorage("screen.preventSleep") private var preventSleep: Bool = true @AppStorage("bridge.preferredStableID") private var preferredBridgeStableID: String = "" @@ -156,6 +157,10 @@ struct SettingsTab: View { .onChange(of: self.voiceWakeEnabled) { _, newValue in self.appModel.setVoiceWakeEnabled(newValue) } + Toggle("Talk Mode", isOn: self.$talkEnabled) + .onChange(of: self.talkEnabled) { _, newValue in + self.appModel.setTalkEnabled(newValue) + } NavigationLink { VoiceWakeWordsSettingsView() diff --git a/apps/ios/Sources/Voice/TalkModeManager.swift b/apps/ios/Sources/Voice/TalkModeManager.swift new file mode 100644 index 000000000..649eaa03a --- /dev/null +++ b/apps/ios/Sources/Voice/TalkModeManager.swift @@ -0,0 +1,518 @@ +import AVFAudio +import ClawdisKit +import Foundation +import Observation +import Speech + +@MainActor +@Observable +final class TalkModeManager: NSObject { + var isEnabled: Bool = false + var isListening: Bool = false + var isSpeaking: Bool = false + var statusText: String = "Off" + + private let audioEngine = AVAudioEngine() + private var speechRecognizer: SFSpeechRecognizer? + private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest? + private var recognitionTask: SFSpeechRecognitionTask? + private var silenceTask: Task? + + private var lastHeard: Date? + private var lastTranscript: String = "" + private var lastSpokenText: String? + private var lastInterruptedAtSeconds: Double? + + private var defaultVoiceId: String? + private var currentVoiceId: String? + private var defaultModelId: String? + private var currentModelId: String? + private var defaultOutputFormat: String? + private var interruptOnSpeech: Bool = true + + private var bridge: BridgeSession? + private let silenceWindow: TimeInterval = 0.7 + + private var player: AVAudioPlayer? 
+ + func attachBridge(_ bridge: BridgeSession) { + self.bridge = bridge + } + + func setEnabled(_ enabled: Bool) { + self.isEnabled = enabled + if enabled { + Task { await self.start() } + } else { + self.stop() + } + } + + func start() async { + guard self.isEnabled else { return } + if self.isListening { return } + + self.statusText = "Requesting permissions…" + let micOk = await Self.requestMicrophonePermission() + guard micOk else { + self.statusText = "Microphone permission denied" + return + } + let speechOk = await Self.requestSpeechPermission() + guard speechOk else { + self.statusText = "Speech recognition permission denied" + return + } + + await self.reloadConfig() + do { + try Self.configureAudioSession() + try self.startRecognition() + self.isListening = true + self.statusText = "Listening" + self.startSilenceMonitor() + } catch { + self.isListening = false + self.statusText = "Start failed: \(error.localizedDescription)" + } + } + + func stop() { + self.isEnabled = false + self.isListening = false + self.statusText = "Off" + self.lastTranscript = "" + self.lastHeard = nil + self.silenceTask?.cancel() + self.silenceTask = nil + self.stopRecognition() + self.stopSpeaking() + } + + private func startRecognition() throws { + self.speechRecognizer = SFSpeechRecognizer() + guard let recognizer = self.speechRecognizer else { + throw NSError(domain: "TalkMode", code: 1, userInfo: [ + NSLocalizedDescriptionKey: "Speech recognizer unavailable", + ]) + } + + self.recognitionRequest = SFSpeechAudioBufferRecognitionRequest() + self.recognitionRequest?.shouldReportPartialResults = true + guard let request = self.recognitionRequest else { return } + + let input = self.audioEngine.inputNode + let format = input.outputFormat(forBus: 0) + input.removeTap(onBus: 0) + input.installTap(onBus: 0, bufferSize: 2048, format: format) { [weak request] buffer, _ in + request?.append(buffer) + } + + self.audioEngine.prepare() + try self.audioEngine.start() + + self.recognitionTask = recognizer.recognitionTask(with: request) { [weak self] result, error in + guard let self else { return } + if let error { + self.statusText = "Speech error: \(error.localizedDescription)" + } + guard let result else { return } + let transcript = result.bestTranscription.formattedString + Task { @MainActor in + await self.handleTranscript(transcript: transcript, isFinal: result.isFinal) + } + } + } + + private func stopRecognition() { + self.recognitionTask?.cancel() + self.recognitionTask = nil + self.recognitionRequest?.endAudio() + self.recognitionRequest = nil + self.audioEngine.inputNode.removeTap(onBus: 0) + self.audioEngine.stop() + self.speechRecognizer = nil + } + + private func handleTranscript(transcript: String, isFinal: Bool) async { + let trimmed = transcript.trimmingCharacters(in: .whitespacesAndNewlines) + if self.isSpeaking, self.interruptOnSpeech { + if self.shouldInterrupt(with: trimmed) { + self.stopSpeaking() + } + return + } + + guard self.isListening else { return } + if !trimmed.isEmpty { + self.lastTranscript = trimmed + self.lastHeard = Date() + } + if isFinal { + self.lastTranscript = trimmed + } + } + + private func startSilenceMonitor() { + self.silenceTask?.cancel() + self.silenceTask = Task { [weak self] in + guard let self else { return } + while self.isEnabled { + try? 
await Task.sleep(nanoseconds: 200_000_000) + await self.checkSilence() + } + } + } + + private func checkSilence() async { + guard self.isListening else { return } + let transcript = self.lastTranscript.trimmingCharacters(in: .whitespacesAndNewlines) + guard !transcript.isEmpty else { return } + guard let lastHeard else { return } + if Date().timeIntervalSince(lastHeard) < self.silenceWindow { return } + await self.finalizeTranscript(transcript) + } + + private func finalizeTranscript(_ transcript: String) async { + self.isListening = false + self.statusText = "Thinking…" + self.lastTranscript = "" + self.lastHeard = nil + self.stopRecognition() + + await self.reloadConfig() + let prompt = self.buildPrompt(transcript: transcript) + guard let bridge else { + self.statusText = "Bridge not connected" + await self.start() + return + } + + do { + let runId = try await self.sendChat(prompt, bridge: bridge) + let ok = await self.waitForChatFinal(runId: runId, bridge: bridge) + if !ok { + self.statusText = "No reply" + await self.start() + return + } + + guard let assistantText = try await self.fetchLatestAssistantText(bridge: bridge) else { + self.statusText = "No reply" + await self.start() + return + } + await self.playAssistant(text: assistantText) + } catch { + self.statusText = "Talk failed: \(error.localizedDescription)" + } + + await self.start() + } + + private func buildPrompt(transcript: String) -> String { + var lines: [String] = [ + "Talk Mode active. Reply in a concise, spoken tone.", + "You may optionally prefix the response with JSON (first line) to set ElevenLabs voice, e.g. {\"voice\":\"\",\"once\":true}.", + ] + + if let interrupted = self.lastInterruptedAtSeconds { + let formatted = String(format: "%.1f", interrupted) + lines.append("Assistant speech interrupted at \(formatted)s.") + self.lastInterruptedAtSeconds = nil + } + + lines.append("") + lines.append(transcript) + return lines.joined(separator: "\n") + } + + private func sendChat(_ message: String, bridge: BridgeSession) async throws -> String { + struct SendResponse: Decodable { let runId: String } + let payload: [String: Any] = [ + "sessionKey": "main", + "message": message, + "thinking": "low", + "timeoutMs": 30_000, + "idempotencyKey": UUID().uuidString, + ] + let data = try JSONSerialization.data(withJSONObject: payload) + let json = String(decoding: data, as: UTF8.self) + let res = try await bridge.request(method: "chat.send", paramsJSON: json, timeoutSeconds: 30) + let decoded = try JSONDecoder().decode(SendResponse.self, from: res) + return decoded.runId + } + + private func waitForChatFinal(runId: String, bridge: BridgeSession) async -> Bool { + let stream = await bridge.subscribeServerEvents(bufferingNewest: 200) + let timeout = Date().addingTimeInterval(120) + for await evt in stream { + if Date() > timeout { return false } + guard evt.event == "chat", let payload = evt.payloadJSON else { continue } + guard let data = payload.data(using: .utf8) else { continue } + guard let json = try? JSONSerialization.jsonObject(with: data) as? [String: Any] else { continue } + if (json["runId"] as? String) != runId { continue } + if let state = json["state"] as? String, state == "final" { + return true + } + } + return false + } + + private func fetchLatestAssistantText(bridge: BridgeSession) async throws -> String? { + let res = try await bridge.request(method: "chat.history", paramsJSON: "{\"sessionKey\":\"main\"}", timeoutSeconds: 15) + guard let json = try JSONSerialization.jsonObject(with: res) as? 
[String: Any] else { return nil } + guard let messages = json["messages"] as? [[String: Any]] else { return nil } + for msg in messages.reversed() { + guard (msg["role"] as? String) == "assistant" else { continue } + guard let content = msg["content"] as? [[String: Any]] else { continue } + let text = content.compactMap { $0["text"] as? String }.joined(separator: "\n") + let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines) + if !trimmed.isEmpty { return trimmed } + } + return nil + } + + private func playAssistant(text: String) async { + let parsed = TalkDirectiveParser.parse(text) + let directive = parsed.directive + let cleaned = parsed.stripped.trimmingCharacters(in: .whitespacesAndNewlines) + guard !cleaned.isEmpty else { return } + + if let voice = directive?.voiceId { + if directive?.once != true { + self.currentVoiceId = voice + } + } + if let model = directive?.modelId { + if directive?.once != true { + self.currentModelId = model + } + } + + let voiceId = directive?.voiceId ?? self.currentVoiceId ?? self.defaultVoiceId + guard let voiceId, !voiceId.isEmpty else { + self.statusText = "Missing voice ID" + return + } + + guard let apiKey = ProcessInfo.processInfo.environment["ELEVENLABS_API_KEY"], !apiKey.isEmpty else { + self.statusText = "Missing ELEVENLABS_API_KEY" + return + } + + self.statusText = "Speaking…" + self.isSpeaking = true + self.lastSpokenText = cleaned + + do { + let request = ElevenLabsRequest( + text: cleaned, + modelId: directive?.modelId ?? self.currentModelId ?? self.defaultModelId, + outputFormat: directive?.outputFormat ?? self.defaultOutputFormat, + speed: TalkModeRuntime.resolveSpeed( + speed: directive?.speed, + rateWPM: directive?.rateWPM), + stability: TalkModeRuntime.validatedUnit(directive?.stability), + similarity: TalkModeRuntime.validatedUnit(directive?.similarity), + style: TalkModeRuntime.validatedUnit(directive?.style), + speakerBoost: directive?.speakerBoost, + seed: TalkModeRuntime.validatedSeed(directive?.seed), + normalize: TalkModeRuntime.validatedNormalize(directive?.normalize), + language: TalkModeRuntime.validatedLanguage(directive?.language)) + let audio = try await ElevenLabsClient(apiKey: apiKey).synthesize( + voiceId: voiceId, + request: request) + try await self.playAudio(data: audio) + } catch { + self.statusText = "Speak failed: \(error.localizedDescription)" + } + + self.isSpeaking = false + } + + private func playAudio(data: Data) async throws { + self.player?.stop() + let player = try AVAudioPlayer(data: data) + self.player = player + player.prepareToPlay() + player.play() + while player.isPlaying { + try? await Task.sleep(nanoseconds: 120_000_000) + } + } + + private func stopSpeaking() { + guard self.isSpeaking else { return } + self.lastInterruptedAtSeconds = self.player?.currentTime + self.player?.stop() + self.player = nil + self.isSpeaking = false + } + + private func shouldInterrupt(with transcript: String) -> Bool { + let trimmed = transcript.trimmingCharacters(in: .whitespacesAndNewlines) + guard trimmed.count >= 3 else { return false } + if let spoken = self.lastSpokenText?.lowercased(), spoken.contains(trimmed.lowercased()) { + return false + } + return true + } + + private func reloadConfig() async { + guard let bridge else { return } + do { + let res = try await bridge.request(method: "config.get", paramsJSON: "{}", timeoutSeconds: 8) + guard let json = try JSONSerialization.jsonObject(with: res) as? [String: Any] else { return } + guard let config = json["config"] as? 
[String: Any] else { return } + let talk = config["talk"] as? [String: Any] + self.defaultVoiceId = (talk?["voiceId"] as? String)?.trimmingCharacters(in: .whitespacesAndNewlines) + self.currentVoiceId = self.defaultVoiceId + self.defaultModelId = (talk?["modelId"] as? String)?.trimmingCharacters(in: .whitespacesAndNewlines) + self.currentModelId = self.defaultModelId + self.defaultOutputFormat = (talk?["outputFormat"] as? String)? + .trimmingCharacters(in: .whitespacesAndNewlines) + if let interrupt = talk?["interruptOnSpeech"] as? Bool { + self.interruptOnSpeech = interrupt + } + } catch { + // ignore + } + } + + private static func configureAudioSession() throws { + let session = AVAudioSession.sharedInstance() + try session.setCategory(.playAndRecord, mode: .measurement, options: [ + .duckOthers, + .mixWithOthers, + .allowBluetoothHFP, + .defaultToSpeaker, + ]) + try session.setActive(true, options: []) + } + + private nonisolated static func requestMicrophonePermission() async -> Bool { + await withCheckedContinuation(isolation: nil) { cont in + AVAudioApplication.requestRecordPermission { ok in + cont.resume(returning: ok) + } + } + } + + private nonisolated static func requestSpeechPermission() async -> Bool { + await withCheckedContinuation(isolation: nil) { cont in + SFSpeechRecognizer.requestAuthorization { status in + cont.resume(returning: status == .authorized) + } + } + } +} + +private struct ElevenLabsRequest { + let text: String + let modelId: String? + let outputFormat: String? + let speed: Double? + let stability: Double? + let similarity: Double? + let style: Double? + let speakerBoost: Bool? + let seed: UInt32? + let normalize: String? + let language: String? +} + +private struct ElevenLabsClient { + let apiKey: String + let baseUrl = URL(string: "https://api.elevenlabs.io")! + + func synthesize(voiceId: String, request: ElevenLabsRequest) async throws -> Data { + var url = self.baseUrl + url.appendPathComponent("v1") + url.appendPathComponent("text-to-speech") + url.appendPathComponent(voiceId) + + var payload: [String: Any] = [ + "text": request.text, + ] + if let modelId = request.modelId, !modelId.isEmpty { + payload["model_id"] = modelId + } + if let outputFormat = request.outputFormat, !outputFormat.isEmpty { + payload["output_format"] = outputFormat + } + if let seed = request.seed { + payload["seed"] = seed + } + if let normalize = request.normalize { + payload["apply_text_normalization"] = normalize + } + if let language = request.language { + payload["language_code"] = language + } + var voiceSettings: [String: Any] = [:] + if let speed = request.speed { voiceSettings["speed"] = speed } + if let stability = request.stability { voiceSettings["stability"] = stability } + if let similarity = request.similarity { voiceSettings["similarity_boost"] = similarity } + if let style = request.style { voiceSettings["style"] = style } + if let speakerBoost = request.speakerBoost { voiceSettings["use_speaker_boost"] = speakerBoost } + if !voiceSettings.isEmpty { payload["voice_settings"] = voiceSettings } + + let body = try JSONSerialization.data(withJSONObject: payload, options: []) + var req = URLRequest(url: url) + req.httpMethod = "POST" + req.httpBody = body + req.setValue("application/json", forHTTPHeaderField: "Content-Type") + req.setValue("audio/mpeg", forHTTPHeaderField: "Accept") + req.setValue(self.apiKey, forHTTPHeaderField: "xi-api-key") + + let (data, response) = try await URLSession.shared.data(for: req) + if let http = response as? 
HTTPURLResponse, http.statusCode >= 400 { + let message = String(data: data, encoding: .utf8) ?? "unknown" + throw NSError(domain: "TalkTTS", code: http.statusCode, userInfo: [ + NSLocalizedDescriptionKey: "ElevenLabs failed: \(http.statusCode) \(message)", + ]) + } + return data + } +} + +private enum TalkModeRuntime { + static func resolveSpeed(speed: Double?, rateWPM: Int?) -> Double? { + if let rateWPM, rateWPM > 0 { + let resolved = Double(rateWPM) / 175.0 + if resolved <= 0.5 || resolved >= 2.0 { return nil } + return resolved + } + if let speed { + if speed <= 0.5 || speed >= 2.0 { return nil } + return speed + } + return nil + } + + static func validatedUnit(_ value: Double?) -> Double? { + guard let value else { return nil } + if value < 0 || value > 1 { return nil } + return value + } + + static func validatedSeed(_ value: Int?) -> UInt32? { + guard let value else { return nil } + if value < 0 || value > 4294967295 { return nil } + return UInt32(value) + } + + static func validatedNormalize(_ value: String?) -> String? { + guard let value else { return nil } + let normalized = value.trimmingCharacters(in: .whitespacesAndNewlines).lowercased() + return ["auto", "on", "off"].contains(normalized) ? normalized : nil + } + + static func validatedLanguage(_ value: String?) -> String? { + guard let value else { return nil } + let normalized = value.trimmingCharacters(in: .whitespacesAndNewlines).lowercased() + guard normalized.count == 2, normalized.allSatisfy({ $0 >= "a" && $0 <= "z" }) else { return nil } + return normalized + } +} diff --git a/apps/ios/Sources/Voice/VoiceTab.swift b/apps/ios/Sources/Voice/VoiceTab.swift index 59e1cd6d4..4fedd0ce9 100644 --- a/apps/ios/Sources/Voice/VoiceTab.swift +++ b/apps/ios/Sources/Voice/VoiceTab.swift @@ -4,6 +4,7 @@ struct VoiceTab: View { @Environment(NodeAppModel.self) private var appModel @Environment(VoiceWakeManager.self) private var voiceWake @AppStorage("voiceWake.enabled") private var voiceWakeEnabled: Bool = false + @AppStorage("talk.enabled") private var talkEnabled: Bool = false var body: some View { NavigationStack { @@ -14,6 +15,7 @@ struct VoiceTab: View { Text(self.voiceWake.statusText) .font(.footnote) .foregroundStyle(.secondary) + LabeledContent("Talk Mode", value: self.talkEnabled ? "Enabled" : "Disabled") } Section("Notes") { @@ -36,6 +38,9 @@ struct VoiceTab: View { .onChange(of: self.voiceWakeEnabled) { _, newValue in self.appModel.setVoiceWakeEnabled(newValue) } + .onChange(of: self.talkEnabled) { _, newValue in + self.appModel.setTalkEnabled(newValue) + } } } } diff --git a/apps/macos/Sources/Clawdis/AppState.swift b/apps/macos/Sources/Clawdis/AppState.swift index 53d81c02d..94e20538a 100644 --- a/apps/macos/Sources/Clawdis/AppState.swift +++ b/apps/macos/Sources/Clawdis/AppState.swift @@ -121,6 +121,15 @@ final class AppState { forKey: voicePushToTalkEnabledKey) } } } + var talkEnabled: Bool { + didSet { + self.ifNotPreview { + UserDefaults.standard.set(self.talkEnabled, forKey: talkEnabledKey) + Task { await TalkModeController.shared.setEnabled(self.talkEnabled) } + } + } + } + var iconOverride: IconOverrideSelection { didSet { self.ifNotPreview { UserDefaults.standard.set(self.iconOverride.rawValue, forKey: iconOverrideKey) } } } @@ -216,6 +225,7 @@ final class AppState { .stringArray(forKey: voiceWakeAdditionalLocalesKey) ?? [] self.voicePushToTalkEnabled = UserDefaults.standard .object(forKey: voicePushToTalkEnabledKey) as? Bool ?? 
false + self.talkEnabled = UserDefaults.standard.bool(forKey: talkEnabledKey) if let storedHeartbeats = UserDefaults.standard.object(forKey: heartbeatsEnabledKey) as? Bool { self.heartbeatsEnabled = storedHeartbeats } else { @@ -256,9 +266,13 @@ final class AppState { if self.swabbleEnabled, !PermissionManager.voiceWakePermissionsGranted() { self.swabbleEnabled = false } + if self.talkEnabled, !PermissionManager.voiceWakePermissionsGranted() { + self.talkEnabled = false + } if !self.isPreview { Task { await VoiceWakeRuntime.shared.refresh(state: self) } + Task { await TalkModeController.shared.setEnabled(self.talkEnabled) } } } @@ -312,6 +326,23 @@ final class AppState { Task { await VoiceWakeRuntime.shared.refresh(state: self) } } + func setTalkEnabled(_ enabled: Bool) async { + guard voiceWakeSupported else { + self.talkEnabled = false + return + } + + self.talkEnabled = enabled + guard !self.isPreview else { return } + + if !enabled { return } + + if PermissionManager.voiceWakePermissionsGranted() { return } + + let granted = await PermissionManager.ensureVoiceWakePermissions(interactive: true) + self.talkEnabled = granted + } + // MARK: - Global wake words sync (Gateway-owned) func applyGlobalVoiceWakeTriggers(_ triggers: [String]) { @@ -367,6 +398,7 @@ extension AppState { state.voiceWakeLocaleID = Locale.current.identifier state.voiceWakeAdditionalLocaleIDs = ["en-US", "de-DE"] state.voicePushToTalkEnabled = false + state.talkEnabled = false state.iconOverride = .system state.heartbeatsEnabled = true state.connectionMode = .local diff --git a/apps/macos/Sources/Clawdis/ConfigSettings.swift b/apps/macos/Sources/Clawdis/ConfigSettings.swift index 043139351..cbbf04d5a 100644 --- a/apps/macos/Sources/Clawdis/ConfigSettings.swift +++ b/apps/macos/Sources/Clawdis/ConfigSettings.swift @@ -30,6 +30,10 @@ struct ConfigSettings: View { @State private var browserColorHex: String = "#FF4500" @State private var browserAttachOnly: Bool = false + // Talk mode settings (stored in ~/.clawdis/clawdis.json under "talk") + @State private var talkVoiceId: String = "" + @State private var talkInterruptOnSpeech: Bool = true + var body: some View { ScrollView { self.content } .onChange(of: self.modelCatalogPath) { _, _ in @@ -53,6 +57,7 @@ struct ConfigSettings: View { self.header self.agentSection self.heartbeatSection + self.talkSection self.browserSection Spacer(minLength: 0) } @@ -266,6 +271,37 @@ struct ConfigSettings: View { .frame(maxWidth: .infinity, alignment: .leading) } + private var talkSection: some View { + GroupBox("Talk Mode") { + Grid(alignment: .leadingFirstTextBaseline, horizontalSpacing: 14, verticalSpacing: 10) { + GridRow { + self.gridLabel("Voice ID") + VStack(alignment: .leading, spacing: 6) { + ComboBox("ElevenLabs voice ID", text: self.$talkVoiceId) { + ForEach(self.talkVoiceSuggestions, id: \.self) { value in + Text(value).tag(value) + } + } + .textFieldStyle(.roundedBorder) + .frame(maxWidth: .infinity) + .onChange(of: self.talkVoiceId) { _, _ in self.autosaveConfig() } + Text("Defaults to ELEVENLABS_VOICE_ID / SAG_VOICE_ID if unset.") + .font(.footnote) + .foregroundStyle(.secondary) + } + } + GridRow { + self.gridLabel("Interrupt") + Toggle("Stop speaking when you start talking", isOn: self.$talkInterruptOnSpeech) + .labelsHidden() + .toggleStyle(.checkbox) + .onChange(of: self.talkInterruptOnSpeech) { _, _ in self.autosaveConfig() } + } + } + } + .frame(maxWidth: .infinity, alignment: .leading) + } + private func gridLabel(_ text: String) -> some View { Text(text) 
.foregroundStyle(.secondary) @@ -278,6 +314,7 @@ struct ConfigSettings: View { let heartbeatMinutes = agent?["heartbeatMinutes"] as? Int let heartbeatBody = agent?["heartbeatBody"] as? String let browser = parsed["browser"] as? [String: Any] + let talk = parsed["talk"] as? [String: Any] let loadedModel = (agent?["model"] as? String) ?? "" if !loadedModel.isEmpty { @@ -297,6 +334,13 @@ struct ConfigSettings: View { if let color = browser["color"] as? String, !color.isEmpty { self.browserColorHex = color } if let attachOnly = browser["attachOnly"] as? Bool { self.browserAttachOnly = attachOnly } } + + if let talk { + if let voice = talk["voiceId"] as? String { self.talkVoiceId = voice } + if let interrupt = talk["interruptOnSpeech"] as? Bool { + self.talkInterruptOnSpeech = interrupt + } + } } private func autosaveConfig() { @@ -312,6 +356,7 @@ struct ConfigSettings: View { var root = self.loadConfigDict() var agent = root["agent"] as? [String: Any] ?? [:] var browser = root["browser"] as? [String: Any] ?? [:] + var talk = root["talk"] as? [String: Any] ?? [:] let chosenModel = (self.configModel == "__custom__" ? self.customModel : self.configModel) .trimmingCharacters(in: .whitespacesAndNewlines) @@ -337,6 +382,15 @@ struct ConfigSettings: View { browser["attachOnly"] = self.browserAttachOnly root["browser"] = browser + let trimmedVoice = self.talkVoiceId.trimmingCharacters(in: .whitespacesAndNewlines) + if trimmedVoice.isEmpty { + talk.removeValue(forKey: "voiceId") + } else { + talk["voiceId"] = trimmedVoice + } + talk["interruptOnSpeech"] = self.talkInterruptOnSpeech + root["talk"] = talk + ClawdisConfigFile.saveDict(root) } @@ -354,6 +408,20 @@ struct ConfigSettings: View { return Color(red: r, green: g, blue: b) } + private var talkVoiceSuggestions: [String] { + let env = ProcessInfo.processInfo.environment + let candidates = [ + self.talkVoiceId, + env["ELEVENLABS_VOICE_ID"] ?? "", + env["SAG_VOICE_ID"] ?? "", + ] + var seen = Set() + return candidates + .map { $0.trimmingCharacters(in: .whitespacesAndNewlines) } + .filter { !$0.isEmpty } + .filter { seen.insert($0).inserted } + } + private var browserPathLabel: String? { guard self.browserEnabled else { return nil } diff --git a/apps/macos/Sources/Clawdis/Constants.swift b/apps/macos/Sources/Clawdis/Constants.swift index 966d1744a..589091261 100644 --- a/apps/macos/Sources/Clawdis/Constants.swift +++ b/apps/macos/Sources/Clawdis/Constants.swift @@ -16,6 +16,7 @@ let voiceWakeMicKey = "clawdis.voiceWakeMicID" let voiceWakeLocaleKey = "clawdis.voiceWakeLocaleID" let voiceWakeAdditionalLocalesKey = "clawdis.voiceWakeAdditionalLocaleIDs" let voicePushToTalkEnabledKey = "clawdis.voicePushToTalkEnabled" +let talkEnabledKey = "clawdis.talkEnabled" let iconOverrideKey = "clawdis.iconOverride" let connectionModeKey = "clawdis.connectionMode" let remoteTargetKey = "clawdis.remoteTarget" diff --git a/apps/macos/Sources/Clawdis/MenuContentView.swift b/apps/macos/Sources/Clawdis/MenuContentView.swift index 6a5dc1e89..748ce018d 100644 --- a/apps/macos/Sources/Clawdis/MenuContentView.swift +++ b/apps/macos/Sources/Clawdis/MenuContentView.swift @@ -72,6 +72,11 @@ struct MenuContent: View { if self.showVoiceWakeMicPicker { self.voiceWakeMicMenu } + Toggle(isOn: self.talkBinding) { + Label("Talk", systemImage: "bubble.left.and.waveform") + } + .disabled(!voiceWakeSupported) + .opacity(voiceWakeSupported ? 
1 : 0.5) Divider() Button { Task { @MainActor in @@ -331,6 +336,14 @@ struct MenuContent: View { }) } + private var talkBinding: Binding<Bool> { + Binding( + get: { self.state.talkEnabled }, + set: { newValue in + Task { await self.state.setTalkEnabled(newValue) } + }) + } + private var showVoiceWakeMicPicker: Bool { voiceWakeSupported && self.state.swabbleEnabled } diff --git a/apps/macos/Sources/Clawdis/TalkAudioPlayer.swift b/apps/macos/Sources/Clawdis/TalkAudioPlayer.swift new file mode 100644 index 000000000..f72de1d02 --- /dev/null +++ b/apps/macos/Sources/Clawdis/TalkAudioPlayer.swift @@ -0,0 +1,54 @@ +import AVFoundation +import Foundation +import OSLog + +@MainActor +final class TalkAudioPlayer: NSObject, AVAudioPlayerDelegate { + static let shared = TalkAudioPlayer() + + private let logger = Logger(subsystem: "com.steipete.clawdis", category: "talk.tts") + private var player: AVAudioPlayer? + private var continuation: CheckedContinuation<TalkPlaybackResult, Never>? + + func play(data: Data) async -> TalkPlaybackResult { + self.stopInternal(interrupted: true) + do { + let player = try AVAudioPlayer(data: data) + self.player = player + player.delegate = self + player.prepareToPlay() + player.play() + return await withCheckedContinuation { continuation in + self.continuation = continuation + } + } catch { + self.logger.error("talk audio player failed: \(error.localizedDescription, privacy: .public)") + return TalkPlaybackResult(finished: false, interruptedAt: nil) + } + } + + func stop() -> Double? { + guard let player else { return nil } + let time = player.currentTime + self.stopInternal(interrupted: true, interruptedAt: time) + return time + } + + func audioPlayerDidFinishPlaying(_: AVAudioPlayer, successfully flag: Bool) { + self.stopInternal(interrupted: !flag) + } + + private func stopInternal(interrupted: Bool, interruptedAt: Double? = nil) { + self.player?.stop() + self.player = nil + if let continuation { + self.continuation = nil + continuation.resume(returning: TalkPlaybackResult(finished: !interrupted, interruptedAt: interruptedAt)) + } + } +} + +struct TalkPlaybackResult: Sendable { + let finished: Bool + let interruptedAt: Double?
+} diff --git a/apps/macos/Sources/Clawdis/TalkModeController.swift b/apps/macos/Sources/Clawdis/TalkModeController.swift new file mode 100644 index 000000000..920af0539 --- /dev/null +++ b/apps/macos/Sources/Clawdis/TalkModeController.swift @@ -0,0 +1,42 @@ +import Observation +import OSLog + +@MainActor +@Observable +final class TalkModeController { + static let shared = TalkModeController() + + private let logger = Logger(subsystem: "com.steipete.clawdis", category: "talk.controller") + + func setEnabled(_ enabled: Bool) async { + self.logger.info("talk enabled=\(enabled)") + if enabled { + TalkOverlayController.shared.present() + } else { + TalkOverlayController.shared.dismiss() + } + await TalkModeRuntime.shared.setEnabled(enabled) + } + + func updatePhase(_ phase: TalkModePhase) { + TalkOverlayController.shared.updatePhase(phase) + } + + func updateLevel(_ level: Double) { + TalkOverlayController.shared.updateLevel(level) + } + + func stopSpeaking(reason: TalkStopReason = .userTap) { + Task { await TalkModeRuntime.shared.stopSpeaking(reason: reason) } + } + + func exitTalkMode() { + Task { await AppStateStore.shared.setTalkEnabled(false) } + } +} + +enum TalkStopReason { + case userTap + case speech + case manual +} diff --git a/apps/macos/Sources/Clawdis/TalkModeRuntime.swift b/apps/macos/Sources/Clawdis/TalkModeRuntime.swift new file mode 100644 index 000000000..955d9ceda --- /dev/null +++ b/apps/macos/Sources/Clawdis/TalkModeRuntime.swift @@ -0,0 +1,684 @@ +import AVFoundation +import ClawdisChatUI +import ClawdisKit +import Foundation +import OSLog +import Speech + +actor TalkModeRuntime { + static let shared = TalkModeRuntime() + + private let logger = Logger(subsystem: "com.steipete.clawdis", category: "talk.runtime") + + private var recognizer: SFSpeechRecognizer? + private var audioEngine: AVAudioEngine? + private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest? + private var recognitionTask: SFSpeechRecognitionTask? + private var recognitionGeneration: Int = 0 + + private var captureTask: Task? + private var silenceTask: Task? + private var phase: TalkModePhase = .idle + private var isEnabled = false + + private var lastHeard: Date? + private var noiseFloorRMS: Double = 1e-4 + private var lastTranscript: String = "" + private var lastSpeechEnergyAt: Date? + + private var defaultVoiceId: String? + private var currentVoiceId: String? + private var defaultModelId: String? + private var currentModelId: String? + private var voiceOverrideActive = false + private var modelOverrideActive = false + private var defaultOutputFormat: String? + private var interruptOnSpeech: Bool = true + private var lastInterruptedAtSeconds: Double? + private var lastSpokenText: String? 
+ + private let silenceWindow: TimeInterval = 0.7 + private let minSpeechRMS: Double = 1e-3 + private let speechBoostFactor: Double = 6.0 + + // MARK: - Lifecycle + + func setEnabled(_ enabled: Bool) async { + guard enabled != self.isEnabled else { return } + self.isEnabled = enabled + if enabled { + await self.start() + } else { + await self.stop() + } + } + + private func start() async { + guard voiceWakeSupported else { return } + guard PermissionManager.voiceWakePermissionsGranted() else { + self.logger.debug("talk runtime not starting: permissions missing") + return + } + await self.reloadConfig() + await self.startRecognition() + self.phase = .listening + await MainActor.run { TalkModeController.shared.updatePhase(.listening) } + self.startSilenceMonitor() + } + + private func stop() async { + self.captureTask?.cancel() + self.captureTask = nil + self.silenceTask?.cancel() + self.silenceTask = nil + self.lastTranscript = "" + self.lastHeard = nil + self.lastSpeechEnergyAt = nil + self.phase = .idle + await self.stopRecognition() + await self.stopSpeaking(reason: .manual) + await MainActor.run { + TalkModeController.shared.updateLevel(0) + TalkModeController.shared.updatePhase(.idle) + } + } + + // MARK: - Speech recognition + + private struct RecognitionUpdate { + let transcript: String? + let segments: [SFTranscriptionSegment] + let isFinal: Bool + let error: Error? + let generation: Int + } + + private func startRecognition() async { + await self.stopRecognition() + self.recognitionGeneration &+= 1 + let generation = self.recognitionGeneration + + let locale = await MainActor.run { AppStateStore.shared.voiceWakeLocaleID } + self.recognizer = SFSpeechRecognizer(locale: Locale(identifier: locale)) + guard let recognizer, recognizer.isAvailable else { + self.logger.error("talk recognizer unavailable") + return + } + + self.recognitionRequest = SFSpeechAudioBufferRecognitionRequest() + self.recognitionRequest?.shouldReportPartialResults = true + guard let request = self.recognitionRequest else { return } + + if self.audioEngine == nil { + self.audioEngine = AVAudioEngine() + } + guard let audioEngine = self.audioEngine else { return } + + let input = audioEngine.inputNode + let format = input.outputFormat(forBus: 0) + input.removeTap(onBus: 0) + input.installTap(onBus: 0, bufferSize: 2048, format: format) { [weak self, weak request] buffer, _ in + request?.append(buffer) + if let rms = Self.rmsLevel(buffer: buffer) { + Task.detached { [weak self] in + await self?.noteAudioLevel(rms: rms) + } + } + } + + audioEngine.prepare() + do { + try audioEngine.start() + } catch { + self.logger.error("talk audio engine start failed: \(error.localizedDescription, privacy: .public)") + return + } + + self.recognitionTask = recognizer.recognitionTask(with: request) { [weak self, generation] result, error in + guard let self else { return } + let transcript = result?.bestTranscription.formattedString + let update = RecognitionUpdate( + transcript: transcript, + segments: result?.bestTranscription.segments ?? [], + isFinal: result?.isFinal ?? 
false, + error: error, + generation: generation) + Task { await self.handleRecognition(update) } + } + } + + private func stopRecognition() async { + self.recognitionGeneration &+= 1 + self.recognitionTask?.cancel() + self.recognitionTask = nil + self.recognitionRequest?.endAudio() + self.recognitionRequest = nil + self.audioEngine?.inputNode.removeTap(onBus: 0) + self.audioEngine?.stop() + self.audioEngine = nil + self.recognizer = nil + } + + private func handleRecognition(_ update: RecognitionUpdate) async { + guard update.generation == self.recognitionGeneration else { return } + if let error = update.error { + self.logger.debug("talk recognition error: \(error.localizedDescription, privacy: .public)") + } + guard let transcript = update.transcript else { return } + + let trimmed = transcript.trimmingCharacters(in: .whitespacesAndNewlines) + if self.phase == .speaking, self.interruptOnSpeech { + if await self.shouldInterrupt(transcript: trimmed, segments: update.segments) { + await self.stopSpeaking(reason: .speech) + self.lastTranscript = "" + self.lastHeard = nil + await self.startListening() + } + return + } + + guard self.phase == .listening else { return } + + if !trimmed.isEmpty { + self.lastTranscript = trimmed + self.lastHeard = Date() + } + + if update.isFinal { + self.lastTranscript = trimmed + } + } + + // MARK: - Silence handling + + private func startSilenceMonitor() { + self.silenceTask?.cancel() + self.silenceTask = Task { [weak self] in + guard let self else { return } + while self.isEnabled { + try? await Task.sleep(nanoseconds: 200_000_000) + await self.checkSilence() + } + } + } + + private func checkSilence() async { + guard self.phase == .listening else { return } + let transcript = self.lastTranscript.trimmingCharacters(in: .whitespacesAndNewlines) + guard !transcript.isEmpty else { return } + guard let lastHeard else { return } + let elapsed = Date().timeIntervalSince(lastHeard) + guard elapsed >= self.silenceWindow else { return } + await self.finalizeTranscript(transcript) + } + + private func startListening() async { + self.phase = .listening + self.lastTranscript = "" + self.lastHeard = nil + await MainActor.run { + TalkModeController.shared.updatePhase(.listening) + TalkModeController.shared.updateLevel(0) + } + } + + private func finalizeTranscript(_ text: String) async { + self.lastTranscript = "" + self.lastHeard = nil + self.phase = .thinking + await MainActor.run { TalkModeController.shared.updatePhase(.thinking) } + await self.stopRecognition() + await self.sendAndSpeak(text) + } + + // MARK: - Gateway + TTS + + private func sendAndSpeak(_ transcript: String) async { + await self.reloadConfig() + let prompt = self.buildPrompt(transcript: transcript) + let runId = UUID().uuidString + + do { + let response = try await GatewayConnection.shared.chatSend( + sessionKey: "main", + message: prompt, + thinking: "low", + idempotencyKey: runId, + attachments: []) + let completion = await self.waitForChatCompletion( + runId: response.runId, + timeoutSeconds: 120) + guard completion == .final else { + await self.startListening() + await self.startRecognition() + return + } + + guard let assistantText = await self.latestAssistantText(sessionKey: "main") else { + await self.startListening() + await self.startRecognition() + return + } + + await self.playAssistant(text: assistantText) + await self.startListening() + await self.startRecognition() + return + } catch { + self.logger.error("talk chat.send failed: \(error.localizedDescription, privacy: .public)") + await 
self.startListening() + await self.startRecognition() + return + } + } + + private func buildPrompt(transcript: String) -> String { + var lines: [String] = [ + "Talk Mode active. Reply in a concise, spoken tone.", + "You may optionally prefix the response with JSON (first line) to set ElevenLabs voice, e.g. {\"voice\":\"\",\"once\":true}.", + ] + + if let interrupted = self.lastInterruptedAtSeconds { + let formatted = String(format: "%.1f", interrupted) + lines.append("Assistant speech interrupted at \(formatted)s.") + self.lastInterruptedAtSeconds = nil + } + + lines.append("") + lines.append(transcript) + return lines.joined(separator: "\n") + } + + private enum ChatCompletionState { + case final + case aborted + case error + case timeout + } + + private func waitForChatCompletion(runId: String, timeoutSeconds: Int) async -> ChatCompletionState { + await withTaskGroup(of: ChatCompletionState.self) { group in + group.addTask { [runId] in + let stream = GatewayConnection.shared.subscribe() + for await push in stream { + if case let .event(evt) = push, evt.event == "chat", let payload = evt.payload { + if let chat = try? JSONDecoder().decode( + ClawdisChatEventPayload.self, + from: JSONEncoder().encode(payload)) + { + guard chat.runId == runId else { continue } + switch chat.state { + case .some("final"): return .final + case .some("aborted"): return .aborted + case .some("error"): return .error + default: break + } + } + } + } + return .timeout + } + group.addTask { + try? await Task.sleep(nanoseconds: UInt64(timeoutSeconds) * 1_000_000_000) + return .timeout + } + let result = await group.next() ?? .timeout + group.cancelAll() + return result + } + } + + private func latestAssistantText(sessionKey: String) async -> String? { + do { + let history = try await GatewayConnection.shared.chatHistory(sessionKey: sessionKey) + let messages = history.messages ?? [] + let decoded = messages.compactMap { item in + guard let data = try? JSONEncoder().encode(item) else { return nil } + return try? JSONDecoder().decode(ClawdisChatMessage.self, from: data) + } + guard let assistant = decoded.last(where: { $0.role == "assistant" }) else { return nil } + let text = assistant.content.compactMap { $0.text }.joined(separator: "\n") + let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines) + return trimmed.isEmpty ? nil : trimmed + } catch { + self.logger.error("talk history fetch failed: \(error.localizedDescription, privacy: .public)") + return nil + } + } + + private func playAssistant(text: String) async { + let parse = TalkDirectiveParser.parse(text) + let directive = parse.directive + let cleaned = parse.stripped.trimmingCharacters(in: .whitespacesAndNewlines) + guard !cleaned.isEmpty else { return } + + if !parse.unknownKeys.isEmpty { + self.logger.warning("talk directive ignored keys: \(parse.unknownKeys.joined(separator: ","), privacy: .public)") + } + + if let voice = directive?.voiceId { + if directive?.once == true { + self.logger.info("talk voice override (once) voiceId=\(voice, privacy: .public)") + } else { + self.currentVoiceId = voice + self.voiceOverrideActive = true + self.logger.info("talk voice override voiceId=\(voice, privacy: .public)") + } + } + + if let model = directive?.modelId { + if directive?.once == true { + self.logger.info("talk model override (once) modelId=\(model, privacy: .public)") + } else { + self.currentModelId = model + self.modelOverrideActive = true + } + } + + let voiceId = + directive?.voiceId ?? + self.currentVoiceId ?? 
+ self.defaultVoiceId + + guard let voiceId, !voiceId.isEmpty else { + self.logger.error("talk missing voiceId; set talk.voiceId or ELEVENLABS_VOICE_ID") + return + } + + let apiKey = ProcessInfo.processInfo.environment["ELEVENLABS_API_KEY"] ?? "" + if apiKey.isEmpty { + self.logger.error("talk missing ELEVENLABS_API_KEY") + return + } + + await self.startRecognition() + await MainActor.run { TalkModeController.shared.updatePhase(.speaking) } + self.phase = .speaking + self.lastSpokenText = cleaned + + let resolvedSpeed = Self.resolveSpeed( + speed: directive?.speed, + rateWPM: directive?.rateWPM, + logger: self.logger) + + let request = ElevenLabsRequest( + text: cleaned, + modelId: directive?.modelId ?? self.currentModelId ?? self.defaultModelId, + outputFormat: directive?.outputFormat ?? self.defaultOutputFormat, + speed: resolvedSpeed, + stability: Self.validatedUnit(directive?.stability, name: "stability", logger: self.logger), + similarity: Self.validatedUnit(directive?.similarity, name: "similarity", logger: self.logger), + style: Self.validatedUnit(directive?.style, name: "style", logger: self.logger), + speakerBoost: directive?.speakerBoost, + seed: Self.validatedSeed(directive?.seed, logger: self.logger), + normalize: Self.validatedNormalize(directive?.normalize, logger: self.logger), + language: Self.validatedLanguage(directive?.language, logger: self.logger)) + + do { + let audio = try await ElevenLabsClient(apiKey: apiKey).synthesize( + voiceId: voiceId, + request: request) + let result = await TalkAudioPlayer.shared.play(data: audio) + if !result.finished, let interruptedAt = result.interruptedAt, self.phase == .speaking { + if self.interruptOnSpeech { + self.lastInterruptedAtSeconds = interruptedAt + } + } + } catch { + self.logger.error("talk TTS failed: \(error.localizedDescription, privacy: .public)") + } + + self.phase = .thinking + await MainActor.run { TalkModeController.shared.updatePhase(.thinking) } + } + + func stopSpeaking(reason: TalkStopReason) async { + guard self.phase == .speaking else { return } + let interruptedAt = await MainActor.run { TalkAudioPlayer.shared.stop() } + if reason == .speech, let interruptedAt { + self.lastInterruptedAtSeconds = interruptedAt + } + self.phase = .thinking + await MainActor.run { TalkModeController.shared.updatePhase(.thinking) } + } + + // MARK: - Config + + private func reloadConfig() async { + let cfg = await self.fetchTalkConfig() + self.defaultVoiceId = cfg.voiceId + if !self.voiceOverrideActive { + self.currentVoiceId = cfg.voiceId + } + self.defaultModelId = cfg.modelId + if !self.modelOverrideActive { + self.currentModelId = cfg.modelId + } + self.defaultOutputFormat = cfg.outputFormat + self.interruptOnSpeech = cfg.interruptOnSpeech + } + + private struct TalkRuntimeConfig { + let voiceId: String? + let modelId: String? + let outputFormat: String?
+ let interruptOnSpeech: Bool + } + + private func fetchTalkConfig() async -> TalkRuntimeConfig { + let env = ProcessInfo.processInfo.environment + let envVoice = env["ELEVENLABS_VOICE_ID"]?.trimmingCharacters(in: .whitespacesAndNewlines) + let sagVoice = env["SAG_VOICE_ID"]?.trimmingCharacters(in: .whitespacesAndNewlines) + + do { + let snap: ConfigSnapshot = try await GatewayConnection.shared.requestDecoded( + method: .configGet, + params: nil, + timeoutMs: 8000) + let talk = snap.config?["talk"]?.dictionaryValue + let voice = talk?["voiceId"]?.stringValue + let model = talk?["modelId"]?.stringValue + let outputFormat = talk?["outputFormat"]?.stringValue + let interrupt = talk?["interruptOnSpeech"]?.boolValue + let resolvedVoice = + (voice?.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty == false ? voice : nil) ?? + (envVoice?.isEmpty == false ? envVoice : nil) ?? + (sagVoice?.isEmpty == false ? sagVoice : nil) + return TalkRuntimeConfig( + voiceId: resolvedVoice, + modelId: model, + outputFormat: outputFormat, + interruptOnSpeech: interrupt ?? true) + } catch { + let resolvedVoice = + (envVoice?.isEmpty == false ? envVoice : nil) ?? + (sagVoice?.isEmpty == false ? sagVoice : nil) + return TalkRuntimeConfig( + voiceId: resolvedVoice, + modelId: nil, + outputFormat: nil, + interruptOnSpeech: true) + } + } + + // MARK: - Audio level handling + + private func noteAudioLevel(rms: Double) async { + if self.phase != .listening && self.phase != .speaking { return } + let alpha: Double = rms < self.noiseFloorRMS ? 0.08 : 0.01 + self.noiseFloorRMS = max(1e-7, self.noiseFloorRMS + (rms - self.noiseFloorRMS) * alpha) + + let threshold = max(self.minSpeechRMS, self.noiseFloorRMS * self.speechBoostFactor) + if rms >= threshold { + let now = Date() + self.lastHeard = now + self.lastSpeechEnergyAt = now + } + + if self.phase == .listening { + let clamped = min(1.0, max(0.0, rms / max(self.minSpeechRMS, threshold))) + await MainActor.run { TalkModeController.shared.updateLevel(clamped) } + } + } + + private static func rmsLevel(buffer: AVAudioPCMBuffer) -> Double? { + guard let channelData = buffer.floatChannelData?.pointee else { return nil } + let frameCount = Int(buffer.frameLength) + guard frameCount > 0 else { return nil } + var sum: Double = 0 + for i in 0..<frameCount { + let sample = Double(channelData[i]) + sum += sample * sample + } + return sqrt(sum / Double(frameCount)) + } + + private func shouldInterrupt(transcript: String, segments: [SFTranscriptionSegment]) async -> Bool { + let trimmed = transcript.trimmingCharacters(in: .whitespacesAndNewlines) + guard trimmed.count >= 3 else { return false } + if self.isLikelyEcho(of: trimmed) { return false } + let now = Date() + if let lastSpeechEnergyAt, now.timeIntervalSince(lastSpeechEnergyAt) > 0.35 { + return false + } + let hasConfidence = segments.contains { $0.confidence > 0.6 } + return hasConfidence + } + + private func isLikelyEcho(of transcript: String) -> Bool { + guard let spoken = self.lastSpokenText?.lowercased(), !spoken.isEmpty else { return false } + let probe = transcript.lowercased() + if probe.count < 6 { + return spoken.contains(probe) + } + return spoken.contains(probe) + } + + private static func resolveSpeed(speed: Double?, rateWPM: Int?, logger: Logger) -> Double?
{ + if let rateWPM, rateWPM > 0 { + let resolved = Double(rateWPM) / 175.0 + if resolved <= 0.5 || resolved >= 2.0 { + logger.warning("talk rateWPM out of range: \(rateWPM, privacy: .public)") + return nil + } + return resolved + } + if let speed { + if speed <= 0.5 || speed >= 2.0 { + logger.warning("talk speed out of range: \(speed, privacy: .public)") + return nil + } + return speed + } + return nil + } + + private static func validatedUnit(_ value: Double?, name: String, logger: Logger) -> Double? { + guard let value else { return nil } + if value < 0 || value > 1 { + logger.warning("talk \(name, privacy: .public) out of range: \(value, privacy: .public)") + return nil + } + return value + } + + private static func validatedSeed(_ value: Int?, logger: Logger) -> UInt32? { + guard let value else { return nil } + if value < 0 || value > 4294967295 { + logger.warning("talk seed out of range: \(value, privacy: .public)") + return nil + } + return UInt32(value) + } + + private static func validatedNormalize(_ value: String?, logger: Logger) -> String? { + guard let value else { return nil } + let normalized = value.trimmingCharacters(in: .whitespacesAndNewlines).lowercased() + guard ["auto", "on", "off"].contains(normalized) else { + logger.warning("talk normalize invalid: \(normalized, privacy: .public)") + return nil + } + return normalized + } + + private static func validatedLanguage(_ value: String?, logger: Logger) -> String? { + guard let value else { return nil } + let normalized = value.trimmingCharacters(in: .whitespacesAndNewlines).lowercased() + guard normalized.count == 2, normalized.allSatisfy({ $0 >= "a" && $0 <= "z" }) else { + logger.warning("talk language invalid: \(normalized, privacy: .public)") + return nil + } + return normalized + } +} + +private struct ElevenLabsRequest { + let text: String + let modelId: String? + let outputFormat: String? + let speed: Double? + let stability: Double? + let similarity: Double? + let style: Double? + let speakerBoost: Bool? + let seed: UInt32? + let normalize: String? + let language: String? +} + +private struct ElevenLabsClient { + let apiKey: String + let baseUrl: URL = URL(string: "https://api.elevenlabs.io")! 
+ + func synthesize(voiceId: String, request: ElevenLabsRequest) async throws -> Data { + var url = self.baseUrl + url.appendPathComponent("v1") + url.appendPathComponent("text-to-speech") + url.appendPathComponent(voiceId) + + var payload: [String: Any] = [ + "text": request.text, + ] + if let modelId = request.modelId, !modelId.isEmpty { + payload["model_id"] = modelId + } + if let outputFormat = request.outputFormat, !outputFormat.isEmpty { + payload["output_format"] = outputFormat + } + if let seed = request.seed { + payload["seed"] = seed + } + if let normalize = request.normalize { + payload["apply_text_normalization"] = normalize + } + if let language = request.language { + payload["language_code"] = language + } + var voiceSettings: [String: Any] = [:] + if let speed = request.speed { voiceSettings["speed"] = speed } + if let stability = request.stability { voiceSettings["stability"] = stability } + if let similarity = request.similarity { voiceSettings["similarity_boost"] = similarity } + if let style = request.style { voiceSettings["style"] = style } + if let speakerBoost = request.speakerBoost { voiceSettings["use_speaker_boost"] = speakerBoost } + if !voiceSettings.isEmpty { + payload["voice_settings"] = voiceSettings + } + + let body = try JSONSerialization.data(withJSONObject: payload, options: []) + var req = URLRequest(url: url) + req.httpMethod = "POST" + req.httpBody = body + req.setValue("application/json", forHTTPHeaderField: "Content-Type") + req.setValue("audio/mpeg", forHTTPHeaderField: "Accept") + req.setValue(self.apiKey, forHTTPHeaderField: "xi-api-key") + + let (data, response) = try await URLSession.shared.data(for: req) + if let http = response as? HTTPURLResponse, http.statusCode >= 400 { + let message = String(data: data, encoding: .utf8) ?? "unknown" + throw NSError(domain: "TalkTTS", code: http.statusCode, userInfo: [ + NSLocalizedDescriptionKey: "ElevenLabs failed: \(http.statusCode) \(message)", + ]) + } + return data + } +} diff --git a/apps/macos/Sources/Clawdis/TalkModeTypes.swift b/apps/macos/Sources/Clawdis/TalkModeTypes.swift new file mode 100644 index 000000000..3ae978255 --- /dev/null +++ b/apps/macos/Sources/Clawdis/TalkModeTypes.swift @@ -0,0 +1,8 @@ +import Foundation + +enum TalkModePhase: String { + case idle + case listening + case thinking + case speaking +} diff --git a/apps/macos/Sources/Clawdis/TalkOverlay.swift b/apps/macos/Sources/Clawdis/TalkOverlay.swift new file mode 100644 index 000000000..63c9d5dce --- /dev/null +++ b/apps/macos/Sources/Clawdis/TalkOverlay.swift @@ -0,0 +1,119 @@ +import AppKit +import Observation +import OSLog +import SwiftUI + +@MainActor +@Observable +final class TalkOverlayController { + static let shared = TalkOverlayController() + + private let logger = Logger(subsystem: "com.steipete.clawdis", category: "talk.overlay") + + struct Model { + var isVisible: Bool = false + var phase: TalkModePhase = .idle + var level: Double = 0 + } + + var model = Model() + private var window: NSPanel? + private var hostingView: NSHostingView? 
+ + private let width: CGFloat = 92 + private let height: CGFloat = 92 + private let padding: CGFloat = 8 + + func present() { + self.ensureWindow() + self.hostingView?.rootView = TalkOverlayView(controller: self) + let target = self.targetFrame() + + guard let window else { return } + if !self.model.isVisible { + self.model.isVisible = true + let start = target.offsetBy(dx: 0, dy: -6) + window.setFrame(start, display: true) + window.alphaValue = 0 + window.orderFrontRegardless() + NSAnimationContext.runAnimationGroup { context in + context.duration = 0.18 + context.timingFunction = CAMediaTimingFunction(name: .easeOut) + window.animator().setFrame(target, display: true) + window.animator().alphaValue = 1 + } + } else { + window.setFrame(target, display: true) + window.orderFrontRegardless() + } + } + + func dismiss() { + guard let window else { + self.model.isVisible = false + return + } + + let target = window.frame.offsetBy(dx: 6, dy: 6) + NSAnimationContext.runAnimationGroup { context in + context.duration = 0.16 + context.timingFunction = CAMediaTimingFunction(name: .easeOut) + window.animator().setFrame(target, display: true) + window.animator().alphaValue = 0 + } completionHandler: { + Task { @MainActor in + window.orderOut(nil) + self.model.isVisible = false + } + } + } + + func updatePhase(_ phase: TalkModePhase) { + guard self.model.phase != phase else { return } + self.logger.info("talk overlay phase=\(phase.rawValue, privacy: .public)") + self.model.phase = phase + } + + func updateLevel(_ level: Double) { + guard self.model.isVisible else { return } + self.model.level = max(0, min(1, level)) + } + + // MARK: - Private + + private func ensureWindow() { + if self.window != nil { return } + let panel = NSPanel( + contentRect: NSRect(x: 0, y: 0, width: self.width, height: self.height), + styleMask: [.nonactivatingPanel, .borderless], + backing: .buffered, + defer: false) + panel.isOpaque = false + panel.backgroundColor = .clear + panel.hasShadow = false + panel.level = NSWindow.Level(rawValue: NSWindow.Level.popUpMenu.rawValue - 4) + panel.collectionBehavior = [.canJoinAllSpaces, .fullScreenAuxiliary, .transient] + panel.hidesOnDeactivate = false + panel.isMovable = false + panel.isFloatingPanel = true + panel.becomesKeyOnlyIfNeeded = true + panel.titleVisibility = .hidden + panel.titlebarAppearsTransparent = true + + let host = NSHostingView(rootView: TalkOverlayView(controller: self)) + host.translatesAutoresizingMaskIntoConstraints = false + panel.contentView = host + self.hostingView = host + self.window = panel + } + + private func targetFrame() -> NSRect { + guard let screen = NSScreen.main else { return .zero } + let size = NSSize(width: self.width, height: self.height) + let visible = screen.visibleFrame + let origin = CGPoint( + x: visible.maxX - size.width - self.padding, + y: visible.maxY - size.height - self.padding) + return NSRect(origin: origin, size: size) + } +} diff --git a/apps/macos/Sources/Clawdis/TalkOverlayView.swift b/apps/macos/Sources/Clawdis/TalkOverlayView.swift new file mode 100644 index 000000000..2f2be75ca --- /dev/null +++ b/apps/macos/Sources/Clawdis/TalkOverlayView.swift @@ -0,0 +1,139 @@ +import SwiftUI + +struct TalkOverlayView: View { + var controller: TalkOverlayController + @State private var hovering = false + + var body: some View { + ZStack(alignment: .topLeading) { + TalkCloudView(phase: self.controller.model.phase, level: self.controller.model.level) + .frame(width: 76, height: 64) + .contentShape(Rectangle()) + .onTapGesture { + 
TalkModeController.shared.stopSpeaking(reason: .userTap) + } + .padding(8) + + Button { + TalkModeController.shared.exitTalkMode() + } label: { + Image(systemName: "xmark") + .font(.system(size: 10, weight: .bold)) + .foregroundStyle(Color.white.opacity(self.hovering ? 0.95 : 0.7)) + .frame(width: 18, height: 18) + .background(Color.black.opacity(self.hovering ? 0.45 : 0.3)) + .clipShape(Circle()) + } + .buttonStyle(.plain) + .contentShape(Circle()) + .padding(4) + .onHover { self.hovering = $0 } + } + .frame(width: 92, height: 92, alignment: .center) + } +} + +private struct TalkCloudView: View { + let phase: TalkModePhase + let level: Double + + var body: some View { + TimelineView(.animation) { context in + let t = context.date.timeIntervalSinceReferenceDate + let pulse = phase == .speaking ? (1 + 0.04 * sin(t * 6)) : 1 + let sink = phase == .thinking ? (3 + 2 * sin(t * 2)) : 0 + let listenScale = phase == .listening ? (1 + CGFloat(self.level) * 0.14) : 1 + let baseScale = phase == .thinking ? 0.94 : 1 + + ZStack { + CloudShape() + .fill(self.cloudGradient) + .overlay( + CloudShape() + .stroke(Color.white.opacity(0.35), lineWidth: 0.8)) + .shadow(color: Color.black.opacity(0.18), radius: 8, x: 0, y: 4) + .scaleEffect(baseScale * pulse * listenScale) + .offset(y: sink) + + if phase == .listening { + Circle() + .stroke(self.ringGradient, lineWidth: 1) + .scaleEffect(1 + CGFloat(self.level) * 0.45) + .opacity(0.3 + CGFloat(self.level) * 0.4) + .animation(.easeOut(duration: 0.08), value: self.level) + } + + if phase == .thinking { + TalkThinkingDots(time: t) + .offset(y: 18) + } + + if phase == .speaking { + TalkSpeakingRings(time: t) + } + } + } + } + + private var cloudGradient: LinearGradient { + LinearGradient( + colors: [Color(red: 0.95, green: 0.98, blue: 1.0), Color(red: 0.75, green: 0.88, blue: 1.0)], + startPoint: .topLeading, + endPoint: .bottomTrailing) + } + + private var ringGradient: LinearGradient { + LinearGradient( + colors: [Color.white.opacity(0.6), Color.white.opacity(0.1)], + startPoint: .top, + endPoint: .bottom) + } +} + +private struct TalkThinkingDots: View { + let time: TimeInterval + + var body: some View { + HStack(spacing: 4) { + ForEach(0..<3, id: \.self) { idx in + let phase = (time * 2 + Double(idx) * 0.45).truncatingRemainder(dividingBy: 1) + Circle() + .fill(Color.white.opacity(0.75)) + .frame(width: 5, height: 5) + .opacity(0.35 + 0.55 * phase) + } + } + } +} + +private struct TalkSpeakingRings: View { + let time: TimeInterval + + var body: some View { + ZStack { + ForEach(0..<3, id: \.self) { idx in + let phase = (time * 1.1 + Double(idx) / 3).truncatingRemainder(dividingBy: 1) + Circle() + .stroke(Color.white.opacity(0.6 - phase * 0.5), lineWidth: 1) + .scaleEffect(0.8 + phase * 0.7) + .opacity(0.6 - phase * 0.6) + } + } + } +} + +private struct CloudShape: Shape { + func path(in rect: CGRect) -> Path { + let w = rect.width + let h = rect.height + let baseHeight = h * 0.44 + let baseRect = CGRect(x: rect.minX, y: rect.minY + h * 0.46, width: w, height: baseHeight) + + var path = Path() + path.addRoundedRect(in: baseRect, cornerSize: CGSize(width: baseHeight / 2, height: baseHeight / 2)) + path.addEllipse(in: CGRect(x: rect.minX + w * 0.05, y: rect.minY + h * 0.28, width: w * 0.36, height: h * 0.36)) + path.addEllipse(in: CGRect(x: rect.minX + w * 0.28, y: rect.minY + h * 0.05, width: w * 0.44, height: h * 0.44)) + path.addEllipse(in: CGRect(x: rect.minX + w * 0.62, y: rect.minY + h * 0.3, width: w * 0.3, height: h * 0.3)) + return path + } +} diff --git 
a/apps/shared/ClawdisKit/Sources/ClawdisKit/TalkDirective.swift b/apps/shared/ClawdisKit/Sources/ClawdisKit/TalkDirective.swift new file mode 100644 index 000000000..6bc4c0195 --- /dev/null +++ b/apps/shared/ClawdisKit/Sources/ClawdisKit/TalkDirective.swift @@ -0,0 +1,194 @@ +import Foundation + +public struct TalkDirective: Equatable, Sendable { + public var voiceId: String? + public var modelId: String? + public var speed: Double? + public var rateWPM: Int? + public var stability: Double? + public var similarity: Double? + public var style: Double? + public var speakerBoost: Bool? + public var seed: Int? + public var normalize: String? + public var language: String? + public var outputFormat: String? + public var latencyTier: Int? + public var once: Bool? + + public init( + voiceId: String? = nil, + modelId: String? = nil, + speed: Double? = nil, + rateWPM: Int? = nil, + stability: Double? = nil, + similarity: Double? = nil, + style: Double? = nil, + speakerBoost: Bool? = nil, + seed: Int? = nil, + normalize: String? = nil, + language: String? = nil, + outputFormat: String? = nil, + latencyTier: Int? = nil, + once: Bool? = nil) + { + self.voiceId = voiceId + self.modelId = modelId + self.speed = speed + self.rateWPM = rateWPM + self.stability = stability + self.similarity = similarity + self.style = style + self.speakerBoost = speakerBoost + self.seed = seed + self.normalize = normalize + self.language = language + self.outputFormat = outputFormat + self.latencyTier = latencyTier + self.once = once + } +} + +public struct TalkDirectiveParseResult: Equatable, Sendable { + public let directive: TalkDirective? + public let stripped: String + public let unknownKeys: [String] + + public init(directive: TalkDirective?, stripped: String, unknownKeys: [String]) { + self.directive = directive + self.stripped = stripped + self.unknownKeys = unknownKeys + } +} + +public enum TalkDirectiveParser { + public static func parse(_ text: String) -> TalkDirectiveParseResult { + let normalized = text.replacingOccurrences(of: "\r\n", with: "\n") + var lines = normalized.split(separator: "\n", omittingEmptySubsequences: false) + guard !lines.isEmpty else { return TalkDirectiveParseResult(directive: nil, stripped: text, unknownKeys: []) } + + guard let firstNonEmpty = lines.firstIndex(where: { !$0.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty }) + else { + return TalkDirectiveParseResult(directive: nil, stripped: text, unknownKeys: []) + } + + let head = lines[firstNonEmpty].trimmingCharacters(in: .whitespacesAndNewlines) + guard head.hasPrefix("{"), head.hasSuffix("}") else { + return TalkDirectiveParseResult(directive: nil, stripped: text, unknownKeys: []) + } + + guard let data = head.data(using: .utf8), + let json = try? JSONSerialization.jsonObject(with: data) as? [String: Any] + else { + return TalkDirectiveParseResult(directive: nil, stripped: text, unknownKeys: []) + } + + let speakerBoost = boolValue(json, keys: ["speaker_boost", "speakerBoost"]) + ?? 
boolValue(json, keys: ["no_speaker_boost", "noSpeakerBoost"]).map { !$0 } + + let directive = TalkDirective( + voiceId: stringValue(json, keys: ["voice", "voice_id", "voiceId"]), + modelId: stringValue(json, keys: ["model", "model_id", "modelId"]), + speed: doubleValue(json, keys: ["speed"]), + rateWPM: intValue(json, keys: ["rate", "wpm"]), + stability: doubleValue(json, keys: ["stability"]), + similarity: doubleValue(json, keys: ["similarity", "similarity_boost", "similarityBoost"]), + style: doubleValue(json, keys: ["style"]), + speakerBoost: speakerBoost, + seed: intValue(json, keys: ["seed"]), + normalize: stringValue(json, keys: ["normalize", "apply_text_normalization"]), + language: stringValue(json, keys: ["lang", "language_code", "language"]), + outputFormat: stringValue(json, keys: ["output_format", "format"]), + latencyTier: intValue(json, keys: ["latency", "latency_tier", "latencyTier"]), + once: boolValue(json, keys: ["once"])) + + let hasDirective = [ + directive.voiceId, + directive.modelId, + directive.speed.map { "\($0)" }, + directive.rateWPM.map { "\($0)" }, + directive.stability.map { "\($0)" }, + directive.similarity.map { "\($0)" }, + directive.style.map { "\($0)" }, + directive.speakerBoost.map { "\($0)" }, + directive.seed.map { "\($0)" }, + directive.normalize, + directive.language, + directive.outputFormat, + directive.latencyTier.map { "\($0)" }, + directive.once.map { "\($0)" }, + ].contains { $0 != nil } + + guard hasDirective else { + return TalkDirectiveParseResult(directive: nil, stripped: text, unknownKeys: []) + } + + let knownKeys = Set([ + "voice", "voice_id", "voiceid", + "model", "model_id", "modelid", + "speed", "rate", "wpm", + "stability", "similarity", "similarity_boost", "similarityboost", + "style", + "speaker_boost", "speakerboost", + "no_speaker_boost", "nospeakerboost", + "seed", + "normalize", "apply_text_normalization", + "lang", "language_code", "language", + "output_format", "format", + "latency", "latency_tier", "latencytier", + "once", + ]) + let unknownKeys = json.keys.filter { !knownKeys.contains($0.lowercased()) }.sorted() + + lines.remove(at: firstNonEmpty) + if firstNonEmpty < lines.count { + let next = lines[firstNonEmpty].trimmingCharacters(in: .whitespacesAndNewlines) + if next.isEmpty { + lines.remove(at: firstNonEmpty) + } + } + + let stripped = lines.joined(separator: "\n") + return TalkDirectiveParseResult(directive: directive, stripped: stripped, unknownKeys: unknownKeys) + } + + private static func stringValue(_ dict: [String: Any], keys: [String]) -> String? { + for key in keys { + if let value = dict[key] as? String { + let trimmed = value.trimmingCharacters(in: .whitespacesAndNewlines) + if !trimmed.isEmpty { return trimmed } + } + } + return nil + } + + private static func doubleValue(_ dict: [String: Any], keys: [String]) -> Double? { + for key in keys { + if let value = dict[key] as? Double { return value } + if let value = dict[key] as? Int { return Double(value) } + if let value = dict[key] as? String, let parsed = Double(value) { return parsed } + } + return nil + } + + private static func intValue(_ dict: [String: Any], keys: [String]) -> Int? { + for key in keys { + if let value = dict[key] as? Int { return value } + if let value = dict[key] as? Double { return Int(value) } + if let value = dict[key] as? String, let parsed = Int(value) { return parsed } + } + return nil + } + + private static func boolValue(_ dict: [String: Any], keys: [String]) -> Bool? { + for key in keys { + if let value = dict[key] as? 
Bool { return value } + if let value = dict[key] as? String { + let trimmed = value.trimmingCharacters(in: .whitespacesAndNewlines).lowercased() + if ["true", "yes", "1"].contains(trimmed) { return true } + if ["false", "no", "0"].contains(trimmed) { return false } + } + } + return nil + } +} diff --git a/apps/shared/ClawdisKit/Tests/ClawdisKitTests/TalkDirectiveTests.swift b/apps/shared/ClawdisKit/Tests/ClawdisKitTests/TalkDirectiveTests.swift new file mode 100644 index 000000000..cbfdb572b --- /dev/null +++ b/apps/shared/ClawdisKit/Tests/ClawdisKitTests/TalkDirectiveTests.swift @@ -0,0 +1,62 @@ +import XCTest +@testable import ClawdisKit + +final class TalkDirectiveTests: XCTestCase { + func testParsesDirectiveAndStripsLine() { + let text = """ + {"voice":"abc123","once":true} + Hello there. + """ + let result = TalkDirectiveParser.parse(text) + XCTAssertEqual(result.directive?.voiceId, "abc123") + XCTAssertEqual(result.directive?.once, true) + XCTAssertEqual(result.stripped, "Hello there.") + } + + func testIgnoresNonDirective() { + let text = "Hello world." + let result = TalkDirectiveParser.parse(text) + XCTAssertNil(result.directive) + XCTAssertEqual(result.stripped, text) + } + + func testKeepsDirectiveLineIfNoRecognizedFields() { + let text = """ + {"unknown":"value"} + Hello. + """ + let result = TalkDirectiveParser.parse(text) + XCTAssertNil(result.directive) + XCTAssertEqual(result.stripped, text) + } + + func testParsesExtendedOptions() { + let text = """ + {"voice_id":"v1","model_id":"m1","rate":200,"stability":0.5,"similarity":0.8,"style":0.2,"speaker_boost":true,"seed":1234,"normalize":"auto","lang":"en","output_format":"mp3_44100_128"} + Hello. + """ + let result = TalkDirectiveParser.parse(text) + XCTAssertEqual(result.directive?.voiceId, "v1") + XCTAssertEqual(result.directive?.modelId, "m1") + XCTAssertEqual(result.directive?.rateWPM, 200) + XCTAssertEqual(result.directive?.stability, 0.5) + XCTAssertEqual(result.directive?.similarity, 0.8) + XCTAssertEqual(result.directive?.style, 0.2) + XCTAssertEqual(result.directive?.speakerBoost, true) + XCTAssertEqual(result.directive?.seed, 1234) + XCTAssertEqual(result.directive?.normalize, "auto") + XCTAssertEqual(result.directive?.language, "en") + XCTAssertEqual(result.directive?.outputFormat, "mp3_44100_128") + XCTAssertEqual(result.stripped, "Hello.") + } + + func testTracksUnknownKeys() { + let text = """ + {"voice":"abc","mystery":"value","extra":1} + Hi. + """ + let result = TalkDirectiveParser.parse(text) + XCTAssertEqual(result.directive?.voiceId, "abc") + XCTAssertEqual(result.unknownKeys, ["extra", "mystery"]) + } +} diff --git a/docs/configuration.md b/docs/configuration.md index a6838f4cc..f15a8f046 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -195,6 +195,21 @@ Controls inbound/outbound prefixes and timestamps. } ``` +### `talk` + +Defaults for Talk mode (macOS/iOS/Android). Voice IDs fall back to `ELEVENLABS_VOICE_ID` or `SAG_VOICE_ID` when unset. + +```json5 +{ + talk: { + voiceId: "elevenlabs_voice_id", + modelId: "eleven_v3", + outputFormat: "mp3_44100_128", + interruptOnSpeech: true + } +} +``` + ### `agent` Controls the embedded agent runtime (model/thinking/verbose/timeouts). 
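For reference, the voice-ID fallback documented in the `talk` section above can be sketched in a few lines of Swift. This helper is illustrative only (it is not part of this diff), but it mirrors how the macOS runtime and the settings UI resolve the effective voice:

```swift
import Foundation

// Illustrative sketch: resolve the effective Talk voice in the documented order,
// i.e. talk.voiceId from clawdis.json first, then ELEVENLABS_VOICE_ID, then SAG_VOICE_ID.
func resolveTalkVoiceId(configVoiceId: String?) -> String? {
    let env = ProcessInfo.processInfo.environment
    return [configVoiceId, env["ELEVENLABS_VOICE_ID"], env["SAG_VOICE_ID"]]
        .compactMap { $0?.trimmingCharacters(in: .whitespacesAndNewlines) }
        .first { !$0.isEmpty }
}
```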
diff --git a/docs/talk.md b/docs/talk.md new file mode 100644 index 000000000..4c3cf53cb --- /dev/null +++ b/docs/talk.md @@ -0,0 +1,72 @@ +--- +summary: "Talk mode: continuous speech conversations with ElevenLabs TTS" +read_when: + - Implementing Talk mode on macOS/iOS/Android + - Changing voice/TTS/interrupt behavior +--- +# Talk Mode + +Talk mode is a continuous voice conversation loop: +1) Listen for speech +2) Send transcript to the model (main session, chat.send) +3) Wait for the response +4) Speak it via ElevenLabs + +## Behavior (macOS) +- **Always-on overlay** while Talk mode is enabled. +- **Listening → Thinking → Speaking** phase transitions. +- On a **short pause** (silence window), the current transcript is sent. +- Replies are **written to WebChat** (same as typing). +- **Interrupt on speech** (default on): if the user starts talking while the assistant is speaking, we stop playback and note the interruption timestamp for the next prompt. + +## Voice directives in replies +The assistant may prefix its reply with a **single JSON line** to control voice: + +```json +{"voice":"","once":true} +``` + +Rules: +- First non-empty line only. +- Unknown keys are ignored. +- `once: true` applies to the current reply only. +- Without `once`, the voice becomes the new default for Talk mode. +- The JSON line is stripped before TTS playback. + +Supported keys: +- `voice` / `voice_id` / `voiceId` +- `model` / `model_id` / `modelId` +- `speed`, `rate` (WPM), `stability`, `similarity`, `style`, `speakerBoost` +- `seed`, `normalize`, `lang`, `output_format`, `latency_tier` +- `once` + +## Config (clawdis.json) +```json5 +{ + "talk": { + "voiceId": "elevenlabs_voice_id", + "modelId": "eleven_v3", + "outputFormat": "mp3_44100_128", + "interruptOnSpeech": true + } +} +``` + +Defaults: +- `interruptOnSpeech`: true +- `voiceId`: falls back to `ELEVENLABS_VOICE_ID` / `SAG_VOICE_ID` + +## macOS UI +- Menu bar toggle: **Talk** +- Config tab: **Talk Mode** group (voice id + interrupt toggle) +- Overlay: + - **Listening**: cloud pulses with mic level + - **Thinking**: sinking animation + - **Speaking**: radiating rings + - Click cloud: stop speaking + - Click X: exit Talk mode + +## Notes +- Requires Speech + Microphone permissions. +- Uses `chat.send` against session key `main`. +- TTS uses ElevenLabs API with `ELEVENLABS_API_KEY`. diff --git a/src/config/config.ts b/src/config/config.ts index a9cbb2243..40ae5da06 100644 --- a/src/config/config.ts +++ b/src/config/config.ts @@ -219,6 +219,17 @@ export type CanvasHostConfig = { port?: number; }; +export type TalkConfig = { + /** Default ElevenLabs voice ID for Talk mode. */ + voiceId?: string; + /** Default ElevenLabs model ID for Talk mode. */ + modelId?: string; + /** Default ElevenLabs output format (e.g. mp3_44100_128). */ + outputFormat?: string; + /** Stop speaking when user starts talking (default: true). */ + interruptOnSpeech?: boolean; +}; + export type GatewayControlUiConfig = { /** If false, the Gateway will not serve the Control UI (/). Default: true. 
*/ enabled?: boolean; @@ -391,6 +402,7 @@ export type ClawdisConfig = { bridge?: BridgeConfig; discovery?: DiscoveryConfig; canvasHost?: CanvasHostConfig; + talk?: TalkConfig; gateway?: GatewayConfig; skills?: Record; }; @@ -785,6 +797,14 @@ const ClawdisSchema = z.object({ port: z.number().int().positive().optional(), }) .optional(), + talk: z + .object({ + voiceId: z.string().optional(), + modelId: z.string().optional(), + outputFormat: z.string().optional(), + interruptOnSpeech: z.boolean().optional(), + }) + .optional(), gateway: z .object({ mode: z.union([z.literal("local"), z.literal("remote")]).optional(),