feat: add talk mode across nodes
This commit is contained in:
@@ -35,6 +35,10 @@ class MainViewModel(app: Application) : AndroidViewModel(app) {
|
||||
val voiceWakeMode: StateFlow<VoiceWakeMode> = runtime.voiceWakeMode
|
||||
val voiceWakeStatusText: StateFlow<String> = runtime.voiceWakeStatusText
|
||||
val voiceWakeIsListening: StateFlow<Boolean> = runtime.voiceWakeIsListening
|
||||
val talkEnabled: StateFlow<Boolean> = runtime.talkEnabled
|
||||
val talkStatusText: StateFlow<String> = runtime.talkStatusText
|
||||
val talkIsListening: StateFlow<Boolean> = runtime.talkIsListening
|
||||
val talkIsSpeaking: StateFlow<Boolean> = runtime.talkIsSpeaking
|
||||
val manualEnabled: StateFlow<Boolean> = runtime.manualEnabled
|
||||
val manualHost: StateFlow<String> = runtime.manualHost
|
||||
val manualPort: StateFlow<Int> = runtime.manualPort
|
||||
@@ -95,6 +99,10 @@ class MainViewModel(app: Application) : AndroidViewModel(app) {
|
||||
runtime.setVoiceWakeMode(mode)
|
||||
}
|
||||
|
||||
/** Forwards the Talk Mode on/off request to the node runtime. */
fun setTalkEnabled(enabled: Boolean) {
    runtime.setTalkEnabled(value = enabled)
}
|
||||
|
||||
/** Asks the node runtime to connect to the given bridge [endpoint]. */
fun connect(endpoint: BridgeEndpoint) {
    runtime.connect(endpoint = endpoint)
}
|
||||
|
||||
@@ -25,6 +25,7 @@ import com.steipete.clawdis.node.protocol.ClawdisCanvasA2UIAction
|
||||
import com.steipete.clawdis.node.protocol.ClawdisCanvasA2UICommand
|
||||
import com.steipete.clawdis.node.protocol.ClawdisCanvasCommand
|
||||
import com.steipete.clawdis.node.protocol.ClawdisScreenCommand
|
||||
import com.steipete.clawdis.node.voice.TalkModeManager
|
||||
import com.steipete.clawdis.node.voice.VoiceWakeManager
|
||||
import kotlinx.coroutines.CoroutineScope
|
||||
import kotlinx.coroutines.Dispatchers
|
||||
@@ -84,6 +85,15 @@ class NodeRuntime(context: Context) {
|
||||
/** Status line exposed by the voice-wake component. */
val voiceWakeStatusText: StateFlow<String> get() = voiceWake.statusText

// The talk-mode properties below are getters rather than initializers, so `talkMode`
// is only touched when a caller actually reads one of them.

/** Status line exposed by the talk-mode manager. */
val talkStatusText: StateFlow<String> get() = talkMode.statusText

/** Mirrors the talk-mode manager's `isListening` flow. */
val talkIsListening: StateFlow<Boolean> get() = talkMode.isListening

/** Mirrors the talk-mode manager's `isSpeaking` flow. */
val talkIsSpeaking: StateFlow<Boolean> get() = talkMode.isSpeaking
|
||||
|
||||
private val discovery = BridgeDiscovery(appContext, scope = scope)
|
||||
val bridges: StateFlow<List<BridgeEndpoint>> = discovery.bridges
|
||||
val discoveryStatusText: StateFlow<String> = discovery.statusText
|
||||
@@ -133,6 +143,9 @@ class NodeRuntime(context: Context) {
|
||||
)
|
||||
|
||||
private val chat = ChatController(scope = scope, session = session, json = json)
|
||||
/** Talk-mode manager, created on first access and immediately bound to the bridge [session]. */
private val talkMode: TalkModeManager by lazy {
    val manager = TalkModeManager(context = appContext, scope = scope)
    manager.attachSession(session)
    manager
}
|
||||
|
||||
private fun handleSessionDisconnected(message: String) {
|
||||
_statusText.value = message
|
||||
@@ -163,6 +176,7 @@ class NodeRuntime(context: Context) {
|
||||
val preventSleep: StateFlow<Boolean> = prefs.preventSleep
|
||||
val wakeWords: StateFlow<List<String>> = prefs.wakeWords
|
||||
val voiceWakeMode: StateFlow<VoiceWakeMode> = prefs.voiceWakeMode
|
||||
val talkEnabled: StateFlow<Boolean> = prefs.talkEnabled
|
||||
val manualEnabled: StateFlow<Boolean> = prefs.manualEnabled
|
||||
val manualHost: StateFlow<String> = prefs.manualHost
|
||||
val manualPort: StateFlow<Int> = prefs.manualPort
|
||||
@@ -218,6 +232,13 @@ class NodeRuntime(context: Context) {
|
||||
}
|
||||
}
|
||||
|
||||
scope.launch {
|
||||
talkEnabled.collect { enabled ->
|
||||
talkMode.setEnabled(enabled)
|
||||
externalAudioCaptureActive.value = enabled
|
||||
}
|
||||
}
|
||||
|
||||
scope.launch(Dispatchers.Default) {
|
||||
bridges.collect { list ->
|
||||
if (list.isNotEmpty()) {
|
||||
@@ -311,6 +332,10 @@ class NodeRuntime(context: Context) {
|
||||
prefs.setVoiceWakeMode(mode)
|
||||
}
|
||||
|
||||
/** Stores the Talk Mode switch state in preferences. */
fun setTalkEnabled(value: Boolean) {
    prefs.setTalkEnabled(value = value)
}
|
||||
|
||||
fun connect(endpoint: BridgeEndpoint) {
|
||||
scope.launch {
|
||||
_statusText.value = "Connecting…"
|
||||
@@ -548,6 +573,7 @@ class NodeRuntime(context: Context) {
|
||||
return
|
||||
}
|
||||
|
||||
talkMode.handleBridgeEvent(event, payloadJson)
|
||||
chat.handleBridgeEvent(event, payloadJson)
|
||||
}
|
||||
|
||||
|
||||
@@ -73,6 +73,9 @@ class SecurePrefs(context: Context) {
|
||||
private val _voiceWakeMode = MutableStateFlow(loadVoiceWakeMode())
|
||||
val voiceWakeMode: StateFlow<VoiceWakeMode> = _voiceWakeMode
|
||||
|
||||
private val _talkEnabled = MutableStateFlow(prefs.getBoolean("talk.enabled", false))
|
||||
val talkEnabled: StateFlow<Boolean> = _talkEnabled
|
||||
|
||||
fun setLastDiscoveredStableId(value: String) {
|
||||
val trimmed = value.trim()
|
||||
prefs.edit { putString("bridge.lastDiscoveredStableId", trimmed) }
|
||||
@@ -158,6 +161,11 @@ class SecurePrefs(context: Context) {
|
||||
_voiceWakeMode.value = mode
|
||||
}
|
||||
|
||||
/**
 * Writes the Talk Mode flag under the `talk.enabled` preference key and then publishes
 * the new value on the in-memory flow.
 */
fun setTalkEnabled(value: Boolean) {
    prefs.edit {
        putBoolean("talk.enabled", value)
    }
    _talkEnabled.value = value
}
|
||||
|
||||
private fun loadVoiceWakeMode(): VoiceWakeMode {
|
||||
val raw = prefs.getString(voiceWakeModeKey, null)
|
||||
val resolved = VoiceWakeMode.fromRawValue(raw)
|
||||
|
||||
@@ -62,6 +62,8 @@ fun SettingsSheet(viewModel: MainViewModel) {
|
||||
val wakeWords by viewModel.wakeWords.collectAsState()
|
||||
val voiceWakeMode by viewModel.voiceWakeMode.collectAsState()
|
||||
val voiceWakeStatusText by viewModel.voiceWakeStatusText.collectAsState()
|
||||
val talkEnabled by viewModel.talkEnabled.collectAsState()
|
||||
val talkStatusText by viewModel.talkStatusText.collectAsState()
|
||||
val isConnected by viewModel.isConnected.collectAsState()
|
||||
val manualEnabled by viewModel.manualEnabled.collectAsState()
|
||||
val manualHost by viewModel.manualHost.collectAsState()
|
||||
@@ -307,6 +309,28 @@ fun SettingsSheet(viewModel: MainViewModel) {
|
||||
|
||||
// Voice
|
||||
item { Text("Voice", style = MaterialTheme.typography.titleSmall) }
|
||||
item {
|
||||
ListItem(
|
||||
headlineContent = { Text("Talk Mode") },
|
||||
supportingContent = { Text(talkStatusText) },
|
||||
trailingContent = {
|
||||
Switch(
|
||||
checked = talkEnabled,
|
||||
onCheckedChange = { on ->
|
||||
if (on) {
|
||||
val micOk =
|
||||
ContextCompat.checkSelfPermission(context, Manifest.permission.RECORD_AUDIO) ==
|
||||
PackageManager.PERMISSION_GRANTED
|
||||
if (!micOk) audioPermissionLauncher.launch(Manifest.permission.RECORD_AUDIO)
|
||||
viewModel.setTalkEnabled(true)
|
||||
} else {
|
||||
viewModel.setTalkEnabled(false)
|
||||
}
|
||||
},
|
||||
)
|
||||
},
|
||||
)
|
||||
}
|
||||
item {
|
||||
val enabled = voiceWakeMode != VoiceWakeMode.Off
|
||||
ListItem(
|
||||
|
||||
@@ -0,0 +1,194 @@
|
||||
package com.steipete.clawdis.node.voice
|
||||
|
||||
import kotlinx.serialization.json.Json
|
||||
import kotlinx.serialization.json.JsonElement
|
||||
import kotlinx.serialization.json.JsonObject
|
||||
import kotlinx.serialization.json.JsonPrimitive
|
||||
|
||||
private val directiveJson = Json { ignoreUnknownKeys = true }
|
||||
|
||||
/**
 * Speech settings parsed from the optional JSON line at the top of an assistant reply.
 *
 * Every field is optional; `null` means the reply did not specify that setting.
 */
data class TalkDirective(
    /** Parsed from `voice`, `voice_id` or `voiceId`. */
    val voiceId: String? = null,
    /** Parsed from `model`, `model_id` or `modelId`. */
    val modelId: String? = null,
    /** Parsed from `speed`. */
    val speed: Double? = null,
    /** Parsed from `rate` or `wpm`. */
    val rateWpm: Int? = null,
    /** Parsed from `stability`. */
    val stability: Double? = null,
    /** Parsed from `similarity`, `similarity_boost` or `similarityBoost`. */
    val similarity: Double? = null,
    /** Parsed from `style`. */
    val style: Double? = null,
    /** Parsed from `speaker_boost`/`speakerBoost`, or the negation of `no_speaker_boost`/`noSpeakerBoost`. */
    val speakerBoost: Boolean? = null,
    /** Parsed from `seed`. */
    val seed: Long? = null,
    /** Parsed from `normalize` or `apply_text_normalization`. */
    val normalize: String? = null,
    /** Parsed from `lang`, `language_code` or `language`. */
    val language: String? = null,
    /** Parsed from `output_format` or `format`. */
    val outputFormat: String? = null,
    /** Parsed from `latency`, `latency_tier` or `latencyTier`. */
    val latencyTier: Int? = null,
    /** Parsed from `once`. */
    val once: Boolean? = null,
)
|
||||
|
||||
/** Outcome of [TalkDirectiveParser.parse]. */
data class TalkDirectiveParseResult(
    /** The parsed directive, or `null` when the text carried no usable directive line. */
    val directive: TalkDirective?,
    /** The text with the directive line removed (the input text unchanged when [directive] is `null`). */
    val stripped: String,
    /** Keys present in the directive JSON that the parser does not recognise, sorted. */
    val unknownKeys: List<String>,
)
|
||||
|
||||
/**
 * Parses the optional one-line JSON directive that may precede an assistant reply.
 *
 * The first non-blank line is treated as a directive only when it starts with `{`, ends
 * with `}`, parses as a JSON object, and yields at least one recognised setting.
 */
object TalkDirectiveParser {
    /** Lower-cased spellings of every key considered "known" when reporting unknown keys. */
    private val recognizedKeys = setOf(
        "voice", "voice_id", "voiceid",
        "model", "model_id", "modelid",
        "speed", "rate", "wpm",
        "stability", "similarity", "similarity_boost", "similarityboost",
        "style",
        "speaker_boost", "speakerboost",
        "no_speaker_boost", "nospeakerboost",
        "seed",
        "normalize", "apply_text_normalization",
        "lang", "language_code", "language",
        "output_format", "format",
        "latency", "latency_tier", "latencytier",
        "once",
    )

    /**
     * Splits [text] into a directive and the remaining text.
     *
     * When no directive is found the result carries a `null` directive, the original
     * [text] untouched, and no unknown keys. Otherwise the directive line (and one blank
     * line directly after it, if present) is removed and the remaining lines are joined
     * with `\n`.
     */
    fun parse(text: String): TalkDirectiveParseResult {
        val passthrough = TalkDirectiveParseResult(null, text, emptyList())

        val lines = text.replace("\r\n", "\n").split("\n").toMutableList()
        val headIndex = lines.indexOfFirst { it.isNotBlank() }
        if (headIndex < 0) return passthrough

        val head = lines[headIndex].trim()
        val looksLikeObject = head.startsWith("{") && head.endsWith("}")
        if (!looksLikeObject) return passthrough

        val obj = parseJsonObject(head) ?: return passthrough

        // A positive flag wins; otherwise fall back to the inverted negative flag.
        val speakerBoost =
            boolValue(obj, listOf("speaker_boost", "speakerBoost"))
                ?: boolValue(obj, listOf("no_speaker_boost", "noSpeakerBoost"))?.not()

        val directive = TalkDirective(
            voiceId = stringValue(obj, listOf("voice", "voice_id", "voiceId")),
            modelId = stringValue(obj, listOf("model", "model_id", "modelId")),
            speed = doubleValue(obj, listOf("speed")),
            rateWpm = intValue(obj, listOf("rate", "wpm")),
            stability = doubleValue(obj, listOf("stability")),
            similarity = doubleValue(obj, listOf("similarity", "similarity_boost", "similarityBoost")),
            style = doubleValue(obj, listOf("style")),
            speakerBoost = speakerBoost,
            seed = longValue(obj, listOf("seed")),
            normalize = stringValue(obj, listOf("normalize", "apply_text_normalization")),
            language = stringValue(obj, listOf("lang", "language_code", "language")),
            outputFormat = stringValue(obj, listOf("output_format", "format")),
            latencyTier = intValue(obj, listOf("latency", "latency_tier", "latencyTier")),
            once = boolValue(obj, listOf("once")),
        )

        // A JSON object that sets none of the recognised fields is ordinary reply text.
        val populatedFields = listOfNotNull(
            directive.voiceId,
            directive.modelId,
            directive.speed,
            directive.rateWpm,
            directive.stability,
            directive.similarity,
            directive.style,
            directive.speakerBoost,
            directive.seed,
            directive.normalize,
            directive.language,
            directive.outputFormat,
            directive.latencyTier,
            directive.once,
        )
        if (populatedFields.isEmpty()) return passthrough

        val unknownKeys = obj.keys.filterNot { it.lowercase() in recognizedKeys }.sorted()

        // Drop the directive line, plus a single blank separator line right after it.
        lines.removeAt(headIndex)
        if (lines.getOrNull(headIndex)?.isBlank() == true) {
            lines.removeAt(headIndex)
        }

        return TalkDirectiveParseResult(directive, lines.joinToString("\n"), unknownKeys)
    }

    /** Parses [line] as JSON; returns `null` when parsing fails or the result is not an object. */
    private fun parseJsonObject(line: String): JsonObject? =
        runCatching { directiveJson.parseToJsonElement(line) as? JsonObject }.getOrNull()

    /** First non-empty (after trimming) string found under any of [keys], in order. */
    private fun stringValue(obj: JsonObject, keys: List<String>): String? =
        keys.firstNotNullOfOrNull { key ->
            obj[key].asStringOrNull()?.trim()?.takeIf { it.isNotEmpty() }
        }

    /** First value under any of [keys] that converts to a [Double]. */
    private fun doubleValue(obj: JsonObject, keys: List<String>): Double? =
        keys.firstNotNullOfOrNull { key -> obj[key].asDoubleOrNull() }

    /** First value under any of [keys] that converts to an [Int]. */
    private fun intValue(obj: JsonObject, keys: List<String>): Int? =
        keys.firstNotNullOfOrNull { key -> obj[key].asIntOrNull() }

    /** First value under any of [keys] that converts to a [Long]. */
    private fun longValue(obj: JsonObject, keys: List<String>): Long? =
        keys.firstNotNullOfOrNull { key -> obj[key].asLongOrNull() }

    /** First value under any of [keys] that converts to a [Boolean]. */
    private fun boolValue(obj: JsonObject, keys: List<String>): Boolean? =
        keys.firstNotNullOfOrNull { key -> obj[key].asBooleanOrNull() }
}
|
||||
|
||||
/** Returns the primitive's `contentOrNull`, or `null` when the receiver is absent or not a [JsonPrimitive]. */
private fun JsonElement?.asStringOrNull(): String? {
    val primitive = this as? JsonPrimitive ?: return null
    return primitive.contentOrNull
}
|
||||
|
||||
/**
 * Reads a [Double] from a JSON primitive. Quoted strings are parsed with `toDoubleOrNull`,
 * other primitives use `doubleOrNull`. Non-primitives and absent values give `null`.
 */
private fun JsonElement?.asDoubleOrNull(): Double? =
    (this as? JsonPrimitive)?.let { primitive ->
        if (primitive.isString) primitive.content.toDoubleOrNull() else primitive.doubleOrNull
    }
|
||||
|
||||
/**
 * Reads an [Int] from a JSON primitive. Quoted strings are parsed with `toIntOrNull`,
 * other primitives use `intOrNull`. Non-primitives and absent values give `null`.
 */
private fun JsonElement?.asIntOrNull(): Int? =
    (this as? JsonPrimitive)?.let { primitive ->
        if (primitive.isString) primitive.content.toIntOrNull() else primitive.intOrNull
    }
|
||||
|
||||
/**
 * Reads a [Long] from a JSON primitive. Quoted strings are parsed with `toLongOrNull`,
 * other primitives use `longOrNull`. Non-primitives and absent values give `null`.
 */
private fun JsonElement?.asLongOrNull(): Long? =
    (this as? JsonPrimitive)?.let { primitive ->
        if (primitive.isString) primitive.content.toLongOrNull() else primitive.longOrNull
    }
|
||||
|
||||
/**
 * Reads a [Boolean] from a JSON primitive.
 *
 * `booleanOrNull` is tried first; failing that, the trimmed, lower-cased content is
 * matched against `true`/`yes`/`1` and `false`/`no`/`0`. Anything else gives `null`.
 */
private fun JsonElement?.asBooleanOrNull(): Boolean? {
    val primitive = this as? JsonPrimitive ?: return null
    primitive.booleanOrNull?.let { return it }
    return when (primitive.contentOrNull?.trim()?.lowercase()) {
        "true", "yes", "1" -> true
        "false", "no", "0" -> false
        else -> null
    }
}
|
||||
@@ -0,0 +1,713 @@
|
||||
package com.steipete.clawdis.node.voice
|
||||
|
||||
import android.Manifest
|
||||
import android.content.Context
|
||||
import android.content.Intent
|
||||
import android.content.pm.PackageManager
|
||||
import android.media.AudioAttributes
|
||||
import android.media.MediaPlayer
|
||||
import android.os.Bundle
|
||||
import android.os.Handler
|
||||
import android.os.Looper
|
||||
import android.os.SystemClock
|
||||
import android.speech.RecognitionListener
|
||||
import android.speech.RecognizerIntent
|
||||
import android.speech.SpeechRecognizer
|
||||
import android.util.Log
|
||||
import androidx.core.content.ContextCompat
|
||||
import com.steipete.clawdis.node.bridge.BridgeSession
|
||||
import java.io.File
|
||||
import java.net.HttpURLConnection
|
||||
import java.net.URL
|
||||
import java.util.UUID
|
||||
import kotlinx.coroutines.CompletableDeferred
|
||||
import kotlinx.coroutines.CoroutineScope
|
||||
import kotlinx.coroutines.Dispatchers
|
||||
import kotlinx.coroutines.Job
|
||||
import kotlinx.coroutines.delay
|
||||
import kotlinx.coroutines.flow.MutableStateFlow
|
||||
import kotlinx.coroutines.flow.StateFlow
|
||||
import kotlinx.coroutines.launch
|
||||
import kotlinx.coroutines.withContext
|
||||
import kotlinx.serialization.json.Json
|
||||
import kotlinx.serialization.json.JsonArray
|
||||
import kotlinx.serialization.json.JsonElement
|
||||
import kotlinx.serialization.json.JsonObject
|
||||
import kotlinx.serialization.json.JsonPrimitive
|
||||
import kotlinx.serialization.json.buildJsonObject
|
||||
|
||||
class TalkModeManager(
|
||||
private val context: Context,
|
||||
private val scope: CoroutineScope,
|
||||
) {
|
||||
companion object {
|
||||
private const val tag = "TalkMode"
|
||||
}
|
||||
|
||||
private val mainHandler = Handler(Looper.getMainLooper())
|
||||
private val json = Json { ignoreUnknownKeys = true }
|
||||
|
||||
private val _isEnabled = MutableStateFlow(false)
|
||||
val isEnabled: StateFlow<Boolean> = _isEnabled
|
||||
|
||||
private val _isListening = MutableStateFlow(false)
|
||||
val isListening: StateFlow<Boolean> = _isListening
|
||||
|
||||
private val _isSpeaking = MutableStateFlow(false)
|
||||
val isSpeaking: StateFlow<Boolean> = _isSpeaking
|
||||
|
||||
private val _statusText = MutableStateFlow("Off")
|
||||
val statusText: StateFlow<String> = _statusText
|
||||
|
||||
private var recognizer: SpeechRecognizer? = null
|
||||
private var restartJob: Job? = null
|
||||
private var stopRequested = false
|
||||
private var listeningMode = false
|
||||
|
||||
private var silenceJob: Job? = null
|
||||
private val silenceWindowMs = 700L
|
||||
private var lastTranscript: String = ""
|
||||
private var lastHeardAtMs: Long? = null
|
||||
private var lastSpokenText: String? = null
|
||||
private var lastInterruptedAtSeconds: Double? = null
|
||||
|
||||
private var defaultVoiceId: String? = null
|
||||
private var currentVoiceId: String? = null
|
||||
private var defaultModelId: String? = null
|
||||
private var currentModelId: String? = null
|
||||
private var defaultOutputFormat: String? = null
|
||||
private var interruptOnSpeech: Boolean = true
|
||||
private var voiceOverrideActive = false
|
||||
private var modelOverrideActive = false
|
||||
|
||||
private var session: BridgeSession? = null
|
||||
private var pendingRunId: String? = null
|
||||
private var pendingFinal: CompletableDeferred<Boolean>? = null
|
||||
|
||||
private var player: MediaPlayer? = null
|
||||
private var currentAudioFile: File? = null
|
||||
|
||||
/** Stores the bridge [session] this manager uses for its bridge requests. */
fun attachSession(session: BridgeSession) {
    this.session = session
}
|
||||
|
||||
/**
 * Turns talk mode on or off. A call that does not change the current state is ignored;
 * otherwise the flag is updated and [start] or [stop] is invoked.
 */
fun setEnabled(enabled: Boolean) {
    val unchanged = _isEnabled.value == enabled
    if (unchanged) return

    _isEnabled.value = enabled
    if (enabled) start() else stop()
}
|
||||
|
||||
/**
 * Watches bridge `chat` events for the run this manager is waiting on.
 *
 * When the payload's `runId` equals the pending run id and its `state` is `final`, the
 * pending deferred is completed with `true` and both pending fields are cleared.
 * Other events, blank or unparseable payloads, and other runs are ignored.
 */
fun handleBridgeEvent(event: String, payloadJson: String?) {
    if (event != "chat") return
    if (payloadJson.isNullOrBlank()) return
    val awaitedRunId = pendingRunId ?: return

    val payload =
        runCatching { json.parseToJsonElement(payloadJson).asObjectOrNull() }.getOrNull()
            ?: return

    if (payload["runId"].asStringOrNull() != awaitedRunId) return
    if (payload["state"].asStringOrNull() != "final") return

    pendingFinal?.complete(true)
    pendingFinal = null
    pendingRunId = null
}
|
||||
|
||||
/**
 * Begins listening; the work is posted to the main-looper handler.
 *
 * Does nothing if already listening. Publishes a status message and bails out when
 * speech recognition is unavailable or the microphone permission is missing. Otherwise
 * it replaces the recognizer, starts listening and starts the silence monitor.
 */
private fun start() {
    mainHandler.post {
        if (_isListening.value) return@post
        stopRequested = false
        listeningMode = true

        // First failing precondition, if any, becomes the status text.
        val blocker: String? =
            when {
                !SpeechRecognizer.isRecognitionAvailable(context) -> "Speech recognizer unavailable"
                ContextCompat.checkSelfPermission(context, Manifest.permission.RECORD_AUDIO) !=
                    PackageManager.PERMISSION_GRANTED -> "Microphone permission required"
                else -> null
            }
        if (blocker != null) {
            _statusText.value = blocker
            return@post
        }

        try {
            recognizer?.destroy()
            val fresh = SpeechRecognizer.createSpeechRecognizer(context)
            fresh.setRecognitionListener(listener)
            recognizer = fresh
            startListeningInternal(markListening = true)
            startSilenceMonitor()
        } catch (failure: Throwable) {
            _statusText.value = "Start failed: ${failure.message ?: failure::class.simpleName}"
        }
    }
}
|
||||
|
||||
/**
 * Shuts talk mode down: cancels the restart and silence jobs, clears transcript state,
 * publishes the "Off" status, stops any speech, and tears the recognizer down via the
 * main-looper handler.
 */
private fun stop() {
    stopRequested = true
    listeningMode = false

    // Background jobs.
    restartJob?.cancel()
    silenceJob?.cancel()
    restartJob = null
    silenceJob = null

    // Transcript bookkeeping and published state.
    lastTranscript = ""
    lastHeardAtMs = null
    _isListening.value = false
    _statusText.value = "Off"

    stopSpeaking()

    mainHandler.post {
        recognizer?.run {
            cancel()
            destroy()
        }
        recognizer = null
    }
}
|
||||
|
||||
/**
 * Starts a free-form recognition request with partial results on the current recognizer.
 * Does nothing when there is no recognizer.
 *
 * @param markListening when true, also publishes the "Listening" status and sets the
 *   listening flag before the request is started.
 */
private fun startListeningInternal(markListening: Boolean) {
    val activeRecognizer = recognizer ?: return

    val request = Intent(RecognizerIntent.ACTION_RECOGNIZE_SPEECH)
    request.putExtra(RecognizerIntent.EXTRA_LANGUAGE_MODEL, RecognizerIntent.LANGUAGE_MODEL_FREE_FORM)
    request.putExtra(RecognizerIntent.EXTRA_PARTIAL_RESULTS, true)
    request.putExtra(RecognizerIntent.EXTRA_MAX_RESULTS, 3)
    request.putExtra(RecognizerIntent.EXTRA_CALLING_PACKAGE, context.packageName)

    if (markListening) {
        _statusText.value = "Listening"
        _isListening.value = true
    }
    activeRecognizer.startListening(request)
}
|
||||
|
||||
/**
 * Schedules a recognizer restart after [delayMs], replacing any restart already queued.
 *
 * The restart is skipped once a stop has been requested. It only re-listens when
 * listening mode is on, or when speech is playing and interrupt-on-speech is enabled.
 */
private fun scheduleRestart(delayMs: Long = 350) {
    if (stopRequested) return

    restartJob?.cancel()
    restartJob =
        scope.launch {
            delay(delayMs)
            mainHandler.post {
                if (stopRequested) return@post
                try {
                    recognizer?.cancel()
                    val resumeListening = listeningMode
                    val watchForInterrupt = _isSpeaking.value && interruptOnSpeech
                    if (resumeListening || watchForInterrupt) {
                        startListeningInternal(markListening = resumeListening)
                    }
                } catch (_: Throwable) {
                    // handled by onError
                }
            }
        }
}
|
||||
|
||||
/**
 * Handles a partial or final recognition result.
 *
 * While speech is playing with interrupt-on-speech enabled, the text is only used to
 * decide whether to stop the playback. Otherwise, while listening, non-empty text
 * updates the last transcript and its timestamp, and a final result always overwrites
 * the last transcript, even when empty.
 */
private fun handleTranscript(text: String, isFinal: Boolean) {
    val heard = text.trim()

    val bargeInActive = _isSpeaking.value && interruptOnSpeech
    if (bargeInActive) {
        if (shouldInterrupt(heard)) stopSpeaking()
        return
    }

    if (!_isListening.value) return

    val hasText = heard.isNotEmpty()
    if (hasText) {
        lastHeardAtMs = SystemClock.elapsedRealtime()
    }
    if (hasText || isFinal) {
        lastTranscript = heard
    }
}
|
||||
|
||||
/**
 * Restarts the silence monitor: a coroutine that runs [checkSilence] every 200 ms for
 * as long as talk mode is enabled.
 */
private fun startSilenceMonitor() {
    silenceJob?.cancel()
    val pollIntervalMs = 200L
    silenceJob =
        scope.launch {
            while (_isEnabled.value) {
                delay(pollIntervalMs)
                checkSilence()
            }
        }
}
|
||||
|
||||
/**
 * Launches finalization of the current transcript once nothing new has been heard for
 * at least the silence window. Does nothing when not listening, when the transcript is
 * empty, or when no timestamp has been recorded.
 */
private fun checkSilence() {
    val pendingTranscript = lastTranscript.trim()
    val heardAt = lastHeardAtMs
    if (!_isListening.value || pendingTranscript.isEmpty() || heardAt == null) return

    val quietForMs = SystemClock.elapsedRealtime() - heardAt
    if (quietForMs >= silenceWindowMs) {
        scope.launch { finalizeTranscript(pendingTranscript) }
    }
}
|
||||
|
||||
/**
 * Runs one talk turn for [transcript]: leaves listening state, reloads config, sends the
 * prompt over the bridge, waits for the final chat event, fetches the assistant reply
 * and plays it.
 *
 * Listening is resumed afterwards only while talk mode is still enabled. Every outcome
 * (missing bridge, no reply, failure, success) goes through that single check, so a turn
 * that finishes after the user switched talk mode off cannot start the recognizer again.
 */
private suspend fun finalizeTranscript(transcript: String) {
    listeningMode = false
    _isListening.value = false
    _statusText.value = "Thinking…"
    lastTranscript = ""
    lastHeardAtMs = null

    reloadConfig()

    try {
        val bridge = session
        if (bridge == null) {
            _statusText.value = "Bridge not connected"
        } else {
            val runId = sendChat(buildPrompt(transcript), bridge)
            val assistant = if (waitForChatFinal(runId)) fetchLatestAssistantText(bridge) else null
            when {
                assistant.isNullOrBlank() -> _statusText.value = "No reply"
                // Talk mode was switched off while waiting: do not start speaking.
                !_isEnabled.value -> Unit
                else -> playAssistant(assistant)
            }
        }
    } catch (err: Throwable) {
        // Let cancellation of this coroutine propagate; anything else is a failed turn.
        if (kotlin.coroutines.coroutineContext[Job]?.isActive == false) throw err
        _statusText.value = "Talk failed: ${err.message ?: err::class.simpleName}"
    }

    if (_isEnabled.value) {
        start()
    } else {
        // Restore the status stop() published, which this turn may have overwritten.
        _statusText.value = "Off"
    }
}
|
||||
|
||||
/**
 * Builds the chat message for a talk turn: two fixed instruction lines, an optional note
 * about where the previous assistant speech was interrupted, a blank line, and the
 * [transcript].
 *
 * The interruption marker is consumed (reset to `null`) once it has been included. The
 * seconds value is formatted with a fixed locale so the decimal separator and digits in
 * the prompt do not depend on the device language.
 */
private fun buildPrompt(transcript: String): String {
    val interruptionNote =
        lastInterruptedAtSeconds?.let { seconds ->
            lastInterruptedAtSeconds = null
            "Assistant speech interrupted at ${"%.1f".format(java.util.Locale.US, seconds)}s."
        }

    return listOfNotNull(
        "Talk Mode active. Reply in a concise, spoken tone.",
        "You may optionally prefix the response with JSON (first line) to set ElevenLabs voice, e.g. {\"voice\":\"<id>\",\"once\":true}.",
        interruptionNote,
        "",
        transcript,
    ).joinToString("\n")
}
|
||||
|
||||
/**
 * Sends [message] through `chat.send` on the `main` session key and returns the run id
 * to wait on.
 *
 * A fresh UUID is sent as the idempotency key. When the response carries a different
 * `runId`, that id is recorded as pending and returned; otherwise the UUID is returned.
 */
private suspend fun sendChat(message: String, bridge: BridgeSession): String {
    val clientRunId = UUID.randomUUID().toString()

    val requestBody =
        buildJsonObject {
            put("sessionKey", JsonPrimitive("main"))
            put("message", JsonPrimitive(message))
            put("thinking", JsonPrimitive("low"))
            put("timeoutMs", JsonPrimitive(30_000))
            put("idempotencyKey", JsonPrimitive(clientRunId))
        }.toString()

    val response = bridge.request("chat.send", requestBody)
    val serverRunId = parseRunId(response)
    if (serverRunId != null && serverRunId != clientRunId) {
        pendingRunId = serverRunId
    }
    return serverRunId ?: clientRunId
}
|
||||
|
||||
/**
 * Suspends until the bridge reports the final chat event for [runId], for at most
 * 120 seconds.
 *
 * Registers a new pending deferred (cancelling any previous one) that
 * [handleBridgeEvent] completes.
 *
 * @return `true` when the final event arrived; `false` on timeout or when this wait was
 *   superseded by a newer one.
 */
private suspend fun waitForChatFinal(runId: String): Boolean {
    pendingFinal?.cancel()
    val deferred = CompletableDeferred<Boolean>()
    pendingRunId = runId
    pendingFinal = deferred

    return try {
        // await() suspends without blocking a thread, so no dispatcher switch is needed.
        kotlinx.coroutines.withTimeoutOrNull(120_000) { deferred.await() } ?: false
    } catch (err: kotlinx.coroutines.CancellationException) {
        // A cancelled deferred (superseded wait) means "no reply"; cancellation of this
        // coroutine itself must keep propagating.
        if (kotlin.coroutines.coroutineContext[Job]?.isActive == false) throw err
        false
    } finally {
        // Only clear the registration if it is still ours; a newer wait may have replaced it.
        if (pendingFinal === deferred) {
            pendingFinal = null
            pendingRunId = null
        }
    }
}
|
||||
|
||||
/**
 * Requests `chat.history` for the `main` session key and returns the text of the most
 * recent assistant message that has any.
 *
 * Messages are scanned newest-first. For each assistant message whose `content` is an
 * array, the trimmed non-empty `text` entries are joined with newlines.
 *
 * @return the joined text, or `null` when the response has no such message.
 */
private suspend fun fetchLatestAssistantText(bridge: BridgeSession): String? {
    val response = bridge.request("chat.history", "{\"sessionKey\":\"main\"}")
    val root = json.parseToJsonElement(response).asObjectOrNull() ?: return null
    val messages = root["messages"] as? JsonArray ?: return null

    return messages.asReversed().firstNotNullOfOrNull { item ->
        val message = item.asObjectOrNull()
        val parts =
            if (message != null && message["role"].asStringOrNull() == "assistant") {
                message["content"] as? JsonArray
            } else {
                null
            }
        parts
            ?.mapNotNull { part -> part.asObjectOrNull()?.get("text")?.asStringOrNull()?.trim() }
            ?.filter { it.isNotEmpty() }
            ?.takeIf { it.isNotEmpty() }
            ?.joinToString("\n")
    }
}
|
||||
|
||||
/**
 * Speaks an assistant reply.
 *
 * The reply is run through [TalkDirectiveParser]. A directive without `once == true`
 * makes its voice and model the current overrides. The remaining text is synthesized
 * and played. The speaking flag is reset in a `finally` block, so it cannot stay set
 * after a failure or after this coroutine is cancelled.
 *
 * Returns early, without speaking, when the text is empty after stripping, when no voice
 * id can be resolved, or when the `ELEVENLABS_API_KEY` environment variable is missing.
 */
private suspend fun playAssistant(text: String) {
    val parsed = TalkDirectiveParser.parse(text)
    if (parsed.unknownKeys.isNotEmpty()) {
        Log.w(tag, "Unknown talk directive keys: ${parsed.unknownKeys}")
    }
    val directive = parsed.directive
    val cleaned = parsed.stripped.trim()
    if (cleaned.isEmpty()) return

    // Directives marked `once` apply to this reply only and are not stored.
    if (directive != null && directive.once != true) {
        directive.voiceId?.let {
            currentVoiceId = it
            voiceOverrideActive = true
        }
        directive.modelId?.let {
            currentModelId = it
            modelOverrideActive = true
        }
    }

    val voiceId = directive?.voiceId ?: currentVoiceId ?: defaultVoiceId
    if (voiceId.isNullOrBlank()) {
        _statusText.value = "Missing voice ID"
        return
    }

    val apiKey = System.getenv("ELEVENLABS_API_KEY")?.trim()
    if (apiKey.isNullOrEmpty()) {
        _statusText.value = "Missing ELEVENLABS_API_KEY"
        return
    }

    _statusText.value = "Speaking…"
    _isSpeaking.value = true
    lastSpokenText = cleaned

    try {
        ensureInterruptListener()
        try {
            val request =
                ElevenLabsRequest(
                    text = cleaned,
                    modelId = directive?.modelId ?: currentModelId ?: defaultModelId,
                    outputFormat = directive?.outputFormat ?: defaultOutputFormat,
                    speed = TalkModeRuntime.resolveSpeed(directive?.speed, directive?.rateWpm),
                    stability = TalkModeRuntime.validatedUnit(directive?.stability),
                    similarity = TalkModeRuntime.validatedUnit(directive?.similarity),
                    style = TalkModeRuntime.validatedUnit(directive?.style),
                    speakerBoost = directive?.speakerBoost,
                    seed = TalkModeRuntime.validatedSeed(directive?.seed),
                    normalize = TalkModeRuntime.validatedNormalize(directive?.normalize),
                    language = TalkModeRuntime.validatedLanguage(directive?.language),
                )
            val audio = synthesize(voiceId = voiceId, apiKey = apiKey, request = request)
            playAudio(audio)
        } catch (err: Throwable) {
            // Let cancellation of this coroutine propagate; report everything else.
            if (kotlin.coroutines.coroutineContext[Job]?.isActive == false) throw err
            _statusText.value = "Speak failed: ${err.message ?: err::class.simpleName}"
        }
    } finally {
        _isSpeaking.value = false
    }
}
|
||||
|
||||
/**
 * Completion signal for the clip currently being played. Completed by the player's
 * callbacks, or by [cleanupPlayer] when playback is torn down early.
 */
private var playbackFinished: CompletableDeferred<Unit>? = null

/**
 * Writes [data] to a temporary `.mp3` file in the cache directory, plays it with a
 * [MediaPlayer], and suspends until playback completes, fails, or is stopped.
 *
 * The player and the temp file are released in all cases.
 *
 * @throws IllegalStateException when the player reports an error.
 */
private suspend fun playAudio(data: ByteArray) {
    // Release any previous clip directly. Going through stopSpeaking() here would clear
    // the speaking flag, and handleTranscript() needs that flag set during playback to
    // detect an interruption.
    cleanupPlayer()

    val file = File.createTempFile("talk-", ".mp3", context.cacheDir)
    currentAudioFile = file

    val finished = CompletableDeferred<Unit>()
    try {
        file.writeBytes(data)

        val player = MediaPlayer()
        this.player = player
        playbackFinished = finished

        player.setAudioAttributes(
            AudioAttributes.Builder()
                .setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
                .setUsage(AudioAttributes.USAGE_ASSISTANT)
                .build(),
        )
        player.setOnCompletionListener {
            finished.complete(Unit)
        }
        player.setOnErrorListener { _, _, _ ->
            finished.completeExceptionally(IllegalStateException("MediaPlayer error"))
            true
        }

        player.setDataSource(file.absolutePath)
        withContext(Dispatchers.Main) {
            player.setOnPreparedListener { it.start() }
            player.prepareAsync()
        }

        finished.await()
    } finally {
        // Also covers failures during setup, so neither the player nor the file leaks.
        cleanupPlayer()
    }
}

/**
 * Stops any assistant speech in progress.
 *
 * When nothing is being spoken this only releases leftover player resources. Otherwise
 * it optionally records the playback position as the interruption point, releases the
 * player (which also wakes a suspended [playAudio]), and clears the speaking flag.
 *
 * @param resetInterrupt when true, store the current playback position (in seconds) as
 *   the last interruption point.
 */
private fun stopSpeaking(resetInterrupt: Boolean = true) {
    if (!_isSpeaking.value) {
        cleanupPlayer()
        return
    }
    if (resetInterrupt) {
        // Reading the position can throw if the player is in an invalid state; fall back to 0.
        val currentMs = runCatching { player?.currentPosition }.getOrNull()?.toDouble() ?: 0.0
        lastInterruptedAtSeconds = currentMs / 1000.0
    }
    cleanupPlayer()
    _isSpeaking.value = false
}

/**
 * Releases the current player and deletes its temp file, then completes the pending
 * playback signal: once the player is released its completion callback will not fire,
 * and [playAudio] would otherwise wait forever. Safe to call repeatedly.
 */
private fun cleanupPlayer() {
    player?.let { active ->
        // stop() is only valid in some player states (e.g. not before prepare finishes).
        runCatching { active.stop() }
        runCatching { active.release() }
    }
    player = null
    currentAudioFile?.delete()
    currentAudioFile = null
    playbackFinished?.complete(Unit)
    playbackFinished = null
}
|
||||
|
||||
/**
 * Decides whether heard text should interrupt the current speech.
 *
 * Text shorter than three characters (after trimming) never interrupts, and neither
 * does text that occurs, ignoring case, inside the last spoken text.
 */
private fun shouldInterrupt(transcript: String): Boolean {
    val heard = transcript.trim()
    if (heard.length < 3) return false

    val echoesOwnSpeech = lastSpokenText?.lowercase()?.contains(heard.lowercase()) ?: false
    return !echoesOwnSpeech
}
|
||||
|
||||
/**
 * Refreshes the default voice, model, output format and interrupt setting from the
 * bridge's `config.get` response (`config.talk.*`). Does nothing without a session.
 *
 * The default voice falls back to the `ELEVENLABS_VOICE_ID` and then the `SAG_VOICE_ID`
 * environment variable. The current voice and model follow the defaults unless an
 * override is active. If the request or parsing fails, only the default voice is reset
 * to the environment fallback.
 */
private suspend fun reloadConfig() {
    val bridge = session ?: return

    val envVoice = System.getenv("ELEVENLABS_VOICE_ID")?.trim()
    val sagVoice = System.getenv("SAG_VOICE_ID")?.trim()
    val fallbackVoice = envVoice?.takeIf { it.isNotEmpty() } ?: sagVoice?.takeIf { it.isNotEmpty() }

    try {
        val response = bridge.request("config.get", "{}")
        val root = json.parseToJsonElement(response).asObjectOrNull()
        val config = root?.get("config").asObjectOrNull()
        val talk = config?.get("talk").asObjectOrNull()

        // Trimmed, non-empty string setting from the `talk` section, or null.
        fun talkString(key: String): String? =
            talk?.get(key)?.asStringOrNull()?.trim()?.takeIf { it.isNotEmpty() }

        val configuredVoice = talkString("voiceId")
        val configuredModel = talkString("modelId")
        val configuredFormat = talkString("outputFormat")
        val configuredInterrupt = talk?.get("interruptOnSpeech")?.asBooleanOrNull()

        defaultVoiceId = configuredVoice ?: fallbackVoice
        if (!voiceOverrideActive) currentVoiceId = defaultVoiceId
        defaultModelId = configuredModel
        if (!modelOverrideActive) currentModelId = defaultModelId
        defaultOutputFormat = configuredFormat
        if (configuredInterrupt != null) interruptOnSpeech = configuredInterrupt
    } catch (_: Throwable) {
        defaultVoiceId = fallbackVoice
    }
}
|
||||
|
||||
/**
 * Extracts the `runId` field from a JSON response.
 *
 * @return the run id, or `null` when the response is not a JSON object, cannot be
 *   parsed, or has a missing or blank `runId`. Returning `null` rather than throwing
 *   lets the caller fall back to its own id.
 */
private fun parseRunId(jsonString: String): String? {
    val obj =
        runCatching { json.parseToJsonElement(jsonString).asObjectOrNull() }.getOrNull()
            ?: return null
    return obj["runId"].asStringOrNull()?.takeIf { it.isNotBlank() }
}
|
||||
|
||||
/**
 * Requests speech audio for [request] from the ElevenLabs text-to-speech endpoint.
 *
 * Runs on [Dispatchers.IO]. The voice id is percent-encoded before being placed in the URL path.
 *
 * @param voiceId ElevenLabs voice identifier used as the last path segment.
 * @param apiKey value sent in the `xi-api-key` header.
 * @param request synthesis parameters, serialised by [buildRequestPayload].
 * @return the raw response body (requested as `audio/mpeg`).
 * @throws IllegalStateException when the server answers with a status of 400 or above.
 */
private suspend fun synthesize(voiceId: String, apiKey: String, request: ElevenLabsRequest): ByteArray {
    return withContext(Dispatchers.IO) {
        // URLEncoder targets form encoding, so turn its '+' back into a path-safe "%20".
        val encodedVoice = java.net.URLEncoder.encode(voiceId, "UTF-8").replace("+", "%20")
        val url = URL("https://api.elevenlabs.io/v1/text-to-speech/$encodedVoice")
        val conn = url.openConnection() as HttpURLConnection
        try {
            conn.requestMethod = "POST"
            // Without timeouts a stalled connection would block this call indefinitely.
            conn.connectTimeout = 30_000
            conn.readTimeout = 120_000
            conn.setRequestProperty("Content-Type", "application/json")
            conn.setRequestProperty("Accept", "audio/mpeg")
            conn.setRequestProperty("xi-api-key", apiKey)
            conn.doOutput = true

            val payload = buildRequestPayload(request)
            conn.outputStream.use { it.write(payload.toByteArray()) }

            val code = conn.responseCode
            if (code >= 400) {
                // errorStream is null when the server sent no error body.
                val message = conn.errorStream?.use { String(it.readBytes()) }.orEmpty()
                throw IllegalStateException("ElevenLabs failed: $code $message")
            }
            conn.inputStream.use { it.readBytes() }
        } finally {
            conn.disconnect()
        }
    }
}
|
||||
|
||||
/**
 * Serialises [request] into the JSON body sent to the text-to-speech endpoint.
 *
 * `text` is always present; every other key is written only when its value is set
 * (and, for `model_id` / `output_format`, non-empty). Voice tuning values are grouped
 * under `voice_settings`, which is omitted entirely when none are set.
 */
private fun buildRequestPayload(request: ElevenLabsRequest): String {
    val voiceSettings = buildJsonObject {
        if (request.speed != null) put("speed", JsonPrimitive(request.speed))
        if (request.stability != null) put("stability", JsonPrimitive(request.stability))
        if (request.similarity != null) put("similarity_boost", JsonPrimitive(request.similarity))
        if (request.style != null) put("style", JsonPrimitive(request.style))
        if (request.speakerBoost != null) put("use_speaker_boost", JsonPrimitive(request.speakerBoost))
    }

    return buildJsonObject {
        put("text", JsonPrimitive(request.text))
        if (!request.modelId.isNullOrEmpty()) put("model_id", JsonPrimitive(request.modelId))
        if (!request.outputFormat.isNullOrEmpty()) put("output_format", JsonPrimitive(request.outputFormat))
        if (request.seed != null) put("seed", JsonPrimitive(request.seed))
        if (request.normalize != null) put("apply_text_normalization", JsonPrimitive(request.normalize))
        if (request.language != null) put("language_code", JsonPrimitive(request.language))
        if (voiceSettings.isNotEmpty()) put("voice_settings", voiceSettings)
    }.toString()
}
|
||||
|
||||
/**
 * Parameters for one text-to-speech call.
 *
 * Only [text] is required; a `null` field is left out of the request body.
 *
 * @property text text to synthesise (`text`).
 * @property modelId sent as `model_id`.
 * @property outputFormat sent as `output_format`.
 * @property speed sent as `voice_settings.speed`.
 * @property stability sent as `voice_settings.stability`.
 * @property similarity sent as `voice_settings.similarity_boost`.
 * @property style sent as `voice_settings.style`.
 * @property speakerBoost sent as `voice_settings.use_speaker_boost`.
 * @property seed sent as `seed`.
 * @property normalize sent as `apply_text_normalization`.
 * @property language sent as `language_code`.
 */
private data class ElevenLabsRequest(
    val text: String,
    val modelId: String?,
    val outputFormat: String?,
    val speed: Double?,
    val stability: Double?,
    val similarity: Double?,
    val style: Double?,
    val speakerBoost: Boolean?,
    val seed: Long?,
    val normalize: String?,
    val language: String?,
)
|
||||
|
||||
/**
 * Validation helpers for directive values before they are placed in a synthesis request.
 *
 * Every helper returns `null` for missing or out-of-range input, including NaN, so invalid
 * numbers never reach the JSON payload.
 */
private object TalkModeRuntime {
    /** Words-per-minute value that maps to a speed multiplier of 1.0. */
    private const val BASELINE_WPM = 175.0

    /**
     * Resolves the speed multiplier. A positive [rateWpm] takes precedence over [speed] and is
     * converted by dividing by 175. The result must lie strictly between 0.5 and 2.0;
     * anything else (including NaN) yields `null`, without falling back to [speed].
     */
    fun resolveSpeed(speed: Double?, rateWpm: Int?): Double? {
        val candidate = if (rateWpm != null && rateWpm > 0) rateWpm / BASELINE_WPM else speed
        // Written as positive comparisons so NaN (for which every comparison is false) is rejected.
        return candidate?.takeIf { it > 0.5 && it < 2.0 }
    }

    /** Returns [value] when it lies in `0.0..1.0` (inclusive); `null` otherwise, including NaN. */
    fun validatedUnit(value: Double?): Double? = value?.takeIf { it in 0.0..1.0 }

    /** Returns [value] when it fits an unsigned 32-bit integer (`0..4294967295`); `null` otherwise. */
    fun validatedSeed(value: Long?): Long? = value?.takeIf { it in 0L..4294967295L }

    /** Returns the trimmed, lower-cased value when it is `auto`, `on` or `off`; `null` otherwise. */
    fun validatedNormalize(value: String?): String? {
        val normalized = value?.trim()?.lowercase() ?: return null
        return normalized.takeIf { it in NORMALIZE_MODES }
    }

    /** Returns the trimmed, lower-cased value when it is exactly two letters `a`-`z`; `null` otherwise. */
    fun validatedLanguage(value: String?): String? {
        val normalized = value?.trim()?.lowercase() ?: return null
        return normalized.takeIf { it.length == 2 && it.all { ch -> ch in 'a'..'z' } }
    }

    private val NORMALIZE_MODES = setOf("auto", "on", "off")
}
|
||||
|
||||
/**
 * Starts a background recognition pass used to detect speech during playback.
 *
 * No-op unless interrupt-on-speech and talk mode are both enabled. The work is posted to
 * [mainHandler]; it is skipped when a stop was requested or recognition is unavailable, and any
 * failure while creating or starting the recognizer is deliberately ignored.
 */
private fun ensureInterruptListener() {
    val wanted = interruptOnSpeech && _isEnabled.value
    if (!wanted) return
    mainHandler.post {
        val canListen = !stopRequested && SpeechRecognizer.isRecognitionAvailable(context)
        if (canListen) {
            try {
                // Reuse the existing recognizer, creating and wiring one on first use.
                val active =
                    recognizer ?: SpeechRecognizer.createSpeechRecognizer(context).also {
                        it.setRecognitionListener(listener)
                        recognizer = it
                    }
                active.cancel()
                startListeningInternal(markListening = false)
            } catch (_: Throwable) {
                // ignore
            }
        }
    }
}
|
||||
|
||||
/**
 * Recognition callbacks: forwards partial and final transcripts to [handleTranscript], maps
 * recognizer errors to status text, and schedules the next listening pass.
 */
private val listener =
    object : RecognitionListener {
        override fun onReadyForSpeech(params: Bundle?) {
            // Only an active listening pass promotes the status to "Listening".
            if (_isEnabled.value && _isListening.value) {
                _statusText.value = "Listening"
            }
        }

        override fun onBeginningOfSpeech() {}

        override fun onRmsChanged(rmsdB: Float) {}

        override fun onBufferReceived(buffer: ByteArray?) {}

        override fun onEndOfSpeech() {
            scheduleRestart()
        }

        override fun onError(error: Int) {
            if (stopRequested) return
            _isListening.value = false

            val permissionDenied = error == SpeechRecognizer.ERROR_INSUFFICIENT_PERMISSIONS
            _statusText.value =
                when (error) {
                    SpeechRecognizer.ERROR_INSUFFICIENT_PERMISSIONS -> "Microphone permission required"
                    SpeechRecognizer.ERROR_AUDIO -> "Audio error"
                    SpeechRecognizer.ERROR_CLIENT -> "Client error"
                    SpeechRecognizer.ERROR_NETWORK -> "Network error"
                    SpeechRecognizer.ERROR_NETWORK_TIMEOUT -> "Network timeout"
                    SpeechRecognizer.ERROR_RECOGNIZER_BUSY -> "Recognizer busy"
                    SpeechRecognizer.ERROR_SERVER -> "Server error"
                    // Nothing recognised is not a failure; keep showing the listening state.
                    SpeechRecognizer.ERROR_NO_MATCH,
                    SpeechRecognizer.ERROR_SPEECH_TIMEOUT,
                    -> "Listening"
                    else -> "Speech error ($error)"
                }
            // A missing permission is not retried; every other error restarts after a short delay.
            if (!permissionDenied) scheduleRestart(delayMs = 600)
        }

        override fun onResults(results: Bundle?) {
            val best = results?.getStringArrayList(SpeechRecognizer.RESULTS_RECOGNITION)?.firstOrNull()
            if (best != null) handleTranscript(best, isFinal = true)
            scheduleRestart()
        }

        override fun onPartialResults(partialResults: Bundle?) {
            val best = partialResults?.getStringArrayList(SpeechRecognizer.RESULTS_RECOGNITION)?.firstOrNull()
            if (best != null) handleTranscript(best, isFinal = false)
        }

        override fun onEvent(eventType: Int, params: Bundle?) {}
    }
|
||||
}
|
||||
|
||||
/** Returns the receiver as a [JsonObject], or `null` when it is `null` or any other element kind. */
private fun JsonElement?.asObjectOrNull(): JsonObject? {
    return if (this is JsonObject) this else null
}
|
||||
|
||||
/** Returns the primitive's content (`contentOrNull`), or `null` when the receiver is not a [JsonPrimitive]. */
private fun JsonElement?.asStringOrNull(): String? =
    when (this) {
        is JsonPrimitive -> contentOrNull
        else -> null
    }
|
||||
|
||||
/**
 * Reads a lenient boolean from a JSON primitive.
 *
 * Uses `booleanOrNull` when it yields a value; otherwise accepts the trimmed, case-insensitive
 * content `true`/`yes`/`1` and `false`/`no`/`0`. Returns `null` for anything else.
 */
private fun JsonElement?.asBooleanOrNull(): Boolean? {
    if (this !is JsonPrimitive) return null
    booleanOrNull?.let { return it }
    return when (contentOrNull?.trim()?.lowercase()) {
        "true", "yes", "1" -> true
        "false", "no", "0" -> false
        else -> null
    }
}
|
||||
@@ -0,0 +1,55 @@
|
||||
package com.steipete.clawdis.node.voice
|
||||
|
||||
import org.junit.Assert.assertEquals
|
||||
import org.junit.Assert.assertNull
|
||||
import org.junit.Assert.assertTrue
|
||||
import org.junit.Test
|
||||
|
||||
/** Unit tests for [TalkDirectiveParser.parse]: header parsing, alias keys, unknown keys, and the no-directive case. */
class TalkDirectiveParserTest {
    /** A JSON first line is parsed into a directive and removed from the spoken text. */
    @Test
    fun parsesDirectiveAndStripsHeader() {
        val reply = """
            {"voice":"voice-123","once":true}
            Hello from talk mode.
        """.trimIndent()

        val parsed = TalkDirectiveParser.parse(reply)
        val directive = parsed.directive

        assertEquals("voice-123", directive?.voiceId)
        assertEquals(true, directive?.once)
        assertEquals("Hello from talk mode.", parsed.stripped.trim())
    }

    /** Unrecognised keys do not prevent parsing and are listed in `unknownKeys`. */
    @Test
    fun ignoresUnknownKeysButReportsThem() {
        val reply = """
            {"voice":"abc","foo":1,"bar":"baz"}
            Hi there.
        """.trimIndent()

        val parsed = TalkDirectiveParser.parse(reply)

        assertEquals("abc", parsed.directive?.voiceId)
        assertTrue(parsed.unknownKeys.containsAll(listOf("bar", "foo")))
    }

    /** Alias keys map to their directive fields; `no_speaker_boost` inverts into `speakerBoost`. */
    @Test
    fun parsesAlternateKeys() {
        val reply = """
            {"model_id":"eleven_v3","similarity_boost":0.4,"no_speaker_boost":true,"rate":200}
            Speak.
        """.trimIndent()

        val directive = TalkDirectiveParser.parse(reply).directive

        assertEquals("eleven_v3", directive?.modelId)
        assertEquals(0.4, directive?.similarity)
        assertEquals(false, directive?.speakerBoost)
        assertEquals(200, directive?.rateWpm)
    }

    /** An empty JSON header yields no directive and leaves the text untouched. */
    @Test
    fun returnsNullWhenNoDirectivePresent() {
        val reply = """
            {}
            Hello.
        """.trimIndent()

        val parsed = TalkDirectiveParser.parse(reply)

        assertNull(parsed.directive)
        assertEquals(reply, parsed.stripped)
    }
}
|
||||
@@ -28,6 +28,7 @@ final class NodeAppModel {
|
||||
private var voiceWakeSyncTask: Task<Void, Never>?
|
||||
@ObservationIgnored private var cameraHUDDismissTask: Task<Void, Never>?
|
||||
let voiceWake = VoiceWakeManager()
|
||||
let talkMode = TalkModeManager()
|
||||
private var lastAutoA2uiURL: String?
|
||||
|
||||
var bridgeSession: BridgeSession { self.bridge }
|
||||
@@ -49,6 +50,9 @@ final class NodeAppModel {
|
||||
|
||||
let enabled = UserDefaults.standard.bool(forKey: "voiceWake.enabled")
|
||||
self.voiceWake.setEnabled(enabled)
|
||||
self.talkMode.attachBridge(self.bridge)
|
||||
let talkEnabled = UserDefaults.standard.bool(forKey: "talk.enabled")
|
||||
self.talkMode.setEnabled(talkEnabled)
|
||||
|
||||
// Wire up deep links from canvas taps
|
||||
self.screen.onDeepLink = { [weak self] url in
|
||||
@@ -177,6 +181,10 @@ final class NodeAppModel {
|
||||
self.voiceWake.setEnabled(enabled)
|
||||
}
|
||||
|
||||
/// Forwards the Talk Mode toggle to `talkMode`.
func setTalkEnabled(_ isOn: Bool) {
    self.talkMode.setEnabled(isOn)
}
|
||||
|
||||
func connectToBridge(
|
||||
endpoint: NWEndpoint,
|
||||
hello: BridgeHello)
|
||||
|
||||
@@ -20,6 +20,7 @@ struct SettingsTab: View {
|
||||
@AppStorage("node.displayName") private var displayName: String = "iOS Node"
|
||||
@AppStorage("node.instanceId") private var instanceId: String = UUID().uuidString
|
||||
@AppStorage("voiceWake.enabled") private var voiceWakeEnabled: Bool = false
|
||||
@AppStorage("talk.enabled") private var talkEnabled: Bool = false
|
||||
@AppStorage("camera.enabled") private var cameraEnabled: Bool = true
|
||||
@AppStorage("screen.preventSleep") private var preventSleep: Bool = true
|
||||
@AppStorage("bridge.preferredStableID") private var preferredBridgeStableID: String = ""
|
||||
@@ -156,6 +157,10 @@ struct SettingsTab: View {
|
||||
.onChange(of: self.voiceWakeEnabled) { _, newValue in
|
||||
self.appModel.setVoiceWakeEnabled(newValue)
|
||||
}
|
||||
Toggle("Talk Mode", isOn: self.$talkEnabled)
|
||||
.onChange(of: self.talkEnabled) { _, newValue in
|
||||
self.appModel.setTalkEnabled(newValue)
|
||||
}
|
||||
|
||||
NavigationLink {
|
||||
VoiceWakeWordsSettingsView()
|
||||
|
||||
518
apps/ios/Sources/Voice/TalkModeManager.swift
Normal file
518
apps/ios/Sources/Voice/TalkModeManager.swift
Normal file
@@ -0,0 +1,518 @@
|
||||
import AVFAudio
|
||||
import ClawdisKit
|
||||
import Foundation
|
||||
import Observation
|
||||
import Speech
|
||||
|
||||
@MainActor
|
||||
@Observable
|
||||
final class TalkModeManager: NSObject {
|
||||
var isEnabled: Bool = false
|
||||
var isListening: Bool = false
|
||||
var isSpeaking: Bool = false
|
||||
var statusText: String = "Off"
|
||||
|
||||
private let audioEngine = AVAudioEngine()
|
||||
private var speechRecognizer: SFSpeechRecognizer?
|
||||
private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest?
|
||||
private var recognitionTask: SFSpeechRecognitionTask?
|
||||
private var silenceTask: Task<Void, Never>?
|
||||
|
||||
private var lastHeard: Date?
|
||||
private var lastTranscript: String = ""
|
||||
private var lastSpokenText: String?
|
||||
private var lastInterruptedAtSeconds: Double?
|
||||
|
||||
private var defaultVoiceId: String?
|
||||
private var currentVoiceId: String?
|
||||
private var defaultModelId: String?
|
||||
private var currentModelId: String?
|
||||
private var defaultOutputFormat: String?
|
||||
private var interruptOnSpeech: Bool = true
|
||||
|
||||
private var bridge: BridgeSession?
|
||||
private let silenceWindow: TimeInterval = 0.7
|
||||
|
||||
private var player: AVAudioPlayer?
|
||||
|
||||
/// Stores the bridge session used for subsequent config and chat requests.
func attachBridge(_ session: BridgeSession) {
    self.bridge = session
}
|
||||
|
||||
/// Records the enabled flag, then either starts listening asynchronously or stops everything.
func setEnabled(_ enabled: Bool) {
    self.isEnabled = enabled
    guard enabled else {
        self.stop()
        return
    }
    Task { await self.start() }
}
|
||||
|
||||
/// Begins a listening pass: requests microphone and speech permissions, reloads config,
/// configures the audio session, starts recognition and the silence monitor.
///
/// Does nothing when talk mode is disabled or a pass is already listening. Any failure is
/// reported through `statusText` rather than thrown.
func start() async {
    guard self.isEnabled, !self.isListening else { return }

    self.statusText = "Requesting permissions…"
    guard await Self.requestMicrophonePermission() else {
        self.statusText = "Microphone permission denied"
        return
    }
    guard await Self.requestSpeechPermission() else {
        self.statusText = "Speech recognition permission denied"
        return
    }

    await self.reloadConfig()
    do {
        try Self.configureAudioSession()
        try self.startRecognition()
        self.isListening = true
        self.statusText = "Listening"
        self.startSilenceMonitor()
    } catch {
        self.isListening = false
        self.statusText = "Start failed: \(error.localizedDescription)"
    }
}
|
||||
|
||||
/// Turns talk mode off: cancels the silence monitor, tears down recognition and playback,
/// and resets the published state to "Off".
func stop() {
    // Published state first, so observers see "Off" before teardown runs.
    self.isEnabled = false
    self.isListening = false
    self.statusText = "Off"

    // Drop any partially heard utterance.
    self.lastTranscript = ""
    self.lastHeard = nil

    if let monitor = self.silenceTask {
        monitor.cancel()
    }
    self.silenceTask = nil

    self.stopRecognition()
    self.stopSpeaking()
}
|
||||
|
||||
/// Creates the speech recognizer and request, taps the audio engine's input into the request,
/// starts the engine, and begins a recognition task that feeds `handleTranscript`.
///
/// - Throws: an `NSError` in domain `TalkMode` when no recognizer can be created, or the
///   error raised by `AVAudioEngine.start()`. On an engine failure the tap, request and
///   recognizer are released again so a later attempt starts clean.
private func startRecognition() throws {
    guard let recognizer = SFSpeechRecognizer() else {
        self.speechRecognizer = nil
        throw NSError(domain: "TalkMode", code: 1, userInfo: [
            NSLocalizedDescriptionKey: "Speech recognizer unavailable",
        ])
    }
    self.speechRecognizer = recognizer

    let request = SFSpeechAudioBufferRecognitionRequest()
    request.shouldReportPartialResults = true
    self.recognitionRequest = request

    let input = self.audioEngine.inputNode
    let format = input.outputFormat(forBus: 0)
    // Remove any tap left over from a previous pass before installing a new one.
    input.removeTap(onBus: 0)
    input.installTap(onBus: 0, bufferSize: 2048, format: format) { [weak request] buffer, _ in
        request?.append(buffer)
    }

    self.audioEngine.prepare()
    do {
        try self.audioEngine.start()
    } catch {
        input.removeTap(onBus: 0)
        self.recognitionRequest = nil
        self.speechRecognizer = nil
        throw error
    }

    self.recognitionTask = recognizer.recognitionTask(with: request) { [weak self] result, error in
        // The callback is not guaranteed to run on the main actor: copy out plain values
        // here and apply all state changes from a main-actor task.
        let errorText = error?.localizedDescription
        let transcript = result?.bestTranscription.formattedString
        let isFinal = result?.isFinal ?? false
        Task { @MainActor [weak self] in
            guard let self else { return }
            if let errorText {
                self.statusText = "Speech error: \(errorText)"
            }
            guard let transcript else { return }
            await self.handleTranscript(transcript: transcript, isFinal: isFinal)
        }
    }
}
|
||||
|
||||
/// Cancels the recognition task, ends the audio request, removes the input tap,
/// stops the audio engine and releases the recognizer.
private func stopRecognition() {
    if let task = self.recognitionTask {
        task.cancel()
    }
    self.recognitionTask = nil

    if let request = self.recognitionRequest {
        request.endAudio()
    }
    self.recognitionRequest = nil

    let engine = self.audioEngine
    engine.inputNode.removeTap(onBus: 0)
    engine.stop()

    self.speechRecognizer = nil
}
|
||||
|
||||
/// Processes one recognition update.
///
/// While the assistant is speaking (and interrupt-on-speech is on) the transcript is used only
/// to decide whether to stop playback. Otherwise, during a listening pass, a non-empty
/// transcript becomes the pending utterance and refreshes the last-heard timestamp; an empty
/// final result clears the pending utterance.
private func handleTranscript(transcript: String, isFinal: Bool) async {
    let text = transcript.trimmingCharacters(in: .whitespacesAndNewlines)

    if self.isSpeaking && self.interruptOnSpeech {
        if self.shouldInterrupt(with: text) {
            self.stopSpeaking()
        }
        return
    }

    guard self.isListening else { return }

    if text.isEmpty {
        if isFinal {
            self.lastTranscript = text
        }
        return
    }
    self.lastTranscript = text
    self.lastHeard = Date()
}
|
||||
|
||||
/// Replaces the silence monitor with a fresh task that polls `checkSilence()` every 200 ms
/// while talk mode stays enabled.
///
/// The loop ends as soon as its task is cancelled. This matters because the monitor cancels
/// itself when `checkSilence()` leads back into `start()`: a cancelled `Task.sleep` returns
/// immediately, so ignoring cancellation would leave the old loop polling with no delay.
private func startSilenceMonitor() {
    self.silenceTask?.cancel()
    self.silenceTask = Task { [weak self] in
        while !Task.isCancelled {
            do {
                try await Task.sleep(nanoseconds: 200_000_000)
            } catch {
                // Sleep only throws on cancellation; a newer monitor (or stop()) took over.
                return
            }
            // Re-bind after sleeping so the manager is not retained across the wait.
            guard let self, self.isEnabled else { return }
            await self.checkSilence()
        }
    }
}
|
||||
|
||||
/// Finalises the pending utterance once nothing new has been heard for `silenceWindow` seconds.
/// Does nothing when not listening or when there is no pending, non-empty transcript.
private func checkSilence() async {
    guard self.isListening else { return }
    let pending = self.lastTranscript.trimmingCharacters(in: .whitespacesAndNewlines)
    guard !pending.isEmpty,
          let heardAt = self.lastHeard,
          Date().timeIntervalSince(heardAt) >= self.silenceWindow
    else { return }
    await self.finalizeTranscript(pending)
}
|
||||
|
||||
/// Sends a completed utterance to the assistant, speaks the reply, then resumes listening.
///
/// Recognition is stopped for the duration of the turn. Whatever the outcome (no bridge,
/// no reply, request failure, or a spoken answer), the method finishes by calling `start()`.
private func finalizeTranscript(_ transcript: String) async {
    self.isListening = false
    self.statusText = "Thinking…"
    self.lastTranscript = ""
    self.lastHeard = nil
    self.stopRecognition()

    await self.reloadConfig()
    // Built before the bridge check because it also consumes the pending interruption marker.
    let prompt = self.buildPrompt(transcript: transcript)

    if let bridge = self.bridge {
        do {
            let runId = try await self.sendChat(prompt, bridge: bridge)
            if await self.waitForChatFinal(runId: runId, bridge: bridge),
               let reply = try await self.fetchLatestAssistantText(bridge: bridge)
            {
                await self.playAssistant(text: reply)
            } else {
                self.statusText = "No reply"
            }
        } catch {
            self.statusText = "Talk failed: \(error.localizedDescription)"
        }
    } else {
        self.statusText = "Bridge not connected"
    }

    await self.start()
}
|
||||
|
||||
/// Builds the chat message for one utterance: fixed Talk Mode instructions, an optional note
/// about where the previous reply was interrupted, a blank line, then the transcript.
///
/// Consumes `lastInterruptedAtSeconds`: it is reset to `nil` once included.
private func buildPrompt(transcript: String) -> String {
    var header: [String] = [
        "Talk Mode active. Reply in a concise, spoken tone.",
        "You may optionally prefix the response with JSON (first line) to set ElevenLabs voice, e.g. {\"voice\":\"<id>\",\"once\":true}.",
    ]

    if let seconds = self.lastInterruptedAtSeconds {
        let formatted = String(format: "%.1f", seconds)
        header.append("Assistant speech interrupted at \(formatted)s.")
        self.lastInterruptedAtSeconds = nil
    }

    return (header + ["", transcript]).joined(separator: "\n")
}
|
||||
|
||||
/// Sends `message` to the `main` session via `chat.send` and returns the run id from the response.
///
/// - Throws: serialisation, request, or decoding errors.
private func sendChat(_ message: String, bridge: BridgeSession) async throws -> String {
    struct SendResponse: Decodable { let runId: String }

    let params = try JSONSerialization.data(withJSONObject: [
        "sessionKey": "main",
        "message": message,
        "thinking": "low",
        "timeoutMs": 30_000,
        // Fresh key per call.
        "idempotencyKey": UUID().uuidString,
    ] as [String: Any])

    let response = try await bridge.request(
        method: "chat.send",
        paramsJSON: String(decoding: params, as: UTF8.self),
        timeoutSeconds: 30)
    return try JSONDecoder().decode(SendResponse.self, from: response).runId
}
|
||||
|
||||
/// Waits for a `chat` server event whose `runId` matches and whose `state` is `final`.
///
/// - Returns: `true` when such an event arrives; `false` when the stream ends or an event
///   arrives after the 120-second deadline.
private func waitForChatFinal(runId: String, bridge: BridgeSession) async -> Bool {
    let events = await bridge.subscribeServerEvents(bufferingNewest: 200)
    let deadline = Date().addingTimeInterval(120)

    for await event in events {
        // NOTE(review): the deadline is only evaluated when an event arrives, so a silent
        // stream is not bounded by it.
        if Date() > deadline { return false }

        guard event.event == "chat",
              let data = event.payloadJSON?.data(using: .utf8),
              let object = try? JSONSerialization.jsonObject(with: data) as? [String: Any],
              (object["runId"] as? String) == runId,
              (object["state"] as? String) == "final"
        else { continue }
        return true
    }
    return false
}
|
||||
|
||||
/// Fetches `chat.history` for the `main` session and returns the text of the most recent
/// assistant message that has non-empty text, joining its text parts with newlines.
///
/// - Returns: `nil` when the response has an unexpected shape or no such message exists.
private func fetchLatestAssistantText(bridge: BridgeSession) async throws -> String? {
    let response = try await bridge.request(
        method: "chat.history",
        paramsJSON: "{\"sessionKey\":\"main\"}",
        timeoutSeconds: 15)
    guard let root = try JSONSerialization.jsonObject(with: response) as? [String: Any],
          let messages = root["messages"] as? [[String: Any]]
    else { return nil }

    // Walk newest-first and stop at the first assistant message with actual text.
    for message in messages.reversed() where (message["role"] as? String) == "assistant" {
        guard let parts = message["content"] as? [[String: Any]] else { continue }
        let joined = parts
            .compactMap { $0["text"] as? String }
            .joined(separator: "\n")
            .trimmingCharacters(in: .whitespacesAndNewlines)
        if !joined.isEmpty { return joined }
    }
    return nil
}
|
||||
|
||||
/// Speaks an assistant reply: parses an optional leading directive, resolves voice and model,
/// synthesises audio through ElevenLabs and plays it.
///
/// A directive's voice or model becomes the new current value unless the directive is marked
/// `once`. Missing voice id or API key, and synthesis or playback failures, are reported
/// through `statusText`.
private func playAssistant(text: String) async {
    let parsed = TalkDirectiveParser.parse(text)
    let directive = parsed.directive
    let spoken = parsed.stripped.trimmingCharacters(in: .whitespacesAndNewlines)
    if spoken.isEmpty { return }

    // Persist directive overrides for later replies unless they apply to this reply only.
    if directive?.once != true {
        if let voice = directive?.voiceId {
            self.currentVoiceId = voice
        }
        if let model = directive?.modelId {
            self.currentModelId = model
        }
    }

    guard let voiceId = directive?.voiceId ?? self.currentVoiceId ?? self.defaultVoiceId,
          !voiceId.isEmpty
    else {
        self.statusText = "Missing voice ID"
        return
    }

    guard let apiKey = ProcessInfo.processInfo.environment["ELEVENLABS_API_KEY"],
          !apiKey.isEmpty
    else {
        self.statusText = "Missing ELEVENLABS_API_KEY"
        return
    }

    self.statusText = "Speaking…"
    self.isSpeaking = true
    self.lastSpokenText = spoken

    let request = ElevenLabsRequest(
        text: spoken,
        modelId: directive?.modelId ?? self.currentModelId ?? self.defaultModelId,
        outputFormat: directive?.outputFormat ?? self.defaultOutputFormat,
        speed: TalkModeRuntime.resolveSpeed(
            speed: directive?.speed,
            rateWPM: directive?.rateWPM),
        stability: TalkModeRuntime.validatedUnit(directive?.stability),
        similarity: TalkModeRuntime.validatedUnit(directive?.similarity),
        style: TalkModeRuntime.validatedUnit(directive?.style),
        speakerBoost: directive?.speakerBoost,
        seed: TalkModeRuntime.validatedSeed(directive?.seed),
        normalize: TalkModeRuntime.validatedNormalize(directive?.normalize),
        language: TalkModeRuntime.validatedLanguage(directive?.language))

    do {
        let client = ElevenLabsClient(apiKey: apiKey)
        let audio = try await client.synthesize(voiceId: voiceId, request: request)
        try await self.playAudio(data: audio)
    } catch {
        self.statusText = "Speak failed: \(error.localizedDescription)"
    }

    self.isSpeaking = false
}
|
||||
|
||||
/// Plays `data` with a new `AVAudioPlayer`, stopping any previous player first, and returns
/// once the player reports it is no longer playing (polled every 120 ms).
///
/// - Throws: the error from `AVAudioPlayer(data:)` when the data cannot be loaded.
private func playAudio(data: Data) async throws {
    self.player?.stop()

    let fresh = try AVAudioPlayer(data: data)
    self.player = fresh
    fresh.prepareToPlay()
    fresh.play()

    // Poll rather than use a delegate; stopSpeaking() ends the wait by stopping the player.
    while fresh.isPlaying {
        try? await Task.sleep(nanoseconds: 120_000_000)
    }
}
|
||||
|
||||
/// Stops playback if the assistant is speaking and records the playback position at which it
/// was cut off in `lastInterruptedAtSeconds`.
private func stopSpeaking() {
    if !self.isSpeaking { return }

    let active = self.player
    // Capture the position before stopping.
    self.lastInterruptedAtSeconds = active?.currentTime
    active?.stop()

    self.player = nil
    self.isSpeaking = false
}
|
||||
|
||||
/// Decides whether a transcript heard during playback should stop the current speech.
///
/// Returns `false` for transcripts shorter than three characters after trimming and for
/// transcripts contained (case-insensitively) in `lastSpokenText`; `true` otherwise.
private func shouldInterrupt(with transcript: String) -> Bool {
    let heard = transcript.trimmingCharacters(in: .whitespacesAndNewlines)
    if heard.count < 3 { return false }
    guard let lastSpoken = self.lastSpokenText?.lowercased() else { return true }
    return !lastSpoken.contains(heard.lowercased())
}
|
||||
|
||||
/// Refreshes the talk defaults (voice, model, output format, interrupt flag) from the bridge's
/// `config.get` response.
///
/// Does nothing without a bridge, and leaves state untouched when the request fails or the
/// response has no `config` object. Blank strings are treated as unset. The current voice and
/// model follow the new defaults only while they are unset or still equal to the previous
/// default, so a value persisted by a directive survives reloads.
private func reloadConfig() async {
    guard let bridge else { return }
    do {
        let res = try await bridge.request(method: "config.get", paramsJSON: "{}", timeoutSeconds: 8)
        guard let json = try JSONSerialization.jsonObject(with: res) as? [String: Any],
              let config = json["config"] as? [String: Any]
        else { return }
        let talk = config["talk"] as? [String: Any]

        // Reads a trimmed string entry from the `talk` section; blank counts as missing.
        func text(_ key: String) -> String? {
            guard let raw = talk?[key] as? String else { return nil }
            let trimmed = raw.trimmingCharacters(in: .whitespacesAndNewlines)
            return trimmed.isEmpty ? nil : trimmed
        }

        let previousVoice = self.defaultVoiceId
        self.defaultVoiceId = text("voiceId")
        if self.currentVoiceId == nil || self.currentVoiceId == previousVoice {
            self.currentVoiceId = self.defaultVoiceId
        }

        let previousModel = self.defaultModelId
        self.defaultModelId = text("modelId")
        if self.currentModelId == nil || self.currentModelId == previousModel {
            self.currentModelId = self.defaultModelId
        }

        self.defaultOutputFormat = text("outputFormat")
        if let interrupt = talk?["interruptOnSpeech"] as? Bool {
            self.interruptOnSpeech = interrupt
        }
    } catch {
        // Best effort: keep the previously loaded configuration.
    }
}
|
||||
|
||||
/// Configures and activates the shared audio session for simultaneous playback and recording
/// (`.playAndRecord`, `.measurement` mode).
///
/// - Throws: any error from setting the category or activating the session.
private static func configureAudioSession() throws {
    let options: AVAudioSession.CategoryOptions = [
        .duckOthers,
        .mixWithOthers,
        .allowBluetoothHFP,
        .defaultToSpeaker,
    ]
    let shared = AVAudioSession.sharedInstance()
    try shared.setCategory(.playAndRecord, mode: .measurement, options: options)
    try shared.setActive(true, options: [])
}
|
||||
|
||||
/// Asks for record permission and returns whether it was granted.
private nonisolated static func requestMicrophonePermission() async -> Bool {
    await withCheckedContinuation(isolation: nil) { continuation in
        AVAudioApplication.requestRecordPermission { granted in
            continuation.resume(returning: granted)
        }
    }
}
|
||||
|
||||
/// Asks for speech-recognition authorization and returns `true` only for `.authorized`.
private nonisolated static func requestSpeechPermission() async -> Bool {
    await withCheckedContinuation(isolation: nil) { continuation in
        SFSpeechRecognizer.requestAuthorization { status in
            let authorized = status == .authorized
            continuation.resume(returning: authorized)
        }
    }
}
|
||||
}
|
||||
|
||||
/// Parameters for one text-to-speech call. Only `text` is required; a `nil` field is left out
/// of the request body.
private struct ElevenLabsRequest {
    /// Text to synthesise (`text`).
    let text: String
    /// Sent as `model_id`.
    let modelId: String?
    /// Sent as `output_format`.
    let outputFormat: String?
    /// Sent as `voice_settings.speed`.
    let speed: Double?
    /// Sent as `voice_settings.stability`.
    let stability: Double?
    /// Sent as `voice_settings.similarity_boost`.
    let similarity: Double?
    /// Sent as `voice_settings.style`.
    let style: Double?
    /// Sent as `voice_settings.use_speaker_boost`.
    let speakerBoost: Bool?
    /// Sent as `seed`.
    let seed: UInt32?
    /// Sent as `apply_text_normalization`.
    let normalize: String?
    /// Sent as `language_code`.
    let language: String?
}
|
||||
|
||||
/// Minimal ElevenLabs text-to-speech client.
private struct ElevenLabsClient {
    /// Value sent in the `xi-api-key` header.
    let apiKey: String
    let baseUrl = URL(string: "https://api.elevenlabs.io")!

    /// Posts `request` to `/v1/text-to-speech/<voiceId>` and returns the response body.
    ///
    /// - Throws: an `NSError` in domain `TalkTTS` (code = HTTP status) for statuses of 400 or
    ///   above, or any serialisation / transport error.
    func synthesize(voiceId: String, request: ElevenLabsRequest) async throws -> Data {
        let endpoint = self.baseUrl
            .appendingPathComponent("v1")
            .appendingPathComponent("text-to-speech")
            .appendingPathComponent(voiceId)

        // Voice tuning values travel in a nested object that is omitted when empty.
        var voiceSettings: [String: Any] = [:]
        if let speed = request.speed { voiceSettings["speed"] = speed }
        if let stability = request.stability { voiceSettings["stability"] = stability }
        if let similarity = request.similarity { voiceSettings["similarity_boost"] = similarity }
        if let style = request.style { voiceSettings["style"] = style }
        if let speakerBoost = request.speakerBoost { voiceSettings["use_speaker_boost"] = speakerBoost }

        var payload: [String: Any] = ["text": request.text]
        if let modelId = request.modelId, !modelId.isEmpty { payload["model_id"] = modelId }
        if let outputFormat = request.outputFormat, !outputFormat.isEmpty {
            payload["output_format"] = outputFormat
        }
        if let seed = request.seed { payload["seed"] = seed }
        if let normalize = request.normalize { payload["apply_text_normalization"] = normalize }
        if let language = request.language { payload["language_code"] = language }
        if !voiceSettings.isEmpty { payload["voice_settings"] = voiceSettings }

        var httpRequest = URLRequest(url: endpoint)
        httpRequest.httpMethod = "POST"
        httpRequest.httpBody = try JSONSerialization.data(withJSONObject: payload, options: [])
        httpRequest.setValue("application/json", forHTTPHeaderField: "Content-Type")
        httpRequest.setValue("audio/mpeg", forHTTPHeaderField: "Accept")
        httpRequest.setValue(self.apiKey, forHTTPHeaderField: "xi-api-key")

        let (data, response) = try await URLSession.shared.data(for: httpRequest)
        guard let http = response as? HTTPURLResponse, http.statusCode >= 400 else {
            return data
        }
        let message = String(data: data, encoding: .utf8) ?? "unknown"
        throw NSError(domain: "TalkTTS", code: http.statusCode, userInfo: [
            NSLocalizedDescriptionKey: "ElevenLabs failed: \(http.statusCode) \(message)",
        ])
    }
}
|
||||
|
||||
/// Validation helpers for directive values before they are placed in a synthesis request.
///
/// Every helper returns `nil` for missing or out-of-range input, including NaN, so invalid
/// numbers never reach JSON serialisation.
private enum TalkModeRuntime {
    /// Resolves the speed multiplier. A positive `rateWPM` takes precedence over `speed` and is
    /// converted by dividing by 175. The result must lie strictly between 0.5 and 2.0;
    /// anything else (including NaN) yields `nil`, without falling back to `speed`.
    static func resolveSpeed(speed: Double?, rateWPM: Int?) -> Double? {
        let candidate: Double?
        if let rateWPM, rateWPM > 0 {
            candidate = Double(rateWPM) / 175.0
        } else {
            candidate = speed
        }
        // Positive comparisons reject NaN, for which every comparison is false.
        guard let candidate, candidate > 0.5, candidate < 2.0 else { return nil }
        return candidate
    }

    /// Returns `value` when it lies in `0...1` (inclusive); `nil` otherwise, including NaN.
    static func validatedUnit(_ value: Double?) -> Double? {
        guard let value, (0.0...1.0).contains(value) else { return nil }
        return value
    }

    /// Returns `value` as `UInt32` when it is representable; `nil` otherwise.
    static func validatedSeed(_ value: Int?) -> UInt32? {
        guard let value else { return nil }
        return UInt32(exactly: value)
    }

    /// Returns the trimmed, lower-cased value when it is `auto`, `on` or `off`; `nil` otherwise.
    static func validatedNormalize(_ value: String?) -> String? {
        guard let value else { return nil }
        let normalized = value.trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
        return ["auto", "on", "off"].contains(normalized) ? normalized : nil
    }

    /// Returns the trimmed, lower-cased value when it is exactly two letters `a`-`z`; `nil` otherwise.
    static func validatedLanguage(_ value: String?) -> String? {
        guard let value else { return nil }
        let normalized = value.trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
        guard normalized.count == 2, normalized.allSatisfy({ $0 >= "a" && $0 <= "z" }) else { return nil }
        return normalized
    }
}
|
||||
@@ -4,6 +4,7 @@ struct VoiceTab: View {
|
||||
@Environment(NodeAppModel.self) private var appModel
|
||||
@Environment(VoiceWakeManager.self) private var voiceWake
|
||||
@AppStorage("voiceWake.enabled") private var voiceWakeEnabled: Bool = false
|
||||
@AppStorage("talk.enabled") private var talkEnabled: Bool = false
|
||||
|
||||
var body: some View {
|
||||
NavigationStack {
|
||||
@@ -14,6 +15,7 @@ struct VoiceTab: View {
|
||||
Text(self.voiceWake.statusText)
|
||||
.font(.footnote)
|
||||
.foregroundStyle(.secondary)
|
||||
LabeledContent("Talk Mode", value: self.talkEnabled ? "Enabled" : "Disabled")
|
||||
}
|
||||
|
||||
Section("Notes") {
|
||||
@@ -36,6 +38,9 @@ struct VoiceTab: View {
|
||||
.onChange(of: self.voiceWakeEnabled) { _, newValue in
|
||||
self.appModel.setVoiceWakeEnabled(newValue)
|
||||
}
|
||||
.onChange(of: self.talkEnabled) { _, newValue in
|
||||
self.appModel.setTalkEnabled(newValue)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -121,6 +121,15 @@ final class AppState {
|
||||
forKey: voicePushToTalkEnabledKey) } }
|
||||
}
|
||||
|
||||
/// Whether Talk Mode is on. Outside previews, each change is written to user defaults and
/// pushed to `TalkModeController.shared`.
var talkEnabled: Bool {
    didSet {
        self.ifNotPreview {
            let defaults = UserDefaults.standard
            defaults.set(self.talkEnabled, forKey: talkEnabledKey)
            Task {
                await TalkModeController.shared.setEnabled(self.talkEnabled)
            }
        }
    }
}
|
||||
|
||||
var iconOverride: IconOverrideSelection {
|
||||
didSet { self.ifNotPreview { UserDefaults.standard.set(self.iconOverride.rawValue, forKey: iconOverrideKey) } }
|
||||
}
|
||||
@@ -216,6 +225,7 @@ final class AppState {
|
||||
.stringArray(forKey: voiceWakeAdditionalLocalesKey) ?? []
|
||||
self.voicePushToTalkEnabled = UserDefaults.standard
|
||||
.object(forKey: voicePushToTalkEnabledKey) as? Bool ?? false
|
||||
self.talkEnabled = UserDefaults.standard.bool(forKey: talkEnabledKey)
|
||||
if let storedHeartbeats = UserDefaults.standard.object(forKey: heartbeatsEnabledKey) as? Bool {
|
||||
self.heartbeatsEnabled = storedHeartbeats
|
||||
} else {
|
||||
@@ -256,9 +266,13 @@ final class AppState {
|
||||
if self.swabbleEnabled, !PermissionManager.voiceWakePermissionsGranted() {
|
||||
self.swabbleEnabled = false
|
||||
}
|
||||
if self.talkEnabled, !PermissionManager.voiceWakePermissionsGranted() {
|
||||
self.talkEnabled = false
|
||||
}
|
||||
|
||||
if !self.isPreview {
|
||||
Task { await VoiceWakeRuntime.shared.refresh(state: self) }
|
||||
Task { await TalkModeController.shared.setEnabled(self.talkEnabled) }
|
||||
}
|
||||
}
|
||||
|
||||
@@ -312,6 +326,23 @@ final class AppState {
|
||||
Task { await VoiceWakeRuntime.shared.refresh(state: self) }
|
||||
}
|
||||
|
||||
/// Turns Talk Mode on or off, requesting speech/microphone permissions when needed.
///
/// `talkEnabled` is only set to `true` once permissions are known to be granted.
/// Assigning `talkEnabled` forwards to `TalkModeController`, and the talk runtime
/// ignores a repeated `setEnabled` with an unchanged value while refusing to start
/// without permissions. Enabling before the permission prompt would therefore leave
/// the runtime "enabled" but never listening after the user grants access.
///
/// - Parameter enabled: Desired Talk Mode state.
func setTalkEnabled(_ enabled: Bool) async {
    guard voiceWakeSupported else {
        self.talkEnabled = false
        return
    }

    // Disabling (and any change inside previews) needs no permission handling.
    guard enabled, !self.isPreview else {
        self.talkEnabled = enabled
        return
    }

    if !PermissionManager.voiceWakePermissionsGranted() {
        let granted = await PermissionManager.ensureVoiceWakePermissions(interactive: true)
        guard granted else {
            self.talkEnabled = false
            return
        }
    }

    self.talkEnabled = true
}
|
||||
|
||||
// MARK: - Global wake words sync (Gateway-owned)
|
||||
|
||||
func applyGlobalVoiceWakeTriggers(_ triggers: [String]) {
|
||||
@@ -367,6 +398,7 @@ extension AppState {
|
||||
state.voiceWakeLocaleID = Locale.current.identifier
|
||||
state.voiceWakeAdditionalLocaleIDs = ["en-US", "de-DE"]
|
||||
state.voicePushToTalkEnabled = false
|
||||
state.talkEnabled = false
|
||||
state.iconOverride = .system
|
||||
state.heartbeatsEnabled = true
|
||||
state.connectionMode = .local
|
||||
|
||||
@@ -30,6 +30,10 @@ struct ConfigSettings: View {
|
||||
@State private var browserColorHex: String = "#FF4500"
|
||||
@State private var browserAttachOnly: Bool = false
|
||||
|
||||
// Talk mode settings (stored in ~/.clawdis/clawdis.json under "talk")
|
||||
@State private var talkVoiceId: String = ""
|
||||
@State private var talkInterruptOnSpeech: Bool = true
|
||||
|
||||
var body: some View {
|
||||
ScrollView { self.content }
|
||||
.onChange(of: self.modelCatalogPath) { _, _ in
|
||||
@@ -53,6 +57,7 @@ struct ConfigSettings: View {
|
||||
self.header
|
||||
self.agentSection
|
||||
self.heartbeatSection
|
||||
self.talkSection
|
||||
self.browserSection
|
||||
Spacer(minLength: 0)
|
||||
}
|
||||
@@ -266,6 +271,37 @@ struct ConfigSettings: View {
|
||||
.frame(maxWidth: .infinity, alignment: .leading)
|
||||
}
|
||||
|
||||
/// Settings group for Talk Mode: the voice ID field (with suggestions) and the
/// "interrupt on speech" checkbox. Both controls trigger `autosaveConfig()` on change.
private var talkSection: some View {
    GroupBox("Talk Mode") {
        Grid(alignment: .leadingFirstTextBaseline, horizontalSpacing: 14, verticalSpacing: 10) {
            // Voice selection row.
            GridRow {
                self.gridLabel("Voice ID")
                VStack(alignment: .leading, spacing: 6) {
                    ComboBox("ElevenLabs voice ID", text: self.$talkVoiceId) {
                        ForEach(self.talkVoiceSuggestions, id: \.self) { suggestion in
                            Text(suggestion).tag(suggestion)
                        }
                    }
                    .textFieldStyle(.roundedBorder)
                    .frame(maxWidth: .infinity)
                    .onChange(of: self.talkVoiceId) { _, _ in self.autosaveConfig() }

                    Text("Defaults to ELEVENLABS_VOICE_ID / SAG_VOICE_ID if unset.")
                        .font(.footnote)
                        .foregroundStyle(.secondary)
                }
            }

            // Interrupt-on-speech row.
            GridRow {
                self.gridLabel("Interrupt")
                Toggle("Stop speaking when you start talking", isOn: self.$talkInterruptOnSpeech)
                    .labelsHidden()
                    .toggleStyle(.checkbox)
                    .onChange(of: self.talkInterruptOnSpeech) { _, _ in self.autosaveConfig() }
            }
        }
    }
    .frame(maxWidth: .infinity, alignment: .leading)
}
|
||||
|
||||
private func gridLabel(_ text: String) -> some View {
|
||||
Text(text)
|
||||
.foregroundStyle(.secondary)
|
||||
@@ -278,6 +314,7 @@ struct ConfigSettings: View {
|
||||
let heartbeatMinutes = agent?["heartbeatMinutes"] as? Int
|
||||
let heartbeatBody = agent?["heartbeatBody"] as? String
|
||||
let browser = parsed["browser"] as? [String: Any]
|
||||
let talk = parsed["talk"] as? [String: Any]
|
||||
|
||||
let loadedModel = (agent?["model"] as? String) ?? ""
|
||||
if !loadedModel.isEmpty {
|
||||
@@ -297,6 +334,13 @@ struct ConfigSettings: View {
|
||||
if let color = browser["color"] as? String, !color.isEmpty { self.browserColorHex = color }
|
||||
if let attachOnly = browser["attachOnly"] as? Bool { self.browserAttachOnly = attachOnly }
|
||||
}
|
||||
|
||||
if let talk {
|
||||
if let voice = talk["voiceId"] as? String { self.talkVoiceId = voice }
|
||||
if let interrupt = talk["interruptOnSpeech"] as? Bool {
|
||||
self.talkInterruptOnSpeech = interrupt
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private func autosaveConfig() {
|
||||
@@ -312,6 +356,7 @@ struct ConfigSettings: View {
|
||||
var root = self.loadConfigDict()
|
||||
var agent = root["agent"] as? [String: Any] ?? [:]
|
||||
var browser = root["browser"] as? [String: Any] ?? [:]
|
||||
var talk = root["talk"] as? [String: Any] ?? [:]
|
||||
|
||||
let chosenModel = (self.configModel == "__custom__" ? self.customModel : self.configModel)
|
||||
.trimmingCharacters(in: .whitespacesAndNewlines)
|
||||
@@ -337,6 +382,15 @@ struct ConfigSettings: View {
|
||||
browser["attachOnly"] = self.browserAttachOnly
|
||||
root["browser"] = browser
|
||||
|
||||
let trimmedVoice = self.talkVoiceId.trimmingCharacters(in: .whitespacesAndNewlines)
|
||||
if trimmedVoice.isEmpty {
|
||||
talk.removeValue(forKey: "voiceId")
|
||||
} else {
|
||||
talk["voiceId"] = trimmedVoice
|
||||
}
|
||||
talk["interruptOnSpeech"] = self.talkInterruptOnSpeech
|
||||
root["talk"] = talk
|
||||
|
||||
ClawdisConfigFile.saveDict(root)
|
||||
}
|
||||
|
||||
@@ -354,6 +408,20 @@ struct ConfigSettings: View {
|
||||
return Color(red: r, green: g, blue: b)
|
||||
}
|
||||
|
||||
/// Voice ID suggestions for the combo box: the current field value followed by the
/// `ELEVENLABS_VOICE_ID` and `SAG_VOICE_ID` environment values, trimmed, with empty
/// entries and duplicates removed (first occurrence wins, order preserved).
private var talkVoiceSuggestions: [String] {
    let environment = ProcessInfo.processInfo.environment
    let rawCandidates = [
        self.talkVoiceId,
        environment["ELEVENLABS_VOICE_ID"] ?? "",
        environment["SAG_VOICE_ID"] ?? "",
    ]

    var suggestions: [String] = []
    for candidate in rawCandidates {
        let trimmed = candidate.trimmingCharacters(in: .whitespacesAndNewlines)
        if trimmed.isEmpty || suggestions.contains(trimmed) { continue }
        suggestions.append(trimmed)
    }
    return suggestions
}
|
||||
|
||||
private var browserPathLabel: String? {
|
||||
guard self.browserEnabled else { return nil }
|
||||
|
||||
|
||||
@@ -16,6 +16,7 @@ let voiceWakeMicKey = "clawdis.voiceWakeMicID"
|
||||
let voiceWakeLocaleKey = "clawdis.voiceWakeLocaleID"
|
||||
let voiceWakeAdditionalLocalesKey = "clawdis.voiceWakeAdditionalLocaleIDs"
|
||||
let voicePushToTalkEnabledKey = "clawdis.voicePushToTalkEnabled"
|
||||
let talkEnabledKey = "clawdis.talkEnabled"
|
||||
let iconOverrideKey = "clawdis.iconOverride"
|
||||
let connectionModeKey = "clawdis.connectionMode"
|
||||
let remoteTargetKey = "clawdis.remoteTarget"
|
||||
|
||||
@@ -72,6 +72,11 @@ struct MenuContent: View {
|
||||
if self.showVoiceWakeMicPicker {
|
||||
self.voiceWakeMicMenu
|
||||
}
|
||||
Toggle(isOn: self.talkBinding) {
|
||||
Label("Talk", systemImage: "bubble.left.and.waveform")
|
||||
}
|
||||
.disabled(!voiceWakeSupported)
|
||||
.opacity(voiceWakeSupported ? 1 : 0.5)
|
||||
Divider()
|
||||
Button {
|
||||
Task { @MainActor in
|
||||
@@ -331,6 +336,14 @@ struct MenuContent: View {
|
||||
})
|
||||
}
|
||||
|
||||
/// Binding for the "Talk" menu toggle. Reads `state.talkEnabled`; writes go through
/// the async `state.setTalkEnabled(_:)` in a new task.
private var talkBinding: Binding<Bool> {
    Binding<Bool>(
        get: { self.state.talkEnabled },
        set: { requested in
            Task {
                await self.state.setTalkEnabled(requested)
            }
        })
}
|
||||
|
||||
/// The microphone picker is shown only when voice wake is supported and
/// `state.swabbleEnabled` is on.
private var showVoiceWakeMicPicker: Bool {
    guard voiceWakeSupported else { return false }
    return self.state.swabbleEnabled
}
|
||||
|
||||
54
apps/macos/Sources/Clawdis/TalkAudioPlayer.swift
Normal file
54
apps/macos/Sources/Clawdis/TalkAudioPlayer.swift
Normal file
@@ -0,0 +1,54 @@
|
||||
import AVFoundation
|
||||
import Foundation
|
||||
import OSLog
|
||||
|
||||
/// Plays synthesized speech audio and reports how playback ended.
///
/// Only one playback is active at a time: starting a new one, or calling `stop()`,
/// ends the current one and resumes its pending `play(data:)` call.
@MainActor
final class TalkAudioPlayer: NSObject, AVAudioPlayerDelegate {
    static let shared = TalkAudioPlayer()

    private let logger = Logger(subsystem: "com.steipete.clawdis", category: "talk.tts")
    private var player: AVAudioPlayer?
    /// Resumed exactly once per playback by `stopInternal`.
    private var continuation: CheckedContinuation<TalkPlaybackResult, Never>?

    /// Plays `data` and suspends until playback finishes, fails, or is stopped.
    ///
    /// - Parameter data: Encoded audio understood by `AVAudioPlayer`.
    /// - Returns: `finished == true` when playback ran to the end; otherwise
    ///   `finished == false`, with `interruptedAt` set when `stop()` ended it.
    func play(data: Data) async -> TalkPlaybackResult {
        // Any previous playback is reported as interrupted.
        self.stopInternal(interrupted: true)
        do {
            let player = try AVAudioPlayer(data: data)
            self.player = player
            player.delegate = self
            player.prepareToPlay()
            return await withCheckedContinuation { continuation in
                // Register the continuation before starting so no completion can be missed.
                self.continuation = continuation
                // `play()` returns false when playback could not start; without this check
                // no delegate callback would arrive and the caller would wait forever.
                if !player.play() {
                    self.logger.error("talk audio player failed to start playback")
                    self.stopInternal(interrupted: true)
                }
            }
        } catch {
            self.logger.error("talk audio player failed: \(error.localizedDescription, privacy: .public)")
            return TalkPlaybackResult(finished: false, interruptedAt: nil)
        }
    }

    /// Stops the current playback, if any.
    ///
    /// - Returns: The playback position (seconds) at the time of stopping, or `nil`
    ///   when nothing was playing.
    func stop() -> Double? {
        guard let player else { return nil }
        let time = player.currentTime
        self.stopInternal(interrupted: true, interruptedAt: time)
        return time
    }

    func audioPlayerDidFinishPlaying(_ finishedPlayer: AVAudioPlayer, successfully flag: Bool) {
        // Ignore callbacks from a player that has already been replaced.
        guard finishedPlayer === self.player else { return }
        self.stopInternal(interrupted: !flag)
    }

    func audioPlayerDecodeErrorDidOccur(_ failedPlayer: AVAudioPlayer, error: Error?) {
        guard failedPlayer === self.player else { return }
        let description = error?.localizedDescription ?? "unknown"
        self.logger.error("talk audio decode failed: \(description, privacy: .public)")
        // Resume the waiting caller instead of leaving it suspended.
        self.stopInternal(interrupted: true)
    }

    /// Tears down the current player and resumes the pending continuation (if any).
    private func stopInternal(interrupted: Bool, interruptedAt: Double? = nil) {
        self.player?.stop()
        self.player = nil
        if let continuation {
            // Clear first so the continuation can never be resumed twice.
            self.continuation = nil
            continuation.resume(returning: TalkPlaybackResult(finished: !interrupted, interruptedAt: interruptedAt))
        }
    }
}

/// Outcome of a `TalkAudioPlayer.play(data:)` call.
struct TalkPlaybackResult: Sendable {
    /// `true` when playback ran to completion.
    let finished: Bool
    /// Playback position in seconds when stopped via `stop()`; `nil` otherwise.
    let interruptedAt: Double?
}
|
||||
42
apps/macos/Sources/Clawdis/TalkModeController.swift
Normal file
42
apps/macos/Sources/Clawdis/TalkModeController.swift
Normal file
@@ -0,0 +1,42 @@
|
||||
import Observation
|
||||
import OSLog
|
||||
|
||||
/// Main-actor facade that connects Talk Mode state to the overlay UI and the runtime.
@MainActor
@Observable
final class TalkModeController {
    static let shared = TalkModeController()

    private let logger = Logger(subsystem: "com.steipete.clawdis", category: "talk.controller")

    /// Shows or hides the talk overlay, then forwards the state to `TalkModeRuntime`.
    func setEnabled(_ enabled: Bool) async {
        self.logger.info("talk enabled=\(enabled)")
        let overlay = TalkOverlayController.shared
        if enabled {
            overlay.present()
        } else {
            overlay.dismiss()
        }
        await TalkModeRuntime.shared.setEnabled(enabled)
    }

    /// Forwards the runtime phase to the overlay.
    func updatePhase(_ phase: TalkModePhase) {
        TalkOverlayController.shared.updatePhase(phase)
    }

    /// Forwards the current input level to the overlay.
    func updateLevel(_ level: Double) {
        TalkOverlayController.shared.updateLevel(level)
    }

    /// Asks the runtime (asynchronously) to stop the current speech output.
    func stopSpeaking(reason: TalkStopReason = .userTap) {
        Task {
            await TalkModeRuntime.shared.stopSpeaking(reason: reason)
        }
    }

    /// Turns Talk Mode off through the shared app state.
    func exitTalkMode() {
        Task {
            await AppStateStore.shared.setTalkEnabled(false)
        }
    }
}

/// Why speech output was stopped.
enum TalkStopReason {
    case userTap, speech, manual
}
|
||||
684
apps/macos/Sources/Clawdis/TalkModeRuntime.swift
Normal file
684
apps/macos/Sources/Clawdis/TalkModeRuntime.swift
Normal file
@@ -0,0 +1,684 @@
|
||||
import AVFoundation
|
||||
import ClawdisChatUI
|
||||
import ClawdisKit
|
||||
import Foundation
|
||||
import OSLog
|
||||
import Speech
|
||||
|
||||
actor TalkModeRuntime {
|
||||
static let shared = TalkModeRuntime()
|
||||
|
||||
private let logger = Logger(subsystem: "com.steipete.clawdis", category: "talk.runtime")
|
||||
|
||||
private var recognizer: SFSpeechRecognizer?
|
||||
private var audioEngine: AVAudioEngine?
|
||||
private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest?
|
||||
private var recognitionTask: SFSpeechRecognitionTask?
|
||||
private var recognitionGeneration: Int = 0
|
||||
|
||||
private var captureTask: Task<Void, Never>?
|
||||
private var silenceTask: Task<Void, Never>?
|
||||
private var phase: TalkModePhase = .idle
|
||||
private var isEnabled = false
|
||||
|
||||
private var lastHeard: Date?
|
||||
private var noiseFloorRMS: Double = 1e-4
|
||||
private var lastTranscript: String = ""
|
||||
private var lastSpeechEnergyAt: Date?
|
||||
|
||||
private var defaultVoiceId: String?
|
||||
private var currentVoiceId: String?
|
||||
private var defaultModelId: String?
|
||||
private var currentModelId: String?
|
||||
private var voiceOverrideActive = false
|
||||
private var modelOverrideActive = false
|
||||
private var defaultOutputFormat: String?
|
||||
private var interruptOnSpeech: Bool = true
|
||||
private var lastInterruptedAtSeconds: Double?
|
||||
private var lastSpokenText: String?
|
||||
|
||||
private let silenceWindow: TimeInterval = 0.7
|
||||
private let minSpeechRMS: Double = 1e-3
|
||||
private let speechBoostFactor: Double = 6.0
|
||||
|
||||
// MARK: - Lifecycle
|
||||
|
||||
/// Starts or stops the talk runtime. A call with the already-current state is a no-op.
func setEnabled(_ enabled: Bool) async {
    if enabled == self.isEnabled { return }
    self.isEnabled = enabled
    guard enabled else {
        await self.stop()
        return
    }
    await self.start()
}
|
||||
|
||||
/// Loads config, starts speech recognition, enters the listening phase and launches
/// the silence monitor. Does nothing when voice wake is unsupported or the required
/// permissions are missing.
private func start() async {
    if !voiceWakeSupported { return }
    if !PermissionManager.voiceWakePermissionsGranted() {
        self.logger.debug("talk runtime not starting: permissions missing")
        return
    }

    await self.reloadConfig()
    await self.startRecognition()

    self.phase = .listening
    await MainActor.run {
        TalkModeController.shared.updatePhase(.listening)
    }
    self.startSilenceMonitor()
}
|
||||
|
||||
/// Shuts the runtime down: cancels background tasks, stops any speech output and
/// recognition, clears transient state and resets the UI to idle.
private func stop() async {
    self.captureTask?.cancel()
    self.captureTask = nil
    self.silenceTask?.cancel()
    self.silenceTask = nil

    // Must run BEFORE the phase is reset: `stopSpeaking` is a no-op unless the phase is
    // still `.speaking`, so resetting first would leave the audio playing.
    await self.stopSpeaking(reason: .manual)

    self.lastTranscript = ""
    self.lastHeard = nil
    self.lastSpeechEnergyAt = nil
    self.phase = .idle
    await self.stopRecognition()

    await MainActor.run {
        TalkModeController.shared.updateLevel(0)
        TalkModeController.shared.updatePhase(.idle)
    }
}
|
||||
|
||||
// MARK: - Speech recognition
|
||||
|
||||
/// Snapshot of one speech-recognizer callback, handed to `handleRecognition`.
private struct RecognitionUpdate {
    /// Best transcription text, if the callback carried a result.
    let transcript: String?
    /// Segments of the best transcription (empty when there was no result).
    let segments: [SFTranscriptionSegment]
    /// Whether the recognizer marked the result as final.
    let isFinal: Bool
    /// Error reported by the callback, if any.
    let error: Error?
    /// Value of `recognitionGeneration` when the recognition task was created.
    let generation: Int
}
|
||||
|
||||
/// (Re)starts live speech recognition on the default audio input.
///
/// Any running recognition is stopped first. The generation counter is bumped so that
/// callbacks from earlier recognition tasks are ignored by `handleRecognition`.
/// Failures (recognizer unavailable, no usable audio input, engine start error) are
/// logged and leave recognition stopped.
private func startRecognition() async {
    await self.stopRecognition()
    self.recognitionGeneration &+= 1
    let generation = self.recognitionGeneration

    let locale = await MainActor.run { AppStateStore.shared.voiceWakeLocaleID }
    self.recognizer = SFSpeechRecognizer(locale: Locale(identifier: locale))
    guard let recognizer, recognizer.isAvailable else {
        self.logger.error("talk recognizer unavailable")
        return
    }

    self.recognitionRequest = SFSpeechAudioBufferRecognitionRequest()
    self.recognitionRequest?.shouldReportPartialResults = true
    guard let request = self.recognitionRequest else { return }

    if self.audioEngine == nil {
        self.audioEngine = AVAudioEngine()
    }
    guard let audioEngine = self.audioEngine else { return }

    let input = audioEngine.inputNode
    let format = input.outputFormat(forBus: 0)
    // Without an input device the node reports a 0 Hz / 0-channel format, and
    // installing a tap with it raises an exception. Bail out instead.
    guard format.sampleRate > 0, format.channelCount > 0 else {
        self.logger.error("talk audio input unavailable (invalid input format)")
        return
    }
    input.removeTap(onBus: 0)
    input.installTap(onBus: 0, bufferSize: 2048, format: format) { [weak self, weak request] buffer, _ in
        // Feed the recognizer and report the buffer's level to the actor.
        request?.append(buffer)
        if let rms = Self.rmsLevel(buffer: buffer) {
            Task.detached { [weak self] in
                await self?.noteAudioLevel(rms: rms)
            }
        }
    }

    audioEngine.prepare()
    do {
        try audioEngine.start()
    } catch {
        self.logger.error("talk audio engine start failed: \(error.localizedDescription, privacy: .public)")
        return
    }

    self.recognitionTask = recognizer.recognitionTask(with: request) { [weak self, generation] result, error in
        guard let self else { return }
        let transcript = result?.bestTranscription.formattedString
        let update = RecognitionUpdate(
            transcript: transcript,
            segments: result?.bestTranscription.segments ?? [],
            isFinal: result?.isFinal ?? false,
            error: error,
            generation: generation)
        Task { await self.handleRecognition(update) }
    }
}
|
||||
|
||||
/// Cancels the recognition task, ends the audio request, removes the input tap and
/// releases the engine and recognizer. Bumping the generation invalidates callbacks
/// that are still in flight.
private func stopRecognition() async {
    self.recognitionGeneration &+= 1

    if let task = self.recognitionTask {
        task.cancel()
    }
    self.recognitionTask = nil

    if let request = self.recognitionRequest {
        request.endAudio()
    }
    self.recognitionRequest = nil

    if let engine = self.audioEngine {
        engine.inputNode.removeTap(onBus: 0)
        engine.stop()
    }
    self.audioEngine = nil
    self.recognizer = nil
}
|
||||
|
||||
/// Processes one recognizer update.
///
/// Updates from an outdated generation are dropped. While speaking (with interrupt
/// enabled) the transcript is only used to decide whether to interrupt playback;
/// while listening it updates `lastTranscript` / `lastHeard` for the silence monitor.
private func handleRecognition(_ update: RecognitionUpdate) async {
    if update.generation != self.recognitionGeneration { return }

    if let failure = update.error {
        self.logger.debug("talk recognition error: \(failure.localizedDescription, privacy: .public)")
    }
    guard let rawTranscript = update.transcript else { return }
    let spokenText = rawTranscript.trimmingCharacters(in: .whitespacesAndNewlines)

    let canInterrupt = self.phase == .speaking && self.interruptOnSpeech
    if canInterrupt {
        let interrupt = await self.shouldInterrupt(transcript: spokenText, segments: update.segments)
        if interrupt {
            await self.stopSpeaking(reason: .speech)
            self.lastTranscript = ""
            self.lastHeard = nil
            await self.startListening()
        }
        return
    }

    if self.phase != .listening { return }

    // A final result always replaces the transcript, even when empty; a partial
    // result only does so when it has text.
    if update.isFinal || !spokenText.isEmpty {
        self.lastTranscript = spokenText
    }
    if !spokenText.isEmpty {
        self.lastHeard = Date()
    }
}
|
||||
|
||||
// MARK: - Silence handling
|
||||
|
||||
/// Starts the background task that polls for end-of-utterance silence every 200 ms
/// while the runtime is enabled. Any previous monitor task is cancelled first.
private func startSilenceMonitor() {
    self.silenceTask?.cancel()
    self.silenceTask = Task { [weak self] in
        guard let self else { return }
        while self.isEnabled, !Task.isCancelled {
            do {
                try await Task.sleep(nanoseconds: 200_000_000)
            } catch {
                // Sleep only throws on cancellation. Exit instead of looping without
                // a delay, which would spin a cancelled task while `isEnabled` is true.
                return
            }
            await self.checkSilence()
        }
    }
}
|
||||
|
||||
/// Finalizes the pending transcript once nothing has been heard for `silenceWindow`
/// seconds. Only acts in the listening phase and when there is a non-empty transcript.
private func checkSilence() async {
    if self.phase != .listening { return }

    let pending = self.lastTranscript.trimmingCharacters(in: .whitespacesAndNewlines)
    if pending.isEmpty { return }

    guard let heardAt = self.lastHeard else { return }
    let quietFor = Date().timeIntervalSince(heardAt)
    if quietFor < self.silenceWindow { return }

    await self.finalizeTranscript(pending)
}
|
||||
|
||||
/// Enters the listening phase with a clean transcript state and resets the UI
/// (phase = listening, level = 0).
private func startListening() async {
    self.lastHeard = nil
    self.lastTranscript = ""
    self.phase = .listening

    await MainActor.run {
        let controller = TalkModeController.shared
        controller.updatePhase(.listening)
        controller.updateLevel(0)
    }
}
|
||||
|
||||
/// Takes a completed utterance: clears the transcript state, switches to the thinking
/// phase, stops recognition, then sends the text and speaks the reply.
private func finalizeTranscript(_ text: String) async {
    self.lastHeard = nil
    self.lastTranscript = ""
    self.phase = .thinking

    await MainActor.run {
        TalkModeController.shared.updatePhase(.thinking)
    }
    await self.stopRecognition()
    await self.sendAndSpeak(text)
}
|
||||
|
||||
// MARK: - Gateway + TTS
|
||||
|
||||
/// Sends the transcript to the gateway chat, waits for the run to complete, speaks the
/// latest assistant reply, and then returns to listening.
///
/// Listening and recognition are resumed after every outcome (success, non-final
/// completion, missing reply, send failure), but only while talk mode is still enabled:
/// this method can be suspended for a long time, and the user may have switched talk
/// mode off in the meantime.
private func sendAndSpeak(_ transcript: String) async {
    await self.reloadConfig()
    let prompt = self.buildPrompt(transcript: transcript)
    let runId = UUID().uuidString

    do {
        let response = try await GatewayConnection.shared.chatSend(
            sessionKey: "main",
            message: prompt,
            thinking: "low",
            idempotencyKey: runId,
            attachments: [])
        let completion = await self.waitForChatCompletion(
            runId: response.runId,
            timeoutSeconds: 120)

        // Only a run that ended in the "final" state has a reply worth speaking.
        if completion == .final,
           let assistantText = await self.latestAssistantText(sessionKey: "main")
        {
            await self.playAssistant(text: assistantText)
        }
    } catch {
        self.logger.error("talk chat.send failed: \(error.localizedDescription, privacy: .public)")
    }

    // Do not reopen the microphone if talk mode was disabled while we were waiting.
    guard self.isEnabled else { return }
    await self.startListening()
    await self.startRecognition()
}
|
||||
|
||||
/// Builds the chat message for a spoken transcript: fixed Talk Mode instructions, an
/// optional note about where the previous reply was interrupted, a blank line, then
/// the transcript.
///
/// Side effect: a pending `lastInterruptedAtSeconds` is consumed (reset to `nil`).
private func buildPrompt(transcript: String) -> String {
    var preamble: [String] = [
        "Talk Mode active. Reply in a concise, spoken tone.",
        "You may optionally prefix the response with JSON (first line) to set ElevenLabs voice, e.g. {\"voice\":\"<id>\",\"once\":true}.",
    ]

    if let seconds = self.lastInterruptedAtSeconds {
        let rendered = String(format: "%.1f", seconds)
        preamble.append("Assistant speech interrupted at \(rendered)s.")
        self.lastInterruptedAtSeconds = nil
    }

    return (preamble + ["", transcript]).joined(separator: "\n")
}
|
||||
|
||||
/// How waiting for a chat run ended (see `waitForChatCompletion`).
private enum ChatCompletionState {
    /// The run reported state "final".
    case final
    /// The run reported state "aborted".
    case aborted
    /// The run reported state "error".
    case error
    /// No terminal event arrived before the deadline, or the event stream ended.
    case timeout
}
|
||||
|
||||
/// Waits until the gateway reports a terminal chat state for `runId`, or the timeout
/// elapses, whichever happens first.
///
/// Two child tasks race: one scans the gateway push stream for "chat" events belonging
/// to the run, the other sleeps for the timeout. The first result wins and the other
/// task is cancelled.
///
/// - Returns: `.final`, `.aborted` or `.error` as reported by the run; `.timeout` when
///   the deadline passed or the stream ended without a terminal state.
private func waitForChatCompletion(runId: String, timeoutSeconds: Int) async -> ChatCompletionState {
    await withTaskGroup(of: ChatCompletionState.self) { group in
        // Watcher: look for a terminal event of this run.
        group.addTask { [runId] in
            let pushes = GatewayConnection.shared.subscribe()
            for await push in pushes {
                guard case let .event(evt) = push, evt.event == "chat", let payload = evt.payload else {
                    continue
                }
                // Re-encode the payload and decode it as a chat event; skip on failure.
                guard let encoded = try? JSONEncoder().encode(payload),
                      let chat = try? JSONDecoder().decode(ClawdisChatEventPayload.self, from: encoded)
                else {
                    continue
                }
                guard chat.runId == runId else { continue }
                switch chat.state {
                case .some("final"): return .final
                case .some("aborted"): return .aborted
                case .some("error"): return .error
                default: break
                }
            }
            return .timeout
        }

        // Deadline.
        group.addTask {
            try? await Task.sleep(nanoseconds: UInt64(timeoutSeconds) * 1_000_000_000)
            return .timeout
        }

        let winner = await group.next() ?? .timeout
        group.cancelAll()
        return winner
    }
}
|
||||
|
||||
/// Fetches the chat history and returns the text of the most recent assistant message.
///
/// Messages that cannot be re-encoded/decoded as `ClawdisChatMessage` are ignored.
/// The text parts of the message are joined with newlines and trimmed.
///
/// - Returns: The trimmed text, or `nil` when there is no assistant message, its text
///   is empty, or the history request failed (the failure is logged).
private func latestAssistantText(sessionKey: String) async -> String? {
    do {
        let history = try await GatewayConnection.shared.chatHistory(sessionKey: sessionKey)
        let messages = history.messages ?? []

        // Walk backwards: the first decodable assistant message is the latest one.
        for item in messages.reversed() {
            guard let data = try? JSONEncoder().encode(item),
                  let message = try? JSONDecoder().decode(ClawdisChatMessage.self, from: data),
                  message.role == "assistant"
            else {
                continue
            }
            let joined = message.content.compactMap { $0.text }.joined(separator: "\n")
            let body = joined.trimmingCharacters(in: .whitespacesAndNewlines)
            return body.isEmpty ? nil : body
        }
        return nil
    } catch {
        self.logger.error("talk history fetch failed: \(error.localizedDescription, privacy: .public)")
        return nil
    }
}
|
||||
|
||||
/// Speaks an assistant reply through ElevenLabs TTS.
///
/// Steps: parse an optional leading directive, apply voice/model overrides (persistent
/// unless the directive says `once`), resolve the voice id and API key, restart
/// recognition so `handleRecognition` can evaluate interruptions during playback,
/// synthesize the audio and play it. The phase is `.speaking` during playback and
/// `.thinking` afterwards; the caller decides what happens next.
///
/// Returns early without speaking when the cleaned text is empty, no voice id can be
/// resolved, or `ELEVENLABS_API_KEY` is not set.
private func playAssistant(text: String) async {
    let parse = TalkDirectiveParser.parse(text)
    let directive = parse.directive
    let cleaned = parse.stripped.trimmingCharacters(in: .whitespacesAndNewlines)
    guard !cleaned.isEmpty else { return }

    if !parse.unknownKeys.isEmpty {
        self.logger.warning("talk directive ignored keys: \(parse.unknownKeys.joined(separator: ","), privacy: .public)")
    }

    if let voice = directive?.voiceId {
        if directive?.once == true {
            self.logger.info("talk voice override (once) voiceId=\(voice, privacy: .public)")
        } else {
            self.currentVoiceId = voice
            self.voiceOverrideActive = true
            self.logger.info("talk voice override voiceId=\(voice, privacy: .public)")
        }
    }

    if let model = directive?.modelId {
        if directive?.once == true {
            self.logger.info("talk model override (once) modelId=\(model, privacy: .public)")
        } else {
            self.currentModelId = model
            self.modelOverrideActive = true
            // Logged for parity with the persistent voice override above.
            self.logger.info("talk model override modelId=\(model, privacy: .public)")
        }
    }

    // Precedence: directive, then active override / config value, then default.
    let voiceId =
        directive?.voiceId ??
        self.currentVoiceId ??
        self.defaultVoiceId

    guard let voiceId, !voiceId.isEmpty else {
        self.logger.error("talk missing voiceId; set talk.voiceId or ELEVENLABS_VOICE_ID")
        return
    }

    let apiKey = ProcessInfo.processInfo.environment["ELEVENLABS_API_KEY"] ?? ""
    if apiKey.isEmpty {
        self.logger.error("talk missing ELEVENLABS_API_KEY")
        return
    }

    // Recognition keeps running during playback so speech can interrupt it.
    await self.startRecognition()
    await MainActor.run { TalkModeController.shared.updatePhase(.speaking) }
    self.phase = .speaking
    self.lastSpokenText = cleaned

    let resolvedSpeed = Self.resolveSpeed(
        speed: directive?.speed,
        rateWPM: directive?.rateWPM,
        logger: self.logger)

    let request = ElevenLabsRequest(
        text: cleaned,
        modelId: directive?.modelId ?? self.currentModelId ?? self.defaultModelId,
        outputFormat: directive?.outputFormat ?? self.defaultOutputFormat,
        speed: resolvedSpeed,
        stability: Self.validatedUnit(directive?.stability, name: "stability", logger: self.logger),
        similarity: Self.validatedUnit(directive?.similarity, name: "similarity", logger: self.logger),
        style: Self.validatedUnit(directive?.style, name: "style", logger: self.logger),
        speakerBoost: directive?.speakerBoost,
        seed: Self.validatedSeed(directive?.seed, logger: self.logger),
        normalize: Self.validatedNormalize(directive?.normalize, logger: self.logger),
        language: Self.validatedLanguage(directive?.language, logger: self.logger))

    do {
        let audio = try await ElevenLabsClient(apiKey: apiKey).synthesize(
            voiceId: voiceId,
            request: request)
        // `MainActor.run` only accepts a synchronous closure, so the async player is
        // awaited directly; the call hops to the main actor on its own.
        let result = await TalkAudioPlayer.shared.play(data: audio)
        if !result.finished, let interruptedAt = result.interruptedAt, self.phase == .speaking {
            if self.interruptOnSpeech {
                self.lastInterruptedAtSeconds = interruptedAt
            }
        }
    } catch {
        self.logger.error("talk TTS failed: \(error.localizedDescription, privacy: .public)")
    }

    self.phase = .thinking
    await MainActor.run { TalkModeController.shared.updatePhase(.thinking) }
}
|
||||
|
||||
/// Stops the current speech playback. No-op unless the phase is `.speaking`.
///
/// When the stop was caused by user speech, the playback position is remembered in
/// `lastInterruptedAtSeconds`. Afterwards the phase is `.thinking`.
func stopSpeaking(reason: TalkStopReason) async {
    if self.phase != .speaking { return }

    let position = await MainActor.run { TalkAudioPlayer.shared.stop() }
    if let position, reason == .speech {
        self.lastInterruptedAtSeconds = position
    }

    self.phase = .thinking
    await MainActor.run {
        TalkModeController.shared.updatePhase(.thinking)
    }
}
|
||||
|
||||
// MARK: - Config
|
||||
|
||||
/// Refreshes defaults from the talk config. The current voice/model follow the config
/// unless an override is active.
private func reloadConfig() async {
    let config = await self.fetchTalkConfig()

    self.defaultVoiceId = config.voiceId
    self.defaultModelId = config.modelId
    self.defaultOutputFormat = config.outputFormat
    self.interruptOnSpeech = config.interruptOnSpeech

    // Keep explicit overrides; otherwise track the configured values.
    if !self.voiceOverrideActive {
        self.currentVoiceId = config.voiceId
    }
    if !self.modelOverrideActive {
        self.currentModelId = config.modelId
    }
}
|
||||
|
||||
/// Talk settings resolved by `fetchTalkConfig`.
private struct TalkRuntimeConfig {
    /// Voice id from config or environment, if any.
    let voiceId: String?
    /// Model id from config, if any.
    let modelId: String?
    /// Output format from config, if any.
    let outputFormat: String?
    /// Whether user speech may interrupt playback.
    let interruptOnSpeech: Bool
}
|
||||
|
||||
/// Resolves the talk configuration from the gateway config, with environment fallbacks.
///
/// Voice id precedence: `talk.voiceId` from the gateway config, then
/// `ELEVENLABS_VOICE_ID`, then `SAG_VOICE_ID`. Every candidate is trimmed, and blank
/// values are skipped. When the config request fails, only the environment voice is
/// used, model/output format are `nil`, and interrupt-on-speech defaults to `true`.
private func fetchTalkConfig() async -> TalkRuntimeConfig {
    /// Trimmed value, or `nil` when missing or blank.
    func nonBlank(_ raw: String?) -> String? {
        guard let trimmed = raw?.trimmingCharacters(in: .whitespacesAndNewlines), !trimmed.isEmpty else {
            return nil
        }
        return trimmed
    }

    let env = ProcessInfo.processInfo.environment
    let envVoice = nonBlank(env["ELEVENLABS_VOICE_ID"]) ?? nonBlank(env["SAG_VOICE_ID"])

    do {
        let snap: ConfigSnapshot = try await GatewayConnection.shared.requestDecoded(
            method: .configGet,
            params: nil,
            timeoutMs: 8000)
        let talk = snap.config?["talk"]?.dictionaryValue
        return TalkRuntimeConfig(
            // Use the trimmed id so stray whitespace never reaches the TTS request.
            voiceId: nonBlank(talk?["voiceId"]?.stringValue) ?? envVoice,
            modelId: talk?["modelId"]?.stringValue,
            outputFormat: talk?["outputFormat"]?.stringValue,
            interruptOnSpeech: talk?["interruptOnSpeech"]?.boolValue ?? true)
    } catch {
        self.logger.debug("talk config fetch failed, using env defaults: \(error.localizedDescription, privacy: .public)")
        return TalkRuntimeConfig(
            voiceId: envVoice,
            modelId: nil,
            outputFormat: nil,
            interruptOnSpeech: true)
    }
}
|
||||
|
||||
// MARK: - Audio level handling
|
||||
|
||||
/// Consumes one RMS level from the microphone tap.
///
/// Only active while listening or speaking. Updates the adaptive noise floor (faster
/// when the level drops below it), records the time of speech-level energy, and,
/// while listening, pushes a 0...1 level to the UI.
private func noteAudioLevel(rms: Double) async {
    guard self.phase == .listening || self.phase == .speaking else { return }

    // The floor follows drops quickly (0.08) and rises slowly (0.01).
    let smoothing: Double = rms < self.noiseFloorRMS ? 0.08 : 0.01
    let movedFloor = self.noiseFloorRMS + (rms - self.noiseFloorRMS) * smoothing
    self.noiseFloorRMS = max(1e-7, movedFloor)

    let speechThreshold = max(self.minSpeechRMS, self.noiseFloorRMS * self.speechBoostFactor)
    if rms >= speechThreshold {
        let timestamp = Date()
        self.lastHeard = timestamp
        self.lastSpeechEnergyAt = timestamp
    }

    guard self.phase == .listening else { return }
    let level = min(1.0, max(0.0, rms / max(self.minSpeechRMS, speechThreshold)))
    await MainActor.run {
        TalkModeController.shared.updateLevel(level)
    }
}
|
||||
|
||||
/// Root-mean-square level of the buffer's first float channel.
///
/// - Returns: `nil` when the buffer has no float channel data or no frames.
private static func rmsLevel(buffer: AVAudioPCMBuffer) -> Double? {
    let frames = Int(buffer.frameLength)
    guard let samples = buffer.floatChannelData?.pointee, frames > 0 else { return nil }

    let sumOfSquares = (0..<frames).reduce(0.0) { partial, index in
        let sample = Double(samples[index])
        return partial + sample * sample
    }
    return sqrt(sumOfSquares / Double(frames))
}
|
||||
|
||||
/// Decides whether recognized speech should interrupt the assistant's playback.
///
/// Requires at least 3 characters of text that does not look like an echo of the
/// spoken reply, speech-level energy no older than 0.35 s (when a timestamp exists),
/// and at least one segment with confidence above 0.6.
private func shouldInterrupt(transcript: String, segments: [SFTranscriptionSegment]) async -> Bool {
    let heard = transcript.trimmingCharacters(in: .whitespacesAndNewlines)
    if heard.count < 3 { return false }
    if self.isLikelyEcho(of: heard) { return false }

    if let energyAt = self.lastSpeechEnergyAt,
       Date().timeIntervalSince(energyAt) > 0.35
    {
        return false
    }

    return segments.contains(where: { $0.confidence > 0.6 })
}
|
||||
|
||||
/// Whether `transcript` is probably the microphone picking up the assistant's own
/// speech: true when the lowercased transcript occurs inside the lowercased text
/// that was last spoken. False when nothing has been spoken yet.
private func isLikelyEcho(of transcript: String) -> Bool {
    guard let spoken = self.lastSpokenText?.lowercased(), !spoken.isEmpty else { return false }
    return spoken.contains(transcript.lowercased())
}
|
||||
|
||||
/// Resolves the speed multiplier to request, preferring `rateWPM` over `speed`.
///
/// A positive `rateWPM` is converted with `rateWPM / 175.0`; otherwise `speed` is used as-is.
/// Either way the value must lie strictly between 0.5 and 2.0. Out-of-range values are
/// logged and dropped (`nil`). An out-of-range `rateWPM` does not fall back to `speed`.
///
/// The range test is written positively (`> 0.5 && < 2.0`) so that a NaN `speed` is
/// rejected. With the previous `<= 0.5 || >= 2.0` form NaN compared false on both sides
/// and was returned to the caller.
///
/// - Parameters:
///   - speed: Explicit speed multiplier, if any.
///   - rateWPM: Words-per-minute rate, if any; takes precedence when positive.
///   - logger: Receives a warning for rejected values.
/// - Returns: The validated multiplier, or `nil` when nothing valid was supplied.
private static func resolveSpeed(speed: Double?, rateWPM: Int?, logger: Logger) -> Double? {
    if let rateWPM, rateWPM > 0 {
        let resolved = Double(rateWPM) / 175.0
        guard resolved > 0.5, resolved < 2.0 else {
            logger.warning("talk rateWPM out of range: \(rateWPM, privacy: .public)")
            return nil
        }
        return resolved
    }

    guard let speed else { return nil }
    // NaN fails both comparisons and is therefore treated as out of range.
    guard speed > 0.5, speed < 2.0 else {
        logger.warning("talk speed out of range: \(speed, privacy: .public)")
        return nil
    }
    return speed
}
|
||||
|
||||
/// Validates that an optional value lies in the closed unit interval `0...1`.
///
/// `ClosedRange.contains` is used instead of `value < 0 || value > 1` so that NaN is
/// rejected as well: NaN compares false against both bounds and previously passed.
///
/// - Parameters:
///   - value: The value to validate; `nil` is passed through as `nil`.
///   - name: Setting name used in the warning message.
///   - logger: Receives a warning when the value is rejected.
/// - Returns: `value` when it is within `0...1`, otherwise `nil`.
private static func validatedUnit(_ value: Double?, name: String, logger: Logger) -> Double? {
    guard let value else { return nil }
    guard (0.0...1.0).contains(value) else {
        logger.warning("talk \(name, privacy: .public) out of range: \(value, privacy: .public)")
        return nil
    }
    return value
}
|
||||
|
||||
/// Converts an optional seed to `UInt32`, rejecting values that do not fit.
///
/// - Parameters:
///   - value: Candidate seed; `nil` is passed through as `nil`.
///   - logger: Receives a warning when the seed is negative or above `UInt32.max`.
/// - Returns: The seed as `UInt32`, or `nil` when absent or out of range.
private static func validatedSeed(_ value: Int?, logger: Logger) -> UInt32? {
    guard let value else { return nil }
    // `init(exactly:)` yields nil for anything outside 0...4294967295.
    if let seed = UInt32(exactly: value) {
        return seed
    }
    logger.warning("talk seed out of range: \(value, privacy: .public)")
    return nil
}
|
||||
|
||||
/// Validates a text-normalization mode.
///
/// The input is trimmed and lowercased; only `auto`, `on` and `off` are accepted.
///
/// - Parameters:
///   - value: Raw mode string; `nil` is passed through as `nil`.
///   - logger: Receives a warning for any other value.
/// - Returns: The canonical lowercase mode, or `nil` when absent or unrecognized.
private static func validatedNormalize(_ value: String?, logger: Logger) -> String? {
    guard let value else { return nil }
    let mode = value.trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
    switch mode {
    case "auto", "on", "off":
        return mode
    default:
        logger.warning("talk normalize invalid: \(mode, privacy: .public)")
        return nil
    }
}
|
||||
|
||||
/// Validates a language code: exactly two characters, each in `a`...`z`, after trimming
/// and lowercasing.
///
/// - Parameters:
///   - value: Raw language string; `nil` is passed through as `nil`.
///   - logger: Receives a warning when the code has the wrong shape.
/// - Returns: The lowercase two-letter code, or `nil` when absent or invalid.
private static func validatedLanguage(_ value: String?, logger: Logger) -> String? {
    guard let value else { return nil }
    let code = value.trimmingCharacters(in: .whitespacesAndNewlines).lowercased()

    let hasNonLetter = code.contains(where: { $0 < "a" || $0 > "z" })
    if code.count == 2, !hasNonLetter {
        return code
    }

    logger.warning("talk language invalid: \(code, privacy: .public)")
    return nil
}
|
||||
}
|
||||
|
||||
/// Parameters for one text-to-speech request.
///
/// Only `text` is required. Each optional field is added to the request by
/// `ElevenLabsClient.synthesize` only when it is non-nil. Declaration order is kept so
/// the memberwise initializer's parameter order is unchanged.
private struct ElevenLabsRequest {
    /// The text to synthesize.
    let text: String

    /// Model and audio format selectors.
    let modelId, outputFormat: String?

    /// Voice settings expressed as floating-point values.
    let speed, stability, similarity, style: Double?

    /// Voice setting toggling speaker boost.
    let speakerBoost: Bool?

    /// Seed forwarded with the request.
    let seed: UInt32?

    /// Text-normalization mode and language code.
    let normalize, language: String?
}
|
||||
|
||||
/// Minimal client for the ElevenLabs text-to-speech HTTP endpoint.
private struct ElevenLabsClient {
    /// API key sent in the `xi-api-key` header.
    let apiKey: String
    /// Service root; the literal is a valid URL, so the force-unwrap cannot fail.
    let baseUrl: URL = URL(string: "https://api.elevenlabs.io")!

    /// Synthesizes speech for `request` using the given voice.
    ///
    /// Sends `POST /v1/text-to-speech/{voiceId}` with a JSON body built from the non-nil
    /// fields of `request`. `outputFormat` is sent as the `output_format` URL query item,
    /// which is where the endpoint expects it; it was previously placed in the JSON body.
    ///
    /// - Parameters:
    ///   - voiceId: Voice identifier appended to the endpoint path.
    ///   - request: Text and optional synthesis settings.
    /// - Returns: The raw response body.
    /// - Throws: Errors from JSON encoding or `URLSession`, or an `NSError` in domain
    ///   `TalkTTS` (code = HTTP status) when the server answers with status >= 400.
    func synthesize(voiceId: String, request: ElevenLabsRequest) async throws -> Data {
        let url = self.endpoint(voiceId: voiceId, outputFormat: request.outputFormat)
        let body = try JSONSerialization.data(withJSONObject: Self.payload(for: request), options: [])

        var req = URLRequest(url: url)
        req.httpMethod = "POST"
        req.httpBody = body
        req.setValue("application/json", forHTTPHeaderField: "Content-Type")
        req.setValue("audio/mpeg", forHTTPHeaderField: "Accept")
        req.setValue(self.apiKey, forHTTPHeaderField: "xi-api-key")

        let (data, response) = try await URLSession.shared.data(for: req)
        if let http = response as? HTTPURLResponse, http.statusCode >= 400 {
            let message = String(data: data, encoding: .utf8) ?? "unknown"
            throw NSError(domain: "TalkTTS", code: http.statusCode, userInfo: [
                NSLocalizedDescriptionKey: "ElevenLabs failed: \(http.statusCode) \(message)",
            ])
        }
        return data
    }

    /// Builds the endpoint URL, attaching `output_format` as a query item when provided.
    private func endpoint(voiceId: String, outputFormat: String?) -> URL {
        var url = self.baseUrl
        url.appendPathComponent("v1")
        url.appendPathComponent("text-to-speech")
        url.appendPathComponent(voiceId)

        guard let outputFormat, !outputFormat.isEmpty,
              var components = URLComponents(url: url, resolvingAgainstBaseURL: false)
        else { return url }
        components.queryItems = [URLQueryItem(name: "output_format", value: outputFormat)]
        // Fall back to the plain URL if the components cannot be recomposed.
        return components.url ?? url
    }

    /// Builds the JSON body; optional fields are included only when set.
    private static func payload(for request: ElevenLabsRequest) -> [String: Any] {
        var payload: [String: Any] = [
            "text": request.text,
        ]
        if let modelId = request.modelId, !modelId.isEmpty {
            payload["model_id"] = modelId
        }
        if let seed = request.seed {
            payload["seed"] = seed
        }
        if let normalize = request.normalize {
            payload["apply_text_normalization"] = normalize
        }
        if let language = request.language {
            payload["language_code"] = language
        }

        var voiceSettings: [String: Any] = [:]
        if let speed = request.speed { voiceSettings["speed"] = speed }
        if let stability = request.stability { voiceSettings["stability"] = stability }
        if let similarity = request.similarity { voiceSettings["similarity_boost"] = similarity }
        if let style = request.style { voiceSettings["style"] = style }
        if let speakerBoost = request.speakerBoost { voiceSettings["use_speaker_boost"] = speakerBoost }
        if !voiceSettings.isEmpty {
            payload["voice_settings"] = voiceSettings
        }
        return payload
    }
}
|
||||
8
apps/macos/Sources/Clawdis/TalkModeTypes.swift
Normal file
8
apps/macos/Sources/Clawdis/TalkModeTypes.swift
Normal file
@@ -0,0 +1,8 @@
|
||||
import Foundation
|
||||
|
||||
/// The phases talk mode can be in. Raw values equal the case names.
enum TalkModePhase: String {
    case idle, listening, thinking, speaking
}
|
||||
119
apps/macos/Sources/Clawdis/TalkOverlay.swift
Normal file
119
apps/macos/Sources/Clawdis/TalkOverlay.swift
Normal file
@@ -0,0 +1,119 @@
|
||||
import AppKit
|
||||
import Observation
|
||||
import OSLog
|
||||
import SwiftUI
|
||||
|
||||
/// Owns the small borderless panel that hosts `TalkOverlayView` and publishes the state
/// (`model`) that the view renders.
///
/// The panel is created lazily on first `present()`, positioned in the top-right corner of
/// the main screen's visible frame, and faded in and out with short animations.
@MainActor
@Observable
final class TalkOverlayController {
    static let shared = TalkOverlayController()

    private let logger = Logger(subsystem: "com.steipete.clawdis", category: "talk.overlay")

    /// State rendered by the overlay view.
    struct Model {
        var isVisible: Bool = false
        var phase: TalkModePhase = .idle
        var level: Double = 0
    }

    var model = Model()
    private var window: NSPanel?
    private var hostingView: NSHostingView<TalkOverlayView>?

    /// True from the start of a dismiss animation until it completes or is cancelled.
    @ObservationIgnored private var isDismissing = false
    /// Bumped by every `present()` and `dismiss()` so a stale dismiss completion can tell
    /// that it has been superseded and must not hide the window.
    @ObservationIgnored private var dismissGeneration = 0

    private let width: CGFloat = 92
    private let height: CGFloat = 92
    private let padding: CGFloat = 8

    /// Shows the overlay, fading it in when it is not already on screen.
    ///
    /// A call that arrives while a dismiss animation is running cancels that dismissal
    /// and fades the panel back in. Previously the dismiss completion handler would still
    /// fire afterwards and order the freshly presented window out.
    func present() {
        self.ensureWindow()
        self.hostingView?.rootView = TalkOverlayView(controller: self)
        let target = self.targetFrame()

        guard let window else { return }

        let interruptedDismiss = self.isDismissing
        self.isDismissing = false
        self.dismissGeneration &+= 1

        if !self.model.isVisible || interruptedDismiss {
            self.model.isVisible = true
            // Start slightly below the final position, transparent, then animate in.
            let start = target.offsetBy(dx: 0, dy: -6)
            window.setFrame(start, display: true)
            window.alphaValue = 0
            window.orderFrontRegardless()
            NSAnimationContext.runAnimationGroup { context in
                context.duration = 0.18
                context.timingFunction = CAMediaTimingFunction(name: .easeOut)
                window.animator().setFrame(target, display: true)
                window.animator().alphaValue = 1
            }
        } else {
            window.setFrame(target, display: true)
            window.orderFrontRegardless()
        }
    }

    /// Fades the overlay out and orders the panel out once the animation finishes.
    ///
    /// Calls made while a dismissal is already in flight are ignored so the frame is not
    /// offset again. If `present()` is called before the animation completes, the
    /// completion handler detects the newer generation and leaves the window alone.
    func dismiss() {
        guard let window else {
            self.model.isVisible = false
            return
        }
        guard !self.isDismissing else { return }

        self.isDismissing = true
        self.dismissGeneration &+= 1
        let generation = self.dismissGeneration

        let target = window.frame.offsetBy(dx: 6, dy: 6)
        NSAnimationContext.runAnimationGroup { context in
            context.duration = 0.16
            context.timingFunction = CAMediaTimingFunction(name: .easeOut)
            window.animator().setFrame(target, display: true)
            window.animator().alphaValue = 0
        } completionHandler: {
            Task { @MainActor in
                // Skip if a later present()/dismiss() superseded this dismissal.
                guard self.dismissGeneration == generation else { return }
                self.isDismissing = false
                window.orderOut(nil)
                self.model.isVisible = false
            }
        }
    }

    /// Records a phase change; no-op (and no log line) when the phase is unchanged.
    func updatePhase(_ phase: TalkModePhase) {
        guard self.model.phase != phase else { return }
        self.logger.info("talk overlay phase=\(phase.rawValue, privacy: .public)")
        self.model.phase = phase
    }

    /// Stores the input level clamped to `0...1`; ignored while the overlay is hidden.
    func updateLevel(_ level: Double) {
        guard self.model.isVisible else { return }
        self.model.level = max(0, min(1, level))
    }

    // MARK: - Private

    /// Creates the panel and its hosting view on first use.
    private func ensureWindow() {
        if self.window != nil { return }
        let panel = NSPanel(
            contentRect: NSRect(x: 0, y: 0, width: self.width, height: self.height),
            styleMask: [.nonactivatingPanel, .borderless],
            backing: .buffered,
            defer: false)
        panel.isOpaque = false
        panel.backgroundColor = .clear
        panel.hasShadow = false
        panel.level = NSWindow.Level(rawValue: NSWindow.Level.popUpMenu.rawValue - 4)
        panel.collectionBehavior = [.canJoinAllSpaces, .fullScreenAuxiliary, .transient]
        panel.hidesOnDeactivate = false
        panel.isMovable = false
        panel.isFloatingPanel = true
        panel.becomesKeyOnlyIfNeeded = true
        panel.titleVisibility = .hidden
        panel.titlebarAppearsTransparent = true

        let host = NSHostingView(rootView: TalkOverlayView(controller: self))
        host.translatesAutoresizingMaskIntoConstraints = false
        panel.contentView = host
        self.hostingView = host
        self.window = panel
    }

    /// Frame in the top-right corner of the main screen's visible area, inset by `padding`.
    /// Returns `.zero` when there is no main screen.
    private func targetFrame() -> NSRect {
        guard let screen = NSScreen.main else { return .zero }
        let size = NSSize(width: self.width, height: self.height)
        let visible = screen.visibleFrame
        let origin = CGPoint(
            x: visible.maxX - size.width - self.padding,
            y: visible.maxY - size.height - self.padding)
        return NSRect(origin: origin, size: size)
    }
}
|
||||
139
apps/macos/Sources/Clawdis/TalkOverlayView.swift
Normal file
139
apps/macos/Sources/Clawdis/TalkOverlayView.swift
Normal file
@@ -0,0 +1,139 @@
|
||||
import SwiftUI
|
||||
|
||||
/// Root view of the talk overlay: the animated cloud with a close button in the
/// top-leading corner, laid out in a fixed 92x92 frame.
struct TalkOverlayView: View {
    var controller: TalkOverlayController
    @State private var hovering = false

    var body: some View {
        ZStack(alignment: .topLeading) {
            self.cloud
            self.closeButton
        }
        .frame(width: 92, height: 92, alignment: .center)
    }

    /// The cloud; tapping it asks the talk-mode controller to stop speaking.
    private var cloud: some View {
        TalkCloudView(phase: self.controller.model.phase, level: self.controller.model.level)
            .frame(width: 76, height: 64)
            .contentShape(Rectangle())
            .onTapGesture {
                TalkModeController.shared.stopSpeaking(reason: .userTap)
            }
            .padding(8)
    }

    /// Round "x" button that exits talk mode; it brightens while hovered.
    private var closeButton: some View {
        let glyphOpacity = self.hovering ? 0.95 : 0.7
        let backdropOpacity = self.hovering ? 0.45 : 0.3

        return Button(
            action: { TalkModeController.shared.exitTalkMode() },
            label: {
                Image(systemName: "xmark")
                    .font(.system(size: 10, weight: .bold))
                    .foregroundStyle(Color.white.opacity(glyphOpacity))
                    .frame(width: 18, height: 18)
                    .background(Color.black.opacity(backdropOpacity))
                    .clipShape(Circle())
            })
            .buttonStyle(.plain)
            .contentShape(Circle())
            .padding(4)
            .onHover { isInside in self.hovering = isInside }
    }
}
|
||||
|
||||
/// Animated cloud whose motion depends on the talk phase:
/// - listening: the cloud and a ring scale with the input `level`;
/// - thinking: the cloud shrinks slightly, bobs vertically and shows dots;
/// - speaking: the cloud pulses and shows expanding rings.
private struct TalkCloudView: View {
    let phase: TalkModePhase
    let level: Double

    var body: some View {
        TimelineView(.animation) { timeline in
            let now = timeline.date.timeIntervalSinceReferenceDate

            ZStack {
                self.cloud(at: now)

                if self.phase == .listening {
                    self.listeningRing
                }

                if self.phase == .thinking {
                    TalkThinkingDots(time: now)
                        .offset(y: 18)
                }

                if self.phase == .speaking {
                    TalkSpeakingRings(time: now)
                }
            }
        }
    }

    /// The filled, outlined and shadowed cloud with its phase-dependent scale and offset.
    private func cloud(at time: TimeInterval) -> some View {
        CloudShape()
            .fill(self.cloudGradient)
            .overlay(
                CloudShape()
                    .stroke(Color.white.opacity(0.35), lineWidth: 0.8))
            .shadow(color: Color.black.opacity(0.18), radius: 8, x: 0, y: 4)
            .scaleEffect(self.cloudScale(at: time))
            .offset(y: self.sinkOffset(at: time))
    }

    /// Ring shown while listening; grows and becomes more opaque with `level`.
    private var listeningRing: some View {
        Circle()
            .stroke(self.ringGradient, lineWidth: 1)
            .scaleEffect(1 + CGFloat(self.level) * 0.45)
            .opacity(0.3 + self.level * 0.4)
            .animation(.easeOut(duration: 0.08), value: self.level)
    }

    /// Product of the base scale (0.94 while thinking), the speaking pulse and the
    /// listening level boost; each factor is 1 outside its own phase.
    private func cloudScale(at time: TimeInterval) -> CGFloat {
        let base: Double = self.phase == .thinking ? 0.94 : 1
        let pulse: Double = self.phase == .speaking ? 1 + 0.04 * sin(time * 6) : 1
        let listen: CGFloat = self.phase == .listening ? 1 + CGFloat(self.level) * 0.14 : 1
        return CGFloat(base * pulse) * listen
    }

    /// Vertical bob applied while thinking (oscillates between 1 and 5), otherwise 0.
    private func sinkOffset(at time: TimeInterval) -> CGFloat {
        guard self.phase == .thinking else { return 0 }
        return CGFloat(3 + 2 * sin(time * 2))
    }

    private var cloudGradient: LinearGradient {
        let stops = Gradient(colors: [
            Color(red: 0.95, green: 0.98, blue: 1.0),
            Color(red: 0.75, green: 0.88, blue: 1.0),
        ])
        return LinearGradient(gradient: stops, startPoint: .topLeading, endPoint: .bottomTrailing)
    }

    private var ringGradient: LinearGradient {
        let stops = Gradient(colors: [Color.white.opacity(0.6), Color.white.opacity(0.1)])
        return LinearGradient(gradient: stops, startPoint: .top, endPoint: .bottom)
    }
}
|
||||
|
||||
/// Three small dots whose opacity cycles over time, each dot offset in phase.
private struct TalkThinkingDots: View {
    let time: TimeInterval

    var body: some View {
        HStack(spacing: 4) {
            ForEach(0..<3, id: \.self) { dotIndex in
                Circle()
                    .fill(Color.white.opacity(0.75))
                    .frame(width: 5, height: 5)
                    .opacity(self.brightness(forDot: dotIndex))
            }
        }
    }

    /// Opacity for one dot: 0.35 plus 0.55 times the fractional part of the dot's
    /// progress, which advances at two cycles per second and lags by 0.45 per index.
    private func brightness(forDot dotIndex: Int) -> Double {
        let progress = (self.time * 2 + Double(dotIndex) * 0.45).truncatingRemainder(dividingBy: 1)
        return 0.35 + 0.55 * progress
    }
}
|
||||
|
||||
/// Three concentric rings that repeatedly expand and fade, staggered by a third of a cycle.
private struct TalkSpeakingRings: View {
    let time: TimeInterval

    var body: some View {
        ZStack {
            ForEach(0..<3, id: \.self) { ringIndex in
                self.ring(progress: self.progress(forRing: ringIndex))
            }
        }
    }

    /// Fractional progress (0 up to, but not including, 1) of one ring through its cycle.
    private func progress(forRing ringIndex: Int) -> Double {
        (self.time * 1.1 + Double(ringIndex) / 3).truncatingRemainder(dividingBy: 1)
    }

    /// A ring that grows from 0.8x to 1.5x scale and fades as `progress` approaches 1.
    private func ring(progress: Double) -> some View {
        let strokeColor = Color.white.opacity(0.6 - progress * 0.5)
        return Circle()
            .stroke(strokeColor, lineWidth: 1)
            .scaleEffect(0.8 + progress * 0.7)
            .opacity(0.6 - progress * 0.6)
    }
}
|
||||
|
||||
/// Cloud outline built from a pill-shaped base and three overlapping ellipses, all sized
/// as fractions of the drawing rectangle.
private struct CloudShape: Shape {
    func path(in rect: CGRect) -> Path {
        let width = rect.width
        let height = rect.height

        /// Ellipse bounds given as fractions of the rect (origin offsets and a square side).
        func puff(x: CGFloat, y: CGFloat, side: CGFloat) -> CGRect {
            CGRect(
                x: rect.minX + width * x,
                y: rect.minY + height * y,
                width: width * side,
                height: height * side)
        }

        let baseHeight = height * 0.44
        let cornerRadius = baseHeight / 2
        let base = CGRect(x: rect.minX, y: rect.minY + height * 0.46, width: width, height: baseHeight)

        return Path { outline in
            // Full-width pill along the bottom.
            outline.addRoundedRect(in: base, cornerSize: CGSize(width: cornerRadius, height: cornerRadius))
            // Left, top-centre and right puffs.
            outline.addEllipse(in: puff(x: 0.05, y: 0.28, side: 0.36))
            outline.addEllipse(in: puff(x: 0.28, y: 0.05, side: 0.44))
            outline.addEllipse(in: puff(x: 0.62, y: 0.3, side: 0.3))
        }
    }
}
|
||||
194
apps/shared/ClawdisKit/Sources/ClawdisKit/TalkDirective.swift
Normal file
194
apps/shared/ClawdisKit/Sources/ClawdisKit/TalkDirective.swift
Normal file
@@ -0,0 +1,194 @@
|
||||
import Foundation
|
||||
|
||||
/// Optional per-reply speech settings. Every field defaults to `nil`, so an empty
/// `TalkDirective()` carries no overrides.
public struct TalkDirective: Equatable, Sendable {
    // Voice and model selection.
    public var voiceId: String?
    public var modelId: String?

    // Pace: an explicit multiplier or a words-per-minute rate.
    public var speed: Double?
    public var rateWPM: Int?

    // Voice settings.
    public var stability: Double?
    public var similarity: Double?
    public var style: Double?
    public var speakerBoost: Bool?

    // Generation options.
    public var seed: Int?
    public var normalize: String?
    public var language: String?
    public var outputFormat: String?
    public var latencyTier: Int?

    // Scope flag.
    public var once: Bool?

    /// Creates a directive; any argument left out stays `nil`.
    public init(
        voiceId: String? = nil,
        modelId: String? = nil,
        speed: Double? = nil,
        rateWPM: Int? = nil,
        stability: Double? = nil,
        similarity: Double? = nil,
        style: Double? = nil,
        speakerBoost: Bool? = nil,
        seed: Int? = nil,
        normalize: String? = nil,
        language: String? = nil,
        outputFormat: String? = nil,
        latencyTier: Int? = nil,
        once: Bool? = nil)
    {
        // Selection
        self.voiceId = voiceId
        self.modelId = modelId
        // Pace
        self.speed = speed
        self.rateWPM = rateWPM
        // Voice settings
        self.stability = stability
        self.similarity = similarity
        self.style = style
        self.speakerBoost = speakerBoost
        // Generation options
        self.seed = seed
        self.normalize = normalize
        self.language = language
        self.outputFormat = outputFormat
        self.latencyTier = latencyTier
        // Scope
        self.once = once
    }
}
|
||||
|
||||
/// Outcome of `TalkDirectiveParser.parse(_:)`.
public struct TalkDirectiveParseResult: Equatable, Sendable {
    /// The parsed directive, or `nil` when the text did not start with one.
    public let directive: TalkDirective?
    /// The text with the directive line removed (the original text when `directive` is `nil`).
    public let stripped: String
    /// Keys found in the directive JSON that the parser does not recognize.
    public let unknownKeys: [String]

    /// Memberwise initializer exposed publicly.
    public init(directive: TalkDirective?, stripped: String, unknownKeys: [String]) {
        self.unknownKeys = unknownKeys
        self.stripped = stripped
        self.directive = directive
    }
}
|
||||
|
||||
/// Parses an optional single-line JSON directive from the start of a reply.
public enum TalkDirectiveParser {
    /// Lowercased spellings of every key the parser understands; used to report unknown keys.
    private static let knownKeys: Set<String> = [
        "voice", "voice_id", "voiceid",
        "model", "model_id", "modelid",
        "speed", "rate", "wpm",
        "stability", "similarity", "similarity_boost", "similarityboost",
        "style",
        "speaker_boost", "speakerboost",
        "no_speaker_boost", "nospeakerboost",
        "seed",
        "normalize", "apply_text_normalization",
        "lang", "language_code", "language",
        "output_format", "format",
        "latency", "latency_tier", "latencytier",
        "once",
    ]

    /// Extracts a directive from the first non-empty line of `text`.
    ///
    /// The line must be a JSON object (`{...}`) containing at least one recognized field.
    /// When it is, that line, plus one directly following blank line if present, is
    /// removed from the returned text. In every other case the text is returned untouched
    /// with a `nil` directive.
    ///
    /// - Parameter text: The full reply text.
    /// - Returns: The directive (if any), the remaining text, and the sorted list of JSON
    ///   keys that are not recognized.
    public static func parse(_ text: String) -> TalkDirectiveParseResult {
        let passthrough = TalkDirectiveParseResult(directive: nil, stripped: text, unknownKeys: [])

        let normalized = text.replacingOccurrences(of: "\r\n", with: "\n")
        var lines = normalized.split(separator: "\n", omittingEmptySubsequences: false)

        guard let headIndex = lines.firstIndex(where: {
            !$0.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty
        }) else { return passthrough }

        let head = lines[headIndex].trimmingCharacters(in: .whitespacesAndNewlines)
        guard head.hasPrefix("{"), head.hasSuffix("}"),
              let data = head.data(using: .utf8),
              let json = try? JSONSerialization.jsonObject(with: data) as? [String: Any]
        else { return passthrough }

        // `no_speaker_boost` is the inverse spelling; the positive key wins when both exist.
        let speakerBoost = boolValue(json, keys: ["speaker_boost", "speakerBoost"])
            ?? boolValue(json, keys: ["no_speaker_boost", "noSpeakerBoost"]).map { !$0 }

        let directive = TalkDirective(
            voiceId: stringValue(json, keys: ["voice", "voice_id", "voiceId"]),
            modelId: stringValue(json, keys: ["model", "model_id", "modelId"]),
            speed: doubleValue(json, keys: ["speed"]),
            rateWPM: intValue(json, keys: ["rate", "wpm"]),
            stability: doubleValue(json, keys: ["stability"]),
            similarity: doubleValue(json, keys: ["similarity", "similarity_boost", "similarityBoost"]),
            style: doubleValue(json, keys: ["style"]),
            speakerBoost: speakerBoost,
            seed: intValue(json, keys: ["seed"]),
            normalize: stringValue(json, keys: ["normalize", "apply_text_normalization"]),
            language: stringValue(json, keys: ["lang", "language_code", "language"]),
            outputFormat: stringValue(json, keys: ["output_format", "format"]),
            latencyTier: intValue(json, keys: ["latency", "latency_tier", "latencyTier"]),
            once: boolValue(json, keys: ["once"]))

        // An all-nil directive means no recognized field was present: leave the text alone.
        guard directive != TalkDirective() else { return passthrough }

        // NOTE(review): value lookups above are case-sensitive, while this check lowercases
        // keys, so e.g. "Voice" is neither parsed nor reported as unknown. Confirm intent.
        let unknownKeys = json.keys.filter { !Self.knownKeys.contains($0.lowercased()) }.sorted()

        lines.remove(at: headIndex)
        // Also drop a single blank separator line that directly followed the directive.
        if headIndex < lines.count,
           lines[headIndex].trimmingCharacters(in: .whitespacesAndNewlines).isEmpty
        {
            lines.remove(at: headIndex)
        }

        let stripped = lines.joined(separator: "\n")
        return TalkDirectiveParseResult(directive: directive, stripped: stripped, unknownKeys: unknownKeys)
    }

    /// First non-blank string found under any of `keys`, trimmed.
    private static func stringValue(_ dict: [String: Any], keys: [String]) -> String? {
        for key in keys {
            guard let value = dict[key] as? String else { continue }
            let trimmed = value.trimmingCharacters(in: .whitespacesAndNewlines)
            if !trimmed.isEmpty { return trimmed }
        }
        return nil
    }

    /// First finite number found under any of `keys`; numeric strings are accepted.
    ///
    /// Non-finite values (e.g. the strings "nan" or "inf", which `Double.init` parses) are
    /// skipped: they are not meaningful settings and cannot be encoded back to JSON.
    private static func doubleValue(_ dict: [String: Any], keys: [String]) -> Double? {
        for key in keys {
            if let value = dict[key] as? Double, value.isFinite { return value }
            if let value = dict[key] as? Int { return Double(value) }
            if let value = dict[key] as? String, let parsed = Double(value), parsed.isFinite { return parsed }
        }
        return nil
    }

    /// First integer found under any of `keys`; fractional numbers are truncated toward
    /// zero and integer strings are accepted.
    ///
    /// `Int(exactly:)` is used for the floating-point case because `Int(_:)` traps on NaN,
    /// infinity and out-of-range magnitudes (e.g. `{"seed":1e30}`); such values are now
    /// skipped instead of crashing the process.
    private static func intValue(_ dict: [String: Any], keys: [String]) -> Int? {
        for key in keys {
            if let value = dict[key] as? Int { return value }
            if let value = dict[key] as? Double,
               let whole = Int(exactly: value.rounded(.towardZero))
            {
                return whole
            }
            if let value = dict[key] as? String, let parsed = Int(value) { return parsed }
        }
        return nil
    }

    /// First boolean found under any of `keys`; the strings true/yes/1 and false/no/0
    /// (case-insensitive, trimmed) are accepted as well.
    private static func boolValue(_ dict: [String: Any], keys: [String]) -> Bool? {
        for key in keys {
            if let value = dict[key] as? Bool { return value }
            if let value = dict[key] as? String {
                let trimmed = value.trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
                if ["true", "yes", "1"].contains(trimmed) { return true }
                if ["false", "no", "0"].contains(trimmed) { return false }
            }
        }
        return nil
    }
}
|
||||
@@ -0,0 +1,62 @@
|
||||
import XCTest
|
||||
@testable import ClawdisKit
|
||||
|
||||
/// Unit tests for `TalkDirectiveParser.parse(_:)`.
final class TalkDirectiveTests: XCTestCase {
    /// A leading JSON line with known keys is parsed and removed from the text.
    func testParsesDirectiveAndStripsLine() {
        let input = """
        {"voice":"abc123","once":true}
        Hello there.
        """
        let parsed = TalkDirectiveParser.parse(input)
        let directive = parsed.directive

        XCTAssertEqual(directive?.voiceId, "abc123")
        XCTAssertEqual(directive?.once, true)
        XCTAssertEqual(parsed.stripped, "Hello there.")
    }

    /// Plain text without a directive is returned unchanged.
    func testIgnoresNonDirective() {
        let input = "Hello world."
        let parsed = TalkDirectiveParser.parse(input)

        XCTAssertNil(parsed.directive)
        XCTAssertEqual(parsed.stripped, input)
    }

    /// A JSON line with no recognized keys is not treated as a directive.
    func testKeepsDirectiveLineIfNoRecognizedFields() {
        let input = """
        {"unknown":"value"}
        Hello.
        """
        let parsed = TalkDirectiveParser.parse(input)

        XCTAssertNil(parsed.directive)
        XCTAssertEqual(parsed.stripped, input)
    }

    /// Every extended option is picked up from its snake_case or short spelling.
    func testParsesExtendedOptions() {
        let input = """
        {"voice_id":"v1","model_id":"m1","rate":200,"stability":0.5,"similarity":0.8,"style":0.2,"speaker_boost":true,"seed":1234,"normalize":"auto","lang":"en","output_format":"mp3_44100_128"}
        Hello.
        """
        let parsed = TalkDirectiveParser.parse(input)
        let directive = parsed.directive

        XCTAssertEqual(directive?.voiceId, "v1")
        XCTAssertEqual(directive?.modelId, "m1")
        XCTAssertEqual(directive?.rateWPM, 200)
        XCTAssertEqual(directive?.stability, 0.5)
        XCTAssertEqual(directive?.similarity, 0.8)
        XCTAssertEqual(directive?.style, 0.2)
        XCTAssertEqual(directive?.speakerBoost, true)
        XCTAssertEqual(directive?.seed, 1234)
        XCTAssertEqual(directive?.normalize, "auto")
        XCTAssertEqual(directive?.language, "en")
        XCTAssertEqual(directive?.outputFormat, "mp3_44100_128")
        XCTAssertEqual(parsed.stripped, "Hello.")
    }

    /// Unrecognized keys are reported in sorted order alongside the parsed directive.
    func testTracksUnknownKeys() {
        let input = """
        {"voice":"abc","mystery":"value","extra":1}
        Hi.
        """
        let parsed = TalkDirectiveParser.parse(input)

        XCTAssertEqual(parsed.directive?.voiceId, "abc")
        XCTAssertEqual(parsed.unknownKeys, ["extra", "mystery"])
    }
}
|
||||
Reference in New Issue
Block a user