feat: add talk voice alias map

This commit is contained in:
Peter Steinberger
2025-12-30 11:35:29 +01:00
parent ab27586674
commit 2814815312
7 changed files with 237 additions and 35 deletions

View File

@@ -61,6 +61,12 @@ class TalkModeManager(
private val _statusText = MutableStateFlow("Off") private val _statusText = MutableStateFlow("Off")
val statusText: StateFlow<String> = _statusText val statusText: StateFlow<String> = _statusText
private val _lastAssistantText = MutableStateFlow<String?>(null)
val lastAssistantText: StateFlow<String?> = _lastAssistantText
private val _usingFallbackTts = MutableStateFlow(false)
val usingFallbackTts: StateFlow<Boolean> = _usingFallbackTts
private var recognizer: SpeechRecognizer? = null private var recognizer: SpeechRecognizer? = null
private var restartJob: Job? = null private var restartJob: Job? = null
private var stopRequested = false private var stopRequested = false
@@ -79,6 +85,7 @@ class TalkModeManager(
private var currentModelId: String? = null private var currentModelId: String? = null
private var defaultOutputFormat: String? = null private var defaultOutputFormat: String? = null
private var apiKey: String? = null private var apiKey: String? = null
private var voiceAliases: Map<String, String> = emptyMap()
private var interruptOnSpeech: Boolean = true private var interruptOnSpeech: Boolean = true
private var voiceOverrideActive = false private var voiceOverrideActive = false
private var modelOverrideActive = false private var modelOverrideActive = false
@@ -179,6 +186,7 @@ class TalkModeManager(
_isListening.value = false _isListening.value = false
_statusText.value = "Off" _statusText.value = "Off"
stopSpeaking() stopSpeaking()
_usingFallbackTts.value = false
chatSubscribedSessionKey = null chatSubscribedSessionKey = null
mainHandler.post { mainHandler.post {
@@ -334,7 +342,7 @@ class TalkModeManager(
private fun buildPrompt(transcript: String): String { private fun buildPrompt(transcript: String): String {
val lines = mutableListOf( val lines = mutableListOf(
"Talk Mode active. Reply in a concise, spoken tone.", "Talk Mode active. Reply in a concise, spoken tone.",
"You may optionally prefix the response with JSON (first line) to set ElevenLabs voice, e.g. {\"voice\":\"<id>\",\"once\":true}.", "You may optionally prefix the response with JSON (first line) to set ElevenLabs voice (id or alias), e.g. {\"voice\":\"<id>\",\"once\":true}.",
) )
lastInterruptedAtSeconds?.let { lastInterruptedAtSeconds?.let {
lines.add("Assistant speech interrupted at ${"%.1f".format(it)}s.") lines.add("Assistant speech interrupted at ${"%.1f".format(it)}s.")
@@ -432,10 +440,17 @@ class TalkModeManager(
val directive = parsed.directive val directive = parsed.directive
val cleaned = parsed.stripped.trim() val cleaned = parsed.stripped.trim()
if (cleaned.isEmpty()) return if (cleaned.isEmpty()) return
_lastAssistantText.value = cleaned
val requestedVoice = directive?.voiceId?.trim()?.takeIf { it.isNotEmpty() }
val resolvedVoice = resolveVoiceAlias(requestedVoice)
if (requestedVoice != null && resolvedVoice == null) {
Log.w(tag, "unknown voice alias: $requestedVoice")
}
if (directive?.voiceId != null) { if (directive?.voiceId != null) {
if (directive.once != true) { if (directive.once != true) {
currentVoiceId = directive.voiceId currentVoiceId = resolvedVoice
voiceOverrideActive = true voiceOverrideActive = true
} }
} }
@@ -449,7 +464,7 @@ class TalkModeManager(
val apiKey = val apiKey =
apiKey?.trim()?.takeIf { it.isNotEmpty() } apiKey?.trim()?.takeIf { it.isNotEmpty() }
?: System.getenv("ELEVENLABS_API_KEY")?.trim() ?: System.getenv("ELEVENLABS_API_KEY")?.trim()
val voiceId = directive?.voiceId ?: currentVoiceId ?: defaultVoiceId val voiceId = resolvedVoice ?: currentVoiceId ?: defaultVoiceId
_statusText.value = "Speaking…" _statusText.value = "Speaking…"
_isSpeaking.value = true _isSpeaking.value = true
@@ -465,9 +480,11 @@ class TalkModeManager(
if (apiKey.isNullOrEmpty()) { if (apiKey.isNullOrEmpty()) {
Log.w(tag, "missing ELEVENLABS_API_KEY; falling back to system voice") Log.w(tag, "missing ELEVENLABS_API_KEY; falling back to system voice")
} }
_usingFallbackTts.value = true
_statusText.value = "Speaking (System)…" _statusText.value = "Speaking (System)…"
speakWithSystemTts(cleaned) speakWithSystemTts(cleaned)
} else { } else {
_usingFallbackTts.value = false
val ttsStarted = SystemClock.elapsedRealtime() val ttsStarted = SystemClock.elapsedRealtime()
val request = val request =
ElevenLabsRequest( ElevenLabsRequest(
@@ -491,6 +508,7 @@ class TalkModeManager(
} catch (err: Throwable) { } catch (err: Throwable) {
Log.w(tag, "speak failed: ${err.message ?: err::class.simpleName}; falling back to system voice") Log.w(tag, "speak failed: ${err.message ?: err::class.simpleName}; falling back to system voice")
try { try {
_usingFallbackTts.value = true
_statusText.value = "Speaking (System)…" _statusText.value = "Speaking (System)…"
speakWithSystemTts(cleaned) speakWithSystemTts(cleaned)
} catch (fallbackErr: Throwable) { } catch (fallbackErr: Throwable) {
@@ -681,6 +699,11 @@ class TalkModeManager(
val sessionCfg = config?.get("session").asObjectOrNull() val sessionCfg = config?.get("session").asObjectOrNull()
val mainKey = sessionCfg?.get("mainKey").asStringOrNull()?.trim()?.takeIf { it.isNotEmpty() } ?: "main" val mainKey = sessionCfg?.get("mainKey").asStringOrNull()?.trim()?.takeIf { it.isNotEmpty() } ?: "main"
val voice = talk?.get("voiceId")?.asStringOrNull()?.trim()?.takeIf { it.isNotEmpty() } val voice = talk?.get("voiceId")?.asStringOrNull()?.trim()?.takeIf { it.isNotEmpty() }
val aliases =
talk?.get("voiceAliases").asObjectOrNull()?.entries?.mapNotNull { (key, value) ->
val id = value.asStringOrNull()?.trim()?.takeIf { it.isNotEmpty() } ?: return@mapNotNull null
normalizeAliasKey(key).takeIf { it.isNotEmpty() }?.let { it to id }
}?.toMap().orEmpty()
val model = talk?.get("modelId")?.asStringOrNull()?.trim()?.takeIf { it.isNotEmpty() } val model = talk?.get("modelId")?.asStringOrNull()?.trim()?.takeIf { it.isNotEmpty() }
val outputFormat = talk?.get("outputFormat")?.asStringOrNull()?.trim()?.takeIf { it.isNotEmpty() } val outputFormat = talk?.get("outputFormat")?.asStringOrNull()?.trim()?.takeIf { it.isNotEmpty() }
val key = talk?.get("apiKey")?.asStringOrNull()?.trim()?.takeIf { it.isNotEmpty() } val key = talk?.get("apiKey")?.asStringOrNull()?.trim()?.takeIf { it.isNotEmpty() }
@@ -688,6 +711,7 @@ class TalkModeManager(
mainSessionKey = mainKey mainSessionKey = mainKey
defaultVoiceId = voice ?: envVoice?.takeIf { it.isNotEmpty() } ?: sagVoice?.takeIf { it.isNotEmpty() } defaultVoiceId = voice ?: envVoice?.takeIf { it.isNotEmpty() } ?: sagVoice?.takeIf { it.isNotEmpty() }
voiceAliases = aliases
if (!voiceOverrideActive) currentVoiceId = defaultVoiceId if (!voiceOverrideActive) currentVoiceId = defaultVoiceId
defaultModelId = model defaultModelId = model
if (!modelOverrideActive) currentModelId = defaultModelId if (!modelOverrideActive) currentModelId = defaultModelId
@@ -697,6 +721,7 @@ class TalkModeManager(
} catch (_: Throwable) { } catch (_: Throwable) {
defaultVoiceId = envVoice?.takeIf { it.isNotEmpty() } ?: sagVoice?.takeIf { it.isNotEmpty() } defaultVoiceId = envVoice?.takeIf { it.isNotEmpty() } ?: sagVoice?.takeIf { it.isNotEmpty() }
apiKey = envKey?.takeIf { it.isNotEmpty() } apiKey = envKey?.takeIf { it.isNotEmpty() }
voiceAliases = emptyMap()
} }
} }
@@ -842,6 +867,23 @@ class TalkModeManager(
} }
} }
private fun resolveVoiceAlias(value: String?): String? {
val trimmed = value?.trim().orEmpty()
if (trimmed.isEmpty()) return null
val normalized = normalizeAliasKey(trimmed)
voiceAliases[normalized]?.let { return it }
if (voiceAliases.values.any { it.equals(trimmed, ignoreCase = true) }) return trimmed
return if (isLikelyVoiceId(trimmed)) trimmed else null
}
private fun isLikelyVoiceId(value: String): Boolean {
if (value.length < 10) return false
return value.all { it.isLetterOrDigit() || it == '-' || it == '_' }
}
private fun normalizeAliasKey(value: String): String =
value.trim().lowercase()
private val listener = private val listener =
object : RecognitionListener { object : RecognitionListener {
override fun onReadyForSpeech(params: Bundle?) { override fun onReadyForSpeech(params: Bundle?) {

View File

@@ -33,6 +33,7 @@ final class TalkModeManager: NSObject {
private var modelOverrideActive = false private var modelOverrideActive = false
private var defaultOutputFormat: String? private var defaultOutputFormat: String?
private var apiKey: String? private var apiKey: String?
private var voiceAliases: [String: String] = [:]
private var interruptOnSpeech: Bool = true private var interruptOnSpeech: Bool = true
private var mainSessionKey: String = "main" private var mainSessionKey: String = "main"
@@ -419,7 +420,12 @@ final class TalkModeManager: NSObject {
let cleaned = parsed.stripped.trimmingCharacters(in: .whitespacesAndNewlines) let cleaned = parsed.stripped.trimmingCharacters(in: .whitespacesAndNewlines)
guard !cleaned.isEmpty else { return } guard !cleaned.isEmpty else { return }
if let voice = directive?.voiceId { let requestedVoice = directive?.voiceId?.trimmingCharacters(in: .whitespacesAndNewlines)
let resolvedVoice = self.resolveVoiceAlias(requestedVoice)
if requestedVoice?.isEmpty == false, resolvedVoice == nil {
self.logger.warning("unknown voice alias \(requestedVoice ?? "?", privacy: .public)")
}
if let voice = resolvedVoice {
if directive?.once != true { if directive?.once != true {
self.currentVoiceId = voice self.currentVoiceId = voice
self.voiceOverrideActive = true self.voiceOverrideActive = true
@@ -440,8 +446,7 @@ final class TalkModeManager: NSObject {
let started = Date() let started = Date()
let language = ElevenLabsTTSClient.validatedLanguage(directive?.language) let language = ElevenLabsTTSClient.validatedLanguage(directive?.language)
let voiceId = (directive?.voiceId ?? self.currentVoiceId ?? self.defaultVoiceId)? let voiceId = resolvedVoice ?? self.currentVoiceId ?? self.defaultVoiceId
.trimmingCharacters(in: .whitespacesAndNewlines)
let resolvedKey = let resolvedKey =
(self.apiKey?.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty == false ? self.apiKey : nil) ?? (self.apiKey?.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty == false ? self.apiKey : nil) ??
ProcessInfo.processInfo.environment["ELEVENLABS_API_KEY"] ProcessInfo.processInfo.environment["ELEVENLABS_API_KEY"]
@@ -565,6 +570,22 @@ final class TalkModeManager: NSObject {
return true return true
} }
private func resolveVoiceAlias(_ value: String?) -> String? {
let trimmed = (value ?? "").trimmingCharacters(in: .whitespacesAndNewlines)
guard !trimmed.isEmpty else { return nil }
let normalized = trimmed.lowercased()
if let mapped = self.voiceAliases[normalized] { return mapped }
if self.voiceAliases.values.contains(where: { $0.caseInsensitiveCompare(trimmed) == .orderedSame }) {
return trimmed
}
return Self.isLikelyVoiceId(trimmed) ? trimmed : nil
}
private static func isLikelyVoiceId(_ value: String) -> Bool {
guard value.count >= 10 else { return false }
return value.allSatisfy { $0.isLetter || $0.isNumber || $0 == "-" || $0 == "_" }
}
private func reloadConfig() async { private func reloadConfig() async {
guard let bridge else { return } guard let bridge else { return }
do { do {
@@ -576,6 +597,19 @@ final class TalkModeManager: NSObject {
let rawMainKey = (session?["mainKey"] as? String)?.trimmingCharacters(in: .whitespacesAndNewlines) ?? "" let rawMainKey = (session?["mainKey"] as? String)?.trimmingCharacters(in: .whitespacesAndNewlines) ?? ""
self.mainSessionKey = rawMainKey.isEmpty ? "main" : rawMainKey self.mainSessionKey = rawMainKey.isEmpty ? "main" : rawMainKey
self.defaultVoiceId = (talk?["voiceId"] as? String)?.trimmingCharacters(in: .whitespacesAndNewlines) self.defaultVoiceId = (talk?["voiceId"] as? String)?.trimmingCharacters(in: .whitespacesAndNewlines)
if let aliases = talk?["voiceAliases"] as? [String: Any] {
self.voiceAliases =
aliases.compactMap { key, value in
guard let id = value as? String else { return nil }
let normalizedKey = key.trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
let trimmedId = id.trimmingCharacters(in: .whitespacesAndNewlines)
guard !normalizedKey.isEmpty, !trimmedId.isEmpty else { return nil }
return (normalizedKey, trimmedId)
}
.reduce(into: [:]) { $0[$1.0] = $1.1 }
} else {
self.voiceAliases = [:]
}
if !self.voiceOverrideActive { if !self.voiceOverrideActive {
self.currentVoiceId = self.defaultVoiceId self.currentVoiceId = self.defaultVoiceId
} }

View File

@@ -41,6 +41,7 @@ actor TalkModeRuntime {
private var silenceTask: Task<Void, Never>? private var silenceTask: Task<Void, Never>?
private var phase: TalkModePhase = .idle private var phase: TalkModePhase = .idle
private var isEnabled = false private var isEnabled = false
private var isPaused = false
private var lifecycleGeneration: Int = 0 private var lifecycleGeneration: Int = 0
private var lastHeard: Date? private var lastHeard: Date?
@@ -57,6 +58,7 @@ actor TalkModeRuntime {
private var defaultOutputFormat: String? private var defaultOutputFormat: String?
private var interruptOnSpeech: Bool = true private var interruptOnSpeech: Bool = true
private var lastInterruptedAtSeconds: Double? private var lastInterruptedAtSeconds: Double?
private var voiceAliases: [String: String] = [:]
private var lastSpokenText: String? private var lastSpokenText: String?
private var apiKey: String? private var apiKey: String?
private var fallbackVoiceId: String? private var fallbackVoiceId: String?
@@ -78,6 +80,29 @@ actor TalkModeRuntime {
} }
} }
func setPaused(_ paused: Bool) async {
guard paused != self.isPaused else { return }
self.isPaused = paused
await MainActor.run { TalkModeController.shared.updateLevel(0) }
guard self.isEnabled else { return }
if paused {
self.lastTranscript = ""
self.lastHeard = nil
self.lastSpeechEnergyAt = nil
await self.stopRecognition()
return
}
if self.phase == .idle || self.phase == .listening {
await self.startRecognition()
self.phase = .listening
await MainActor.run { TalkModeController.shared.updatePhase(.listening) }
self.startSilenceMonitor()
}
}
private func isCurrent(_ generation: Int) -> Bool { private func isCurrent(_ generation: Int) -> Bool {
generation == self.lifecycleGeneration && self.isEnabled generation == self.lifecycleGeneration && self.isEnabled
} }
@@ -91,6 +116,14 @@ actor TalkModeRuntime {
} }
await self.reloadConfig() await self.reloadConfig()
guard self.isCurrent(gen) else { return } guard self.isCurrent(gen) else { return }
if self.isPaused {
self.phase = .idle
await MainActor.run {
TalkModeController.shared.updateLevel(0)
TalkModeController.shared.updatePhase(.idle)
}
return
}
await self.startRecognition() await self.startRecognition()
guard self.isCurrent(gen) else { return } guard self.isCurrent(gen) else { return }
self.phase = .listening self.phase = .listening
@@ -211,6 +244,7 @@ actor TalkModeRuntime {
private func handleRecognition(_ update: RecognitionUpdate) async { private func handleRecognition(_ update: RecognitionUpdate) async {
guard update.generation == self.recognitionGeneration else { return } guard update.generation == self.recognitionGeneration else { return }
guard !self.isPaused else { return }
if let errorDescription = update.errorDescription { if let errorDescription = update.errorDescription {
self.logger.debug("talk recognition error: \(errorDescription, privacy: .public)") self.logger.debug("talk recognition error: \(errorDescription, privacy: .public)")
} }
@@ -256,6 +290,7 @@ actor TalkModeRuntime {
} }
private func checkSilence() async { private func checkSilence() async {
guard !self.isPaused else { return }
guard self.phase == .listening else { return } guard self.phase == .listening else { return }
let transcript = self.lastTranscript.trimmingCharacters(in: .whitespacesAndNewlines) let transcript = self.lastTranscript.trimmingCharacters(in: .whitespacesAndNewlines)
guard !transcript.isEmpty else { return } guard !transcript.isEmpty else { return }
@@ -292,11 +327,10 @@ actor TalkModeRuntime {
guard self.isCurrent(gen) else { return } guard self.isCurrent(gen) else { return }
let prompt = self.buildPrompt(transcript: transcript) let prompt = self.buildPrompt(transcript: transcript)
let activeSessionKey = await MainActor.run { WebChatManager.shared.activeSessionKey } let activeSessionKey = await MainActor.run { WebChatManager.shared.activeSessionKey }
let sessionKey: String let sessionKey: String = if let activeSessionKey {
if let activeSessionKey { activeSessionKey
sessionKey = activeSessionKey
} else { } else {
sessionKey = await GatewayConnection.shared.mainSessionKey() await GatewayConnection.shared.mainSessionKey()
} }
let runId = UUID().uuidString let runId = UUID().uuidString
let startedAt = Date().timeIntervalSince1970 let startedAt = Date().timeIntervalSince1970
@@ -329,17 +363,29 @@ actor TalkModeRuntime {
self.logger.info("talk assistant text len=\(assistantText.count, privacy: .public)") self.logger.info("talk assistant text len=\(assistantText.count, privacy: .public)")
await self.playAssistant(text: assistantText) await self.playAssistant(text: assistantText)
guard self.isCurrent(gen) else { return } guard self.isCurrent(gen) else { return }
await self.startListening() await self.resumeListeningIfNeeded()
await self.startRecognition()
return return
} catch { } catch {
self.logger.error("talk chat.send failed: \(error.localizedDescription, privacy: .public)") self.logger.error("talk chat.send failed: \(error.localizedDescription, privacy: .public)")
await self.startListening() await self.resumeListeningIfNeeded()
await self.startRecognition()
return return
} }
} }
private func resumeListeningIfNeeded() async {
if self.isPaused {
self.lastTranscript = ""
self.lastHeard = nil
self.lastSpeechEnergyAt = nil
await MainActor.run {
TalkModeController.shared.updateLevel(0)
}
return
}
await self.startListening()
await self.startRecognition()
}
private func buildPrompt(transcript: String) -> String { private func buildPrompt(transcript: String) -> String {
let interrupted = self.lastInterruptedAtSeconds let interrupted = self.lastInterruptedAtSeconds
self.lastInterruptedAtSeconds = nil self.lastInterruptedAtSeconds = nil
@@ -376,7 +422,7 @@ actor TalkModeRuntime {
return TalkHistoryTimestamp.isAfter(timestamp, sinceSeconds: since) return TalkHistoryTimestamp.isAfter(timestamp, sinceSeconds: since)
} }
guard let assistant else { return nil } guard let assistant else { return nil }
let text = assistant.content.compactMap { $0.text }.joined(separator: "\n") let text = assistant.content.compactMap(\.text).joined(separator: "\n")
let trimmed = text.trimmingCharacters(in: CharacterSet.whitespacesAndNewlines) let trimmed = text.trimmingCharacters(in: CharacterSet.whitespacesAndNewlines)
return trimmed.isEmpty ? nil : trimmed return trimmed.isEmpty ? nil : trimmed
} catch { } catch {
@@ -394,10 +440,16 @@ actor TalkModeRuntime {
guard self.isCurrent(gen) else { return } guard self.isCurrent(gen) else { return }
if !parse.unknownKeys.isEmpty { if !parse.unknownKeys.isEmpty {
self.logger.warning("talk directive ignored keys: \(parse.unknownKeys.joined(separator: ","), privacy: .public)") self.logger
.warning("talk directive ignored keys: \(parse.unknownKeys.joined(separator: ","), privacy: .public)")
} }
if let voice = directive?.voiceId { let requestedVoice = directive?.voiceId?.trimmingCharacters(in: .whitespacesAndNewlines)
let resolvedVoice = self.resolveVoiceAlias(requestedVoice)
if let requestedVoice, !requestedVoice.isEmpty, resolvedVoice == nil {
self.logger.warning("talk unknown voice alias \(requestedVoice, privacy: .public)")
}
if let voice = resolvedVoice {
if directive?.once == true { if directive?.once == true {
self.logger.info("talk voice override (once) voiceId=\(voice, privacy: .public)") self.logger.info("talk voice override (once) voiceId=\(voice, privacy: .public)")
} else { } else {
@@ -417,18 +469,17 @@ actor TalkModeRuntime {
} }
let apiKey = self.apiKey?.trimmingCharacters(in: .whitespacesAndNewlines) let apiKey = self.apiKey?.trimmingCharacters(in: .whitespacesAndNewlines)
let requestedVoice = let preferredVoice =
directive?.voiceId ?? resolvedVoice ??
self.currentVoiceId ?? self.currentVoiceId ??
self.defaultVoiceId self.defaultVoiceId
let language = ElevenLabsTTSClient.validatedLanguage(directive?.language) let language = ElevenLabsTTSClient.validatedLanguage(directive?.language)
let voiceId: String? let voiceId: String? = if let apiKey, !apiKey.isEmpty {
if let apiKey, !apiKey.isEmpty { await self.resolveVoiceId(preferred: preferredVoice, apiKey: apiKey)
voiceId = await self.resolveVoiceId(preferred: requestedVoice, apiKey: apiKey)
} else { } else {
voiceId = nil nil
} }
if apiKey?.isEmpty != false { if apiKey?.isEmpty != false {
@@ -436,7 +487,8 @@ actor TalkModeRuntime {
} else if voiceId == nil { } else if voiceId == nil {
self.ttsLogger.warning("talk missing voiceId; falling back to system voice") self.ttsLogger.warning("talk missing voiceId; falling back to system voice")
} else if let voiceId { } else if let voiceId {
self.ttsLogger.info("talk TTS request voiceId=\(voiceId, privacy: .public) chars=\(cleaned.count, privacy: .public)") self.ttsLogger
.info("talk TTS request voiceId=\(voiceId, privacy: .public) chars=\(cleaned.count, privacy: .public)")
} }
self.lastSpokenText = cleaned self.lastSpokenText = cleaned
@@ -447,7 +499,9 @@ actor TalkModeRuntime {
let desiredOutputFormat = directive?.outputFormat ?? self.defaultOutputFormat let desiredOutputFormat = directive?.outputFormat ?? self.defaultOutputFormat
let outputFormat = ElevenLabsTTSClient.validatedOutputFormat(desiredOutputFormat) let outputFormat = ElevenLabsTTSClient.validatedOutputFormat(desiredOutputFormat)
if outputFormat == nil, let desiredOutputFormat, !desiredOutputFormat.isEmpty { if outputFormat == nil, let desiredOutputFormat, !desiredOutputFormat.isEmpty {
self.logger.warning("talk output_format unsupported for local playback: \(desiredOutputFormat, privacy: .public)") self.logger
.warning(
"talk output_format unsupported for local playback: \(desiredOutputFormat, privacy: .public)")
} }
let request = ElevenLabsTTSRequest( let request = ElevenLabsTTSRequest(
@@ -481,7 +535,9 @@ actor TalkModeRuntime {
self.phase = .speaking self.phase = .speaking
let result = await TalkAudioPlayer.shared.play(data: audio) let result = await TalkAudioPlayer.shared.play(data: audio)
self.ttsLogger.info("talk audio result finished=\(result.finished, privacy: .public) interruptedAt=\(String(describing: result.interruptedAt), privacy: .public)") self.ttsLogger
.info(
"talk audio result finished=\(result.finished, privacy: .public) interruptedAt=\(String(describing: result.interruptedAt), privacy: .public)")
if !result.finished, result.interruptedAt == nil { if !result.finished, result.interruptedAt == nil {
throw NSError(domain: "TalkAudioPlayer", code: 1, userInfo: [ throw NSError(domain: "TalkAudioPlayer", code: 1, userInfo: [
NSLocalizedDescriptionKey: "audio playback failed", NSLocalizedDescriptionKey: "audio playback failed",
@@ -505,7 +561,8 @@ actor TalkModeRuntime {
self.ttsLogger.info("talk system voice done") self.ttsLogger.info("talk system voice done")
} }
} catch { } catch {
self.ttsLogger.error("talk TTS failed: \(error.localizedDescription, privacy: .public); falling back to system voice") self.ttsLogger
.error("talk TTS failed: \(error.localizedDescription, privacy: .public); falling back to system voice")
do { do {
if self.interruptOnSpeech { if self.interruptOnSpeech {
await self.startRecognition() await self.startRecognition()
@@ -528,7 +585,10 @@ actor TalkModeRuntime {
private func resolveVoiceId(preferred: String?, apiKey: String) async -> String? { private func resolveVoiceId(preferred: String?, apiKey: String) async -> String? {
let trimmed = preferred?.trimmingCharacters(in: .whitespacesAndNewlines) ?? "" let trimmed = preferred?.trimmingCharacters(in: .whitespacesAndNewlines) ?? ""
if !trimmed.isEmpty { return trimmed } if !trimmed.isEmpty {
if let resolved = self.resolveVoiceAlias(trimmed) { return resolved }
self.ttsLogger.warning("talk unknown voice alias \(trimmed, privacy: .public)")
}
if let fallbackVoiceId { return fallbackVoiceId } if let fallbackVoiceId { return fallbackVoiceId }
do { do {
@@ -545,7 +605,8 @@ actor TalkModeRuntime {
self.currentVoiceId = first.voiceId self.currentVoiceId = first.voiceId
} }
let name = first.name ?? "unknown" let name = first.name ?? "unknown"
self.ttsLogger.info("talk default voice selected \(name, privacy: .public) (\(first.voiceId, privacy: .public))") self.ttsLogger
.info("talk default voice selected \(name, privacy: .public) (\(first.voiceId, privacy: .public))")
return first.voiceId return first.voiceId
} catch { } catch {
self.ttsLogger.error("elevenlabs list voices failed: \(error.localizedDescription, privacy: .public)") self.ttsLogger.error("elevenlabs list voices failed: \(error.localizedDescription, privacy: .public)")
@@ -553,6 +614,22 @@ actor TalkModeRuntime {
} }
} }
private func resolveVoiceAlias(_ value: String?) -> String? {
let trimmed = (value ?? "").trimmingCharacters(in: .whitespacesAndNewlines)
guard !trimmed.isEmpty else { return nil }
let normalized = trimmed.lowercased()
if let mapped = self.voiceAliases[normalized] { return mapped }
if self.voiceAliases.values.contains(where: { $0.caseInsensitiveCompare(trimmed) == .orderedSame }) {
return trimmed
}
return Self.isLikelyVoiceId(trimmed) ? trimmed : nil
}
private static func isLikelyVoiceId(_ value: String) -> Bool {
guard value.count >= 10 else { return false }
return value.allSatisfy { $0.isLetter || $0.isNumber || $0 == "-" || $0 == "_" }
}
func stopSpeaking(reason: TalkStopReason) async { func stopSpeaking(reason: TalkStopReason) async {
let interruptedAt = await MainActor.run { TalkAudioPlayer.shared.stop() } let interruptedAt = await MainActor.run { TalkAudioPlayer.shared.stop() }
await TalkSystemSpeechSynthesizer.shared.stop() await TalkSystemSpeechSynthesizer.shared.stop()
@@ -576,6 +653,7 @@ actor TalkModeRuntime {
private func reloadConfig() async { private func reloadConfig() async {
let cfg = await self.fetchTalkConfig() let cfg = await self.fetchTalkConfig()
self.defaultVoiceId = cfg.voiceId self.defaultVoiceId = cfg.voiceId
self.voiceAliases = cfg.voiceAliases
if !self.voiceOverrideActive { if !self.voiceOverrideActive {
self.currentVoiceId = cfg.voiceId self.currentVoiceId = cfg.voiceId
} }
@@ -589,11 +667,14 @@ actor TalkModeRuntime {
let hasApiKey = (cfg.apiKey?.isEmpty == false) let hasApiKey = (cfg.apiKey?.isEmpty == false)
let voiceLabel = (cfg.voiceId?.isEmpty == false) ? cfg.voiceId! : "none" let voiceLabel = (cfg.voiceId?.isEmpty == false) ? cfg.voiceId! : "none"
let modelLabel = (cfg.modelId?.isEmpty == false) ? cfg.modelId! : "none" let modelLabel = (cfg.modelId?.isEmpty == false) ? cfg.modelId! : "none"
self.logger.info("talk config voiceId=\(voiceLabel, privacy: .public) modelId=\(modelLabel, privacy: .public) apiKey=\(hasApiKey, privacy: .public) interrupt=\(cfg.interruptOnSpeech, privacy: .public)") self.logger
.info(
"talk config voiceId=\(voiceLabel, privacy: .public) modelId=\(modelLabel, privacy: .public) apiKey=\(hasApiKey, privacy: .public) interrupt=\(cfg.interruptOnSpeech, privacy: .public)")
} }
private struct TalkRuntimeConfig { private struct TalkRuntimeConfig {
let voiceId: String? let voiceId: String?
let voiceAliases: [String: String]
let modelId: String? let modelId: String?
let outputFormat: String? let outputFormat: String?
let interruptOnSpeech: Bool let interruptOnSpeech: Bool
@@ -618,6 +699,14 @@ actor TalkModeRuntime {
AppStateStore.shared.seamColorHex = rawSeam.isEmpty ? nil : rawSeam AppStateStore.shared.seamColorHex = rawSeam.isEmpty ? nil : rawSeam
} }
let voice = talk?["voiceId"]?.stringValue let voice = talk?["voiceId"]?.stringValue
let rawAliases = talk?["voiceAliases"]?.dictionaryValue
let resolvedAliases: [String: String] =
rawAliases?.reduce(into: [:]) { acc, entry in
let key = entry.key.trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
let value = entry.value.stringValue?.trimmingCharacters(in: .whitespacesAndNewlines) ?? ""
guard !key.isEmpty, !value.isEmpty else { return }
acc[key] = value
} ?? [:]
let model = talk?["modelId"]?.stringValue let model = talk?["modelId"]?.stringValue
let outputFormat = talk?["outputFormat"]?.stringValue let outputFormat = talk?["outputFormat"]?.stringValue
let interrupt = talk?["interruptOnSpeech"]?.boolValue let interrupt = talk?["interruptOnSpeech"]?.boolValue
@@ -631,6 +720,7 @@ actor TalkModeRuntime {
(apiKey?.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty == false ? apiKey : nil) (apiKey?.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty == false ? apiKey : nil)
return TalkRuntimeConfig( return TalkRuntimeConfig(
voiceId: resolvedVoice, voiceId: resolvedVoice,
voiceAliases: resolvedAliases,
modelId: model, modelId: model,
outputFormat: outputFormat, outputFormat: outputFormat,
interruptOnSpeech: interrupt ?? true, interruptOnSpeech: interrupt ?? true,
@@ -642,6 +732,7 @@ actor TalkModeRuntime {
let resolvedApiKey = envApiKey?.isEmpty == false ? envApiKey : nil let resolvedApiKey = envApiKey?.isEmpty == false ? envApiKey : nil
return TalkRuntimeConfig( return TalkRuntimeConfig(
voiceId: resolvedVoice, voiceId: resolvedVoice,
voiceAliases: [:],
modelId: nil, modelId: nil,
outputFormat: nil, outputFormat: nil,
interruptOnSpeech: true, interruptOnSpeech: true,
@@ -652,7 +743,7 @@ actor TalkModeRuntime {
// MARK: - Audio level handling // MARK: - Audio level handling
private func noteAudioLevel(rms: Double) async { private func noteAudioLevel(rms: Double) async {
if self.phase != .listening && self.phase != .speaking { return } if self.phase != .listening, self.phase != .speaking { return }
let alpha: Double = rms < self.noiseFloorRMS ? 0.08 : 0.01 let alpha: Double = rms < self.noiseFloorRMS ? 0.08 : 0.01
self.noiseFloorRMS = max(1e-7, self.noiseFloorRMS + (rms - self.noiseFloorRMS) * alpha) self.noiseFloorRMS = max(1e-7, self.noiseFloorRMS + (rms - self.noiseFloorRMS) * alpha)
@@ -731,7 +822,7 @@ actor TalkModeRuntime {
private static func validatedSeed(_ value: Int?, logger: Logger) -> UInt32? { private static func validatedSeed(_ value: Int?, logger: Logger) -> UInt32? {
guard let value else { return nil } guard let value else { return nil }
if value < 0 || value > 4294967295 { if value < 0 || value > 4_294_967_295 {
logger.warning("talk seed out of range: \(value, privacy: .public)") logger.warning("talk seed out of range: \(value, privacy: .public)")
return nil return nil
} }
@@ -747,5 +838,4 @@ actor TalkModeRuntime {
} }
return normalized return normalized
} }
} }

View File

@@ -2,7 +2,7 @@ public enum TalkPromptBuilder: Sendable {
public static func build(transcript: String, interruptedAtSeconds: Double?) -> String { public static func build(transcript: String, interruptedAtSeconds: Double?) -> String {
var lines: [String] = [ var lines: [String] = [
"Talk Mode active. Reply in a concise, spoken tone.", "Talk Mode active. Reply in a concise, spoken tone.",
"You may optionally prefix the response with JSON (first line) to set ElevenLabs voice, e.g. {\"voice\":\"<id>\",\"once\":true}.", "You may optionally prefix the response with JSON (first line) to set ElevenLabs voice (id or alias), e.g. {\"voice\":\"<id>\",\"once\":true}.",
] ]
if let interruptedAtSeconds { if let interruptedAtSeconds {
@@ -15,4 +15,3 @@ public enum TalkPromptBuilder: Sendable {
return lines.joined(separator: "\n") return lines.joined(separator: "\n")
} }
} }

View File

@@ -199,11 +199,16 @@ Controls inbound/outbound prefixes and timestamps.
Defaults for Talk mode (macOS/iOS/Android). Voice IDs fall back to `ELEVENLABS_VOICE_ID` or `SAG_VOICE_ID` when unset. Defaults for Talk mode (macOS/iOS/Android). Voice IDs fall back to `ELEVENLABS_VOICE_ID` or `SAG_VOICE_ID` when unset.
`apiKey` falls back to `ELEVENLABS_API_KEY` (or the gateway's shell profile) when unset. `apiKey` falls back to `ELEVENLABS_API_KEY` (or the gateway's shell profile) when unset.
`voiceAliases` lets Talk directives use friendly names (e.g. `"voice":"Clawd"`).
```json5 ```json5
{ {
talk: { talk: {
voiceId: "elevenlabs_voice_id", voiceId: "elevenlabs_voice_id",
voiceAliases: {
Clawd: "EXAVITQu4vr4xnSDxMaL",
Roger: "CwhRBWXzGAHq8TQ4Fs17"
},
modelId: "eleven_v3", modelId: "eleven_v3",
outputFormat: "mp3_44100_128", outputFormat: "mp3_44100_128",
apiKey: "elevenlabs_api_key", apiKey: "elevenlabs_api_key",

View File

@@ -221,3 +221,32 @@ describe("talk api key fallback", () => {
}); });
}); });
}); });
describe("talk.voiceAliases", () => {
it("accepts a string map of voice aliases", async () => {
vi.resetModules();
const { validateConfigObject } = await import("./config.js");
const res = validateConfigObject({
talk: {
voiceAliases: {
Clawd: "EXAVITQu4vr4xnSDxMaL",
Roger: "CwhRBWXzGAHq8TQ4Fs17",
},
},
});
expect(res.ok).toBe(true);
});
it("rejects non-string voice alias values", async () => {
vi.resetModules();
const { validateConfigObject } = await import("./config.js");
const res = validateConfigObject({
talk: {
voiceAliases: {
Clawd: 123,
},
},
});
expect(res.ok).toBe(false);
});
});

View File

@@ -222,6 +222,8 @@ export type CanvasHostConfig = {
export type TalkConfig = { export type TalkConfig = {
/** Default ElevenLabs voice ID for Talk mode. */ /** Default ElevenLabs voice ID for Talk mode. */
voiceId?: string; voiceId?: string;
/** Optional voice name -> ElevenLabs voice ID map. */
voiceAliases?: Record<string, string>;
/** Default ElevenLabs model ID for Talk mode. */ /** Default ElevenLabs model ID for Talk mode. */
modelId?: string; modelId?: string;
/** Default ElevenLabs output format (e.g. mp3_44100_128). */ /** Default ElevenLabs output format (e.g. mp3_44100_128). */
@@ -815,6 +817,7 @@ const ClawdisSchema = z.object({
talk: z talk: z
.object({ .object({
voiceId: z.string().optional(), voiceId: z.string().optional(),
voiceAliases: z.record(z.string(), z.string()).optional(),
modelId: z.string().optional(), modelId: z.string().optional(),
outputFormat: z.string().optional(), outputFormat: z.string().optional(),
apiKey: z.string().optional(), apiKey: z.string().optional(),