From 2814815312845a4ce039cc84420f38e499aa0f7b Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Tue, 30 Dec 2025 11:35:29 +0100 Subject: [PATCH] feat: add talk voice alias map --- .../clawdis/node/voice/TalkModeManager.kt | 48 +++++- apps/ios/Sources/Voice/TalkModeManager.swift | 40 ++++- .../Sources/Clawdis/TalkModeRuntime.swift | 144 ++++++++++++++---- .../ClawdisKit/TalkPromptBuilder.swift | 3 +- docs/configuration.md | 5 + src/config/config.test.ts | 29 ++++ src/config/config.ts | 3 + 7 files changed, 237 insertions(+), 35 deletions(-) diff --git a/apps/android/app/src/main/java/com/steipete/clawdis/node/voice/TalkModeManager.kt b/apps/android/app/src/main/java/com/steipete/clawdis/node/voice/TalkModeManager.kt index eabd0abbf..e72b07651 100644 --- a/apps/android/app/src/main/java/com/steipete/clawdis/node/voice/TalkModeManager.kt +++ b/apps/android/app/src/main/java/com/steipete/clawdis/node/voice/TalkModeManager.kt @@ -61,6 +61,12 @@ class TalkModeManager( private val _statusText = MutableStateFlow("Off") val statusText: StateFlow = _statusText + private val _lastAssistantText = MutableStateFlow(null) + val lastAssistantText: StateFlow = _lastAssistantText + + private val _usingFallbackTts = MutableStateFlow(false) + val usingFallbackTts: StateFlow = _usingFallbackTts + private var recognizer: SpeechRecognizer? = null private var restartJob: Job? = null private var stopRequested = false @@ -79,6 +85,7 @@ class TalkModeManager( private var currentModelId: String? = null private var defaultOutputFormat: String? = null private var apiKey: String? = null + private var voiceAliases: Map = emptyMap() private var interruptOnSpeech: Boolean = true private var voiceOverrideActive = false private var modelOverrideActive = false @@ -179,6 +186,7 @@ class TalkModeManager( _isListening.value = false _statusText.value = "Off" stopSpeaking() + _usingFallbackTts.value = false chatSubscribedSessionKey = null mainHandler.post { @@ -334,7 +342,7 @@ class TalkModeManager( private fun buildPrompt(transcript: String): String { val lines = mutableListOf( "Talk Mode active. Reply in a concise, spoken tone.", - "You may optionally prefix the response with JSON (first line) to set ElevenLabs voice, e.g. {\"voice\":\"\",\"once\":true}.", + "You may optionally prefix the response with JSON (first line) to set ElevenLabs voice (id or alias), e.g. {\"voice\":\"\",\"once\":true}.", ) lastInterruptedAtSeconds?.let { lines.add("Assistant speech interrupted at ${"%.1f".format(it)}s.") @@ -432,10 +440,17 @@ class TalkModeManager( val directive = parsed.directive val cleaned = parsed.stripped.trim() if (cleaned.isEmpty()) return + _lastAssistantText.value = cleaned + + val requestedVoice = directive?.voiceId?.trim()?.takeIf { it.isNotEmpty() } + val resolvedVoice = resolveVoiceAlias(requestedVoice) + if (requestedVoice != null && resolvedVoice == null) { + Log.w(tag, "unknown voice alias: $requestedVoice") + } if (directive?.voiceId != null) { if (directive.once != true) { - currentVoiceId = directive.voiceId + currentVoiceId = resolvedVoice voiceOverrideActive = true } } @@ -449,7 +464,7 @@ class TalkModeManager( val apiKey = apiKey?.trim()?.takeIf { it.isNotEmpty() } ?: System.getenv("ELEVENLABS_API_KEY")?.trim() - val voiceId = directive?.voiceId ?: currentVoiceId ?: defaultVoiceId + val voiceId = resolvedVoice ?: currentVoiceId ?: defaultVoiceId _statusText.value = "Speaking…" _isSpeaking.value = true @@ -465,9 +480,11 @@ class TalkModeManager( if (apiKey.isNullOrEmpty()) { Log.w(tag, "missing ELEVENLABS_API_KEY; falling back to system voice") } + _usingFallbackTts.value = true _statusText.value = "Speaking (System)…" speakWithSystemTts(cleaned) } else { + _usingFallbackTts.value = false val ttsStarted = SystemClock.elapsedRealtime() val request = ElevenLabsRequest( @@ -491,6 +508,7 @@ class TalkModeManager( } catch (err: Throwable) { Log.w(tag, "speak failed: ${err.message ?: err::class.simpleName}; falling back to system voice") try { + _usingFallbackTts.value = true _statusText.value = "Speaking (System)…" speakWithSystemTts(cleaned) } catch (fallbackErr: Throwable) { @@ -681,6 +699,11 @@ class TalkModeManager( val sessionCfg = config?.get("session").asObjectOrNull() val mainKey = sessionCfg?.get("mainKey").asStringOrNull()?.trim()?.takeIf { it.isNotEmpty() } ?: "main" val voice = talk?.get("voiceId")?.asStringOrNull()?.trim()?.takeIf { it.isNotEmpty() } + val aliases = + talk?.get("voiceAliases").asObjectOrNull()?.entries?.mapNotNull { (key, value) -> + val id = value.asStringOrNull()?.trim()?.takeIf { it.isNotEmpty() } ?: return@mapNotNull null + normalizeAliasKey(key).takeIf { it.isNotEmpty() }?.let { it to id } + }?.toMap().orEmpty() val model = talk?.get("modelId")?.asStringOrNull()?.trim()?.takeIf { it.isNotEmpty() } val outputFormat = talk?.get("outputFormat")?.asStringOrNull()?.trim()?.takeIf { it.isNotEmpty() } val key = talk?.get("apiKey")?.asStringOrNull()?.trim()?.takeIf { it.isNotEmpty() } @@ -688,6 +711,7 @@ class TalkModeManager( mainSessionKey = mainKey defaultVoiceId = voice ?: envVoice?.takeIf { it.isNotEmpty() } ?: sagVoice?.takeIf { it.isNotEmpty() } + voiceAliases = aliases if (!voiceOverrideActive) currentVoiceId = defaultVoiceId defaultModelId = model if (!modelOverrideActive) currentModelId = defaultModelId @@ -697,6 +721,7 @@ class TalkModeManager( } catch (_: Throwable) { defaultVoiceId = envVoice?.takeIf { it.isNotEmpty() } ?: sagVoice?.takeIf { it.isNotEmpty() } apiKey = envKey?.takeIf { it.isNotEmpty() } + voiceAliases = emptyMap() } } @@ -842,6 +867,23 @@ class TalkModeManager( } } + private fun resolveVoiceAlias(value: String?): String? { + val trimmed = value?.trim().orEmpty() + if (trimmed.isEmpty()) return null + val normalized = normalizeAliasKey(trimmed) + voiceAliases[normalized]?.let { return it } + if (voiceAliases.values.any { it.equals(trimmed, ignoreCase = true) }) return trimmed + return if (isLikelyVoiceId(trimmed)) trimmed else null + } + + private fun isLikelyVoiceId(value: String): Boolean { + if (value.length < 10) return false + return value.all { it.isLetterOrDigit() || it == '-' || it == '_' } + } + + private fun normalizeAliasKey(value: String): String = + value.trim().lowercase() + private val listener = object : RecognitionListener { override fun onReadyForSpeech(params: Bundle?) { diff --git a/apps/ios/Sources/Voice/TalkModeManager.swift b/apps/ios/Sources/Voice/TalkModeManager.swift index 8a9845829..a84a7b4a4 100644 --- a/apps/ios/Sources/Voice/TalkModeManager.swift +++ b/apps/ios/Sources/Voice/TalkModeManager.swift @@ -33,6 +33,7 @@ final class TalkModeManager: NSObject { private var modelOverrideActive = false private var defaultOutputFormat: String? private var apiKey: String? + private var voiceAliases: [String: String] = [:] private var interruptOnSpeech: Bool = true private var mainSessionKey: String = "main" @@ -419,7 +420,12 @@ final class TalkModeManager: NSObject { let cleaned = parsed.stripped.trimmingCharacters(in: .whitespacesAndNewlines) guard !cleaned.isEmpty else { return } - if let voice = directive?.voiceId { + let requestedVoice = directive?.voiceId?.trimmingCharacters(in: .whitespacesAndNewlines) + let resolvedVoice = self.resolveVoiceAlias(requestedVoice) + if requestedVoice?.isEmpty == false, resolvedVoice == nil { + self.logger.warning("unknown voice alias \(requestedVoice ?? "?", privacy: .public)") + } + if let voice = resolvedVoice { if directive?.once != true { self.currentVoiceId = voice self.voiceOverrideActive = true @@ -440,8 +446,7 @@ final class TalkModeManager: NSObject { let started = Date() let language = ElevenLabsTTSClient.validatedLanguage(directive?.language) - let voiceId = (directive?.voiceId ?? self.currentVoiceId ?? self.defaultVoiceId)? - .trimmingCharacters(in: .whitespacesAndNewlines) + let voiceId = resolvedVoice ?? self.currentVoiceId ?? self.defaultVoiceId let resolvedKey = (self.apiKey?.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty == false ? self.apiKey : nil) ?? ProcessInfo.processInfo.environment["ELEVENLABS_API_KEY"] @@ -565,6 +570,22 @@ final class TalkModeManager: NSObject { return true } + private func resolveVoiceAlias(_ value: String?) -> String? { + let trimmed = (value ?? "").trimmingCharacters(in: .whitespacesAndNewlines) + guard !trimmed.isEmpty else { return nil } + let normalized = trimmed.lowercased() + if let mapped = self.voiceAliases[normalized] { return mapped } + if self.voiceAliases.values.contains(where: { $0.caseInsensitiveCompare(trimmed) == .orderedSame }) { + return trimmed + } + return Self.isLikelyVoiceId(trimmed) ? trimmed : nil + } + + private static func isLikelyVoiceId(_ value: String) -> Bool { + guard value.count >= 10 else { return false } + return value.allSatisfy { $0.isLetter || $0.isNumber || $0 == "-" || $0 == "_" } + } + private func reloadConfig() async { guard let bridge else { return } do { @@ -576,6 +597,19 @@ final class TalkModeManager: NSObject { let rawMainKey = (session?["mainKey"] as? String)?.trimmingCharacters(in: .whitespacesAndNewlines) ?? "" self.mainSessionKey = rawMainKey.isEmpty ? "main" : rawMainKey self.defaultVoiceId = (talk?["voiceId"] as? String)?.trimmingCharacters(in: .whitespacesAndNewlines) + if let aliases = talk?["voiceAliases"] as? [String: Any] { + self.voiceAliases = + aliases.compactMap { key, value in + guard let id = value as? String else { return nil } + let normalizedKey = key.trimmingCharacters(in: .whitespacesAndNewlines).lowercased() + let trimmedId = id.trimmingCharacters(in: .whitespacesAndNewlines) + guard !normalizedKey.isEmpty, !trimmedId.isEmpty else { return nil } + return (normalizedKey, trimmedId) + } + .reduce(into: [:]) { $0[$1.0] = $1.1 } + } else { + self.voiceAliases = [:] + } if !self.voiceOverrideActive { self.currentVoiceId = self.defaultVoiceId } diff --git a/apps/macos/Sources/Clawdis/TalkModeRuntime.swift b/apps/macos/Sources/Clawdis/TalkModeRuntime.swift index 48f64c129..9baf17707 100644 --- a/apps/macos/Sources/Clawdis/TalkModeRuntime.swift +++ b/apps/macos/Sources/Clawdis/TalkModeRuntime.swift @@ -41,6 +41,7 @@ actor TalkModeRuntime { private var silenceTask: Task? private var phase: TalkModePhase = .idle private var isEnabled = false + private var isPaused = false private var lifecycleGeneration: Int = 0 private var lastHeard: Date? @@ -57,6 +58,7 @@ actor TalkModeRuntime { private var defaultOutputFormat: String? private var interruptOnSpeech: Bool = true private var lastInterruptedAtSeconds: Double? + private var voiceAliases: [String: String] = [:] private var lastSpokenText: String? private var apiKey: String? private var fallbackVoiceId: String? @@ -78,6 +80,29 @@ actor TalkModeRuntime { } } + func setPaused(_ paused: Bool) async { + guard paused != self.isPaused else { return } + self.isPaused = paused + await MainActor.run { TalkModeController.shared.updateLevel(0) } + + guard self.isEnabled else { return } + + if paused { + self.lastTranscript = "" + self.lastHeard = nil + self.lastSpeechEnergyAt = nil + await self.stopRecognition() + return + } + + if self.phase == .idle || self.phase == .listening { + await self.startRecognition() + self.phase = .listening + await MainActor.run { TalkModeController.shared.updatePhase(.listening) } + self.startSilenceMonitor() + } + } + private func isCurrent(_ generation: Int) -> Bool { generation == self.lifecycleGeneration && self.isEnabled } @@ -91,6 +116,14 @@ actor TalkModeRuntime { } await self.reloadConfig() guard self.isCurrent(gen) else { return } + if self.isPaused { + self.phase = .idle + await MainActor.run { + TalkModeController.shared.updateLevel(0) + TalkModeController.shared.updatePhase(.idle) + } + return + } await self.startRecognition() guard self.isCurrent(gen) else { return } self.phase = .listening @@ -211,6 +244,7 @@ actor TalkModeRuntime { private func handleRecognition(_ update: RecognitionUpdate) async { guard update.generation == self.recognitionGeneration else { return } + guard !self.isPaused else { return } if let errorDescription = update.errorDescription { self.logger.debug("talk recognition error: \(errorDescription, privacy: .public)") } @@ -256,6 +290,7 @@ actor TalkModeRuntime { } private func checkSilence() async { + guard !self.isPaused else { return } guard self.phase == .listening else { return } let transcript = self.lastTranscript.trimmingCharacters(in: .whitespacesAndNewlines) guard !transcript.isEmpty else { return } @@ -292,11 +327,10 @@ actor TalkModeRuntime { guard self.isCurrent(gen) else { return } let prompt = self.buildPrompt(transcript: transcript) let activeSessionKey = await MainActor.run { WebChatManager.shared.activeSessionKey } - let sessionKey: String - if let activeSessionKey { - sessionKey = activeSessionKey + let sessionKey: String = if let activeSessionKey { + activeSessionKey } else { - sessionKey = await GatewayConnection.shared.mainSessionKey() + await GatewayConnection.shared.mainSessionKey() } let runId = UUID().uuidString let startedAt = Date().timeIntervalSince1970 @@ -329,17 +363,29 @@ actor TalkModeRuntime { self.logger.info("talk assistant text len=\(assistantText.count, privacy: .public)") await self.playAssistant(text: assistantText) guard self.isCurrent(gen) else { return } - await self.startListening() - await self.startRecognition() + await self.resumeListeningIfNeeded() return } catch { self.logger.error("talk chat.send failed: \(error.localizedDescription, privacy: .public)") - await self.startListening() - await self.startRecognition() + await self.resumeListeningIfNeeded() return } } + private func resumeListeningIfNeeded() async { + if self.isPaused { + self.lastTranscript = "" + self.lastHeard = nil + self.lastSpeechEnergyAt = nil + await MainActor.run { + TalkModeController.shared.updateLevel(0) + } + return + } + await self.startListening() + await self.startRecognition() + } + private func buildPrompt(transcript: String) -> String { let interrupted = self.lastInterruptedAtSeconds self.lastInterruptedAtSeconds = nil @@ -376,7 +422,7 @@ actor TalkModeRuntime { return TalkHistoryTimestamp.isAfter(timestamp, sinceSeconds: since) } guard let assistant else { return nil } - let text = assistant.content.compactMap { $0.text }.joined(separator: "\n") + let text = assistant.content.compactMap(\.text).joined(separator: "\n") let trimmed = text.trimmingCharacters(in: CharacterSet.whitespacesAndNewlines) return trimmed.isEmpty ? nil : trimmed } catch { @@ -394,10 +440,16 @@ actor TalkModeRuntime { guard self.isCurrent(gen) else { return } if !parse.unknownKeys.isEmpty { - self.logger.warning("talk directive ignored keys: \(parse.unknownKeys.joined(separator: ","), privacy: .public)") + self.logger + .warning("talk directive ignored keys: \(parse.unknownKeys.joined(separator: ","), privacy: .public)") } - if let voice = directive?.voiceId { + let requestedVoice = directive?.voiceId?.trimmingCharacters(in: .whitespacesAndNewlines) + let resolvedVoice = self.resolveVoiceAlias(requestedVoice) + if let requestedVoice, !requestedVoice.isEmpty, resolvedVoice == nil { + self.logger.warning("talk unknown voice alias \(requestedVoice, privacy: .public)") + } + if let voice = resolvedVoice { if directive?.once == true { self.logger.info("talk voice override (once) voiceId=\(voice, privacy: .public)") } else { @@ -417,18 +469,17 @@ actor TalkModeRuntime { } let apiKey = self.apiKey?.trimmingCharacters(in: .whitespacesAndNewlines) - let requestedVoice = - directive?.voiceId ?? + let preferredVoice = + resolvedVoice ?? self.currentVoiceId ?? self.defaultVoiceId let language = ElevenLabsTTSClient.validatedLanguage(directive?.language) - let voiceId: String? - if let apiKey, !apiKey.isEmpty { - voiceId = await self.resolveVoiceId(preferred: requestedVoice, apiKey: apiKey) + let voiceId: String? = if let apiKey, !apiKey.isEmpty { + await self.resolveVoiceId(preferred: preferredVoice, apiKey: apiKey) } else { - voiceId = nil + nil } if apiKey?.isEmpty != false { @@ -436,7 +487,8 @@ actor TalkModeRuntime { } else if voiceId == nil { self.ttsLogger.warning("talk missing voiceId; falling back to system voice") } else if let voiceId { - self.ttsLogger.info("talk TTS request voiceId=\(voiceId, privacy: .public) chars=\(cleaned.count, privacy: .public)") + self.ttsLogger + .info("talk TTS request voiceId=\(voiceId, privacy: .public) chars=\(cleaned.count, privacy: .public)") } self.lastSpokenText = cleaned @@ -447,7 +499,9 @@ actor TalkModeRuntime { let desiredOutputFormat = directive?.outputFormat ?? self.defaultOutputFormat let outputFormat = ElevenLabsTTSClient.validatedOutputFormat(desiredOutputFormat) if outputFormat == nil, let desiredOutputFormat, !desiredOutputFormat.isEmpty { - self.logger.warning("talk output_format unsupported for local playback: \(desiredOutputFormat, privacy: .public)") + self.logger + .warning( + "talk output_format unsupported for local playback: \(desiredOutputFormat, privacy: .public)") } let request = ElevenLabsTTSRequest( @@ -481,7 +535,9 @@ actor TalkModeRuntime { self.phase = .speaking let result = await TalkAudioPlayer.shared.play(data: audio) - self.ttsLogger.info("talk audio result finished=\(result.finished, privacy: .public) interruptedAt=\(String(describing: result.interruptedAt), privacy: .public)") + self.ttsLogger + .info( + "talk audio result finished=\(result.finished, privacy: .public) interruptedAt=\(String(describing: result.interruptedAt), privacy: .public)") if !result.finished, result.interruptedAt == nil { throw NSError(domain: "TalkAudioPlayer", code: 1, userInfo: [ NSLocalizedDescriptionKey: "audio playback failed", @@ -505,7 +561,8 @@ actor TalkModeRuntime { self.ttsLogger.info("talk system voice done") } } catch { - self.ttsLogger.error("talk TTS failed: \(error.localizedDescription, privacy: .public); falling back to system voice") + self.ttsLogger + .error("talk TTS failed: \(error.localizedDescription, privacy: .public); falling back to system voice") do { if self.interruptOnSpeech { await self.startRecognition() @@ -528,7 +585,10 @@ actor TalkModeRuntime { private func resolveVoiceId(preferred: String?, apiKey: String) async -> String? { let trimmed = preferred?.trimmingCharacters(in: .whitespacesAndNewlines) ?? "" - if !trimmed.isEmpty { return trimmed } + if !trimmed.isEmpty { + if let resolved = self.resolveVoiceAlias(trimmed) { return resolved } + self.ttsLogger.warning("talk unknown voice alias \(trimmed, privacy: .public)") + } if let fallbackVoiceId { return fallbackVoiceId } do { @@ -545,7 +605,8 @@ actor TalkModeRuntime { self.currentVoiceId = first.voiceId } let name = first.name ?? "unknown" - self.ttsLogger.info("talk default voice selected \(name, privacy: .public) (\(first.voiceId, privacy: .public))") + self.ttsLogger + .info("talk default voice selected \(name, privacy: .public) (\(first.voiceId, privacy: .public))") return first.voiceId } catch { self.ttsLogger.error("elevenlabs list voices failed: \(error.localizedDescription, privacy: .public)") @@ -553,6 +614,22 @@ actor TalkModeRuntime { } } + private func resolveVoiceAlias(_ value: String?) -> String? { + let trimmed = (value ?? "").trimmingCharacters(in: .whitespacesAndNewlines) + guard !trimmed.isEmpty else { return nil } + let normalized = trimmed.lowercased() + if let mapped = self.voiceAliases[normalized] { return mapped } + if self.voiceAliases.values.contains(where: { $0.caseInsensitiveCompare(trimmed) == .orderedSame }) { + return trimmed + } + return Self.isLikelyVoiceId(trimmed) ? trimmed : nil + } + + private static func isLikelyVoiceId(_ value: String) -> Bool { + guard value.count >= 10 else { return false } + return value.allSatisfy { $0.isLetter || $0.isNumber || $0 == "-" || $0 == "_" } + } + func stopSpeaking(reason: TalkStopReason) async { let interruptedAt = await MainActor.run { TalkAudioPlayer.shared.stop() } await TalkSystemSpeechSynthesizer.shared.stop() @@ -576,6 +653,7 @@ actor TalkModeRuntime { private func reloadConfig() async { let cfg = await self.fetchTalkConfig() self.defaultVoiceId = cfg.voiceId + self.voiceAliases = cfg.voiceAliases if !self.voiceOverrideActive { self.currentVoiceId = cfg.voiceId } @@ -589,11 +667,14 @@ actor TalkModeRuntime { let hasApiKey = (cfg.apiKey?.isEmpty == false) let voiceLabel = (cfg.voiceId?.isEmpty == false) ? cfg.voiceId! : "none" let modelLabel = (cfg.modelId?.isEmpty == false) ? cfg.modelId! : "none" - self.logger.info("talk config voiceId=\(voiceLabel, privacy: .public) modelId=\(modelLabel, privacy: .public) apiKey=\(hasApiKey, privacy: .public) interrupt=\(cfg.interruptOnSpeech, privacy: .public)") + self.logger + .info( + "talk config voiceId=\(voiceLabel, privacy: .public) modelId=\(modelLabel, privacy: .public) apiKey=\(hasApiKey, privacy: .public) interrupt=\(cfg.interruptOnSpeech, privacy: .public)") } private struct TalkRuntimeConfig { let voiceId: String? + let voiceAliases: [String: String] let modelId: String? let outputFormat: String? let interruptOnSpeech: Bool @@ -618,6 +699,14 @@ actor TalkModeRuntime { AppStateStore.shared.seamColorHex = rawSeam.isEmpty ? nil : rawSeam } let voice = talk?["voiceId"]?.stringValue + let rawAliases = talk?["voiceAliases"]?.dictionaryValue + let resolvedAliases: [String: String] = + rawAliases?.reduce(into: [:]) { acc, entry in + let key = entry.key.trimmingCharacters(in: .whitespacesAndNewlines).lowercased() + let value = entry.value.stringValue?.trimmingCharacters(in: .whitespacesAndNewlines) ?? "" + guard !key.isEmpty, !value.isEmpty else { return } + acc[key] = value + } ?? [:] let model = talk?["modelId"]?.stringValue let outputFormat = talk?["outputFormat"]?.stringValue let interrupt = talk?["interruptOnSpeech"]?.boolValue @@ -631,6 +720,7 @@ actor TalkModeRuntime { (apiKey?.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty == false ? apiKey : nil) return TalkRuntimeConfig( voiceId: resolvedVoice, + voiceAliases: resolvedAliases, modelId: model, outputFormat: outputFormat, interruptOnSpeech: interrupt ?? true, @@ -642,6 +732,7 @@ actor TalkModeRuntime { let resolvedApiKey = envApiKey?.isEmpty == false ? envApiKey : nil return TalkRuntimeConfig( voiceId: resolvedVoice, + voiceAliases: [:], modelId: nil, outputFormat: nil, interruptOnSpeech: true, @@ -652,7 +743,7 @@ actor TalkModeRuntime { // MARK: - Audio level handling private func noteAudioLevel(rms: Double) async { - if self.phase != .listening && self.phase != .speaking { return } + if self.phase != .listening, self.phase != .speaking { return } let alpha: Double = rms < self.noiseFloorRMS ? 0.08 : 0.01 self.noiseFloorRMS = max(1e-7, self.noiseFloorRMS + (rms - self.noiseFloorRMS) * alpha) @@ -731,7 +822,7 @@ actor TalkModeRuntime { private static func validatedSeed(_ value: Int?, logger: Logger) -> UInt32? { guard let value else { return nil } - if value < 0 || value > 4294967295 { + if value < 0 || value > 4_294_967_295 { logger.warning("talk seed out of range: \(value, privacy: .public)") return nil } @@ -747,5 +838,4 @@ actor TalkModeRuntime { } return normalized } - } diff --git a/apps/shared/ClawdisKit/Sources/ClawdisKit/TalkPromptBuilder.swift b/apps/shared/ClawdisKit/Sources/ClawdisKit/TalkPromptBuilder.swift index 95842d685..c63f40e9d 100644 --- a/apps/shared/ClawdisKit/Sources/ClawdisKit/TalkPromptBuilder.swift +++ b/apps/shared/ClawdisKit/Sources/ClawdisKit/TalkPromptBuilder.swift @@ -2,7 +2,7 @@ public enum TalkPromptBuilder: Sendable { public static func build(transcript: String, interruptedAtSeconds: Double?) -> String { var lines: [String] = [ "Talk Mode active. Reply in a concise, spoken tone.", - "You may optionally prefix the response with JSON (first line) to set ElevenLabs voice, e.g. {\"voice\":\"\",\"once\":true}.", + "You may optionally prefix the response with JSON (first line) to set ElevenLabs voice (id or alias), e.g. {\"voice\":\"\",\"once\":true}.", ] if let interruptedAtSeconds { @@ -15,4 +15,3 @@ public enum TalkPromptBuilder: Sendable { return lines.joined(separator: "\n") } } - diff --git a/docs/configuration.md b/docs/configuration.md index ac4b3ccf4..91f1baddf 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -199,11 +199,16 @@ Controls inbound/outbound prefixes and timestamps. Defaults for Talk mode (macOS/iOS/Android). Voice IDs fall back to `ELEVENLABS_VOICE_ID` or `SAG_VOICE_ID` when unset. `apiKey` falls back to `ELEVENLABS_API_KEY` (or the gateway’s shell profile) when unset. +`voiceAliases` lets Talk directives use friendly names (e.g. `"voice":"Clawd"`). ```json5 { talk: { voiceId: "elevenlabs_voice_id", + voiceAliases: { + Clawd: "EXAVITQu4vr4xnSDxMaL", + Roger: "CwhRBWXzGAHq8TQ4Fs17" + }, modelId: "eleven_v3", outputFormat: "mp3_44100_128", apiKey: "elevenlabs_api_key", diff --git a/src/config/config.test.ts b/src/config/config.test.ts index ff2009a35..36ff17454 100644 --- a/src/config/config.test.ts +++ b/src/config/config.test.ts @@ -221,3 +221,32 @@ describe("talk api key fallback", () => { }); }); }); + +describe("talk.voiceAliases", () => { + it("accepts a string map of voice aliases", async () => { + vi.resetModules(); + const { validateConfigObject } = await import("./config.js"); + const res = validateConfigObject({ + talk: { + voiceAliases: { + Clawd: "EXAVITQu4vr4xnSDxMaL", + Roger: "CwhRBWXzGAHq8TQ4Fs17", + }, + }, + }); + expect(res.ok).toBe(true); + }); + + it("rejects non-string voice alias values", async () => { + vi.resetModules(); + const { validateConfigObject } = await import("./config.js"); + const res = validateConfigObject({ + talk: { + voiceAliases: { + Clawd: 123, + }, + }, + }); + expect(res.ok).toBe(false); + }); +}); diff --git a/src/config/config.ts b/src/config/config.ts index 9d3b23442..89d01fdaf 100644 --- a/src/config/config.ts +++ b/src/config/config.ts @@ -222,6 +222,8 @@ export type CanvasHostConfig = { export type TalkConfig = { /** Default ElevenLabs voice ID for Talk mode. */ voiceId?: string; + /** Optional voice name -> ElevenLabs voice ID map. */ + voiceAliases?: Record; /** Default ElevenLabs model ID for Talk mode. */ modelId?: string; /** Default ElevenLabs output format (e.g. mp3_44100_128). */ @@ -815,6 +817,7 @@ const ClawdisSchema = z.object({ talk: z .object({ voiceId: z.string().optional(), + voiceAliases: z.record(z.string(), z.string()).optional(), modelId: z.string().optional(), outputFormat: z.string().optional(), apiKey: z.string().optional(),