feat: add talk voice alias map
@@ -41,6 +41,7 @@ actor TalkModeRuntime {
     private var silenceTask: Task<Void, Never>?
     private var phase: TalkModePhase = .idle
     private var isEnabled = false
+    private var isPaused = false
     private var lifecycleGeneration: Int = 0

     private var lastHeard: Date?
@@ -57,6 +58,7 @@ actor TalkModeRuntime {
     private var defaultOutputFormat: String?
     private var interruptOnSpeech: Bool = true
     private var lastInterruptedAtSeconds: Double?
+    private var voiceAliases: [String: String] = [:]
     private var lastSpokenText: String?
     private var apiKey: String?
     private var fallbackVoiceId: String?
@@ -78,6 +80,29 @@ actor TalkModeRuntime {
         }
     }

+    func setPaused(_ paused: Bool) async {
+        guard paused != self.isPaused else { return }
+        self.isPaused = paused
+        await MainActor.run { TalkModeController.shared.updateLevel(0) }
+
+        guard self.isEnabled else { return }
+
+        if paused {
+            self.lastTranscript = ""
+            self.lastHeard = nil
+            self.lastSpeechEnergyAt = nil
+            await self.stopRecognition()
+            return
+        }
+
+        if self.phase == .idle || self.phase == .listening {
+            await self.startRecognition()
+            self.phase = .listening
+            await MainActor.run { TalkModeController.shared.updatePhase(.listening) }
+            self.startSilenceMonitor()
+        }
+    }
+
     private func isCurrent(_ generation: Int) -> Bool {
         generation == self.lifecycleGeneration && self.isEnabled
     }
@@ -91,6 +116,14 @@ actor TalkModeRuntime {
         }
         await self.reloadConfig()
         guard self.isCurrent(gen) else { return }
+        if self.isPaused {
+            self.phase = .idle
+            await MainActor.run {
+                TalkModeController.shared.updateLevel(0)
+                TalkModeController.shared.updatePhase(.idle)
+            }
+            return
+        }
         await self.startRecognition()
         guard self.isCurrent(gen) else { return }
         self.phase = .listening
@@ -211,6 +244,7 @@ actor TalkModeRuntime {

     private func handleRecognition(_ update: RecognitionUpdate) async {
         guard update.generation == self.recognitionGeneration else { return }
+        guard !self.isPaused else { return }
         if let errorDescription = update.errorDescription {
             self.logger.debug("talk recognition error: \(errorDescription, privacy: .public)")
         }
@@ -256,6 +290,7 @@ actor TalkModeRuntime {
     }

     private func checkSilence() async {
+        guard !self.isPaused else { return }
         guard self.phase == .listening else { return }
         let transcript = self.lastTranscript.trimmingCharacters(in: .whitespacesAndNewlines)
         guard !transcript.isEmpty else { return }
@@ -292,11 +327,10 @@ actor TalkModeRuntime {
         guard self.isCurrent(gen) else { return }
         let prompt = self.buildPrompt(transcript: transcript)
         let activeSessionKey = await MainActor.run { WebChatManager.shared.activeSessionKey }
-        let sessionKey: String
-        if let activeSessionKey {
-            sessionKey = activeSessionKey
+        let sessionKey: String = if let activeSessionKey {
+            activeSessionKey
         } else {
-            sessionKey = await GatewayConnection.shared.mainSessionKey()
+            await GatewayConnection.shared.mainSessionKey()
         }
         let runId = UUID().uuidString
         let startedAt = Date().timeIntervalSince1970
@@ -329,17 +363,29 @@ actor TalkModeRuntime {
             self.logger.info("talk assistant text len=\(assistantText.count, privacy: .public)")
             await self.playAssistant(text: assistantText)
             guard self.isCurrent(gen) else { return }
-            await self.startListening()
-            await self.startRecognition()
+            await self.resumeListeningIfNeeded()
             return
         } catch {
             self.logger.error("talk chat.send failed: \(error.localizedDescription, privacy: .public)")
-            await self.startListening()
-            await self.startRecognition()
+            await self.resumeListeningIfNeeded()
             return
         }
     }

+    private func resumeListeningIfNeeded() async {
+        if self.isPaused {
+            self.lastTranscript = ""
+            self.lastHeard = nil
+            self.lastSpeechEnergyAt = nil
+            await MainActor.run {
+                TalkModeController.shared.updateLevel(0)
+            }
+            return
+        }
+        await self.startListening()
+        await self.startRecognition()
+    }
+
     private func buildPrompt(transcript: String) -> String {
         let interrupted = self.lastInterruptedAtSeconds
         self.lastInterruptedAtSeconds = nil
@@ -376,7 +422,7 @@ actor TalkModeRuntime {
                 return TalkHistoryTimestamp.isAfter(timestamp, sinceSeconds: since)
             }
             guard let assistant else { return nil }
-            let text = assistant.content.compactMap { $0.text }.joined(separator: "\n")
+            let text = assistant.content.compactMap(\.text).joined(separator: "\n")
             let trimmed = text.trimmingCharacters(in: CharacterSet.whitespacesAndNewlines)
             return trimmed.isEmpty ? nil : trimmed
         } catch {
@@ -394,10 +440,16 @@ actor TalkModeRuntime {
         guard self.isCurrent(gen) else { return }

         if !parse.unknownKeys.isEmpty {
-            self.logger.warning("talk directive ignored keys: \(parse.unknownKeys.joined(separator: ","), privacy: .public)")
+            self.logger
+                .warning("talk directive ignored keys: \(parse.unknownKeys.joined(separator: ","), privacy: .public)")
         }

-        if let voice = directive?.voiceId {
+        let requestedVoice = directive?.voiceId?.trimmingCharacters(in: .whitespacesAndNewlines)
+        let resolvedVoice = self.resolveVoiceAlias(requestedVoice)
+        if let requestedVoice, !requestedVoice.isEmpty, resolvedVoice == nil {
+            self.logger.warning("talk unknown voice alias \(requestedVoice, privacy: .public)")
+        }
+        if let voice = resolvedVoice {
             if directive?.once == true {
                 self.logger.info("talk voice override (once) voiceId=\(voice, privacy: .public)")
             } else {
@@ -417,18 +469,17 @@ actor TalkModeRuntime {
         }

         let apiKey = self.apiKey?.trimmingCharacters(in: .whitespacesAndNewlines)
-        let requestedVoice =
-            directive?.voiceId ??
+        let preferredVoice =
+            resolvedVoice ??
             self.currentVoiceId ??
             self.defaultVoiceId

         let language = ElevenLabsTTSClient.validatedLanguage(directive?.language)

-        let voiceId: String?
-        if let apiKey, !apiKey.isEmpty {
-            voiceId = await self.resolveVoiceId(preferred: requestedVoice, apiKey: apiKey)
+        let voiceId: String? = if let apiKey, !apiKey.isEmpty {
+            await self.resolveVoiceId(preferred: preferredVoice, apiKey: apiKey)
         } else {
-            voiceId = nil
+            nil
         }

         if apiKey?.isEmpty != false {
@@ -436,7 +487,8 @@ actor TalkModeRuntime {
         } else if voiceId == nil {
             self.ttsLogger.warning("talk missing voiceId; falling back to system voice")
         } else if let voiceId {
-            self.ttsLogger.info("talk TTS request voiceId=\(voiceId, privacy: .public) chars=\(cleaned.count, privacy: .public)")
+            self.ttsLogger
+                .info("talk TTS request voiceId=\(voiceId, privacy: .public) chars=\(cleaned.count, privacy: .public)")
         }
         self.lastSpokenText = cleaned

@@ -447,7 +499,9 @@ actor TalkModeRuntime {
         let desiredOutputFormat = directive?.outputFormat ?? self.defaultOutputFormat
         let outputFormat = ElevenLabsTTSClient.validatedOutputFormat(desiredOutputFormat)
         if outputFormat == nil, let desiredOutputFormat, !desiredOutputFormat.isEmpty {
-            self.logger.warning("talk output_format unsupported for local playback: \(desiredOutputFormat, privacy: .public)")
+            self.logger
+                .warning(
+                    "talk output_format unsupported for local playback: \(desiredOutputFormat, privacy: .public)")
         }

         let request = ElevenLabsTTSRequest(
@@ -481,7 +535,9 @@ actor TalkModeRuntime {
                 self.phase = .speaking

                 let result = await TalkAudioPlayer.shared.play(data: audio)
-                self.ttsLogger.info("talk audio result finished=\(result.finished, privacy: .public) interruptedAt=\(String(describing: result.interruptedAt), privacy: .public)")
+                self.ttsLogger
+                    .info(
+                        "talk audio result finished=\(result.finished, privacy: .public) interruptedAt=\(String(describing: result.interruptedAt), privacy: .public)")
                 if !result.finished, result.interruptedAt == nil {
                     throw NSError(domain: "TalkAudioPlayer", code: 1, userInfo: [
                         NSLocalizedDescriptionKey: "audio playback failed",
@@ -505,7 +561,8 @@ actor TalkModeRuntime {
                 self.ttsLogger.info("talk system voice done")
             }
         } catch {
-            self.ttsLogger.error("talk TTS failed: \(error.localizedDescription, privacy: .public); falling back to system voice")
+            self.ttsLogger
+                .error("talk TTS failed: \(error.localizedDescription, privacy: .public); falling back to system voice")
             do {
                 if self.interruptOnSpeech {
                     await self.startRecognition()
@@ -528,7 +585,10 @@ actor TalkModeRuntime {

     private func resolveVoiceId(preferred: String?, apiKey: String) async -> String? {
         let trimmed = preferred?.trimmingCharacters(in: .whitespacesAndNewlines) ?? ""
-        if !trimmed.isEmpty { return trimmed }
+        if !trimmed.isEmpty {
+            if let resolved = self.resolveVoiceAlias(trimmed) { return resolved }
+            self.ttsLogger.warning("talk unknown voice alias \(trimmed, privacy: .public)")
+        }
         if let fallbackVoiceId { return fallbackVoiceId }

         do {
@@ -545,7 +605,8 @@ actor TalkModeRuntime {
                 self.currentVoiceId = first.voiceId
             }
             let name = first.name ?? "unknown"
-            self.ttsLogger.info("talk default voice selected \(name, privacy: .public) (\(first.voiceId, privacy: .public))")
+            self.ttsLogger
+                .info("talk default voice selected \(name, privacy: .public) (\(first.voiceId, privacy: .public))")
             return first.voiceId
         } catch {
             self.ttsLogger.error("elevenlabs list voices failed: \(error.localizedDescription, privacy: .public)")
@@ -553,6 +614,22 @@ actor TalkModeRuntime {
         }
     }

+    private func resolveVoiceAlias(_ value: String?) -> String? {
+        let trimmed = (value ?? "").trimmingCharacters(in: .whitespacesAndNewlines)
+        guard !trimmed.isEmpty else { return nil }
+        let normalized = trimmed.lowercased()
+        if let mapped = self.voiceAliases[normalized] { return mapped }
+        if self.voiceAliases.values.contains(where: { $0.caseInsensitiveCompare(trimmed) == .orderedSame }) {
+            return trimmed
+        }
+        return Self.isLikelyVoiceId(trimmed) ? trimmed : nil
+    }
+
+    private static func isLikelyVoiceId(_ value: String) -> Bool {
+        guard value.count >= 10 else { return false }
+        return value.allSatisfy { $0.isLetter || $0.isNumber || $0 == "-" || $0 == "_" }
+    }
+
     func stopSpeaking(reason: TalkStopReason) async {
         let interruptedAt = await MainActor.run { TalkAudioPlayer.shared.stop() }
         await TalkSystemSpeechSynthesizer.shared.stop()
@@ -576,6 +653,7 @@ actor TalkModeRuntime {
     private func reloadConfig() async {
         let cfg = await self.fetchTalkConfig()
         self.defaultVoiceId = cfg.voiceId
+        self.voiceAliases = cfg.voiceAliases
         if !self.voiceOverrideActive {
             self.currentVoiceId = cfg.voiceId
         }
@@ -589,11 +667,14 @@ actor TalkModeRuntime {
         let hasApiKey = (cfg.apiKey?.isEmpty == false)
         let voiceLabel = (cfg.voiceId?.isEmpty == false) ? cfg.voiceId! : "none"
         let modelLabel = (cfg.modelId?.isEmpty == false) ? cfg.modelId! : "none"
-        self.logger.info("talk config voiceId=\(voiceLabel, privacy: .public) modelId=\(modelLabel, privacy: .public) apiKey=\(hasApiKey, privacy: .public) interrupt=\(cfg.interruptOnSpeech, privacy: .public)")
+        self.logger
+            .info(
+                "talk config voiceId=\(voiceLabel, privacy: .public) modelId=\(modelLabel, privacy: .public) apiKey=\(hasApiKey, privacy: .public) interrupt=\(cfg.interruptOnSpeech, privacy: .public)")
     }

     private struct TalkRuntimeConfig {
         let voiceId: String?
+        let voiceAliases: [String: String]
         let modelId: String?
         let outputFormat: String?
         let interruptOnSpeech: Bool
@@ -618,6 +699,14 @@ actor TalkModeRuntime {
             AppStateStore.shared.seamColorHex = rawSeam.isEmpty ? nil : rawSeam
         }
         let voice = talk?["voiceId"]?.stringValue
+        let rawAliases = talk?["voiceAliases"]?.dictionaryValue
+        let resolvedAliases: [String: String] =
+            rawAliases?.reduce(into: [:]) { acc, entry in
+                let key = entry.key.trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
+                let value = entry.value.stringValue?.trimmingCharacters(in: .whitespacesAndNewlines) ?? ""
+                guard !key.isEmpty, !value.isEmpty else { return }
+                acc[key] = value
+            } ?? [:]
         let model = talk?["modelId"]?.stringValue
         let outputFormat = talk?["outputFormat"]?.stringValue
         let interrupt = talk?["interruptOnSpeech"]?.boolValue
@@ -631,6 +720,7 @@ actor TalkModeRuntime {
             (apiKey?.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty == false ? apiKey : nil)
         return TalkRuntimeConfig(
             voiceId: resolvedVoice,
+            voiceAliases: resolvedAliases,
             modelId: model,
             outputFormat: outputFormat,
             interruptOnSpeech: interrupt ?? true,
@@ -642,6 +732,7 @@ actor TalkModeRuntime {
         let resolvedApiKey = envApiKey?.isEmpty == false ? envApiKey : nil
         return TalkRuntimeConfig(
             voiceId: resolvedVoice,
+            voiceAliases: [:],
             modelId: nil,
             outputFormat: nil,
             interruptOnSpeech: true,
@@ -652,7 +743,7 @@ actor TalkModeRuntime {
     // MARK: - Audio level handling

     private func noteAudioLevel(rms: Double) async {
-        if self.phase != .listening && self.phase != .speaking { return }
+        if self.phase != .listening, self.phase != .speaking { return }
         let alpha: Double = rms < self.noiseFloorRMS ? 0.08 : 0.01
         self.noiseFloorRMS = max(1e-7, self.noiseFloorRMS + (rms - self.noiseFloorRMS) * alpha)

@@ -731,7 +822,7 @@ actor TalkModeRuntime {

     private static func validatedSeed(_ value: Int?, logger: Logger) -> UInt32? {
         guard let value else { return nil }
-        if value < 0 || value > 4294967295 {
+        if value < 0 || value > 4_294_967_295 {
             logger.warning("talk seed out of range: \(value, privacy: .public)")
             return nil
         }
@@ -747,5 +838,4 @@ actor TalkModeRuntime {
         }
         return normalized
     }
-
 }
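
For illustration, a minimal standalone sketch of the alias resolution this diff introduces (the alias names and voice IDs below are invented, not values from the repository): the map is loaded from the talk config's voiceAliases dictionary with keys lowercased and trimmed, a requested voice is first looked up case-insensitively in the map, then accepted if it is already one of the mapped IDs, and finally accepted only if it plausibly looks like a raw voice ID.

import Foundation

// Standalone sketch mirroring the rules in resolveVoiceAlias/isLikelyVoiceId above.
// The alias map and the IDs are hypothetical examples.
func resolveAlias(_ value: String?, in aliases: [String: String]) -> String? {
    let trimmed = (value ?? "").trimmingCharacters(in: .whitespacesAndNewlines)
    guard !trimmed.isEmpty else { return nil }
    // 1. Case-insensitive alias lookup; keys were lowercased when the config was loaded.
    if let mapped = aliases[trimmed.lowercased()] { return mapped }
    // 2. Accept a value that is already one of the mapped voice IDs.
    if aliases.values.contains(where: { $0.caseInsensitiveCompare(trimmed) == .orderedSame }) {
        return trimmed
    }
    // 3. Otherwise accept only strings that plausibly look like a raw voice ID:
    //    at least 10 characters, alphanumerics plus "-" or "_".
    let isLongEnough = trimmed.count >= 10
    let hasOnlyIdCharacters = trimmed.allSatisfy { $0.isLetter || $0.isNumber || $0 == "-" || $0 == "_" }
    return (isLongEnough && hasOnlyIdCharacters) ? trimmed : nil
}

let aliases = ["narrator": "AbCdEfGh12345"] // hypothetical talk.voiceAliases entry
print(resolveAlias("Narrator", in: aliases) ?? "nil")      // AbCdEfGh12345 (case-insensitive alias hit)
print(resolveAlias("AbCdEfGh12345", in: aliases) ?? "nil") // AbCdEfGh12345 (already a mapped ID)
print(resolveAlias("bob", in: aliases) ?? "nil")           // nil (unknown alias; the runtime logs a warning)

With this in place, a directive or config can request a friendly name such as "narrator" instead of carrying the raw provider voice ID, and unrecognized names are logged as unknown aliases rather than being passed through to the TTS client.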