import AVFoundation
import ClawdbotChatUI
import ClawdbotKit
import Foundation
import OSLog
import Speech

actor TalkModeRuntime {
    static let shared = TalkModeRuntime()

    private let logger = Logger(subsystem: "com.clawdbot", category: "talk.runtime")
    private let ttsLogger = Logger(subsystem: "com.clawdbot", category: "talk.tts")

    private static let defaultModelIdFallback = "eleven_v3"

    private final class RMSMeter: @unchecked Sendable {
        private let lock = NSLock()
        private var latestRMS: Double = 0

        func set(_ rms: Double) {
            self.lock.lock()
            self.latestRMS = rms
            self.lock.unlock()
        }

        func get() -> Double {
            self.lock.lock()
            let value = self.latestRMS
            self.lock.unlock()
            return value
        }
    }
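
    // Note: the AVAudioEngine input tap fires on a realtime audio thread, outside
    // actor isolation. RMSMeter (above) is a lock-guarded handoff box: the tap
    // writes the latest RMS into it and the ticker task polls it from the actor.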

    private var recognizer: SFSpeechRecognizer?
    private var audioEngine: AVAudioEngine?
    private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest?
    private var recognitionTask: SFSpeechRecognitionTask?
    private var recognitionGeneration: Int = 0
    private var rmsTask: Task<Void, Never>?
    private let rmsMeter = RMSMeter()
    private var captureTask: Task<Void, Never>?
    private var silenceTask: Task<Void, Never>?
    private var phase: TalkModePhase = .idle
    private var isEnabled = false
    private var isPaused = false
    private var lifecycleGeneration: Int = 0
    private var lastHeard: Date?
    private var noiseFloorRMS: Double = 1e-4
    private var lastTranscript: String = ""
    private var lastSpeechEnergyAt: Date?
    private var defaultVoiceId: String?
    private var currentVoiceId: String?
    private var defaultModelId: String?
    private var currentModelId: String?
    private var voiceOverrideActive = false
    private var modelOverrideActive = false
    private var defaultOutputFormat: String?
    private var interruptOnSpeech: Bool = true
    private var lastInterruptedAtSeconds: Double?
    private var voiceAliases: [String: String] = [:]
    private var lastSpokenText: String?
    private var apiKey: String?
    private var fallbackVoiceId: String?
    private var lastPlaybackWasPCM: Bool = false

    private let silenceWindow: TimeInterval = 0.7
    private let minSpeechRMS: Double = 1e-3
    private let speechBoostFactor: Double = 6.0

    // MARK: - Lifecycle

    func setEnabled(_ enabled: Bool) async {
        guard enabled != self.isEnabled else { return }
        self.isEnabled = enabled
        self.lifecycleGeneration &+= 1
        if enabled {
            await self.start()
        } else {
            await self.stop()
        }
    }

    func setPaused(_ paused: Bool) async {
        guard paused != self.isPaused else { return }
        self.isPaused = paused
        await MainActor.run { TalkModeController.shared.updateLevel(0) }
        guard self.isEnabled else { return }
        if paused {
            self.lastTranscript = ""
            self.lastHeard = nil
            self.lastSpeechEnergyAt = nil
            await self.stopRecognition()
            return
        }
        if self.phase == .idle || self.phase == .listening {
            await self.startRecognition()
            self.phase = .listening
            await MainActor.run { TalkModeController.shared.updatePhase(.listening) }
            self.startSilenceMonitor()
        }
    }

    private func isCurrent(_ generation: Int) -> Bool {
        generation == self.lifecycleGeneration && self.isEnabled
    }

    private func start() async {
        let gen = self.lifecycleGeneration
        guard voiceWakeSupported else { return }
        guard PermissionManager.voiceWakePermissionsGranted() else {
            self.logger.debug("talk runtime not starting: permissions missing")
            return
        }
        await self.reloadConfig()
        guard self.isCurrent(gen) else { return }
        if self.isPaused {
            self.phase = .idle
            await MainActor.run {
                TalkModeController.shared.updateLevel(0)
                TalkModeController.shared.updatePhase(.idle)
            }
            return
        }
        await self.startRecognition()
        guard self.isCurrent(gen) else { return }
        self.phase = .listening
        await MainActor.run { TalkModeController.shared.updatePhase(.listening) }
        self.startSilenceMonitor()
    }

    private func stop() async {
        self.captureTask?.cancel()
        self.captureTask = nil
        self.silenceTask?.cancel()
        self.silenceTask = nil
        // Stop audio before changing phase (stopSpeaking is gated on .speaking).
        await self.stopSpeaking(reason: .manual)
        self.lastTranscript = ""
        self.lastHeard = nil
        self.lastSpeechEnergyAt = nil
        self.phase = .idle
        await self.stopRecognition()
        await MainActor.run {
            TalkModeController.shared.updateLevel(0)
            TalkModeController.shared.updatePhase(.idle)
        }
    }

    // MARK: - Speech recognition

    private struct RecognitionUpdate {
        let transcript: String?
        let hasConfidence: Bool
        let isFinal: Bool
        let errorDescription: String?
        let generation: Int
    }
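
    // Recognition results arrive on Speech framework threads and re-enter the
    // actor through handleRecognition. Each update carries the generation it was
    // started under; stopRecognition bumps recognitionGeneration, so callbacks
    // from a torn-down recognizer are dropped on arrival. lifecycleGeneration
    // plays the same role for work that may outlive an enable/disable cycle.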

    private func startRecognition() async {
        await self.stopRecognition()
        self.recognitionGeneration &+= 1
        let generation = self.recognitionGeneration
        let locale = await MainActor.run { AppStateStore.shared.voiceWakeLocaleID }
        self.recognizer = SFSpeechRecognizer(locale: Locale(identifier: locale))
        guard let recognizer, recognizer.isAvailable else {
            self.logger.error("talk recognizer unavailable")
            return
        }
        self.recognitionRequest = SFSpeechAudioBufferRecognitionRequest()
        self.recognitionRequest?.shouldReportPartialResults = true
        guard let request = self.recognitionRequest else { return }
        if self.audioEngine == nil {
            self.audioEngine = AVAudioEngine()
        }
        guard let audioEngine = self.audioEngine else { return }
        let input = audioEngine.inputNode
        let format = input.outputFormat(forBus: 0)
        input.removeTap(onBus: 0)
        let meter = self.rmsMeter
        input.installTap(onBus: 0, bufferSize: 2048, format: format) { [weak request, meter] buffer, _ in
            request?.append(buffer)
            if let rms = Self.rmsLevel(buffer: buffer) {
                meter.set(rms)
            }
        }
        audioEngine.prepare()
        do {
            try audioEngine.start()
        } catch {
            self.logger.error("talk audio engine start failed: \(error.localizedDescription, privacy: .public)")
            return
        }
        self.startRMSTicker(meter: meter)
        self.recognitionTask = recognizer.recognitionTask(with: request) { [weak self, generation] result, error in
            guard let self else { return }
            let segments = result?.bestTranscription.segments ?? []
            let transcript = result?.bestTranscription.formattedString
            let update = RecognitionUpdate(
                transcript: transcript,
                hasConfidence: segments.contains { $0.confidence > 0.6 },
                isFinal: result?.isFinal ?? false,
                errorDescription: error?.localizedDescription,
                generation: generation)
            Task { await self.handleRecognition(update) }
        }
    }

    private func stopRecognition() async {
        self.recognitionGeneration &+= 1
        self.recognitionTask?.cancel()
        self.recognitionTask = nil
        self.recognitionRequest?.endAudio()
        self.recognitionRequest = nil
        self.audioEngine?.inputNode.removeTap(onBus: 0)
        self.audioEngine?.stop()
        self.audioEngine = nil
        self.recognizer = nil
        self.rmsTask?.cancel()
        self.rmsTask = nil
    }

    private func startRMSTicker(meter: RMSMeter) {
        self.rmsTask?.cancel()
        self.rmsTask = Task { [weak self, meter] in
            while let self {
                try? await Task.sleep(nanoseconds: 50_000_000)
                if Task.isCancelled { return }
                await self.noteAudioLevel(rms: meter.get())
            }
        }
    }

    private func handleRecognition(_ update: RecognitionUpdate) async {
        guard update.generation == self.recognitionGeneration else { return }
        guard !self.isPaused else { return }
        if let errorDescription = update.errorDescription {
            self.logger.debug("talk recognition error: \(errorDescription, privacy: .public)")
        }
        guard let transcript = update.transcript else { return }
        let trimmed = transcript.trimmingCharacters(in: .whitespacesAndNewlines)
        if self.phase == .speaking, self.interruptOnSpeech {
            if await self.shouldInterrupt(transcript: trimmed, hasConfidence: update.hasConfidence) {
                await self.stopSpeaking(reason: .speech)
                self.lastTranscript = ""
                self.lastHeard = nil
                await self.startListening()
            }
            return
        }
        guard self.phase == .listening else { return }
        if !trimmed.isEmpty {
            self.lastTranscript = trimmed
            self.lastHeard = Date()
        }
        if update.isFinal {
            self.lastTranscript = trimmed
        }
    }

    // MARK: - Silence handling

    private func startSilenceMonitor() {
        self.silenceTask?.cancel()
        self.silenceTask = Task { [weak self] in
            await self?.silenceLoop()
        }
    }

    private func silenceLoop() async {
        while self.isEnabled {
            try? await Task.sleep(nanoseconds: 200_000_000)
            await self.checkSilence()
        }
    }
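
    // End-of-utterance detection: partial results refresh lastHeard, and the
    // 200 ms loop above finalizes the pending transcript once silenceWindow
    // (0.7 s) elapses with no new speech. checkSilence below is the gate.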

    private func checkSilence() async {
        guard !self.isPaused else { return }
        guard self.phase == .listening else { return }
        let transcript = self.lastTranscript.trimmingCharacters(in: .whitespacesAndNewlines)
        guard !transcript.isEmpty else { return }
        guard let lastHeard else { return }
        let elapsed = Date().timeIntervalSince(lastHeard)
        guard elapsed >= self.silenceWindow else { return }
        await self.finalizeTranscript(transcript)
    }

    private func startListening() async {
        self.phase = .listening
        self.lastTranscript = ""
        self.lastHeard = nil
        await MainActor.run {
            TalkModeController.shared.updatePhase(.listening)
            TalkModeController.shared.updateLevel(0)
        }
    }

    private func finalizeTranscript(_ text: String) async {
        self.lastTranscript = ""
        self.lastHeard = nil
        self.phase = .thinking
        await MainActor.run { TalkModeController.shared.updatePhase(.thinking) }
        await self.stopRecognition()
        await self.sendAndSpeak(text)
    }

    // MARK: - Gateway + TTS

    private func sendAndSpeak(_ transcript: String) async {
        let gen = self.lifecycleGeneration
        await self.reloadConfig()
        guard self.isCurrent(gen) else { return }
        let prompt = self.buildPrompt(transcript: transcript)
        let activeSessionKey = await MainActor.run { WebChatManager.shared.activeSessionKey }
        let sessionKey: String = if let activeSessionKey {
            activeSessionKey
        } else {
            await GatewayConnection.shared.mainSessionKey()
        }
        let runId = UUID().uuidString
        let startedAt = Date().timeIntervalSince1970
        self.logger.info(
            "talk send start runId=\(runId, privacy: .public) " +
                "session=\(sessionKey, privacy: .public) " +
                "chars=\(prompt.count, privacy: .public)")
        do {
            let response = try await GatewayConnection.shared.chatSend(
                sessionKey: sessionKey,
                message: prompt,
                thinking: "low",
                idempotencyKey: runId,
                attachments: [])
            guard self.isCurrent(gen) else { return }
            self.logger.info(
                "talk chat.send ok runId=\(response.runId, privacy: .public) " +
                    "session=\(sessionKey, privacy: .public)")
            guard let assistantText = await self.waitForAssistantText(
                sessionKey: sessionKey,
                since: startedAt,
                timeoutSeconds: 45)
            else {
                self.logger.warning("talk assistant text missing after timeout")
                await self.startListening()
                await self.startRecognition()
                return
            }
            guard self.isCurrent(gen) else { return }
            self.logger.info("talk assistant text len=\(assistantText.count, privacy: .public)")
            await self.playAssistant(text: assistantText)
            guard self.isCurrent(gen) else { return }
            await self.resumeListeningIfNeeded()
            return
        } catch {
            self.logger.error("talk chat.send failed: \(error.localizedDescription, privacy: .public)")
            await self.resumeListeningIfNeeded()
            return
        }
    }

    private func resumeListeningIfNeeded() async {
        if self.isPaused {
            self.lastTranscript = ""
            self.lastHeard = nil
            self.lastSpeechEnergyAt = nil
            await MainActor.run { TalkModeController.shared.updateLevel(0) }
            return
        }
        await self.startListening()
        await self.startRecognition()
    }

    private func buildPrompt(transcript: String) -> String {
        let interrupted = self.lastInterruptedAtSeconds
        self.lastInterruptedAtSeconds = nil
        return TalkPromptBuilder.build(transcript: transcript, interruptedAtSeconds: interrupted)
    }

    private func waitForAssistantText(
        sessionKey: String,
        since: Double,
        timeoutSeconds: Int) async -> String?
    {
        let deadline = Date().addingTimeInterval(TimeInterval(timeoutSeconds))
        while Date() < deadline {
            if let text = await self.latestAssistantText(sessionKey: sessionKey, since: since) {
                return text
            }
            try? await Task.sleep(nanoseconds: 300_000_000)
        }
        return nil
    }
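
    // The assistant reply is recovered by polling chat history every 300 ms until
    // the timeout, taking the newest assistant message stamped after the send
    // (presumably because the gateway offers no completion callback for a run).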

    private func latestAssistantText(sessionKey: String, since: Double? = nil) async -> String? {
        do {
            let history = try await GatewayConnection.shared.chatHistory(sessionKey: sessionKey)
            let messages = history.messages ?? []
            let decoded: [ClawdbotChatMessage] = messages.compactMap { item in
                guard let data = try? JSONEncoder().encode(item) else { return nil }
                return try? JSONDecoder().decode(ClawdbotChatMessage.self, from: data)
            }
            let assistant = decoded.last { message in
                guard message.role == "assistant" else { return false }
                guard let since else { return true }
                guard let timestamp = message.timestamp else { return false }
                return TalkHistoryTimestamp.isAfter(timestamp, sinceSeconds: since)
            }
            guard let assistant else { return nil }
            let text = assistant.content.compactMap(\.text).joined(separator: "\n")
            let trimmed = text.trimmingCharacters(in: CharacterSet.whitespacesAndNewlines)
            return trimmed.isEmpty ? nil : trimmed
        } catch {
            self.logger.error("talk history fetch failed: \(error.localizedDescription, privacy: .public)")
            return nil
        }
    }

    private func playAssistant(text: String) async {
        guard let input = await self.preparePlaybackInput(text: text) else { return }
        do {
            if let apiKey = input.apiKey, !apiKey.isEmpty, let voiceId = input.voiceId {
                try await self.playElevenLabs(input: input, apiKey: apiKey, voiceId: voiceId)
            } else {
                try await self.playSystemVoice(input: input)
            }
        } catch {
            self.ttsLogger.error(
                "talk TTS failed: \(error.localizedDescription, privacy: .public); " +
                    "falling back to system voice")
            do {
                try await self.playSystemVoice(input: input)
            } catch {
                self.ttsLogger.error("talk system voice failed: \(error.localizedDescription, privacy: .public)")
            }
        }
        if self.phase == .speaking {
            self.phase = .thinking
            await MainActor.run { TalkModeController.shared.updatePhase(.thinking) }
        }
    }

    private struct TalkPlaybackInput {
        let generation: Int
        let cleanedText: String
        let directive: TalkDirective?
        let apiKey: String?
        let voiceId: String?
        let language: String?
        let synthTimeoutSeconds: Double
    }

    private func preparePlaybackInput(text: String) async -> TalkPlaybackInput? {
        let gen = self.lifecycleGeneration
        let parse = TalkDirectiveParser.parse(text)
        let directive = parse.directive
        let cleaned = parse.stripped.trimmingCharacters(in: .whitespacesAndNewlines)
        guard !cleaned.isEmpty else { return nil }
        guard self.isCurrent(gen) else { return nil }
        if !parse.unknownKeys.isEmpty {
            self.logger.warning(
                "talk directive ignored keys: " +
                    "\(parse.unknownKeys.joined(separator: ","), privacy: .public)")
        }
        let requestedVoice = directive?.voiceId?.trimmingCharacters(in: .whitespacesAndNewlines)
        let resolvedVoice = self.resolveVoiceAlias(requestedVoice)
        if let requestedVoice, !requestedVoice.isEmpty, resolvedVoice == nil {
            self.logger.warning("talk unknown voice alias \(requestedVoice, privacy: .public)")
        }
        if let voice = resolvedVoice {
            if directive?.once == true {
                self.logger.info("talk voice override (once) voiceId=\(voice, privacy: .public)")
            } else {
                self.currentVoiceId = voice
                self.voiceOverrideActive = true
                self.logger.info("talk voice override voiceId=\(voice, privacy: .public)")
            }
        }
        if let model = directive?.modelId {
            if directive?.once == true {
                self.logger.info("talk model override (once) modelId=\(model, privacy: .public)")
            } else {
                self.currentModelId = model
                self.modelOverrideActive = true
            }
        }
        let apiKey = self.apiKey?.trimmingCharacters(in: .whitespacesAndNewlines)
        let preferredVoice = resolvedVoice ?? self.currentVoiceId ?? self.defaultVoiceId
        let language = ElevenLabsTTSClient.validatedLanguage(directive?.language)
        let voiceId: String? = if let apiKey, !apiKey.isEmpty {
            await self.resolveVoiceId(preferred: preferredVoice, apiKey: apiKey)
        } else {
            nil
        }
        if apiKey?.isEmpty != false {
            self.ttsLogger.warning("talk missing ELEVENLABS_API_KEY; falling back to system voice")
        } else if voiceId == nil {
            self.ttsLogger.warning("talk missing voiceId; falling back to system voice")
        } else if let voiceId {
            self.ttsLogger.info(
                "talk TTS request voiceId=\(voiceId, privacy: .public) " +
                    "chars=\(cleaned.count, privacy: .public)")
        }
        self.lastSpokenText = cleaned
        let synthTimeoutSeconds = max(20.0, min(90.0, Double(cleaned.count) * 0.12))
        guard self.isCurrent(gen) else { return nil }
        return TalkPlaybackInput(
            generation: gen,
            cleanedText: cleaned,
            directive: directive,
            apiKey: apiKey,
            voiceId: voiceId,
            language: language,
            synthTimeoutSeconds: synthTimeoutSeconds)
    }

    private func playElevenLabs(input: TalkPlaybackInput, apiKey: String, voiceId: String) async throws {
        let desiredOutputFormat = input.directive?.outputFormat ?? self.defaultOutputFormat ?? "pcm_44100"
        let outputFormat = ElevenLabsTTSClient.validatedOutputFormat(desiredOutputFormat)
        if outputFormat == nil, !desiredOutputFormat.isEmpty {
            self.logger.warning(
                "talk output_format unsupported for local playback: " +
                    "\(desiredOutputFormat, privacy: .public)")
        }
        let modelId = input.directive?.modelId ?? self.currentModelId ?? self.defaultModelId

        func makeRequest(outputFormat: String?) -> ElevenLabsTTSRequest {
            ElevenLabsTTSRequest(
                text: input.cleanedText,
                modelId: modelId,
                outputFormat: outputFormat,
                speed: TalkTTSValidation.resolveSpeed(
                    speed: input.directive?.speed,
                    rateWPM: input.directive?.rateWPM),
                stability: TalkTTSValidation.validatedStability(
                    input.directive?.stability,
                    modelId: modelId),
                similarity: TalkTTSValidation.validatedUnit(input.directive?.similarity),
                style: TalkTTSValidation.validatedUnit(input.directive?.style),
                speakerBoost: input.directive?.speakerBoost,
                seed: TalkTTSValidation.validatedSeed(input.directive?.seed),
                normalize: ElevenLabsTTSClient.validatedNormalize(input.directive?.normalize),
                language: input.language,
                latencyTier: TalkTTSValidation.validatedLatencyTier(input.directive?.latencyTier))
        }

        let request = makeRequest(outputFormat: outputFormat)
        self.ttsLogger.info("talk TTS synth timeout=\(input.synthTimeoutSeconds, privacy: .public)s")
        let client = ElevenLabsTTSClient(apiKey: apiKey)
        let stream = client.streamSynthesize(voiceId: voiceId, request: request)
        guard self.isCurrent(input.generation) else { return }
        if self.interruptOnSpeech {
            guard await self.prepareForPlayback(generation: input.generation) else { return }
        }
        await MainActor.run { TalkModeController.shared.updatePhase(.speaking) }
        self.phase = .speaking
        let result = await self.playRemoteStream(
            client: client,
            voiceId: voiceId,
            outputFormat: outputFormat,
            makeRequest: makeRequest,
            stream: stream)
        self.ttsLogger.info(
            "talk audio result finished=\(result.finished, privacy: .public) " +
                "interruptedAt=\(String(describing: result.interruptedAt), privacy: .public)")
        if !result.finished, result.interruptedAt == nil {
            throw NSError(domain: "StreamingAudioPlayer", code: 1, userInfo: [
                NSLocalizedDescriptionKey: "audio playback failed",
            ])
        }
        if !result.finished, let interruptedAt = result.interruptedAt, self.phase == .speaking {
            if self.interruptOnSpeech {
                self.lastInterruptedAtSeconds = interruptedAt
            }
        }
    }

    private func playRemoteStream(
        client: ElevenLabsTTSClient,
        voiceId: String,
        outputFormat: String?,
        makeRequest: (String?) -> ElevenLabsTTSRequest,
        stream: AsyncThrowingStream<Data, Error>) async -> StreamingPlaybackResult
    {
        let sampleRate = TalkTTSValidation.pcmSampleRate(from: outputFormat)
        if let sampleRate {
            self.lastPlaybackWasPCM = true
            let result = await self.playPCM(stream: stream, sampleRate: sampleRate)
            if result.finished || result.interruptedAt != nil {
                return result
            }
            // PCM playback failed outright; re-synthesize as MP3 and retry.
            let mp3Format = ElevenLabsTTSClient.validatedOutputFormat("mp3_44100")
            self.ttsLogger.warning("talk pcm playback failed; retrying mp3")
            self.lastPlaybackWasPCM = false
            let mp3Stream = client.streamSynthesize(
                voiceId: voiceId,
                request: makeRequest(mp3Format))
            return await self.playMP3(stream: mp3Stream)
        }
        self.lastPlaybackWasPCM = false
        return await self.playMP3(stream: stream)
    }

    private func playSystemVoice(input: TalkPlaybackInput) async throws {
        self.ttsLogger.info("talk system voice start chars=\(input.cleanedText.count, privacy: .public)")
        if self.interruptOnSpeech {
            guard await self.prepareForPlayback(generation: input.generation) else { return }
        }
        await MainActor.run { TalkModeController.shared.updatePhase(.speaking) }
        self.phase = .speaking
        await TalkSystemSpeechSynthesizer.shared.stop()
        try await TalkSystemSpeechSynthesizer.shared.speak(
            text: input.cleanedText,
            language: input.language)
        self.ttsLogger.info("talk system voice done")
    }

    private func prepareForPlayback(generation: Int) async -> Bool {
        await self.startRecognition()
        return self.isCurrent(generation)
    }

    private func resolveVoiceId(preferred: String?, apiKey: String) async -> String? {
        let trimmed = preferred?.trimmingCharacters(in: .whitespacesAndNewlines) ?? ""
        if !trimmed.isEmpty {
            if let resolved = self.resolveVoiceAlias(trimmed) {
                return resolved
            }
            self.ttsLogger.warning("talk unknown voice alias \(trimmed, privacy: .public)")
        }
        if let fallbackVoiceId {
            return fallbackVoiceId
        }
        do {
            let voices = try await ElevenLabsTTSClient(apiKey: apiKey).listVoices()
            guard let first = voices.first else {
                self.ttsLogger.error("elevenlabs voices list empty")
                return nil
            }
            self.fallbackVoiceId = first.voiceId
            if self.defaultVoiceId == nil {
                self.defaultVoiceId = first.voiceId
            }
            if !self.voiceOverrideActive {
                self.currentVoiceId = first.voiceId
            }
            let name = first.name ?? "unknown"
            self.ttsLogger.info("talk default voice selected \(name, privacy: .public) (\(first.voiceId, privacy: .public))")
            return first.voiceId
        } catch {
            self.ttsLogger.error("elevenlabs list voices failed: \(error.localizedDescription, privacy: .public)")
            return nil
        }
    }

    private func resolveVoiceAlias(_ value: String?) -> String? {
        let trimmed = (value ?? "").trimmingCharacters(in: .whitespacesAndNewlines)
        guard !trimmed.isEmpty else { return nil }
        let normalized = trimmed.lowercased()
        if let mapped = self.voiceAliases[normalized] {
            return mapped
        }
        if self.voiceAliases.values.contains(where: { $0.caseInsensitiveCompare(trimmed) == .orderedSame }) {
            return trimmed
        }
        return Self.isLikelyVoiceId(trimmed) ? trimmed : nil
    }

    private static func isLikelyVoiceId(_ value: String) -> Bool {
        guard value.count >= 10 else { return false }
        return value.allSatisfy { $0.isLetter || $0.isNumber || $0 == "-" || $0 == "_" }
    }

    func stopSpeaking(reason: TalkStopReason) async {
        let usePCM = self.lastPlaybackWasPCM
        let interruptedAt = usePCM ? await self.stopPCM() : await self.stopMP3()
        // Stop the other player as well in case a fallback stream took over.
        _ = usePCM ? await self.stopMP3() : await self.stopPCM()
        await TalkSystemSpeechSynthesizer.shared.stop()
        guard self.phase == .speaking else { return }
        if reason == .speech, let interruptedAt {
            self.lastInterruptedAtSeconds = interruptedAt
        }
        if reason == .manual { return }
        if reason == .speech || reason == .userTap {
            await self.startListening()
            return
        }
        self.phase = .thinking
        await MainActor.run { TalkModeController.shared.updatePhase(.thinking) }
    }
}
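
// Usage sketch (illustrative; the actual call sites live elsewhere in the app):
//
//     Task {
//         await TalkModeRuntime.shared.setEnabled(true)                // start listening
//         await TalkModeRuntime.shared.setPaused(true)                 // mute, keep session
//         await TalkModeRuntime.shared.stopSpeaking(reason: .userTap)  // barge in from UI
//     }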

extension TalkModeRuntime {
    // MARK: - Audio playback (MainActor helpers)

    @MainActor
    private func playPCM(
        stream: AsyncThrowingStream<Data, Error>,
        sampleRate: Double) async -> StreamingPlaybackResult
    {
        await PCMStreamingAudioPlayer.shared.play(stream: stream, sampleRate: sampleRate)
    }

    @MainActor
    private func playMP3(stream: AsyncThrowingStream<Data, Error>) async -> StreamingPlaybackResult {
        await StreamingAudioPlayer.shared.play(stream: stream)
    }

    @MainActor
    private func stopPCM() -> Double? {
        PCMStreamingAudioPlayer.shared.stop()
    }

    @MainActor
    private func stopMP3() -> Double? {
        StreamingAudioPlayer.shared.stop()
    }

    // MARK: - Config

    private func reloadConfig() async {
        let cfg = await self.fetchTalkConfig()
        self.defaultVoiceId = cfg.voiceId
        self.voiceAliases = cfg.voiceAliases
        if !self.voiceOverrideActive {
            self.currentVoiceId = cfg.voiceId
        }
        self.defaultModelId = cfg.modelId
        if !self.modelOverrideActive {
            self.currentModelId = cfg.modelId
        }
        self.defaultOutputFormat = cfg.outputFormat
        self.interruptOnSpeech = cfg.interruptOnSpeech
        self.apiKey = cfg.apiKey
        let hasApiKey = (cfg.apiKey?.isEmpty == false)
        let voiceLabel = (cfg.voiceId?.isEmpty == false) ? cfg.voiceId! : "none"
        let modelLabel = (cfg.modelId?.isEmpty == false) ? cfg.modelId! : "none"
        self.logger.info(
            "talk config voiceId=\(voiceLabel, privacy: .public) " +
                "modelId=\(modelLabel, privacy: .public) " +
                "apiKey=\(hasApiKey, privacy: .public) " +
                "interrupt=\(cfg.interruptOnSpeech, privacy: .public)")
    }

    private struct TalkRuntimeConfig {
        let voiceId: String?
        let voiceAliases: [String: String]
        let modelId: String?
        let outputFormat: String?
        let interruptOnSpeech: Bool
        let apiKey: String?
    }

    private func fetchTalkConfig() async -> TalkRuntimeConfig {
        let env = ProcessInfo.processInfo.environment
        let envVoice = env["ELEVENLABS_VOICE_ID"]?.trimmingCharacters(in: .whitespacesAndNewlines)
        let sagVoice = env["SAG_VOICE_ID"]?.trimmingCharacters(in: .whitespacesAndNewlines)
        let envApiKey = env["ELEVENLABS_API_KEY"]?.trimmingCharacters(in: .whitespacesAndNewlines)
        do {
            let snap: ConfigSnapshot = try await GatewayConnection.shared.requestDecoded(
                method: .configGet,
                params: nil,
                timeoutMs: 8000)
            let talk = snap.config?["talk"]?.dictionaryValue
            let ui = snap.config?["ui"]?.dictionaryValue
            let rawSeam = ui?["seamColor"]?.stringValue?.trimmingCharacters(in: .whitespacesAndNewlines) ?? ""
            await MainActor.run {
                AppStateStore.shared.seamColorHex = rawSeam.isEmpty ? nil : rawSeam
            }
            let voice = talk?["voiceId"]?.stringValue
            let rawAliases = talk?["voiceAliases"]?.dictionaryValue
            let resolvedAliases: [String: String] = rawAliases?.reduce(into: [:]) { acc, entry in
                let key = entry.key.trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
                let value = entry.value.stringValue?.trimmingCharacters(in: .whitespacesAndNewlines) ?? ""
                guard !key.isEmpty, !value.isEmpty else { return }
                acc[key] = value
            } ?? [:]
            let model = talk?["modelId"]?.stringValue?.trimmingCharacters(in: .whitespacesAndNewlines)
            let resolvedModel = (model?.isEmpty == false) ? model! : Self.defaultModelIdFallback
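
            // Resolution order, as implemented below: the gateway's talk.voiceId wins
            // over the ELEVENLABS_VOICE_ID and SAG_VOICE_ID environment variables,
            // while for the API key the environment wins over the gateway value.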
            let outputFormat = talk?["outputFormat"]?.stringValue
            let interrupt = talk?["interruptOnSpeech"]?.boolValue
            let apiKey = talk?["apiKey"]?.stringValue
            let resolvedVoice = (voice?.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty == false ? voice : nil)
                ?? (envVoice?.isEmpty == false ? envVoice : nil)
                ?? (sagVoice?.isEmpty == false ? sagVoice : nil)
            let resolvedApiKey = (envApiKey?.isEmpty == false ? envApiKey : nil)
                ?? (apiKey?.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty == false ? apiKey : nil)
            return TalkRuntimeConfig(
                voiceId: resolvedVoice,
                voiceAliases: resolvedAliases,
                modelId: resolvedModel,
                outputFormat: outputFormat,
                interruptOnSpeech: interrupt ?? true,
                apiKey: resolvedApiKey)
        } catch {
            let resolvedVoice = (envVoice?.isEmpty == false ? envVoice : nil)
                ?? (sagVoice?.isEmpty == false ? sagVoice : nil)
            let resolvedApiKey = envApiKey?.isEmpty == false ? envApiKey : nil
            return TalkRuntimeConfig(
                voiceId: resolvedVoice,
                voiceAliases: [:],
                modelId: Self.defaultModelIdFallback,
                outputFormat: nil,
                interruptOnSpeech: true,
                apiKey: resolvedApiKey)
        }
    }

    // MARK: - Audio level handling

    private func noteAudioLevel(rms: Double) async {
        if self.phase != .listening, self.phase != .speaking { return }
        // Asymmetric EMA: the floor adapts quickly downward (alpha 0.08) and
        // slowly upward (0.01) so speech bursts don't inflate the noise floor.
        let alpha: Double = rms < self.noiseFloorRMS ? 0.08 : 0.01
        self.noiseFloorRMS = max(1e-7, self.noiseFloorRMS + (rms - self.noiseFloorRMS) * alpha)
        let threshold = max(self.minSpeechRMS, self.noiseFloorRMS * self.speechBoostFactor)
        if rms >= threshold {
            let now = Date()
            self.lastHeard = now
            self.lastSpeechEnergyAt = now
        }
        if self.phase == .listening {
            let clamped = min(1.0, max(0.0, rms / max(self.minSpeechRMS, threshold)))
            await MainActor.run { TalkModeController.shared.updateLevel(clamped) }
        }
    }

    private static func rmsLevel(buffer: AVAudioPCMBuffer) -> Double? {
        guard let channelData = buffer.floatChannelData?.pointee else { return nil }
        let frameCount = Int(buffer.frameLength)
        guard frameCount > 0 else { return nil }
        // Root-mean-square of the buffer's first channel.
        var sum: Double = 0
        for i in 0..<frameCount {
            let sample = Double(channelData[i])
            sum += sample * sample
        }
        return sqrt(sum / Double(frameCount))
    }

    private func shouldInterrupt(transcript: String, hasConfidence: Bool) async -> Bool {
        let trimmed = transcript.trimmingCharacters(in: .whitespacesAndNewlines)
        guard trimmed.count >= 3 else { return false }
        if self.isLikelyEcho(of: trimmed) { return false }
        let now = Date()
        if let lastSpeechEnergyAt, now.timeIntervalSince(lastSpeechEnergyAt) > 0.35 {
            return false
        }
        return hasConfidence
    }

    private func isLikelyEcho(of transcript: String) -> Bool {
        guard let spoken = self.lastSpokenText?.lowercased(), !spoken.isEmpty else { return false }
        let probe = transcript.lowercased()
        // Any substring of the last spoken text is treated as echo of our own TTS.
        return spoken.contains(probe)
    }

    private static func resolveSpeed(speed: Double?, rateWPM: Int?, logger: Logger) -> Double? {
        // 175 WPM is treated as the 1.0x baseline; resolved values must fall
        // strictly inside (0.5, 2.0) or they are rejected.
        if let rateWPM, rateWPM > 0 {
            let resolved = Double(rateWPM) / 175.0
            if resolved <= 0.5 || resolved >= 2.0 {
                logger.warning("talk rateWPM out of range: \(rateWPM, privacy: .public)")
                return nil
            }
            return resolved
        }
        if let speed {
            if speed <= 0.5 || speed >= 2.0 {
                logger.warning("talk speed out of range: \(speed, privacy: .public)")
                return nil
            }
            return speed
        }
        return nil
    }

    private static func validatedUnit(_ value: Double?, name: String, logger: Logger) -> Double? {
        guard let value else { return nil }
        if value < 0 || value > 1 {
            logger.warning("talk \(name, privacy: .public) out of range: \(value, privacy: .public)")
            return nil
        }
        return value
    }
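
    // Seeds outside the unsigned 32-bit range implied by the UInt32 return type
    // are logged and rejected rather than truncated.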
    private static func validatedSeed(_ value: Int?, logger: Logger) -> UInt32? {
        guard let value else { return nil }
        if value < 0 || value > 4_294_967_295 {
            logger.warning("talk seed out of range: \(value, privacy: .public)")
            return nil
        }
        return UInt32(value)
    }

    private static func validatedNormalize(_ value: String?, logger: Logger) -> String? {
        guard let value else { return nil }
        let normalized = value.trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
        guard ["auto", "on", "off"].contains(normalized) else {
            logger.warning("talk normalize invalid: \(normalized, privacy: .public)")
            return nil
        }
        return normalized
    }
}