From f86772f26ce625314324abba0c7c715913593fc9 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Tue, 30 Dec 2025 07:40:02 +0100 Subject: [PATCH] fix(talk): harden TTS + add system fallback --- CHANGELOG.md | 2 + .../com/steipete/clawdis/node/NodeRuntime.kt | 2 +- .../clawdis/node/ui/TalkOrbOverlay.kt | 4 +- .../clawdis/node/voice/TalkModeManager.kt | 177 ++++++++-- apps/ios/Sources/Model/NodeAppModel.swift | 2 +- apps/ios/Sources/Voice/TalkModeManager.swift | 291 +++++----------- apps/ios/Sources/Voice/TalkOrbOverlay.swift | 6 +- .../Sources/Clawdis/TalkModeRuntime.swift | 316 ++++++------------ apps/macos/Sources/Clawdis/TalkOverlay.swift | 2 +- .../Sources/Clawdis/TalkOverlayView.swift | 4 +- .../Sources/Clawdis/WebChatManager.swift | 4 + .../Sources/ClawdisKit/ElevenLabsTTS.swift | 233 +++++++++++++ .../Sources/ClawdisKit/TalkDirective.swift | 8 +- .../ClawdisKit/TalkHistoryTimestamp.swift | 13 + .../ClawdisKit/TalkPromptBuilder.swift | 18 + .../TalkSystemSpeechSynthesizer.swift | 110 ++++++ .../ClawdisKit/TalkTTSValidation.swift | 27 ++ .../ElevenLabsTTSValidationTests.swift | 20 ++ .../ClawdisKitTests/TalkDirectiveTests.swift | 12 + .../TalkHistoryTimestampTests.swift | 16 + .../TalkPromptBuilderTests.swift | 16 + .../TalkTTSValidationTests.swift | 24 ++ 22 files changed, 839 insertions(+), 468 deletions(-) create mode 100644 apps/shared/ClawdisKit/Sources/ClawdisKit/ElevenLabsTTS.swift create mode 100644 apps/shared/ClawdisKit/Sources/ClawdisKit/TalkHistoryTimestamp.swift create mode 100644 apps/shared/ClawdisKit/Sources/ClawdisKit/TalkPromptBuilder.swift create mode 100644 apps/shared/ClawdisKit/Sources/ClawdisKit/TalkSystemSpeechSynthesizer.swift create mode 100644 apps/shared/ClawdisKit/Sources/ClawdisKit/TalkTTSValidation.swift create mode 100644 apps/shared/ClawdisKit/Tests/ClawdisKitTests/ElevenLabsTTSValidationTests.swift create mode 100644 apps/shared/ClawdisKit/Tests/ClawdisKitTests/TalkHistoryTimestampTests.swift create mode 100644 apps/shared/ClawdisKit/Tests/ClawdisKitTests/TalkPromptBuilderTests.swift create mode 100644 apps/shared/ClawdisKit/Tests/ClawdisKitTests/TalkTTSValidationTests.swift diff --git a/CHANGELOG.md b/CHANGELOG.md index a4ab83493..2a31b58c8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,8 @@ - macOS Talk Mode: fix audio stop ordering so disabling Talk Mode always stops in-flight playback. - macOS Talk Mode: throttle audio-level updates (avoid per-buffer task creation) to reduce CPU/task churn. - macOS Talk Mode: increase overlay window size so wave rings don’t clip; close button is hover-only and closer to the orb. +- Talk Mode: fall back to system TTS when ElevenLabs is unavailable, returns non-audio, or playback fails (macOS/iOS/Android). +- ElevenLabs: add retry/backoff for 429/5xx and include content-type in errors for debugging. - Talk Mode: align to the gateway’s main session key and fall back to history polling when chat events drop (prevents stuck “thinking” / missing messages). - Talk Mode: treat history timestamps as seconds or milliseconds to avoid stale assistant picks (macOS/iOS/Android). - Chat UI: dedupe identical history messages to avoid duplicate bubbles. 
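For context on the ElevenLabs retry/backoff entry above: the shared client added below (apps/shared/ClawdisKit/Sources/ClawdisKit/ElevenLabsTTS.swift) retries 429 and 5xx responses across up to three attempts, waiting the larger of a fixed schedule and the server's Retry-After header between tries. A minimal sketch of that delay policy, pulled out for review; the retryDelay helper is illustrative only and not part of the patch:

import Foundation

// Delay before retrying a failed attempt (0-based), or nil once retries are exhausted.
// Mirrors the schedule ElevenLabsTTSClient.synthesize uses below.
func retryDelay(attempt: Int, retryAfterHeader: String?) -> TimeInterval? {
    guard attempt < 2 else { return nil }            // three attempts total
    let schedule: [TimeInterval] = [0.25, 0.75, 1.5] // base backoff per attempt
    let retryAfter = retryAfterHeader.flatMap(Double.init) ?? 0
    return max(schedule[attempt], retryAfter)        // honor Retry-After when larger
}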
diff --git a/apps/android/app/src/main/java/com/steipete/clawdis/node/NodeRuntime.kt b/apps/android/app/src/main/java/com/steipete/clawdis/node/NodeRuntime.kt index 7a68abbd3..1b63a4948 100644 --- a/apps/android/app/src/main/java/com/steipete/clawdis/node/NodeRuntime.kt +++ b/apps/android/app/src/main/java/com/steipete/clawdis/node/NodeRuntime.kt @@ -930,7 +930,7 @@ class NodeRuntime(context: Context) { private data class Quad<A, B, C, D>(val first: A, val second: B, val third: C, val fourth: D) -private const val DEFAULT_SEAM_COLOR_ARGB: Long = 0xFF7FB8D4 +private const val DEFAULT_SEAM_COLOR_ARGB: Long = 0xFF4F7A9A private const val a2uiReadyCheckJS: String = """ diff --git a/apps/android/app/src/main/java/com/steipete/clawdis/node/ui/TalkOrbOverlay.kt b/apps/android/app/src/main/java/com/steipete/clawdis/node/ui/TalkOrbOverlay.kt index 11b7a3176..c36dbf8e4 100644 --- a/apps/android/app/src/main/java/com/steipete/clawdis/node/ui/TalkOrbOverlay.kt +++ b/apps/android/app/src/main/java/com/steipete/clawdis/node/ui/TalkOrbOverlay.kt @@ -62,9 +62,9 @@ fun TalkOrbOverlay( verticalArrangement = Arrangement.spacedBy(12.dp), ) { Box(contentAlignment = Alignment.Center) { - Canvas(modifier = Modifier.size(300.dp)) { + Canvas(modifier = Modifier.size(360.dp)) { val center = this.center - val baseRadius = size.minDimension * 0.27f + val baseRadius = size.minDimension * 0.30f val ring1 = 1.05f + (t * 0.25f) val ring2 = 1.20f + (t * 0.55f) diff --git a/apps/android/app/src/main/java/com/steipete/clawdis/node/voice/TalkModeManager.kt b/apps/android/app/src/main/java/com/steipete/clawdis/node/voice/TalkModeManager.kt index e015aafcf..eabd0abbf 100644 --- a/apps/android/app/src/main/java/com/steipete/clawdis/node/voice/TalkModeManager.kt +++ b/apps/android/app/src/main/java/com/steipete/clawdis/node/voice/TalkModeManager.kt @@ -13,6 +13,8 @@ import android.os.SystemClock import android.speech.RecognitionListener import android.speech.RecognizerIntent import android.speech.SpeechRecognizer +import android.speech.tts.TextToSpeech +import android.speech.tts.UtteranceProgressListener import android.util.Log import androidx.core.content.ContextCompat import com.steipete.clawdis.node.bridge.BridgeSession @@ -89,6 +91,9 @@ class TalkModeManager( private var player: MediaPlayer? = null private var currentAudioFile: File? = null + private var systemTts: TextToSpeech? = null + private var systemTtsPending: CompletableDeferred<Unit>? = null + private var systemTtsPendingId: String?
= null fun attachSession(session: BridgeSession) { this.session = session @@ -181,6 +186,10 @@ class TalkModeManager( recognizer?.destroy() recognizer = null } + systemTts?.stop() + systemTtsPending?.cancel() + systemTtsPending = null + systemTtsPendingId = null } private fun startListeningInternal(markListening: Boolean) { @@ -441,16 +450,6 @@ class TalkModeManager( apiKey?.trim()?.takeIf { it.isNotEmpty() } ?: System.getenv("ELEVENLABS_API_KEY")?.trim() val voiceId = directive?.voiceId ?: currentVoiceId ?: defaultVoiceId - if (voiceId.isNullOrBlank()) { - _statusText.value = "Missing voice ID" - Log.w(tag, "missing voiceId") - return - } - if (apiKey.isNullOrEmpty()) { - _statusText.value = "Missing ELEVENLABS_API_KEY" - Log.w(tag, "missing ELEVENLABS_API_KEY") - return - } _statusText.value = "Speaking…" _isSpeaking.value = true @@ -458,28 +457,46 @@ class TalkModeManager( ensureInterruptListener() try { - val ttsStarted = SystemClock.elapsedRealtime() - val request = - ElevenLabsRequest( - text = cleaned, - modelId = directive?.modelId ?: currentModelId ?: defaultModelId, - outputFormat = - TalkModeRuntime.validatedOutputFormat(directive?.outputFormat ?: defaultOutputFormat), - speed = TalkModeRuntime.resolveSpeed(directive?.speed, directive?.rateWpm), - stability = TalkModeRuntime.validatedUnit(directive?.stability), - similarity = TalkModeRuntime.validatedUnit(directive?.similarity), - style = TalkModeRuntime.validatedUnit(directive?.style), - speakerBoost = directive?.speakerBoost, - seed = TalkModeRuntime.validatedSeed(directive?.seed), - normalize = TalkModeRuntime.validatedNormalize(directive?.normalize), - language = TalkModeRuntime.validatedLanguage(directive?.language), - ) - val audio = synthesize(voiceId = voiceId, apiKey = apiKey, request = request) - Log.d(tag, "elevenlabs ok bytes=${audio.size} durMs=${SystemClock.elapsedRealtime() - ttsStarted}") - playAudio(audio) + val canUseElevenLabs = !voiceId.isNullOrBlank() && !apiKey.isNullOrEmpty() + if (!canUseElevenLabs) { + if (voiceId.isNullOrBlank()) { + Log.w(tag, "missing voiceId; falling back to system voice") + } + if (apiKey.isNullOrEmpty()) { + Log.w(tag, "missing ELEVENLABS_API_KEY; falling back to system voice") + } + _statusText.value = "Speaking (System)…" + speakWithSystemTts(cleaned) + } else { + val ttsStarted = SystemClock.elapsedRealtime() + val request = + ElevenLabsRequest( + text = cleaned, + modelId = directive?.modelId ?: currentModelId ?: defaultModelId, + outputFormat = + TalkModeRuntime.validatedOutputFormat(directive?.outputFormat ?: defaultOutputFormat), + speed = TalkModeRuntime.resolveSpeed(directive?.speed, directive?.rateWpm), + stability = TalkModeRuntime.validatedUnit(directive?.stability), + similarity = TalkModeRuntime.validatedUnit(directive?.similarity), + style = TalkModeRuntime.validatedUnit(directive?.style), + speakerBoost = directive?.speakerBoost, + seed = TalkModeRuntime.validatedSeed(directive?.seed), + normalize = TalkModeRuntime.validatedNormalize(directive?.normalize), + language = TalkModeRuntime.validatedLanguage(directive?.language), + ) + val audio = synthesize(voiceId = voiceId!!, apiKey = apiKey!!, request = request) + Log.d(tag, "elevenlabs ok bytes=${audio.size} durMs=${SystemClock.elapsedRealtime() - ttsStarted}") + playAudio(audio) + } } catch (err: Throwable) { - _statusText.value = "Speak failed: ${err.message ?: err::class.simpleName}" - Log.w(tag, "speak failed: ${err.message ?: err::class.simpleName}") + Log.w(tag, "speak failed: ${err.message ?: 
err::class.simpleName}; falling back to system voice") + try { + _statusText.value = "Speaking (System)…" + speakWithSystemTts(cleaned) + } catch (fallbackErr: Throwable) { + _statusText.value = "Speak failed: ${fallbackErr.message ?: fallbackErr::class.simpleName}" + Log.w(tag, "system voice failed: ${fallbackErr.message ?: fallbackErr::class.simpleName}") + } } _isSpeaking.value = false @@ -524,9 +541,103 @@ class TalkModeManager( Log.d(tag, "play done") } + private suspend fun speakWithSystemTts(text: String) { + val trimmed = text.trim() + if (trimmed.isEmpty()) return + val ok = ensureSystemTts() + if (!ok) { + throw IllegalStateException("system TTS unavailable") + } + + val tts = systemTts ?: throw IllegalStateException("system TTS unavailable") + val utteranceId = "talk-${UUID.randomUUID()}" + val deferred = CompletableDeferred<Unit>() + systemTtsPending?.cancel() + systemTtsPending = deferred + systemTtsPendingId = utteranceId + + withContext(Dispatchers.Main) { + val params = Bundle() + tts.speak(trimmed, TextToSpeech.QUEUE_FLUSH, params, utteranceId) + } + + withContext(Dispatchers.IO) { + try { + kotlinx.coroutines.withTimeout(180_000) { deferred.await() } + } catch (err: Throwable) { + throw err + } + } + } + + private suspend fun ensureSystemTts(): Boolean { + if (systemTts != null) return true + return withContext(Dispatchers.Main) { + val deferred = CompletableDeferred<Boolean>() + val tts = + try { + TextToSpeech(context) { status -> + deferred.complete(status == TextToSpeech.SUCCESS) + } + } catch (_: Throwable) { + deferred.complete(false) + null + } + if (tts == null) return@withContext false + + tts.setOnUtteranceProgressListener( + object : UtteranceProgressListener() { + override fun onStart(utteranceId: String?) {} + + override fun onDone(utteranceId: String?) { + if (utteranceId == null) return + if (utteranceId != systemTtsPendingId) return + systemTtsPending?.complete(Unit) + systemTtsPending = null + systemTtsPendingId = null + } + + @Deprecated("Deprecated in Java") + override fun onError(utteranceId: String?) { + if (utteranceId == null) return + if (utteranceId != systemTtsPendingId) return + systemTtsPending?.completeExceptionally(IllegalStateException("system TTS error")) + systemTtsPending = null + systemTtsPendingId = null + } + + override fun onError(utteranceId: String?, errorCode: Int) { + if (utteranceId == null) return + if (utteranceId != systemTtsPendingId) return + systemTtsPending?.completeExceptionally(IllegalStateException("system TTS error $errorCode")) + systemTtsPending = null + systemTtsPendingId = null + } + }, + ) + + val ok = + try { + deferred.await() + } catch (_: Throwable) { + false + } + if (ok) { + systemTts = tts + } else { + tts.shutdown() + } + ok + } + } + private fun stopSpeaking(resetInterrupt: Boolean = true) { if (!_isSpeaking.value) { cleanupPlayer() + systemTts?.stop() + systemTtsPending?.cancel() + systemTtsPending = null + systemTtsPendingId = null return } if (resetInterrupt) { @@ -534,6 +645,10 @@ class TalkModeManager( lastInterruptedAtSeconds = currentMs / 1000.0 } cleanupPlayer() + systemTts?.stop() + systemTtsPending?.cancel() + systemTtsPending = null + systemTtsPendingId = null _isSpeaking.value = false } diff --git a/apps/ios/Sources/Model/NodeAppModel.swift b/apps/ios/Sources/Model/NodeAppModel.swift index 805cd7638..189c5c160 100644 --- a/apps/ios/Sources/Model/NodeAppModel.swift +++ b/apps/ios/Sources/Model/NodeAppModel.swift @@ -293,7 +293,7 @@ final class NodeAppModel { Self.color(fromHex: self.seamColorHex) ??
Self.defaultSeamColor } - private static let defaultSeamColor = Color(red: 127 / 255.0, green: 184 / 255.0, blue: 212 / 255.0) + private static let defaultSeamColor = Color(red: 79 / 255.0, green: 122 / 255.0, blue: 154 / 255.0) private static func color(fromHex raw: String?) -> Color? { let trimmed = (raw ?? "").trimmingCharacters(in: .whitespacesAndNewlines) diff --git a/apps/ios/Sources/Voice/TalkModeManager.swift b/apps/ios/Sources/Voice/TalkModeManager.swift index a52774b27..dff45d4d7 100644 --- a/apps/ios/Sources/Voice/TalkModeManager.swift +++ b/apps/ios/Sources/Voice/TalkModeManager.swift @@ -105,6 +105,7 @@ final class TalkModeManager: NSObject { self.stopRecognition() self.stopSpeaking() self.lastInterruptedAtSeconds = nil + TalkSystemSpeechSynthesizer.shared.stop() do { try AVAudioSession.sharedInstance().setActive(false, options: [.notifyOthersOnDeactivation]) } catch { @@ -301,20 +302,9 @@ final class TalkModeManager: NSObject { } private func buildPrompt(transcript: String) -> String { - var lines: [String] = [ - "Talk Mode active. Reply in a concise, spoken tone.", - "You may optionally prefix the response with JSON (first line) to set ElevenLabs voice, e.g. {\"voice\":\"\",\"once\":true}.", - ] - - if let interrupted = self.lastInterruptedAtSeconds { - let formatted = String(format: "%.1f", interrupted) - lines.append("Assistant speech interrupted at \(formatted)s.") - self.lastInterruptedAtSeconds = nil - } - - lines.append("") - lines.append(transcript) - return lines.joined(separator: "\n") + let interrupted = self.lastInterruptedAtSeconds + self.lastInterruptedAtSeconds = nil + return TalkPromptBuilder.build(transcript: transcript, interruptedAtSeconds: interrupted) } private enum ChatCompletionState: CustomStringConvertible { @@ -409,7 +399,7 @@ final class TalkModeManager: NSObject { for msg in messages.reversed() { guard (msg["role"] as? String) == "assistant" else { continue } if let since, let timestamp = msg["timestamp"] as? Double, - TalkModeRuntime.isMessageTimestampAfter(timestamp, sinceSeconds: since) == false + TalkHistoryTimestamp.isAfter(timestamp, sinceSeconds: since) == false { continue } @@ -440,81 +430,91 @@ final class TalkModeManager: NSObject { } } - let voiceId = directive?.voiceId ?? self.currentVoiceId ?? self.defaultVoiceId - guard let voiceId, !voiceId.isEmpty else { - self.statusText = "Missing voice ID" - self.logger.error("missing voiceId") - return - } - - let resolvedKey = - (self.apiKey?.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty == false ? self.apiKey : nil) ?? - ProcessInfo.processInfo.environment["ELEVENLABS_API_KEY"] - guard let apiKey = resolvedKey, !apiKey.isEmpty else { - self.statusText = "Missing ELEVENLABS_API_KEY" - self.logger.error("missing ELEVENLABS_API_KEY") - return - } - self.statusText = "Generating voice…" self.isSpeaking = true self.lastSpokenText = cleaned do { let started = Date() - let desiredOutputFormat = directive?.outputFormat ?? self.defaultOutputFormat - let outputFormat = TalkModeRuntime.validatedOutputFormat(desiredOutputFormat) - if outputFormat == nil, let desiredOutputFormat, !desiredOutputFormat.isEmpty { - self.logger.warning("talk output_format unsupported for local playback: \(desiredOutputFormat, privacy: .public)") - } - let request = ElevenLabsRequest( - text: cleaned, - modelId: directive?.modelId ?? self.currentModelId ?? 
self.defaultModelId, - outputFormat: outputFormat, - speed: TalkModeRuntime.resolveSpeed( - speed: directive?.speed, - rateWPM: directive?.rateWPM), - stability: TalkModeRuntime.validatedUnit(directive?.stability), - similarity: TalkModeRuntime.validatedUnit(directive?.similarity), - style: TalkModeRuntime.validatedUnit(directive?.style), - speakerBoost: directive?.speakerBoost, - seed: TalkModeRuntime.validatedSeed(directive?.seed), - normalize: TalkModeRuntime.validatedNormalize(directive?.normalize), - language: TalkModeRuntime.validatedLanguage(directive?.language)) + let language = ElevenLabsTTSClient.validatedLanguage(directive?.language) - let synthTimeoutSeconds = max(20.0, min(90.0, Double(cleaned.count) * 0.12)) - let client = ElevenLabsClient(apiKey: apiKey) - let audio = try await withThrowingTaskGroup(of: Data.self) { group in - group.addTask { - try await client.synthesize(voiceId: voiceId, request: request) - } - group.addTask { - try await Task.sleep(nanoseconds: UInt64(synthTimeoutSeconds * 1_000_000_000)) - throw NSError(domain: "TalkTTS", code: 408, userInfo: [ - NSLocalizedDescriptionKey: "ElevenLabs TTS timed out after \(synthTimeoutSeconds)s", - ]) - } - let data = try await group.next()! - group.cancelAll() - return data - } - self.logger - .info( - "elevenlabs ok bytes=\(audio.count, privacy: .public) dur=\(Date().timeIntervalSince(started), privacy: .public)s") + let voiceId = (directive?.voiceId ?? self.currentVoiceId ?? self.defaultVoiceId)? + .trimmingCharacters(in: .whitespacesAndNewlines) + let resolvedKey = + (self.apiKey?.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty == false ? self.apiKey : nil) ?? + ProcessInfo.processInfo.environment["ELEVENLABS_API_KEY"] + let apiKey = resolvedKey?.trimmingCharacters(in: .whitespacesAndNewlines) + let canUseElevenLabs = (voiceId?.isEmpty == false) && (apiKey?.isEmpty == false) - if self.interruptOnSpeech { - do { - try self.startRecognition() - } catch { - self.logger.warning("startRecognition during speak failed: \(error.localizedDescription, privacy: .public)") + if canUseElevenLabs, let voiceId, let apiKey { + let desiredOutputFormat = directive?.outputFormat ?? self.defaultOutputFormat + let outputFormat = ElevenLabsTTSClient.validatedOutputFormat(desiredOutputFormat) + if outputFormat == nil, let desiredOutputFormat, !desiredOutputFormat.isEmpty { + self.logger.warning("talk output_format unsupported for local playback: \(desiredOutputFormat, privacy: .public)") } - } - self.statusText = "Speaking…" - try await self.playAudio(data: audio) + let request = ElevenLabsTTSRequest( + text: cleaned, + modelId: directive?.modelId ?? self.currentModelId ?? 
self.defaultModelId, + outputFormat: outputFormat, + speed: TalkTTSValidation.resolveSpeed(speed: directive?.speed, rateWPM: directive?.rateWPM), + stability: TalkTTSValidation.validatedUnit(directive?.stability), + similarity: TalkTTSValidation.validatedUnit(directive?.similarity), + style: TalkTTSValidation.validatedUnit(directive?.style), + speakerBoost: directive?.speakerBoost, + seed: TalkTTSValidation.validatedSeed(directive?.seed), + normalize: ElevenLabsTTSClient.validatedNormalize(directive?.normalize), + language: language) + + let synthTimeoutSeconds = max(20.0, min(90.0, Double(cleaned.count) * 0.12)) + let client = ElevenLabsTTSClient(apiKey: apiKey) + let audio = try await client.synthesizeWithHardTimeout( + voiceId: voiceId, + request: request, + hardTimeoutSeconds: synthTimeoutSeconds) + self.logger + .info( + "elevenlabs ok bytes=\(audio.count, privacy: .public) dur=\(Date().timeIntervalSince(started), privacy: .public)s") + + if self.interruptOnSpeech { + do { + try self.startRecognition() + } catch { + self.logger.warning("startRecognition during speak failed: \(error.localizedDescription, privacy: .public)") + } + } + + self.statusText = "Speaking…" + try await self.playAudio(data: audio) + } else { + self.logger.warning("tts unavailable; falling back to system voice (missing key or voiceId)") + if self.interruptOnSpeech { + do { + try self.startRecognition() + } catch { + self.logger.warning("startRecognition during speak failed: \(error.localizedDescription, privacy: .public)") + } + } + self.statusText = "Speaking (System)…" + try await TalkSystemSpeechSynthesizer.shared.speak(text: cleaned, language: language) + } } catch { - self.statusText = "Speak failed: \(error.localizedDescription)" - self.logger.error("speak failed: \(error.localizedDescription, privacy: .public)") + self.logger.error("tts failed: \(error.localizedDescription, privacy: .public); falling back to system voice") + do { + if self.interruptOnSpeech { + do { + try self.startRecognition() + } catch { + self.logger.warning("startRecognition during speak failed: \(error.localizedDescription, privacy: .public)") + } + } + self.statusText = "Speaking (System)…" + let language = ElevenLabsTTSClient.validatedLanguage(directive?.language) + try await TalkSystemSpeechSynthesizer.shared.speak(text: cleaned, language: language) + } catch { + self.statusText = "Speak failed: \(error.localizedDescription)" + self.logger.error("system voice failed: \(error.localizedDescription, privacy: .public)") + } } self.stopRecognition() @@ -527,7 +527,11 @@ final class TalkModeManager: NSObject { self.player = player player.prepareToPlay() self.logger.info("play start") - player.play() + guard player.play() else { + throw NSError(domain: "TalkMode", code: 2, userInfo: [ + NSLocalizedDescriptionKey: "audio player refused to play", + ]) + } while player.isPlaying { try? 
await Task.sleep(nanoseconds: 120_000_000) } @@ -541,6 +545,7 @@ final class TalkModeManager: NSObject { } self.player?.stop() self.player = nil + TalkSystemSpeechSynthesizer.shared.stop() self.isSpeaking = false } @@ -584,7 +589,7 @@ final class TalkModeManager: NSObject { private static func configureAudioSession() throws { let session = AVAudioSession.sharedInstance() - try session.setCategory(.playAndRecord, mode: .measurement, options: [ + try session.setCategory(.playAndRecord, mode: .voiceChat, options: [ .duckOthers, .mixWithOthers, .allowBluetoothHFP, @@ -609,127 +614,3 @@ final class TalkModeManager: NSObject { } } } - -private struct ElevenLabsRequest { - let text: String - let modelId: String? - let outputFormat: String? - let speed: Double? - let stability: Double? - let similarity: Double? - let style: Double? - let speakerBoost: Bool? - let seed: UInt32? - let normalize: String? - let language: String? -} - -private struct ElevenLabsClient { - let apiKey: String - let baseUrl = URL(string: "https://api.elevenlabs.io")! - - func synthesize(voiceId: String, request: ElevenLabsRequest) async throws -> Data { - var url = self.baseUrl - url.appendPathComponent("v1") - url.appendPathComponent("text-to-speech") - url.appendPathComponent(voiceId) - - var payload: [String: Any] = [ - "text": request.text, - ] - if let modelId = request.modelId, !modelId.isEmpty { - payload["model_id"] = modelId - } - if let outputFormat = request.outputFormat, !outputFormat.isEmpty { - payload["output_format"] = outputFormat - } - if let seed = request.seed { - payload["seed"] = seed - } - if let normalize = request.normalize { - payload["apply_text_normalization"] = normalize - } - if let language = request.language { - payload["language_code"] = language - } - var voiceSettings: [String: Any] = [:] - if let speed = request.speed { voiceSettings["speed"] = speed } - if let stability = request.stability { voiceSettings["stability"] = stability } - if let similarity = request.similarity { voiceSettings["similarity_boost"] = similarity } - if let style = request.style { voiceSettings["style"] = style } - if let speakerBoost = request.speakerBoost { voiceSettings["use_speaker_boost"] = speakerBoost } - if !voiceSettings.isEmpty { payload["voice_settings"] = voiceSettings } - - let body = try JSONSerialization.data(withJSONObject: payload, options: []) - var req = URLRequest(url: url) - req.httpMethod = "POST" - req.httpBody = body - req.timeoutInterval = 45 - req.setValue("application/json", forHTTPHeaderField: "Content-Type") - req.setValue("audio/mpeg", forHTTPHeaderField: "Accept") - req.setValue(self.apiKey, forHTTPHeaderField: "xi-api-key") - - let (data, response) = try await URLSession.shared.data(for: req) - if let http = response as? HTTPURLResponse, http.statusCode >= 400 { - let message = String(data: data, encoding: .utf8) ?? "unknown" - throw NSError(domain: "TalkTTS", code: http.statusCode, userInfo: [ - NSLocalizedDescriptionKey: "ElevenLabs failed: \(http.statusCode) \(message)", - ]) - } - return data - } -} - -private enum TalkModeRuntime { - static func resolveSpeed(speed: Double?, rateWPM: Int?) -> Double? { - if let rateWPM, rateWPM > 0 { - let resolved = Double(rateWPM) / 175.0 - if resolved <= 0.5 || resolved >= 2.0 { return nil } - return resolved - } - if let speed { - if speed <= 0.5 || speed >= 2.0 { return nil } - return speed - } - return nil - } - - static func validatedUnit(_ value: Double?) -> Double? 
{ - guard let value else { return nil } - if value < 0 || value > 1 { return nil } - return value - } - - static func validatedSeed(_ value: Int?) -> UInt32? { - guard let value else { return nil } - if value < 0 || value > 4_294_967_295 { return nil } - return UInt32(value) - } - - static func validatedNormalize(_ value: String?) -> String? { - guard let value else { return nil } - let normalized = value.trimmingCharacters(in: .whitespacesAndNewlines).lowercased() - return ["auto", "on", "off"].contains(normalized) ? normalized : nil - } - - static func validatedLanguage(_ value: String?) -> String? { - guard let value else { return nil } - let normalized = value.trimmingCharacters(in: .whitespacesAndNewlines).lowercased() - guard normalized.count == 2, normalized.allSatisfy({ $0 >= "a" && $0 <= "z" }) else { return nil } - return normalized - } - - static func validatedOutputFormat(_ value: String?) -> String? { - let trimmed = value?.trimmingCharacters(in: .whitespacesAndNewlines) ?? "" - guard !trimmed.isEmpty else { return nil } - return trimmed.hasPrefix("mp3_") ? trimmed : nil - } - - static func isMessageTimestampAfter(_ timestamp: Double, sinceSeconds: Double) -> Bool { - let sinceMs = sinceSeconds * 1000 - if timestamp > 10_000_000_000 { - return timestamp >= sinceMs - 500 - } - return timestamp >= sinceSeconds - 0.5 - } -} diff --git a/apps/ios/Sources/Voice/TalkOrbOverlay.swift b/apps/ios/Sources/Voice/TalkOrbOverlay.swift index 27bfbf4dc..cce8c1c61 100644 --- a/apps/ios/Sources/Voice/TalkOrbOverlay.swift +++ b/apps/ios/Sources/Voice/TalkOrbOverlay.swift @@ -12,14 +12,14 @@ struct TalkOrbOverlay: View { ZStack { Circle() .stroke(seam.opacity(0.26), lineWidth: 2) - .frame(width: 280, height: 280) + .frame(width: 320, height: 320) .scaleEffect(self.pulse ? 1.15 : 0.96) .opacity(self.pulse ? 0.0 : 1.0) .animation(.easeOut(duration: 1.3).repeatForever(autoreverses: false), value: self.pulse) Circle() .stroke(seam.opacity(0.18), lineWidth: 2) - .frame(width: 280, height: 280) + .frame(width: 320, height: 320) .scaleEffect(self.pulse ? 1.45 : 1.02) .opacity(self.pulse ? 0.0 : 0.9) .animation(.easeOut(duration: 1.9).repeatForever(autoreverses: false).delay(0.2), value: self.pulse) @@ -35,7 +35,7 @@ struct TalkOrbOverlay: View { center: .center, startRadius: 1, endRadius: 112)) - .frame(width: 168, height: 168) + .frame(width: 190, height: 190) .overlay( Circle() .stroke(seam.opacity(0.35), lineWidth: 1)) diff --git a/apps/macos/Sources/Clawdis/TalkModeRuntime.swift b/apps/macos/Sources/Clawdis/TalkModeRuntime.swift index 8b2dd7061..97f8ee12f 100644 --- a/apps/macos/Sources/Clawdis/TalkModeRuntime.swift +++ b/apps/macos/Sources/Clawdis/TalkModeRuntime.swift @@ -291,7 +291,9 @@ actor TalkModeRuntime { await self.reloadConfig() guard self.isCurrent(gen) else { return } let prompt = self.buildPrompt(transcript: transcript) - let sessionKey = await GatewayConnection.shared.mainSessionKey() + let sessionKey = + await MainActor.run { WebChatManager.shared.activeSessionKey } ?? + await GatewayConnection.shared.mainSessionKey() let runId = UUID().uuidString let startedAt = Date().timeIntervalSince1970 self.logger.info( @@ -335,20 +337,9 @@ actor TalkModeRuntime { } private func buildPrompt(transcript: String) -> String { - var lines: [String] = [ - "Talk Mode active. Reply in a concise, spoken tone.", - "You may optionally prefix the response with JSON (first line) to set ElevenLabs voice, e.g. 
{\"voice\":\"\",\"once\":true}.", - ] - - if let interrupted = self.lastInterruptedAtSeconds { - let formatted = String(format: "%.1f", interrupted) - lines.append("Assistant speech interrupted at \(formatted)s.") - self.lastInterruptedAtSeconds = nil - } - - lines.append("") - lines.append(transcript) - return lines.joined(separator: "\n") + let interrupted = self.lastInterruptedAtSeconds + self.lastInterruptedAtSeconds = nil + return TalkPromptBuilder.build(transcript: transcript, interruptedAtSeconds: interrupted) } private func waitForAssistantText( @@ -378,7 +369,7 @@ actor TalkModeRuntime { guard message.role == "assistant" else { return false } guard let since else { return true } guard let timestamp = message.timestamp else { return false } - return Self.isMessageTimestampAfter(timestamp, sinceSeconds: since) + return TalkHistoryTimestamp.isAfter(timestamp, sinceSeconds: since) } guard let assistant else { return nil } let text = assistant.content.compactMap { $0.text }.joined(separator: "\n") @@ -421,76 +412,108 @@ actor TalkModeRuntime { } } - guard let apiKey = self.apiKey, !apiKey.isEmpty else { - self.logger.error("talk missing ELEVENLABS_API_KEY") - return - } - + let apiKey = self.apiKey?.trimmingCharacters(in: .whitespacesAndNewlines) let requestedVoice = directive?.voiceId ?? self.currentVoiceId ?? self.defaultVoiceId - guard let voiceId = await self.resolveVoiceId(preferred: requestedVoice, apiKey: apiKey) else { - self.logger.error("talk missing voiceId; set talk.voiceId or ELEVENLABS_VOICE_ID") - return - } - guard self.isCurrent(gen) else { return } - self.ttsLogger.info("talk TTS request voiceId=\(voiceId, privacy: .public) chars=\(cleaned.count, privacy: .public)") - await self.startRecognition() - guard self.isCurrent(gen) else { return } - await MainActor.run { TalkModeController.shared.updatePhase(.speaking) } - self.phase = .speaking + let language = ElevenLabsTTSClient.validatedLanguage(directive?.language) + + let voiceId: String? + if let apiKey, !apiKey.isEmpty { + voiceId = await self.resolveVoiceId(preferred: requestedVoice, apiKey: apiKey) + } else { + voiceId = nil + } + + if apiKey?.isEmpty != false { + self.ttsLogger.warning("talk missing ELEVENLABS_API_KEY; falling back to system voice") + } else if voiceId == nil { + self.ttsLogger.warning("talk missing voiceId; falling back to system voice") + } else if let voiceId { + self.ttsLogger.info("talk TTS request voiceId=\(voiceId, privacy: .public) chars=\(cleaned.count, privacy: .public)") + } self.lastSpokenText = cleaned - let resolvedSpeed = Self.resolveSpeed( - speed: directive?.speed, - rateWPM: directive?.rateWPM, - logger: self.logger) - - let request = ElevenLabsRequest( - text: cleaned, - modelId: directive?.modelId ?? self.currentModelId ?? self.defaultModelId, - outputFormat: Self.validatedOutputFormat(directive?.outputFormat ?? 
self.defaultOutputFormat, logger: self.logger), - speed: resolvedSpeed, - stability: Self.validatedUnit(directive?.stability, name: "stability", logger: self.logger), - similarity: Self.validatedUnit(directive?.similarity, name: "similarity", logger: self.logger), - style: Self.validatedUnit(directive?.style, name: "style", logger: self.logger), - speakerBoost: directive?.speakerBoost, - seed: Self.validatedSeed(directive?.seed, logger: self.logger), - normalize: Self.validatedNormalize(directive?.normalize, logger: self.logger), - language: Self.validatedLanguage(directive?.language, logger: self.logger)) - let synthTimeoutSeconds = max(20.0, min(90.0, Double(cleaned.count) * 0.12)) - self.ttsLogger.info("talk TTS synth timeout=\(synthTimeoutSeconds, privacy: .public)s") do { - let client = ElevenLabsClient(apiKey: apiKey, logger: self.ttsLogger) - let audio = try await withThrowingTaskGroup(of: Data.self) { group in - group.addTask { - try await client.synthesize(voiceId: voiceId, request: request) + if let apiKey, !apiKey.isEmpty, let voiceId { + let desiredOutputFormat = directive?.outputFormat ?? self.defaultOutputFormat + let outputFormat = ElevenLabsTTSClient.validatedOutputFormat(desiredOutputFormat) + if outputFormat == nil, let desiredOutputFormat, !desiredOutputFormat.isEmpty { + self.logger.warning("talk output_format unsupported for local playback: \(desiredOutputFormat, privacy: .public)") } - group.addTask { - try await Task.sleep(nanoseconds: UInt64(synthTimeoutSeconds * 1_000_000_000)) - throw NSError(domain: "TalkTTS", code: 408, userInfo: [ - NSLocalizedDescriptionKey: "ElevenLabs TTS timed out after \(synthTimeoutSeconds)s", + + let request = ElevenLabsTTSRequest( + text: cleaned, + modelId: directive?.modelId ?? self.currentModelId ?? self.defaultModelId, + outputFormat: outputFormat, + speed: TalkTTSValidation.resolveSpeed(speed: directive?.speed, rateWPM: directive?.rateWPM), + stability: TalkTTSValidation.validatedUnit(directive?.stability), + similarity: TalkTTSValidation.validatedUnit(directive?.similarity), + style: TalkTTSValidation.validatedUnit(directive?.style), + speakerBoost: directive?.speakerBoost, + seed: TalkTTSValidation.validatedSeed(directive?.seed), + normalize: ElevenLabsTTSClient.validatedNormalize(directive?.normalize), + language: language) + + self.ttsLogger.info("talk TTS synth timeout=\(synthTimeoutSeconds, privacy: .public)s") + let client = ElevenLabsTTSClient(apiKey: apiKey) + let audio = try await client.synthesizeWithHardTimeout( + voiceId: voiceId, + request: request, + hardTimeoutSeconds: synthTimeoutSeconds) + guard self.isCurrent(gen) else { return } + self.ttsLogger.info("talk TTS response bytes=\(audio.count, privacy: .public)") + + if self.interruptOnSpeech { + await self.startRecognition() + guard self.isCurrent(gen) else { return } + } + + await MainActor.run { TalkModeController.shared.updatePhase(.speaking) } + self.phase = .speaking + + let result = await TalkAudioPlayer.shared.play(data: audio) + self.ttsLogger.info("talk audio result finished=\(result.finished, privacy: .public) interruptedAt=\(String(describing: result.interruptedAt), privacy: .public)") + if !result.finished, result.interruptedAt == nil { + throw NSError(domain: "TalkAudioPlayer", code: 1, userInfo: [ + NSLocalizedDescriptionKey: "audio playback failed", ]) } - let data = try await group.next()! 
- group.cancelAll() - return data - } - guard self.isCurrent(gen) else { return } - self.ttsLogger.info("talk TTS response bytes=\(audio.count, privacy: .public)") - let result = await TalkAudioPlayer.shared.play(data: audio) - self.ttsLogger.info("talk audio result finished=\(result.finished, privacy: .public) interruptedAt=\(String(describing: result.interruptedAt), privacy: .public)") - if !result.finished, let interruptedAt = result.interruptedAt, self.phase == .speaking { - if self.interruptOnSpeech { - self.lastInterruptedAtSeconds = interruptedAt + if !result.finished, let interruptedAt = result.interruptedAt, self.phase == .speaking { + if self.interruptOnSpeech { + self.lastInterruptedAtSeconds = interruptedAt + } } + } else { + self.ttsLogger.info("talk system voice start chars=\(cleaned.count, privacy: .public)") + if self.interruptOnSpeech { + await self.startRecognition() + guard self.isCurrent(gen) else { return } + } + await MainActor.run { TalkModeController.shared.updatePhase(.speaking) } + self.phase = .speaking + await TalkSystemSpeechSynthesizer.shared.stop() + try await TalkSystemSpeechSynthesizer.shared.speak(text: cleaned, language: language) + self.ttsLogger.info("talk system voice done") } } catch { - self.logger.error("talk TTS failed: \(error.localizedDescription, privacy: .public)") + self.ttsLogger.error("talk TTS failed: \(error.localizedDescription, privacy: .public); falling back to system voice") + do { + if self.interruptOnSpeech { + await self.startRecognition() + guard self.isCurrent(gen) else { return } + } + await MainActor.run { TalkModeController.shared.updatePhase(.speaking) } + self.phase = .speaking + await TalkSystemSpeechSynthesizer.shared.stop() + try await TalkSystemSpeechSynthesizer.shared.speak(text: cleaned, language: language) + } catch { + self.ttsLogger.error("talk system voice failed: \(error.localizedDescription, privacy: .public)") + } } if self.phase == .speaking { @@ -505,7 +528,7 @@ actor TalkModeRuntime { if let fallbackVoiceId { return fallbackVoiceId } do { - let voices = try await ElevenLabsClient(apiKey: apiKey, logger: self.ttsLogger).listVoices() + let voices = try await ElevenLabsTTSClient(apiKey: apiKey).listVoices() guard let first = voices.first else { self.ttsLogger.error("elevenlabs voices list empty") return nil @@ -528,6 +551,7 @@ actor TalkModeRuntime { func stopSpeaking(reason: TalkStopReason) async { let interruptedAt = await MainActor.run { TalkAudioPlayer.shared.stop() } + await TalkSystemSpeechSynthesizer.shared.stop() guard self.phase == .speaking else { return } if reason == .speech, let interruptedAt { self.lastInterruptedAtSeconds = interruptedAt @@ -720,154 +744,4 @@ actor TalkModeRuntime { return normalized } - private static func validatedLanguage(_ value: String?, logger: Logger) -> String? { - guard let value else { return nil } - let normalized = value.trimmingCharacters(in: .whitespacesAndNewlines).lowercased() - guard normalized.count == 2, normalized.allSatisfy({ $0 >= "a" && $0 <= "z" }) else { - logger.warning("talk language invalid: \(normalized, privacy: .public)") - return nil - } - return normalized - } - - private static func validatedOutputFormat(_ value: String?, logger: Logger) -> String? { - let trimmed = value?.trimmingCharacters(in: .whitespacesAndNewlines) ?? 
"" - guard !trimmed.isEmpty else { return nil } - guard trimmed.hasPrefix("mp3_") else { - logger.warning("talk output_format unsupported for local playback: \(trimmed, privacy: .public)") - return nil - } - return trimmed - } - - private static func isMessageTimestampAfter(_ timestamp: Double, sinceSeconds: Double) -> Bool { - let sinceMs = sinceSeconds * 1000 - if timestamp > 10_000_000_000 { - return timestamp >= sinceMs - 500 - } - return timestamp >= sinceSeconds - 0.5 - } -} - -private struct ElevenLabsRequest { - let text: String - let modelId: String? - let outputFormat: String? - let speed: Double? - let stability: Double? - let similarity: Double? - let style: Double? - let speakerBoost: Bool? - let seed: UInt32? - let normalize: String? - let language: String? -} - -private struct ElevenLabsClient { - let apiKey: String - let logger: Logger - let baseUrl: URL = URL(string: "https://api.elevenlabs.io")! - let ttsTimeoutSeconds: TimeInterval = 45 - let listVoicesTimeoutSeconds: TimeInterval = 15 - - func synthesize(voiceId: String, request: ElevenLabsRequest) async throws -> Data { - var url = self.baseUrl - url.appendPathComponent("v1") - url.appendPathComponent("text-to-speech") - url.appendPathComponent(voiceId) - - let charCount = request.text.count - self.logger.info( - "elevenlabs tts request voice=\(voiceId, privacy: .public) model=\(request.modelId ?? "default", privacy: .public) chars=\(charCount, privacy: .public)") - let startedAt = Date() - - var payload: [String: Any] = [ - "text": request.text, - ] - if let modelId = request.modelId, !modelId.isEmpty { - payload["model_id"] = modelId - } - if let outputFormat = request.outputFormat, !outputFormat.isEmpty { - payload["output_format"] = outputFormat - } - if let seed = request.seed { - payload["seed"] = seed - } - if let normalize = request.normalize { - payload["apply_text_normalization"] = normalize - } - if let language = request.language { - payload["language_code"] = language - } - var voiceSettings: [String: Any] = [:] - if let speed = request.speed { voiceSettings["speed"] = speed } - if let stability = request.stability { voiceSettings["stability"] = stability } - if let similarity = request.similarity { voiceSettings["similarity_boost"] = similarity } - if let style = request.style { voiceSettings["style"] = style } - if let speakerBoost = request.speakerBoost { voiceSettings["use_speaker_boost"] = speakerBoost } - if !voiceSettings.isEmpty { - payload["voice_settings"] = voiceSettings - } - - let body = try JSONSerialization.data(withJSONObject: payload, options: []) - var req = URLRequest(url: url) - req.httpMethod = "POST" - req.httpBody = body - req.timeoutInterval = self.ttsTimeoutSeconds - req.setValue("application/json", forHTTPHeaderField: "Content-Type") - req.setValue("audio/mpeg", forHTTPHeaderField: "Accept") - req.setValue(self.apiKey, forHTTPHeaderField: "xi-api-key") - - let (data, response) = try await URLSession.shared.data(for: req) - if let http = response as? HTTPURLResponse, http.statusCode >= 400 { - let message = String(data: data, encoding: .utf8) ?? 
"unknown" - self.logger.error( - "elevenlabs tts failed status=\(http.statusCode, privacy: .public) message=\(message, privacy: .public)") - throw NSError(domain: "TalkTTS", code: http.statusCode, userInfo: [ - NSLocalizedDescriptionKey: "ElevenLabs failed: \(http.statusCode) \(message)", - ]) - } - let elapsed = Date().timeIntervalSince(startedAt) - self.logger.info("elevenlabs tts ok bytes=\(data.count, privacy: .public) dur=\(elapsed, privacy: .public)s") - return data - } - - func listVoices() async throws -> [ElevenLabsVoice] { - var url = self.baseUrl - url.appendPathComponent("v1") - url.appendPathComponent("voices") - - self.logger.info("elevenlabs voices list request") - var req = URLRequest(url: url) - req.httpMethod = "GET" - req.timeoutInterval = self.listVoicesTimeoutSeconds - req.setValue(self.apiKey, forHTTPHeaderField: "xi-api-key") - - let (data, response) = try await URLSession.shared.data(for: req) - if let http = response as? HTTPURLResponse, http.statusCode >= 400 { - let message = String(data: data, encoding: .utf8) ?? "unknown" - self.logger.error( - "elevenlabs voices list failed status=\(http.statusCode, privacy: .public) message=\(message, privacy: .public)") - throw NSError(domain: "TalkTTS", code: http.statusCode, userInfo: [ - NSLocalizedDescriptionKey: "ElevenLabs voices failed: \(http.statusCode) \(message)", - ]) - } - - let decoded = try JSONDecoder().decode(ElevenLabsVoicesResponse.self, from: data) - return decoded.voices - } -} - -private struct ElevenLabsVoice: Decodable { - let voiceId: String - let name: String? - - enum CodingKeys: String, CodingKey { - case voiceId = "voice_id" - case name - } -} - -private struct ElevenLabsVoicesResponse: Decodable { - let voices: [ElevenLabsVoice] } diff --git a/apps/macos/Sources/Clawdis/TalkOverlay.swift b/apps/macos/Sources/Clawdis/TalkOverlay.swift index bbddd2a77..7f5ec7848 100644 --- a/apps/macos/Sources/Clawdis/TalkOverlay.swift +++ b/apps/macos/Sources/Clawdis/TalkOverlay.swift @@ -7,7 +7,7 @@ import SwiftUI @Observable final class TalkOverlayController { static let shared = TalkOverlayController() - static let overlaySize: CGFloat = 360 + static let overlaySize: CGFloat = 440 static let windowInset: CGFloat = 88 private let logger = Logger(subsystem: "com.steipete.clawdis", category: "talk.overlay") diff --git a/apps/macos/Sources/Clawdis/TalkOverlayView.swift b/apps/macos/Sources/Clawdis/TalkOverlayView.swift index e9b3091d5..c305cbf08 100644 --- a/apps/macos/Sources/Clawdis/TalkOverlayView.swift +++ b/apps/macos/Sources/Clawdis/TalkOverlayView.swift @@ -31,7 +31,7 @@ struct TalkOverlayView: View { } .buttonStyle(.plain) .contentShape(Circle()) - .offset(x: -5, y: -5) + .offset(x: -2, y: -2) .opacity(self.hoveringWindow ? 1 : 0) .animation(.easeOut(duration: 0.12), value: self.hoveringWindow) .allowsHitTesting(self.hoveringWindow) @@ -42,7 +42,7 @@ struct TalkOverlayView: View { .onHover { self.hoveringWindow = $0 } } - private static let defaultSeamColor = Color(red: 127 / 255.0, green: 184 / 255.0, blue: 212 / 255.0) + private static let defaultSeamColor = Color(red: 79 / 255.0, green: 122 / 255.0, blue: 154 / 255.0) private var seamColor: Color { Self.color(fromHex: self.appState.seamColorHex) ?? 
Self.defaultSeamColor diff --git a/apps/macos/Sources/Clawdis/WebChatManager.swift b/apps/macos/Sources/Clawdis/WebChatManager.swift index 3d550ada3..2f77692de 100644 --- a/apps/macos/Sources/Clawdis/WebChatManager.swift +++ b/apps/macos/Sources/Clawdis/WebChatManager.swift @@ -29,6 +29,10 @@ final class WebChatManager { var onPanelVisibilityChanged: ((Bool) -> Void)? + var activeSessionKey: String? { + self.panelSessionKey ?? self.windowSessionKey + } + func show(sessionKey: String) { self.closePanel() if let controller = self.windowController { diff --git a/apps/shared/ClawdisKit/Sources/ClawdisKit/ElevenLabsTTS.swift b/apps/shared/ClawdisKit/Sources/ClawdisKit/ElevenLabsTTS.swift new file mode 100644 index 000000000..c4b1e7999 --- /dev/null +++ b/apps/shared/ClawdisKit/Sources/ClawdisKit/ElevenLabsTTS.swift @@ -0,0 +1,233 @@ +import Foundation + +public struct ElevenLabsVoice: Decodable, Sendable { + public let voiceId: String + public let name: String? + + enum CodingKeys: String, CodingKey { + case voiceId = "voice_id" + case name + } +} + +public struct ElevenLabsTTSRequest: Sendable { + public var text: String + public var modelId: String? + public var outputFormat: String? + public var speed: Double? + public var stability: Double? + public var similarity: Double? + public var style: Double? + public var speakerBoost: Bool? + public var seed: UInt32? + public var normalize: String? + public var language: String? + + public init( + text: String, + modelId: String? = nil, + outputFormat: String? = nil, + speed: Double? = nil, + stability: Double? = nil, + similarity: Double? = nil, + style: Double? = nil, + speakerBoost: Bool? = nil, + seed: UInt32? = nil, + normalize: String? = nil, + language: String? = nil) + { + self.text = text + self.modelId = modelId + self.outputFormat = outputFormat + self.speed = speed + self.stability = stability + self.similarity = similarity + self.style = style + self.speakerBoost = speakerBoost + self.seed = seed + self.normalize = normalize + self.language = language + } +} + +public struct ElevenLabsTTSClient: Sendable { + public var apiKey: String + public var requestTimeoutSeconds: TimeInterval + public var listVoicesTimeoutSeconds: TimeInterval + public var baseUrl: URL + + public init( + apiKey: String, + requestTimeoutSeconds: TimeInterval = 45, + listVoicesTimeoutSeconds: TimeInterval = 15, + baseUrl: URL = URL(string: "https://api.elevenlabs.io")!) + { + self.apiKey = apiKey + self.requestTimeoutSeconds = requestTimeoutSeconds + self.listVoicesTimeoutSeconds = listVoicesTimeoutSeconds + self.baseUrl = baseUrl + } + + public func synthesizeWithHardTimeout( + voiceId: String, + request: ElevenLabsTTSRequest, + hardTimeoutSeconds: TimeInterval) async throws -> Data + { + try await withThrowingTaskGroup(of: Data.self) { group in + group.addTask { + try await self.synthesize(voiceId: voiceId, request: request) + } + group.addTask { + try await Task.sleep(nanoseconds: UInt64(hardTimeoutSeconds * 1_000_000_000)) + throw NSError(domain: "ElevenLabsTTS", code: 408, userInfo: [ + NSLocalizedDescriptionKey: "ElevenLabs TTS timed out after \(hardTimeoutSeconds)s", + ]) + } + let data = try await group.next()! 
+ group.cancelAll() + return data + } + } + + public func synthesize(voiceId: String, request: ElevenLabsTTSRequest) async throws -> Data { + var url = self.baseUrl + url.appendPathComponent("v1") + url.appendPathComponent("text-to-speech") + url.appendPathComponent(voiceId) + + let body = try JSONSerialization.data(withJSONObject: Self.buildPayload(request), options: []) + + var lastError: Error? + for attempt in 0..<3 { + var req = URLRequest(url: url) + req.httpMethod = "POST" + req.httpBody = body + req.timeoutInterval = self.requestTimeoutSeconds + req.setValue("application/json", forHTTPHeaderField: "Content-Type") + req.setValue("audio/mpeg", forHTTPHeaderField: "Accept") + req.setValue(self.apiKey, forHTTPHeaderField: "xi-api-key") + + do { + let (data, response) = try await URLSession.shared.data(for: req) + if let http = response as? HTTPURLResponse { + let contentType = (http.value(forHTTPHeaderField: "Content-Type") ?? "unknown").lowercased() + if http.statusCode == 429 || http.statusCode >= 500 { + let message = Self.truncatedErrorBody(data) + lastError = NSError(domain: "ElevenLabsTTS", code: http.statusCode, userInfo: [ + NSLocalizedDescriptionKey: "ElevenLabs retryable failure: \(http.statusCode) ct=\(contentType) \(message)", + ]) + if attempt < 2 { + let retryAfter = Double(http.value(forHTTPHeaderField: "Retry-After") ?? "") + let baseDelay = [0.25, 0.75, 1.5][attempt] + let delaySeconds = max(baseDelay, retryAfter ?? 0) + try? await Task.sleep(nanoseconds: UInt64(delaySeconds * 1_000_000_000)) + continue + } + throw lastError! + } + + if http.statusCode >= 400 { + let message = Self.truncatedErrorBody(data) + throw NSError(domain: "ElevenLabsTTS", code: http.statusCode, userInfo: [ + NSLocalizedDescriptionKey: "ElevenLabs failed: \(http.statusCode) ct=\(contentType) \(message)", + ]) + } + + if !contentType.contains("audio") { + let message = Self.truncatedErrorBody(data) + throw NSError(domain: "ElevenLabsTTS", code: 415, userInfo: [ + NSLocalizedDescriptionKey: "ElevenLabs returned non-audio ct=\(contentType) \(message)", + ]) + } + } + return data + } catch { + // Errors thrown above are final (non-retryable 4xx, non-audio, or the last 429/5xx retry); + // only transport errors fall through to the backoff below. + if (error as NSError).domain == "ElevenLabsTTS" { throw error } + lastError = error + if attempt < 2 { + try? await Task.sleep(nanoseconds: UInt64([0.25, 0.75, 1.5][attempt] * 1_000_000_000)) + continue + } + throw error + } + } + throw lastError ?? NSError(domain: "ElevenLabsTTS", code: 1, userInfo: [ + NSLocalizedDescriptionKey: "ElevenLabs failed", + ]) + } + + public func listVoices() async throws -> [ElevenLabsVoice] { + var url = self.baseUrl + url.appendPathComponent("v1") + url.appendPathComponent("voices") + + var req = URLRequest(url: url) + req.httpMethod = "GET" + req.timeoutInterval = self.listVoicesTimeoutSeconds + req.setValue(self.apiKey, forHTTPHeaderField: "xi-api-key") + + let (data, response) = try await URLSession.shared.data(for: req) + if let http = response as? HTTPURLResponse, http.statusCode >= 400 { + let message = Self.truncatedErrorBody(data) + throw NSError(domain: "ElevenLabsTTS", code: http.statusCode, userInfo: [ + NSLocalizedDescriptionKey: "ElevenLabs voices failed: \(http.statusCode) \(message)", + ]) + } + + struct VoicesResponse: Decodable { let voices: [ElevenLabsVoice] } + return try JSONDecoder().decode(VoicesResponse.self, from: data).voices + } + + public static func validatedOutputFormat(_ value: String?) -> String? { + let trimmed = (value ??
"").trimmingCharacters(in: .whitespacesAndNewlines) + guard !trimmed.isEmpty else { return nil } + guard trimmed.hasPrefix("mp3_") else { return nil } + return trimmed + } + + public static func validatedLanguage(_ value: String?) -> String? { + let normalized = (value ?? "").trimmingCharacters(in: .whitespacesAndNewlines).lowercased() + guard normalized.count == 2, normalized.allSatisfy({ $0 >= "a" && $0 <= "z" }) else { return nil } + return normalized + } + + public static func validatedNormalize(_ value: String?) -> String? { + let normalized = (value ?? "").trimmingCharacters(in: .whitespacesAndNewlines).lowercased() + guard ["auto", "on", "off"].contains(normalized) else { return nil } + return normalized + } + + private static func buildPayload(_ request: ElevenLabsTTSRequest) -> [String: Any] { + var payload: [String: Any] = ["text": request.text] + if let modelId = request.modelId?.trimmingCharacters(in: .whitespacesAndNewlines), !modelId.isEmpty { + payload["model_id"] = modelId + } + if let outputFormat = request.outputFormat?.trimmingCharacters(in: .whitespacesAndNewlines), !outputFormat.isEmpty { + payload["output_format"] = outputFormat + } + if let seed = request.seed { + payload["seed"] = seed + } + if let normalize = request.normalize { + payload["apply_text_normalization"] = normalize + } + if let language = request.language { + payload["language_code"] = language + } + + var voiceSettings: [String: Any] = [:] + if let speed = request.speed { voiceSettings["speed"] = speed } + if let stability = request.stability { voiceSettings["stability"] = stability } + if let similarity = request.similarity { voiceSettings["similarity_boost"] = similarity } + if let style = request.style { voiceSettings["style"] = style } + if let speakerBoost = request.speakerBoost { voiceSettings["use_speaker_boost"] = speakerBoost } + if !voiceSettings.isEmpty { + payload["voice_settings"] = voiceSettings + } + return payload + } + + private static func truncatedErrorBody(_ data: Data) -> String { + let raw = String(data: data.prefix(4096), encoding: .utf8) ?? "unknown" + return raw.replacingOccurrences(of: "\n", with: " ").replacingOccurrences(of: "\r", with: " ") + } +} diff --git a/apps/shared/ClawdisKit/Sources/ClawdisKit/TalkDirective.swift b/apps/shared/ClawdisKit/Sources/ClawdisKit/TalkDirective.swift index af0e2365f..6c460dc02 100644 --- a/apps/shared/ClawdisKit/Sources/ClawdisKit/TalkDirective.swift +++ b/apps/shared/ClawdisKit/Sources/ClawdisKit/TalkDirective.swift @@ -67,12 +67,18 @@ public enum TalkDirectiveParser { var lines = normalized.split(separator: "\n", omittingEmptySubsequences: false) guard !lines.isEmpty else { return TalkDirectiveParseResult(directive: nil, stripped: text, unknownKeys: []) } - guard let firstNonEmpty = + guard let firstNonEmptyIndex = lines.firstIndex(where: { !$0.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty }) else { return TalkDirectiveParseResult(directive: nil, stripped: text, unknownKeys: []) } + var firstNonEmpty = firstNonEmptyIndex + if firstNonEmpty > 0 { + lines.removeSubrange(0.. Bool { + let sinceMs = sinceSeconds * 1000 + // ~2286-11-20 in epoch seconds. Anything bigger is almost certainly epoch milliseconds. 
+ if timestamp > 10_000_000_000 { + return timestamp >= sinceMs - 500 + } + return timestamp >= sinceSeconds - 0.5 + } +} + diff --git a/apps/shared/ClawdisKit/Sources/ClawdisKit/TalkPromptBuilder.swift b/apps/shared/ClawdisKit/Sources/ClawdisKit/TalkPromptBuilder.swift new file mode 100644 index 000000000..95842d685 --- /dev/null +++ b/apps/shared/ClawdisKit/Sources/ClawdisKit/TalkPromptBuilder.swift @@ -0,0 +1,18 @@ +public enum TalkPromptBuilder: Sendable { + public static func build(transcript: String, interruptedAtSeconds: Double?) -> String { + var lines: [String] = [ + "Talk Mode active. Reply in a concise, spoken tone.", + "You may optionally prefix the response with JSON (first line) to set ElevenLabs voice, e.g. {\"voice\":\"\",\"once\":true}.", + ] + + if let interruptedAtSeconds { + let formatted = String(format: "%.1f", interruptedAtSeconds) + lines.append("Assistant speech interrupted at \(formatted)s.") + } + + lines.append("") + lines.append(transcript) + return lines.joined(separator: "\n") + } +} + diff --git a/apps/shared/ClawdisKit/Sources/ClawdisKit/TalkSystemSpeechSynthesizer.swift b/apps/shared/ClawdisKit/Sources/ClawdisKit/TalkSystemSpeechSynthesizer.swift new file mode 100644 index 000000000..d1d4eeb39 --- /dev/null +++ b/apps/shared/ClawdisKit/Sources/ClawdisKit/TalkSystemSpeechSynthesizer.swift @@ -0,0 +1,110 @@ +import AVFoundation +import Foundation + +@MainActor +public final class TalkSystemSpeechSynthesizer: NSObject { + public enum SpeakError: Error { + case canceled + } + + public static let shared = TalkSystemSpeechSynthesizer() + + private let synth = AVSpeechSynthesizer() + private var speakContinuation: CheckedContinuation<Void, Error>? + private var currentUtterance: AVSpeechUtterance? + private var currentToken = UUID() + private var watchdog: Task<Void, Never>? + + public var isSpeaking: Bool { self.synth.isSpeaking } + + private override init() { + super.init() + self.synth.delegate = self + } + + public func stop() { + self.currentToken = UUID() + self.watchdog?.cancel() + self.watchdog = nil + self.synth.stopSpeaking(at: .immediate) + self.finishCurrent(with: SpeakError.canceled) + } + + public func speak(text: String, language: String? = nil) async throws { + let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines) + guard !trimmed.isEmpty else { return } + + self.stop() + let token = UUID() + self.currentToken = token + + let utterance = AVSpeechUtterance(string: trimmed) + if let language, let voice = AVSpeechSynthesisVoice(language: language) { + utterance.voice = voice + } + self.currentUtterance = utterance + + let estimatedSeconds = max(3.0, min(180.0, Double(trimmed.count) * 0.08)) + self.watchdog?.cancel() + self.watchdog = Task { @MainActor [weak self] in + guard let self else { return } + try?
+            if Task.isCancelled { return }
+            guard self.currentToken == token else { return }
+            if self.synth.isSpeaking {
+                self.synth.stopSpeaking(at: .immediate)
+            }
+            self.finishCurrent(
+                with: NSError(domain: "TalkSystemSpeechSynthesizer", code: 408, userInfo: [
+                    NSLocalizedDescriptionKey: "system TTS timed out after \(estimatedSeconds)s",
+                ]))
+        }
+
+        try await withTaskCancellationHandler(operation: {
+            try await withCheckedThrowingContinuation { cont in
+                self.speakContinuation = cont
+                self.synth.speak(utterance)
+            }
+        }, onCancel: {
+            Task { @MainActor in
+                self.stop()
+            }
+        })
+
+        if self.currentToken != token {
+            throw SpeakError.canceled
+        }
+    }
+
+    private func handleFinish(error: Error?) {
+        guard self.currentUtterance != nil else { return }
+        self.watchdog?.cancel()
+        self.watchdog = nil
+        self.finishCurrent(with: error)
+    }
+
+    private func finishCurrent(with error: Error?) {
+        self.currentUtterance = nil
+        let cont = self.speakContinuation
+        self.speakContinuation = nil
+        if let error {
+            cont?.resume(throwing: error)
+        } else {
+            cont?.resume(returning: ())
+        }
+    }
+}
+
+extension TalkSystemSpeechSynthesizer: AVSpeechSynthesizerDelegate {
+    public nonisolated func speechSynthesizer(_ synthesizer: AVSpeechSynthesizer, didFinish utterance: AVSpeechUtterance) {
+        Task { @MainActor in
+            self.handleFinish(error: nil)
+        }
+    }
+
+    public nonisolated func speechSynthesizer(_ synthesizer: AVSpeechSynthesizer, didCancel utterance: AVSpeechUtterance) {
+        Task { @MainActor in
+            self.handleFinish(error: SpeakError.canceled)
+        }
+    }
+}
diff --git a/apps/shared/ClawdisKit/Sources/ClawdisKit/TalkTTSValidation.swift b/apps/shared/ClawdisKit/Sources/ClawdisKit/TalkTTSValidation.swift
new file mode 100644
index 000000000..8137998ac
--- /dev/null
+++ b/apps/shared/ClawdisKit/Sources/ClawdisKit/TalkTTSValidation.swift
@@ -0,0 +1,27 @@
+public enum TalkTTSValidation: Sendable {
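+    // Maps a words-per-minute hint onto the speed multiplier, treating
+    // 175 WPM as a normal speaking rate (speed 1.0). Values outside the
+    // open interval (0.5, 2.0) are rejected as nil rather than clamped,
+    // and a positive rateWPM hint takes precedence over an explicit speed.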
+    public static func resolveSpeed(speed: Double?, rateWPM: Int?) -> Double? {
+        if let rateWPM, rateWPM > 0 {
+            let resolved = Double(rateWPM) / 175.0
+            if resolved <= 0.5 || resolved >= 2.0 { return nil }
+            return resolved
+        }
+        if let speed {
+            if speed <= 0.5 || speed >= 2.0 { return nil }
+            return speed
+        }
+        return nil
+    }
+
+    public static func validatedUnit(_ value: Double?) -> Double? {
+        guard let value else { return nil }
+        if value < 0 || value > 1 { return nil }
+        return value
+    }
+
+    public static func validatedSeed(_ value: Int?) -> UInt32? {
+        guard let value else { return nil }
+        if value < 0 || value > 4294967295 { return nil }
+        return UInt32(value)
+    }
+}
+
diff --git a/apps/shared/ClawdisKit/Tests/ClawdisKitTests/ElevenLabsTTSValidationTests.swift b/apps/shared/ClawdisKit/Tests/ClawdisKitTests/ElevenLabsTTSValidationTests.swift
new file mode 100644
index 000000000..716c50508
--- /dev/null
+++ b/apps/shared/ClawdisKit/Tests/ClawdisKitTests/ElevenLabsTTSValidationTests.swift
@@ -0,0 +1,20 @@
+import XCTest
+@testable import ClawdisKit
+
+final class ElevenLabsTTSValidationTests: XCTestCase {
+    func testValidatedOutputFormatAllowsOnlyMp3Presets() {
+        XCTAssertEqual(ElevenLabsTTSClient.validatedOutputFormat("mp3_44100_128"), "mp3_44100_128")
+        XCTAssertNil(ElevenLabsTTSClient.validatedOutputFormat("pcm_16000"))
+    }
+
+    func testValidatedLanguageAcceptsTwoLetterCodes() {
+        XCTAssertEqual(ElevenLabsTTSClient.validatedLanguage("EN"), "en")
+        XCTAssertNil(ElevenLabsTTSClient.validatedLanguage("eng"))
+    }
+
+    func testValidatedNormalizeAcceptsKnownValues() {
+        XCTAssertEqual(ElevenLabsTTSClient.validatedNormalize("AUTO"), "auto")
+        XCTAssertNil(ElevenLabsTTSClient.validatedNormalize("maybe"))
+    }
+}
+
diff --git a/apps/shared/ClawdisKit/Tests/ClawdisKitTests/TalkDirectiveTests.swift b/apps/shared/ClawdisKit/Tests/ClawdisKitTests/TalkDirectiveTests.swift
index cbfdb572b..c1169317f 100644
--- a/apps/shared/ClawdisKit/Tests/ClawdisKitTests/TalkDirectiveTests.swift
+++ b/apps/shared/ClawdisKit/Tests/ClawdisKitTests/TalkDirectiveTests.swift
@@ -50,6 +50,18 @@
         XCTAssertEqual(result.stripped, "Hello.")
     }
 
+    func testSkipsLeadingEmptyLinesWhenParsingDirective() {
+        let text = """
+
+
+        {"voice":"abc123"}
+        Hello there.
+        """
+        let result = TalkDirectiveParser.parse(text)
+        XCTAssertEqual(result.directive?.voiceId, "abc123")
+        XCTAssertEqual(result.stripped, "Hello there.")
+    }
+
     func testTracksUnknownKeys() {
         let text = """
         {"voice":"abc","mystery":"value","extra":1}
diff --git a/apps/shared/ClawdisKit/Tests/ClawdisKitTests/TalkHistoryTimestampTests.swift b/apps/shared/ClawdisKit/Tests/ClawdisKitTests/TalkHistoryTimestampTests.swift
new file mode 100644
index 000000000..e4e6d8aea
--- /dev/null
+++ b/apps/shared/ClawdisKit/Tests/ClawdisKitTests/TalkHistoryTimestampTests.swift
@@ -0,0 +1,16 @@
+import XCTest
+@testable import ClawdisKit
+
+final class TalkHistoryTimestampTests: XCTestCase {
+    func testSecondsTimestampsAreAcceptedWithSmallTolerance() {
+        XCTAssertTrue(TalkHistoryTimestamp.isAfter(999.6, sinceSeconds: 1000))
+        XCTAssertFalse(TalkHistoryTimestamp.isAfter(999.4, sinceSeconds: 1000))
+    }
+
+    func testMillisecondsTimestampsAreAcceptedWithSmallTolerance() {
+        let sinceSeconds = 1_700_000_000.0
+        let sinceMs = sinceSeconds * 1000
+        XCTAssertTrue(TalkHistoryTimestamp.isAfter(sinceMs - 500, sinceSeconds: sinceSeconds))
+        XCTAssertFalse(TalkHistoryTimestamp.isAfter(sinceMs - 501, sinceSeconds: sinceSeconds))
+    }
+}
diff --git a/apps/shared/ClawdisKit/Tests/ClawdisKitTests/TalkPromptBuilderTests.swift b/apps/shared/ClawdisKit/Tests/ClawdisKitTests/TalkPromptBuilderTests.swift
new file mode 100644
index 000000000..c034cf07d
--- /dev/null
+++ b/apps/shared/ClawdisKit/Tests/ClawdisKitTests/TalkPromptBuilderTests.swift
@@ -0,0 +1,16 @@
+import XCTest
+@testable import ClawdisKit
+
+final class TalkPromptBuilderTests: XCTestCase {
+    func testBuildIncludesTranscript() {
+        let prompt = TalkPromptBuilder.build(transcript: "Hello", interruptedAtSeconds: nil)
+        XCTAssertTrue(prompt.contains("Talk Mode active."))
+        XCTAssertTrue(prompt.hasSuffix("\n\nHello"))
+    }
+
+    func testBuildIncludesInterruptionLineWhenProvided() {
+        let prompt = TalkPromptBuilder.build(transcript: "Hi", interruptedAtSeconds: 1.234)
+        XCTAssertTrue(prompt.contains("Assistant speech interrupted at 1.2s."))
+    }
+}
+
diff --git a/apps/shared/ClawdisKit/Tests/ClawdisKitTests/TalkTTSValidationTests.swift b/apps/shared/ClawdisKit/Tests/ClawdisKitTests/TalkTTSValidationTests.swift
new file mode 100644
index 000000000..f2d7c3c9e
--- /dev/null
+++ b/apps/shared/ClawdisKit/Tests/ClawdisKitTests/TalkTTSValidationTests.swift
@@ -0,0 +1,24 @@
+import XCTest
+@testable import ClawdisKit
+
+final class TalkTTSValidationTests: XCTestCase {
+    func testResolveSpeedUsesRateWPMWhenProvided() {
+        let resolved = TalkTTSValidation.resolveSpeed(speed: nil, rateWPM: 175)
+        XCTAssertNotNil(resolved)
+        XCTAssertEqual(resolved ?? 0, 1.0, accuracy: 0.0001)
+        XCTAssertNil(TalkTTSValidation.resolveSpeed(speed: nil, rateWPM: 400))
+    }
+
+    func testValidatedUnitBounds() {
+        XCTAssertEqual(TalkTTSValidation.validatedUnit(0), 0)
+        XCTAssertEqual(TalkTTSValidation.validatedUnit(1), 1)
+        XCTAssertNil(TalkTTSValidation.validatedUnit(-0.01))
+        XCTAssertNil(TalkTTSValidation.validatedUnit(1.01))
+    }
+
+    func testValidatedSeedBounds() {
+        XCTAssertEqual(TalkTTSValidation.validatedSeed(0), 0)
+        XCTAssertEqual(TalkTTSValidation.validatedSeed(1234), 1234)
+        XCTAssertNil(TalkTTSValidation.validatedSeed(-1))
+    }
+}
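
Note on how these pieces compose: the platform TalkModeManagers try ElevenLabs first and fall back to TalkSystemSpeechSynthesizer when synthesis or playback fails. A minimal sketch of that call shape, assuming a hypothetical `elevenLabs.synthesizeAndPlay(_:)` helper (illustrative only; the shipped call sites live in TalkModeManager.swift / TalkModeRuntime.swift):

    @MainActor
    func speakReply(_ text: String, language: String?) async {
        do {
            // Preferred path: ElevenLabs TTS (retry/backoff for 429/5xx happens inside the client).
            try await elevenLabs.synthesizeAndPlay(text) // hypothetical helper, not the shipped API
        } catch {
            // Fallback: keep Talk Mode audible when ElevenLabs is unavailable,
            // returns non-audio, or playback fails.
            try? await TalkSystemSpeechSynthesizer.shared.speak(text: text, language: language)
        }
    }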