fix(talk): harden TTS + add system fallback

Peter Steinberger
2025-12-30 07:40:02 +01:00
parent a7617e4d79
commit f86772f26c
22 changed files with 839 additions and 468 deletions

View File

@@ -16,6 +16,8 @@
- macOS Talk Mode: fix audio stop ordering so disabling Talk Mode always stops in-flight playback.
- macOS Talk Mode: throttle audio-level updates (avoid per-buffer task creation) to reduce CPU/task churn.
- macOS Talk Mode: increase overlay window size so wave rings don't clip; close button is hover-only and closer to the orb.
- Talk Mode: fall back to system TTS when ElevenLabs is unavailable, returns non-audio, or playback fails (macOS/iOS/Android); see the sketch after this list.
- ElevenLabs: add retry/backoff for 429/5xx and include content-type in errors for debugging.
- Talk Mode: align to the gateway's main session key and fall back to history polling when chat events drop (prevents stuck “thinking” / missing messages).
- Talk Mode: treat history timestamps as seconds or milliseconds to avoid stale assistant picks (macOS/iOS/Android).
- Chat UI: dedupe identical history messages to avoid duplicate bubbles.
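
A minimal sketch of the fallback order described in the bullets above, with the ElevenLabs client, audio player, and system-voice paths abstracted as closures (these names are illustrative, not the names used in the app):

import Foundation

func speakWithFallback(
    text: String,
    apiKey: String?,
    voiceId: String?,
    synthesize: (_ text: String, _ voiceId: String, _ apiKey: String) async throws -> Data,
    playAudio: (_ audio: Data) async throws -> Void,
    speakWithSystemVoice: (_ text: String) async throws -> Void
) async throws {
    // ElevenLabs is only attempted when both a voice ID and an API key are present.
    if let apiKey, !apiKey.isEmpty, let voiceId, !voiceId.isEmpty {
        do {
            let audio = try await synthesize(text, voiceId, apiKey)
            try await playAudio(audio)
            return
        } catch {
            // Unavailable, non-audio response, or playback failure: fall through.
        }
    }
    try await speakWithSystemVoice(text)
}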

View File

@@ -930,7 +930,7 @@ class NodeRuntime(context: Context) {
private data class Quad<A, B, C, D>(val first: A, val second: B, val third: C, val fourth: D)
private const val DEFAULT_SEAM_COLOR_ARGB: Long = 0xFF7FB8D4
private const val DEFAULT_SEAM_COLOR_ARGB: Long = 0xFF4F7A9A
private const val a2uiReadyCheckJS: String =
"""

View File

@@ -62,9 +62,9 @@ fun TalkOrbOverlay(
verticalArrangement = Arrangement.spacedBy(12.dp),
) {
Box(contentAlignment = Alignment.Center) {
Canvas(modifier = Modifier.size(300.dp)) {
Canvas(modifier = Modifier.size(360.dp)) {
val center = this.center
val baseRadius = size.minDimension * 0.27f
val baseRadius = size.minDimension * 0.30f
val ring1 = 1.05f + (t * 0.25f)
val ring2 = 1.20f + (t * 0.55f)

View File

@@ -13,6 +13,8 @@ import android.os.SystemClock
import android.speech.RecognitionListener
import android.speech.RecognizerIntent
import android.speech.SpeechRecognizer
import android.speech.tts.TextToSpeech
import android.speech.tts.UtteranceProgressListener
import android.util.Log
import androidx.core.content.ContextCompat
import com.steipete.clawdis.node.bridge.BridgeSession
@@ -89,6 +91,9 @@ class TalkModeManager(
private var player: MediaPlayer? = null
private var currentAudioFile: File? = null
private var systemTts: TextToSpeech? = null
private var systemTtsPending: CompletableDeferred<Unit>? = null
private var systemTtsPendingId: String? = null
fun attachSession(session: BridgeSession) {
this.session = session
@@ -181,6 +186,10 @@ class TalkModeManager(
recognizer?.destroy()
recognizer = null
}
systemTts?.stop()
systemTtsPending?.cancel()
systemTtsPending = null
systemTtsPendingId = null
}
private fun startListeningInternal(markListening: Boolean) {
@@ -441,16 +450,6 @@ class TalkModeManager(
apiKey?.trim()?.takeIf { it.isNotEmpty() }
?: System.getenv("ELEVENLABS_API_KEY")?.trim()
val voiceId = directive?.voiceId ?: currentVoiceId ?: defaultVoiceId
if (voiceId.isNullOrBlank()) {
_statusText.value = "Missing voice ID"
Log.w(tag, "missing voiceId")
return
}
if (apiKey.isNullOrEmpty()) {
_statusText.value = "Missing ELEVENLABS_API_KEY"
Log.w(tag, "missing ELEVENLABS_API_KEY")
return
}
_statusText.value = "Speaking…"
_isSpeaking.value = true
@@ -458,28 +457,46 @@ class TalkModeManager(
ensureInterruptListener()
try {
val ttsStarted = SystemClock.elapsedRealtime()
val request =
ElevenLabsRequest(
text = cleaned,
modelId = directive?.modelId ?: currentModelId ?: defaultModelId,
outputFormat =
TalkModeRuntime.validatedOutputFormat(directive?.outputFormat ?: defaultOutputFormat),
speed = TalkModeRuntime.resolveSpeed(directive?.speed, directive?.rateWpm),
stability = TalkModeRuntime.validatedUnit(directive?.stability),
similarity = TalkModeRuntime.validatedUnit(directive?.similarity),
style = TalkModeRuntime.validatedUnit(directive?.style),
speakerBoost = directive?.speakerBoost,
seed = TalkModeRuntime.validatedSeed(directive?.seed),
normalize = TalkModeRuntime.validatedNormalize(directive?.normalize),
language = TalkModeRuntime.validatedLanguage(directive?.language),
)
val audio = synthesize(voiceId = voiceId, apiKey = apiKey, request = request)
Log.d(tag, "elevenlabs ok bytes=${audio.size} durMs=${SystemClock.elapsedRealtime() - ttsStarted}")
playAudio(audio)
val canUseElevenLabs = !voiceId.isNullOrBlank() && !apiKey.isNullOrEmpty()
if (!canUseElevenLabs) {
if (voiceId.isNullOrBlank()) {
Log.w(tag, "missing voiceId; falling back to system voice")
}
if (apiKey.isNullOrEmpty()) {
Log.w(tag, "missing ELEVENLABS_API_KEY; falling back to system voice")
}
_statusText.value = "Speaking (System)…"
speakWithSystemTts(cleaned)
} else {
val ttsStarted = SystemClock.elapsedRealtime()
val request =
ElevenLabsRequest(
text = cleaned,
modelId = directive?.modelId ?: currentModelId ?: defaultModelId,
outputFormat =
TalkModeRuntime.validatedOutputFormat(directive?.outputFormat ?: defaultOutputFormat),
speed = TalkModeRuntime.resolveSpeed(directive?.speed, directive?.rateWpm),
stability = TalkModeRuntime.validatedUnit(directive?.stability),
similarity = TalkModeRuntime.validatedUnit(directive?.similarity),
style = TalkModeRuntime.validatedUnit(directive?.style),
speakerBoost = directive?.speakerBoost,
seed = TalkModeRuntime.validatedSeed(directive?.seed),
normalize = TalkModeRuntime.validatedNormalize(directive?.normalize),
language = TalkModeRuntime.validatedLanguage(directive?.language),
)
val audio = synthesize(voiceId = voiceId!!, apiKey = apiKey!!, request = request)
Log.d(tag, "elevenlabs ok bytes=${audio.size} durMs=${SystemClock.elapsedRealtime() - ttsStarted}")
playAudio(audio)
}
} catch (err: Throwable) {
_statusText.value = "Speak failed: ${err.message ?: err::class.simpleName}"
Log.w(tag, "speak failed: ${err.message ?: err::class.simpleName}")
Log.w(tag, "speak failed: ${err.message ?: err::class.simpleName}; falling back to system voice")
try {
_statusText.value = "Speaking (System)…"
speakWithSystemTts(cleaned)
} catch (fallbackErr: Throwable) {
_statusText.value = "Speak failed: ${fallbackErr.message ?: fallbackErr::class.simpleName}"
Log.w(tag, "system voice failed: ${fallbackErr.message ?: fallbackErr::class.simpleName}")
}
}
_isSpeaking.value = false
@@ -524,9 +541,103 @@ class TalkModeManager(
Log.d(tag, "play done")
}
private suspend fun speakWithSystemTts(text: String) {
val trimmed = text.trim()
if (trimmed.isEmpty()) return
val ok = ensureSystemTts()
if (!ok) {
throw IllegalStateException("system TTS unavailable")
}
val tts = systemTts ?: throw IllegalStateException("system TTS unavailable")
val utteranceId = "talk-${UUID.randomUUID()}"
val deferred = CompletableDeferred<Unit>()
systemTtsPending?.cancel()
systemTtsPending = deferred
systemTtsPendingId = utteranceId
withContext(Dispatchers.Main) {
val params = Bundle()
tts.speak(trimmed, TextToSpeech.QUEUE_FLUSH, params, utteranceId)
}
withContext(Dispatchers.IO) {
kotlinx.coroutines.withTimeout(180_000) { deferred.await() }
}
}
private suspend fun ensureSystemTts(): Boolean {
if (systemTts != null) return true
return withContext(Dispatchers.Main) {
val deferred = CompletableDeferred<Boolean>()
val tts =
try {
TextToSpeech(context) { status ->
deferred.complete(status == TextToSpeech.SUCCESS)
}
} catch (_: Throwable) {
deferred.complete(false)
null
}
if (tts == null) return@withContext false
tts.setOnUtteranceProgressListener(
object : UtteranceProgressListener() {
override fun onStart(utteranceId: String?) {}
override fun onDone(utteranceId: String?) {
if (utteranceId == null) return
if (utteranceId != systemTtsPendingId) return
systemTtsPending?.complete(Unit)
systemTtsPending = null
systemTtsPendingId = null
}
@Deprecated("Deprecated in Java")
override fun onError(utteranceId: String?) {
if (utteranceId == null) return
if (utteranceId != systemTtsPendingId) return
systemTtsPending?.completeExceptionally(IllegalStateException("system TTS error"))
systemTtsPending = null
systemTtsPendingId = null
}
override fun onError(utteranceId: String?, errorCode: Int) {
if (utteranceId == null) return
if (utteranceId != systemTtsPendingId) return
systemTtsPending?.completeExceptionally(IllegalStateException("system TTS error $errorCode"))
systemTtsPending = null
systemTtsPendingId = null
}
},
)
val ok =
try {
deferred.await()
} catch (_: Throwable) {
false
}
if (ok) {
systemTts = tts
} else {
tts.shutdown()
}
ok
}
}
private fun stopSpeaking(resetInterrupt: Boolean = true) {
if (!_isSpeaking.value) {
cleanupPlayer()
systemTts?.stop()
systemTtsPending?.cancel()
systemTtsPending = null
systemTtsPendingId = null
return
}
if (resetInterrupt) {
@@ -534,6 +645,10 @@ class TalkModeManager(
lastInterruptedAtSeconds = currentMs / 1000.0
}
cleanupPlayer()
systemTts?.stop()
systemTtsPending?.cancel()
systemTtsPending = null
systemTtsPendingId = null
_isSpeaking.value = false
}

View File

@@ -293,7 +293,7 @@ final class NodeAppModel {
Self.color(fromHex: self.seamColorHex) ?? Self.defaultSeamColor
}
private static let defaultSeamColor = Color(red: 127 / 255.0, green: 184 / 255.0, blue: 212 / 255.0)
private static let defaultSeamColor = Color(red: 79 / 255.0, green: 122 / 255.0, blue: 154 / 255.0)
private static func color(fromHex raw: String?) -> Color? {
let trimmed = (raw ?? "").trimmingCharacters(in: .whitespacesAndNewlines)

View File

@@ -105,6 +105,7 @@ final class TalkModeManager: NSObject {
self.stopRecognition()
self.stopSpeaking()
self.lastInterruptedAtSeconds = nil
TalkSystemSpeechSynthesizer.shared.stop()
do {
try AVAudioSession.sharedInstance().setActive(false, options: [.notifyOthersOnDeactivation])
} catch {
@@ -301,20 +302,9 @@ final class TalkModeManager: NSObject {
}
private func buildPrompt(transcript: String) -> String {
var lines: [String] = [
"Talk Mode active. Reply in a concise, spoken tone.",
"You may optionally prefix the response with JSON (first line) to set ElevenLabs voice, e.g. {\"voice\":\"<id>\",\"once\":true}.",
]
if let interrupted = self.lastInterruptedAtSeconds {
let formatted = String(format: "%.1f", interrupted)
lines.append("Assistant speech interrupted at \(formatted)s.")
self.lastInterruptedAtSeconds = nil
}
lines.append("")
lines.append(transcript)
return lines.joined(separator: "\n")
let interrupted = self.lastInterruptedAtSeconds
self.lastInterruptedAtSeconds = nil
return TalkPromptBuilder.build(transcript: transcript, interruptedAtSeconds: interrupted)
}
private enum ChatCompletionState: CustomStringConvertible {
@@ -409,7 +399,7 @@ final class TalkModeManager: NSObject {
for msg in messages.reversed() {
guard (msg["role"] as? String) == "assistant" else { continue }
if let since, let timestamp = msg["timestamp"] as? Double,
TalkModeRuntime.isMessageTimestampAfter(timestamp, sinceSeconds: since) == false
TalkHistoryTimestamp.isAfter(timestamp, sinceSeconds: since) == false
{
continue
}
@@ -440,81 +430,91 @@ final class TalkModeManager: NSObject {
}
}
let voiceId = directive?.voiceId ?? self.currentVoiceId ?? self.defaultVoiceId
guard let voiceId, !voiceId.isEmpty else {
self.statusText = "Missing voice ID"
self.logger.error("missing voiceId")
return
}
let resolvedKey =
(self.apiKey?.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty == false ? self.apiKey : nil) ??
ProcessInfo.processInfo.environment["ELEVENLABS_API_KEY"]
guard let apiKey = resolvedKey, !apiKey.isEmpty else {
self.statusText = "Missing ELEVENLABS_API_KEY"
self.logger.error("missing ELEVENLABS_API_KEY")
return
}
self.statusText = "Generating voice…"
self.isSpeaking = true
self.lastSpokenText = cleaned
do {
let started = Date()
let desiredOutputFormat = directive?.outputFormat ?? self.defaultOutputFormat
let outputFormat = TalkModeRuntime.validatedOutputFormat(desiredOutputFormat)
if outputFormat == nil, let desiredOutputFormat, !desiredOutputFormat.isEmpty {
self.logger.warning("talk output_format unsupported for local playback: \(desiredOutputFormat, privacy: .public)")
}
let request = ElevenLabsRequest(
text: cleaned,
modelId: directive?.modelId ?? self.currentModelId ?? self.defaultModelId,
outputFormat: outputFormat,
speed: TalkModeRuntime.resolveSpeed(
speed: directive?.speed,
rateWPM: directive?.rateWPM),
stability: TalkModeRuntime.validatedUnit(directive?.stability),
similarity: TalkModeRuntime.validatedUnit(directive?.similarity),
style: TalkModeRuntime.validatedUnit(directive?.style),
speakerBoost: directive?.speakerBoost,
seed: TalkModeRuntime.validatedSeed(directive?.seed),
normalize: TalkModeRuntime.validatedNormalize(directive?.normalize),
language: TalkModeRuntime.validatedLanguage(directive?.language))
let language = ElevenLabsTTSClient.validatedLanguage(directive?.language)
let synthTimeoutSeconds = max(20.0, min(90.0, Double(cleaned.count) * 0.12))
let client = ElevenLabsClient(apiKey: apiKey)
let audio = try await withThrowingTaskGroup(of: Data.self) { group in
group.addTask {
try await client.synthesize(voiceId: voiceId, request: request)
}
group.addTask {
try await Task.sleep(nanoseconds: UInt64(synthTimeoutSeconds * 1_000_000_000))
throw NSError(domain: "TalkTTS", code: 408, userInfo: [
NSLocalizedDescriptionKey: "ElevenLabs TTS timed out after \(synthTimeoutSeconds)s",
])
}
let data = try await group.next()!
group.cancelAll()
return data
}
self.logger
.info(
"elevenlabs ok bytes=\(audio.count, privacy: .public) dur=\(Date().timeIntervalSince(started), privacy: .public)s")
let voiceId = (directive?.voiceId ?? self.currentVoiceId ?? self.defaultVoiceId)?
.trimmingCharacters(in: .whitespacesAndNewlines)
let resolvedKey =
(self.apiKey?.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty == false ? self.apiKey : nil) ??
ProcessInfo.processInfo.environment["ELEVENLABS_API_KEY"]
let apiKey = resolvedKey?.trimmingCharacters(in: .whitespacesAndNewlines)
let canUseElevenLabs = (voiceId?.isEmpty == false) && (apiKey?.isEmpty == false)
if self.interruptOnSpeech {
do {
try self.startRecognition()
} catch {
self.logger.warning("startRecognition during speak failed: \(error.localizedDescription, privacy: .public)")
if canUseElevenLabs, let voiceId, let apiKey {
let desiredOutputFormat = directive?.outputFormat ?? self.defaultOutputFormat
let outputFormat = ElevenLabsTTSClient.validatedOutputFormat(desiredOutputFormat)
if outputFormat == nil, let desiredOutputFormat, !desiredOutputFormat.isEmpty {
self.logger.warning("talk output_format unsupported for local playback: \(desiredOutputFormat, privacy: .public)")
}
}
self.statusText = "Speaking…"
try await self.playAudio(data: audio)
let request = ElevenLabsTTSRequest(
text: cleaned,
modelId: directive?.modelId ?? self.currentModelId ?? self.defaultModelId,
outputFormat: outputFormat,
speed: TalkTTSValidation.resolveSpeed(speed: directive?.speed, rateWPM: directive?.rateWPM),
stability: TalkTTSValidation.validatedUnit(directive?.stability),
similarity: TalkTTSValidation.validatedUnit(directive?.similarity),
style: TalkTTSValidation.validatedUnit(directive?.style),
speakerBoost: directive?.speakerBoost,
seed: TalkTTSValidation.validatedSeed(directive?.seed),
normalize: ElevenLabsTTSClient.validatedNormalize(directive?.normalize),
language: language)
let synthTimeoutSeconds = max(20.0, min(90.0, Double(cleaned.count) * 0.12))
let client = ElevenLabsTTSClient(apiKey: apiKey)
let audio = try await client.synthesizeWithHardTimeout(
voiceId: voiceId,
request: request,
hardTimeoutSeconds: synthTimeoutSeconds)
self.logger
.info(
"elevenlabs ok bytes=\(audio.count, privacy: .public) dur=\(Date().timeIntervalSince(started), privacy: .public)s")
if self.interruptOnSpeech {
do {
try self.startRecognition()
} catch {
self.logger.warning("startRecognition during speak failed: \(error.localizedDescription, privacy: .public)")
}
}
self.statusText = "Speaking…"
try await self.playAudio(data: audio)
} else {
self.logger.warning("tts unavailable; falling back to system voice (missing key or voiceId)")
if self.interruptOnSpeech {
do {
try self.startRecognition()
} catch {
self.logger.warning("startRecognition during speak failed: \(error.localizedDescription, privacy: .public)")
}
}
self.statusText = "Speaking (System)…"
try await TalkSystemSpeechSynthesizer.shared.speak(text: cleaned, language: language)
}
} catch {
self.statusText = "Speak failed: \(error.localizedDescription)"
self.logger.error("speak failed: \(error.localizedDescription, privacy: .public)")
self.logger.error("tts failed: \(error.localizedDescription, privacy: .public); falling back to system voice")
do {
if self.interruptOnSpeech {
do {
try self.startRecognition()
} catch {
self.logger.warning("startRecognition during speak failed: \(error.localizedDescription, privacy: .public)")
}
}
self.statusText = "Speaking (System)…"
let language = ElevenLabsTTSClient.validatedLanguage(directive?.language)
try await TalkSystemSpeechSynthesizer.shared.speak(text: cleaned, language: language)
} catch {
self.statusText = "Speak failed: \(error.localizedDescription)"
self.logger.error("system voice failed: \(error.localizedDescription, privacy: .public)")
}
}
self.stopRecognition()
@@ -527,7 +527,11 @@ final class TalkModeManager: NSObject {
self.player = player
player.prepareToPlay()
self.logger.info("play start")
player.play()
guard player.play() else {
throw NSError(domain: "TalkMode", code: 2, userInfo: [
NSLocalizedDescriptionKey: "audio player refused to play",
])
}
while player.isPlaying {
try? await Task.sleep(nanoseconds: 120_000_000)
}
@@ -541,6 +545,7 @@ final class TalkModeManager: NSObject {
}
self.player?.stop()
self.player = nil
TalkSystemSpeechSynthesizer.shared.stop()
self.isSpeaking = false
}
@@ -584,7 +589,7 @@ final class TalkModeManager: NSObject {
private static func configureAudioSession() throws {
let session = AVAudioSession.sharedInstance()
try session.setCategory(.playAndRecord, mode: .measurement, options: [
try session.setCategory(.playAndRecord, mode: .voiceChat, options: [
.duckOthers,
.mixWithOthers,
.allowBluetoothHFP,
@@ -609,127 +614,3 @@ final class TalkModeManager: NSObject {
}
}
}
private struct ElevenLabsRequest {
let text: String
let modelId: String?
let outputFormat: String?
let speed: Double?
let stability: Double?
let similarity: Double?
let style: Double?
let speakerBoost: Bool?
let seed: UInt32?
let normalize: String?
let language: String?
}
private struct ElevenLabsClient {
let apiKey: String
let baseUrl = URL(string: "https://api.elevenlabs.io")!
func synthesize(voiceId: String, request: ElevenLabsRequest) async throws -> Data {
var url = self.baseUrl
url.appendPathComponent("v1")
url.appendPathComponent("text-to-speech")
url.appendPathComponent(voiceId)
var payload: [String: Any] = [
"text": request.text,
]
if let modelId = request.modelId, !modelId.isEmpty {
payload["model_id"] = modelId
}
if let outputFormat = request.outputFormat, !outputFormat.isEmpty {
payload["output_format"] = outputFormat
}
if let seed = request.seed {
payload["seed"] = seed
}
if let normalize = request.normalize {
payload["apply_text_normalization"] = normalize
}
if let language = request.language {
payload["language_code"] = language
}
var voiceSettings: [String: Any] = [:]
if let speed = request.speed { voiceSettings["speed"] = speed }
if let stability = request.stability { voiceSettings["stability"] = stability }
if let similarity = request.similarity { voiceSettings["similarity_boost"] = similarity }
if let style = request.style { voiceSettings["style"] = style }
if let speakerBoost = request.speakerBoost { voiceSettings["use_speaker_boost"] = speakerBoost }
if !voiceSettings.isEmpty { payload["voice_settings"] = voiceSettings }
let body = try JSONSerialization.data(withJSONObject: payload, options: [])
var req = URLRequest(url: url)
req.httpMethod = "POST"
req.httpBody = body
req.timeoutInterval = 45
req.setValue("application/json", forHTTPHeaderField: "Content-Type")
req.setValue("audio/mpeg", forHTTPHeaderField: "Accept")
req.setValue(self.apiKey, forHTTPHeaderField: "xi-api-key")
let (data, response) = try await URLSession.shared.data(for: req)
if let http = response as? HTTPURLResponse, http.statusCode >= 400 {
let message = String(data: data, encoding: .utf8) ?? "unknown"
throw NSError(domain: "TalkTTS", code: http.statusCode, userInfo: [
NSLocalizedDescriptionKey: "ElevenLabs failed: \(http.statusCode) \(message)",
])
}
return data
}
}
private enum TalkModeRuntime {
static func resolveSpeed(speed: Double?, rateWPM: Int?) -> Double? {
if let rateWPM, rateWPM > 0 {
let resolved = Double(rateWPM) / 175.0
if resolved <= 0.5 || resolved >= 2.0 { return nil }
return resolved
}
if let speed {
if speed <= 0.5 || speed >= 2.0 { return nil }
return speed
}
return nil
}
static func validatedUnit(_ value: Double?) -> Double? {
guard let value else { return nil }
if value < 0 || value > 1 { return nil }
return value
}
static func validatedSeed(_ value: Int?) -> UInt32? {
guard let value else { return nil }
if value < 0 || value > 4_294_967_295 { return nil }
return UInt32(value)
}
static func validatedNormalize(_ value: String?) -> String? {
guard let value else { return nil }
let normalized = value.trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
return ["auto", "on", "off"].contains(normalized) ? normalized : nil
}
static func validatedLanguage(_ value: String?) -> String? {
guard let value else { return nil }
let normalized = value.trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
guard normalized.count == 2, normalized.allSatisfy({ $0 >= "a" && $0 <= "z" }) else { return nil }
return normalized
}
static func validatedOutputFormat(_ value: String?) -> String? {
let trimmed = value?.trimmingCharacters(in: .whitespacesAndNewlines) ?? ""
guard !trimmed.isEmpty else { return nil }
return trimmed.hasPrefix("mp3_") ? trimmed : nil
}
static func isMessageTimestampAfter(_ timestamp: Double, sinceSeconds: Double) -> Bool {
let sinceMs = sinceSeconds * 1000
if timestamp > 10_000_000_000 {
return timestamp >= sinceMs - 500
}
return timestamp >= sinceSeconds - 0.5
}
}

View File

@@ -12,14 +12,14 @@ struct TalkOrbOverlay: View {
ZStack {
Circle()
.stroke(seam.opacity(0.26), lineWidth: 2)
.frame(width: 280, height: 280)
.frame(width: 320, height: 320)
.scaleEffect(self.pulse ? 1.15 : 0.96)
.opacity(self.pulse ? 0.0 : 1.0)
.animation(.easeOut(duration: 1.3).repeatForever(autoreverses: false), value: self.pulse)
Circle()
.stroke(seam.opacity(0.18), lineWidth: 2)
.frame(width: 280, height: 280)
.frame(width: 320, height: 320)
.scaleEffect(self.pulse ? 1.45 : 1.02)
.opacity(self.pulse ? 0.0 : 0.9)
.animation(.easeOut(duration: 1.9).repeatForever(autoreverses: false).delay(0.2), value: self.pulse)
@@ -35,7 +35,7 @@ struct TalkOrbOverlay: View {
center: .center,
startRadius: 1,
endRadius: 112))
.frame(width: 168, height: 168)
.frame(width: 190, height: 190)
.overlay(
Circle()
.stroke(seam.opacity(0.35), lineWidth: 1))

View File

@@ -291,7 +291,9 @@ actor TalkModeRuntime {
await self.reloadConfig()
guard self.isCurrent(gen) else { return }
let prompt = self.buildPrompt(transcript: transcript)
let sessionKey = await GatewayConnection.shared.mainSessionKey()
let sessionKey =
await MainActor.run { WebChatManager.shared.activeSessionKey } ??
await GatewayConnection.shared.mainSessionKey()
let runId = UUID().uuidString
let startedAt = Date().timeIntervalSince1970
self.logger.info(
@@ -335,20 +337,9 @@ actor TalkModeRuntime {
}
private func buildPrompt(transcript: String) -> String {
var lines: [String] = [
"Talk Mode active. Reply in a concise, spoken tone.",
"You may optionally prefix the response with JSON (first line) to set ElevenLabs voice, e.g. {\"voice\":\"<id>\",\"once\":true}.",
]
if let interrupted = self.lastInterruptedAtSeconds {
let formatted = String(format: "%.1f", interrupted)
lines.append("Assistant speech interrupted at \(formatted)s.")
self.lastInterruptedAtSeconds = nil
}
lines.append("")
lines.append(transcript)
return lines.joined(separator: "\n")
let interrupted = self.lastInterruptedAtSeconds
self.lastInterruptedAtSeconds = nil
return TalkPromptBuilder.build(transcript: transcript, interruptedAtSeconds: interrupted)
}
private func waitForAssistantText(
@@ -378,7 +369,7 @@ actor TalkModeRuntime {
guard message.role == "assistant" else { return false }
guard let since else { return true }
guard let timestamp = message.timestamp else { return false }
return Self.isMessageTimestampAfter(timestamp, sinceSeconds: since)
return TalkHistoryTimestamp.isAfter(timestamp, sinceSeconds: since)
}
guard let assistant else { return nil }
let text = assistant.content.compactMap { $0.text }.joined(separator: "\n")
@@ -421,76 +412,108 @@ actor TalkModeRuntime {
}
}
guard let apiKey = self.apiKey, !apiKey.isEmpty else {
self.logger.error("talk missing ELEVENLABS_API_KEY")
return
}
let apiKey = self.apiKey?.trimmingCharacters(in: .whitespacesAndNewlines)
let requestedVoice =
directive?.voiceId ??
self.currentVoiceId ??
self.defaultVoiceId
guard let voiceId = await self.resolveVoiceId(preferred: requestedVoice, apiKey: apiKey) else {
self.logger.error("talk missing voiceId; set talk.voiceId or ELEVENLABS_VOICE_ID")
return
}
guard self.isCurrent(gen) else { return }
self.ttsLogger.info("talk TTS request voiceId=\(voiceId, privacy: .public) chars=\(cleaned.count, privacy: .public)")
await self.startRecognition()
guard self.isCurrent(gen) else { return }
await MainActor.run { TalkModeController.shared.updatePhase(.speaking) }
self.phase = .speaking
let language = ElevenLabsTTSClient.validatedLanguage(directive?.language)
let voiceId: String?
if let apiKey, !apiKey.isEmpty {
voiceId = await self.resolveVoiceId(preferred: requestedVoice, apiKey: apiKey)
} else {
voiceId = nil
}
if apiKey?.isEmpty != false {
self.ttsLogger.warning("talk missing ELEVENLABS_API_KEY; falling back to system voice")
} else if voiceId == nil {
self.ttsLogger.warning("talk missing voiceId; falling back to system voice")
} else if let voiceId {
self.ttsLogger.info("talk TTS request voiceId=\(voiceId, privacy: .public) chars=\(cleaned.count, privacy: .public)")
}
self.lastSpokenText = cleaned
let resolvedSpeed = Self.resolveSpeed(
speed: directive?.speed,
rateWPM: directive?.rateWPM,
logger: self.logger)
let request = ElevenLabsRequest(
text: cleaned,
modelId: directive?.modelId ?? self.currentModelId ?? self.defaultModelId,
outputFormat: Self.validatedOutputFormat(directive?.outputFormat ?? self.defaultOutputFormat, logger: self.logger),
speed: resolvedSpeed,
stability: Self.validatedUnit(directive?.stability, name: "stability", logger: self.logger),
similarity: Self.validatedUnit(directive?.similarity, name: "similarity", logger: self.logger),
style: Self.validatedUnit(directive?.style, name: "style", logger: self.logger),
speakerBoost: directive?.speakerBoost,
seed: Self.validatedSeed(directive?.seed, logger: self.logger),
normalize: Self.validatedNormalize(directive?.normalize, logger: self.logger),
language: Self.validatedLanguage(directive?.language, logger: self.logger))
let synthTimeoutSeconds = max(20.0, min(90.0, Double(cleaned.count) * 0.12))
self.ttsLogger.info("talk TTS synth timeout=\(synthTimeoutSeconds, privacy: .public)s")
do {
let client = ElevenLabsClient(apiKey: apiKey, logger: self.ttsLogger)
let audio = try await withThrowingTaskGroup(of: Data.self) { group in
group.addTask {
try await client.synthesize(voiceId: voiceId, request: request)
if let apiKey, !apiKey.isEmpty, let voiceId {
let desiredOutputFormat = directive?.outputFormat ?? self.defaultOutputFormat
let outputFormat = ElevenLabsTTSClient.validatedOutputFormat(desiredOutputFormat)
if outputFormat == nil, let desiredOutputFormat, !desiredOutputFormat.isEmpty {
self.logger.warning("talk output_format unsupported for local playback: \(desiredOutputFormat, privacy: .public)")
}
group.addTask {
try await Task.sleep(nanoseconds: UInt64(synthTimeoutSeconds * 1_000_000_000))
throw NSError(domain: "TalkTTS", code: 408, userInfo: [
NSLocalizedDescriptionKey: "ElevenLabs TTS timed out after \(synthTimeoutSeconds)s",
let request = ElevenLabsTTSRequest(
text: cleaned,
modelId: directive?.modelId ?? self.currentModelId ?? self.defaultModelId,
outputFormat: outputFormat,
speed: TalkTTSValidation.resolveSpeed(speed: directive?.speed, rateWPM: directive?.rateWPM),
stability: TalkTTSValidation.validatedUnit(directive?.stability),
similarity: TalkTTSValidation.validatedUnit(directive?.similarity),
style: TalkTTSValidation.validatedUnit(directive?.style),
speakerBoost: directive?.speakerBoost,
seed: TalkTTSValidation.validatedSeed(directive?.seed),
normalize: ElevenLabsTTSClient.validatedNormalize(directive?.normalize),
language: language)
self.ttsLogger.info("talk TTS synth timeout=\(synthTimeoutSeconds, privacy: .public)s")
let client = ElevenLabsTTSClient(apiKey: apiKey)
let audio = try await client.synthesizeWithHardTimeout(
voiceId: voiceId,
request: request,
hardTimeoutSeconds: synthTimeoutSeconds)
guard self.isCurrent(gen) else { return }
self.ttsLogger.info("talk TTS response bytes=\(audio.count, privacy: .public)")
if self.interruptOnSpeech {
await self.startRecognition()
guard self.isCurrent(gen) else { return }
}
await MainActor.run { TalkModeController.shared.updatePhase(.speaking) }
self.phase = .speaking
let result = await TalkAudioPlayer.shared.play(data: audio)
self.ttsLogger.info("talk audio result finished=\(result.finished, privacy: .public) interruptedAt=\(String(describing: result.interruptedAt), privacy: .public)")
if !result.finished, result.interruptedAt == nil {
throw NSError(domain: "TalkAudioPlayer", code: 1, userInfo: [
NSLocalizedDescriptionKey: "audio playback failed",
])
}
let data = try await group.next()!
group.cancelAll()
return data
}
guard self.isCurrent(gen) else { return }
self.ttsLogger.info("talk TTS response bytes=\(audio.count, privacy: .public)")
let result = await TalkAudioPlayer.shared.play(data: audio)
self.ttsLogger.info("talk audio result finished=\(result.finished, privacy: .public) interruptedAt=\(String(describing: result.interruptedAt), privacy: .public)")
if !result.finished, let interruptedAt = result.interruptedAt, self.phase == .speaking {
if self.interruptOnSpeech {
self.lastInterruptedAtSeconds = interruptedAt
if !result.finished, let interruptedAt = result.interruptedAt, self.phase == .speaking {
if self.interruptOnSpeech {
self.lastInterruptedAtSeconds = interruptedAt
}
}
} else {
self.ttsLogger.info("talk system voice start chars=\(cleaned.count, privacy: .public)")
if self.interruptOnSpeech {
await self.startRecognition()
guard self.isCurrent(gen) else { return }
}
await MainActor.run { TalkModeController.shared.updatePhase(.speaking) }
self.phase = .speaking
await TalkSystemSpeechSynthesizer.shared.stop()
try await TalkSystemSpeechSynthesizer.shared.speak(text: cleaned, language: language)
self.ttsLogger.info("talk system voice done")
}
} catch {
self.logger.error("talk TTS failed: \(error.localizedDescription, privacy: .public)")
self.ttsLogger.error("talk TTS failed: \(error.localizedDescription, privacy: .public); falling back to system voice")
do {
if self.interruptOnSpeech {
await self.startRecognition()
guard self.isCurrent(gen) else { return }
}
await MainActor.run { TalkModeController.shared.updatePhase(.speaking) }
self.phase = .speaking
await TalkSystemSpeechSynthesizer.shared.stop()
try await TalkSystemSpeechSynthesizer.shared.speak(text: cleaned, language: language)
} catch {
self.ttsLogger.error("talk system voice failed: \(error.localizedDescription, privacy: .public)")
}
}
if self.phase == .speaking {
@@ -505,7 +528,7 @@ actor TalkModeRuntime {
if let fallbackVoiceId { return fallbackVoiceId }
do {
let voices = try await ElevenLabsClient(apiKey: apiKey, logger: self.ttsLogger).listVoices()
let voices = try await ElevenLabsTTSClient(apiKey: apiKey).listVoices()
guard let first = voices.first else {
self.ttsLogger.error("elevenlabs voices list empty")
return nil
@@ -528,6 +551,7 @@ actor TalkModeRuntime {
func stopSpeaking(reason: TalkStopReason) async {
let interruptedAt = await MainActor.run { TalkAudioPlayer.shared.stop() }
await TalkSystemSpeechSynthesizer.shared.stop()
guard self.phase == .speaking else { return }
if reason == .speech, let interruptedAt {
self.lastInterruptedAtSeconds = interruptedAt
@@ -720,154 +744,4 @@ actor TalkModeRuntime {
return normalized
}
private static func validatedLanguage(_ value: String?, logger: Logger) -> String? {
guard let value else { return nil }
let normalized = value.trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
guard normalized.count == 2, normalized.allSatisfy({ $0 >= "a" && $0 <= "z" }) else {
logger.warning("talk language invalid: \(normalized, privacy: .public)")
return nil
}
return normalized
}
private static func validatedOutputFormat(_ value: String?, logger: Logger) -> String? {
let trimmed = value?.trimmingCharacters(in: .whitespacesAndNewlines) ?? ""
guard !trimmed.isEmpty else { return nil }
guard trimmed.hasPrefix("mp3_") else {
logger.warning("talk output_format unsupported for local playback: \(trimmed, privacy: .public)")
return nil
}
return trimmed
}
private static func isMessageTimestampAfter(_ timestamp: Double, sinceSeconds: Double) -> Bool {
let sinceMs = sinceSeconds * 1000
if timestamp > 10_000_000_000 {
return timestamp >= sinceMs - 500
}
return timestamp >= sinceSeconds - 0.5
}
}
private struct ElevenLabsRequest {
let text: String
let modelId: String?
let outputFormat: String?
let speed: Double?
let stability: Double?
let similarity: Double?
let style: Double?
let speakerBoost: Bool?
let seed: UInt32?
let normalize: String?
let language: String?
}
private struct ElevenLabsClient {
let apiKey: String
let logger: Logger
let baseUrl: URL = URL(string: "https://api.elevenlabs.io")!
let ttsTimeoutSeconds: TimeInterval = 45
let listVoicesTimeoutSeconds: TimeInterval = 15
func synthesize(voiceId: String, request: ElevenLabsRequest) async throws -> Data {
var url = self.baseUrl
url.appendPathComponent("v1")
url.appendPathComponent("text-to-speech")
url.appendPathComponent(voiceId)
let charCount = request.text.count
self.logger.info(
"elevenlabs tts request voice=\(voiceId, privacy: .public) model=\(request.modelId ?? "default", privacy: .public) chars=\(charCount, privacy: .public)")
let startedAt = Date()
var payload: [String: Any] = [
"text": request.text,
]
if let modelId = request.modelId, !modelId.isEmpty {
payload["model_id"] = modelId
}
if let outputFormat = request.outputFormat, !outputFormat.isEmpty {
payload["output_format"] = outputFormat
}
if let seed = request.seed {
payload["seed"] = seed
}
if let normalize = request.normalize {
payload["apply_text_normalization"] = normalize
}
if let language = request.language {
payload["language_code"] = language
}
var voiceSettings: [String: Any] = [:]
if let speed = request.speed { voiceSettings["speed"] = speed }
if let stability = request.stability { voiceSettings["stability"] = stability }
if let similarity = request.similarity { voiceSettings["similarity_boost"] = similarity }
if let style = request.style { voiceSettings["style"] = style }
if let speakerBoost = request.speakerBoost { voiceSettings["use_speaker_boost"] = speakerBoost }
if !voiceSettings.isEmpty {
payload["voice_settings"] = voiceSettings
}
let body = try JSONSerialization.data(withJSONObject: payload, options: [])
var req = URLRequest(url: url)
req.httpMethod = "POST"
req.httpBody = body
req.timeoutInterval = self.ttsTimeoutSeconds
req.setValue("application/json", forHTTPHeaderField: "Content-Type")
req.setValue("audio/mpeg", forHTTPHeaderField: "Accept")
req.setValue(self.apiKey, forHTTPHeaderField: "xi-api-key")
let (data, response) = try await URLSession.shared.data(for: req)
if let http = response as? HTTPURLResponse, http.statusCode >= 400 {
let message = String(data: data, encoding: .utf8) ?? "unknown"
self.logger.error(
"elevenlabs tts failed status=\(http.statusCode, privacy: .public) message=\(message, privacy: .public)")
throw NSError(domain: "TalkTTS", code: http.statusCode, userInfo: [
NSLocalizedDescriptionKey: "ElevenLabs failed: \(http.statusCode) \(message)",
])
}
let elapsed = Date().timeIntervalSince(startedAt)
self.logger.info("elevenlabs tts ok bytes=\(data.count, privacy: .public) dur=\(elapsed, privacy: .public)s")
return data
}
func listVoices() async throws -> [ElevenLabsVoice] {
var url = self.baseUrl
url.appendPathComponent("v1")
url.appendPathComponent("voices")
self.logger.info("elevenlabs voices list request")
var req = URLRequest(url: url)
req.httpMethod = "GET"
req.timeoutInterval = self.listVoicesTimeoutSeconds
req.setValue(self.apiKey, forHTTPHeaderField: "xi-api-key")
let (data, response) = try await URLSession.shared.data(for: req)
if let http = response as? HTTPURLResponse, http.statusCode >= 400 {
let message = String(data: data, encoding: .utf8) ?? "unknown"
self.logger.error(
"elevenlabs voices list failed status=\(http.statusCode, privacy: .public) message=\(message, privacy: .public)")
throw NSError(domain: "TalkTTS", code: http.statusCode, userInfo: [
NSLocalizedDescriptionKey: "ElevenLabs voices failed: \(http.statusCode) \(message)",
])
}
let decoded = try JSONDecoder().decode(ElevenLabsVoicesResponse.self, from: data)
return decoded.voices
}
}
private struct ElevenLabsVoice: Decodable {
let voiceId: String
let name: String?
enum CodingKeys: String, CodingKey {
case voiceId = "voice_id"
case name
}
}
private struct ElevenLabsVoicesResponse: Decodable {
let voices: [ElevenLabsVoice]
}

View File

@@ -7,7 +7,7 @@ import SwiftUI
@Observable
final class TalkOverlayController {
static let shared = TalkOverlayController()
static let overlaySize: CGFloat = 360
static let overlaySize: CGFloat = 440
static let windowInset: CGFloat = 88
private let logger = Logger(subsystem: "com.steipete.clawdis", category: "talk.overlay")

View File

@@ -31,7 +31,7 @@ struct TalkOverlayView: View {
}
.buttonStyle(.plain)
.contentShape(Circle())
.offset(x: -5, y: -5)
.offset(x: -2, y: -2)
.opacity(self.hoveringWindow ? 1 : 0)
.animation(.easeOut(duration: 0.12), value: self.hoveringWindow)
.allowsHitTesting(self.hoveringWindow)
@@ -42,7 +42,7 @@ struct TalkOverlayView: View {
.onHover { self.hoveringWindow = $0 }
}
private static let defaultSeamColor = Color(red: 127 / 255.0, green: 184 / 255.0, blue: 212 / 255.0)
private static let defaultSeamColor = Color(red: 79 / 255.0, green: 122 / 255.0, blue: 154 / 255.0)
private var seamColor: Color {
Self.color(fromHex: self.appState.seamColorHex) ?? Self.defaultSeamColor

View File

@@ -29,6 +29,10 @@ final class WebChatManager {
var onPanelVisibilityChanged: ((Bool) -> Void)?
var activeSessionKey: String? {
self.panelSessionKey ?? self.windowSessionKey
}
func show(sessionKey: String) {
self.closePanel()
if let controller = self.windowController {

View File

@@ -0,0 +1,233 @@
import Foundation
public struct ElevenLabsVoice: Decodable, Sendable {
public let voiceId: String
public let name: String?
enum CodingKeys: String, CodingKey {
case voiceId = "voice_id"
case name
}
}
public struct ElevenLabsTTSRequest: Sendable {
public var text: String
public var modelId: String?
public var outputFormat: String?
public var speed: Double?
public var stability: Double?
public var similarity: Double?
public var style: Double?
public var speakerBoost: Bool?
public var seed: UInt32?
public var normalize: String?
public var language: String?
public init(
text: String,
modelId: String? = nil,
outputFormat: String? = nil,
speed: Double? = nil,
stability: Double? = nil,
similarity: Double? = nil,
style: Double? = nil,
speakerBoost: Bool? = nil,
seed: UInt32? = nil,
normalize: String? = nil,
language: String? = nil)
{
self.text = text
self.modelId = modelId
self.outputFormat = outputFormat
self.speed = speed
self.stability = stability
self.similarity = similarity
self.style = style
self.speakerBoost = speakerBoost
self.seed = seed
self.normalize = normalize
self.language = language
}
}
public struct ElevenLabsTTSClient: Sendable {
public var apiKey: String
public var requestTimeoutSeconds: TimeInterval
public var listVoicesTimeoutSeconds: TimeInterval
public var baseUrl: URL
public init(
apiKey: String,
requestTimeoutSeconds: TimeInterval = 45,
listVoicesTimeoutSeconds: TimeInterval = 15,
baseUrl: URL = URL(string: "https://api.elevenlabs.io")!)
{
self.apiKey = apiKey
self.requestTimeoutSeconds = requestTimeoutSeconds
self.listVoicesTimeoutSeconds = listVoicesTimeoutSeconds
self.baseUrl = baseUrl
}
public func synthesizeWithHardTimeout(
voiceId: String,
request: ElevenLabsTTSRequest,
hardTimeoutSeconds: TimeInterval) async throws -> Data
{
try await withThrowingTaskGroup(of: Data.self) { group in
group.addTask {
try await self.synthesize(voiceId: voiceId, request: request)
}
group.addTask {
try await Task.sleep(nanoseconds: UInt64(hardTimeoutSeconds * 1_000_000_000))
throw NSError(domain: "ElevenLabsTTS", code: 408, userInfo: [
NSLocalizedDescriptionKey: "ElevenLabs TTS timed out after \(hardTimeoutSeconds)s",
])
}
let data = try await group.next()!
group.cancelAll()
return data
}
}
public func synthesize(voiceId: String, request: ElevenLabsTTSRequest) async throws -> Data {
var url = self.baseUrl
url.appendPathComponent("v1")
url.appendPathComponent("text-to-speech")
url.appendPathComponent(voiceId)
let body = try JSONSerialization.data(withJSONObject: Self.buildPayload(request), options: [])
var lastError: Error?
for attempt in 0..<3 {
var req = URLRequest(url: url)
req.httpMethod = "POST"
req.httpBody = body
req.timeoutInterval = self.requestTimeoutSeconds
req.setValue("application/json", forHTTPHeaderField: "Content-Type")
req.setValue("audio/mpeg", forHTTPHeaderField: "Accept")
req.setValue(self.apiKey, forHTTPHeaderField: "xi-api-key")
do {
let (data, response) = try await URLSession.shared.data(for: req)
if let http = response as? HTTPURLResponse {
let contentType = (http.value(forHTTPHeaderField: "Content-Type") ?? "unknown").lowercased()
if http.statusCode == 429 || http.statusCode >= 500 {
let message = Self.truncatedErrorBody(data)
lastError = NSError(domain: "ElevenLabsTTS", code: http.statusCode, userInfo: [
NSLocalizedDescriptionKey: "ElevenLabs retryable failure: \(http.statusCode) ct=\(contentType) \(message)",
])
if attempt < 2 {
let retryAfter = Double(http.value(forHTTPHeaderField: "Retry-After") ?? "")
let baseDelay = [0.25, 0.75, 1.5][attempt]
let delaySeconds = max(baseDelay, retryAfter ?? 0)
try? await Task.sleep(nanoseconds: UInt64(delaySeconds * 1_000_000_000))
continue
}
throw lastError!
}
if http.statusCode >= 400 {
let message = Self.truncatedErrorBody(data)
throw NSError(domain: "ElevenLabsTTS", code: http.statusCode, userInfo: [
NSLocalizedDescriptionKey: "ElevenLabs failed: \(http.statusCode) ct=\(contentType) \(message)",
])
}
if !contentType.contains("audio") {
let message = Self.truncatedErrorBody(data)
throw NSError(domain: "ElevenLabsTTS", code: 415, userInfo: [
NSLocalizedDescriptionKey: "ElevenLabs returned non-audio ct=\(contentType) \(message)",
])
}
}
return data
} catch {
lastError = error
if attempt < 2 {
try? await Task.sleep(nanoseconds: UInt64([0.25, 0.75, 1.5][attempt] * 1_000_000_000))
continue
}
throw error
}
}
throw lastError ?? NSError(domain: "ElevenLabsTTS", code: 1, userInfo: [
NSLocalizedDescriptionKey: "ElevenLabs failed",
])
}
public func listVoices() async throws -> [ElevenLabsVoice] {
var url = self.baseUrl
url.appendPathComponent("v1")
url.appendPathComponent("voices")
var req = URLRequest(url: url)
req.httpMethod = "GET"
req.timeoutInterval = self.listVoicesTimeoutSeconds
req.setValue(self.apiKey, forHTTPHeaderField: "xi-api-key")
let (data, response) = try await URLSession.shared.data(for: req)
if let http = response as? HTTPURLResponse, http.statusCode >= 400 {
let message = Self.truncatedErrorBody(data)
throw NSError(domain: "ElevenLabsTTS", code: http.statusCode, userInfo: [
NSLocalizedDescriptionKey: "ElevenLabs voices failed: \(http.statusCode) \(message)",
])
}
struct VoicesResponse: Decodable { let voices: [ElevenLabsVoice] }
return try JSONDecoder().decode(VoicesResponse.self, from: data).voices
}
public static func validatedOutputFormat(_ value: String?) -> String? {
let trimmed = (value ?? "").trimmingCharacters(in: .whitespacesAndNewlines)
guard !trimmed.isEmpty else { return nil }
guard trimmed.hasPrefix("mp3_") else { return nil }
return trimmed
}
public static func validatedLanguage(_ value: String?) -> String? {
let normalized = (value ?? "").trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
guard normalized.count == 2, normalized.allSatisfy({ $0 >= "a" && $0 <= "z" }) else { return nil }
return normalized
}
public static func validatedNormalize(_ value: String?) -> String? {
let normalized = (value ?? "").trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
guard ["auto", "on", "off"].contains(normalized) else { return nil }
return normalized
}
private static func buildPayload(_ request: ElevenLabsTTSRequest) -> [String: Any] {
var payload: [String: Any] = ["text": request.text]
if let modelId = request.modelId?.trimmingCharacters(in: .whitespacesAndNewlines), !modelId.isEmpty {
payload["model_id"] = modelId
}
if let outputFormat = request.outputFormat?.trimmingCharacters(in: .whitespacesAndNewlines), !outputFormat.isEmpty {
payload["output_format"] = outputFormat
}
if let seed = request.seed {
payload["seed"] = seed
}
if let normalize = request.normalize {
payload["apply_text_normalization"] = normalize
}
if let language = request.language {
payload["language_code"] = language
}
var voiceSettings: [String: Any] = [:]
if let speed = request.speed { voiceSettings["speed"] = speed }
if let stability = request.stability { voiceSettings["stability"] = stability }
if let similarity = request.similarity { voiceSettings["similarity_boost"] = similarity }
if let style = request.style { voiceSettings["style"] = style }
if let speakerBoost = request.speakerBoost { voiceSettings["use_speaker_boost"] = speakerBoost }
if !voiceSettings.isEmpty {
payload["voice_settings"] = voiceSettings
}
return payload
}
private static func truncatedErrorBody(_ data: Data) -> String {
let raw = String(data: data.prefix(4096), encoding: .utf8) ?? "unknown"
return raw.replacingOccurrences(of: "\n", with: " ").replacingOccurrences(of: "\r", with: " ")
}
}
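
A hedged usage sketch for the client above; the voice ID is a placeholder and the greeting text is illustrative:

import Foundation

func synthesizeGreeting() async throws -> Data {
    // Assumes ELEVENLABS_API_KEY is set in the environment.
    let apiKey = ProcessInfo.processInfo.environment["ELEVENLABS_API_KEY"] ?? ""
    let client = ElevenLabsTTSClient(apiKey: apiKey)
    let request = ElevenLabsTTSRequest(text: "Hello from Talk Mode.", outputFormat: "mp3_44100_128")
    // synthesize() retries 429/5xx with backoff (honoring Retry-After);
    // the hard timeout caps total wall time across those retries.
    return try await client.synthesizeWithHardTimeout(
        voiceId: "YOUR_VOICE_ID",
        request: request,
        hardTimeoutSeconds: 30)
}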

View File

@@ -67,12 +67,18 @@ public enum TalkDirectiveParser {
var lines = normalized.split(separator: "\n", omittingEmptySubsequences: false)
guard !lines.isEmpty else { return TalkDirectiveParseResult(directive: nil, stripped: text, unknownKeys: []) }
guard let firstNonEmpty =
guard let firstNonEmptyIndex =
lines.firstIndex(where: { !$0.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty })
else {
return TalkDirectiveParseResult(directive: nil, stripped: text, unknownKeys: [])
}
var firstNonEmpty = firstNonEmptyIndex
if firstNonEmpty > 0 {
lines.removeSubrange(0..<firstNonEmpty)
firstNonEmpty = 0
}
let head = lines[firstNonEmpty].trimmingCharacters(in: .whitespacesAndNewlines)
guard head.hasPrefix("{"), head.hasSuffix("}") else {
return TalkDirectiveParseResult(directive: nil, stripped: text, unknownKeys: [])
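
With this change, a reply with leading blank lines still parses its directive; a small sketch mirroring the test added below (values are hypothetical):

let reply = "\n\n{\"voice\":\"abc123\"}\nHello there."
let parsed = TalkDirectiveParser.parse(reply)
// parsed.directive?.voiceId == "abc123"
// parsed.stripped == "Hello there."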

View File

@@ -0,0 +1,13 @@
public enum TalkHistoryTimestamp: Sendable {
/// Gateway history timestamps have historically been emitted as either seconds (Double, epoch seconds)
/// or milliseconds (Double, epoch ms). This helper accepts either.
public static func isAfter(_ timestamp: Double, sinceSeconds: Double) -> Bool {
let sinceMs = sinceSeconds * 1000
// ~2286-11-20 in epoch seconds. Anything bigger is almost certainly epoch milliseconds.
if timestamp > 10_000_000_000 {
return timestamp >= sinceMs - 500
}
return timestamp >= sinceSeconds - 0.5
}
}
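
Both encodings compare against the same "since" expressed in seconds; a quick sketch (the epoch values are arbitrary):

let since = 1_700_000_000.0
_ = TalkHistoryTimestamp.isAfter(1_700_000_100, sinceSeconds: since)     // epoch seconds → true
_ = TalkHistoryTimestamp.isAfter(1_700_000_100_000, sinceSeconds: since) // epoch milliseconds → true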

View File

@@ -0,0 +1,18 @@
public enum TalkPromptBuilder: Sendable {
public static func build(transcript: String, interruptedAtSeconds: Double?) -> String {
var lines: [String] = [
"Talk Mode active. Reply in a concise, spoken tone.",
"You may optionally prefix the response with JSON (first line) to set ElevenLabs voice, e.g. {\"voice\":\"<id>\",\"once\":true}.",
]
if let interruptedAtSeconds {
let formatted = String(format: "%.1f", interruptedAtSeconds)
lines.append("Assistant speech interrupted at \(formatted)s.")
}
lines.append("")
lines.append(transcript)
return lines.joined(separator: "\n")
}
}
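
Example output shape (transcript and timing are illustrative):

let prompt = TalkPromptBuilder.build(transcript: "What changed?", interruptedAtSeconds: 3.5)
// Header lines ("Talk Mode active. ..."), then
// "Assistant speech interrupted at 3.5s.", a blank line, and "What changed?"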

View File

@@ -0,0 +1,110 @@
import AVFoundation
import Foundation
@MainActor
public final class TalkSystemSpeechSynthesizer: NSObject {
public enum SpeakError: Error {
case canceled
}
public static let shared = TalkSystemSpeechSynthesizer()
private let synth = AVSpeechSynthesizer()
private var speakContinuation: CheckedContinuation<Void, Error>?
private var currentUtterance: AVSpeechUtterance?
private var currentToken = UUID()
private var watchdog: Task<Void, Never>?
public var isSpeaking: Bool { self.synth.isSpeaking }
private override init() {
super.init()
self.synth.delegate = self
}
public func stop() {
self.currentToken = UUID()
self.watchdog?.cancel()
self.watchdog = nil
self.synth.stopSpeaking(at: .immediate)
self.finishCurrent(with: SpeakError.canceled)
}
public func speak(text: String, language: String? = nil) async throws {
let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines)
guard !trimmed.isEmpty else { return }
self.stop()
let token = UUID()
self.currentToken = token
let utterance = AVSpeechUtterance(string: trimmed)
if let language, let voice = AVSpeechSynthesisVoice(language: language) {
utterance.voice = voice
}
self.currentUtterance = utterance
let estimatedSeconds = max(3.0, min(180.0, Double(trimmed.count) * 0.08))
self.watchdog?.cancel()
self.watchdog = Task { @MainActor [weak self] in
guard let self else { return }
try? await Task.sleep(nanoseconds: UInt64(estimatedSeconds * 1_000_000_000))
if Task.isCancelled { return }
guard self.currentToken == token else { return }
if self.synth.isSpeaking {
self.synth.stopSpeaking(at: .immediate)
}
self.finishCurrent(
with: NSError(domain: "TalkSystemSpeechSynthesizer", code: 408, userInfo: [
NSLocalizedDescriptionKey: "system TTS timed out after \(estimatedSeconds)s",
]))
}
try await withTaskCancellationHandler(operation: {
try await withCheckedThrowingContinuation { cont in
self.speakContinuation = cont
self.synth.speak(utterance)
}
}, onCancel: {
Task { @MainActor in
self.stop()
}
})
if self.currentToken != token {
throw SpeakError.canceled
}
}
private func handleFinish(error: Error?) {
guard self.currentUtterance != nil else { return }
self.watchdog?.cancel()
self.watchdog = nil
self.finishCurrent(with: error)
}
private func finishCurrent(with error: Error?) {
self.currentUtterance = nil
let cont = self.speakContinuation
self.speakContinuation = nil
if let error {
cont?.resume(throwing: error)
} else {
cont?.resume(returning: ())
}
}
}
extension TalkSystemSpeechSynthesizer: AVSpeechSynthesizerDelegate {
public nonisolated func speechSynthesizer(_ synthesizer: AVSpeechSynthesizer, didFinish utterance: AVSpeechUtterance) {
Task { @MainActor in
self.handleFinish(error: nil)
}
}
public nonisolated func speechSynthesizer(_ synthesizer: AVSpeechSynthesizer, didCancel utterance: AVSpeechUtterance) {
Task { @MainActor in
self.handleFinish(error: SpeakError.canceled)
}
}
}
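
Callers are expected to be on the main actor; a minimal usage sketch (the function name is illustrative):

@MainActor
func speakFallbackCheck() async {
    do {
        try await TalkSystemSpeechSynthesizer.shared.speak(text: "Fallback voice check.", language: "en")
    } catch TalkSystemSpeechSynthesizer.SpeakError.canceled {
        // stop() was called or a newer utterance superseded this one.
    } catch {
        // Watchdog timeout or synthesizer failure.
    }
}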

View File

@@ -0,0 +1,27 @@
public enum TalkTTSValidation: Sendable {
public static func resolveSpeed(speed: Double?, rateWPM: Int?) -> Double? {
if let rateWPM, rateWPM > 0 {
let resolved = Double(rateWPM) / 175.0
if resolved <= 0.5 || resolved >= 2.0 { return nil }
return resolved
}
if let speed {
if speed <= 0.5 || speed >= 2.0 { return nil }
return speed
}
return nil
}
public static func validatedUnit(_ value: Double?) -> Double? {
guard let value else { return nil }
if value < 0 || value > 1 { return nil }
return value
}
public static func validatedSeed(_ value: Int?) -> UInt32? {
guard let value else { return nil }
if value < 0 || value > 4294967295 { return nil }
return UInt32(value)
}
}
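
A couple of boundary examples (values are illustrative): rateWPM is converted against a 175 WPM baseline and takes precedence over speed, and values at or beyond the 0.5 and 2.0 bounds are rejected.

_ = TalkTTSValidation.resolveSpeed(speed: nil, rateWPM: 140) // 0.8
_ = TalkTTSValidation.resolveSpeed(speed: 2.5, rateWPM: nil) // nil: outside the open (0.5, 2.0) range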

View File

@@ -0,0 +1,20 @@
import XCTest
@testable import ClawdisKit
final class ElevenLabsTTSValidationTests: XCTestCase {
func testValidatedOutputFormatAllowsOnlyMp3Presets() {
XCTAssertEqual(ElevenLabsTTSClient.validatedOutputFormat("mp3_44100_128"), "mp3_44100_128")
XCTAssertNil(ElevenLabsTTSClient.validatedOutputFormat("pcm_16000"))
}
func testValidatedLanguageAcceptsTwoLetterCodes() {
XCTAssertEqual(ElevenLabsTTSClient.validatedLanguage("EN"), "en")
XCTAssertNil(ElevenLabsTTSClient.validatedLanguage("eng"))
}
func testValidatedNormalizeAcceptsKnownValues() {
XCTAssertEqual(ElevenLabsTTSClient.validatedNormalize("AUTO"), "auto")
XCTAssertNil(ElevenLabsTTSClient.validatedNormalize("maybe"))
}
}

View File

@@ -50,6 +50,18 @@ final class TalkDirectiveTests: XCTestCase {
XCTAssertEqual(result.stripped, "Hello.")
}
func testSkipsLeadingEmptyLinesWhenParsingDirective() {
let text = """
{"voice":"abc123"}
Hello there.
"""
let result = TalkDirectiveParser.parse(text)
XCTAssertEqual(result.directive?.voiceId, "abc123")
XCTAssertEqual(result.stripped, "Hello there.")
}
func testTracksUnknownKeys() {
let text = """
{"voice":"abc","mystery":"value","extra":1}

View File

@@ -0,0 +1,16 @@
import XCTest
@testable import ClawdisKit
final class TalkHistoryTimestampTests: XCTestCase {
func testSecondsTimestampsAreAcceptedWithSmallTolerance() {
XCTAssertTrue(TalkHistoryTimestamp.isAfter(999.6, sinceSeconds: 1000))
XCTAssertFalse(TalkHistoryTimestamp.isAfter(999.4, sinceSeconds: 1000))
}
func testMillisecondsTimestampsAreAcceptedWithSmallTolerance() {
let sinceSeconds = 1_700_000_000.0
let sinceMs = sinceSeconds * 1000
XCTAssertTrue(TalkHistoryTimestamp.isAfter(sinceMs - 500, sinceSeconds: sinceSeconds))
XCTAssertFalse(TalkHistoryTimestamp.isAfter(sinceMs - 501, sinceSeconds: sinceSeconds))
}
}

View File

@@ -0,0 +1,16 @@
import XCTest
@testable import ClawdisKit
final class TalkPromptBuilderTests: XCTestCase {
func testBuildIncludesTranscript() {
let prompt = TalkPromptBuilder.build(transcript: "Hello", interruptedAtSeconds: nil)
XCTAssertTrue(prompt.contains("Talk Mode active."))
XCTAssertTrue(prompt.hasSuffix("\n\nHello"))
}
func testBuildIncludesInterruptionLineWhenProvided() {
let prompt = TalkPromptBuilder.build(transcript: "Hi", interruptedAtSeconds: 1.234)
XCTAssertTrue(prompt.contains("Assistant speech interrupted at 1.2s."))
}
}

View File

@@ -0,0 +1,24 @@
import XCTest
@testable import ClawdisKit
final class TalkTTSValidationTests: XCTestCase {
func testResolveSpeedUsesRateWPMWhenProvided() {
let resolved = TalkTTSValidation.resolveSpeed(speed: nil, rateWPM: 175)
XCTAssertNotNil(resolved)
XCTAssertEqual(resolved ?? 0, 1.0, accuracy: 0.0001)
XCTAssertNil(TalkTTSValidation.resolveSpeed(speed: nil, rateWPM: 400))
}
func testValidatedUnitBounds() {
XCTAssertEqual(TalkTTSValidation.validatedUnit(0), 0)
XCTAssertEqual(TalkTTSValidation.validatedUnit(1), 1)
XCTAssertNil(TalkTTSValidation.validatedUnit(-0.01))
XCTAssertNil(TalkTTSValidation.validatedUnit(1.01))
}
func testValidatedSeedBounds() {
XCTAssertEqual(TalkTTSValidation.validatedSeed(0), 0)
XCTAssertEqual(TalkTTSValidation.validatedSeed(1234), 1234)
XCTAssertNil(TalkTTSValidation.validatedSeed(-1))
}
}