fix(talk): harden TTS + add system fallback

@@ -16,6 +16,8 @@
- macOS Talk Mode: fix audio stop ordering so disabling Talk Mode always stops in-flight playback.
- macOS Talk Mode: throttle audio-level updates (avoid per-buffer task creation) to reduce CPU/task churn; a throttling sketch follows this list.
- macOS Talk Mode: increase overlay window size so wave rings don’t clip; close button is hover-only and closer to the orb.
- Talk Mode: fall back to system TTS when ElevenLabs is unavailable, returns non-audio, or playback fails (macOS/iOS/Android).
- ElevenLabs: add retry/backoff for 429/5xx and include content-type in errors for debugging.
- Talk Mode: align to the gateway’s main session key and fall back to history polling when chat events drop (prevents stuck “thinking” / missing messages).
- Talk Mode: treat history timestamps as seconds or milliseconds to avoid stale assistant picks (macOS/iOS/Android).
- Chat UI: dedupe identical history messages to avoid duplicate bubbles (see the dedupe sketch after this list).
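
The audio-level throttling mentioned above is not part of the hunks in this commit; a minimal sketch of the approach, with all names hypothetical, would coalesce per-buffer readings and publish at a bounded rate instead of spawning a task per buffer:

```swift
import Foundation
import QuartzCore

// Hypothetical sketch of the throttling idea: no task creation per audio
// buffer, and at most ~20 published level updates per second. These names
// are illustrative, not the app's real API.
final class AudioLevelThrottle {
    private var lastPublish: CFTimeInterval = 0
    private var pendingLevel: Float = 0
    private let minInterval: CFTimeInterval = 0.05  // 20 Hz cap

    // Called from the audio tap for every buffer; cheap, synchronous.
    func submit(level: Float, publish: (Float) -> Void) {
        pendingLevel = level
        let now = CACurrentMediaTime()
        guard now - lastPublish >= minInterval else { return }
        lastPublish = now
        publish(pendingLevel)
    }
}
```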
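The history-message dedupe is likewise not shown below; the idea, sketched here with assumed field names (the real message type lives elsewhere in the app), is to key each message on role + timestamp + content and drop repeats while preserving order:

```swift
import Foundation

// Hypothetical sketch of the dedupe described above; the struct and its
// fields are assumptions, not the app's real chat model.
struct HistoryMessage: Hashable {
    let role: String
    let timestamp: Double
    let text: String
}

// Keeps the first occurrence of each identical message, preserving order.
func dedupe(_ messages: [HistoryMessage]) -> [HistoryMessage] {
    var seen = Set<HistoryMessage>()
    return messages.filter { seen.insert($0).inserted }
}
```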
@@ -930,7 +930,7 @@ class NodeRuntime(context: Context) {

private data class Quad<A, B, C, D>(val first: A, val second: B, val third: C, val fourth: D)

private const val DEFAULT_SEAM_COLOR_ARGB: Long = 0xFF7FB8D4
private const val DEFAULT_SEAM_COLOR_ARGB: Long = 0xFF4F7A9A

private const val a2uiReadyCheckJS: String =
"""

@@ -62,9 +62,9 @@ fun TalkOrbOverlay(
verticalArrangement = Arrangement.spacedBy(12.dp),
) {
Box(contentAlignment = Alignment.Center) {
Canvas(modifier = Modifier.size(300.dp)) {
Canvas(modifier = Modifier.size(360.dp)) {
val center = this.center
val baseRadius = size.minDimension * 0.27f
val baseRadius = size.minDimension * 0.30f

val ring1 = 1.05f + (t * 0.25f)
val ring2 = 1.20f + (t * 0.55f)

@@ -13,6 +13,8 @@ import android.os.SystemClock
import android.speech.RecognitionListener
import android.speech.RecognizerIntent
import android.speech.SpeechRecognizer
import android.speech.tts.TextToSpeech
import android.speech.tts.UtteranceProgressListener
import android.util.Log
import androidx.core.content.ContextCompat
import com.steipete.clawdis.node.bridge.BridgeSession
@@ -89,6 +91,9 @@ class TalkModeManager(

private var player: MediaPlayer? = null
private var currentAudioFile: File? = null
private var systemTts: TextToSpeech? = null
private var systemTtsPending: CompletableDeferred<Unit>? = null
private var systemTtsPendingId: String? = null

fun attachSession(session: BridgeSession) {
this.session = session
@@ -181,6 +186,10 @@ class TalkModeManager(
recognizer?.destroy()
recognizer = null
}
systemTts?.stop()
systemTtsPending?.cancel()
systemTtsPending = null
systemTtsPendingId = null
}

private fun startListeningInternal(markListening: Boolean) {
@@ -441,16 +450,6 @@ class TalkModeManager(
apiKey?.trim()?.takeIf { it.isNotEmpty() }
?: System.getenv("ELEVENLABS_API_KEY")?.trim()
val voiceId = directive?.voiceId ?: currentVoiceId ?: defaultVoiceId
if (voiceId.isNullOrBlank()) {
_statusText.value = "Missing voice ID"
Log.w(tag, "missing voiceId")
return
}
if (apiKey.isNullOrEmpty()) {
_statusText.value = "Missing ELEVENLABS_API_KEY"
Log.w(tag, "missing ELEVENLABS_API_KEY")
return
}

_statusText.value = "Speaking…"
_isSpeaking.value = true
@@ -458,28 +457,46 @@ class TalkModeManager(
ensureInterruptListener()

try {
val ttsStarted = SystemClock.elapsedRealtime()
val request =
ElevenLabsRequest(
text = cleaned,
modelId = directive?.modelId ?: currentModelId ?: defaultModelId,
outputFormat =
TalkModeRuntime.validatedOutputFormat(directive?.outputFormat ?: defaultOutputFormat),
speed = TalkModeRuntime.resolveSpeed(directive?.speed, directive?.rateWpm),
stability = TalkModeRuntime.validatedUnit(directive?.stability),
similarity = TalkModeRuntime.validatedUnit(directive?.similarity),
style = TalkModeRuntime.validatedUnit(directive?.style),
speakerBoost = directive?.speakerBoost,
seed = TalkModeRuntime.validatedSeed(directive?.seed),
normalize = TalkModeRuntime.validatedNormalize(directive?.normalize),
language = TalkModeRuntime.validatedLanguage(directive?.language),
)
val audio = synthesize(voiceId = voiceId, apiKey = apiKey, request = request)
Log.d(tag, "elevenlabs ok bytes=${audio.size} durMs=${SystemClock.elapsedRealtime() - ttsStarted}")
playAudio(audio)
val canUseElevenLabs = !voiceId.isNullOrBlank() && !apiKey.isNullOrEmpty()
if (!canUseElevenLabs) {
if (voiceId.isNullOrBlank()) {
Log.w(tag, "missing voiceId; falling back to system voice")
}
if (apiKey.isNullOrEmpty()) {
Log.w(tag, "missing ELEVENLABS_API_KEY; falling back to system voice")
}
_statusText.value = "Speaking (System)…"
speakWithSystemTts(cleaned)
} else {
val ttsStarted = SystemClock.elapsedRealtime()
val request =
ElevenLabsRequest(
text = cleaned,
modelId = directive?.modelId ?: currentModelId ?: defaultModelId,
outputFormat =
TalkModeRuntime.validatedOutputFormat(directive?.outputFormat ?: defaultOutputFormat),
speed = TalkModeRuntime.resolveSpeed(directive?.speed, directive?.rateWpm),
stability = TalkModeRuntime.validatedUnit(directive?.stability),
similarity = TalkModeRuntime.validatedUnit(directive?.similarity),
style = TalkModeRuntime.validatedUnit(directive?.style),
speakerBoost = directive?.speakerBoost,
seed = TalkModeRuntime.validatedSeed(directive?.seed),
normalize = TalkModeRuntime.validatedNormalize(directive?.normalize),
language = TalkModeRuntime.validatedLanguage(directive?.language),
)
val audio = synthesize(voiceId = voiceId!!, apiKey = apiKey!!, request = request)
Log.d(tag, "elevenlabs ok bytes=${audio.size} durMs=${SystemClock.elapsedRealtime() - ttsStarted}")
playAudio(audio)
}
} catch (err: Throwable) {
_statusText.value = "Speak failed: ${err.message ?: err::class.simpleName}"
Log.w(tag, "speak failed: ${err.message ?: err::class.simpleName}")
Log.w(tag, "speak failed: ${err.message ?: err::class.simpleName}; falling back to system voice")
try {
_statusText.value = "Speaking (System)…"
speakWithSystemTts(cleaned)
} catch (fallbackErr: Throwable) {
_statusText.value = "Speak failed: ${fallbackErr.message ?: fallbackErr::class.simpleName}"
Log.w(tag, "system voice failed: ${fallbackErr.message ?: fallbackErr::class.simpleName}")
}
}

_isSpeaking.value = false
@@ -524,9 +541,103 @@ class TalkModeManager(
Log.d(tag, "play done")
}

private suspend fun speakWithSystemTts(text: String) {
val trimmed = text.trim()
if (trimmed.isEmpty()) return
val ok = ensureSystemTts()
if (!ok) {
throw IllegalStateException("system TTS unavailable")
}

val tts = systemTts ?: throw IllegalStateException("system TTS unavailable")
val utteranceId = "talk-${UUID.randomUUID()}"
val deferred = CompletableDeferred<Unit>()
systemTtsPending?.cancel()
systemTtsPending = deferred
systemTtsPendingId = utteranceId

withContext(Dispatchers.Main) {
val params = Bundle()
tts.speak(trimmed, TextToSpeech.QUEUE_FLUSH, params, utteranceId)
}

withContext(Dispatchers.IO) {
try {
kotlinx.coroutines.withTimeout(180_000) { deferred.await() }
} catch (err: Throwable) {
throw err
}
}
}

private suspend fun ensureSystemTts(): Boolean {
if (systemTts != null) return true
return withContext(Dispatchers.Main) {
val deferred = CompletableDeferred<Boolean>()
val tts =
try {
TextToSpeech(context) { status ->
deferred.complete(status == TextToSpeech.SUCCESS)
}
} catch (_: Throwable) {
deferred.complete(false)
null
}
if (tts == null) return@withContext false

tts.setOnUtteranceProgressListener(
object : UtteranceProgressListener() {
override fun onStart(utteranceId: String?) {}

override fun onDone(utteranceId: String?) {
if (utteranceId == null) return
if (utteranceId != systemTtsPendingId) return
systemTtsPending?.complete(Unit)
systemTtsPending = null
systemTtsPendingId = null
}

@Deprecated("Deprecated in Java")
override fun onError(utteranceId: String?) {
if (utteranceId == null) return
if (utteranceId != systemTtsPendingId) return
systemTtsPending?.completeExceptionally(IllegalStateException("system TTS error"))
systemTtsPending = null
systemTtsPendingId = null
}

override fun onError(utteranceId: String?, errorCode: Int) {
if (utteranceId == null) return
if (utteranceId != systemTtsPendingId) return
systemTtsPending?.completeExceptionally(IllegalStateException("system TTS error $errorCode"))
systemTtsPending = null
systemTtsPendingId = null
}
},
)

val ok =
try {
deferred.await()
} catch (_: Throwable) {
false
}
if (ok) {
systemTts = tts
} else {
tts.shutdown()
}
ok
}
}

private fun stopSpeaking(resetInterrupt: Boolean = true) {
if (!_isSpeaking.value) {
cleanupPlayer()
systemTts?.stop()
systemTtsPending?.cancel()
systemTtsPending = null
systemTtsPendingId = null
return
}
if (resetInterrupt) {
@@ -534,6 +645,10 @@ class TalkModeManager(
lastInterruptedAtSeconds = currentMs / 1000.0
}
cleanupPlayer()
systemTts?.stop()
systemTtsPending?.cancel()
systemTtsPending = null
systemTtsPendingId = null
_isSpeaking.value = false
}

@@ -293,7 +293,7 @@ final class NodeAppModel {
Self.color(fromHex: self.seamColorHex) ?? Self.defaultSeamColor
}

private static let defaultSeamColor = Color(red: 127 / 255.0, green: 184 / 255.0, blue: 212 / 255.0)
private static let defaultSeamColor = Color(red: 79 / 255.0, green: 122 / 255.0, blue: 154 / 255.0)

private static func color(fromHex raw: String?) -> Color? {
let trimmed = (raw ?? "").trimmingCharacters(in: .whitespacesAndNewlines)

@@ -105,6 +105,7 @@ final class TalkModeManager: NSObject {
self.stopRecognition()
self.stopSpeaking()
self.lastInterruptedAtSeconds = nil
TalkSystemSpeechSynthesizer.shared.stop()
do {
try AVAudioSession.sharedInstance().setActive(false, options: [.notifyOthersOnDeactivation])
} catch {
@@ -301,20 +302,9 @@ final class TalkModeManager: NSObject {
}

private func buildPrompt(transcript: String) -> String {
var lines: [String] = [
"Talk Mode active. Reply in a concise, spoken tone.",
"You may optionally prefix the response with JSON (first line) to set ElevenLabs voice, e.g. {\"voice\":\"<id>\",\"once\":true}.",
]

if let interrupted = self.lastInterruptedAtSeconds {
let formatted = String(format: "%.1f", interrupted)
lines.append("Assistant speech interrupted at \(formatted)s.")
self.lastInterruptedAtSeconds = nil
}

lines.append("")
lines.append(transcript)
return lines.joined(separator: "\n")
let interrupted = self.lastInterruptedAtSeconds
self.lastInterruptedAtSeconds = nil
return TalkPromptBuilder.build(transcript: transcript, interruptedAtSeconds: interrupted)
}

private enum ChatCompletionState: CustomStringConvertible {
@@ -409,7 +399,7 @@ final class TalkModeManager: NSObject {
for msg in messages.reversed() {
guard (msg["role"] as? String) == "assistant" else { continue }
if let since, let timestamp = msg["timestamp"] as? Double,
TalkModeRuntime.isMessageTimestampAfter(timestamp, sinceSeconds: since) == false
TalkHistoryTimestamp.isAfter(timestamp, sinceSeconds: since) == false
{
continue
}
@@ -440,81 +430,91 @@ final class TalkModeManager: NSObject {
}
}

let voiceId = directive?.voiceId ?? self.currentVoiceId ?? self.defaultVoiceId
guard let voiceId, !voiceId.isEmpty else {
self.statusText = "Missing voice ID"
self.logger.error("missing voiceId")
return
}

let resolvedKey =
(self.apiKey?.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty == false ? self.apiKey : nil) ??
ProcessInfo.processInfo.environment["ELEVENLABS_API_KEY"]
guard let apiKey = resolvedKey, !apiKey.isEmpty else {
self.statusText = "Missing ELEVENLABS_API_KEY"
self.logger.error("missing ELEVENLABS_API_KEY")
return
}

self.statusText = "Generating voice…"
self.isSpeaking = true
self.lastSpokenText = cleaned

do {
let started = Date()
let desiredOutputFormat = directive?.outputFormat ?? self.defaultOutputFormat
let outputFormat = TalkModeRuntime.validatedOutputFormat(desiredOutputFormat)
if outputFormat == nil, let desiredOutputFormat, !desiredOutputFormat.isEmpty {
self.logger.warning("talk output_format unsupported for local playback: \(desiredOutputFormat, privacy: .public)")
}
let request = ElevenLabsRequest(
text: cleaned,
modelId: directive?.modelId ?? self.currentModelId ?? self.defaultModelId,
outputFormat: outputFormat,
speed: TalkModeRuntime.resolveSpeed(
speed: directive?.speed,
rateWPM: directive?.rateWPM),
stability: TalkModeRuntime.validatedUnit(directive?.stability),
similarity: TalkModeRuntime.validatedUnit(directive?.similarity),
style: TalkModeRuntime.validatedUnit(directive?.style),
speakerBoost: directive?.speakerBoost,
seed: TalkModeRuntime.validatedSeed(directive?.seed),
normalize: TalkModeRuntime.validatedNormalize(directive?.normalize),
language: TalkModeRuntime.validatedLanguage(directive?.language))
let language = ElevenLabsTTSClient.validatedLanguage(directive?.language)

let synthTimeoutSeconds = max(20.0, min(90.0, Double(cleaned.count) * 0.12))
let client = ElevenLabsClient(apiKey: apiKey)
let audio = try await withThrowingTaskGroup(of: Data.self) { group in
group.addTask {
try await client.synthesize(voiceId: voiceId, request: request)
}
group.addTask {
try await Task.sleep(nanoseconds: UInt64(synthTimeoutSeconds * 1_000_000_000))
throw NSError(domain: "TalkTTS", code: 408, userInfo: [
NSLocalizedDescriptionKey: "ElevenLabs TTS timed out after \(synthTimeoutSeconds)s",
])
}
let data = try await group.next()!
group.cancelAll()
return data
}
self.logger
.info(
"elevenlabs ok bytes=\(audio.count, privacy: .public) dur=\(Date().timeIntervalSince(started), privacy: .public)s")
let voiceId = (directive?.voiceId ?? self.currentVoiceId ?? self.defaultVoiceId)?
.trimmingCharacters(in: .whitespacesAndNewlines)
let resolvedKey =
(self.apiKey?.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty == false ? self.apiKey : nil) ??
ProcessInfo.processInfo.environment["ELEVENLABS_API_KEY"]
let apiKey = resolvedKey?.trimmingCharacters(in: .whitespacesAndNewlines)
let canUseElevenLabs = (voiceId?.isEmpty == false) && (apiKey?.isEmpty == false)

if self.interruptOnSpeech {
do {
try self.startRecognition()
} catch {
self.logger.warning("startRecognition during speak failed: \(error.localizedDescription, privacy: .public)")
if canUseElevenLabs, let voiceId, let apiKey {
let desiredOutputFormat = directive?.outputFormat ?? self.defaultOutputFormat
let outputFormat = ElevenLabsTTSClient.validatedOutputFormat(desiredOutputFormat)
if outputFormat == nil, let desiredOutputFormat, !desiredOutputFormat.isEmpty {
self.logger.warning("talk output_format unsupported for local playback: \(desiredOutputFormat, privacy: .public)")
}
}

self.statusText = "Speaking…"
try await self.playAudio(data: audio)
let request = ElevenLabsTTSRequest(
text: cleaned,
modelId: directive?.modelId ?? self.currentModelId ?? self.defaultModelId,
outputFormat: outputFormat,
speed: TalkTTSValidation.resolveSpeed(speed: directive?.speed, rateWPM: directive?.rateWPM),
stability: TalkTTSValidation.validatedUnit(directive?.stability),
similarity: TalkTTSValidation.validatedUnit(directive?.similarity),
style: TalkTTSValidation.validatedUnit(directive?.style),
speakerBoost: directive?.speakerBoost,
seed: TalkTTSValidation.validatedSeed(directive?.seed),
normalize: ElevenLabsTTSClient.validatedNormalize(directive?.normalize),
language: language)

let synthTimeoutSeconds = max(20.0, min(90.0, Double(cleaned.count) * 0.12))
let client = ElevenLabsTTSClient(apiKey: apiKey)
let audio = try await client.synthesizeWithHardTimeout(
voiceId: voiceId,
request: request,
hardTimeoutSeconds: synthTimeoutSeconds)
self.logger
.info(
"elevenlabs ok bytes=\(audio.count, privacy: .public) dur=\(Date().timeIntervalSince(started), privacy: .public)s")

if self.interruptOnSpeech {
do {
try self.startRecognition()
} catch {
self.logger.warning("startRecognition during speak failed: \(error.localizedDescription, privacy: .public)")
}
}

self.statusText = "Speaking…"
try await self.playAudio(data: audio)
} else {
self.logger.warning("tts unavailable; falling back to system voice (missing key or voiceId)")
if self.interruptOnSpeech {
do {
try self.startRecognition()
} catch {
self.logger.warning("startRecognition during speak failed: \(error.localizedDescription, privacy: .public)")
}
}
self.statusText = "Speaking (System)…"
try await TalkSystemSpeechSynthesizer.shared.speak(text: cleaned, language: language)
}
} catch {
self.statusText = "Speak failed: \(error.localizedDescription)"
self.logger.error("speak failed: \(error.localizedDescription, privacy: .public)")
self.logger.error("tts failed: \(error.localizedDescription, privacy: .public); falling back to system voice")
do {
if self.interruptOnSpeech {
do {
try self.startRecognition()
} catch {
self.logger.warning("startRecognition during speak failed: \(error.localizedDescription, privacy: .public)")
}
}
self.statusText = "Speaking (System)…"
let language = ElevenLabsTTSClient.validatedLanguage(directive?.language)
try await TalkSystemSpeechSynthesizer.shared.speak(text: cleaned, language: language)
} catch {
self.statusText = "Speak failed: \(error.localizedDescription)"
self.logger.error("system voice failed: \(error.localizedDescription, privacy: .public)")
}
}

self.stopRecognition()
@@ -527,7 +527,11 @@ final class TalkModeManager: NSObject {
self.player = player
player.prepareToPlay()
self.logger.info("play start")
player.play()
guard player.play() else {
throw NSError(domain: "TalkMode", code: 2, userInfo: [
NSLocalizedDescriptionKey: "audio player refused to play",
])
}
while player.isPlaying {
try? await Task.sleep(nanoseconds: 120_000_000)
}
@@ -541,6 +545,7 @@ final class TalkModeManager: NSObject {
}
self.player?.stop()
self.player = nil
TalkSystemSpeechSynthesizer.shared.stop()
self.isSpeaking = false
}

@@ -584,7 +589,7 @@ final class TalkModeManager: NSObject {

private static func configureAudioSession() throws {
let session = AVAudioSession.sharedInstance()
try session.setCategory(.playAndRecord, mode: .measurement, options: [
try session.setCategory(.playAndRecord, mode: .voiceChat, options: [
.duckOthers,
.mixWithOthers,
.allowBluetoothHFP,
@@ -609,127 +614,3 @@ final class TalkModeManager: NSObject {
}
}
}

private struct ElevenLabsRequest {
let text: String
let modelId: String?
let outputFormat: String?
let speed: Double?
let stability: Double?
let similarity: Double?
let style: Double?
let speakerBoost: Bool?
let seed: UInt32?
let normalize: String?
let language: String?
}

private struct ElevenLabsClient {
let apiKey: String
let baseUrl = URL(string: "https://api.elevenlabs.io")!

func synthesize(voiceId: String, request: ElevenLabsRequest) async throws -> Data {
var url = self.baseUrl
url.appendPathComponent("v1")
url.appendPathComponent("text-to-speech")
url.appendPathComponent(voiceId)

var payload: [String: Any] = [
"text": request.text,
]
if let modelId = request.modelId, !modelId.isEmpty {
payload["model_id"] = modelId
}
if let outputFormat = request.outputFormat, !outputFormat.isEmpty {
payload["output_format"] = outputFormat
}
if let seed = request.seed {
payload["seed"] = seed
}
if let normalize = request.normalize {
payload["apply_text_normalization"] = normalize
}
if let language = request.language {
payload["language_code"] = language
}
var voiceSettings: [String: Any] = [:]
if let speed = request.speed { voiceSettings["speed"] = speed }
if let stability = request.stability { voiceSettings["stability"] = stability }
if let similarity = request.similarity { voiceSettings["similarity_boost"] = similarity }
if let style = request.style { voiceSettings["style"] = style }
if let speakerBoost = request.speakerBoost { voiceSettings["use_speaker_boost"] = speakerBoost }
if !voiceSettings.isEmpty { payload["voice_settings"] = voiceSettings }

let body = try JSONSerialization.data(withJSONObject: payload, options: [])
var req = URLRequest(url: url)
req.httpMethod = "POST"
req.httpBody = body
req.timeoutInterval = 45
req.setValue("application/json", forHTTPHeaderField: "Content-Type")
req.setValue("audio/mpeg", forHTTPHeaderField: "Accept")
req.setValue(self.apiKey, forHTTPHeaderField: "xi-api-key")

let (data, response) = try await URLSession.shared.data(for: req)
if let http = response as? HTTPURLResponse, http.statusCode >= 400 {
let message = String(data: data, encoding: .utf8) ?? "unknown"
throw NSError(domain: "TalkTTS", code: http.statusCode, userInfo: [
NSLocalizedDescriptionKey: "ElevenLabs failed: \(http.statusCode) \(message)",
])
}
return data
}
}

private enum TalkModeRuntime {
static func resolveSpeed(speed: Double?, rateWPM: Int?) -> Double? {
if let rateWPM, rateWPM > 0 {
let resolved = Double(rateWPM) / 175.0
if resolved <= 0.5 || resolved >= 2.0 { return nil }
return resolved
}
if let speed {
if speed <= 0.5 || speed >= 2.0 { return nil }
return speed
}
return nil
}

static func validatedUnit(_ value: Double?) -> Double? {
guard let value else { return nil }
if value < 0 || value > 1 { return nil }
return value
}

static func validatedSeed(_ value: Int?) -> UInt32? {
guard let value else { return nil }
if value < 0 || value > 4_294_967_295 { return nil }
return UInt32(value)
}

static func validatedNormalize(_ value: String?) -> String? {
guard let value else { return nil }
let normalized = value.trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
return ["auto", "on", "off"].contains(normalized) ? normalized : nil
}

static func validatedLanguage(_ value: String?) -> String? {
guard let value else { return nil }
let normalized = value.trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
guard normalized.count == 2, normalized.allSatisfy({ $0 >= "a" && $0 <= "z" }) else { return nil }
return normalized
}

static func validatedOutputFormat(_ value: String?) -> String? {
let trimmed = value?.trimmingCharacters(in: .whitespacesAndNewlines) ?? ""
guard !trimmed.isEmpty else { return nil }
return trimmed.hasPrefix("mp3_") ? trimmed : nil
}

static func isMessageTimestampAfter(_ timestamp: Double, sinceSeconds: Double) -> Bool {
let sinceMs = sinceSeconds * 1000
if timestamp > 10_000_000_000 {
return timestamp >= sinceMs - 500
}
return timestamp >= sinceSeconds - 0.5
}
}

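For orientation, the speed mapping above (removed here and, per the hunks elsewhere in this commit, centralized in TalkTTSValidation) divides a words-per-minute hint by a 175 wpm baseline and rejects results at or outside 0.5–2.0. A worked example, assuming TalkTTSValidation keeps the same semantics:

```swift
// wpm / 175 is the speed multiplier; values <= 0.5 or >= 2.0 return nil.
let normal = TalkTTSValidation.resolveSpeed(speed: nil, rateWPM: 175)   // 1.0
let fast = TalkTTSValidation.resolveSpeed(speed: nil, rateWPM: 210)     // 1.2
let tooFast = TalkTTSValidation.resolveSpeed(speed: nil, rateWPM: 420)  // nil (420/175 = 2.4)
```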
@@ -12,14 +12,14 @@ struct TalkOrbOverlay: View {
ZStack {
Circle()
.stroke(seam.opacity(0.26), lineWidth: 2)
.frame(width: 280, height: 280)
.frame(width: 320, height: 320)
.scaleEffect(self.pulse ? 1.15 : 0.96)
.opacity(self.pulse ? 0.0 : 1.0)
.animation(.easeOut(duration: 1.3).repeatForever(autoreverses: false), value: self.pulse)

Circle()
.stroke(seam.opacity(0.18), lineWidth: 2)
.frame(width: 280, height: 280)
.frame(width: 320, height: 320)
.scaleEffect(self.pulse ? 1.45 : 1.02)
.opacity(self.pulse ? 0.0 : 0.9)
.animation(.easeOut(duration: 1.9).repeatForever(autoreverses: false).delay(0.2), value: self.pulse)
@@ -35,7 +35,7 @@ struct TalkOrbOverlay: View {
center: .center,
startRadius: 1,
endRadius: 112))
.frame(width: 168, height: 168)
.frame(width: 190, height: 190)
.overlay(
Circle()
.stroke(seam.opacity(0.35), lineWidth: 1))

@@ -291,7 +291,9 @@ actor TalkModeRuntime {
await self.reloadConfig()
guard self.isCurrent(gen) else { return }
let prompt = self.buildPrompt(transcript: transcript)
let sessionKey = await GatewayConnection.shared.mainSessionKey()
let sessionKey =
await MainActor.run { WebChatManager.shared.activeSessionKey } ??
await GatewayConnection.shared.mainSessionKey()
let runId = UUID().uuidString
let startedAt = Date().timeIntervalSince1970
self.logger.info(
@@ -335,20 +337,9 @@ actor TalkModeRuntime {
}

private func buildPrompt(transcript: String) -> String {
var lines: [String] = [
"Talk Mode active. Reply in a concise, spoken tone.",
"You may optionally prefix the response with JSON (first line) to set ElevenLabs voice, e.g. {\"voice\":\"<id>\",\"once\":true}.",
]

if let interrupted = self.lastInterruptedAtSeconds {
let formatted = String(format: "%.1f", interrupted)
lines.append("Assistant speech interrupted at \(formatted)s.")
self.lastInterruptedAtSeconds = nil
}

lines.append("")
lines.append(transcript)
return lines.joined(separator: "\n")
let interrupted = self.lastInterruptedAtSeconds
self.lastInterruptedAtSeconds = nil
return TalkPromptBuilder.build(transcript: transcript, interruptedAtSeconds: interrupted)
}

private func waitForAssistantText(
@@ -378,7 +369,7 @@ actor TalkModeRuntime {
guard message.role == "assistant" else { return false }
guard let since else { return true }
guard let timestamp = message.timestamp else { return false }
return Self.isMessageTimestampAfter(timestamp, sinceSeconds: since)
return TalkHistoryTimestamp.isAfter(timestamp, sinceSeconds: since)
}
guard let assistant else { return nil }
let text = assistant.content.compactMap { $0.text }.joined(separator: "\n")
@@ -421,76 +412,108 @@ actor TalkModeRuntime {
}
}

guard let apiKey = self.apiKey, !apiKey.isEmpty else {
self.logger.error("talk missing ELEVENLABS_API_KEY")
return
}

let apiKey = self.apiKey?.trimmingCharacters(in: .whitespacesAndNewlines)
let requestedVoice =
directive?.voiceId ??
self.currentVoiceId ??
self.defaultVoiceId
guard let voiceId = await self.resolveVoiceId(preferred: requestedVoice, apiKey: apiKey) else {
self.logger.error("talk missing voiceId; set talk.voiceId or ELEVENLABS_VOICE_ID")
return
}
guard self.isCurrent(gen) else { return }
self.ttsLogger.info("talk TTS request voiceId=\(voiceId, privacy: .public) chars=\(cleaned.count, privacy: .public)")

await self.startRecognition()
guard self.isCurrent(gen) else { return }
await MainActor.run { TalkModeController.shared.updatePhase(.speaking) }
self.phase = .speaking
let language = ElevenLabsTTSClient.validatedLanguage(directive?.language)

let voiceId: String?
if let apiKey, !apiKey.isEmpty {
voiceId = await self.resolveVoiceId(preferred: requestedVoice, apiKey: apiKey)
} else {
voiceId = nil
}

if apiKey?.isEmpty != false {
self.ttsLogger.warning("talk missing ELEVENLABS_API_KEY; falling back to system voice")
} else if voiceId == nil {
self.ttsLogger.warning("talk missing voiceId; falling back to system voice")
} else if let voiceId {
self.ttsLogger.info("talk TTS request voiceId=\(voiceId, privacy: .public) chars=\(cleaned.count, privacy: .public)")
}
self.lastSpokenText = cleaned

let resolvedSpeed = Self.resolveSpeed(
speed: directive?.speed,
rateWPM: directive?.rateWPM,
logger: self.logger)

let request = ElevenLabsRequest(
text: cleaned,
modelId: directive?.modelId ?? self.currentModelId ?? self.defaultModelId,
outputFormat: Self.validatedOutputFormat(directive?.outputFormat ?? self.defaultOutputFormat, logger: self.logger),
speed: resolvedSpeed,
stability: Self.validatedUnit(directive?.stability, name: "stability", logger: self.logger),
similarity: Self.validatedUnit(directive?.similarity, name: "similarity", logger: self.logger),
style: Self.validatedUnit(directive?.style, name: "style", logger: self.logger),
speakerBoost: directive?.speakerBoost,
seed: Self.validatedSeed(directive?.seed, logger: self.logger),
normalize: Self.validatedNormalize(directive?.normalize, logger: self.logger),
language: Self.validatedLanguage(directive?.language, logger: self.logger))

let synthTimeoutSeconds = max(20.0, min(90.0, Double(cleaned.count) * 0.12))
self.ttsLogger.info("talk TTS synth timeout=\(synthTimeoutSeconds, privacy: .public)s")

do {
let client = ElevenLabsClient(apiKey: apiKey, logger: self.ttsLogger)
let audio = try await withThrowingTaskGroup(of: Data.self) { group in
group.addTask {
try await client.synthesize(voiceId: voiceId, request: request)
if let apiKey, !apiKey.isEmpty, let voiceId {
let desiredOutputFormat = directive?.outputFormat ?? self.defaultOutputFormat
let outputFormat = ElevenLabsTTSClient.validatedOutputFormat(desiredOutputFormat)
if outputFormat == nil, let desiredOutputFormat, !desiredOutputFormat.isEmpty {
self.logger.warning("talk output_format unsupported for local playback: \(desiredOutputFormat, privacy: .public)")
}
group.addTask {
try await Task.sleep(nanoseconds: UInt64(synthTimeoutSeconds * 1_000_000_000))
throw NSError(domain: "TalkTTS", code: 408, userInfo: [
NSLocalizedDescriptionKey: "ElevenLabs TTS timed out after \(synthTimeoutSeconds)s",

let request = ElevenLabsTTSRequest(
text: cleaned,
modelId: directive?.modelId ?? self.currentModelId ?? self.defaultModelId,
outputFormat: outputFormat,
speed: TalkTTSValidation.resolveSpeed(speed: directive?.speed, rateWPM: directive?.rateWPM),
stability: TalkTTSValidation.validatedUnit(directive?.stability),
similarity: TalkTTSValidation.validatedUnit(directive?.similarity),
style: TalkTTSValidation.validatedUnit(directive?.style),
speakerBoost: directive?.speakerBoost,
seed: TalkTTSValidation.validatedSeed(directive?.seed),
normalize: ElevenLabsTTSClient.validatedNormalize(directive?.normalize),
language: language)

self.ttsLogger.info("talk TTS synth timeout=\(synthTimeoutSeconds, privacy: .public)s")
let client = ElevenLabsTTSClient(apiKey: apiKey)
let audio = try await client.synthesizeWithHardTimeout(
voiceId: voiceId,
request: request,
hardTimeoutSeconds: synthTimeoutSeconds)
guard self.isCurrent(gen) else { return }
self.ttsLogger.info("talk TTS response bytes=\(audio.count, privacy: .public)")

if self.interruptOnSpeech {
await self.startRecognition()
guard self.isCurrent(gen) else { return }
}

await MainActor.run { TalkModeController.shared.updatePhase(.speaking) }
self.phase = .speaking

let result = await TalkAudioPlayer.shared.play(data: audio)
self.ttsLogger.info("talk audio result finished=\(result.finished, privacy: .public) interruptedAt=\(String(describing: result.interruptedAt), privacy: .public)")
if !result.finished, result.interruptedAt == nil {
throw NSError(domain: "TalkAudioPlayer", code: 1, userInfo: [
NSLocalizedDescriptionKey: "audio playback failed",
])
}
let data = try await group.next()!
group.cancelAll()
return data
}
guard self.isCurrent(gen) else { return }
self.ttsLogger.info("talk TTS response bytes=\(audio.count, privacy: .public)")
let result = await TalkAudioPlayer.shared.play(data: audio)
self.ttsLogger.info("talk audio result finished=\(result.finished, privacy: .public) interruptedAt=\(String(describing: result.interruptedAt), privacy: .public)")
if !result.finished, let interruptedAt = result.interruptedAt, self.phase == .speaking {
if self.interruptOnSpeech {
self.lastInterruptedAtSeconds = interruptedAt
if !result.finished, let interruptedAt = result.interruptedAt, self.phase == .speaking {
if self.interruptOnSpeech {
self.lastInterruptedAtSeconds = interruptedAt
}
}
} else {
self.ttsLogger.info("talk system voice start chars=\(cleaned.count, privacy: .public)")
if self.interruptOnSpeech {
await self.startRecognition()
guard self.isCurrent(gen) else { return }
}
await MainActor.run { TalkModeController.shared.updatePhase(.speaking) }
self.phase = .speaking
await TalkSystemSpeechSynthesizer.shared.stop()
try await TalkSystemSpeechSynthesizer.shared.speak(text: cleaned, language: language)
self.ttsLogger.info("talk system voice done")
}
} catch {
self.logger.error("talk TTS failed: \(error.localizedDescription, privacy: .public)")
self.ttsLogger.error("talk TTS failed: \(error.localizedDescription, privacy: .public); falling back to system voice")
do {
if self.interruptOnSpeech {
await self.startRecognition()
guard self.isCurrent(gen) else { return }
}
await MainActor.run { TalkModeController.shared.updatePhase(.speaking) }
self.phase = .speaking
await TalkSystemSpeechSynthesizer.shared.stop()
try await TalkSystemSpeechSynthesizer.shared.speak(text: cleaned, language: language)
} catch {
self.ttsLogger.error("talk system voice failed: \(error.localizedDescription, privacy: .public)")
}
}

if self.phase == .speaking {
@@ -505,7 +528,7 @@ actor TalkModeRuntime {
if let fallbackVoiceId { return fallbackVoiceId }

do {
let voices = try await ElevenLabsClient(apiKey: apiKey, logger: self.ttsLogger).listVoices()
let voices = try await ElevenLabsTTSClient(apiKey: apiKey).listVoices()
guard let first = voices.first else {
self.ttsLogger.error("elevenlabs voices list empty")
return nil
@@ -528,6 +551,7 @@ actor TalkModeRuntime {

func stopSpeaking(reason: TalkStopReason) async {
let interruptedAt = await MainActor.run { TalkAudioPlayer.shared.stop() }
await TalkSystemSpeechSynthesizer.shared.stop()
guard self.phase == .speaking else { return }
if reason == .speech, let interruptedAt {
self.lastInterruptedAtSeconds = interruptedAt
@@ -720,154 +744,4 @@ actor TalkModeRuntime {
return normalized
}

private static func validatedLanguage(_ value: String?, logger: Logger) -> String? {
guard let value else { return nil }
let normalized = value.trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
guard normalized.count == 2, normalized.allSatisfy({ $0 >= "a" && $0 <= "z" }) else {
logger.warning("talk language invalid: \(normalized, privacy: .public)")
return nil
}
return normalized
}

private static func validatedOutputFormat(_ value: String?, logger: Logger) -> String? {
let trimmed = value?.trimmingCharacters(in: .whitespacesAndNewlines) ?? ""
guard !trimmed.isEmpty else { return nil }
guard trimmed.hasPrefix("mp3_") else {
logger.warning("talk output_format unsupported for local playback: \(trimmed, privacy: .public)")
return nil
}
return trimmed
}

private static func isMessageTimestampAfter(_ timestamp: Double, sinceSeconds: Double) -> Bool {
let sinceMs = sinceSeconds * 1000
if timestamp > 10_000_000_000 {
return timestamp >= sinceMs - 500
}
return timestamp >= sinceSeconds - 0.5
}
}

private struct ElevenLabsRequest {
let text: String
let modelId: String?
let outputFormat: String?
let speed: Double?
let stability: Double?
let similarity: Double?
let style: Double?
let speakerBoost: Bool?
let seed: UInt32?
let normalize: String?
let language: String?
}

private struct ElevenLabsClient {
let apiKey: String
let logger: Logger
let baseUrl: URL = URL(string: "https://api.elevenlabs.io")!
let ttsTimeoutSeconds: TimeInterval = 45
let listVoicesTimeoutSeconds: TimeInterval = 15

func synthesize(voiceId: String, request: ElevenLabsRequest) async throws -> Data {
var url = self.baseUrl
url.appendPathComponent("v1")
url.appendPathComponent("text-to-speech")
url.appendPathComponent(voiceId)

let charCount = request.text.count
self.logger.info(
"elevenlabs tts request voice=\(voiceId, privacy: .public) model=\(request.modelId ?? "default", privacy: .public) chars=\(charCount, privacy: .public)")
let startedAt = Date()

var payload: [String: Any] = [
"text": request.text,
]
if let modelId = request.modelId, !modelId.isEmpty {
payload["model_id"] = modelId
}
if let outputFormat = request.outputFormat, !outputFormat.isEmpty {
payload["output_format"] = outputFormat
}
if let seed = request.seed {
payload["seed"] = seed
}
if let normalize = request.normalize {
payload["apply_text_normalization"] = normalize
}
if let language = request.language {
payload["language_code"] = language
}
var voiceSettings: [String: Any] = [:]
if let speed = request.speed { voiceSettings["speed"] = speed }
if let stability = request.stability { voiceSettings["stability"] = stability }
if let similarity = request.similarity { voiceSettings["similarity_boost"] = similarity }
if let style = request.style { voiceSettings["style"] = style }
if let speakerBoost = request.speakerBoost { voiceSettings["use_speaker_boost"] = speakerBoost }
if !voiceSettings.isEmpty {
payload["voice_settings"] = voiceSettings
}

let body = try JSONSerialization.data(withJSONObject: payload, options: [])
var req = URLRequest(url: url)
req.httpMethod = "POST"
req.httpBody = body
req.timeoutInterval = self.ttsTimeoutSeconds
req.setValue("application/json", forHTTPHeaderField: "Content-Type")
req.setValue("audio/mpeg", forHTTPHeaderField: "Accept")
req.setValue(self.apiKey, forHTTPHeaderField: "xi-api-key")

let (data, response) = try await URLSession.shared.data(for: req)
if let http = response as? HTTPURLResponse, http.statusCode >= 400 {
let message = String(data: data, encoding: .utf8) ?? "unknown"
self.logger.error(
"elevenlabs tts failed status=\(http.statusCode, privacy: .public) message=\(message, privacy: .public)")
throw NSError(domain: "TalkTTS", code: http.statusCode, userInfo: [
NSLocalizedDescriptionKey: "ElevenLabs failed: \(http.statusCode) \(message)",
])
}
let elapsed = Date().timeIntervalSince(startedAt)
self.logger.info("elevenlabs tts ok bytes=\(data.count, privacy: .public) dur=\(elapsed, privacy: .public)s")
return data
}

func listVoices() async throws -> [ElevenLabsVoice] {
var url = self.baseUrl
url.appendPathComponent("v1")
url.appendPathComponent("voices")

self.logger.info("elevenlabs voices list request")
var req = URLRequest(url: url)
req.httpMethod = "GET"
req.timeoutInterval = self.listVoicesTimeoutSeconds
req.setValue(self.apiKey, forHTTPHeaderField: "xi-api-key")

let (data, response) = try await URLSession.shared.data(for: req)
if let http = response as? HTTPURLResponse, http.statusCode >= 400 {
let message = String(data: data, encoding: .utf8) ?? "unknown"
self.logger.error(
"elevenlabs voices list failed status=\(http.statusCode, privacy: .public) message=\(message, privacy: .public)")
throw NSError(domain: "TalkTTS", code: http.statusCode, userInfo: [
NSLocalizedDescriptionKey: "ElevenLabs voices failed: \(http.statusCode) \(message)",
])
}

let decoded = try JSONDecoder().decode(ElevenLabsVoicesResponse.self, from: data)
return decoded.voices
}
}

private struct ElevenLabsVoice: Decodable {
let voiceId: String
let name: String?

enum CodingKeys: String, CodingKey {
case voiceId = "voice_id"
case name
}
}

private struct ElevenLabsVoicesResponse: Decodable {
let voices: [ElevenLabsVoice]
}

@@ -7,7 +7,7 @@ import SwiftUI
@Observable
final class TalkOverlayController {
static let shared = TalkOverlayController()
static let overlaySize: CGFloat = 360
static let overlaySize: CGFloat = 440
static let windowInset: CGFloat = 88

private let logger = Logger(subsystem: "com.steipete.clawdis", category: "talk.overlay")

@@ -31,7 +31,7 @@ struct TalkOverlayView: View {
}
.buttonStyle(.plain)
.contentShape(Circle())
.offset(x: -5, y: -5)
.offset(x: -2, y: -2)
.opacity(self.hoveringWindow ? 1 : 0)
.animation(.easeOut(duration: 0.12), value: self.hoveringWindow)
.allowsHitTesting(self.hoveringWindow)
@@ -42,7 +42,7 @@ struct TalkOverlayView: View {
.onHover { self.hoveringWindow = $0 }
}

private static let defaultSeamColor = Color(red: 127 / 255.0, green: 184 / 255.0, blue: 212 / 255.0)
private static let defaultSeamColor = Color(red: 79 / 255.0, green: 122 / 255.0, blue: 154 / 255.0)

private var seamColor: Color {
Self.color(fromHex: self.appState.seamColorHex) ?? Self.defaultSeamColor

@@ -29,6 +29,10 @@ final class WebChatManager {

var onPanelVisibilityChanged: ((Bool) -> Void)?

var activeSessionKey: String? {
self.panelSessionKey ?? self.windowSessionKey
}

func show(sessionKey: String) {
self.closePanel()
if let controller = self.windowController {

apps/shared/ClawdisKit/Sources/ClawdisKit/ElevenLabsTTS.swift (new file, +233)
@@ -0,0 +1,233 @@
import Foundation

public struct ElevenLabsVoice: Decodable, Sendable {
public let voiceId: String
public let name: String?

enum CodingKeys: String, CodingKey {
case voiceId = "voice_id"
case name
}
}

public struct ElevenLabsTTSRequest: Sendable {
public var text: String
public var modelId: String?
public var outputFormat: String?
public var speed: Double?
public var stability: Double?
public var similarity: Double?
public var style: Double?
public var speakerBoost: Bool?
public var seed: UInt32?
public var normalize: String?
public var language: String?

public init(
text: String,
modelId: String? = nil,
outputFormat: String? = nil,
speed: Double? = nil,
stability: Double? = nil,
similarity: Double? = nil,
style: Double? = nil,
speakerBoost: Bool? = nil,
seed: UInt32? = nil,
normalize: String? = nil,
language: String? = nil)
{
self.text = text
self.modelId = modelId
self.outputFormat = outputFormat
self.speed = speed
self.stability = stability
self.similarity = similarity
self.style = style
self.speakerBoost = speakerBoost
self.seed = seed
self.normalize = normalize
self.language = language
}
}

public struct ElevenLabsTTSClient: Sendable {
public var apiKey: String
public var requestTimeoutSeconds: TimeInterval
public var listVoicesTimeoutSeconds: TimeInterval
public var baseUrl: URL

public init(
apiKey: String,
requestTimeoutSeconds: TimeInterval = 45,
listVoicesTimeoutSeconds: TimeInterval = 15,
baseUrl: URL = URL(string: "https://api.elevenlabs.io")!)
{
self.apiKey = apiKey
self.requestTimeoutSeconds = requestTimeoutSeconds
self.listVoicesTimeoutSeconds = listVoicesTimeoutSeconds
self.baseUrl = baseUrl
}

public func synthesizeWithHardTimeout(
voiceId: String,
request: ElevenLabsTTSRequest,
hardTimeoutSeconds: TimeInterval) async throws -> Data
{
try await withThrowingTaskGroup(of: Data.self) { group in
group.addTask {
try await self.synthesize(voiceId: voiceId, request: request)
}
group.addTask {
try await Task.sleep(nanoseconds: UInt64(hardTimeoutSeconds * 1_000_000_000))
throw NSError(domain: "ElevenLabsTTS", code: 408, userInfo: [
NSLocalizedDescriptionKey: "ElevenLabs TTS timed out after \(hardTimeoutSeconds)s",
])
}
let data = try await group.next()!
group.cancelAll()
return data
}
}

public func synthesize(voiceId: String, request: ElevenLabsTTSRequest) async throws -> Data {
var url = self.baseUrl
url.appendPathComponent("v1")
url.appendPathComponent("text-to-speech")
url.appendPathComponent(voiceId)

let body = try JSONSerialization.data(withJSONObject: Self.buildPayload(request), options: [])

var lastError: Error?
for attempt in 0..<3 {
var req = URLRequest(url: url)
req.httpMethod = "POST"
req.httpBody = body
req.timeoutInterval = self.requestTimeoutSeconds
req.setValue("application/json", forHTTPHeaderField: "Content-Type")
req.setValue("audio/mpeg", forHTTPHeaderField: "Accept")
req.setValue(self.apiKey, forHTTPHeaderField: "xi-api-key")

do {
let (data, response) = try await URLSession.shared.data(for: req)
if let http = response as? HTTPURLResponse {
let contentType = (http.value(forHTTPHeaderField: "Content-Type") ?? "unknown").lowercased()
if http.statusCode == 429 || http.statusCode >= 500 {
let message = Self.truncatedErrorBody(data)
lastError = NSError(domain: "ElevenLabsTTS", code: http.statusCode, userInfo: [
NSLocalizedDescriptionKey: "ElevenLabs retryable failure: \(http.statusCode) ct=\(contentType) \(message)",
])
if attempt < 2 {
let retryAfter = Double(http.value(forHTTPHeaderField: "Retry-After") ?? "")
let baseDelay = [0.25, 0.75, 1.5][attempt]
let delaySeconds = max(baseDelay, retryAfter ?? 0)
try? await Task.sleep(nanoseconds: UInt64(delaySeconds * 1_000_000_000))
continue
}
throw lastError!
}

if http.statusCode >= 400 {
let message = Self.truncatedErrorBody(data)
throw NSError(domain: "ElevenLabsTTS", code: http.statusCode, userInfo: [
NSLocalizedDescriptionKey: "ElevenLabs failed: \(http.statusCode) ct=\(contentType) \(message)",
])
}

if !contentType.contains("audio") {
let message = Self.truncatedErrorBody(data)
throw NSError(domain: "ElevenLabsTTS", code: 415, userInfo: [
NSLocalizedDescriptionKey: "ElevenLabs returned non-audio ct=\(contentType) \(message)",
])
}
}
return data
} catch {
lastError = error
if attempt < 2 {
try? await Task.sleep(nanoseconds: UInt64([0.25, 0.75, 1.5][attempt] * 1_000_000_000))
continue
}
throw error
}
}
throw lastError ?? NSError(domain: "ElevenLabsTTS", code: 1, userInfo: [
NSLocalizedDescriptionKey: "ElevenLabs failed",
])
}

public func listVoices() async throws -> [ElevenLabsVoice] {
var url = self.baseUrl
url.appendPathComponent("v1")
url.appendPathComponent("voices")

var req = URLRequest(url: url)
req.httpMethod = "GET"
req.timeoutInterval = self.listVoicesTimeoutSeconds
req.setValue(self.apiKey, forHTTPHeaderField: "xi-api-key")

let (data, response) = try await URLSession.shared.data(for: req)
if let http = response as? HTTPURLResponse, http.statusCode >= 400 {
let message = Self.truncatedErrorBody(data)
throw NSError(domain: "ElevenLabsTTS", code: http.statusCode, userInfo: [
NSLocalizedDescriptionKey: "ElevenLabs voices failed: \(http.statusCode) \(message)",
])
}

struct VoicesResponse: Decodable { let voices: [ElevenLabsVoice] }
return try JSONDecoder().decode(VoicesResponse.self, from: data).voices
}

public static func validatedOutputFormat(_ value: String?) -> String? {
let trimmed = (value ?? "").trimmingCharacters(in: .whitespacesAndNewlines)
guard !trimmed.isEmpty else { return nil }
guard trimmed.hasPrefix("mp3_") else { return nil }
return trimmed
}

public static func validatedLanguage(_ value: String?) -> String? {
let normalized = (value ?? "").trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
guard normalized.count == 2, normalized.allSatisfy({ $0 >= "a" && $0 <= "z" }) else { return nil }
return normalized
}

public static func validatedNormalize(_ value: String?) -> String? {
let normalized = (value ?? "").trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
guard ["auto", "on", "off"].contains(normalized) else { return nil }
return normalized
}

private static func buildPayload(_ request: ElevenLabsTTSRequest) -> [String: Any] {
var payload: [String: Any] = ["text": request.text]
if let modelId = request.modelId?.trimmingCharacters(in: .whitespacesAndNewlines), !modelId.isEmpty {
payload["model_id"] = modelId
}
if let outputFormat = request.outputFormat?.trimmingCharacters(in: .whitespacesAndNewlines), !outputFormat.isEmpty {
payload["output_format"] = outputFormat
}
if let seed = request.seed {
payload["seed"] = seed
}
if let normalize = request.normalize {
payload["apply_text_normalization"] = normalize
}
if let language = request.language {
payload["language_code"] = language
}

var voiceSettings: [String: Any] = [:]
if let speed = request.speed { voiceSettings["speed"] = speed }
if let stability = request.stability { voiceSettings["stability"] = stability }
if let similarity = request.similarity { voiceSettings["similarity_boost"] = similarity }
if let style = request.style { voiceSettings["style"] = style }
if let speakerBoost = request.speakerBoost { voiceSettings["use_speaker_boost"] = speakerBoost }
if !voiceSettings.isEmpty {
payload["voice_settings"] = voiceSettings
}
return payload
}

private static func truncatedErrorBody(_ data: Data) -> String {
let raw = String(data: data.prefix(4096), encoding: .utf8) ?? "unknown"
return raw.replacingOccurrences(of: "\n", with: " ").replacingOccurrences(of: "\r", with: " ")
}
}
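
A minimal call site for the new client, using only the API defined in this file (the key and voice ID below are placeholders, not real values):

```swift
import ClawdisKit

// Hypothetical usage sketch; "<ELEVENLABS_API_KEY>" and "<voice-id>" are placeholders.
func demoSynthesis() async throws -> Data {
    let client = ElevenLabsTTSClient(apiKey: "<ELEVENLABS_API_KEY>")
    let request = ElevenLabsTTSRequest(
        text: "Hello from Talk Mode.",
        outputFormat: ElevenLabsTTSClient.validatedOutputFormat("mp3_44100_128"))
    // synthesize() retries 429/5xx internally with backoff; the hard timeout
    // bounds the whole attempt, including retries.
    return try await client.synthesizeWithHardTimeout(
        voiceId: "<voice-id>",
        request: request,
        hardTimeoutSeconds: 30)
}
```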
@@ -67,12 +67,18 @@ public enum TalkDirectiveParser {
var lines = normalized.split(separator: "\n", omittingEmptySubsequences: false)
guard !lines.isEmpty else { return TalkDirectiveParseResult(directive: nil, stripped: text, unknownKeys: []) }

guard let firstNonEmpty =
guard let firstNonEmptyIndex =
lines.firstIndex(where: { !$0.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty })
else {
return TalkDirectiveParseResult(directive: nil, stripped: text, unknownKeys: [])
}

var firstNonEmpty = firstNonEmptyIndex
if firstNonEmpty > 0 {
lines.removeSubrange(0..<firstNonEmpty)
firstNonEmpty = 0
}

let head = lines[firstNonEmpty].trimmingCharacters(in: .whitespacesAndNewlines)
guard head.hasPrefix("{"), head.hasSuffix("}") else {
return TalkDirectiveParseResult(directive: nil, stripped: text, unknownKeys: [])

@@ -0,0 +1,13 @@
public enum TalkHistoryTimestamp: Sendable {
    /// Gateway history timestamps have historically been emitted as either seconds (Double, epoch seconds)
    /// or milliseconds (Double, epoch ms). This helper accepts either.
    public static func isAfter(_ timestamp: Double, sinceSeconds: Double) -> Bool {
        let sinceMs = sinceSeconds * 1000
        // ~2286-11-20 in epoch seconds. Anything bigger is almost certainly epoch milliseconds.
        if timestamp > 10_000_000_000 {
            return timestamp >= sinceMs - 500
        }
        return timestamp >= sinceSeconds - 0.5
    }
}
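A quick worked example of the scale heuristic, using 1,700,000,000 epoch seconds as the reference point:

// Seconds-scale input: compared against sinceSeconds with a 0.5 s tolerance.
print(TalkHistoryTimestamp.isAfter(1_700_000_000.2, sinceSeconds: 1_700_000_000))   // true
// Milliseconds-scale input (above 1e10): compared against sinceSeconds * 1000 with a 500 ms tolerance.
print(TalkHistoryTimestamp.isAfter(1_700_000_000_250, sinceSeconds: 1_700_000_000)) // true
print(TalkHistoryTimestamp.isAfter(1_699_999_999_000, sinceSeconds: 1_700_000_000)) // false (1 s stale)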
@@ -0,0 +1,18 @@
public enum TalkPromptBuilder: Sendable {
    public static func build(transcript: String, interruptedAtSeconds: Double?) -> String {
        var lines: [String] = [
            "Talk Mode active. Reply in a concise, spoken tone.",
            "You may optionally prefix the response with JSON (first line) to set ElevenLabs voice, e.g. {\"voice\":\"<id>\",\"once\":true}.",
        ]

        if let interruptedAtSeconds {
            let formatted = String(format: "%.1f", interruptedAtSeconds)
            lines.append("Assistant speech interrupted at \(formatted)s.")
        }

        lines.append("")
        lines.append(transcript)
        return lines.joined(separator: "\n")
    }
}
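For a transcript of "What's the weather?" interrupted at 2.5 s, the builder yields the following prompt (shown as comments; only the interruption line and transcript vary per turn):

let prompt = TalkPromptBuilder.build(transcript: "What's the weather?", interruptedAtSeconds: 2.5)
// Talk Mode active. Reply in a concise, spoken tone.
// You may optionally prefix the response with JSON (first line) to set ElevenLabs voice, e.g. {"voice":"<id>","once":true}.
// Assistant speech interrupted at 2.5s.
//
// What's the weather?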
@@ -0,0 +1,110 @@
import AVFoundation
import Foundation

@MainActor
public final class TalkSystemSpeechSynthesizer: NSObject {
    public enum SpeakError: Error {
        case canceled
    }

    public static let shared = TalkSystemSpeechSynthesizer()

    private let synth = AVSpeechSynthesizer()
    private var speakContinuation: CheckedContinuation<Void, Error>?
    private var currentUtterance: AVSpeechUtterance?
    private var currentToken = UUID()
    private var watchdog: Task<Void, Never>?

    public var isSpeaking: Bool { self.synth.isSpeaking }

    private override init() {
        super.init()
        self.synth.delegate = self
    }

    public func stop() {
        self.currentToken = UUID()
        self.watchdog?.cancel()
        self.watchdog = nil
        self.synth.stopSpeaking(at: .immediate)
        self.finishCurrent(with: SpeakError.canceled)
    }

    public func speak(text: String, language: String? = nil) async throws {
        let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines)
        guard !trimmed.isEmpty else { return }

        self.stop()
        let token = UUID()
        self.currentToken = token

        let utterance = AVSpeechUtterance(string: trimmed)
        if let language, let voice = AVSpeechSynthesisVoice(language: language) {
            utterance.voice = voice
        }
        self.currentUtterance = utterance

        let estimatedSeconds = max(3.0, min(180.0, Double(trimmed.count) * 0.08))
        self.watchdog?.cancel()
        self.watchdog = Task { @MainActor [weak self] in
            guard let self else { return }
            try? await Task.sleep(nanoseconds: UInt64(estimatedSeconds * 1_000_000_000))
            if Task.isCancelled { return }
            guard self.currentToken == token else { return }
            if self.synth.isSpeaking {
                self.synth.stopSpeaking(at: .immediate)
            }
            self.finishCurrent(
                with: NSError(domain: "TalkSystemSpeechSynthesizer", code: 408, userInfo: [
                    NSLocalizedDescriptionKey: "system TTS timed out after \(estimatedSeconds)s",
                ]))
        }

        try await withTaskCancellationHandler(operation: {
            try await withCheckedThrowingContinuation { cont in
                self.speakContinuation = cont
                self.synth.speak(utterance)
            }
        }, onCancel: {
            Task { @MainActor in
                self.stop()
            }
        })

        if self.currentToken != token {
            throw SpeakError.canceled
        }
    }

    private func handleFinish(error: Error?) {
        guard self.currentUtterance != nil else { return }
        self.watchdog?.cancel()
        self.watchdog = nil
        self.finishCurrent(with: error)
    }

    private func finishCurrent(with error: Error?) {
        self.currentUtterance = nil
        let cont = self.speakContinuation
        self.speakContinuation = nil
        if let error {
            cont?.resume(throwing: error)
        } else {
            cont?.resume(returning: ())
        }
    }
}

extension TalkSystemSpeechSynthesizer: AVSpeechSynthesizerDelegate {
    public nonisolated func speechSynthesizer(_ synthesizer: AVSpeechSynthesizer, didFinish utterance: AVSpeechUtterance) {
        Task { @MainActor in
            self.handleFinish(error: nil)
        }
    }

    public nonisolated func speechSynthesizer(_ synthesizer: AVSpeechSynthesizer, didCancel utterance: AVSpeechUtterance) {
        Task { @MainActor in
            self.handleFinish(error: SpeakError.canceled)
        }
    }
}
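How a caller might wire this into the ElevenLabs fallback described in the changelog — a sketch only; playElevenLabsAudio is a hypothetical stand-in for the real playback path:

// Hypothetical stand-in; assumed to throw on HTTP errors, non-audio responses, or playback failure.
func playElevenLabsAudio(_ text: String) async throws { /* ... */ }

@MainActor
func speakReply(_ text: String) async {
    do {
        try await playElevenLabsAudio(text)
    } catch {
        do {
            // Fall back to on-device speech when ElevenLabs is unavailable.
            try await TalkSystemSpeechSynthesizer.shared.speak(text: text, language: "en-US")
        } catch TalkSystemSpeechSynthesizer.SpeakError.canceled {
            // User stopped Talk Mode mid-utterance; nothing to surface.
        } catch {
            print("system TTS failed: \(error)")
        }
    }
}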
@@ -0,0 +1,27 @@
public enum TalkTTSValidation: Sendable {
    public static func resolveSpeed(speed: Double?, rateWPM: Int?) -> Double? {
        if let rateWPM, rateWPM > 0 {
            let resolved = Double(rateWPM) / 175.0
            if resolved <= 0.5 || resolved >= 2.0 { return nil }
            return resolved
        }
        if let speed {
            if speed <= 0.5 || speed >= 2.0 { return nil }
            return speed
        }
        return nil
    }

    public static func validatedUnit(_ value: Double?) -> Double? {
        guard let value else { return nil }
        if value < 0 || value > 1 { return nil }
        return value
    }

    public static func validatedSeed(_ value: Int?) -> UInt32? {
        guard let value else { return nil }
        if value < 0 || value > 4294967295 { return nil }
        return UInt32(value)
    }
}
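The 175 WPM constant makes rateWPM a linear multiplier onto ElevenLabs' speed setting, and out-of-range results are rejected rather than clamped:

print(TalkTTSValidation.resolveSpeed(speed: nil, rateWPM: 140) ?? "nil") // 0.8 (140 / 175)
print(TalkTTSValidation.resolveSpeed(speed: nil, rateWPM: 350) ?? "nil") // nil: 2.0 lands on the open bound
print(TalkTTSValidation.resolveSpeed(speed: 1.5, rateWPM: nil) ?? "nil") // 1.5 passes through unchanged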
@@ -0,0 +1,20 @@
import XCTest
@testable import ClawdisKit

final class ElevenLabsTTSValidationTests: XCTestCase {
    func testValidatedOutputFormatAllowsOnlyMp3Presets() {
        XCTAssertEqual(ElevenLabsTTSClient.validatedOutputFormat("mp3_44100_128"), "mp3_44100_128")
        XCTAssertNil(ElevenLabsTTSClient.validatedOutputFormat("pcm_16000"))
    }

    func testValidatedLanguageAcceptsTwoLetterCodes() {
        XCTAssertEqual(ElevenLabsTTSClient.validatedLanguage("EN"), "en")
        XCTAssertNil(ElevenLabsTTSClient.validatedLanguage("eng"))
    }

    func testValidatedNormalizeAcceptsKnownValues() {
        XCTAssertEqual(ElevenLabsTTSClient.validatedNormalize("AUTO"), "auto")
        XCTAssertNil(ElevenLabsTTSClient.validatedNormalize("maybe"))
    }
}
@@ -50,6 +50,18 @@ final class TalkDirectiveTests: XCTestCase {
        XCTAssertEqual(result.stripped, "Hello.")
    }

    func testSkipsLeadingEmptyLinesWhenParsingDirective() {
        let text = """


        {"voice":"abc123"}
        Hello there.
        """
        let result = TalkDirectiveParser.parse(text)
        XCTAssertEqual(result.directive?.voiceId, "abc123")
        XCTAssertEqual(result.stripped, "Hello there.")
    }

    func testTracksUnknownKeys() {
        let text = """
        {"voice":"abc","mystery":"value","extra":1}
@@ -0,0 +1,16 @@
import XCTest
@testable import ClawdisKit

final class TalkHistoryTimestampTests: XCTestCase {
    func testSecondsTimestampsAreAcceptedWithSmallTolerance() {
        XCTAssertTrue(TalkHistoryTimestamp.isAfter(999.6, sinceSeconds: 1000))
        XCTAssertFalse(TalkHistoryTimestamp.isAfter(999.4, sinceSeconds: 1000))
    }

    func testMillisecondsTimestampsAreAcceptedWithSmallTolerance() {
        let sinceSeconds = 1_700_000_000.0
        let sinceMs = sinceSeconds * 1000
        XCTAssertTrue(TalkHistoryTimestamp.isAfter(sinceMs - 500, sinceSeconds: sinceSeconds))
        XCTAssertFalse(TalkHistoryTimestamp.isAfter(sinceMs - 501, sinceSeconds: sinceSeconds))
    }
}
@@ -0,0 +1,16 @@
import XCTest
@testable import ClawdisKit

final class TalkPromptBuilderTests: XCTestCase {
    func testBuildIncludesTranscript() {
        let prompt = TalkPromptBuilder.build(transcript: "Hello", interruptedAtSeconds: nil)
        XCTAssertTrue(prompt.contains("Talk Mode active."))
        XCTAssertTrue(prompt.hasSuffix("\n\nHello"))
    }

    func testBuildIncludesInterruptionLineWhenProvided() {
        let prompt = TalkPromptBuilder.build(transcript: "Hi", interruptedAtSeconds: 1.234)
        XCTAssertTrue(prompt.contains("Assistant speech interrupted at 1.2s."))
    }
}
@@ -0,0 +1,24 @@
import XCTest
@testable import ClawdisKit

final class TalkTTSValidationTests: XCTestCase {
    func testResolveSpeedUsesRateWPMWhenProvided() {
        let resolved = TalkTTSValidation.resolveSpeed(speed: nil, rateWPM: 175)
        XCTAssertNotNil(resolved)
        XCTAssertEqual(resolved ?? 0, 1.0, accuracy: 0.0001)
        XCTAssertNil(TalkTTSValidation.resolveSpeed(speed: nil, rateWPM: 400))
    }

    func testValidatedUnitBounds() {
        XCTAssertEqual(TalkTTSValidation.validatedUnit(0), 0)
        XCTAssertEqual(TalkTTSValidation.validatedUnit(1), 1)
        XCTAssertNil(TalkTTSValidation.validatedUnit(-0.01))
        XCTAssertNil(TalkTTSValidation.validatedUnit(1.01))
    }

    func testValidatedSeedBounds() {
        XCTAssertEqual(TalkTTSValidation.validatedSeed(0), 0)
        XCTAssertEqual(TalkTTSValidation.validatedSeed(1234), 1234)
        XCTAssertNil(TalkTTSValidation.validatedSeed(-1))
    }
}