fix(talk): harden TTS + add system fallback

2025-12-30 07:40:02 +01:00
parent a7617e4d79
commit f86772f26c
22 changed files with 839 additions and 468 deletions
--- a/apps/shared/ClawdisKit/Sources/ClawdisKit/ElevenLabsTTS.swift
+++ b/apps/shared/ClawdisKit/Sources/ClawdisKit/ElevenLabsTTS.swift
@@ -0,0 +1,233 @@
+import Foundation
+
+/// A single voice entry, decoded from the `voices` array that `ElevenLabsTTSClient.listVoices()` reads.
+public struct ElevenLabsVoice: Decodable, Sendable {
+    /// Identifier of the voice; stored under the JSON key `voice_id`.
+    public let voiceId: String
+    /// Optional display name; `nil` when the payload carries no `name` value.
+    public let name: String?
+
+    /// Maps the Swift property names onto the JSON keys of the payload.
+    enum CodingKeys: String, CodingKey {
+        case name
+        case voiceId = "voice_id"
+    }
+}
+
+/// Parameters for one text-to-speech synthesis call.
+///
+/// Only `text` is required. Every other field is optional and is left out of the request
+/// payload when it is `nil`.
+public struct ElevenLabsTTSRequest: Sendable {
+    /// Text to synthesize (payload key `text`).
+    public var text: String
+    /// Model identifier (payload key `model_id`).
+    public var modelId: String?
+    /// Requested audio format (payload key `output_format`).
+    public var outputFormat: String?
+    /// Voice setting `speed`.
+    public var speed: Double?
+    /// Voice setting `stability`.
+    public var stability: Double?
+    /// Voice setting `similarity_boost`.
+    public var similarity: Double?
+    /// Voice setting `style`.
+    public var style: Double?
+    /// Voice setting `use_speaker_boost`.
+    public var speakerBoost: Bool?
+    /// Seed value (payload key `seed`).
+    public var seed: UInt32?
+    /// Text normalization mode (payload key `apply_text_normalization`).
+    public var normalize: String?
+    /// Language code (payload key `language_code`).
+    public var language: String?
+
+    /// Creates a request; all parameters except `text` default to `nil`.
+    public init(
+        text: String,
+        modelId: String? = nil,
+        outputFormat: String? = nil,
+        speed: Double? = nil,
+        stability: Double? = nil,
+        similarity: Double? = nil,
+        style: Double? = nil,
+        speakerBoost: Bool? = nil,
+        seed: UInt32? = nil,
+        normalize: String? = nil,
+        language: String? = nil
+    ) {
+        // Top-level payload fields.
+        self.text = text
+        self.modelId = modelId
+        self.outputFormat = outputFormat
+        self.seed = seed
+        self.normalize = normalize
+        self.language = language
+
+        // Voice settings.
+        self.speed = speed
+        self.stability = stability
+        self.similarity = similarity
+        self.style = style
+        self.speakerBoost = speakerBoost
+    }
+}
+
+/// Minimal HTTP client for the ElevenLabs text-to-speech and voice-listing endpoints.
+///
+/// Requests are sent through `URLSession.shared` and authenticated with the `xi-api-key` header.
+public struct ElevenLabsTTSClient: Sendable {
+    /// API key sent in the `xi-api-key` header of every request.
+    public var apiKey: String
+    /// Per-attempt `URLRequest.timeoutInterval` used by `synthesize`.
+    public var requestTimeoutSeconds: TimeInterval
+    /// `URLRequest.timeoutInterval` used by `listVoices`.
+    public var listVoicesTimeoutSeconds: TimeInterval
+    /// Root URL that the `v1/...` path components are appended to.
+    public var baseUrl: URL
+
+    /// Total number of attempts `synthesize` makes before giving up.
+    private static let maxAttempts = 3
+    /// Minimum pause, in seconds, after a failed attempt (indexed by attempt number).
+    private static let retryBackoffSeconds: [TimeInterval] = [0.25, 0.75, 1.5]
+    /// Upper bound for a single sleep; keeps the nanosecond conversion far away from overflow.
+    private static let maxSleepSeconds: TimeInterval = 86_400
+
+    /// Creates a client.
+    /// - Parameters:
+    ///   - apiKey: Value for the `xi-api-key` header.
+    ///   - requestTimeoutSeconds: Per-attempt timeout for synthesis requests.
+    ///   - listVoicesTimeoutSeconds: Timeout for the voice listing request.
+    ///   - baseUrl: API root; defaults to `https://api.elevenlabs.io`.
+    public init(
+        apiKey: String,
+        requestTimeoutSeconds: TimeInterval = 45,
+        listVoicesTimeoutSeconds: TimeInterval = 15,
+        baseUrl: URL = URL(string: "https://api.elevenlabs.io")!)
+    {
+        self.apiKey = apiKey
+        self.requestTimeoutSeconds = requestTimeoutSeconds
+        self.listVoicesTimeoutSeconds = listVoicesTimeoutSeconds
+        self.baseUrl = baseUrl
+    }
+
+    /// Runs `synthesize` but fails with a 408 `NSError` once `hardTimeoutSeconds` have elapsed,
+    /// regardless of how many retries are still pending.
+    ///
+    /// A non-positive or NaN timeout expires immediately; very large values are capped at one day.
+    /// - Throws: The synthesis error, or the timeout error, whichever happens first.
+    public func synthesizeWithHardTimeout(
+        voiceId: String,
+        request: ElevenLabsTTSRequest,
+        hardTimeoutSeconds: TimeInterval) async throws -> Data
+    {
+        try await withThrowingTaskGroup(of: Data.self) { group in
+            // Whichever child finishes first wins; the loser is cancelled on every exit path.
+            defer { group.cancelAll() }
+            group.addTask {
+                try await self.synthesize(voiceId: voiceId, request: request)
+            }
+            group.addTask {
+                try await Task.sleep(nanoseconds: Self.nanoseconds(fromSeconds: hardTimeoutSeconds))
+                throw NSError(domain: "ElevenLabsTTS", code: 408, userInfo: [
+                    NSLocalizedDescriptionKey: "ElevenLabs TTS timed out after \(hardTimeoutSeconds)s",
+                ])
+            }
+            guard let data = try await group.next() else {
+                // Unreachable while two children are added above; kept instead of a force unwrap.
+                throw NSError(domain: "ElevenLabsTTS", code: 1, userInfo: [
+                    NSLocalizedDescriptionKey: "ElevenLabs failed",
+                ])
+            }
+            return data
+        }
+    }
+
+    /// Posts `request` to `v1/text-to-speech/<voiceId>` and returns the response body.
+    ///
+    /// Up to three attempts are made. Only transport errors and HTTP 429 / 5xx responses are
+    /// retried (honouring a numeric `Retry-After` header when it exceeds the base backoff).
+    /// Other HTTP failures, non-audio responses and task cancellation fail immediately.
+    /// - Throws: An `NSError` in domain `ElevenLabsTTS` for HTTP-level failures (code is the
+    ///   status, or 415 for a non-audio body, or 400 for an unencodable payload), the transport
+    ///   error of the last attempt, or a cancellation error.
+    public func synthesize(voiceId: String, request: ElevenLabsTTSRequest) async throws -> Data {
+        var url = self.baseUrl
+        url.appendPathComponent("v1")
+        url.appendPathComponent("text-to-speech")
+        url.appendPathComponent(voiceId)
+
+        // `data(withJSONObject:)` raises an Objective-C exception (not a Swift error) for
+        // payloads such as non-finite numbers, so validate first and fail with a regular error.
+        let payload = Self.buildPayload(request)
+        guard JSONSerialization.isValidJSONObject(payload) else {
+            throw NSError(domain: "ElevenLabsTTS", code: 400, userInfo: [
+                NSLocalizedDescriptionKey: "ElevenLabs request is not encodable as JSON",
+            ])
+        }
+        let body = try JSONSerialization.data(withJSONObject: payload, options: [])
+
+        var lastError: Error?
+        for attempt in 0..<Self.maxAttempts {
+            try Task.checkCancellation()
+            let isLastAttempt = attempt == Self.maxAttempts - 1
+            let baseDelay = Self.retryBackoffSeconds[min(attempt, Self.retryBackoffSeconds.count - 1)]
+
+            var req = URLRequest(url: url)
+            req.httpMethod = "POST"
+            req.httpBody = body
+            req.timeoutInterval = self.requestTimeoutSeconds
+            req.setValue("application/json", forHTTPHeaderField: "Content-Type")
+            req.setValue("audio/mpeg", forHTTPHeaderField: "Accept")
+            req.setValue(self.apiKey, forHTTPHeaderField: "xi-api-key")
+
+            // Only the network call sits inside the `do`, so HTTP-level failures thrown further
+            // down are never mistaken for retryable transport errors.
+            let fetched: (Data, URLResponse)
+            do {
+                fetched = try await URLSession.shared.data(for: req)
+            } catch {
+                if Self.isCancellation(error) { throw error }
+                lastError = error
+                if isLastAttempt { throw error }
+                try await Task.sleep(nanoseconds: Self.nanoseconds(fromSeconds: baseDelay))
+                continue
+            }
+            let (data, response) = fetched
+
+            guard let http = response as? HTTPURLResponse else { return data }
+            let status = http.statusCode
+            let contentType = (http.value(forHTTPHeaderField: "Content-Type") ?? "unknown").lowercased()
+
+            if status == 429 || status >= 500 {
+                let message = Self.truncatedErrorBody(data)
+                let failure = NSError(domain: "ElevenLabsTTS", code: status, userInfo: [
+                    NSLocalizedDescriptionKey: "ElevenLabs retryable failure: \(status) ct=\(contentType) \(message)",
+                ])
+                lastError = failure
+                if isLastAttempt { throw failure }
+                // A missing, non-numeric or NaN header falls back to the base backoff.
+                let retryAfter = Double(http.value(forHTTPHeaderField: "Retry-After") ?? "") ?? 0
+                let delaySeconds = retryAfter > baseDelay ? retryAfter : baseDelay
+                try await Task.sleep(nanoseconds: Self.nanoseconds(fromSeconds: delaySeconds))
+                continue
+            }
+
+            if status >= 400 {
+                let message = Self.truncatedErrorBody(data)
+                throw NSError(domain: "ElevenLabsTTS", code: status, userInfo: [
+                    NSLocalizedDescriptionKey: "ElevenLabs failed: \(status) ct=\(contentType) \(message)",
+                ])
+            }
+
+            if !contentType.contains("audio") {
+                let message = Self.truncatedErrorBody(data)
+                throw NSError(domain: "ElevenLabsTTS", code: 415, userInfo: [
+                    NSLocalizedDescriptionKey: "ElevenLabs returned non-audio ct=\(contentType) \(message)",
+                ])
+            }
+            return data
+        }
+        throw lastError ?? NSError(domain: "ElevenLabsTTS", code: 1, userInfo: [
+            NSLocalizedDescriptionKey: "ElevenLabs failed",
+        ])
+    }
+
+    /// Fetches `v1/voices` and returns the decoded `voices` array.
+    /// - Throws: An `NSError` in domain `ElevenLabsTTS` for HTTP status >= 400, a transport
+    ///   error, or a decoding error when the body does not match the expected shape.
+    public func listVoices() async throws -> [ElevenLabsVoice] {
+        var url = self.baseUrl
+        url.appendPathComponent("v1")
+        url.appendPathComponent("voices")
+
+        var req = URLRequest(url: url)
+        req.httpMethod = "GET"
+        req.timeoutInterval = self.listVoicesTimeoutSeconds
+        req.setValue(self.apiKey, forHTTPHeaderField: "xi-api-key")
+
+        let (data, response) = try await URLSession.shared.data(for: req)
+        if let http = response as? HTTPURLResponse, http.statusCode >= 400 {
+            let message = Self.truncatedErrorBody(data)
+            throw NSError(domain: "ElevenLabsTTS", code: http.statusCode, userInfo: [
+                NSLocalizedDescriptionKey: "ElevenLabs voices failed: \(http.statusCode) \(message)",
+            ])
+        }
+
+        struct VoicesResponse: Decodable { let voices: [ElevenLabsVoice] }
+        return try JSONDecoder().decode(VoicesResponse.self, from: data).voices
+    }
+
+    /// Returns the trimmed value when it is non-empty and starts with `mp3_`, otherwise `nil`.
+    public static func validatedOutputFormat(_ value: String?) -> String? {
+        let trimmed = (value ?? "").trimmingCharacters(in: .whitespacesAndNewlines)
+        guard !trimmed.isEmpty else { return nil }
+        guard trimmed.hasPrefix("mp3_") else { return nil }
+        return trimmed
+    }
+
+    /// Returns the trimmed, lowercased value when it is exactly two characters, each between
+    /// `a` and `z`; otherwise `nil`.
+    public static func validatedLanguage(_ value: String?) -> String? {
+        let normalized = (value ?? "").trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
+        guard normalized.count == 2, normalized.allSatisfy({ $0 >= "a" && $0 <= "z" }) else { return nil }
+        return normalized
+    }
+
+    /// Returns the trimmed, lowercased value when it is one of `auto`, `on` or `off`; otherwise `nil`.
+    public static func validatedNormalize(_ value: String?) -> String? {
+        let normalized = (value ?? "").trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
+        guard ["auto", "on", "off"].contains(normalized) else { return nil }
+        return normalized
+    }
+
+    /// Builds the JSON object for a synthesis request; `nil` and blank fields are omitted and
+    /// `voice_settings` is only present when at least one setting is provided.
+    private static func buildPayload(_ request: ElevenLabsTTSRequest) -> [String: Any] {
+        var payload: [String: Any] = ["text": request.text]
+        if let modelId = request.modelId?.trimmingCharacters(in: .whitespacesAndNewlines), !modelId.isEmpty {
+            payload["model_id"] = modelId
+        }
+        if let outputFormat = request.outputFormat?.trimmingCharacters(in: .whitespacesAndNewlines), !outputFormat.isEmpty {
+            payload["output_format"] = outputFormat
+        }
+        if let seed = request.seed {
+            payload["seed"] = seed
+        }
+        if let normalize = request.normalize {
+            payload["apply_text_normalization"] = normalize
+        }
+        if let language = request.language {
+            payload["language_code"] = language
+        }
+
+        var voiceSettings: [String: Any] = [:]
+        if let speed = request.speed { voiceSettings["speed"] = speed }
+        if let stability = request.stability { voiceSettings["stability"] = stability }
+        if let similarity = request.similarity { voiceSettings["similarity_boost"] = similarity }
+        if let style = request.style { voiceSettings["style"] = style }
+        if let speakerBoost = request.speakerBoost { voiceSettings["use_speaker_boost"] = speakerBoost }
+        if !voiceSettings.isEmpty {
+            payload["voice_settings"] = voiceSettings
+        }
+        return payload
+    }
+
+    /// Renders at most the first 4096 bytes of an error body as single-line text.
+    ///
+    /// Returns `"unknown"` when the bytes are not UTF-8. When the 4096-byte cut lands inside a
+    /// multi-byte character, up to three trailing bytes are dropped so the text still decodes.
+    private static func truncatedErrorBody(_ data: Data) -> String {
+        let limit = 4096
+        var slice = data.prefix(limit)
+        var decoded = String(data: slice, encoding: .utf8)
+        if decoded == nil, data.count > limit {
+            // A UTF-8 scalar is at most four bytes, so three drops reach a boundary if one exists.
+            for _ in 0..<3 where decoded == nil {
+                slice = slice.dropLast()
+                decoded = String(data: slice, encoding: .utf8)
+            }
+        }
+        let raw = decoded ?? "unknown"
+        return raw.replacingOccurrences(of: "\n", with: " ").replacingOccurrences(of: "\r", with: " ")
+    }
+
+    /// Reports whether `error` stems from cancellation of the current task or its URL load.
+    private static func isCancellation(_ error: Error) -> Bool {
+        if Task.isCancelled || error is CancellationError { return true }
+        return (error as? URLError)?.code == .cancelled
+    }
+
+    /// Converts seconds to nanoseconds for `Task.sleep` without trapping: NaN and non-positive
+    /// values become 0, and anything above `maxSleepSeconds` (including infinity) is capped.
+    private static func nanoseconds(fromSeconds seconds: TimeInterval) -> UInt64 {
+        guard !seconds.isNaN, seconds > 0 else { return 0 }
+        let capped = seconds < Self.maxSleepSeconds ? seconds : Self.maxSleepSeconds
+        return UInt64(capped * 1_000_000_000)
+    }
+}
--- a/apps/shared/ClawdisKit/Sources/ClawdisKit/TalkDirective.swift
+++ b/apps/shared/ClawdisKit/Sources/ClawdisKit/TalkDirective.swift
@@ -67,12 +67,18 @@ public enum TalkDirectiveParser {
        var lines = normalized.split(separator: "\n", omittingEmptySubsequences: false)
        guard !lines.isEmpty else { return TalkDirectiveParseResult(directive: nil, stripped: text, unknownKeys: []) }

-        guard let firstNonEmpty =
+        guard let firstNonEmptyIndex =
            lines.firstIndex(where: { !$0.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty })
        else {
            return TalkDirectiveParseResult(directive: nil, stripped: text, unknownKeys: [])
        }

+        var firstNonEmpty = firstNonEmptyIndex
+        if firstNonEmpty > 0 {
+            lines.removeSubrange(0..<firstNonEmpty)
+            firstNonEmpty = 0
+        }
+
        let head = lines[firstNonEmpty].trimmingCharacters(in: .whitespacesAndNewlines)
        guard head.hasPrefix("{"), head.hasSuffix("}") else {
            return TalkDirectiveParseResult(directive: nil, stripped: text, unknownKeys: [])
--- a/apps/shared/ClawdisKit/Sources/ClawdisKit/TalkHistoryTimestamp.swift
+++ b/apps/shared/ClawdisKit/Sources/ClawdisKit/TalkHistoryTimestamp.swift
@@ -0,0 +1,13 @@
+/// Comparison helper for gateway history timestamps that may arrive in mixed units.
+public enum TalkHistoryTimestamp: Sendable {
+    /// Reports whether `timestamp` is at or after `sinceSeconds`, with half a second of slack.
+    ///
+    /// Gateway history timestamps have historically been emitted as either epoch seconds or
+    /// epoch milliseconds (both as `Double`); either form is accepted.
+    /// - Parameters:
+    ///   - timestamp: Epoch seconds or epoch milliseconds.
+    ///   - sinceSeconds: Reference instant in epoch seconds.
+    public static func isAfter(_ timestamp: Double, sinceSeconds: Double) -> Bool {
+        // ~2286-11-20 in epoch seconds. Anything bigger is almost certainly epoch milliseconds.
+        let looksLikeMilliseconds = timestamp > 10_000_000_000
+        guard looksLikeMilliseconds else {
+            return timestamp >= sinceSeconds - 0.5
+        }
+        let sinceMilliseconds = sinceSeconds * 1000
+        return timestamp >= sinceMilliseconds - 500
+    }
+}
+
--- a/apps/shared/ClawdisKit/Sources/ClawdisKit/TalkPromptBuilder.swift
+++ b/apps/shared/ClawdisKit/Sources/ClawdisKit/TalkPromptBuilder.swift
@@ -0,0 +1,18 @@
+/// Assembles the text prompt sent for a Talk Mode turn.
+public enum TalkPromptBuilder: Sendable {
+    /// Builds the prompt: two fixed instruction lines, an optional interruption note, a blank
+    /// line, then the transcript, all joined with newlines.
+    /// - Parameters:
+    ///   - transcript: User transcript, appended verbatim as the last section.
+    ///   - interruptedAtSeconds: When non-nil, adds a line stating the interruption time with
+    ///     one decimal place.
+    public static func build(transcript: String, interruptedAtSeconds: Double?) -> String {
+        let interruptionNote = interruptedAtSeconds.map { seconds in
+            "Assistant speech interrupted at \(String(format: "%.1f", seconds))s."
+        }
+
+        var sections: [String] = []
+        sections.append("Talk Mode active. Reply in a concise, spoken tone.")
+        sections.append(
+            "You may optionally prefix the response with JSON (first line) to set ElevenLabs voice, e.g. {\"voice\":\"<id>\",\"once\":true}.")
+        if let interruptionNote {
+            sections.append(interruptionNote)
+        }
+        sections.append(contentsOf: ["", transcript])
+
+        return sections.joined(separator: "\n")
+    }
+}
+
--- a/apps/shared/ClawdisKit/Sources/ClawdisKit/TalkSystemSpeechSynthesizer.swift
+++ b/apps/shared/ClawdisKit/Sources/ClawdisKit/TalkSystemSpeechSynthesizer.swift
@@ -0,0 +1,110 @@
+import AVFoundation
+import Foundation
+
+/// Speaks text through the system `AVSpeechSynthesizer`, one utterance at a time.
+///
+/// All state is main-actor isolated. Starting a new utterance, calling `stop()`, or cancelling
+/// the awaiting task ends the in-flight `speak` call with `SpeakError.canceled`.
+@MainActor
+public final class TalkSystemSpeechSynthesizer: NSObject {
+    /// Errors thrown by `speak(text:language:)` in addition to the watchdog timeout `NSError`.
+    public enum SpeakError: Error {
+        /// Speech was stopped or superseded before it finished.
+        case canceled
+    }
+
+    /// Process-wide instance; the initializer is private.
+    public static let shared = TalkSystemSpeechSynthesizer()
+
+    private let synth = AVSpeechSynthesizer()
+    /// Continuation of the `speak` call currently awaiting completion, if any.
+    private var speakContinuation: CheckedContinuation<Void, Error>?
+    /// Utterance whose delegate callbacks are allowed to complete `speakContinuation`.
+    private var currentUtterance: AVSpeechUtterance?
+    /// Most recently finished utterance, kept alive so the utterance that replaces it cannot be
+    /// allocated at the same address while a late delegate callback is still in flight (the
+    /// callbacks are matched by `ObjectIdentifier`).
+    private var retiredUtterance: AVSpeechUtterance?
+    /// Changes on every `speak`/`stop`; lets delayed work detect that it has been superseded.
+    private var currentToken = UUID()
+    /// Timeout task that aborts an utterance which never reports completion.
+    private var watchdog: Task<Void, Never>?
+
+    /// Whether the underlying synthesizer is currently speaking.
+    public var isSpeaking: Bool { self.synth.isSpeaking }
+
+    private override init() {
+        super.init()
+        self.synth.delegate = self
+    }
+
+    /// Stops speech immediately and fails any pending `speak` call with `SpeakError.canceled`.
+    public func stop() {
+        self.currentToken = UUID()
+        self.watchdog?.cancel()
+        self.watchdog = nil
+        self.synth.stopSpeaking(at: .immediate)
+        self.finishCurrent(with: SpeakError.canceled)
+    }
+
+    /// Speaks `text` and returns once the utterance has finished.
+    ///
+    /// Whitespace-only text returns immediately. Any utterance already in progress is stopped
+    /// first. A watchdog sized from the text length (between 3 and 180 seconds) aborts speech
+    /// that never reports completion.
+    /// - Parameters:
+    ///   - text: Text to speak; surrounding whitespace is trimmed.
+    ///   - language: Optional language passed to `AVSpeechSynthesisVoice(language:)`; ignored
+    ///     when no matching voice is found.
+    /// - Throws: `SpeakError.canceled` when stopped, superseded or cancelled; an `NSError` with
+    ///   code 408 when the watchdog fires.
+    public func speak(text: String, language: String? = nil) async throws {
+        let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines)
+        guard !trimmed.isEmpty else { return }
+
+        self.stop()
+        let token = UUID()
+        self.currentToken = token
+
+        let utterance = AVSpeechUtterance(string: trimmed)
+        if let language, let voice = AVSpeechSynthesisVoice(language: language) {
+            utterance.voice = voice
+        }
+        self.currentUtterance = utterance
+
+        let estimatedSeconds = max(3.0, min(180.0, Double(trimmed.count) * 0.08))
+        self.watchdog?.cancel()
+        self.watchdog = Task { @MainActor [weak self] in
+            try? await Task.sleep(nanoseconds: UInt64(estimatedSeconds * 1_000_000_000))
+            if Task.isCancelled { return }
+            guard let self, self.currentToken == token else { return }
+            if self.synth.isSpeaking {
+                self.synth.stopSpeaking(at: .immediate)
+            }
+            self.finishCurrent(
+                with: NSError(domain: "TalkSystemSpeechSynthesizer", code: 408, userInfo: [
+                    NSLocalizedDescriptionKey: "system TTS timed out after \(estimatedSeconds)s",
+                ]))
+        }
+
+        try await withTaskCancellationHandler(operation: {
+            try await withCheckedThrowingContinuation { cont in
+                self.speakContinuation = cont
+                self.synth.speak(utterance)
+            }
+        }, onCancel: {
+            Task { @MainActor in
+                // The hop to the main actor is asynchronous; by the time it runs a different
+                // utterance may be active, and that one must not be stopped.
+                guard self.currentToken == token else { return }
+                self.stop()
+            }
+        })
+
+        if self.currentToken != token {
+            throw SpeakError.canceled
+        }
+    }
+
+    /// Completes the pending `speak` call for a delegate callback, but only when the callback
+    /// belongs to the utterance that is still current.
+    ///
+    /// `stopSpeaking` reports the cancellation of the previous utterance through the delegate,
+    /// and that report reaches the main actor after the next utterance has been installed.
+    /// Without the identity check it would fail the new utterance's continuation.
+    private func handleFinish(utteranceID: ObjectIdentifier, error: Error?) {
+        guard let current = self.currentUtterance, ObjectIdentifier(current) == utteranceID else { return }
+        self.watchdog?.cancel()
+        self.watchdog = nil
+        self.finishCurrent(with: error)
+    }
+
+    /// Clears the current utterance and resumes the pending continuation (if any) exactly once.
+    private func finishCurrent(with error: Error?) {
+        if let finished = self.currentUtterance {
+            self.retiredUtterance = finished
+        }
+        self.currentUtterance = nil
+        let cont = self.speakContinuation
+        self.speakContinuation = nil
+        if let error {
+            cont?.resume(throwing: error)
+        } else {
+            cont?.resume(returning: ())
+        }
+    }
+}
+
+extension TalkSystemSpeechSynthesizer: AVSpeechSynthesizerDelegate {
+    /// Forwards normal completion to the main actor, tagged with the utterance's identity.
+    public nonisolated func speechSynthesizer(_ synthesizer: AVSpeechSynthesizer, didFinish utterance: AVSpeechUtterance) {
+        // Only the identifier crosses into the task; the utterance object itself is not Sendable.
+        let utteranceID = ObjectIdentifier(utterance)
+        Task { @MainActor in
+            self.handleFinish(utteranceID: utteranceID, error: nil)
+        }
+    }
+
+    /// Forwards cancellation to the main actor, tagged with the utterance's identity.
+    public nonisolated func speechSynthesizer(_ synthesizer: AVSpeechSynthesizer, didCancel utterance: AVSpeechUtterance) {
+        let utteranceID = ObjectIdentifier(utterance)
+        Task { @MainActor in
+            self.handleFinish(utteranceID: utteranceID, error: SpeakError.canceled)
+        }
+    }
+}
--- a/apps/shared/ClawdisKit/Sources/ClawdisKit/TalkTTSValidation.swift
+++ b/apps/shared/ClawdisKit/Sources/ClawdisKit/TalkTTSValidation.swift
@@ -0,0 +1,27 @@
+/// Validation helpers for text-to-speech parameters; each returns `nil` for unusable input.
+public enum TalkTTSValidation: Sendable {
+    /// Resolves a speed multiplier from either a words-per-minute rate or an explicit speed.
+    ///
+    /// A positive `rateWPM` takes precedence and is converted as `rateWPM / 175`; `speed` is
+    /// not consulted in that case, even when the converted rate is rejected.
+    /// - Returns: The multiplier when it lies strictly between 0.5 and 2.0, otherwise `nil`.
+    ///   NaN is rejected.
+    public static func resolveSpeed(speed: Double?, rateWPM: Int?) -> Double? {
+        let candidate: Double
+        if let rateWPM, rateWPM > 0 {
+            candidate = Double(rateWPM) / 175.0
+        } else if let speed {
+            candidate = speed
+        } else {
+            return nil
+        }
+        // Written as a positive range test so that NaN (which fails every comparison) is rejected.
+        guard candidate > 0.5, candidate < 2.0 else { return nil }
+        return candidate
+    }
+
+    /// Returns `value` when it lies in the closed range 0...1, otherwise `nil` (NaN included).
+    public static func validatedUnit(_ value: Double?) -> Double? {
+        guard let value, (0.0...1.0).contains(value) else { return nil }
+        return value
+    }
+
+    /// Returns `value` as a `UInt32` when it is representable (0...4294967295), otherwise `nil`.
+    public static func validatedSeed(_ value: Int?) -> UInt32? {
+        // `UInt32(exactly:)` avoids an Int literal that does not fit where `Int` is 32 bits wide.
+        guard let value else { return nil }
+        return UInt32(exactly: value)
+    }
+}
+}
+