fix(talk): harden TTS + add system fallback
This commit is contained in:
233
apps/shared/ClawdisKit/Sources/ClawdisKit/ElevenLabsTTS.swift
Normal file
233
apps/shared/ClawdisKit/Sources/ClawdisKit/ElevenLabsTTS.swift
Normal file
@@ -0,0 +1,233 @@
|
||||
import Foundation
|
||||
|
||||
/// A single voice entry decoded from an ElevenLabs voices payload.
public struct ElevenLabsVoice: Decodable, Sendable {
    /// Voice identifier; read from the `voice_id` JSON key.
    public let voiceId: String
    /// Human-readable voice name; `nil` when the payload omits it.
    public let name: String?

    /// Wire-format keys: only `voiceId` differs from its property name.
    enum CodingKeys: String, CodingKey {
        case name
        case voiceId = "voice_id"
    }
}
|
||||
|
||||
/// Parameters for one text-to-speech synthesis call.
///
/// Only `text` is required; every other field is optional and left out of the
/// request payload when `nil`.
public struct ElevenLabsTTSRequest: Sendable {
    // MARK: Content

    /// The text to synthesize.
    public var text: String
    /// Model identifier, sent as `model_id`.
    public var modelId: String?
    /// Requested audio format, sent as `output_format`.
    public var outputFormat: String?

    // MARK: Voice settings (sent nested under `voice_settings`)

    /// Speaking speed, sent as `speed`.
    public var speed: Double?
    /// Sent as `stability`.
    public var stability: Double?
    /// Sent as `similarity_boost`.
    public var similarity: Double?
    /// Sent as `style`.
    public var style: Double?
    /// Sent as `use_speaker_boost`.
    public var speakerBoost: Bool?

    // MARK: Generation options

    /// Sent as `seed`.
    public var seed: UInt32?
    /// Text normalization mode, sent as `apply_text_normalization`.
    public var normalize: String?
    /// Language code, sent as `language_code`.
    public var language: String?

    /// Creates a request; everything except `text` defaults to `nil`.
    public init(
        text: String,
        modelId: String? = nil,
        outputFormat: String? = nil,
        speed: Double? = nil,
        stability: Double? = nil,
        similarity: Double? = nil,
        style: Double? = nil,
        speakerBoost: Bool? = nil,
        seed: UInt32? = nil,
        normalize: String? = nil,
        language: String? = nil
    ) {
        // Content
        self.text = text
        self.modelId = modelId
        self.outputFormat = outputFormat
        // Voice settings
        self.speed = speed
        self.stability = stability
        self.similarity = similarity
        self.style = style
        self.speakerBoost = speakerBoost
        // Generation options
        self.seed = seed
        self.normalize = normalize
        self.language = language
    }
}
|
||||
|
||||
/// Small HTTP client for ElevenLabs text-to-speech and voice listing.
///
/// All requests go through `URLSession.shared` and authenticate with the
/// `xi-api-key` header.
public struct ElevenLabsTTSClient: Sendable {
    /// API key sent in the `xi-api-key` header.
    public var apiKey: String
    /// `URLRequest.timeoutInterval` applied to each synthesis attempt.
    public var requestTimeoutSeconds: TimeInterval
    /// `URLRequest.timeoutInterval` applied to the voice listing request.
    public var listVoicesTimeoutSeconds: TimeInterval
    /// Root URL that the `v1/...` path components are appended to.
    public var baseUrl: URL

    /// Pause before the 2nd and 3rd synthesis attempt; total attempts = `count + 1`.
    private static let retryDelays: [TimeInterval] = [0.25, 0.75]
    /// Ceiling for any single sleep, so the seconds → nanoseconds conversion can
    /// never leave `UInt64` range (1e9 s * 1e9 ns/s = 1e18 < Int64.max).
    private static let maxSleepSeconds: TimeInterval = 1_000_000_000

    public init(
        apiKey: String,
        requestTimeoutSeconds: TimeInterval = 45,
        listVoicesTimeoutSeconds: TimeInterval = 15,
        baseUrl: URL = URL(string: "https://api.elevenlabs.io")!)
    {
        self.apiKey = apiKey
        self.requestTimeoutSeconds = requestTimeoutSeconds
        self.listVoicesTimeoutSeconds = listVoicesTimeoutSeconds
        self.baseUrl = baseUrl
    }

    /// Runs `synthesize` racing against a timer and returns whichever settles first.
    ///
    /// - Parameters:
    ///   - voiceId: Voice to synthesize with.
    ///   - request: Synthesis parameters.
    ///   - hardTimeoutSeconds: Overall deadline covering all retries. Negative or
    ///     NaN values are treated as zero (immediate timeout) instead of trapping.
    /// - Returns: The audio bytes returned by `synthesize`.
    /// - Throws: An `NSError` in domain `ElevenLabsTTS` with code 408 when the
    ///   deadline elapses first, otherwise whatever `synthesize` throws.
    public func synthesizeWithHardTimeout(
        voiceId: String,
        request: ElevenLabsTTSRequest,
        hardTimeoutSeconds: TimeInterval) async throws -> Data
    {
        try await withThrowingTaskGroup(of: Data.self) { group in
            group.addTask {
                try await self.synthesize(voiceId: voiceId, request: request)
            }
            group.addTask {
                try await Task.sleep(nanoseconds: Self.nanoseconds(fromSeconds: hardTimeoutSeconds))
                throw Self.makeError(
                    code: 408,
                    "ElevenLabs TTS timed out after \(hardTimeoutSeconds)s")
            }
            // Two children were added, so `next()` cannot yield nil here; the guard
            // only replaces a force unwrap.
            guard let data = try await group.next() else {
                throw Self.makeError(code: 1, "ElevenLabs failed")
            }
            // Synthesis won the race: stop the timer child.
            group.cancelAll()
            return data
        }
    }

    /// Synthesizes speech for `request` using `voiceId`, with up to three attempts.
    ///
    /// Retried: transport errors thrown by `URLSession`, HTTP 429, and HTTP 5xx
    /// (honouring a numeric `Retry-After` header when it exceeds the built-in
    /// backoff). Not retried: any other HTTP status >= 400 and responses whose
    /// `Content-Type` does not contain `audio` — repeating those cannot succeed.
    /// Task cancellation ends the retry loop immediately.
    ///
    /// - Returns: The response body (audio bytes).
    /// - Throws: The last transport error, an `NSError` in domain `ElevenLabsTTS`
    ///   carrying the HTTP status (or 415 for a non-audio body), or
    ///   `CancellationError` when cancelled during a backoff pause.
    public func synthesize(voiceId: String, request: ElevenLabsTTSRequest) async throws -> Data {
        let url = self.baseUrl
            .appendingPathComponent("v1")
            .appendingPathComponent("text-to-speech")
            .appendingPathComponent(voiceId)

        // The request is identical for every attempt, so build it once.
        var req = URLRequest(url: url)
        req.httpMethod = "POST"
        req.httpBody = try JSONSerialization.data(withJSONObject: Self.buildPayload(request), options: [])
        req.timeoutInterval = self.requestTimeoutSeconds
        req.setValue("application/json", forHTTPHeaderField: "Content-Type")
        req.setValue("audio/mpeg", forHTTPHeaderField: "Accept")
        req.setValue(self.apiKey, forHTTPHeaderField: "xi-api-key")

        let attemptCount = Self.retryDelays.count + 1
        for attempt in 0..<attemptCount {
            // `nil` on the final attempt: nothing left to wait for.
            let backoff: TimeInterval? = attempt < Self.retryDelays.count ? Self.retryDelays[attempt] : nil

            let result: (Data, URLResponse)
            do {
                result = try await URLSession.shared.data(for: req)
            } catch {
                // Transport failure. Give up when out of attempts or when the task
                // was cancelled (retrying a cancelled task only fails again).
                guard let backoff, !Task.isCancelled else { throw error }
                try await Task.sleep(nanoseconds: Self.nanoseconds(fromSeconds: backoff))
                continue
            }

            let (data, response) = result
            guard let http = response as? HTTPURLResponse else { return data }

            let status = http.statusCode
            let contentType = (http.value(forHTTPHeaderField: "Content-Type") ?? "unknown").lowercased()

            if status == 429 || status >= 500 {
                let message = Self.truncatedErrorBody(data)
                let failure = Self.makeError(
                    code: status,
                    "ElevenLabs retryable failure: \(status) ct=\(contentType) \(message)")
                guard let backoff else { throw failure }

                // Only a finite numeric Retry-After is honoured; anything else
                // (HTTP-date, "inf", garbage) falls back to the built-in backoff.
                var delaySeconds = backoff
                if let hinted = Double(http.value(forHTTPHeaderField: "Retry-After") ?? ""), hinted.isFinite {
                    delaySeconds = max(delaySeconds, hinted)
                }
                try await Task.sleep(nanoseconds: Self.nanoseconds(fromSeconds: delaySeconds))
                continue
            }

            if status >= 400 {
                let message = Self.truncatedErrorBody(data)
                throw Self.makeError(
                    code: status,
                    "ElevenLabs failed: \(status) ct=\(contentType) \(message)")
            }

            if !contentType.contains("audio") {
                let message = Self.truncatedErrorBody(data)
                throw Self.makeError(
                    code: 415,
                    "ElevenLabs returned non-audio ct=\(contentType) \(message)")
            }

            return data
        }

        // Every loop iteration returns, throws, or continues with attempts left,
        // so this is only reached if the loop bounds are changed incorrectly.
        throw Self.makeError(code: 1, "ElevenLabs failed")
    }

    /// Fetches the voices available to this API key from `v1/voices`.
    ///
    /// - Throws: An `NSError` in domain `ElevenLabsTTS` for HTTP status >= 400,
    ///   a transport error, or a decoding error for an unexpected body.
    public func listVoices() async throws -> [ElevenLabsVoice] {
        let url = self.baseUrl
            .appendingPathComponent("v1")
            .appendingPathComponent("voices")

        var req = URLRequest(url: url)
        req.httpMethod = "GET"
        req.timeoutInterval = self.listVoicesTimeoutSeconds
        req.setValue(self.apiKey, forHTTPHeaderField: "xi-api-key")

        let (data, response) = try await URLSession.shared.data(for: req)
        if let http = response as? HTTPURLResponse, http.statusCode >= 400 {
            let message = Self.truncatedErrorBody(data)
            throw Self.makeError(
                code: http.statusCode,
                "ElevenLabs voices failed: \(http.statusCode) \(message)")
        }

        struct VoicesResponse: Decodable { let voices: [ElevenLabsVoice] }
        return try JSONDecoder().decode(VoicesResponse.self, from: data).voices
    }

    /// Returns the trimmed format when it starts with `mp3_`; otherwise `nil`.
    public static func validatedOutputFormat(_ value: String?) -> String? {
        let trimmed = (value ?? "").trimmingCharacters(in: .whitespacesAndNewlines)
        guard !trimmed.isEmpty else { return nil }
        guard trimmed.hasPrefix("mp3_") else { return nil }
        return trimmed
    }

    /// Returns the trimmed, lowercased value when it is exactly two letters `a`–`z`; otherwise `nil`.
    public static func validatedLanguage(_ value: String?) -> String? {
        let normalized = (value ?? "").trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
        guard normalized.count == 2, normalized.allSatisfy({ $0 >= "a" && $0 <= "z" }) else { return nil }
        return normalized
    }

    /// Returns the trimmed, lowercased value when it is `auto`, `on` or `off`; otherwise `nil`.
    public static func validatedNormalize(_ value: String?) -> String? {
        let normalized = (value ?? "").trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
        guard ["auto", "on", "off"].contains(normalized) else { return nil }
        return normalized
    }

    /// Builds the JSON body for a synthesis request, omitting unset fields.
    private static func buildPayload(_ request: ElevenLabsTTSRequest) -> [String: Any] {
        var payload: [String: Any] = ["text": request.text]
        if let modelId = request.modelId?.trimmingCharacters(in: .whitespacesAndNewlines), !modelId.isEmpty {
            payload["model_id"] = modelId
        }
        // NOTE(review): the ElevenLabs API reference appears to document
        // `output_format` as a query parameter — confirm the body field is honoured.
        if let outputFormat = request.outputFormat?.trimmingCharacters(in: .whitespacesAndNewlines), !outputFormat.isEmpty {
            payload["output_format"] = outputFormat
        }
        if let seed = request.seed {
            payload["seed"] = seed
        }
        if let normalize = request.normalize {
            payload["apply_text_normalization"] = normalize
        }
        if let language = request.language {
            payload["language_code"] = language
        }

        var voiceSettings: [String: Any] = [:]
        if let speed = request.speed { voiceSettings["speed"] = speed }
        if let stability = request.stability { voiceSettings["stability"] = stability }
        if let similarity = request.similarity { voiceSettings["similarity_boost"] = similarity }
        if let style = request.style { voiceSettings["style"] = style }
        if let speakerBoost = request.speakerBoost { voiceSettings["use_speaker_boost"] = speakerBoost }
        if !voiceSettings.isEmpty {
            payload["voice_settings"] = voiceSettings
        }
        return payload
    }

    /// Renders at most the first 4096 bytes of an error body as a single line.
    ///
    /// Decoding is lossy on purpose: a cut through a multi-byte UTF-8 sequence
    /// yields a replacement character instead of discarding the whole message.
    private static func truncatedErrorBody(_ data: Data) -> String {
        let raw = String(decoding: data.prefix(4096), as: UTF8.self)
        return raw.replacingOccurrences(of: "\n", with: " ").replacingOccurrences(of: "\r", with: " ")
    }

    /// Creates an `NSError` in the `ElevenLabsTTS` domain with a localized description.
    private static func makeError(code: Int, _ message: String) -> NSError {
        NSError(domain: "ElevenLabsTTS", code: code, userInfo: [
            NSLocalizedDescriptionKey: message,
        ])
    }

    /// Converts seconds to nanoseconds without trapping.
    ///
    /// NaN and non-positive inputs become 0; larger inputs (including infinity)
    /// are capped at `maxSleepSeconds`.
    private static func nanoseconds(fromSeconds seconds: TimeInterval) -> UInt64 {
        // `seconds > 0` is false for NaN, so NaN takes the early exit too.
        guard seconds > 0 else { return 0 }
        return UInt64(min(seconds, Self.maxSleepSeconds) * 1_000_000_000)
    }
}
|
||||
@@ -67,12 +67,18 @@ public enum TalkDirectiveParser {
|
||||
var lines = normalized.split(separator: "\n", omittingEmptySubsequences: false)
|
||||
guard !lines.isEmpty else { return TalkDirectiveParseResult(directive: nil, stripped: text, unknownKeys: []) }
|
||||
|
||||
guard let firstNonEmpty =
|
||||
guard let firstNonEmptyIndex =
|
||||
lines.firstIndex(where: { !$0.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty })
|
||||
else {
|
||||
return TalkDirectiveParseResult(directive: nil, stripped: text, unknownKeys: [])
|
||||
}
|
||||
|
||||
var firstNonEmpty = firstNonEmptyIndex
|
||||
if firstNonEmpty > 0 {
|
||||
lines.removeSubrange(0..<firstNonEmpty)
|
||||
firstNonEmpty = 0
|
||||
}
|
||||
|
||||
let head = lines[firstNonEmpty].trimmingCharacters(in: .whitespacesAndNewlines)
|
||||
guard head.hasPrefix("{"), head.hasSuffix("}") else {
|
||||
return TalkDirectiveParseResult(directive: nil, stripped: text, unknownKeys: [])
|
||||
|
||||
@@ -0,0 +1,13 @@
|
||||
public enum TalkHistoryTimestamp: Sendable {
    /// Reports whether `timestamp` falls at or after `sinceSeconds`, with half a second of slack.
    ///
    /// Gateway history timestamps have been emitted both as epoch seconds and as
    /// epoch milliseconds (both as `Double`), so the unit is inferred from magnitude.
    ///
    /// - Parameters:
    ///   - timestamp: Epoch time in seconds or milliseconds.
    ///   - sinceSeconds: Reference instant, always in epoch seconds.
    /// - Returns: `true` when the timestamp is no more than 0.5 s (500 ms) before the reference.
    public static func isAfter(_ timestamp: Double, sinceSeconds: Double) -> Bool {
        // 10_000_000_000 epoch seconds is ~2286-11-20; larger values are almost
        // certainly epoch milliseconds.
        let looksLikeMilliseconds = timestamp > 10_000_000_000
        guard looksLikeMilliseconds else {
            return timestamp >= sinceSeconds - 0.5
        }
        return timestamp >= sinceSeconds * 1000 - 500
    }
}
|
||||
|
||||
@@ -0,0 +1,18 @@
|
||||
public enum TalkPromptBuilder: Sendable {
    /// Assembles the Talk Mode prompt sent along with a user transcript.
    ///
    /// Layout, one item per line: two fixed instruction lines, an optional
    /// interruption note, an empty line, then the transcript.
    ///
    /// - Parameters:
    ///   - transcript: The user's transcribed speech, appended verbatim.
    ///   - interruptedAtSeconds: When non-nil, adds a note with this value
    ///     formatted to one decimal place.
    /// - Returns: The prompt with its parts joined by `\n`.
    public static func build(transcript: String, interruptedAtSeconds: Double?) -> String {
        let preamble = [
            "Talk Mode active. Reply in a concise, spoken tone.",
            "You may optionally prefix the response with JSON (first line) to set ElevenLabs voice, e.g. {\"voice\":\"<id>\",\"once\":true}.",
        ]

        let interruptionNote = interruptedAtSeconds.map { seconds -> String in
            let formatted = String(format: "%.1f", seconds)
            return "Assistant speech interrupted at \(formatted)s."
        }

        let sections = preamble + [interruptionNote].compactMap { $0 } + ["", transcript]
        return sections.joined(separator: "\n")
    }
}
|
||||
|
||||
@@ -0,0 +1,110 @@
|
||||
import AVFoundation
|
||||
import Foundation
|
||||
|
||||
/// Speaks text through the system `AVSpeechSynthesizer`, one utterance at a time.
///
/// `speak(text:language:)` suspends until the utterance finishes, is cancelled,
/// or a watchdog timer expires. Starting a new utterance or calling `stop()`
/// ends the one in progress with `SpeakError.canceled`.
@MainActor
public final class TalkSystemSpeechSynthesizer: NSObject {
    public enum SpeakError: Error {
        /// The utterance was stopped or superseded before it finished.
        case canceled
    }

    public static let shared = TalkSystemSpeechSynthesizer()

    private let synth = AVSpeechSynthesizer()
    /// Continuation of the `speak` call currently awaiting completion, if any.
    private var speakContinuation: CheckedContinuation<Void, Error>?
    /// Utterance belonging to `speakContinuation`; delegate callbacks for any
    /// other utterance are ignored.
    private var currentUtterance: AVSpeechUtterance?
    /// Rotated by every `speak` and `stop`; lets delayed work detect it is stale.
    private var currentToken = UUID()
    /// Timer that fails the current utterance if the synthesizer never reports back.
    private var watchdog: Task<Void, Never>?

    public var isSpeaking: Bool { self.synth.isSpeaking }

    private override init() {
        super.init()
        self.synth.delegate = self
    }

    /// Stops speech immediately and fails any pending `speak` call with `SpeakError.canceled`.
    public func stop() {
        self.currentToken = UUID()
        self.watchdog?.cancel()
        self.watchdog = nil
        self.synth.stopSpeaking(at: .immediate)
        self.finishCurrent(with: SpeakError.canceled)
    }

    /// Speaks `text` and returns once the utterance has finished.
    ///
    /// - Parameters:
    ///   - text: Text to speak; whitespace-only input returns immediately.
    ///   - language: Optional language passed to `AVSpeechSynthesisVoice(language:)`;
    ///     the utterance keeps its default voice when no voice is found.
    /// - Throws: `SpeakError.canceled` when stopped, superseded or cancelled; an
    ///   `NSError` with code 408 when the watchdog expires first.
    public func speak(text: String, language: String? = nil) async throws {
        let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines)
        guard !trimmed.isEmpty else { return }

        // Ends whatever is currently playing before starting the new utterance.
        self.stop()
        let token = UUID()
        self.currentToken = token

        let utterance = AVSpeechUtterance(string: trimmed)
        if let language, let voice = AVSpeechSynthesisVoice(language: language) {
            utterance.voice = voice
        }
        self.currentUtterance = utterance

        // Watchdog budget: 0.08 s per character, clamped to 3...180 seconds.
        let estimatedSeconds = max(3.0, min(180.0, Double(trimmed.count) * 0.08))
        self.watchdog?.cancel()
        self.watchdog = Task { @MainActor [weak self] in
            guard let self else { return }
            try? await Task.sleep(nanoseconds: UInt64(estimatedSeconds * 1_000_000_000))
            if Task.isCancelled { return }
            guard self.currentToken == token else { return }
            if self.synth.isSpeaking {
                self.synth.stopSpeaking(at: .immediate)
            }
            self.finishCurrent(
                with: NSError(domain: "TalkSystemSpeechSynthesizer", code: 408, userInfo: [
                    NSLocalizedDescriptionKey: "system TTS timed out after \(estimatedSeconds)s",
                ]))
        }

        try await withTaskCancellationHandler(operation: {
            try await withCheckedThrowingContinuation { cont in
                self.speakContinuation = cont
                self.synth.speak(utterance)
            }
        }, onCancel: {
            Task { @MainActor in
                // This hop runs later; if another speak()/stop() has rotated the
                // token in the meantime, this call no longer owns the synthesizer
                // and must not interrupt the newer utterance.
                guard self.currentToken == token else { return }
                self.stop()
            }
        })

        if self.currentToken != token {
            throw SpeakError.canceled
        }
    }

    /// Handles a delegate completion for `utterance`.
    ///
    /// Callbacks arrive asynchronously, so one for a superseded utterance can
    /// land after a new `speak` call has installed its own state. Only the
    /// current utterance may resolve the pending continuation.
    private func handleFinish(of utterance: AVSpeechUtterance, error: Error?) {
        guard self.currentUtterance === utterance else { return }
        self.watchdog?.cancel()
        self.watchdog = nil
        self.finishCurrent(with: error)
    }

    /// Clears the current utterance and resumes the pending continuation, if any.
    private func finishCurrent(with error: Error?) {
        self.currentUtterance = nil
        // Detach the continuation before resuming so it can never be resumed twice.
        let cont = self.speakContinuation
        self.speakContinuation = nil
        if let error {
            cont?.resume(throwing: error)
        } else {
            cont?.resume(returning: ())
        }
    }
}

extension TalkSystemSpeechSynthesizer: AVSpeechSynthesizerDelegate {
    public nonisolated func speechSynthesizer(_ synthesizer: AVSpeechSynthesizer, didFinish utterance: AVSpeechUtterance) {
        let finished = TalkSpeechUtteranceBox(utterance: utterance)
        Task { @MainActor in
            self.handleFinish(of: finished.utterance, error: nil)
        }
    }

    public nonisolated func speechSynthesizer(_ synthesizer: AVSpeechSynthesizer, didCancel utterance: AVSpeechUtterance) {
        let cancelled = TalkSpeechUtteranceBox(utterance: utterance)
        Task { @MainActor in
            self.handleFinish(of: cancelled.utterance, error: SpeakError.canceled)
        }
    }
}

/// Carries an utterance from a delegate callback onto the main actor.
///
/// `@unchecked Sendable` is acceptable here because the boxed object is never
/// read or mutated off the main actor; it is only compared by identity. Holding
/// the object (rather than an `ObjectIdentifier`) keeps it alive until the
/// comparison, so its address cannot be reused by a newer utterance.
private struct TalkSpeechUtteranceBox: @unchecked Sendable {
    let utterance: AVSpeechUtterance
}
|
||||
@@ -0,0 +1,27 @@
|
||||
/// Range checks for TTS parameters; every helper returns `nil` for unusable input.
public enum TalkTTSValidation: Sendable {
    /// Resolves a speed multiplier from either a words-per-minute rate or an explicit speed.
    ///
    /// A positive `rateWPM` takes precedence and is converted as `rateWPM / 175`
    /// (so 175 WPM maps to 1.0); `speed` is only consulted when no positive rate
    /// is given. The result must lie strictly between 0.5 and 2.0.
    ///
    /// - Returns: The accepted multiplier, or `nil` when nothing usable was
    ///   supplied, the value is out of range, or the value is NaN.
    public static func resolveSpeed(speed: Double?, rateWPM: Int?) -> Double? {
        if let rateWPM, rateWPM > 0 {
            return Self.acceptedSpeed(Double(rateWPM) / 175.0)
        }
        return speed.flatMap(Self.acceptedSpeed)
    }

    /// Returns `value` when it lies in `0...1`; `nil` otherwise (including NaN).
    public static func validatedUnit(_ value: Double?) -> Double? {
        // A range check (rather than `< 0 || > 1`) also rejects NaN, for which
        // every comparison is false.
        guard let value, (0.0...1.0).contains(value) else { return nil }
        return value
    }

    /// Returns `value` as `UInt32` when it fits; `nil` for negative or too-large input.
    public static func validatedSeed(_ value: Int?) -> UInt32? {
        // `init(exactly:)` avoids a hard-coded 4294967295 literal, which does not
        // fit in `Int` on platforms where `Int` is 32 bits wide.
        value.flatMap { UInt32(exactly: $0) }
    }

    /// Accepts speeds strictly inside (0.5, 2.0); written as a positive test so NaN is rejected.
    private static func acceptedSpeed(_ value: Double) -> Double? {
        value > 0.5 && value < 2.0 ? value : nil
    }
}
|
||||
|
||||
Reference in New Issue
Block a user