Files
clawdbot/apps/shared/ClawdisKit/Sources/ClawdisKit/ElevenLabsTTS.swift
2025-12-30 07:40:02 +01:00

234 lines
9.4 KiB
Swift

import Foundation
/// A single voice entry decoded from an ElevenLabs voices response.
public struct ElevenLabsVoice: Sendable {
    /// Identifier of the voice; read from the JSON key `voice_id`.
    public let voiceId: String
    /// Display name of the voice; `nil` when the JSON omits `name` or sets it to null.
    public let name: String?
}

extension ElevenLabsVoice: Decodable {
    /// Maps the Swift property names onto the JSON keys used on the wire.
    enum CodingKeys: String, CodingKey {
        case name
        case voiceId = "voice_id"
    }

    /// Decodes a voice from a keyed container: `voice_id` is required, `name` is optional.
    public init(from decoder: Decoder) throws {
        let fields = try decoder.container(keyedBy: CodingKeys.self)
        self.voiceId = try fields.decode(String.self, forKey: .voiceId)
        self.name = try fields.decodeIfPresent(String.self, forKey: .name)
    }
}
/// Parameters for one text-to-speech synthesis call.
///
/// Only `text` is mandatory. Every other field is optional and is left out of the
/// request payload when it is `nil`.
public struct ElevenLabsTTSRequest: Sendable {
    // MARK: Top-level payload fields

    /// The text to synthesize; sent as `text`.
    public var text: String
    /// Model identifier; sent as `model_id` when non-blank.
    public var modelId: String?
    /// Requested audio format; sent as `output_format` when non-blank.
    public var outputFormat: String?

    // MARK: Voice settings (sent nested under `voice_settings`)

    /// Sent as `voice_settings.speed`.
    public var speed: Double?
    /// Sent as `voice_settings.stability`.
    public var stability: Double?
    /// Sent as `voice_settings.similarity_boost`.
    public var similarity: Double?
    /// Sent as `voice_settings.style`.
    public var style: Double?
    /// Sent as `voice_settings.use_speaker_boost`.
    public var speakerBoost: Bool?

    // MARK: Remaining top-level payload fields

    /// Sent as `seed`.
    public var seed: UInt32?
    /// Sent as `apply_text_normalization`.
    public var normalize: String?
    /// Sent as `language_code`.
    public var language: String?

    /// Creates a request; everything except `text` defaults to `nil`.
    public init(
        text: String,
        modelId: String? = nil,
        outputFormat: String? = nil,
        speed: Double? = nil,
        stability: Double? = nil,
        similarity: Double? = nil,
        style: Double? = nil,
        speakerBoost: Bool? = nil,
        seed: UInt32? = nil,
        normalize: String? = nil,
        language: String? = nil
    ) {
        // Top-level fields.
        self.text = text
        self.modelId = modelId
        self.outputFormat = outputFormat
        self.seed = seed
        self.normalize = normalize
        self.language = language

        // Voice settings.
        self.speed = speed
        self.stability = stability
        self.similarity = similarity
        self.style = style
        self.speakerBoost = speakerBoost
    }
}
/// Small client for the ElevenLabs HTTP API: text-to-speech synthesis and voice listing.
///
/// Requests are sent through `URLSession.shared` and authenticated with the
/// `xi-api-key` header. Failures are reported as `NSError` in the `ElevenLabsTTS`
/// domain (HTTP status as the code where one exists) or as the underlying transport error.
public struct ElevenLabsTTSClient: Sendable {
    public var apiKey: String
    /// Per-request timeout applied to each synthesis attempt.
    public var requestTimeoutSeconds: TimeInterval
    /// Timeout applied to the voice-listing request.
    public var listVoicesTimeoutSeconds: TimeInterval
    public var baseUrl: URL

    /// Error domain used for every `NSError` this client creates.
    static let errorDomain = "ElevenLabsTTS"
    /// Back-off before retry N (index = zero-based attempt that just failed).
    /// The number of entries is also the maximum number of attempts.
    static let retryBackoffSeconds: [TimeInterval] = [0.25, 0.75, 1.5]
    /// Upper bound on how long a server-supplied `Retry-After` is honoured.
    static let maxRetryDelaySeconds: TimeInterval = 30
    /// Upper bound for any sleep (one year); keeps the nanosecond conversion far from overflow.
    static let maxSleepSeconds: TimeInterval = 31_536_000

    public init(
        apiKey: String,
        requestTimeoutSeconds: TimeInterval = 45,
        listVoicesTimeoutSeconds: TimeInterval = 15,
        baseUrl: URL = URL(string: "https://api.elevenlabs.io")!)
    {
        self.apiKey = apiKey
        self.requestTimeoutSeconds = requestTimeoutSeconds
        self.listVoicesTimeoutSeconds = listVoicesTimeoutSeconds
        self.baseUrl = baseUrl
    }

    // MARK: - Synthesis

    /// Runs `synthesize` but gives up after `hardTimeoutSeconds`, regardless of retries.
    ///
    /// The synthesis and a timer race in a task group; whichever finishes first wins and
    /// the other is cancelled.
    ///
    /// - Parameters:
    ///   - voiceId: Voice to synthesize with.
    ///   - request: Synthesis parameters.
    ///   - hardTimeoutSeconds: Overall deadline. Non-positive or NaN values time out
    ///     immediately; very large or infinite values are clamped instead of trapping.
    /// - Returns: The audio bytes returned by the API.
    /// - Throws: An `NSError` with code 408 on timeout, or whatever `synthesize` throws.
    public func synthesizeWithHardTimeout(
        voiceId: String,
        request: ElevenLabsTTSRequest,
        hardTimeoutSeconds: TimeInterval) async throws -> Data
    {
        let timeoutNanoseconds = Self.sleepNanoseconds(forSeconds: hardTimeoutSeconds)
        return try await withThrowingTaskGroup(of: Data.self) { group in
            group.addTask {
                try await self.synthesize(voiceId: voiceId, request: request)
            }
            group.addTask {
                try await Task.sleep(nanoseconds: timeoutNanoseconds)
                throw NSError(domain: Self.errorDomain, code: 408, userInfo: [
                    NSLocalizedDescriptionKey: "ElevenLabs TTS timed out after \(hardTimeoutSeconds)s",
                ])
            }
            // The first child to finish decides the outcome. If it threw, the error
            // propagates and the group cancels the remaining child on its own.
            guard let data = try await group.next() else {
                throw NSError(domain: Self.errorDomain, code: 1, userInfo: [
                    NSLocalizedDescriptionKey: "ElevenLabs failed",
                ])
            }
            group.cancelAll()
            return data
        }
    }

    /// Synthesizes speech for `request` using `voiceId` and returns the audio bytes.
    ///
    /// Makes up to three attempts. Only transport errors, HTTP 429 and HTTP 5xx are
    /// retried; other HTTP errors and non-audio responses fail immediately. Cancellation
    /// of the calling task stops the retry loop.
    ///
    /// - Throws: An `NSError` in the `ElevenLabsTTS` domain (HTTP status as code, or 415
    ///   for a non-audio body), the last transport error, or `CancellationError`.
    public func synthesize(voiceId: String, request: ElevenLabsTTSRequest) async throws -> Data {
        var url = self.baseUrl
        url.appendPathComponent("v1")
        url.appendPathComponent("text-to-speech")
        url.appendPathComponent(voiceId)

        let body = try JSONSerialization.data(withJSONObject: Self.buildPayload(request), options: [])

        // The request is identical for every attempt, so build it once.
        var req = URLRequest(url: url)
        req.httpMethod = "POST"
        req.httpBody = body
        req.timeoutInterval = self.requestTimeoutSeconds
        req.setValue("application/json", forHTTPHeaderField: "Content-Type")
        req.setValue("audio/mpeg", forHTTPHeaderField: "Accept")
        req.setValue(self.apiKey, forHTTPHeaderField: "xi-api-key")

        let maxAttempts = Self.retryBackoffSeconds.count
        for attempt in 0..<maxAttempts {
            let isLastAttempt = attempt == maxAttempts - 1

            // Only the network call itself sits inside the do/catch, so errors we raise
            // for HTTP statuses below are never mistaken for retryable transport errors.
            let result: (Data, URLResponse)
            do {
                result = try await URLSession.shared.data(for: req)
            } catch {
                if isLastAttempt || Task.isCancelled || Self.isCancellation(error) {
                    throw error
                }
                let delay = Self.retryDelaySeconds(attempt: attempt, retryAfterHeader: nil)
                try await Task.sleep(nanoseconds: Self.sleepNanoseconds(forSeconds: delay))
                continue
            }

            let (data, response) = result
            guard let http = response as? HTTPURLResponse else {
                // Non-HTTP response: nothing to validate, hand the bytes back as-is.
                return data
            }

            let status = http.statusCode
            let contentType = (http.value(forHTTPHeaderField: "Content-Type") ?? "unknown").lowercased()

            if status == 429 || status >= 500 {
                if isLastAttempt {
                    let message = Self.truncatedErrorBody(data)
                    throw NSError(domain: Self.errorDomain, code: status, userInfo: [
                        NSLocalizedDescriptionKey: "ElevenLabs retryable failure: \(status) ct=\(contentType) \(message)",
                    ])
                }
                let delay = Self.retryDelaySeconds(
                    attempt: attempt,
                    retryAfterHeader: http.value(forHTTPHeaderField: "Retry-After"))
                // `try` (not `try?`): cancellation during back-off must end the loop.
                try await Task.sleep(nanoseconds: Self.sleepNanoseconds(forSeconds: delay))
                continue
            }

            if status >= 400 {
                let message = Self.truncatedErrorBody(data)
                throw NSError(domain: Self.errorDomain, code: status, userInfo: [
                    NSLocalizedDescriptionKey: "ElevenLabs failed: \(status) ct=\(contentType) \(message)",
                ])
            }

            if !contentType.contains("audio") {
                let message = Self.truncatedErrorBody(data)
                throw NSError(domain: Self.errorDomain, code: 415, userInfo: [
                    NSLocalizedDescriptionKey: "ElevenLabs returned non-audio ct=\(contentType) \(message)",
                ])
            }

            return data
        }

        // Every iteration returns, throws or continues, and the last one cannot continue;
        // this only satisfies the compiler.
        throw NSError(domain: Self.errorDomain, code: 1, userInfo: [
            NSLocalizedDescriptionKey: "ElevenLabs failed",
        ])
    }

    // MARK: - Voices

    /// Fetches the voices available to this API key from `v1/voices`.
    ///
    /// - Throws: An `NSError` carrying the HTTP status for responses >= 400, a transport
    ///   error from `URLSession`, or a `DecodingError` if the body is not the expected JSON.
    public func listVoices() async throws -> [ElevenLabsVoice] {
        var url = self.baseUrl
        url.appendPathComponent("v1")
        url.appendPathComponent("voices")

        var req = URLRequest(url: url)
        req.httpMethod = "GET"
        req.timeoutInterval = self.listVoicesTimeoutSeconds
        req.setValue(self.apiKey, forHTTPHeaderField: "xi-api-key")

        let (data, response) = try await URLSession.shared.data(for: req)
        if let http = response as? HTTPURLResponse, http.statusCode >= 400 {
            let message = Self.truncatedErrorBody(data)
            throw NSError(domain: Self.errorDomain, code: http.statusCode, userInfo: [
                NSLocalizedDescriptionKey: "ElevenLabs voices failed: \(http.statusCode) \(message)",
            ])
        }

        struct VoicesResponse: Decodable { let voices: [ElevenLabsVoice] }
        return try JSONDecoder().decode(VoicesResponse.self, from: data).voices
    }

    // MARK: - Input validation

    /// Returns the trimmed format if it is non-empty and starts with `mp3_`, else `nil`.
    public static func validatedOutputFormat(_ value: String?) -> String? {
        let trimmed = (value ?? "").trimmingCharacters(in: .whitespacesAndNewlines)
        guard !trimmed.isEmpty else { return nil }
        guard trimmed.hasPrefix("mp3_") else { return nil }
        return trimmed
    }

    /// Returns the trimmed, lowercased value if it is exactly two ASCII letters, else `nil`.
    public static func validatedLanguage(_ value: String?) -> String? {
        let normalized = (value ?? "").trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
        // `isASCII` is required: a bare `"a"..."z"` Character comparison also admits
        // grapheme clusters such as "i" + combining mark, which sort inside that range.
        guard normalized.count == 2,
              normalized.allSatisfy({ $0.isASCII && $0 >= "a" && $0 <= "z" })
        else { return nil }
        return normalized
    }

    /// Returns the trimmed, lowercased value if it is `auto`, `on` or `off`, else `nil`.
    public static func validatedNormalize(_ value: String?) -> String? {
        let normalized = (value ?? "").trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
        guard ["auto", "on", "off"].contains(normalized) else { return nil }
        return normalized
    }

    // MARK: - Retry / timing helpers

    /// Delay before the next attempt: the scheduled back-off for `attempt`, or the numeric
    /// `Retry-After` value when that is larger, capped at `maxRetryDelaySeconds`.
    ///
    /// Non-numeric headers (for example an HTTP date), NaN and negative values fall back
    /// to the scheduled back-off. Out-of-range `attempt` values are clamped to the schedule.
    static func retryDelaySeconds(attempt: Int, retryAfterHeader: String?) -> TimeInterval {
        let schedule = Self.retryBackoffSeconds
        let baseDelay = schedule[min(max(attempt, 0), schedule.count - 1)]
        let headerText = (retryAfterHeader ?? "").trimmingCharacters(in: .whitespacesAndNewlines)
        // NaN compares false against everything, so it is rejected here as well.
        guard let requested = Double(headerText), requested > baseDelay else { return baseDelay }
        return min(requested, Self.maxRetryDelaySeconds)
    }

    /// Converts seconds to nanoseconds for `Task.sleep` without trapping.
    ///
    /// Non-positive and NaN inputs give 0; values above `maxSleepSeconds` (including
    /// infinity) are clamped so the `UInt64` conversion cannot overflow.
    static func sleepNanoseconds(forSeconds seconds: TimeInterval) -> UInt64 {
        guard seconds > 0 else { return 0 }
        return UInt64(min(seconds, Self.maxSleepSeconds) * 1_000_000_000)
    }

    /// Whether `error` represents task or request cancellation rather than a real failure.
    static func isCancellation(_ error: Error) -> Bool {
        if error is CancellationError { return true }
        if let urlError = error as? URLError, urlError.code == .cancelled { return true }
        return false
    }

    // MARK: - Payload / error formatting

    /// Builds the JSON object for a synthesis request, omitting unset or blank fields.
    private static func buildPayload(_ request: ElevenLabsTTSRequest) -> [String: Any] {
        var payload: [String: Any] = ["text": request.text]
        if let modelId = request.modelId?.trimmingCharacters(in: .whitespacesAndNewlines), !modelId.isEmpty {
            payload["model_id"] = modelId
        }
        // NOTE(review): the public API reference appears to list `output_format` as a URL
        // query parameter rather than a body field — confirm it is honoured when sent here.
        if let outputFormat = request.outputFormat?.trimmingCharacters(in: .whitespacesAndNewlines), !outputFormat.isEmpty {
            payload["output_format"] = outputFormat
        }
        if let seed = request.seed {
            payload["seed"] = seed
        }
        if let normalize = request.normalize {
            payload["apply_text_normalization"] = normalize
        }
        if let language = request.language {
            payload["language_code"] = language
        }

        var voiceSettings: [String: Any] = [:]
        if let speed = request.speed { voiceSettings["speed"] = speed }
        if let stability = request.stability { voiceSettings["stability"] = stability }
        if let similarity = request.similarity { voiceSettings["similarity_boost"] = similarity }
        if let style = request.style { voiceSettings["style"] = style }
        if let speakerBoost = request.speakerBoost { voiceSettings["use_speaker_boost"] = speakerBoost }
        if !voiceSettings.isEmpty {
            payload["voice_settings"] = voiceSettings
        }
        return payload
    }

    /// Returns at most the first 4096 bytes of `data` as single-line UTF-8 text for use in
    /// error messages, or `"unknown"` when the bytes are not valid UTF-8.
    private static func truncatedErrorBody(_ data: Data) -> String {
        let limit = 4096
        var decoded: String?
        if data.count <= limit {
            decoded = String(data: data, encoding: .utf8)
        } else {
            // The cut may land inside a multi-byte character. A UTF-8 sequence is at most
            // 4 bytes, so dropping up to 3 trailing bytes is enough to reach a boundary.
            let head = data.prefix(limit)
            for dropped in 0...3 {
                if let text = String(data: head.dropLast(dropped), encoding: .utf8) {
                    decoded = text
                    break
                }
            }
        }
        let raw = decoded ?? "unknown"
        return raw.replacingOccurrences(of: "\n", with: " ").replacingOccurrences(of: "\r", with: " ")
    }
}