refactor: extract elevenlabs kit
@@ -12,10 +12,15 @@ let package = Package(
        .library(name: "ClawdisKit", targets: ["ClawdisKit"]),
        .library(name: "ClawdisChatUI", targets: ["ClawdisChatUI"]),
    ],
    dependencies: [
        .package(path: "../../../ElevenLabsKit"),
    ],
    targets: [
        .target(
            name: "ClawdisKit",
            dependencies: [],
            dependencies: [
                .product(name: "ElevenLabsKit", package: "ElevenLabsKit"),
            ],
            resources: [
                .process("Resources"),
            ],
@@ -0,0 +1,9 @@
@_exported import ElevenLabsKit

public typealias ElevenLabsVoice = ElevenLabsKit.ElevenLabsVoice
public typealias ElevenLabsTTSRequest = ElevenLabsKit.ElevenLabsTTSRequest
public typealias ElevenLabsTTSClient = ElevenLabsKit.ElevenLabsTTSClient
public typealias TalkTTSValidation = ElevenLabsKit.TalkTTSValidation
public typealias StreamingAudioPlayer = ElevenLabsKit.StreamingAudioPlayer
public typealias PCMStreamingAudioPlayer = ElevenLabsKit.PCMStreamingAudioPlayer
public typealias StreamingPlaybackResult = ElevenLabsKit.StreamingPlaybackResult
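Not part of the diff: a minimal sketch of what the shim above preserves for downstream code. Because of the @_exported import and the typealiases, files that only import ClawdisKit should keep resolving the moved types; the key and text below are placeholder values.

import ClawdisKit

// Resolves to ElevenLabsKit.ElevenLabsTTSClient via the re-export shim above.
let client = ElevenLabsTTSClient(apiKey: "PLACEHOLDER_API_KEY")
let request = ElevenLabsTTSRequest(text: "Call sites compile unchanged after the extraction.")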
@@ -1,331 +0,0 @@
import Foundation

public struct ElevenLabsVoice: Decodable, Sendable {
    public let voiceId: String
    public let name: String?

    enum CodingKeys: String, CodingKey {
        case voiceId = "voice_id"
        case name
    }
}

public struct ElevenLabsTTSRequest: Sendable {
    public var text: String
    public var modelId: String?
    public var outputFormat: String?
    public var speed: Double?
    public var stability: Double?
    public var similarity: Double?
    public var style: Double?
    public var speakerBoost: Bool?
    public var seed: UInt32?
    public var normalize: String?
    public var language: String?
    public var latencyTier: Int?

    public init(
        text: String,
        modelId: String? = nil,
        outputFormat: String? = nil,
        speed: Double? = nil,
        stability: Double? = nil,
        similarity: Double? = nil,
        style: Double? = nil,
        speakerBoost: Bool? = nil,
        seed: UInt32? = nil,
        normalize: String? = nil,
        language: String? = nil,
        latencyTier: Int? = nil)
    {
        self.text = text
        self.modelId = modelId
        self.outputFormat = outputFormat
        self.speed = speed
        self.stability = stability
        self.similarity = similarity
        self.style = style
        self.speakerBoost = speakerBoost
        self.seed = seed
        self.normalize = normalize
        self.language = language
        self.latencyTier = latencyTier
    }
}

public struct ElevenLabsTTSClient: Sendable {
    public var apiKey: String
    public var requestTimeoutSeconds: TimeInterval
    public var listVoicesTimeoutSeconds: TimeInterval
    public var baseUrl: URL

    public init(
        apiKey: String,
        requestTimeoutSeconds: TimeInterval = 45,
        listVoicesTimeoutSeconds: TimeInterval = 15,
        baseUrl: URL = URL(string: "https://api.elevenlabs.io")!)
    {
        self.apiKey = apiKey
        self.requestTimeoutSeconds = requestTimeoutSeconds
        self.listVoicesTimeoutSeconds = listVoicesTimeoutSeconds
        self.baseUrl = baseUrl
    }

    public func synthesizeWithHardTimeout(
        voiceId: String,
        request: ElevenLabsTTSRequest,
        hardTimeoutSeconds: TimeInterval) async throws -> Data
    {
        try await withThrowingTaskGroup(of: Data.self) { group in
            group.addTask {
                try await self.synthesize(voiceId: voiceId, request: request)
            }
            group.addTask {
                try await Task.sleep(nanoseconds: UInt64(hardTimeoutSeconds * 1_000_000_000))
                throw NSError(domain: "ElevenLabsTTS", code: 408, userInfo: [
                    NSLocalizedDescriptionKey: "ElevenLabs TTS timed out after \(hardTimeoutSeconds)s",
                ])
            }
            let data = try await group.next()!
            group.cancelAll()
            return data
        }
    }

    public func synthesize(voiceId: String, request: ElevenLabsTTSRequest) async throws -> Data {
        var url = self.baseUrl
        url.appendPathComponent("v1")
        url.appendPathComponent("text-to-speech")
        url.appendPathComponent(voiceId)

        let body = try JSONSerialization.data(withJSONObject: Self.buildPayload(request), options: [])

        var lastError: Error?
        for attempt in 0..<3 {
            var req = URLRequest(url: url)
            req.httpMethod = "POST"
            req.httpBody = body
            req.timeoutInterval = self.requestTimeoutSeconds
            req.setValue("application/json", forHTTPHeaderField: "Content-Type")
            req.setValue("audio/mpeg", forHTTPHeaderField: "Accept")
            req.setValue(self.apiKey, forHTTPHeaderField: "xi-api-key")

            do {
                let (data, response) = try await URLSession.shared.data(for: req)
                if let http = response as? HTTPURLResponse {
                    let contentType = (http.value(forHTTPHeaderField: "Content-Type") ?? "unknown").lowercased()
                    if http.statusCode == 429 || http.statusCode >= 500 {
                        let message = Self.truncatedErrorBody(data)
                        lastError = NSError(domain: "ElevenLabsTTS", code: http.statusCode, userInfo: [
                            NSLocalizedDescriptionKey: "ElevenLabs retryable failure: \(http.statusCode) ct=\(contentType) \(message)",
                        ])
                        if attempt < 2 {
                            let retryAfter = Double(http.value(forHTTPHeaderField: "Retry-After") ?? "")
                            let baseDelay = [0.25, 0.75, 1.5][attempt]
                            let delaySeconds = max(baseDelay, retryAfter ?? 0)
                            try? await Task.sleep(nanoseconds: UInt64(delaySeconds * 1_000_000_000))
                            continue
                        }
                        throw lastError!
                    }

                    if http.statusCode >= 400 {
                        let message = Self.truncatedErrorBody(data)
                        throw NSError(domain: "ElevenLabsTTS", code: http.statusCode, userInfo: [
                            NSLocalizedDescriptionKey: "ElevenLabs failed: \(http.statusCode) ct=\(contentType) \(message)",
                        ])
                    }

                    if !contentType.contains("audio") {
                        let message = Self.truncatedErrorBody(data)
                        throw NSError(domain: "ElevenLabsTTS", code: 415, userInfo: [
                            NSLocalizedDescriptionKey: "ElevenLabs returned non-audio ct=\(contentType) \(message)",
                        ])
                    }
                }
                return data
            } catch {
                lastError = error
                if attempt < 2 {
                    try? await Task.sleep(nanoseconds: UInt64([0.25, 0.75, 1.5][attempt] * 1_000_000_000))
                    continue
                }
                throw error
            }
        }
        throw lastError ?? NSError(domain: "ElevenLabsTTS", code: 1, userInfo: [
            NSLocalizedDescriptionKey: "ElevenLabs failed",
        ])
    }

    public func streamSynthesize(
        voiceId: String,
        request: ElevenLabsTTSRequest) -> AsyncThrowingStream<Data, Error>
    {
        AsyncThrowingStream { continuation in
            let task = Task {
                do {
                    let url = Self.streamingURL(
                        baseUrl: self.baseUrl,
                        voiceId: voiceId,
                        latencyTier: request.latencyTier)
                    let body = try JSONSerialization.data(withJSONObject: Self.buildPayload(request), options: [])

                    var req = URLRequest(url: url)
                    req.httpMethod = "POST"
                    req.httpBody = body
                    req.timeoutInterval = self.requestTimeoutSeconds
                    req.setValue("application/json", forHTTPHeaderField: "Content-Type")
                    req.setValue("audio/mpeg", forHTTPHeaderField: "Accept")
                    req.setValue(self.apiKey, forHTTPHeaderField: "xi-api-key")

                    let (bytes, response) = try await URLSession.shared.bytes(for: req)
                    guard let http = response as? HTTPURLResponse else {
                        throw NSError(domain: "ElevenLabsTTS", code: 1, userInfo: [
                            NSLocalizedDescriptionKey: "ElevenLabs invalid response",
                        ])
                    }

                    let contentType = (http.value(forHTTPHeaderField: "Content-Type") ?? "unknown").lowercased()
                    if http.statusCode >= 400 {
                        let message = try await Self.readErrorBody(bytes: bytes)
                        throw NSError(domain: "ElevenLabsTTS", code: http.statusCode, userInfo: [
                            NSLocalizedDescriptionKey: "ElevenLabs failed: \(http.statusCode) ct=\(contentType) \(message)",
                        ])
                    }
                    if !contentType.contains("audio") {
                        let message = try await Self.readErrorBody(bytes: bytes)
                        throw NSError(domain: "ElevenLabsTTS", code: 415, userInfo: [
                            NSLocalizedDescriptionKey: "ElevenLabs returned non-audio ct=\(contentType) \(message)",
                        ])
                    }

                    var buffer = Data()
                    buffer.reserveCapacity(16_384)
                    for try await byte in bytes {
                        buffer.append(byte)
                        if buffer.count >= 8_192 {
                            continuation.yield(buffer)
                            buffer.removeAll(keepingCapacity: true)
                        }
                    }
                    if !buffer.isEmpty {
                        continuation.yield(buffer)
                    }
                    continuation.finish()
                } catch {
                    continuation.finish(throwing: error)
                }
            }

            continuation.onTermination = { _ in
                task.cancel()
            }
        }
    }

    public func listVoices() async throws -> [ElevenLabsVoice] {
        var url = self.baseUrl
        url.appendPathComponent("v1")
        url.appendPathComponent("voices")

        var req = URLRequest(url: url)
        req.httpMethod = "GET"
        req.timeoutInterval = self.listVoicesTimeoutSeconds
        req.setValue(self.apiKey, forHTTPHeaderField: "xi-api-key")

        let (data, response) = try await URLSession.shared.data(for: req)
        if let http = response as? HTTPURLResponse, http.statusCode >= 400 {
            let message = Self.truncatedErrorBody(data)
            throw NSError(domain: "ElevenLabsTTS", code: http.statusCode, userInfo: [
                NSLocalizedDescriptionKey: "ElevenLabs voices failed: \(http.statusCode) \(message)",
            ])
        }

        struct VoicesResponse: Decodable { let voices: [ElevenLabsVoice] }
        return try JSONDecoder().decode(VoicesResponse.self, from: data).voices
    }

    public static func validatedOutputFormat(_ value: String?) -> String? {
        let trimmed = (value ?? "").trimmingCharacters(in: .whitespacesAndNewlines)
        guard !trimmed.isEmpty else { return nil }
        guard trimmed.hasPrefix("mp3_") || trimmed.hasPrefix("pcm_") else { return nil }
        return trimmed
    }

    public static func validatedLanguage(_ value: String?) -> String? {
        let normalized = (value ?? "").trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
        guard normalized.count == 2, normalized.allSatisfy({ $0 >= "a" && $0 <= "z" }) else { return nil }
        return normalized
    }

    public static func validatedNormalize(_ value: String?) -> String? {
        let normalized = (value ?? "").trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
        guard ["auto", "on", "off"].contains(normalized) else { return nil }
        return normalized
    }

    private static func buildPayload(_ request: ElevenLabsTTSRequest) -> [String: Any] {
        var payload: [String: Any] = ["text": request.text]
        if let modelId = request.modelId?.trimmingCharacters(in: .whitespacesAndNewlines), !modelId.isEmpty {
            payload["model_id"] = modelId
        }
        if let outputFormat = request.outputFormat?.trimmingCharacters(in: .whitespacesAndNewlines), !outputFormat.isEmpty {
            payload["output_format"] = outputFormat
        }
        if let seed = request.seed {
            payload["seed"] = seed
        }
        if let normalize = request.normalize {
            payload["apply_text_normalization"] = normalize
        }
        if let language = request.language {
            payload["language_code"] = language
        }

        var voiceSettings: [String: Any] = [:]
        if let speed = request.speed { voiceSettings["speed"] = speed }
        if let stability = request.stability { voiceSettings["stability"] = stability }
        if let similarity = request.similarity { voiceSettings["similarity_boost"] = similarity }
        if let style = request.style { voiceSettings["style"] = style }
        if let speakerBoost = request.speakerBoost { voiceSettings["use_speaker_boost"] = speakerBoost }
        if !voiceSettings.isEmpty {
            payload["voice_settings"] = voiceSettings
        }
        return payload
    }

    private static func truncatedErrorBody(_ data: Data) -> String {
        let raw = String(data: data.prefix(4096), encoding: .utf8) ?? "unknown"
        return raw.replacingOccurrences(of: "\n", with: " ").replacingOccurrences(of: "\r", with: " ")
    }

    private static func streamingURL(baseUrl: URL, voiceId: String, latencyTier: Int?) -> URL {
        var url = baseUrl
        url.appendPathComponent("v1")
        url.appendPathComponent("text-to-speech")
        url.appendPathComponent(voiceId)
        url.appendPathComponent("stream")

        guard let latencyTier else { return url }
        let latencyItem = URLQueryItem(
            name: "optimize_streaming_latency",
            value: "\(latencyTier)")
        guard var components = URLComponents(url: url, resolvingAgainstBaseURL: false) else {
            return url
        }
        var items = components.queryItems ?? []
        items.append(latencyItem)
        components.queryItems = items
        return components.url ?? url
    }

    private static func readErrorBody(bytes: URLSession.AsyncBytes) async throws -> String {
        var data = Data()
        for try await byte in bytes {
            data.append(byte)
            if data.count >= 4096 { break }
        }
        return truncatedErrorBody(data)
    }
}
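The deleted client above moves to ElevenLabsKit as-is; for orientation only, a minimal calling sketch under that assumption. The API key, voice ID, and output path are placeholders, and the call must run in an async context.

import ElevenLabsKit
import Foundation

func demoSynthesis() async throws {
    let client = ElevenLabsTTSClient(apiKey: "PLACEHOLDER_API_KEY")
    var request = ElevenLabsTTSRequest(text: "Hello from the extracted kit")
    // validatedOutputFormat only passes mp3_* / pcm_* presets through, otherwise nil.
    request.outputFormat = ElevenLabsTTSClient.validatedOutputFormat("mp3_44100_128")
    // The hard timeout races the retrying synthesize call against a sleeping task.
    let audio = try await client.synthesizeWithHardTimeout(
        voiceId: "PLACEHOLDER_VOICE_ID",
        request: request,
        hardTimeoutSeconds: 60)
    try audio.write(to: URL(fileURLWithPath: "/tmp/tts-sample.mp3"))
}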
@@ -1,145 +0,0 @@
import AVFoundation
import Foundation
import OSLog

@MainActor
public final class PCMStreamingAudioPlayer {
    public static let shared = PCMStreamingAudioPlayer()

    private let logger = Logger(subsystem: "com.steipete.clawdis", category: "talk.tts.pcm")
    private var engine = AVAudioEngine()
    private var player = AVAudioPlayerNode()
    private var format: AVAudioFormat?
    private var pendingBuffers: Int = 0
    private var inputFinished = false
    private var continuation: CheckedContinuation<StreamingPlaybackResult, Never>?

    public init() {
        self.engine.attach(self.player)
    }

    public func play(stream: AsyncThrowingStream<Data, Error>, sampleRate: Double) async -> StreamingPlaybackResult {
        self.stopInternal()

        let format = AVAudioFormat(
            commonFormat: .pcmFormatInt16,
            sampleRate: sampleRate,
            channels: 1,
            interleaved: true)

        guard let format else {
            return StreamingPlaybackResult(finished: false, interruptedAt: nil)
        }
        self.configure(format: format)

        return await withCheckedContinuation { continuation in
            self.continuation = continuation
            self.pendingBuffers = 0
            self.inputFinished = false

            Task.detached { [weak self] in
                guard let self else { return }
                do {
                    for try await chunk in stream {
                        await self.enqueuePCM(chunk, format: format)
                    }
                    await self.finishInput()
                } catch {
                    await self.fail(error)
                }
            }
        }
    }

    public func stop() -> Double? {
        let interruptedAt = self.currentTimeSeconds()
        self.stopInternal()
        self.finish(StreamingPlaybackResult(finished: false, interruptedAt: interruptedAt))
        return interruptedAt
    }

    private func configure(format: AVAudioFormat) {
        if self.format?.sampleRate != format.sampleRate || self.format?.commonFormat != format.commonFormat {
            self.engine.stop()
            self.engine = AVAudioEngine()
            self.player = AVAudioPlayerNode()
            self.engine.attach(self.player)
        }
        self.format = format
        if self.engine.attachedNodes.contains(self.player) {
            self.engine.connect(self.player, to: self.engine.mainMixerNode, format: format)
        }
    }

    private func enqueuePCM(_ data: Data, format: AVAudioFormat) async {
        guard !data.isEmpty else { return }
        let frameCount = data.count / MemoryLayout<Int16>.size
        guard frameCount > 0 else { return }
        guard let buffer = AVAudioPCMBuffer(pcmFormat: format, frameCapacity: AVAudioFrameCount(frameCount)) else {
            return
        }
        buffer.frameLength = AVAudioFrameCount(frameCount)

        data.withUnsafeBytes { raw in
            guard let src = raw.baseAddress else { return }
            let audioBuffer = buffer.audioBufferList.pointee.mBuffers
            if let dst = audioBuffer.mData {
                memcpy(dst, src, frameCount * MemoryLayout<Int16>.size)
            }
        }

        self.pendingBuffers += 1
        Task.detached { [weak self] in
            guard let self else { return }
            await self.player.scheduleBuffer(buffer)
            await MainActor.run {
                self.pendingBuffers = max(0, self.pendingBuffers - 1)
                if self.inputFinished && self.pendingBuffers == 0 {
                    self.finish(StreamingPlaybackResult(finished: true, interruptedAt: nil))
                }
            }
        }

        if !self.player.isPlaying {
            do {
                try self.engine.start()
                self.player.play()
            } catch {
                self.logger.error("pcm engine start failed: \(error.localizedDescription, privacy: .public)")
                self.fail(error)
            }
        }
    }

    private func finishInput() {
        self.inputFinished = true
        if self.pendingBuffers == 0 {
            self.finish(StreamingPlaybackResult(finished: true, interruptedAt: nil))
        }
    }

    private func fail(_ error: Error) {
        self.logger.error("pcm stream failed: \(error.localizedDescription, privacy: .public)")
        self.finish(StreamingPlaybackResult(finished: false, interruptedAt: nil))
    }

    private func stopInternal() {
        self.player.stop()
        self.engine.stop()
        self.pendingBuffers = 0
        self.inputFinished = false
    }

    private func finish(_ result: StreamingPlaybackResult) {
        let continuation = self.continuation
        self.continuation = nil
        continuation?.resume(returning: result)
    }

    private func currentTimeSeconds() -> Double? {
        guard let nodeTime = self.player.lastRenderTime,
              let playerTime = self.player.playerTime(forNodeTime: nodeTime)
        else { return nil }
        return Double(playerTime.sampleTime) / playerTime.sampleRate
    }
}
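For orientation only (not in the diff): one way the deleted PCM player could be driven, pairing a pcm_* output format with streamSynthesize. Values are placeholders and a main-actor async context is assumed.

@MainActor
func demoPCMPlayback(client: ElevenLabsTTSClient) async {
    var request = ElevenLabsTTSRequest(text: "Streaming PCM sketch")
    request.outputFormat = "pcm_44100"
    // pcmSampleRate(from:) parses the trailing sample rate out of pcm_* presets.
    guard let sampleRate = TalkTTSValidation.pcmSampleRate(from: request.outputFormat) else { return }
    let stream = client.streamSynthesize(voiceId: "PLACEHOLDER_VOICE_ID", request: request)
    let result = await PCMStreamingAudioPlayer.shared.play(stream: stream, sampleRate: sampleRate)
    print("pcm playback finished:", result.finished)
}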
@@ -1,429 +0,0 @@
import AudioToolbox
import Foundation
import OSLog

public struct StreamingPlaybackResult: Sendable {
    public let finished: Bool
    public let interruptedAt: Double?

    public init(finished: Bool, interruptedAt: Double?) {
        self.finished = finished
        self.interruptedAt = interruptedAt
    }
}

@MainActor
public final class StreamingAudioPlayer: NSObject {
    public static let shared = StreamingAudioPlayer()

    private let logger = Logger(subsystem: "com.steipete.clawdis", category: "talk.tts.stream")
    private var playback: Playback?

    public func play(stream: AsyncThrowingStream<Data, Error>) async -> StreamingPlaybackResult {
        self.stopInternal()

        let playback = Playback(logger: self.logger)
        self.playback = playback

        return await withCheckedContinuation { continuation in
            playback.setContinuation(continuation)
            playback.start()

            Task.detached {
                do {
                    for try await chunk in stream {
                        playback.append(chunk)
                    }
                    playback.finishInput()
                } catch {
                    playback.fail(error)
                }
            }
        }
    }

    public func stop() -> Double? {
        guard let playback else { return nil }
        let interruptedAt = playback.stop(immediate: true)
        self.finish(playback: playback, result: StreamingPlaybackResult(finished: false, interruptedAt: interruptedAt))
        return interruptedAt
    }

    private func stopInternal() {
        guard let playback else { return }
        let interruptedAt = playback.stop(immediate: true)
        self.finish(playback: playback, result: StreamingPlaybackResult(finished: false, interruptedAt: interruptedAt))
    }

    private func finish(playback: Playback, result: StreamingPlaybackResult) {
        playback.finish(result)
        guard self.playback === playback else { return }
        self.playback = nil
    }
}

private final class Playback: @unchecked Sendable {
    private static let bufferCount: Int = 3
    private static let bufferSize: Int = 32 * 1024

    private let logger: Logger
    private let lock = NSLock()
    private let parseQueue = DispatchQueue(label: "talk.stream.parse")
    fileprivate let bufferLock = NSLock()
    fileprivate let bufferSemaphore = DispatchSemaphore(value: bufferCount)

    private var continuation: CheckedContinuation<StreamingPlaybackResult, Never>?
    private var finished = false

    private var audioFileStream: AudioFileStreamID?
    private var audioQueue: AudioQueueRef?
    fileprivate var audioFormat: AudioStreamBasicDescription?
    fileprivate var maxPacketSize: UInt32 = 0

    fileprivate var availableBuffers: [AudioQueueBufferRef] = []
    private var currentBuffer: AudioQueueBufferRef?
    private var currentBufferSize: Int = 0
    private var currentPacketDescs: [AudioStreamPacketDescription] = []

    private var isRunning = false
    fileprivate var inputFinished = false
    private var startRequested = false

    private var sampleRate: Double = 0

    init(logger: Logger) {
        self.logger = logger
    }

    func setContinuation(_ continuation: CheckedContinuation<StreamingPlaybackResult, Never>) {
        self.lock.lock()
        self.continuation = continuation
        self.lock.unlock()
    }

    func start() {
        let selfPtr = Unmanaged.passUnretained(self).toOpaque()
        let status = AudioFileStreamOpen(
            selfPtr,
            propertyListenerProc,
            packetsProc,
            kAudioFileMP3Type,
            &self.audioFileStream)
        if status != noErr {
            self.logger.error("talk stream open failed: \(status)")
            self.finish(StreamingPlaybackResult(finished: false, interruptedAt: nil))
        }
    }

    func append(_ data: Data) {
        guard !data.isEmpty else { return }
        self.parseQueue.async { [weak self] in
            guard let self else { return }
            guard let audioFileStream = self.audioFileStream else { return }
            let status = data.withUnsafeBytes { bytes in
                AudioFileStreamParseBytes(
                    audioFileStream,
                    UInt32(bytes.count),
                    bytes.baseAddress,
                    [])
            }
            if status != noErr {
                self.logger.error("talk stream parse failed: \(status)")
                self.fail(NSError(domain: "StreamingAudio", code: Int(status)))
            }
        }
    }

    func finishInput() {
        self.parseQueue.async { [weak self] in
            guard let self else { return }
            self.inputFinished = true
            if self.audioQueue == nil {
                self.finish(StreamingPlaybackResult(finished: false, interruptedAt: nil))
                return
            }
            self.enqueueCurrentBuffer(flushOnly: true)
            _ = self.stop(immediate: false)
        }
    }

    func fail(_ error: Error) {
        self.logger.error("talk stream failed: \(error.localizedDescription, privacy: .public)")
        _ = self.stop(immediate: true)
        self.finish(StreamingPlaybackResult(finished: false, interruptedAt: nil))
    }

    func stop(immediate: Bool) -> Double? {
        guard let audioQueue else { return nil }
        let interruptedAt = self.currentTimeSeconds()
        AudioQueueStop(audioQueue, immediate)
        return interruptedAt
    }

    fileprivate func finish(_ result: StreamingPlaybackResult) {
        let continuation: CheckedContinuation<StreamingPlaybackResult, Never>?
        self.lock.lock()
        if self.finished {
            continuation = nil
        } else {
            self.finished = true
            continuation = self.continuation
            self.continuation = nil
        }
        self.lock.unlock()

        continuation?.resume(returning: result)
        self.teardown()
    }

    private func teardown() {
        if let audioQueue {
            AudioQueueDispose(audioQueue, true)
            self.audioQueue = nil
        }
        if let audioFileStream {
            AudioFileStreamClose(audioFileStream)
            self.audioFileStream = nil
        }
        self.bufferLock.lock()
        self.availableBuffers.removeAll()
        self.bufferLock.unlock()
        self.currentBuffer = nil
        self.currentPacketDescs.removeAll()
    }

    fileprivate func setupQueueIfNeeded(_ asbd: AudioStreamBasicDescription) {
        guard self.audioQueue == nil else { return }

        var format = asbd
        self.audioFormat = format
        self.sampleRate = format.mSampleRate

        let selfPtr = Unmanaged.passUnretained(self).toOpaque()
        let status = AudioQueueNewOutput(
            &format,
            outputCallbackProc,
            selfPtr,
            nil,
            nil,
            0,
            &self.audioQueue)
        if status != noErr {
            self.logger.error("talk queue create failed: \(status)")
            self.finish(StreamingPlaybackResult(finished: false, interruptedAt: nil))
            return
        }

        if let audioQueue {
            AudioQueueAddPropertyListener(audioQueue, kAudioQueueProperty_IsRunning, isRunningCallbackProc, selfPtr)
        }

        if let audioFileStream {
            var cookieSize: UInt32 = 0
            var writable: DarwinBoolean = false
            let cookieStatus = AudioFileStreamGetPropertyInfo(
                audioFileStream,
                kAudioFileStreamProperty_MagicCookieData,
                &cookieSize,
                &writable)
            if cookieStatus == noErr, cookieSize > 0, let audioQueue {
                var cookie = [UInt8](repeating: 0, count: Int(cookieSize))
                let readStatus = AudioFileStreamGetProperty(
                    audioFileStream,
                    kAudioFileStreamProperty_MagicCookieData,
                    &cookieSize,
                    &cookie)
                if readStatus == noErr {
                    AudioQueueSetProperty(audioQueue, kAudioQueueProperty_MagicCookie, cookie, cookieSize)
                }
            }
        }

        if let audioQueue {
            for _ in 0..<Self.bufferCount {
                var buffer: AudioQueueBufferRef?
                let allocStatus = AudioQueueAllocateBuffer(audioQueue, UInt32(Self.bufferSize), &buffer)
                if allocStatus == noErr, let buffer {
                    self.bufferLock.lock()
                    self.availableBuffers.append(buffer)
                    self.bufferLock.unlock()
                }
            }
        }
    }

    private func enqueueCurrentBuffer(flushOnly: Bool = false) {
        guard let audioQueue, let buffer = self.currentBuffer else { return }
        guard self.currentBufferSize > 0 else { return }

        buffer.pointee.mAudioDataByteSize = UInt32(self.currentBufferSize)
        let packetCount = UInt32(self.currentPacketDescs.count)

        let status = self.currentPacketDescs.withUnsafeBufferPointer { descPtr in
            AudioQueueEnqueueBuffer(audioQueue, buffer, packetCount, descPtr.baseAddress)
        }
        if status != noErr {
            self.logger.error("talk queue enqueue failed: \(status)")
        } else {
            if !self.startRequested {
                self.startRequested = true
                let startStatus = AudioQueueStart(audioQueue, nil)
                if startStatus != noErr {
                    self.logger.error("talk queue start failed: \(startStatus)")
                }
            }
        }

        self.currentBuffer = nil
        self.currentBufferSize = 0
        self.currentPacketDescs.removeAll(keepingCapacity: true)
        if !flushOnly {
            self.bufferSemaphore.wait()
            self.bufferLock.lock()
            let next = self.availableBuffers.popLast()
            self.bufferLock.unlock()
            if let next { self.currentBuffer = next }
        }
    }

    fileprivate func handlePackets(
        numberBytes: UInt32,
        numberPackets: UInt32,
        inputData: UnsafeRawPointer,
        packetDescriptions: UnsafeMutablePointer<AudioStreamPacketDescription>?)
    {
        if self.audioQueue == nil, let format = self.audioFormat {
            self.setupQueueIfNeeded(format)
        }

        if self.audioQueue == nil {
            return
        }

        if self.currentBuffer == nil {
            self.bufferSemaphore.wait()
            self.bufferLock.lock()
            self.currentBuffer = self.availableBuffers.popLast()
            self.bufferLock.unlock()
            self.currentBufferSize = 0
            self.currentPacketDescs.removeAll(keepingCapacity: true)
        }

        let bytes = inputData.assumingMemoryBound(to: UInt8.self)
        let packetCount = Int(numberPackets)
        for index in 0..<packetCount {
            let packetOffset: Int
            let packetSize: Int

            if let packetDescriptions {
                packetOffset = Int(packetDescriptions[index].mStartOffset)
                packetSize = Int(packetDescriptions[index].mDataByteSize)
            } else {
                let size = Int(numberBytes) / packetCount
                packetOffset = index * size
                packetSize = size
            }

            if packetSize > Self.bufferSize {
                continue
            }

            if self.currentBufferSize + packetSize > Self.bufferSize {
                self.enqueueCurrentBuffer()
            }

            guard let buffer = self.currentBuffer else { continue }
            let dest = buffer.pointee.mAudioData.advanced(by: self.currentBufferSize)
            memcpy(dest, bytes.advanced(by: packetOffset), packetSize)

            let desc = AudioStreamPacketDescription(
                mStartOffset: Int64(self.currentBufferSize),
                mVariableFramesInPacket: 0,
                mDataByteSize: UInt32(packetSize))
            self.currentPacketDescs.append(desc)
            self.currentBufferSize += packetSize
        }
    }

    private func currentTimeSeconds() -> Double? {
        guard let audioQueue, sampleRate > 0 else { return nil }
        var timeStamp = AudioTimeStamp()
        let status = AudioQueueGetCurrentTime(audioQueue, nil, &timeStamp, nil)
        if status != noErr { return nil }
        if timeStamp.mSampleTime.isNaN { return nil }
        return timeStamp.mSampleTime / sampleRate
    }
}

private func propertyListenerProc(
    inClientData: UnsafeMutableRawPointer,
    inAudioFileStream: AudioFileStreamID,
    inPropertyID: AudioFileStreamPropertyID,
    ioFlags: UnsafeMutablePointer<AudioFileStreamPropertyFlags>)
{
    let playback = Unmanaged<Playback>.fromOpaque(inClientData).takeUnretainedValue()

    if inPropertyID == kAudioFileStreamProperty_DataFormat {
        var format = AudioStreamBasicDescription()
        var size = UInt32(MemoryLayout<AudioStreamBasicDescription>.size)
        let status = AudioFileStreamGetProperty(inAudioFileStream, inPropertyID, &size, &format)
        if status == noErr {
            playback.audioFormat = format
            playback.setupQueueIfNeeded(format)
        }
    } else if inPropertyID == kAudioFileStreamProperty_PacketSizeUpperBound {
        var maxPacketSize: UInt32 = 0
        var size = UInt32(MemoryLayout<UInt32>.size)
        let status = AudioFileStreamGetProperty(inAudioFileStream, inPropertyID, &size, &maxPacketSize)
        if status == noErr {
            playback.maxPacketSize = maxPacketSize
        }
    }
}

private func packetsProc(
    inClientData: UnsafeMutableRawPointer,
    inNumberBytes: UInt32,
    inNumberPackets: UInt32,
    inInputData: UnsafeRawPointer,
    inPacketDescriptions: UnsafeMutablePointer<AudioStreamPacketDescription>?)
{
    let playback = Unmanaged<Playback>.fromOpaque(inClientData).takeUnretainedValue()
    playback.handlePackets(
        numberBytes: inNumberBytes,
        numberPackets: inNumberPackets,
        inputData: inInputData,
        packetDescriptions: inPacketDescriptions)
}

private func outputCallbackProc(
    inUserData: UnsafeMutableRawPointer?,
    inAQ: AudioQueueRef,
    inBuffer: AudioQueueBufferRef)
{
    guard let inUserData else { return }
    let playback = Unmanaged<Playback>.fromOpaque(inUserData).takeUnretainedValue()
    playback.bufferLock.lock()
    playback.availableBuffers.append(inBuffer)
    playback.bufferLock.unlock()
    playback.bufferSemaphore.signal()
}

private func isRunningCallbackProc(
    inUserData: UnsafeMutableRawPointer?,
    inAQ: AudioQueueRef,
    inID: AudioQueuePropertyID)
{
    guard let inUserData else { return }
    guard inID == kAudioQueueProperty_IsRunning else { return }

    let playback = Unmanaged<Playback>.fromOpaque(inUserData).takeUnretainedValue()
    var running: UInt32 = 0
    var size = UInt32(MemoryLayout<UInt32>.size)
    let status = AudioQueueGetProperty(inAQ, kAudioQueueProperty_IsRunning, &running, &size)
    if status != noErr { return }

    if running == 0, playback.inputFinished {
        playback.finish(StreamingPlaybackResult(finished: true, interruptedAt: nil))
    }
}
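Likewise, a hedged sketch of the MP3 path: the player above parses streamed bytes with AudioFileStream and plays them through an AudioQueue, so feeding it streamSynthesize output could look like this (placeholder values, main-actor async context assumed).

@MainActor
func demoMP3Playback(client: ElevenLabsTTSClient) async {
    let request = ElevenLabsTTSRequest(text: "MP3 streaming sketch", outputFormat: "mp3_44100_128")
    let stream = client.streamSynthesize(voiceId: "PLACEHOLDER_VOICE_ID", request: request)
    let result = await StreamingAudioPlayer.shared.play(stream: stream)
    if !result.finished, let at = result.interruptedAt {
        print("interrupted at \(at)s")
    }
}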
@@ -1,51 +0,0 @@
public enum TalkTTSValidation: Sendable {
    private static let v3StabilityValues: Set<Double> = [0.0, 0.5, 1.0]

    public static func resolveSpeed(speed: Double?, rateWPM: Int?) -> Double? {
        if let rateWPM, rateWPM > 0 {
            let resolved = Double(rateWPM) / 175.0
            if resolved <= 0.5 || resolved >= 2.0 { return nil }
            return resolved
        }
        if let speed {
            if speed <= 0.5 || speed >= 2.0 { return nil }
            return speed
        }
        return nil
    }

    public static func validatedUnit(_ value: Double?) -> Double? {
        guard let value else { return nil }
        if value < 0 || value > 1 { return nil }
        return value
    }

    public static func validatedStability(_ value: Double?, modelId: String?) -> Double? {
        guard let value else { return nil }
        let normalizedModel = (modelId ?? "").trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
        if normalizedModel == "eleven_v3" {
            return v3StabilityValues.contains(value) ? value : nil
        }
        return validatedUnit(value)
    }

    public static func validatedSeed(_ value: Int?) -> UInt32? {
        guard let value else { return nil }
        if value < 0 || value > 4294967295 { return nil }
        return UInt32(value)
    }

    public static func validatedLatencyTier(_ value: Int?) -> Int? {
        guard let value else { return nil }
        if value < 0 || value > 4 { return nil }
        return value
    }

    public static func pcmSampleRate(from outputFormat: String?) -> Double? {
        let trimmed = (outputFormat ?? "").trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
        guard trimmed.hasPrefix("pcm_") else { return nil }
        let parts = trimmed.split(separator: "_", maxSplits: 1)
        guard parts.count == 2, let rate = Double(parts[1]), rate > 0 else { return nil }
        return rate
    }
}
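A short sketch (an assumption, not part of the diff) of how these validators would feed a request: each helper returns nil for out-of-range input, so invalid settings simply fall back to the API defaults.

var request = ElevenLabsTTSRequest(text: "Validated settings sketch")
request.speed = TalkTTSValidation.resolveSpeed(speed: nil, rateWPM: 190)  // 190 / 175 ≈ 1.09
request.stability = TalkTTSValidation.validatedStability(0.5, modelId: "eleven_v3")
request.seed = TalkTTSValidation.validatedSeed(1234)
request.latencyTier = TalkTTSValidation.validatedLatencyTier(3)
request.normalize = ElevenLabsTTSClient.validatedNormalize("auto")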
@@ -4,7 +4,7 @@ import XCTest
final class ElevenLabsTTSValidationTests: XCTestCase {
    func testValidatedOutputFormatAllowsOnlyMp3Presets() {
        XCTAssertEqual(ElevenLabsTTSClient.validatedOutputFormat("mp3_44100_128"), "mp3_44100_128")
        XCTAssertNil(ElevenLabsTTSClient.validatedOutputFormat("pcm_16000"))
        XCTAssertEqual(ElevenLabsTTSClient.validatedOutputFormat("pcm_16000"), "pcm_16000")
    }

    func testValidatedLanguageAcceptsTwoLetterCodes() {
@@ -17,4 +17,3 @@ final class ElevenLabsTTSValidationTests: XCTestCase {
        XCTAssertNil(ElevenLabsTTSClient.validatedNormalize("maybe"))
    }
}
@@ -1,26 +0,0 @@
import XCTest
@testable import ClawdisKit

final class PCMStreamingAudioPlayerTests: XCTestCase {
    @MainActor
    func testStopDuringPCMStreamReturnsInterruptedResult() async {
        var continuation: AsyncThrowingStream<Data, Error>.Continuation?
        let stream = AsyncThrowingStream<Data, Error> { cont in
            continuation = cont
            let samples = Data(repeating: 0, count: 44_100)
            cont.yield(samples)
        }

        let task = Task { @MainActor in
            await PCMStreamingAudioPlayer.shared.play(stream: stream, sampleRate: 44_100)
        }

        try? await Task.sleep(nanoseconds: 120_000_000)
        let interruptedAt = PCMStreamingAudioPlayer.shared.stop()
        continuation?.finish()

        let result = await task.value
        XCTAssertFalse(result.finished)
        XCTAssertNotNil(interruptedAt)
    }
}
@@ -1,45 +0,0 @@
import XCTest
@testable import ClawdisKit

final class TalkTTSValidationTests: XCTestCase {
    func testResolveSpeedUsesRateWPMWhenProvided() {
        let resolved = TalkTTSValidation.resolveSpeed(speed: nil, rateWPM: 175)
        XCTAssertNotNil(resolved)
        XCTAssertEqual(resolved ?? 0, 1.0, accuracy: 0.0001)
        XCTAssertNil(TalkTTSValidation.resolveSpeed(speed: nil, rateWPM: 400))
    }

    func testValidatedUnitBounds() {
        XCTAssertEqual(TalkTTSValidation.validatedUnit(0), 0)
        XCTAssertEqual(TalkTTSValidation.validatedUnit(1), 1)
        XCTAssertNil(TalkTTSValidation.validatedUnit(-0.01))
        XCTAssertNil(TalkTTSValidation.validatedUnit(1.01))
    }

    func testValidatedStability() {
        XCTAssertEqual(TalkTTSValidation.validatedStability(0, modelId: "eleven_v3"), 0)
        XCTAssertEqual(TalkTTSValidation.validatedStability(0.5, modelId: "eleven_v3"), 0.5)
        XCTAssertEqual(TalkTTSValidation.validatedStability(1, modelId: "eleven_v3"), 1)
        XCTAssertNil(TalkTTSValidation.validatedStability(0.7, modelId: "eleven_v3"))
        XCTAssertEqual(TalkTTSValidation.validatedStability(0.7, modelId: "eleven_multilingual_v2"), 0.7)
    }

    func testValidatedSeedBounds() {
        XCTAssertEqual(TalkTTSValidation.validatedSeed(0), 0)
        XCTAssertEqual(TalkTTSValidation.validatedSeed(1234), 1234)
        XCTAssertNil(TalkTTSValidation.validatedSeed(-1))
    }

    func testValidatedLatencyTier() {
        XCTAssertEqual(TalkTTSValidation.validatedLatencyTier(0), 0)
        XCTAssertEqual(TalkTTSValidation.validatedLatencyTier(4), 4)
        XCTAssertNil(TalkTTSValidation.validatedLatencyTier(-1))
        XCTAssertNil(TalkTTSValidation.validatedLatencyTier(5))
    }

    func testPcmSampleRateParse() {
        XCTAssertEqual(TalkTTSValidation.pcmSampleRate(from: "pcm_44100"), 44100)
        XCTAssertNil(TalkTTSValidation.pcmSampleRate(from: "mp3_44100_128"))
        XCTAssertNil(TalkTTSValidation.pcmSampleRate(from: "pcm_bad"))
    }
}