fix: stream elevenlabs tts playback
@@ -9,6 +9,7 @@ import Speech
 @Observable
 final class TalkModeManager: NSObject {
     private typealias SpeechRequest = SFSpeechAudioBufferRecognitionRequest
+    private static let defaultModelIdFallback = "eleven_v3"
     var isEnabled: Bool = false
     var isListening: Bool = false
     var isSpeaking: Bool = false
@@ -36,11 +37,12 @@ final class TalkModeManager: NSObject {
     private var voiceAliases: [String: String] = [:]
     private var interruptOnSpeech: Bool = true
     private var mainSessionKey: String = "main"
+    private var fallbackVoiceId: String?
+    private var lastPlaybackWasPCM: Bool = false

     private var bridge: BridgeSession?
     private let silenceWindow: TimeInterval = 0.7

-    private var player: AVAudioPlayer?
     private var chatSubscribedSessionKeys = Set<String>()

     private let logger = Logger(subsystem: "com.steipete.clawdis", category: "TalkMode")
@@ -446,43 +448,43 @@ final class TalkModeManager: NSObject {
         let started = Date()
         let language = ElevenLabsTTSClient.validatedLanguage(directive?.language)

-        let voiceId = resolvedVoice ?? self.currentVoiceId ?? self.defaultVoiceId
         let resolvedKey =
             (self.apiKey?.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty == false ? self.apiKey : nil) ??
             ProcessInfo.processInfo.environment["ELEVENLABS_API_KEY"]
         let apiKey = resolvedKey?.trimmingCharacters(in: .whitespacesAndNewlines)
+        let preferredVoice = resolvedVoice ?? self.currentVoiceId ?? self.defaultVoiceId
+        let voiceId: String? = if let apiKey, !apiKey.isEmpty {
+            await self.resolveVoiceId(preferred: preferredVoice, apiKey: apiKey)
+        } else {
+            nil
+        }
         let canUseElevenLabs = (voiceId?.isEmpty == false) && (apiKey?.isEmpty == false)

         if canUseElevenLabs, let voiceId, let apiKey {
-            let desiredOutputFormat = directive?.outputFormat ?? self.defaultOutputFormat
+            let desiredOutputFormat = directive?.outputFormat ?? self.defaultOutputFormat ?? "pcm_44100"
             let outputFormat = ElevenLabsTTSClient.validatedOutputFormat(desiredOutputFormat)
             if outputFormat == nil, let desiredOutputFormat, !desiredOutputFormat.isEmpty {
                 self.logger.warning(
                     "talk output_format unsupported for local playback: \(desiredOutputFormat, privacy: .public)")
             }

+            let modelId = directive?.modelId ?? self.currentModelId ?? self.defaultModelId
             let request = ElevenLabsTTSRequest(
                 text: cleaned,
-                modelId: directive?.modelId ?? self.currentModelId ?? self.defaultModelId,
+                modelId: modelId,
                 outputFormat: outputFormat,
                 speed: TalkTTSValidation.resolveSpeed(speed: directive?.speed, rateWPM: directive?.rateWPM),
-                stability: TalkTTSValidation.validatedUnit(directive?.stability),
+                stability: TalkTTSValidation.validatedStability(directive?.stability, modelId: modelId),
                 similarity: TalkTTSValidation.validatedUnit(directive?.similarity),
                 style: TalkTTSValidation.validatedUnit(directive?.style),
                 speakerBoost: directive?.speakerBoost,
                 seed: TalkTTSValidation.validatedSeed(directive?.seed),
                 normalize: ElevenLabsTTSClient.validatedNormalize(directive?.normalize),
-                language: language)
+                language: language,
+                latencyTier: TalkTTSValidation.validatedLatencyTier(directive?.latencyTier))

-            let synthTimeoutSeconds = max(20.0, min(90.0, Double(cleaned.count) * 0.12))
             let client = ElevenLabsTTSClient(apiKey: apiKey)
-            let audio = try await client.synthesizeWithHardTimeout(
-                voiceId: voiceId,
-                request: request,
-                hardTimeoutSeconds: synthTimeoutSeconds)
-            self.logger
-                .info(
-                    "elevenlabs ok bytes=\(audio.count, privacy: .public) dur=\(Date().timeIntervalSince(started), privacy: .public)s")
+            let stream = client.streamSynthesize(voiceId: voiceId, request: request)

             if self.interruptOnSpeech {
                 do {
@@ -494,7 +496,21 @@ final class TalkModeManager: NSObject {
            }

            self.statusText = "Speaking…"
-            try await self.playAudio(data: audio)
+            let sampleRate = TalkTTSValidation.pcmSampleRate(from: outputFormat)
+            let result: StreamingPlaybackResult
+            if let sampleRate {
+                self.lastPlaybackWasPCM = true
+                result = await PCMStreamingAudioPlayer.shared.play(stream: stream, sampleRate: sampleRate)
+            } else {
+                self.lastPlaybackWasPCM = false
+                result = await StreamingAudioPlayer.shared.play(stream: stream)
+            }
+            self.logger
+                .info(
+                    "elevenlabs stream finished=\(result.finished, privacy: .public) dur=\(Date().timeIntervalSince(started), privacy: .public)s")
+            if !result.finished, let interruptedAt = result.interruptedAt {
+                self.lastInterruptedAtSeconds = interruptedAt
+            }
         } else {
             self.logger.warning("tts unavailable; falling back to system voice (missing key or voiceId)")
             if self.interruptOnSpeech {
@@ -533,30 +549,17 @@ final class TalkModeManager: NSObject {
        self.isSpeaking = false
    }

-    private func playAudio(data: Data) async throws {
-        self.player?.stop()
-        let player = try AVAudioPlayer(data: data)
-        self.player = player
-        player.prepareToPlay()
-        self.logger.info("play start")
-        guard player.play() else {
-            throw NSError(domain: "TalkMode", code: 2, userInfo: [
-                NSLocalizedDescriptionKey: "audio player refused to play",
-            ])
-        }
-        while player.isPlaying {
-            try? await Task.sleep(nanoseconds: 120_000_000)
-        }
-        self.logger.info("play done")
-    }
-
     private func stopSpeaking(storeInterruption: Bool = true) {
         guard self.isSpeaking else { return }
+        let interruptedAt = self.lastPlaybackWasPCM
+            ? PCMStreamingAudioPlayer.shared.stop()
+            : StreamingAudioPlayer.shared.stop()
         if storeInterruption {
-            self.lastInterruptedAtSeconds = self.player?.currentTime
+            self.lastInterruptedAtSeconds = interruptedAt
         }
-        self.player?.stop()
-        self.player = nil
+        _ = self.lastPlaybackWasPCM
+            ? StreamingAudioPlayer.shared.stop()
+            : PCMStreamingAudioPlayer.shared.stop()
         TalkSystemSpeechSynthesizer.shared.stop()
         self.isSpeaking = false
     }
@@ -581,6 +584,37 @@ final class TalkModeManager: NSObject {
         return Self.isLikelyVoiceId(trimmed) ? trimmed : nil
     }

+    private func resolveVoiceId(preferred: String?, apiKey: String) async -> String? {
+        let trimmed = preferred?.trimmingCharacters(in: .whitespacesAndNewlines) ?? ""
+        if !trimmed.isEmpty {
+            if let resolved = self.resolveVoiceAlias(trimmed) { return resolved }
+            self.logger.warning("unknown voice alias \(trimmed, privacy: .public)")
+        }
+        if let fallbackVoiceId { return fallbackVoiceId }
+
+        do {
+            let voices = try await ElevenLabsTTSClient(apiKey: apiKey).listVoices()
+            guard let first = voices.first else {
+                self.logger.warning("elevenlabs voices list empty")
+                return nil
+            }
+            self.fallbackVoiceId = first.voiceId
+            if self.defaultVoiceId == nil {
+                self.defaultVoiceId = first.voiceId
+            }
+            if !self.voiceOverrideActive {
+                self.currentVoiceId = first.voiceId
+            }
+            let name = first.name ?? "unknown"
+            self.logger
+                .info("default voice selected \(name, privacy: .public) (\(first.voiceId, privacy: .public))")
+            return first.voiceId
+        } catch {
+            self.logger.error("elevenlabs list voices failed: \(error.localizedDescription, privacy: .public)")
+            return nil
+        }
+    }
+
     private static func isLikelyVoiceId(_ value: String) -> Bool {
         guard value.count >= 10 else { return false }
         return value.allSatisfy { $0.isLetter || $0.isNumber || $0 == "-" || $0 == "_" }
@@ -598,22 +632,23 @@ final class TalkModeManager: NSObject {
            self.mainSessionKey = rawMainKey.isEmpty ? "main" : rawMainKey
            self.defaultVoiceId = (talk?["voiceId"] as? String)?.trimmingCharacters(in: .whitespacesAndNewlines)
            if let aliases = talk?["voiceAliases"] as? [String: Any] {
-                self.voiceAliases =
-                    aliases.compactMap { key, value in
-                        guard let id = value as? String else { return nil }
-                        let normalizedKey = key.trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
-                        let trimmedId = id.trimmingCharacters(in: .whitespacesAndNewlines)
-                        guard !normalizedKey.isEmpty, !trimmedId.isEmpty else { return nil }
-                        return (normalizedKey, trimmedId)
-                    }
-                    .reduce(into: [:]) { $0[$1.0] = $1.1 }
+                var resolved: [String: String] = [:]
+                for (key, value) in aliases {
+                    guard let id = value as? String else { continue }
+                    let normalizedKey = key.trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
+                    let trimmedId = id.trimmingCharacters(in: .whitespacesAndNewlines)
+                    guard !normalizedKey.isEmpty, !trimmedId.isEmpty else { continue }
+                    resolved[normalizedKey] = trimmedId
+                }
+                self.voiceAliases = resolved
            } else {
                self.voiceAliases = [:]
            }
            if !self.voiceOverrideActive {
                self.currentVoiceId = self.defaultVoiceId
            }
-            self.defaultModelId = (talk?["modelId"] as? String)?.trimmingCharacters(in: .whitespacesAndNewlines)
+            let model = (talk?["modelId"] as? String)?.trimmingCharacters(in: .whitespacesAndNewlines)
+            self.defaultModelId = (model?.isEmpty == false) ? model : Self.defaultModelIdFallback
            if !self.modelOverrideActive {
                self.currentModelId = self.defaultModelId
            }
||||
@@ -624,7 +659,10 @@ final class TalkModeManager: NSObject {
                self.interruptOnSpeech = interrupt
            }
        } catch {
-            // ignore
+            self.defaultModelId = Self.defaultModelIdFallback
+            if !self.modelOverrideActive {
+                self.currentModelId = self.defaultModelId
+            }
        }
    }
