From d7b267843e8e44b8ab74bedc37bb77407cf11fe0 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Tue, 30 Dec 2025 14:32:47 +0100 Subject: [PATCH] fix: fallback mp3 when pcm blocked --- apps/ios/Sources/Voice/TalkModeManager.swift | 42 ++++++++++++------- .../Sources/Clawdis/TalkModeRuntime.swift | 39 +++++++++++------ apps/macos/Sources/Clawdis/TalkOverlay.swift | 5 +++ .../Sources/Clawdis/TalkOverlayView.swift | 1 + 4 files changed, 60 insertions(+), 27 deletions(-) diff --git a/apps/ios/Sources/Voice/TalkModeManager.swift b/apps/ios/Sources/Voice/TalkModeManager.swift index fefefe68f..355c67321 100644 --- a/apps/ios/Sources/Voice/TalkModeManager.swift +++ b/apps/ios/Sources/Voice/TalkModeManager.swift @@ -473,19 +473,23 @@ final class TalkModeManager: NSObject { } let modelId = directive?.modelId ?? self.currentModelId ?? self.defaultModelId - let request = ElevenLabsTTSRequest( - text: cleaned, - modelId: modelId, - outputFormat: outputFormat, - speed: TalkTTSValidation.resolveSpeed(speed: directive?.speed, rateWPM: directive?.rateWPM), - stability: TalkTTSValidation.validatedStability(directive?.stability, modelId: modelId), - similarity: TalkTTSValidation.validatedUnit(directive?.similarity), - style: TalkTTSValidation.validatedUnit(directive?.style), - speakerBoost: directive?.speakerBoost, - seed: TalkTTSValidation.validatedSeed(directive?.seed), - normalize: ElevenLabsTTSClient.validatedNormalize(directive?.normalize), - language: language, - latencyTier: TalkTTSValidation.validatedLatencyTier(directive?.latencyTier)) + func makeRequest(outputFormat: String?) -> ElevenLabsTTSRequest { + ElevenLabsTTSRequest( + text: cleaned, + modelId: modelId, + outputFormat: outputFormat, + speed: TalkTTSValidation.resolveSpeed(speed: directive?.speed, rateWPM: directive?.rateWPM), + stability: TalkTTSValidation.validatedStability(directive?.stability, modelId: modelId), + similarity: TalkTTSValidation.validatedUnit(directive?.similarity), + style: TalkTTSValidation.validatedUnit(directive?.style), + speakerBoost: directive?.speakerBoost, + seed: TalkTTSValidation.validatedSeed(directive?.seed), + normalize: ElevenLabsTTSClient.validatedNormalize(directive?.normalize), + language: language, + latencyTier: TalkTTSValidation.validatedLatencyTier(directive?.latencyTier)) + } + + let request = makeRequest(outputFormat: outputFormat) let client = ElevenLabsTTSClient(apiKey: apiKey) let stream = client.streamSynthesize(voiceId: voiceId, request: request) @@ -504,7 +508,17 @@ final class TalkModeManager: NSObject { let result: StreamingPlaybackResult if let sampleRate { self.lastPlaybackWasPCM = true - result = await self.pcmPlayer.play(stream: stream, sampleRate: sampleRate) + var playback = await self.pcmPlayer.play(stream: stream, sampleRate: sampleRate) + if !playback.finished, playback.interruptedAt == nil { + let mp3Format = ElevenLabsTTSClient.validatedOutputFormat("mp3_44100") + self.logger.warning("pcm playback failed; retrying mp3") + self.lastPlaybackWasPCM = false + let mp3Stream = client.streamSynthesize( + voiceId: voiceId, + request: makeRequest(outputFormat: mp3Format)) + playback = await self.mp3Player.play(stream: mp3Stream) + } + result = playback } else { self.lastPlaybackWasPCM = false result = await self.mp3Player.play(stream: stream) diff --git a/apps/macos/Sources/Clawdis/TalkModeRuntime.swift b/apps/macos/Sources/Clawdis/TalkModeRuntime.swift index d177aa586..04a732db0 100644 --- a/apps/macos/Sources/Clawdis/TalkModeRuntime.swift +++ b/apps/macos/Sources/Clawdis/TalkModeRuntime.swift @@ -507,19 +507,23 @@ actor TalkModeRuntime { } let modelId = directive?.modelId ?? self.currentModelId ?? self.defaultModelId - let request = ElevenLabsTTSRequest( - text: cleaned, - modelId: modelId, - outputFormat: outputFormat, - speed: TalkTTSValidation.resolveSpeed(speed: directive?.speed, rateWPM: directive?.rateWPM), - stability: TalkTTSValidation.validatedStability(directive?.stability, modelId: modelId), - similarity: TalkTTSValidation.validatedUnit(directive?.similarity), - style: TalkTTSValidation.validatedUnit(directive?.style), - speakerBoost: directive?.speakerBoost, - seed: TalkTTSValidation.validatedSeed(directive?.seed), - normalize: ElevenLabsTTSClient.validatedNormalize(directive?.normalize), - language: language, - latencyTier: TalkTTSValidation.validatedLatencyTier(directive?.latencyTier)) + func makeRequest(outputFormat: String?) -> ElevenLabsTTSRequest { + ElevenLabsTTSRequest( + text: cleaned, + modelId: modelId, + outputFormat: outputFormat, + speed: TalkTTSValidation.resolveSpeed(speed: directive?.speed, rateWPM: directive?.rateWPM), + stability: TalkTTSValidation.validatedStability(directive?.stability, modelId: modelId), + similarity: TalkTTSValidation.validatedUnit(directive?.similarity), + style: TalkTTSValidation.validatedUnit(directive?.style), + speakerBoost: directive?.speakerBoost, + seed: TalkTTSValidation.validatedSeed(directive?.seed), + normalize: ElevenLabsTTSClient.validatedNormalize(directive?.normalize), + language: language, + latencyTier: TalkTTSValidation.validatedLatencyTier(directive?.latencyTier)) + } + + let request = makeRequest(outputFormat: outputFormat) self.ttsLogger.info("talk TTS synth timeout=\(synthTimeoutSeconds, privacy: .public)s") let client = ElevenLabsTTSClient(apiKey: apiKey) @@ -539,6 +543,15 @@ actor TalkModeRuntime { if let sampleRate { self.lastPlaybackWasPCM = true result = await self.playPCM(stream: stream, sampleRate: sampleRate) + if !result.finished, result.interruptedAt == nil { + let mp3Format = ElevenLabsTTSClient.validatedOutputFormat("mp3_44100") + self.ttsLogger.warning("talk pcm playback failed; retrying mp3") + self.lastPlaybackWasPCM = false + let mp3Stream = client.streamSynthesize( + voiceId: voiceId, + request: makeRequest(outputFormat: mp3Format)) + result = await self.playMP3(stream: mp3Stream) + } } else { self.lastPlaybackWasPCM = false result = await self.playMP3(stream: stream) diff --git a/apps/macos/Sources/Clawdis/TalkOverlay.swift b/apps/macos/Sources/Clawdis/TalkOverlay.swift index d99f45dbe..258ac2beb 100644 --- a/apps/macos/Sources/Clawdis/TalkOverlay.swift +++ b/apps/macos/Sources/Clawdis/TalkOverlay.swift @@ -112,6 +112,7 @@ final class TalkOverlayController { panel.collectionBehavior = [.canJoinAllSpaces, .fullScreenAuxiliary, .transient] panel.hidesOnDeactivate = false panel.isMovable = false + panel.acceptsMouseMovedEvents = true panel.isFloatingPanel = true panel.becomesKeyOnlyIfNeeded = true panel.titleVisibility = .hidden @@ -136,6 +137,10 @@ final class TalkOverlayController { } private final class TalkOverlayHostingView: NSHostingView { + override func acceptsFirstMouse(for event: NSEvent?) -> Bool { + true + } + override func hitTest(_ point: NSPoint) -> NSView? { let center = CGPoint( x: self.bounds.maxX - TalkOverlayController.orbPadding - (TalkOverlayController.orbSize / 2), diff --git a/apps/macos/Sources/Clawdis/TalkOverlayView.swift b/apps/macos/Sources/Clawdis/TalkOverlayView.swift index 6d886a608..d6c9a3a73 100644 --- a/apps/macos/Sources/Clawdis/TalkOverlayView.swift +++ b/apps/macos/Sources/Clawdis/TalkOverlayView.swift @@ -97,6 +97,7 @@ private final class OrbInteractionNSView: NSView { private var suppressSingleClick = false override var acceptsFirstResponder: Bool { true } + override func acceptsFirstMouse(for event: NSEvent?) -> Bool { true } override func mouseDown(with event: NSEvent) { self.mouseDownEvent = event