From afbd18e8df2e45b5a976261ef6f2df16b72faf04 Mon Sep 17 00:00:00 2001
From: Peter Steinberger
Date: Tue, 30 Dec 2025 06:05:43 +0100
Subject: [PATCH] fix(talk): harden playback, interrupts, and timeouts

---
 CHANGELOG.md                                  |   5 +
 apps/ios/Sources/Voice/TalkModeManager.swift  | 165 +++++++++++++++---
 apps/ios/Sources/Voice/TalkOrbOverlay.swift   |   8 +-
 apps/ios/SwiftSources.input.xcfilelist        |   1 +
 .../Sources/Clawdis/TalkAudioPlayer.swift     |   9 +-
 .../Sources/Clawdis/TalkModeRuntime.swift     | 165 ++++++++++--------
 .../TalkAudioPlayerTests.swift                |  18 ++
 7 files changed, 262 insertions(+), 109 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 079a12308..4168f520c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -13,8 +13,13 @@
 - macOS Talk Mode: orb overlay refresh, ElevenLabs request logging, API key status in settings, and auto-select first voice when none is configured.
 - macOS Talk Mode: add hard timeout around ElevenLabs TTS synthesis to avoid getting stuck “speaking” forever on hung requests.
 - macOS Talk Mode: avoid stuck playback when the audio player never starts (fail-fast + watchdog).
+- macOS Talk Mode: fix audio stop ordering so disabling Talk Mode always stops in-flight playback.
+- macOS Talk Mode: throttle audio-level updates (avoid per-buffer task creation) to reduce CPU/task churn.
 - macOS Talk Mode: increase overlay window size so wave rings don’t clip; close button is hover-only and closer to the orb.
 - Talk Mode: wait for chat history to surface the assistant reply before starting TTS (macOS/iOS/Android).
+- iOS Talk Mode: fix chat completion wait to time out even if no events arrive (prevents “Thinking…” hangs).
+- iOS Talk Mode: keep recognition running during playback to support interrupt-on-speech.
+- iOS Talk Mode: preserve directive voice/model overrides across config reloads and add ElevenLabs request timeouts.
 - iOS/Android Talk Mode: explicitly `chat.subscribe` when Talk Mode is active, so completion events arrive even if the Chat UI isn’t open.
 - Chat UI: refresh history when another client finishes a run in the same session, so Talk Mode + Voice Wake transcripts appear consistently.
 - Gateway: `voice.transcript` now also maps agent bus output to `chat` events, ensuring chat UIs refresh for voice-triggered runs.
diff --git a/apps/ios/Sources/Voice/TalkModeManager.swift b/apps/ios/Sources/Voice/TalkModeManager.swift
index 6e2e5440d..9eb441ce6 100644
--- a/apps/ios/Sources/Voice/TalkModeManager.swift
+++ b/apps/ios/Sources/Voice/TalkModeManager.swift
@@ -2,8 +2,8 @@ import AVFAudio
 import ClawdisKit
 import Foundation
 import Observation
-import Speech
 import OSLog
+import Speech
 @MainActor
 @Observable
 final class TalkModeManager: NSObject {
@@ -29,6 +29,8 @@ final class TalkModeManager: NSObject {
     private var currentVoiceId: String?
     private var defaultModelId: String?
     private var currentModelId: String?
+    private var voiceOverrideActive = false
+    private var modelOverrideActive = false
     private var defaultOutputFormat: String?
     private var apiKey: String?
     private var interruptOnSpeech: Bool = true
@@ -101,6 +103,12 @@ final class TalkModeManager: NSObject {
         self.silenceTask = nil
         self.stopRecognition()
         self.stopSpeaking()
+        self.lastInterruptedAtSeconds = nil
+        do {
+            try AVAudioSession.sharedInstance().setActive(false, options: [.notifyOthersOnDeactivation])
+        } catch {
+            self.logger.warning("audio session deactivate failed: \(error.localizedDescription, privacy: .public)")
+        }
         Task { await self.unsubscribeAllChats() }
     }
@@ -109,6 +117,7 @@
     }
 
     private func startRecognition() throws {
+        self.stopRecognition()
         self.speechRecognizer = SFSpeechRecognizer()
         guard let recognizer = self.speechRecognizer else {
             throw NSError(domain: "TalkMode", code: 1, userInfo: [
@@ -132,7 +141,10 @@
         self.recognitionTask = recognizer.recognitionTask(with: request) { [weak self] result, error in
             guard let self else { return }
             if let error {
-                self.statusText = "Speech error: \(error.localizedDescription)"
+                if !self.isSpeaking {
+                    self.statusText = "Speech error: \(error.localizedDescription)"
+                }
+                self.logger.debug("speech recognition error: \(error.localizedDescription, privacy: .public)")
             }
             guard let result else { return }
             let transcript = result.bestTranscription.formattedString
@@ -189,7 +201,7 @@
     }
 
     private func checkSilence() async {
-        guard self.isListening else { return }
+        guard self.isListening, !self.isSpeaking else { return }
         let transcript = self.lastTranscript.trimmingCharacters(in: .whitespacesAndNewlines)
         guard !transcript.isEmpty else { return }
         guard let lastHeard else { return }
@@ -219,10 +231,21 @@
         self.logger.info("chat.send start chars=\(prompt.count, privacy: .public)")
         let runId = try await self.sendChat(prompt, bridge: bridge)
         self.logger.info("chat.send ok runId=\(runId, privacy: .public)")
-        let ok = await self.waitForChatFinal(runId: runId, bridge: bridge)
-        if !ok {
-            self.statusText = "No reply"
-            self.logger.warning("chat final timeout runId=\(runId, privacy: .public)")
+        let completion = await self.waitForChatCompletion(runId: runId, bridge: bridge, timeoutSeconds: 120)
+        guard completion == .final else {
+            switch completion {
+            case .timeout:
+                self.statusText = "No reply"
+                self.logger.warning("chat completion timeout runId=\(runId, privacy: .public)")
+            case .aborted:
+                self.statusText = "Aborted"
+                self.logger.warning("chat completion aborted runId=\(runId, privacy: .public)")
+            case .error:
+                self.statusText = "Chat error"
+                self.logger.warning("chat completion error runId=\(runId, privacy: .public)")
+            case .final:
+                break
+            }
             await self.start()
             return
         }
@@ -259,7 +282,9 @@
             self.chatSubscribedSessionKeys.insert(key)
             self.logger.info("chat.subscribe ok sessionKey=\(key, privacy: .public)")
         } catch {
-            self.logger.warning("chat.subscribe failed sessionKey=\(key, privacy: .public) err=\(error.localizedDescription, privacy: .public)")
+            self.logger
+                .warning(
+                    "chat.subscribe failed sessionKey=\(key, privacy: .public) err=\(error.localizedDescription, privacy: .public)")
         }
     }
@@ -294,6 +319,22 @@
         return lines.joined(separator: "\n")
     }
 
+    private enum ChatCompletionState: CustomStringConvertible {
+        case final
+        case aborted
+        case error
+        case timeout
+
+        var description: String {
+            switch self {
+            case .final: "final"
+            case .aborted: "aborted"
+            case .error: "error"
+            case .timeout: "timeout"
+            }
+        }
+    }
+
     private func sendChat(_ message: String, bridge: BridgeSession) async throws -> String {
         struct SendResponse: Decodable { let runId: String }
         let payload: [String: Any] = [
@@ -310,20 +351,39 @@
         return decoded.runId
     }
 
-    private func waitForChatFinal(runId: String, bridge: BridgeSession) async -> Bool {
+    private func waitForChatCompletion(
+        runId: String,
+        bridge: BridgeSession,
+        timeoutSeconds: Int = 120) async -> ChatCompletionState
+    {
         let stream = await bridge.subscribeServerEvents(bufferingNewest: 200)
-        let timeout = Date().addingTimeInterval(120)
-        for await evt in stream {
-            if Date() > timeout { return false }
-            guard evt.event == "chat", let payload = evt.payloadJSON else { continue }
-            guard let data = payload.data(using: .utf8) else { continue }
-            guard let json = try? JSONSerialization.jsonObject(with: data) as? [String: Any] else { continue }
-            if (json["runId"] as? String) != runId { continue }
-            if let state = json["state"] as? String, state == "final" {
-                return true
+        return await withTaskGroup(of: ChatCompletionState.self) { group in
+            group.addTask { [runId] in
+                for await evt in stream {
+                    if Task.isCancelled { return .timeout }
+                    guard evt.event == "chat", let payload = evt.payloadJSON else { continue }
+                    guard let data = payload.data(using: .utf8) else { continue }
+                    guard let json = try? JSONSerialization.jsonObject(with: data) as? [String: Any] else { continue }
+                    if (json["runId"] as? String) != runId { continue }
+                    if let state = json["state"] as? String {
+                        switch state {
+                        case "final": return .final
+                        case "aborted": return .aborted
+                        case "error": return .error
+                        default: break
+                        }
+                    }
+                }
+                return .timeout
             }
+            group.addTask {
+                try? await Task.sleep(nanoseconds: UInt64(timeoutSeconds) * 1_000_000_000)
+                return .timeout
+            }
+            let result = await group.next() ?? .timeout
+            group.cancelAll()
+            return result
         }
-        return false
     }
 
     private func waitForAssistantText(
@@ -370,11 +430,13 @@
         if let voice = directive?.voiceId {
             if directive?.once != true {
                 self.currentVoiceId = voice
+                self.voiceOverrideActive = true
             }
         }
 
         if let model = directive?.modelId {
             if directive?.once != true {
                 self.currentModelId = model
+                self.modelOverrideActive = true
             }
         }
@@ -394,16 +456,21 @@
             return
         }
 
-        self.statusText = "Speaking…"
+        self.statusText = "Generating voice…"
         self.isSpeaking = true
         self.lastSpokenText = cleaned
 
         do {
             let started = Date()
+            let desiredOutputFormat = directive?.outputFormat ?? self.defaultOutputFormat
+            let outputFormat = TalkModeRuntime.validatedOutputFormat(desiredOutputFormat)
+            if outputFormat == nil, let desiredOutputFormat, !desiredOutputFormat.isEmpty {
+                self.logger.warning("talk output_format unsupported for local playback: \(desiredOutputFormat, privacy: .public)")
+            }
             let request = ElevenLabsRequest(
                 text: cleaned,
                 modelId: directive?.modelId ?? self.currentModelId ?? self.defaultModelId,
-                outputFormat: directive?.outputFormat ?? self.defaultOutputFormat,
+                outputFormat: outputFormat,
                 speed: TalkModeRuntime.resolveSpeed(
                     speed: directive?.speed,
                     rateWPM: directive?.rateWPM),
@@ -414,16 +481,43 @@
                 seed: TalkModeRuntime.validatedSeed(directive?.seed),
                 normalize: TalkModeRuntime.validatedNormalize(directive?.normalize),
                 language: TalkModeRuntime.validatedLanguage(directive?.language))
-            let audio = try await ElevenLabsClient(apiKey: apiKey).synthesize(
-                voiceId: voiceId,
-                request: request)
-            self.logger.info("elevenlabs ok bytes=\(audio.count, privacy: .public) dur=\(Date().timeIntervalSince(started), privacy: .public)s")
+
+            let synthTimeoutSeconds = max(20.0, min(90.0, Double(cleaned.count) * 0.12))
+            let client = ElevenLabsClient(apiKey: apiKey)
+            let audio = try await withThrowingTaskGroup(of: Data.self) { group in
+                group.addTask {
+                    try await client.synthesize(voiceId: voiceId, request: request)
+                }
+                group.addTask {
+                    try await Task.sleep(nanoseconds: UInt64(synthTimeoutSeconds * 1_000_000_000))
+                    throw NSError(domain: "TalkTTS", code: 408, userInfo: [
+                        NSLocalizedDescriptionKey: "ElevenLabs TTS timed out after \(synthTimeoutSeconds)s",
+                    ])
+                }
+                let data = try await group.next()!
+                group.cancelAll()
+                return data
+            }
+            self.logger
+                .info(
+                    "elevenlabs ok bytes=\(audio.count, privacy: .public) dur=\(Date().timeIntervalSince(started), privacy: .public)s")
+
+            if self.interruptOnSpeech {
+                do {
+                    try self.startRecognition()
+                } catch {
+                    self.logger.warning("startRecognition during speak failed: \(error.localizedDescription, privacy: .public)")
+                }
+            }
+
+            self.statusText = "Speaking…"
             try await self.playAudio(data: audio)
         } catch {
             self.statusText = "Speak failed: \(error.localizedDescription)"
             self.logger.error("speak failed: \(error.localizedDescription, privacy: .public)")
         }
+        self.stopRecognition()
         self.isSpeaking = false
     }
@@ -440,9 +534,11 @@
         self.logger.info("play done")
     }
 
-    private func stopSpeaking() {
+    private func stopSpeaking(storeInterruption: Bool = true) {
         guard self.isSpeaking else { return }
-        self.lastInterruptedAtSeconds = self.player?.currentTime
+        if storeInterruption {
+            self.lastInterruptedAtSeconds = self.player?.currentTime
+        }
         self.player?.stop()
         self.player = nil
         self.isSpeaking = false
@@ -465,9 +561,13 @@
         guard let config = json["config"] as? [String: Any] else { return }
         let talk = config["talk"] as? [String: Any]
         self.defaultVoiceId = (talk?["voiceId"] as? String)?.trimmingCharacters(in: .whitespacesAndNewlines)
-        self.currentVoiceId = self.defaultVoiceId
+        if !self.voiceOverrideActive {
+            self.currentVoiceId = self.defaultVoiceId
+        }
         self.defaultModelId = (talk?["modelId"] as? String)?.trimmingCharacters(in: .whitespacesAndNewlines)
-        self.currentModelId = self.defaultModelId
+        if !self.modelOverrideActive {
+            self.currentModelId = self.defaultModelId
+        }
         self.defaultOutputFormat = (talk?["outputFormat"] as? String)?
             .trimmingCharacters(in: .whitespacesAndNewlines)
         self.apiKey = (talk?["apiKey"] as? String)?.trimmingCharacters(in: .whitespacesAndNewlines)
@@ -561,6 +661,7 @@ private struct ElevenLabsClient {
         var req = URLRequest(url: url)
         req.httpMethod = "POST"
         req.httpBody = body
+        req.timeoutInterval = 45
         req.setValue("application/json", forHTTPHeaderField: "Content-Type")
         req.setValue("audio/mpeg", forHTTPHeaderField: "Accept")
         req.setValue(self.apiKey, forHTTPHeaderField: "xi-api-key")
@@ -614,4 +715,10 @@ private enum TalkModeRuntime {
         guard normalized.count == 2, normalized.allSatisfy({ $0 >= "a" && $0 <= "z" }) else { return nil }
         return normalized
     }
+
+    static func validatedOutputFormat(_ value: String?) -> String? {
+        let trimmed = value?.trimmingCharacters(in: .whitespacesAndNewlines) ?? ""
+        guard !trimmed.isEmpty else { return nil }
+        return trimmed.hasPrefix("mp3_") ? trimmed : nil
+    }
 }
diff --git a/apps/ios/Sources/Voice/TalkOrbOverlay.swift b/apps/ios/Sources/Voice/TalkOrbOverlay.swift
index 84a0b56c5..3d7907c6d 100644
--- a/apps/ios/Sources/Voice/TalkOrbOverlay.swift
+++ b/apps/ios/Sources/Voice/TalkOrbOverlay.swift
@@ -38,8 +38,7 @@ struct TalkOrbOverlay: View {
                 .frame(width: 136, height: 136)
                 .overlay(
                     Circle()
-                        .stroke(seam.opacity(0.35), lineWidth: 1)
-                )
+                        .stroke(seam.opacity(0.35), lineWidth: 1))
                 .shadow(color: seam.opacity(0.32), radius: 26, x: 0, y: 0)
                 .shadow(color: Color.black.opacity(0.50), radius: 22, x: 0, y: 10)
         }
@@ -58,9 +57,7 @@ struct TalkOrbOverlay: View {
                         Capsule()
                             .fill(Color.black.opacity(0.40))
                             .overlay(
-                                Capsule().stroke(seam.opacity(0.22), lineWidth: 1)
-                            )
-                    )
+                                Capsule().stroke(seam.opacity(0.22), lineWidth: 1)))
                 }
             }
             .padding(28)
@@ -71,4 +68,3 @@ struct TalkOrbOverlay: View {
         .accessibilityLabel("Talk Mode \(status)")
     }
 }
-
diff --git a/apps/ios/SwiftSources.input.xcfilelist b/apps/ios/SwiftSources.input.xcfilelist
index 81d42dce1..5b71e678f 100644
--- a/apps/ios/SwiftSources.input.xcfilelist
+++ b/apps/ios/SwiftSources.input.xcfilelist
@@ -57,3 +57,4 @@ Sources/Voice/VoiceWakePreferences.swift
 ../shared/ClawdisKit/Sources/ClawdisKit/TalkDirective.swift
 ../../Swabble/Sources/SwabbleKit/WakeWordGate.swift
 Sources/Voice/TalkModeManager.swift
+Sources/Voice/TalkOrbOverlay.swift
diff --git a/apps/macos/Sources/Clawdis/TalkAudioPlayer.swift b/apps/macos/Sources/Clawdis/TalkAudioPlayer.swift
index 713ede40d..6a61b5881 100644
--- a/apps/macos/Sources/Clawdis/TalkAudioPlayer.swift
+++ b/apps/macos/Sources/Clawdis/TalkAudioPlayer.swift
@@ -106,8 +106,13 @@ final class TalkAudioPlayer: NSObject, @preconcurrency AVAudioPlayerDelegate {
     }
 
     private func stopInternal() {
-        self.playback?.cancelWatchdog()
-        self.playback = nil
+        if let playback = self.playback {
+            let interruptedAt = self.player?.currentTime
+            self.finish(
+                playback: playback,
+                result: TalkPlaybackResult(finished: false, interruptedAt: interruptedAt))
+            return
+        }
         self.player?.stop()
         self.player = nil
     }
diff --git a/apps/macos/Sources/Clawdis/TalkModeRuntime.swift b/apps/macos/Sources/Clawdis/TalkModeRuntime.swift
index 7f5280805..e090cfebe 100644
--- a/apps/macos/Sources/Clawdis/TalkModeRuntime.swift
+++ b/apps/macos/Sources/Clawdis/TalkModeRuntime.swift
@@ -11,16 +11,37 @@ actor TalkModeRuntime {
     private let logger = Logger(subsystem: "com.steipete.clawdis", category: "talk.runtime")
     private let ttsLogger = Logger(subsystem: "com.steipete.clawdis", category: "talk.tts")
 
+    private final class RMSMeter: @unchecked Sendable {
+        private let lock = NSLock()
+        private var latestRMS: Double = 0
+
+        func set(_ rms: Double) {
+            self.lock.lock()
+            self.latestRMS = rms
+            self.lock.unlock()
+        }
+
+        func get() -> Double {
+            self.lock.lock()
+            let value = self.latestRMS
+            self.lock.unlock()
+            return value
+        }
+    }
+
     private var recognizer: SFSpeechRecognizer?
     private var audioEngine: AVAudioEngine?
     private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest?
     private var recognitionTask: SFSpeechRecognitionTask?
     private var recognitionGeneration: Int = 0
+    private var rmsTask: Task<Void, Never>?
+    private let rmsMeter = RMSMeter()
     private var captureTask: Task<Void, Never>?
     private var silenceTask: Task<Void, Never>?
 
     private var phase: TalkModePhase = .idle
     private var isEnabled = false
+    private var lifecycleGeneration: Int = 0
     private var lastHeard: Date?
     private var noiseFloorRMS: Double = 1e-4
@@ -49,6 +70,7 @@ actor TalkModeRuntime {
     func setEnabled(_ enabled: Bool) async {
         guard enabled != self.isEnabled else { return }
         self.isEnabled = enabled
+        self.lifecycleGeneration &+= 1
         if enabled {
             await self.start()
         } else {
@@ -56,14 +78,21 @@ actor TalkModeRuntime {
         }
     }
 
+    private func isCurrent(_ generation: Int) -> Bool {
+        generation == self.lifecycleGeneration && self.isEnabled
+    }
+
     private func start() async {
+        let gen = self.lifecycleGeneration
         guard voiceWakeSupported else { return }
         guard PermissionManager.voiceWakePermissionsGranted() else {
             self.logger.debug("talk runtime not starting: permissions missing")
             return
         }
         await self.reloadConfig()
+        guard self.isCurrent(gen) else { return }
         await self.startRecognition()
+        guard self.isCurrent(gen) else { return }
         self.phase = .listening
         await MainActor.run { TalkModeController.shared.updatePhase(.listening) }
         self.startSilenceMonitor()
@@ -74,12 +103,15 @@
         self.captureTask = nil
         self.silenceTask?.cancel()
         self.silenceTask = nil
+
+        // Stop audio before changing phase (stopSpeaking is gated on .speaking).
+        await self.stopSpeaking(reason: .manual)
+
         self.lastTranscript = ""
         self.lastHeard = nil
         self.lastSpeechEnergyAt = nil
         self.phase = .idle
         await self.stopRecognition()
-        await self.stopSpeaking(reason: .manual)
         await MainActor.run {
             TalkModeController.shared.updateLevel(0)
             TalkModeController.shared.updatePhase(.idle)
@@ -120,12 +152,11 @@
         let input = audioEngine.inputNode
         let format = input.outputFormat(forBus: 0)
         input.removeTap(onBus: 0)
-        input.installTap(onBus: 0, bufferSize: 2048, format: format) { [weak self, weak request] buffer, _ in
+        let meter = self.rmsMeter
+        input.installTap(onBus: 0, bufferSize: 2048, format: format) { [weak request, meter] buffer, _ in
             request?.append(buffer)
             if let rms = Self.rmsLevel(buffer: buffer) {
-                Task.detached { [weak self] in
-                    await self?.noteAudioLevel(rms: rms)
-                }
+                meter.set(rms)
             }
         }
@@ -137,6 +168,8 @@
             return
         }
 
+        self.startRMSTicker(meter: meter)
+
         self.recognitionTask = recognizer.recognitionTask(with: request) { [weak self, generation] result, error in
             guard let self else { return }
             let segments = result?.bestTranscription.segments ?? []
@@ -161,6 +194,19 @@
         self.audioEngine?.stop()
         self.audioEngine = nil
         self.recognizer = nil
+        self.rmsTask?.cancel()
+        self.rmsTask = nil
+    }
+
+    private func startRMSTicker(meter: RMSMeter) {
+        self.rmsTask?.cancel()
+        self.rmsTask = Task { [weak self, meter] in
+            while let self {
+                try? await Task.sleep(nanoseconds: 50_000_000)
+                if Task.isCancelled { return }
+                await self.noteAudioLevel(rms: meter.get())
+            }
+        }
     }
 
     private func handleRecognition(_ update: RecognitionUpdate) async {
@@ -241,43 +287,42 @@
     // MARK: - Gateway + TTS
 
     private func sendAndSpeak(_ transcript: String) async {
+        let gen = self.lifecycleGeneration
         await self.reloadConfig()
+        guard self.isCurrent(gen) else { return }
         let prompt = self.buildPrompt(transcript: transcript)
+        let sessionKey = await GatewayConnection.shared.mainSessionKey()
         let runId = UUID().uuidString
         let startedAt = Date().timeIntervalSince1970
-        self.logger.info("talk send start runId=\(runId, privacy: .public) chars=\(prompt.count, privacy: .public)")
+        self.logger.info(
+            "talk send start runId=\(runId, privacy: .public) session=\(sessionKey, privacy: .public) chars=\(prompt.count, privacy: .public)")
 
         do {
             let response = try await GatewayConnection.shared.chatSend(
-                sessionKey: "main",
+                sessionKey: sessionKey,
                 message: prompt,
                 thinking: "low",
                 idempotencyKey: runId,
                 attachments: [])
-            self.logger.info("talk chat.send ok runId=\(response.runId, privacy: .public)")
-            let completion = await self.waitForChatCompletion(
-                runId: response.runId,
-                timeoutSeconds: 120)
-            self.logger.info("talk chat completion runId=\(response.runId, privacy: .public) state=\(String(describing: completion), privacy: .public)")
-            guard completion == .final else {
-                await self.startListening()
-                await self.startRecognition()
-                return
-            }
+            guard self.isCurrent(gen) else { return }
+            self.logger.info(
+                "talk chat.send ok runId=\(response.runId, privacy: .public) session=\(sessionKey, privacy: .public)")
             guard let assistantText = await self.waitForAssistantText(
-                sessionKey: "main",
+                sessionKey: sessionKey,
                 since: startedAt,
-                timeoutSeconds: 12)
+                timeoutSeconds: 45)
             else {
-                self.logger.warning("talk assistant text missing after completion")
+                self.logger.warning("talk assistant text missing after timeout")
                 await self.startListening()
                 await self.startRecognition()
                 return
             }
+            guard self.isCurrent(gen) else { return }
             self.logger.info("talk assistant text len=\(assistantText.count, privacy: .public)")
             await self.playAssistant(text: assistantText)
+            guard self.isCurrent(gen) else { return }
             await self.startListening()
             await self.startRecognition()
             return
@@ -306,54 +351,6 @@
         return lines.joined(separator: "\n")
     }
 
-    private enum ChatCompletionState: CustomStringConvertible {
-        case final
-        case aborted
-        case error
-        case timeout
-
-        var description: String {
-            switch self {
-            case .final: return "final"
-            case .aborted: return "aborted"
-            case .error: return "error"
-            case .timeout: return "timeout"
-            }
-        }
-    }
-
-    private func waitForChatCompletion(runId: String, timeoutSeconds: Int) async -> ChatCompletionState {
-        let stream = await GatewayConnection.shared.subscribe()
-        return await withTaskGroup(of: ChatCompletionState.self) { group in
-            group.addTask { [runId] in
-                for await push in stream {
-                    if case let .event(evt) = push, evt.event == "chat", let payload = evt.payload {
-                        if let chat = try? JSONDecoder().decode(
-                            ClawdisChatEventPayload.self,
-                            from: JSONEncoder().encode(payload))
-                        {
-                            guard chat.runId == runId else { continue }
-                            switch chat.state {
-                            case .some("final"): return .final
-                            case .some("aborted"): return .aborted
-                            case .some("error"): return .error
-                            default: break
-                            }
-                        }
-                    }
-                }
-                return .timeout
-            }
-            group.addTask {
-                try? await Task.sleep(nanoseconds: UInt64(timeoutSeconds) * 1_000_000_000)
-                return .timeout
-            }
-            let result = await group.next() ?? .timeout
-            group.cancelAll()
-            return result
-        }
-    }
-
     private func waitForAssistantText(
         sessionKey: String,
         since: Double,
@@ -394,10 +391,12 @@
     }
 
     private func playAssistant(text: String) async {
+        let gen = self.lifecycleGeneration
         let parse = TalkDirectiveParser.parse(text)
         let directive = parse.directive
         let cleaned = parse.stripped.trimmingCharacters(in: .whitespacesAndNewlines)
         guard !cleaned.isEmpty else { return }
+        guard self.isCurrent(gen) else { return }
 
         if !parse.unknownKeys.isEmpty {
             self.logger.warning("talk directive ignored keys: \(parse.unknownKeys.joined(separator: ","), privacy: .public)")
@@ -435,9 +434,11 @@
             self.logger.error("talk missing voiceId; set talk.voiceId or ELEVENLABS_VOICE_ID")
             return
         }
+        guard self.isCurrent(gen) else { return }
         self.ttsLogger.info("talk TTS request voiceId=\(voiceId, privacy: .public) chars=\(cleaned.count, privacy: .public)")
 
         await self.startRecognition()
+        guard self.isCurrent(gen) else { return }
         await MainActor.run { TalkModeController.shared.updatePhase(.speaking) }
         self.phase = .speaking
         self.lastSpokenText = cleaned
@@ -450,7 +451,7 @@
             let request = ElevenLabsRequest(
                 text: cleaned,
                 modelId: directive?.modelId ?? self.currentModelId ?? self.defaultModelId,
-                outputFormat: directive?.outputFormat ?? self.defaultOutputFormat,
+                outputFormat: Self.validatedOutputFormat(directive?.outputFormat ?? self.defaultOutputFormat, logger: self.logger),
                 speed: resolvedSpeed,
                 stability: Self.validatedUnit(directive?.stability, name: "stability", logger: self.logger),
                 similarity: Self.validatedUnit(directive?.similarity, name: "similarity", logger: self.logger),
@@ -479,6 +480,7 @@
                 group.cancelAll()
                 return data
             }
+            guard self.isCurrent(gen) else { return }
             self.ttsLogger.info("talk TTS response bytes=\(audio.count, privacy: .public)")
             let result = await TalkAudioPlayer.shared.play(data: audio)
             self.ttsLogger.info("talk audio result finished=\(result.finished, privacy: .public) interruptedAt=\(String(describing: result.interruptedAt), privacy: .public)")
@@ -491,8 +493,10 @@
             self.logger.error("talk TTS failed: \(error.localizedDescription, privacy: .public)")
         }
 
-        self.phase = .thinking
-        await MainActor.run { TalkModeController.shared.updatePhase(.thinking) }
+        if self.phase == .speaking {
+            self.phase = .thinking
+            await MainActor.run { TalkModeController.shared.updatePhase(.thinking) }
+        }
     }
 
     private func resolveVoiceId(preferred: String?, apiKey: String) async -> String? {
@@ -523,11 +527,18 @@
     }
 
     func stopSpeaking(reason: TalkStopReason) async {
-        guard self.phase == .speaking else { return }
         let interruptedAt = await MainActor.run { TalkAudioPlayer.shared.stop() }
+        guard self.phase == .speaking else { return }
         if reason == .speech, let interruptedAt {
             self.lastInterruptedAtSeconds = interruptedAt
         }
+        if reason == .manual {
+            return
+        }
+        if reason == .speech || reason == .userTap {
+            await self.startListening()
+            return
+        }
         self.phase = .thinking
         await MainActor.run { TalkModeController.shared.updatePhase(.thinking) }
     }
@@ -718,6 +729,16 @@
         }
         return normalized
     }
+
+    private static func validatedOutputFormat(_ value: String?, logger: Logger) -> String? {
+        let trimmed = value?.trimmingCharacters(in: .whitespacesAndNewlines) ?? ""
+        guard !trimmed.isEmpty else { return nil }
+        guard trimmed.hasPrefix("mp3_") else {
+            logger.warning("talk output_format unsupported for local playback: \(trimmed, privacy: .public)")
+            return nil
+        }
+        return trimmed
+    }
 }
 
 private struct ElevenLabsRequest {
diff --git a/apps/macos/Tests/ClawdisIPCTests/TalkAudioPlayerTests.swift b/apps/macos/Tests/ClawdisIPCTests/TalkAudioPlayerTests.swift
index e0278e4bb..8654f03e3 100644
--- a/apps/macos/Tests/ClawdisIPCTests/TalkAudioPlayerTests.swift
+++ b/apps/macos/Tests/ClawdisIPCTests/TalkAudioPlayerTests.swift
@@ -14,6 +14,24 @@ import Testing
 
         #expect(true)
     }
+
+    @MainActor
+    @Test func playDoesNotHangWhenPlayIsCalledTwice() async throws {
+        let wav = makeWav16Mono(sampleRate: 8000, samples: 800)
+        defer { _ = TalkAudioPlayer.shared.stop() }
+
+        let first = Task { @MainActor in
+            await TalkAudioPlayer.shared.play(data: wav)
+        }
+
+        await Task.yield()
+        _ = await TalkAudioPlayer.shared.play(data: wav)
+
+        _ = try await withTimeout(seconds: 2.0) {
+            await first.value
+        }
+        #expect(true)
+    }
 }
 
 private struct TimeoutError: Error {}