From 53eccc1c1ecd4fe8f71719bf10c8a9239cab370a Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Tue, 30 Dec 2025 00:17:10 +0100 Subject: [PATCH] fix: wire talk menu + mac build --- AGENTS.md | 1 + .../Sources/Clawdis/ConfigSettings.swift | 23 ++++++++--- .../Sources/Clawdis/MenuContentView.swift | 20 ++++----- .../Sources/Clawdis/TalkAudioPlayer.swift | 2 +- .../Sources/Clawdis/TalkModeRuntime.swift | 41 ++++++++++--------- 5 files changed, 48 insertions(+), 39 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index 0abed7948..8226882d7 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -41,6 +41,7 @@ - Also read the shared guardrails at `~/Projects/oracle/AGENTS.md` and `~/Projects/agent-scripts/AGENTS.MD` before making changes; align with any cross-repo rules noted there. - SwiftUI state management (iOS/macOS): prefer the `Observation` framework (`@Observable`, `@Bindable`) over `ObservableObject`/`@StateObject`; don’t introduce new `ObservableObject` unless required for compatibility, and migrate existing usages when touching related code. - **Restart apps:** “restart iOS/Android apps” means rebuild (recompile/install) and relaunch, not just kill/launch. +- **Device checks:** before testing, verify connected real devices (iOS/Android) before reaching for simulators/emulators. - iOS Team ID lookup: `security find-identity -p codesigning -v` → use Apple Development (…) TEAMID. Fallback: `defaults read com.apple.dt.Xcode IDEProvisioningTeamIdentifiers`. - A2UI bundle hash: `src/canvas-host/a2ui/.bundle.hash` is auto-generated; regenerate via `pnpm canvas:a2ui:bundle` (or `scripts/bundle-a2ui.sh`) instead of manual conflict resolution. - Notary key file lives at `~/Library/CloudStorage/Dropbox/Backup/AppStore/AuthKey_NJF3NFGTS3.p8` (Sparkle keys live under `~/Library/CloudStorage/Dropbox/Backup/Sparkle`). diff --git a/apps/macos/Sources/Clawdis/ConfigSettings.swift b/apps/macos/Sources/Clawdis/ConfigSettings.swift index cbbf04d5a..7c0867d79 100644 --- a/apps/macos/Sources/Clawdis/ConfigSettings.swift +++ b/apps/macos/Sources/Clawdis/ConfigSettings.swift @@ -277,14 +277,25 @@ struct ConfigSettings: View { GridRow { self.gridLabel("Voice ID") VStack(alignment: .leading, spacing: 6) { - ComboBox("ElevenLabs voice ID", text: self.$talkVoiceId) { - ForEach(self.talkVoiceSuggestions, id: \.self) { value in - Text(value).tag(value) + HStack(spacing: 8) { + TextField("ElevenLabs voice ID", text: self.$talkVoiceId) + .textFieldStyle(.roundedBorder) + .frame(maxWidth: .infinity) + .onChange(of: self.talkVoiceId) { _, _ in self.autosaveConfig() } + if !self.talkVoiceSuggestions.isEmpty { + Menu { + ForEach(self.talkVoiceSuggestions, id: \.self) { value in + Button(value) { + self.talkVoiceId = value + self.autosaveConfig() + } + } + } label: { + Label("Suggestions", systemImage: "chevron.up.chevron.down") + } + .fixedSize() } } - .textFieldStyle(.roundedBorder) - .frame(maxWidth: .infinity) - .onChange(of: self.talkVoiceId) { _, _ in self.autosaveConfig() } Text("Defaults to ELEVENLABS_VOICE_ID / SAG_VOICE_ID if unset.") .font(.footnote) .foregroundStyle(.secondary) diff --git a/apps/macos/Sources/Clawdis/MenuContentView.swift b/apps/macos/Sources/Clawdis/MenuContentView.swift index dee70ed5d..e1453e5a2 100644 --- a/apps/macos/Sources/Clawdis/MenuContentView.swift +++ b/apps/macos/Sources/Clawdis/MenuContentView.swift @@ -80,11 +80,6 @@ struct MenuContent: View { if self.showVoiceWakeMicPicker { self.voiceWakeMicMenu } - Toggle(isOn: self.talkBinding) { - Label("Talk", systemImage: "bubble.left.and.waveform") - } - .disabled(!voiceWakeSupported) - .opacity(voiceWakeSupported ? 1 : 0.5) Divider() Button { Task { @MainActor in @@ -115,6 +110,13 @@ struct MenuContent: View { systemImage: "rectangle.inset.filled.on.rectangle") } } + Button { + Task { await self.state.setTalkEnabled(!self.state.talkEnabled) } + } label: { + Label(self.state.talkEnabled ? "Stop Talk Mode" : "Talk Mode", systemImage: "bubble.left.and.waveform") + } + .disabled(!voiceWakeSupported) + .opacity(voiceWakeSupported ? 1 : 0.5) Divider() Button("Settings…") { self.open(tab: .general) } .keyboardShortcut(",", modifiers: [.command]) @@ -344,14 +346,6 @@ struct MenuContent: View { }) } - private var talkBinding: Binding { - Binding( - get: { self.state.talkEnabled }, - set: { newValue in - Task { await self.state.setTalkEnabled(newValue) } - }) - } - private var showVoiceWakeMicPicker: Bool { voiceWakeSupported && self.state.swabbleEnabled } diff --git a/apps/macos/Sources/Clawdis/TalkAudioPlayer.swift b/apps/macos/Sources/Clawdis/TalkAudioPlayer.swift index f72de1d02..b1df3886b 100644 --- a/apps/macos/Sources/Clawdis/TalkAudioPlayer.swift +++ b/apps/macos/Sources/Clawdis/TalkAudioPlayer.swift @@ -3,7 +3,7 @@ import Foundation import OSLog @MainActor -final class TalkAudioPlayer: NSObject, AVAudioPlayerDelegate { +final class TalkAudioPlayer: NSObject, @preconcurrency AVAudioPlayerDelegate { static let shared = TalkAudioPlayer() private let logger = Logger(subsystem: "com.steipete.clawdis", category: "talk.tts") diff --git a/apps/macos/Sources/Clawdis/TalkModeRuntime.swift b/apps/macos/Sources/Clawdis/TalkModeRuntime.swift index 955d9ceda..3be350d9d 100644 --- a/apps/macos/Sources/Clawdis/TalkModeRuntime.swift +++ b/apps/macos/Sources/Clawdis/TalkModeRuntime.swift @@ -87,9 +87,9 @@ actor TalkModeRuntime { private struct RecognitionUpdate { let transcript: String? - let segments: [SFTranscriptionSegment] + let hasConfidence: Bool let isFinal: Bool - let error: Error? + let errorDescription: String? let generation: Int } @@ -136,12 +136,13 @@ actor TalkModeRuntime { self.recognitionTask = recognizer.recognitionTask(with: request) { [weak self, generation] result, error in guard let self else { return } + let segments = result?.bestTranscription.segments ?? [] let transcript = result?.bestTranscription.formattedString let update = RecognitionUpdate( transcript: transcript, - segments: result?.bestTranscription.segments ?? [], + hasConfidence: segments.contains { $0.confidence > 0.6 }, isFinal: result?.isFinal ?? false, - error: error, + errorDescription: error?.localizedDescription, generation: generation) Task { await self.handleRecognition(update) } } @@ -161,14 +162,14 @@ actor TalkModeRuntime { private func handleRecognition(_ update: RecognitionUpdate) async { guard update.generation == self.recognitionGeneration else { return } - if let error = update.error { - self.logger.debug("talk recognition error: \(error.localizedDescription, privacy: .public)") + if let errorDescription = update.errorDescription { + self.logger.debug("talk recognition error: \(errorDescription, privacy: .public)") } guard let transcript = update.transcript else { return } let trimmed = transcript.trimmingCharacters(in: .whitespacesAndNewlines) if self.phase == .speaking, self.interruptOnSpeech { - if await self.shouldInterrupt(transcript: trimmed, segments: update.segments) { + if await self.shouldInterrupt(transcript: trimmed, hasConfidence: update.hasConfidence) { await self.stopSpeaking(reason: .speech) self.lastTranscript = "" self.lastHeard = nil @@ -194,11 +195,14 @@ actor TalkModeRuntime { private func startSilenceMonitor() { self.silenceTask?.cancel() self.silenceTask = Task { [weak self] in - guard let self else { return } - while self.isEnabled { - try? await Task.sleep(nanoseconds: 200_000_000) - await self.checkSilence() - } + await self?.silenceLoop() + } + } + + private func silenceLoop() async { + while self.isEnabled { + try? await Task.sleep(nanoseconds: 200_000_000) + await self.checkSilence() } } @@ -297,9 +301,9 @@ actor TalkModeRuntime { } private func waitForChatCompletion(runId: String, timeoutSeconds: Int) async -> ChatCompletionState { - await withTaskGroup(of: ChatCompletionState.self) { group in + let stream = await GatewayConnection.shared.subscribe() + return await withTaskGroup(of: ChatCompletionState.self) { group in group.addTask { [runId] in - let stream = GatewayConnection.shared.subscribe() for await push in stream { if case let .event(evt) = push, evt.event == "chat", let payload = evt.payload { if let chat = try? JSONDecoder().decode( @@ -332,13 +336,13 @@ actor TalkModeRuntime { do { let history = try await GatewayConnection.shared.chatHistory(sessionKey: sessionKey) let messages = history.messages ?? [] - let decoded = messages.compactMap { item in + let decoded: [ClawdisChatMessage] = messages.compactMap { item in guard let data = try? JSONEncoder().encode(item) else { return nil } return try? JSONDecoder().decode(ClawdisChatMessage.self, from: data) } guard let assistant = decoded.last(where: { $0.role == "assistant" }) else { return nil } let text = assistant.content.compactMap { $0.text }.joined(separator: "\n") - let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines) + let trimmed = text.trimmingCharacters(in: CharacterSet.whitespacesAndNewlines) return trimmed.isEmpty ? nil : trimmed } catch { self.logger.error("talk history fetch failed: \(error.localizedDescription, privacy: .public)") @@ -418,7 +422,7 @@ actor TalkModeRuntime { let audio = try await ElevenLabsClient(apiKey: apiKey).synthesize( voiceId: voiceId, request: request) - let result = await MainActor.run { await TalkAudioPlayer.shared.play(data: audio) } + let result = await TalkAudioPlayer.shared.play(data: audio) if !result.finished, let interruptedAt = result.interruptedAt, self.phase == .speaking { if self.interruptOnSpeech { self.lastInterruptedAtSeconds = interruptedAt @@ -533,7 +537,7 @@ actor TalkModeRuntime { return sqrt(sum / Double(frameCount)) } - private func shouldInterrupt(transcript: String, segments: [SFTranscriptionSegment]) async -> Bool { + private func shouldInterrupt(transcript: String, hasConfidence: Bool) async -> Bool { let trimmed = transcript.trimmingCharacters(in: .whitespacesAndNewlines) guard trimmed.count >= 3 else { return false } if self.isLikelyEcho(of: trimmed) { return false } @@ -541,7 +545,6 @@ actor TalkModeRuntime { if let lastSpeechEnergyAt, now.timeIntervalSince(lastSpeechEnergyAt) > 0.35 { return false } - let hasConfidence = segments.contains { $0.confidence > 0.6 } return hasConfidence }