From 862a490038664ab6559e3a4147c29c2e53b92bba Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sun, 14 Dec 2025 00:48:09 +0000 Subject: [PATCH 01/10] feat(ios): pulse settings indicator --- apps/ios/Sources/RootTabs.swift | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/apps/ios/Sources/RootTabs.swift b/apps/ios/Sources/RootTabs.swift index c87392974..3b450647b 100644 --- a/apps/ios/Sources/RootTabs.swift +++ b/apps/ios/Sources/RootTabs.swift @@ -2,6 +2,7 @@ import SwiftUI struct RootTabs: View { @EnvironmentObject private var appModel: NodeAppModel + @State private var isConnectingPulse: Bool = false var body: some View { TabView { @@ -27,12 +28,18 @@ struct RootTabs: View { radius: self.settingsIndicatorGlowRadius, x: 0, y: 0) + .scaleEffect(self.settingsIndicatorScale) + .opacity(self.settingsIndicatorOpacity) .offset(x: 7, y: -2) } Text("Settings") } } } + .onAppear { self.updateConnectingPulse(for: self.bridgeIndicatorState) } + .onChange(of: self.bridgeIndicatorState) { _, newValue in + self.updateConnectingPulse(for: newValue) + } } private enum BridgeIndicatorState { @@ -74,9 +81,31 @@ struct RootTabs: View { case .connected: 6 case .connecting: - 4 + self.isConnectingPulse ? 6 : 3 case .disconnected: 0 } } + + private var settingsIndicatorScale: CGFloat { + guard self.bridgeIndicatorState == .connecting else { return 1 } + return self.isConnectingPulse ? 1.12 : 0.96 + } + + private var settingsIndicatorOpacity: Double { + guard self.bridgeIndicatorState == .connecting else { return 1 } + return self.isConnectingPulse ? 
1.0 : 0.75 + } + + private func updateConnectingPulse(for state: BridgeIndicatorState) { + guard state == .connecting else { + withAnimation(.easeOut(duration: 0.2)) { self.isConnectingPulse = false } + return + } + + guard !self.isConnectingPulse else { return } + withAnimation(.easeInOut(duration: 0.9).repeatForever(autoreverses: true)) { + self.isConnectingPulse = true + } + } } From 2454e67e09d2283f434af7da5551b08dea69673a Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sun, 14 Dec 2025 00:48:16 +0000 Subject: [PATCH 02/10] feat(ios): reconnect to last discovered gateway --- .../Bridge/BridgeConnectionController.swift | 16 +++++++-- .../Sources/Bridge/BridgeSettingsStore.swift | 33 +++++++++++++++++++ apps/ios/Sources/Settings/SettingsTab.swift | 3 ++ docs/ios/connect.md | 4 +-- 4 files changed, 51 insertions(+), 5 deletions(-) diff --git a/apps/ios/Sources/Bridge/BridgeConnectionController.swift b/apps/ios/Sources/Bridge/BridgeConnectionController.swift index 6361093de..390ae41fa 100644 --- a/apps/ios/Sources/Bridge/BridgeConnectionController.swift +++ b/apps/ios/Sources/Bridge/BridgeConnectionController.swift @@ -13,6 +13,7 @@ final class BridgeConnectionController: ObservableObject { private weak var appModel: NodeAppModel? private var cancellables = Set() private var didAutoConnect = false + private var seenStableIDs = Set() init(appModel: NodeAppModel) { self.appModel = appModel @@ -23,6 +24,7 @@ final class BridgeConnectionController: ObservableObject { .sink { [weak self] newValue in guard let self else { return } self.bridges = newValue + self.updateLastDiscoveredBridge(from: newValue) self.maybeAutoConnect() } .store(in: &self.cancellables) @@ -50,9 +52,9 @@ final class BridgeConnectionController: ObservableObject { guard appModel.bridgeServerName == nil else { return } let defaults = UserDefaults.standard - let preferredStableID = defaults.string(forKey: "bridge.preferredStableID")? 
+ let targetStableID = defaults.string(forKey: "bridge.lastDiscoveredStableID")? .trimmingCharacters(in: .whitespacesAndNewlines) ?? "" - guard !preferredStableID.isEmpty else { return } + guard !targetStableID.isEmpty else { return } let instanceId = defaults.string(forKey: "node.instanceId")? .trimmingCharacters(in: .whitespacesAndNewlines) ?? "" @@ -64,12 +66,20 @@ final class BridgeConnectionController: ObservableObject { .trimmingCharacters(in: .whitespacesAndNewlines) ?? "" guard !token.isEmpty else { return } - guard let target = self.bridges.first(where: { $0.stableID == preferredStableID }) else { return } + guard let target = self.bridges.first(where: { $0.stableID == targetStableID }) else { return } self.didAutoConnect = true appModel.connectToBridge(endpoint: target.endpoint, hello: self.makeHello(token: token)) } + private func updateLastDiscoveredBridge(from bridges: [BridgeDiscoveryModel.DiscoveredBridge]) { + let newlyDiscovered = bridges.filter { self.seenStableIDs.insert($0.stableID).inserted } + guard let last = newlyDiscovered.last else { return } + + UserDefaults.standard.set(last.stableID, forKey: "bridge.lastDiscoveredStableID") + BridgeSettingsStore.saveLastDiscoveredBridgeStableID(last.stableID) + } + private func makeHello(token: String) -> BridgeHello { let defaults = UserDefaults.standard let nodeId = defaults.string(forKey: "node.instanceId") ?? 
"ios-node" diff --git a/apps/ios/Sources/Bridge/BridgeSettingsStore.swift b/apps/ios/Sources/Bridge/BridgeSettingsStore.swift index 653d56280..f73a02637 100644 --- a/apps/ios/Sources/Bridge/BridgeSettingsStore.swift +++ b/apps/ios/Sources/Bridge/BridgeSettingsStore.swift @@ -6,13 +6,16 @@ enum BridgeSettingsStore { private static let instanceIdDefaultsKey = "node.instanceId" private static let preferredBridgeStableIDDefaultsKey = "bridge.preferredStableID" + private static let lastDiscoveredBridgeStableIDDefaultsKey = "bridge.lastDiscoveredStableID" private static let instanceIdAccount = "instanceId" private static let preferredBridgeStableIDAccount = "preferredStableID" + private static let lastDiscoveredBridgeStableIDAccount = "lastDiscoveredStableID" static func bootstrapPersistence() { self.ensureStableInstanceID() self.ensurePreferredBridgeStableID() + self.ensureLastDiscoveredBridgeStableID() } static func loadStableInstanceID() -> String? { @@ -36,6 +39,18 @@ enum BridgeSettingsStore { account: self.preferredBridgeStableIDAccount) } + static func loadLastDiscoveredBridgeStableID() -> String? { + KeychainStore.loadString(service: self.bridgeService, account: self.lastDiscoveredBridgeStableIDAccount)? + .trimmingCharacters(in: .whitespacesAndNewlines) + } + + static func saveLastDiscoveredBridgeStableID(_ stableID: String) { + _ = KeychainStore.saveString( + stableID, + service: self.bridgeService, + account: self.lastDiscoveredBridgeStableIDAccount) + } + private static func ensureStableInstanceID() { let defaults = UserDefaults.standard @@ -76,4 +91,22 @@ enum BridgeSettingsStore { defaults.set(stored, forKey: self.preferredBridgeStableIDDefaultsKey) } } + + private static func ensureLastDiscoveredBridgeStableID() { + let defaults = UserDefaults.standard + + if let existing = defaults.string(forKey: self.lastDiscoveredBridgeStableIDDefaultsKey)? 
+ .trimmingCharacters(in: .whitespacesAndNewlines), + !existing.isEmpty + { + if self.loadLastDiscoveredBridgeStableID() == nil { + self.saveLastDiscoveredBridgeStableID(existing) + } + return + } + + if let stored = self.loadLastDiscoveredBridgeStableID(), !stored.isEmpty { + defaults.set(stored, forKey: self.lastDiscoveredBridgeStableIDDefaultsKey) + } + } } diff --git a/apps/ios/Sources/Settings/SettingsTab.swift b/apps/ios/Sources/Settings/SettingsTab.swift index d5304d698..de01a2548 100644 --- a/apps/ios/Sources/Settings/SettingsTab.swift +++ b/apps/ios/Sources/Settings/SettingsTab.swift @@ -19,6 +19,7 @@ struct SettingsTab: View { @AppStorage("voiceWake.enabled") private var voiceWakeEnabled: Bool = false @AppStorage("camera.enabled") private var cameraEnabled: Bool = true @AppStorage("bridge.preferredStableID") private var preferredBridgeStableID: String = "" + @AppStorage("bridge.lastDiscoveredStableID") private var lastDiscoveredBridgeStableID: String = "" @StateObject private var connectStatus = ConnectStatusStore() @State private var connectingBridgeID: String? @State private var localIPAddress: String? @@ -207,6 +208,8 @@ struct SettingsTab: View { self.connectingBridgeID = bridge.id self.preferredBridgeStableID = bridge.stableID BridgeSettingsStore.savePreferredBridgeStableID(bridge.stableID) + self.lastDiscoveredBridgeStableID = bridge.stableID + BridgeSettingsStore.saveLastDiscoveredBridgeStableID(bridge.stableID) defer { self.connectingBridgeID = nil } do { diff --git a/docs/ios/connect.md b/docs/ios/connect.md index 8917a3b25..c4aa856a7 100644 --- a/docs/ios/connect.md +++ b/docs/ios/connect.md @@ -54,13 +54,13 @@ More debugging notes: `docs/bonjour.md`. In Iris: - Pick the discovered bridge (or hit refresh). - If not paired yet, Iris will initiate pairing automatically. -- After the first successful pairing, Iris will auto-reconnect to the **last bridge** on launch (including after reinstall), as long as the iOS Keychain entry is still present. 
+- After the first successful pairing, Iris will auto-reconnect **strictly to the last discovered gateway** on launch (including after reinstall), as long as the iOS Keychain entry is still present. ### Connection indicator (always visible) The Settings tab icon shows a small status dot: - **Green**: connected to the bridge -- **Yellow**: connecting +- **Yellow**: connecting (subtle pulse) - **Red**: not connected / error ## 4) Approve pairing (CLI) From a92eb1f33d0952f15a16f8b5ad2edaf32d359738 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sun, 14 Dec 2025 00:48:58 +0000 Subject: [PATCH 03/10] feat(camera): add snap/clip capture --- .../ios/Sources/Camera/CameraController.swift | 319 ++++++++++++++++ apps/ios/Sources/Info.plist | 2 + apps/ios/Sources/Model/NodeAppModel.swift | 60 ++- apps/ios/Sources/Voice/VoiceWakeManager.swift | 31 ++ apps/ios/project.yml | 1 + .../Clawdis/CameraCaptureService.swift | 341 ++++++++++++++++++ apps/macos/Sources/Clawdis/Constants.swift | 1 + .../Clawdis/ControlRequestHandler.swift | 58 +++ .../macos/Sources/Clawdis/DebugSettings.swift | 16 + .../macos/Sources/ClawdisCLI/ClawdisCLI.swift | 71 ++++ apps/macos/Sources/ClawdisIPC/IPC.swift | 42 +++ .../ClawdisIPCTests/CameraIPCTests.swift | 62 ++++ .../Sources/ClawdisKit/CameraCommands.swift | 58 +++ docs/camera.md | 98 +++++ scripts/package-mac-app.sh | 2 + src/cli/nodes-camera.test.ts | 64 ++++ src/cli/nodes-camera.ts | 92 +++++ src/cli/nodes-cli.ts | 211 +++++++++++ src/cli/program.test.ts | 142 ++++++++ 19 files changed, 1669 insertions(+), 2 deletions(-) create mode 100644 apps/ios/Sources/Camera/CameraController.swift create mode 100644 apps/macos/Sources/Clawdis/CameraCaptureService.swift create mode 100644 apps/macos/Tests/ClawdisIPCTests/CameraIPCTests.swift create mode 100644 apps/shared/ClawdisKit/Sources/ClawdisKit/CameraCommands.swift create mode 100644 docs/camera.md create mode 100644 src/cli/nodes-camera.test.ts create mode 100644 src/cli/nodes-camera.ts diff 
--git a/apps/ios/Sources/Camera/CameraController.swift b/apps/ios/Sources/Camera/CameraController.swift new file mode 100644 index 000000000..5579307b9 --- /dev/null +++ b/apps/ios/Sources/Camera/CameraController.swift @@ -0,0 +1,319 @@ +import AVFoundation +import ClawdisKit +import Foundation +import UIKit + +actor CameraController { + enum CameraError: LocalizedError, Sendable { + case cameraUnavailable + case microphoneUnavailable + case permissionDenied(kind: String) + case invalidParams(String) + case captureFailed(String) + case exportFailed(String) + + var errorDescription: String? { + switch self { + case .cameraUnavailable: + "Camera unavailable" + case .microphoneUnavailable: + "Microphone unavailable" + case let .permissionDenied(kind): + "\(kind) permission denied" + case let .invalidParams(msg): + msg + case let .captureFailed(msg): + msg + case let .exportFailed(msg): + msg + } + } + } + + func snap(params: ClawdisCameraSnapParams) async throws -> ( + format: String, + base64: String, + width: Int, + height: Int) + { + let facing = params.facing ?? .front + let maxWidth = params.maxWidth.flatMap { $0 > 0 ? 
$0 : nil } + let quality = Self.clampQuality(params.quality) + + try await self.ensureAccess(for: .video) + + let session = AVCaptureSession() + session.sessionPreset = .photo + + guard let device = Self.pickCamera(facing: facing) else { + throw CameraError.cameraUnavailable + } + + let input = try AVCaptureDeviceInput(device: device) + guard session.canAddInput(input) else { + throw CameraError.captureFailed("Failed to add camera input") + } + session.addInput(input) + + let output = AVCapturePhotoOutput() + guard session.canAddOutput(output) else { + throw CameraError.captureFailed("Failed to add photo output") + } + session.addOutput(output) + output.maxPhotoQualityPrioritization = .quality + + session.startRunning() + defer { session.stopRunning() } + + let settings: AVCapturePhotoSettings = { + if output.availablePhotoCodecTypes.contains(.jpeg) { + return AVCapturePhotoSettings(format: [AVVideoCodecKey: AVVideoCodecType.jpeg]) + } + return AVCapturePhotoSettings() + }() + settings.photoQualityPrioritization = .quality + + let rawData: Data = try await withCheckedThrowingContinuation { cont in + output.capturePhoto(with: settings, delegate: PhotoCaptureDelegate(cont)) + } + + let (finalData, size) = try Self.reencodeJPEG( + imageData: rawData, + maxWidth: maxWidth, + quality: quality) + + return ( + format: "jpg", + base64: finalData.base64EncodedString(), + width: Int(size.width.rounded()), + height: Int(size.height.rounded())) + } + + func clip(params: ClawdisCameraClipParams) async throws -> ( + format: String, + base64: String, + durationMs: Int, + hasAudio: Bool) + { + let facing = params.facing ?? .front + let durationMs = Self.clampDurationMs(params.durationMs) + let includeAudio = params.includeAudio ?? 
true + + try await self.ensureAccess(for: .video) + if includeAudio { + try await self.ensureAccess(for: .audio) + } + + let session = AVCaptureSession() + session.sessionPreset = .high + + guard let camera = Self.pickCamera(facing: facing) else { + throw CameraError.cameraUnavailable + } + let cameraInput = try AVCaptureDeviceInput(device: camera) + guard session.canAddInput(cameraInput) else { + throw CameraError.captureFailed("Failed to add camera input") + } + session.addInput(cameraInput) + + if includeAudio { + guard let mic = AVCaptureDevice.default(for: .audio) else { + throw CameraError.microphoneUnavailable + } + let micInput = try AVCaptureDeviceInput(device: mic) + if session.canAddInput(micInput) { + session.addInput(micInput) + } else { + throw CameraError.captureFailed("Failed to add microphone input") + } + } + + let output = AVCaptureMovieFileOutput() + guard session.canAddOutput(output) else { + throw CameraError.captureFailed("Failed to add movie output") + } + session.addOutput(output) + output.maxRecordedDuration = CMTime(value: Int64(durationMs), timescale: 1000) + + session.startRunning() + defer { session.stopRunning() } + + let movURL = FileManager.default.temporaryDirectory + .appendingPathComponent("clawdis-camera-\(UUID().uuidString).mov") + let mp4URL = FileManager.default.temporaryDirectory + .appendingPathComponent("clawdis-camera-\(UUID().uuidString).mp4") + + defer { + try? FileManager.default.removeItem(at: movURL) + try? FileManager.default.removeItem(at: mp4URL) + } + + let recordedURL: URL = try await withCheckedThrowingContinuation { cont in + let delegate = MovieFileDelegate(cont) + output.startRecording(to: movURL, recordingDelegate: delegate) + } + + // Transcode .mov -> .mp4 for easier downstream handling. 
+ try await Self.exportToMP4(inputURL: recordedURL, outputURL: mp4URL) + + let data = try Data(contentsOf: mp4URL) + return (format: "mp4", base64: data.base64EncodedString(), durationMs: durationMs, hasAudio: includeAudio) + } + + private func ensureAccess(for mediaType: AVMediaType) async throws { + let status = AVCaptureDevice.authorizationStatus(for: mediaType) + switch status { + case .authorized: + return + case .notDetermined: + let ok = await withCheckedContinuation(isolation: nil) { cont in + AVCaptureDevice.requestAccess(for: mediaType) { granted in + cont.resume(returning: granted) + } + } + if !ok { + throw CameraError.permissionDenied(kind: mediaType == .video ? "Camera" : "Microphone") + } + case .denied, .restricted: + throw CameraError.permissionDenied(kind: mediaType == .video ? "Camera" : "Microphone") + @unknown default: + throw CameraError.permissionDenied(kind: mediaType == .video ? "Camera" : "Microphone") + } + } + + private nonisolated static func pickCamera(facing: ClawdisCameraFacing) -> AVCaptureDevice? { + let position: AVCaptureDevice.Position = (facing == .front) ? .front : .back + return AVCaptureDevice.default(.builtInWideAngleCamera, for: .video, position: position) + } + + private nonisolated static func clampQuality(_ quality: Double?) -> Double { + let q = quality ?? 0.9 + return min(1.0, max(0.05, q)) + } + + private nonisolated static func clampDurationMs(_ ms: Int?) -> Int { + let v = ms ?? 3000 + // Keep clips short by default; avoid huge base64 payloads on the bridge. 
+ return min(15000, max(250, v)) + } + + private nonisolated static func reencodeJPEG( + imageData: Data, + maxWidth: Int?, + quality: Double) throws -> (data: Data, size: CGSize) + { + guard let image = UIImage(data: imageData) else { + throw CameraError.captureFailed("Failed to decode captured image") + } + + let finalImage: UIImage = if let maxWidth, maxWidth > 0 { + Self.downscale(image: image, maxWidth: CGFloat(maxWidth)) + } else { + image + } + + guard let out = finalImage.jpegData(compressionQuality: quality) else { + throw CameraError.captureFailed("Failed to encode JPEG") + } + + return (out, finalImage.size) + } + + private nonisolated static func downscale(image: UIImage, maxWidth: CGFloat) -> UIImage { + let w = image.size.width + let h = image.size.height + guard w > 0, h > 0 else { return image } + guard w > maxWidth else { return image } + + let scale = maxWidth / w + let target = CGSize(width: maxWidth, height: max(1, h * scale)) + + let format = UIGraphicsImageRendererFormat.default() + format.opaque = false + let renderer = UIGraphicsImageRenderer(size: target, format: format) + return renderer.image { _ in + image.draw(in: CGRect(origin: .zero, size: target)) + } + } + + private nonisolated static func exportToMP4(inputURL: URL, outputURL: URL) async throws { + let asset = AVAsset(url: inputURL) + guard let exporter = AVAssetExportSession(asset: asset, presetName: AVAssetExportPresetHighestQuality) else { + throw CameraError.exportFailed("Failed to create export session") + } + exporter.outputURL = outputURL + exporter.outputFileType = .mp4 + exporter.shouldOptimizeForNetworkUse = true + + try await withCheckedThrowingContinuation(isolation: nil) { cont in + exporter.exportAsynchronously { + switch exporter.status { + case .completed: + cont.resume(returning: ()) + case .failed: + cont.resume(throwing: exporter.error ?? 
CameraError.exportFailed("Export failed")) + case .cancelled: + cont.resume(throwing: CameraError.exportFailed("Export cancelled")) + default: + cont.resume(throwing: CameraError.exportFailed("Export did not complete")) + } + } + } + } +} + +private final class PhotoCaptureDelegate: NSObject, AVCapturePhotoCaptureDelegate { + private let continuation: CheckedContinuation + private var didResume = false + + init(_ continuation: CheckedContinuation) { + self.continuation = continuation + } + + func photoOutput( + _ output: AVCapturePhotoOutput, + didFinishProcessingPhoto photo: AVCapturePhoto, + error: Error?) + { + guard !self.didResume else { return } + self.didResume = true + + if let error { + self.continuation.resume(throwing: error) + return + } + guard let data = photo.fileDataRepresentation() else { + self.continuation.resume( + throwing: NSError(domain: "Camera", code: 1, userInfo: [ + NSLocalizedDescriptionKey: "photo data missing", + ])) + return + } + self.continuation.resume(returning: data) + } +} + +private final class MovieFileDelegate: NSObject, AVCaptureFileOutputRecordingDelegate { + private let continuation: CheckedContinuation + private var didResume = false + + init(_ continuation: CheckedContinuation) { + self.continuation = continuation + } + + func fileOutput( + _ output: AVCaptureFileOutput, + didFinishRecordingTo outputFileURL: URL, + from connections: [AVCaptureConnection], + error: Error?) + { + guard !self.didResume else { return } + self.didResume = true + + if let error { + self.continuation.resume(throwing: error) + return + } + self.continuation.resume(returning: outputFileURL) + } +} diff --git a/apps/ios/Sources/Info.plist b/apps/ios/Sources/Info.plist index 78f4b34aa..6ed6968b2 100644 --- a/apps/ios/Sources/Info.plist +++ b/apps/ios/Sources/Info.plist @@ -26,6 +26,8 @@ NSLocalNetworkUsageDescription Clawdis discovers and connects to your Clawdis bridge on the local network. 
+ NSCameraUsageDescription + Clawdis can capture photos or short video clips when requested via the bridge. NSMicrophoneUsageDescription Clawdis needs microphone access for voice wake. NSSpeechRecognitionUsageDescription diff --git a/apps/ios/Sources/Model/NodeAppModel.swift b/apps/ios/Sources/Model/NodeAppModel.swift index de20a117b..c581c6b30 100644 --- a/apps/ios/Sources/Model/NodeAppModel.swift +++ b/apps/ios/Sources/Model/NodeAppModel.swift @@ -6,6 +6,7 @@ import SwiftUI final class NodeAppModel: ObservableObject { @Published var isBackgrounded: Bool = false let screen = ScreenController() + let camera = CameraController() @Published var bridgeStatusText: String = "Not connected" @Published var bridgeServerName: String? @Published var bridgeRemoteAddress: String? @@ -182,13 +183,22 @@ final class NodeAppModel: ObservableObject { } private func handleInvoke(_ req: BridgeInvokeRequest) async -> BridgeInvokeResponse { - if req.command.hasPrefix("screen."), self.isBackgrounded { + if req.command.hasPrefix("screen.") || req.command.hasPrefix("camera."), self.isBackgrounded { return BridgeInvokeResponse( id: req.id, ok: false, error: ClawdisNodeError( code: .backgroundUnavailable, - message: "NODE_BACKGROUND_UNAVAILABLE: screen commands require foreground")) + message: "NODE_BACKGROUND_UNAVAILABLE: screen/camera commands require foreground")) + } + + if req.command.hasPrefix("camera."), !self.isCameraEnabled() { + return BridgeInvokeResponse( + id: req.id, + ok: false, + error: ClawdisNodeError( + code: .unavailable, + message: "CAMERA_DISABLED: enable Camera in iOS Settings → Camera → Allow Camera")) } do { @@ -222,6 +232,46 @@ final class NodeAppModel: ObservableObject { let payload = try Self.encodePayload(["format": "png", "base64": base64]) return BridgeInvokeResponse(id: req.id, ok: true, payloadJSON: payload) + case ClawdisCameraCommand.snap.rawValue: + let params = (try? Self.decodeParams(ClawdisCameraSnapParams.self, from: req.paramsJSON)) ?? 
+ ClawdisCameraSnapParams() + let res = try await self.camera.snap(params: params) + + struct Payload: Codable { + var format: String + var base64: String + var width: Int + var height: Int + } + let payload = try Self.encodePayload(Payload( + format: res.format, + base64: res.base64, + width: res.width, + height: res.height)) + return BridgeInvokeResponse(id: req.id, ok: true, payloadJSON: payload) + + case ClawdisCameraCommand.clip.rawValue: + let params = (try? Self.decodeParams(ClawdisCameraClipParams.self, from: req.paramsJSON)) ?? + ClawdisCameraClipParams() + + let suspended = (params.includeAudio ?? true) ? self.voiceWake.suspendForExternalAudioCapture() : false + defer { self.voiceWake.resumeAfterExternalAudioCapture(wasSuspended: suspended) } + + let res = try await self.camera.clip(params: params) + + struct Payload: Codable { + var format: String + var base64: String + var durationMs: Int + var hasAudio: Bool + } + let payload = try Self.encodePayload(Payload( + format: res.format, + base64: res.base64, + durationMs: res.durationMs, + hasAudio: res.hasAudio)) + return BridgeInvokeResponse(id: req.id, ok: true, payloadJSON: payload) + default: return BridgeInvokeResponse( id: req.id, @@ -254,4 +304,10 @@ final class NodeAppModel: ObservableObject { } return json } + + private func isCameraEnabled() -> Bool { + // Default-on: if the key doesn't exist yet, treat it as enabled. + if UserDefaults.standard.object(forKey: "camera.enabled") == nil { return true } + return UserDefaults.standard.bool(forKey: "camera.enabled") + } } diff --git a/apps/ios/Sources/Voice/VoiceWakeManager.swift b/apps/ios/Sources/Voice/VoiceWakeManager.swift index 348d0bd78..2b46c5490 100644 --- a/apps/ios/Sources/Voice/VoiceWakeManager.swift +++ b/apps/ios/Sources/Voice/VoiceWakeManager.swift @@ -205,6 +205,37 @@ final class VoiceWakeManager: NSObject, ObservableObject { try? 
AVAudioSession.sharedInstance().setActive(false, options: .notifyOthersOnDeactivation) } + /// Temporarily releases the microphone so other subsystems (e.g. camera video capture) can record audio. + /// Returns `true` when listening was active and was suspended. + func suspendForExternalAudioCapture() -> Bool { + guard self.isEnabled, self.isListening else { return false } + + self.isListening = false + self.statusText = "Paused" + + self.tapDrainTask?.cancel() + self.tapDrainTask = nil + self.tapQueue?.clear() + self.tapQueue = nil + + self.recognitionTask?.cancel() + self.recognitionTask = nil + self.recognitionRequest = nil + + if self.audioEngine.isRunning { + self.audioEngine.stop() + self.audioEngine.inputNode.removeTap(onBus: 0) + } + + try? AVAudioSession.sharedInstance().setActive(false, options: .notifyOthersOnDeactivation) + return true + } + + func resumeAfterExternalAudioCapture(wasSuspended: Bool) { + guard wasSuspended else { return } + Task { await self.start() } + } + private func startRecognition() throws { self.recognitionTask?.cancel() self.recognitionTask = nil diff --git a/apps/ios/project.yml b/apps/ios/project.yml index bd8d1479e..8281aac17 100644 --- a/apps/ios/project.yml +++ b/apps/ios/project.yml @@ -54,5 +54,6 @@ targets: NSLocalNetworkUsageDescription: Clawdis discovers and connects to your Clawdis bridge on the local network. NSBonjourServices: - _clawdis-bridge._tcp + NSCameraUsageDescription: Clawdis can capture photos or short video clips when requested via the bridge. NSMicrophoneUsageDescription: Clawdis needs microphone access for voice wake. NSSpeechRecognitionUsageDescription: Clawdis uses on-device speech recognition for voice wake. 
diff --git a/apps/macos/Sources/Clawdis/CameraCaptureService.swift b/apps/macos/Sources/Clawdis/CameraCaptureService.swift new file mode 100644 index 000000000..52ce4d53d --- /dev/null +++ b/apps/macos/Sources/Clawdis/CameraCaptureService.swift @@ -0,0 +1,341 @@ +import AVFoundation +import ClawdisIPC +import CoreGraphics +import Foundation +import ImageIO +import OSLog +import UniformTypeIdentifiers + +actor CameraCaptureService { + enum CameraError: LocalizedError, Sendable { + case cameraUnavailable + case microphoneUnavailable + case permissionDenied(kind: String) + case captureFailed(String) + case exportFailed(String) + + var errorDescription: String? { + switch self { + case .cameraUnavailable: + "Camera unavailable" + case .microphoneUnavailable: + "Microphone unavailable" + case let .permissionDenied(kind): + "\(kind) permission denied" + case let .captureFailed(msg): + msg + case let .exportFailed(msg): + msg + } + } + } + + private let logger = Logger(subsystem: "com.steipete.clawdis", category: "camera") + + func snap(facing: CameraFacing?, maxWidth: Int?, quality: Double?) async throws -> (data: Data, size: CGSize) { + let facing = facing ?? .front + let maxWidth = maxWidth.flatMap { $0 > 0 ? 
$0 : nil } + let quality = Self.clampQuality(quality) + + try await self.ensureAccess(for: .video) + + let session = AVCaptureSession() + session.sessionPreset = .photo + + guard let device = Self.pickCamera(facing: facing) else { + throw CameraError.cameraUnavailable + } + + let input = try AVCaptureDeviceInput(device: device) + guard session.canAddInput(input) else { + throw CameraError.captureFailed("Failed to add camera input") + } + session.addInput(input) + + let output = AVCapturePhotoOutput() + guard session.canAddOutput(output) else { + throw CameraError.captureFailed("Failed to add photo output") + } + session.addOutput(output) + output.maxPhotoQualityPrioritization = .quality + + session.startRunning() + defer { session.stopRunning() } + + let settings: AVCapturePhotoSettings = { + if output.availablePhotoCodecTypes.contains(.jpeg) { + return AVCapturePhotoSettings(format: [AVVideoCodecKey: AVVideoCodecType.jpeg]) + } + return AVCapturePhotoSettings() + }() + settings.photoQualityPrioritization = .quality + + let rawData: Data = try await withCheckedThrowingContinuation(isolation: nil) { cont in + output.capturePhoto(with: settings, delegate: PhotoCaptureDelegate(cont)) + } + + return try Self.reencodeJPEG(imageData: rawData, maxWidth: maxWidth, quality: quality) + } + + func clip( + facing: CameraFacing?, + durationMs: Int?, + includeAudio: Bool, + outPath: String?) async throws -> (path: String, durationMs: Int, hasAudio: Bool) + { + let facing = facing ?? 
.front + let durationMs = Self.clampDurationMs(durationMs) + + try await self.ensureAccess(for: .video) + if includeAudio { + try await self.ensureAccess(for: .audio) + } + + let session = AVCaptureSession() + session.sessionPreset = .high + + guard let camera = Self.pickCamera(facing: facing) else { + throw CameraError.cameraUnavailable + } + let cameraInput = try AVCaptureDeviceInput(device: camera) + guard session.canAddInput(cameraInput) else { + throw CameraError.captureFailed("Failed to add camera input") + } + session.addInput(cameraInput) + + if includeAudio { + guard let mic = AVCaptureDevice.default(for: .audio) else { + throw CameraError.microphoneUnavailable + } + let micInput = try AVCaptureDeviceInput(device: mic) + guard session.canAddInput(micInput) else { + throw CameraError.captureFailed("Failed to add microphone input") + } + session.addInput(micInput) + } + + let output = AVCaptureMovieFileOutput() + guard session.canAddOutput(output) else { + throw CameraError.captureFailed("Failed to add movie output") + } + session.addOutput(output) + output.maxRecordedDuration = CMTime(value: Int64(durationMs), timescale: 1000) + + session.startRunning() + defer { session.stopRunning() } + + let tmpMovURL = FileManager.default.temporaryDirectory + .appendingPathComponent("clawdis-camera-\(UUID().uuidString).mov") + defer { try? FileManager.default.removeItem(at: tmpMovURL) } + + let outputURL: URL = { + if let outPath, !outPath.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty { + return URL(fileURLWithPath: outPath) + } + return FileManager.default.temporaryDirectory + .appendingPathComponent("clawdis-camera-\(UUID().uuidString).mp4") + }() + + // Ensure we don't fail exporting due to an existing file. + try? 
FileManager.default.removeItem(at: outputURL) + + let logger = self.logger + let recordedURL: URL = try await withCheckedThrowingContinuation(isolation: nil) { cont in + output.startRecording(to: tmpMovURL, recordingDelegate: MovieFileDelegate(cont, logger: logger)) + } + + try await Self.exportToMP4(inputURL: recordedURL, outputURL: outputURL) + return (path: outputURL.path, durationMs: durationMs, hasAudio: includeAudio) + } + + private func ensureAccess(for mediaType: AVMediaType) async throws { + let status = AVCaptureDevice.authorizationStatus(for: mediaType) + switch status { + case .authorized: + return + case .notDetermined: + let ok = await withCheckedContinuation(isolation: nil) { cont in + AVCaptureDevice.requestAccess(for: mediaType) { granted in + cont.resume(returning: granted) + } + } + if !ok { + throw CameraError.permissionDenied(kind: mediaType == .video ? "Camera" : "Microphone") + } + case .denied, .restricted: + throw CameraError.permissionDenied(kind: mediaType == .video ? "Camera" : "Microphone") + @unknown default: + throw CameraError.permissionDenied(kind: mediaType == .video ? "Camera" : "Microphone") + } + } + + private nonisolated static func pickCamera(facing: CameraFacing) -> AVCaptureDevice? { + let position: AVCaptureDevice.Position = (facing == .front) ? .front : .back + + if let device = AVCaptureDevice.default(.builtInWideAngleCamera, for: .video, position: position) { + return device + } + + // Many macOS cameras report `unspecified` position; fall back to any default. + return AVCaptureDevice.default(for: .video) + } + + private nonisolated static func clampQuality(_ quality: Double?) -> Double { + let q = quality ?? 0.9 + return min(1.0, max(0.05, q)) + } + + private nonisolated static func clampDurationMs(_ ms: Int?) -> Int { + let v = ms ?? 
3000 + return min(15_000, max(250, v)) + } + + private nonisolated static func reencodeJPEG( + imageData: Data, + maxWidth: Int?, + quality: Double) throws -> (data: Data, size: CGSize) + { + guard let src = CGImageSourceCreateWithData(imageData as CFData, nil), + let img = CGImageSourceCreateImageAtIndex(src, 0, nil) + else { + throw CameraError.captureFailed("Failed to decode captured image") + } + + let finalImage: CGImage + if let maxWidth, img.width > maxWidth { + guard let scaled = self.downscale(image: img, maxWidth: maxWidth) else { + throw CameraError.captureFailed("Failed to downscale image") + } + finalImage = scaled + } else { + finalImage = img + } + + let out = NSMutableData() + guard let dest = CGImageDestinationCreateWithData(out, UTType.jpeg.identifier as CFString, 1, nil) else { + throw CameraError.captureFailed("Failed to create JPEG destination") + } + + let props = [kCGImageDestinationLossyCompressionQuality: quality] as CFDictionary + CGImageDestinationAddImage(dest, finalImage, props) + guard CGImageDestinationFinalize(dest) else { + throw CameraError.captureFailed("Failed to encode JPEG") + } + + return (out as Data, CGSize(width: finalImage.width, height: finalImage.height)) + } + + private nonisolated static func downscale(image: CGImage, maxWidth: Int) -> CGImage? 
{ + guard image.width > 0, image.height > 0 else { return image } + guard image.width > maxWidth else { return image } + + let scale = Double(maxWidth) / Double(image.width) + let targetW = maxWidth + let targetH = max(1, Int((Double(image.height) * scale).rounded())) + + let cs = CGColorSpaceCreateDeviceRGB() + let bitmapInfo = CGImageAlphaInfo.premultipliedLast.rawValue + guard let ctx = CGContext( + data: nil, + width: targetW, + height: targetH, + bitsPerComponent: 8, + bytesPerRow: 0, + space: cs, + bitmapInfo: bitmapInfo) + else { return nil } + + ctx.interpolationQuality = .high + ctx.draw(image, in: CGRect(x: 0, y: 0, width: targetW, height: targetH)) + return ctx.makeImage() + } + + private nonisolated static func exportToMP4(inputURL: URL, outputURL: URL) async throws { + let asset = AVAsset(url: inputURL) + guard let export = AVAssetExportSession(asset: asset, presetName: AVAssetExportPresetMediumQuality) else { + throw CameraError.exportFailed("Failed to create export session") + } + export.outputURL = outputURL + export.outputFileType = .mp4 + export.shouldOptimizeForNetworkUse = true + + await withCheckedContinuation { cont in + export.exportAsynchronously { + cont.resume() + } + } + + switch export.status { + case .completed: + return + case .failed: + throw CameraError.exportFailed(export.error?.localizedDescription ?? "export failed") + case .cancelled: + throw CameraError.exportFailed("export cancelled") + default: + throw CameraError.exportFailed("export did not complete (\(export.status.rawValue))") + } + } +} + +private final class PhotoCaptureDelegate: NSObject, AVCapturePhotoCaptureDelegate { + private var cont: CheckedContinuation? + + init(_ cont: CheckedContinuation) { + self.cont = cont + } + + func photoOutput( + _ output: AVCapturePhotoOutput, + didFinishProcessingPhoto photo: AVCapturePhoto, + error: Error?) 
+ { + guard let cont else { return } + self.cont = nil + if let error { + cont.resume(throwing: error) + return + } + guard let data = photo.fileDataRepresentation() else { + cont.resume(throwing: CameraCaptureService.CameraError.captureFailed("No photo data")) + return + } + cont.resume(returning: data) + } +} + +private final class MovieFileDelegate: NSObject, AVCaptureFileOutputRecordingDelegate { + private var cont: CheckedContinuation? + private let logger: Logger + + init(_ cont: CheckedContinuation, logger: Logger) { + self.cont = cont + self.logger = logger + } + + func fileOutput( + _ output: AVCaptureFileOutput, + didFinishRecordingTo outputFileURL: URL, + from connections: [AVCaptureConnection], + error: Error?) + { + guard let cont else { return } + self.cont = nil + + if let error { + let ns = error as NSError + if ns.domain == AVFoundationErrorDomain, + ns.code == AVError.maximumDurationReached.rawValue + { + cont.resume(returning: outputFileURL) + return + } + + self.logger.error("camera record failed: \(error.localizedDescription, privacy: .public)") + cont.resume(throwing: error) + return + } + + cont.resume(returning: outputFileURL) + } +} diff --git a/apps/macos/Sources/Clawdis/Constants.swift b/apps/macos/Sources/Clawdis/Constants.swift index c4538365c..dc0965425 100644 --- a/apps/macos/Sources/Clawdis/Constants.swift +++ b/apps/macos/Sources/Clawdis/Constants.swift @@ -24,6 +24,7 @@ let webChatEnabledKey = "clawdis.webChatEnabled" let webChatSwiftUIEnabledKey = "clawdis.webChatSwiftUIEnabled" let webChatPortKey = "clawdis.webChatPort" let canvasEnabledKey = "clawdis.canvasEnabled" +let cameraEnabledKey = "clawdis.cameraEnabled" let peekabooBridgeEnabledKey = "clawdis.peekabooBridgeEnabled" let deepLinkAgentEnabledKey = "clawdis.deepLinkAgentEnabled" let deepLinkKeyKey = "clawdis.deepLinkKey" diff --git a/apps/macos/Sources/Clawdis/ControlRequestHandler.swift b/apps/macos/Sources/Clawdis/ControlRequestHandler.swift index a847ce62c..e17dddafa 
100644 --- a/apps/macos/Sources/Clawdis/ControlRequestHandler.swift +++ b/apps/macos/Sources/Clawdis/ControlRequestHandler.swift @@ -3,6 +3,8 @@ import Foundation import OSLog enum ControlRequestHandler { + private static let cameraCapture = CameraCaptureService() + static func process( request: Request, notifier: NotificationManager = NotificationManager(), @@ -77,6 +79,16 @@ enum ControlRequestHandler { command: command, paramsJSON: paramsJSON, logger: logger) + + case let .cameraSnap(facing, maxWidth, quality, outPath): + return await self.handleCameraSnap(facing: facing, maxWidth: maxWidth, quality: quality, outPath: outPath) + + case let .cameraClip(facing, durationMs, includeAudio, outPath): + return await self.handleCameraClip( + facing: facing, + durationMs: durationMs, + includeAudio: includeAudio, + outPath: outPath) } } @@ -173,6 +185,10 @@ enum ControlRequestHandler { UserDefaults.standard.object(forKey: canvasEnabledKey) as? Bool ?? true } + private static func cameraEnabled() -> Bool { + UserDefaults.standard.object(forKey: cameraEnabledKey) as? Bool ?? false + } + private static func handleCanvasShow( session: String, path: String?, @@ -254,4 +270,46 @@ enum ControlRequestHandler { return Response(ok: false, message: error.localizedDescription) } } + + private static func handleCameraSnap( + facing: CameraFacing?, + maxWidth: Int?, + quality: Double?, + outPath: String?) 
async -> Response + { + guard self.cameraEnabled() else { return Response(ok: false, message: "Camera disabled by user") } + do { + let res = try await self.cameraCapture.snap(facing: facing, maxWidth: maxWidth, quality: quality) + let url: URL = if let outPath, !outPath.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty { + URL(fileURLWithPath: outPath) + } else { + FileManager.default.temporaryDirectory + .appendingPathComponent("clawdis-camera-snap-\(UUID().uuidString).jpg") + } + + try res.data.write(to: url, options: [.atomic]) + return Response(ok: true, message: url.path) + } catch { + return Response(ok: false, message: error.localizedDescription) + } + } + + private static func handleCameraClip( + facing: CameraFacing?, + durationMs: Int?, + includeAudio: Bool, + outPath: String?) async -> Response + { + guard self.cameraEnabled() else { return Response(ok: false, message: "Camera disabled by user") } + do { + let res = try await self.cameraCapture.clip( + facing: facing, + durationMs: durationMs, + includeAudio: includeAudio, + outPath: outPath) + return Response(ok: true, message: res.path) + } catch { + return Response(ok: false, message: error.localizedDescription) + } + } } diff --git a/apps/macos/Sources/Clawdis/DebugSettings.swift b/apps/macos/Sources/Clawdis/DebugSettings.swift index 6b79e24b7..abd5d0e44 100644 --- a/apps/macos/Sources/Clawdis/DebugSettings.swift +++ b/apps/macos/Sources/Clawdis/DebugSettings.swift @@ -9,6 +9,7 @@ struct DebugSettings: View { @AppStorage(modelCatalogReloadKey) private var modelCatalogReloadBump: Int = 0 @AppStorage(iconOverrideKey) private var iconOverrideRaw: String = IconOverrideSelection.system.rawValue @AppStorage(canvasEnabledKey) private var canvasEnabled: Bool = true + @AppStorage(cameraEnabledKey) private var cameraEnabled: Bool = false @AppStorage(deepLinkAgentEnabledKey) private var deepLinkAgentEnabled: Bool = false @State private var modelsCount: Int? 
@State private var modelsLoading = false @@ -48,6 +49,7 @@ struct DebugSettings: View { self.pathsSection self.quickActionsSection self.canvasSection + self.cameraSection self.experimentsSection Spacer(minLength: 0) @@ -571,6 +573,20 @@ struct DebugSettings: View { } } + private var cameraSection: some View { + GroupBox("Camera") { + VStack(alignment: .leading, spacing: 10) { + Toggle("Allow Camera (agent)", isOn: self.$cameraEnabled) + .toggleStyle(.checkbox) + .help("When off, camera requests return “Camera disabled by user”.") + + Text("Allows Clawdis to capture a photo or short video via the built-in camera.") + .font(.caption) + .foregroundStyle(.secondary) + } + } + } + private var experimentsSection: some View { GroupBox("Experiments") { Grid(alignment: .leadingFirstTextBaseline, horizontalSpacing: 14, verticalSpacing: 10) { diff --git a/apps/macos/Sources/ClawdisCLI/ClawdisCLI.swift b/apps/macos/Sources/ClawdisCLI/ClawdisCLI.swift index fa4100422..b1c203ef1 100644 --- a/apps/macos/Sources/ClawdisCLI/ClawdisCLI.swift +++ b/apps/macos/Sources/ClawdisCLI/ClawdisCLI.swift @@ -52,6 +52,7 @@ struct ClawdisCLI { enum Kind { case generic + case mediaPath } } @@ -91,6 +92,9 @@ struct ClawdisCLI { case "canvas": return try self.parseCanvas(args: &args) + case "camera": + return try self.parseCamera(args: &args) + default: throw CLIError.help } @@ -292,6 +296,62 @@ struct ClawdisCLI { } } + private static func parseCamera(args: inout [String]) throws -> ParsedCLIRequest { + guard let sub = args.popFirst() else { throw CLIError.help } + switch sub { + case "snap": + var facing: CameraFacing? + var maxWidth: Int? + var quality: Double? + var outPath: String? 
+ while !args.isEmpty { + let arg = args.removeFirst() + switch arg { + case "--facing": + if let val = args.popFirst(), let f = CameraFacing(rawValue: val) { facing = f } + case "--max-width": + maxWidth = args.popFirst().flatMap(Int.init) + case "--quality": + quality = args.popFirst().flatMap(Double.init) + case "--out": + outPath = args.popFirst() + default: + break + } + } + return ParsedCLIRequest( + request: .cameraSnap(facing: facing, maxWidth: maxWidth, quality: quality, outPath: outPath), + kind: .mediaPath) + + case "clip": + var facing: CameraFacing? + var durationMs: Int? + var includeAudio = true + var outPath: String? + while !args.isEmpty { + let arg = args.removeFirst() + switch arg { + case "--facing": + if let val = args.popFirst(), let f = CameraFacing(rawValue: val) { facing = f } + case "--duration-ms": + durationMs = args.popFirst().flatMap(Int.init) + case "--no-audio": + includeAudio = false + case "--out": + outPath = args.popFirst() + default: + break + } + } + return ParsedCLIRequest( + request: .cameraClip(facing: facing, durationMs: durationMs, includeAudio: includeAudio, outPath: outPath), + kind: .mediaPath) + + default: + throw CLIError.help + } + } + private static func parseCanvasPlacement( args: inout [String], session: inout String, @@ -334,6 +394,10 @@ struct ClawdisCLI { if let message = response.message, !message.isEmpty { FileHandle.standardOutput.write(Data((message + "\n").utf8)) } + case .mediaPath: + if let message = response.message, !message.isEmpty { + print("MEDIA:\(message)") + } } } @@ -352,6 +416,8 @@ struct ClawdisCLI { output["payload"] = text } } + case .mediaPath: + break } let json = try JSONSerialization.data(withJSONObject: output, options: [.prettyPrinted]) @@ -406,6 +472,10 @@ struct ClawdisCLI { clawdis-mac canvas eval --js [--session ] clawdis-mac canvas snapshot [--out ] [--session ] + Camera: + clawdis-mac camera snap [--facing ] [--max-width ] [--quality <0-1>] [--out ] + clawdis-mac camera clip 
[--facing ] [--duration-ms ] [--no-audio] [--out ] + Browser (clawd): clawdis-mac browser status|start|stop|tabs|open|focus|close|screenshot|eval|query|dom|snapshot @@ -433,6 +503,7 @@ struct ClawdisCLI { Output: Default output is text. Use --json for machine-readable output. In text mode, `browser screenshot` prints MEDIA:. + In text mode, `camera snap` and `camera clip` print MEDIA:. """ print(usage) } diff --git a/apps/macos/Sources/ClawdisIPC/IPC.swift b/apps/macos/Sources/ClawdisIPC/IPC.swift index feb062bdc..6aadc0185 100644 --- a/apps/macos/Sources/ClawdisIPC/IPC.swift +++ b/apps/macos/Sources/ClawdisIPC/IPC.swift @@ -13,6 +13,11 @@ public enum Capability: String, Codable, CaseIterable, Sendable { case speechRecognition } +public enum CameraFacing: String, Codable, Sendable { + case front + case back +} + // MARK: - Requests /// Notification interruption level (maps to UNNotificationInterruptionLevel) @@ -74,6 +79,8 @@ public enum Request: Sendable { case canvasSnapshot(session: String, outPath: String?) case nodeList case nodeInvoke(nodeId: String, command: String, paramsJSON: String?) + case cameraSnap(facing: CameraFacing?, maxWidth: Int?, quality: Double?, outPath: String?) + case cameraClip(facing: CameraFacing?, durationMs: Int?, includeAudio: Bool, outPath: String?) 
} // MARK: - Responses @@ -104,6 +111,11 @@ extension Request: Codable { case path case javaScript case outPath + case facing + case maxWidth + case quality + case durationMs + case includeAudio case placement case nodeId case nodeCommand @@ -124,6 +136,8 @@ extension Request: Codable { case canvasSnapshot case nodeList case nodeInvoke + case cameraSnap + case cameraClip } public func encode(to encoder: Encoder) throws { @@ -198,6 +212,20 @@ extension Request: Codable { try container.encode(nodeId, forKey: .nodeId) try container.encode(command, forKey: .nodeCommand) try container.encodeIfPresent(paramsJSON, forKey: .paramsJSON) + + case let .cameraSnap(facing, maxWidth, quality, outPath): + try container.encode(Kind.cameraSnap, forKey: .type) + try container.encodeIfPresent(facing, forKey: .facing) + try container.encodeIfPresent(maxWidth, forKey: .maxWidth) + try container.encodeIfPresent(quality, forKey: .quality) + try container.encodeIfPresent(outPath, forKey: .outPath) + + case let .cameraClip(facing, durationMs, includeAudio, outPath): + try container.encode(Kind.cameraClip, forKey: .type) + try container.encodeIfPresent(facing, forKey: .facing) + try container.encodeIfPresent(durationMs, forKey: .durationMs) + try container.encode(includeAudio, forKey: .includeAudio) + try container.encodeIfPresent(outPath, forKey: .outPath) } } @@ -274,6 +302,20 @@ extension Request: Codable { let command = try container.decode(String.self, forKey: .nodeCommand) let paramsJSON = try container.decodeIfPresent(String.self, forKey: .paramsJSON) self = .nodeInvoke(nodeId: nodeId, command: command, paramsJSON: paramsJSON) + + case .cameraSnap: + let facing = try container.decodeIfPresent(CameraFacing.self, forKey: .facing) + let maxWidth = try container.decodeIfPresent(Int.self, forKey: .maxWidth) + let quality = try container.decodeIfPresent(Double.self, forKey: .quality) + let outPath = try container.decodeIfPresent(String.self, forKey: .outPath) + self = .cameraSnap(facing: 
facing, maxWidth: maxWidth, quality: quality, outPath: outPath) + + case .cameraClip: + let facing = try container.decodeIfPresent(CameraFacing.self, forKey: .facing) + let durationMs = try container.decodeIfPresent(Int.self, forKey: .durationMs) + let includeAudio = (try? container.decode(Bool.self, forKey: .includeAudio)) ?? true + let outPath = try container.decodeIfPresent(String.self, forKey: .outPath) + self = .cameraClip(facing: facing, durationMs: durationMs, includeAudio: includeAudio, outPath: outPath) } } } diff --git a/apps/macos/Tests/ClawdisIPCTests/CameraIPCTests.swift b/apps/macos/Tests/ClawdisIPCTests/CameraIPCTests.swift new file mode 100644 index 000000000..a5121ab56 --- /dev/null +++ b/apps/macos/Tests/ClawdisIPCTests/CameraIPCTests.swift @@ -0,0 +1,62 @@ +import ClawdisIPC +import Foundation +import Testing + +@Suite struct CameraIPCTests { + @Test func cameraSnapCodableRoundtrip() throws { + let req: Request = .cameraSnap( + facing: .front, + maxWidth: 640, + quality: 0.85, + outPath: "/tmp/test.jpg") + + let data = try JSONEncoder().encode(req) + let decoded = try JSONDecoder().decode(Request.self, from: data) + + switch decoded { + case let .cameraSnap(facing, maxWidth, quality, outPath): + #expect(facing == .front) + #expect(maxWidth == 640) + #expect(quality == 0.85) + #expect(outPath == "/tmp/test.jpg") + default: + Issue.record("expected cameraSnap, got \(decoded)") + } + } + + @Test func cameraClipCodableRoundtrip() throws { + let req: Request = .cameraClip( + facing: .back, + durationMs: 3000, + includeAudio: false, + outPath: "/tmp/test.mp4") + + let data = try JSONEncoder().encode(req) + let decoded = try JSONDecoder().decode(Request.self, from: data) + + switch decoded { + case let .cameraClip(facing, durationMs, includeAudio, outPath): + #expect(facing == .back) + #expect(durationMs == 3000) + #expect(includeAudio == false) + #expect(outPath == "/tmp/test.mp4") + default: + Issue.record("expected cameraClip, got \(decoded)") + } + 
} + + @Test func cameraClipDefaultsIncludeAudioToTrueWhenMissing() throws { + let json = """ + {"type":"cameraClip","durationMs":1234} + """ + let decoded = try JSONDecoder().decode(Request.self, from: Data(json.utf8)) + switch decoded { + case let .cameraClip(_, durationMs, includeAudio, _): + #expect(durationMs == 1234) + #expect(includeAudio == true) + default: + Issue.record("expected cameraClip, got \(decoded)") + } + } +} + diff --git a/apps/shared/ClawdisKit/Sources/ClawdisKit/CameraCommands.swift b/apps/shared/ClawdisKit/Sources/ClawdisKit/CameraCommands.swift new file mode 100644 index 000000000..dd2c2015d --- /dev/null +++ b/apps/shared/ClawdisKit/Sources/ClawdisKit/CameraCommands.swift @@ -0,0 +1,58 @@ +import Foundation + +public enum ClawdisCameraCommand: String, Codable, Sendable { + case snap = "camera.snap" + case clip = "camera.clip" +} + +public enum ClawdisCameraFacing: String, Codable, Sendable { + case back + case front +} + +public enum ClawdisCameraImageFormat: String, Codable, Sendable { + case jpg + case jpeg +} + +public enum ClawdisCameraVideoFormat: String, Codable, Sendable { + case mp4 +} + +public struct ClawdisCameraSnapParams: Codable, Sendable, Equatable { + public var facing: ClawdisCameraFacing? + public var maxWidth: Int? + public var quality: Double? + public var format: ClawdisCameraImageFormat? + + public init( + facing: ClawdisCameraFacing? = nil, + maxWidth: Int? = nil, + quality: Double? = nil, + format: ClawdisCameraImageFormat? = nil) + { + self.facing = facing + self.maxWidth = maxWidth + self.quality = quality + self.format = format + } +} + +public struct ClawdisCameraClipParams: Codable, Sendable, Equatable { + public var facing: ClawdisCameraFacing? + public var durationMs: Int? + public var includeAudio: Bool? + public var format: ClawdisCameraVideoFormat? + + public init( + facing: ClawdisCameraFacing? = nil, + durationMs: Int? = nil, + includeAudio: Bool? = nil, + format: ClawdisCameraVideoFormat? 
= nil) + { + self.facing = facing + self.durationMs = durationMs + self.includeAudio = includeAudio + self.format = format + } +} diff --git a/docs/camera.md b/docs/camera.md new file mode 100644 index 000000000..dc5ab93db --- /dev/null +++ b/docs/camera.md @@ -0,0 +1,98 @@ +--- +summary: "Camera capture (iOS node + macOS app) for agent use: photos (jpg) and short video clips (mp4)" +read_when: + - Adding or modifying camera capture on iOS nodes or macOS + - Extending agent-accessible MEDIA temp-file workflows +--- + +# Camera capture (agent) + +Clawdis supports **camera capture** for agent workflows: + +- **iOS node** (paired via Gateway): capture a **photo** (`jpg`) or **short video clip** (`mp4`, with optional audio) via `node.invoke`. +- **macOS app** (local control socket): capture a **photo** (`jpg`) or **short video clip** (`mp4`, with optional audio) via `clawdis-mac`. + +All camera access is gated behind **user-controlled settings**. + +## iOS node + +### User setting (default on) + +- iOS Settings tab → **Camera** → **Allow Camera** (`camera.enabled`) + - Default: **on** (missing key is treated as enabled). + - When off: `camera.*` commands return `CAMERA_DISABLED`. + +### Commands (via Gateway `node.invoke`) + +- `camera.snap` + - Params: + - `facing`: `front|back` (default: `front`) + - `maxWidth`: number (optional) + - `quality`: `0..1` (optional; default `0.9`) + - `format`: currently `jpg` + - Response payload: + - `format: "jpg"` + - `base64: "<...>"` + - `width`, `height` + +- `camera.clip` + - Params: + - `facing`: `front|back` (default: `front`) + - `durationMs`: number (default `3000`, clamped to a max) + - `includeAudio`: boolean (default `true`) + - `format`: currently `mp4` + - Response payload: + - `format: "mp4"` + - `base64: "<...>"` + - `durationMs` + - `hasAudio` + +### Foreground requirement + +Like `screen.*`, the iOS node only allows `camera.*` commands in the **foreground**. 
Background invocations return `NODE_BACKGROUND_UNAVAILABLE`. + +### CLI helper (temp files + MEDIA) + +The easiest way to get attachments is via the CLI helper, which writes decoded media to a temp file and prints `MEDIA:`. + +Examples: + +```bash +clawdis nodes camera snap --node # default: both front + back (2 MEDIA lines) +clawdis nodes camera snap --node --facing front +clawdis nodes camera clip --node --duration 3000 +clawdis nodes camera clip --node --no-audio +``` + +Notes: +- `nodes camera snap` defaults to **both** facings to give the agent both views. +- Output files are temporary (in the OS temp directory) unless you build your own wrapper. + +## macOS app + +### User setting (default off) + +The macOS companion app exposes a checkbox: + +- **Settings → Debug → Camera → Allow Camera (agent)** (`clawdis.cameraEnabled`) + - Default: **off** + - When off: camera requests return “Camera disabled by user”. + +### CLI helper (local control socket) + +The `clawdis-mac` helper talks to the running menu bar app over the local control socket. + +Examples: + +```bash +clawdis-mac camera snap # prints MEDIA: +clawdis-mac camera snap --max-width 1280 +clawdis-mac camera clip --duration-ms 3000 # prints MEDIA: +clawdis-mac camera clip --no-audio +``` + +## Safety + practical limits + +- Camera and microphone access trigger the usual OS permission prompts (and require usage strings in Info.plist). +- Video clips are intentionally short to avoid oversized bridge payloads (base64 overhead + WebSocket message limits). + diff --git a/scripts/package-mac-app.sh b/scripts/package-mac-app.sh index a3bd8b56d..fc084f1e4 100755 --- a/scripts/package-mac-app.sh +++ b/scripts/package-mac-app.sh @@ -98,6 +98,8 @@ cat > "$APP_ROOT/Contents/Info.plist" <Clawdis needs notification permission to show alerts for agent actions. NSScreenCaptureDescription Clawdis captures the screen when the agent needs screenshots for context. 
+ NSCameraUsageDescription + Clawdis can capture photos or short video clips when requested by the agent. NSMicrophoneUsageDescription Clawdis needs the mic for Voice Wake tests and agent audio capture. NSSpeechRecognitionUsageDescription diff --git a/src/cli/nodes-camera.test.ts b/src/cli/nodes-camera.test.ts new file mode 100644 index 000000000..cab0a91c1 --- /dev/null +++ b/src/cli/nodes-camera.test.ts @@ -0,0 +1,64 @@ +import * as fs from "node:fs/promises"; +import * as os from "node:os"; +import * as path from "node:path"; +import { describe, expect, it } from "vitest"; +import { + cameraTempPath, + parseCameraClipPayload, + parseCameraSnapPayload, + writeBase64ToFile, +} from "./nodes-camera.js"; + +describe("nodes camera helpers", () => { + it("parses camera.snap payload", () => { + expect( + parseCameraSnapPayload({ + format: "jpg", + base64: "aGk=", + width: 10, + height: 20, + }), + ).toEqual({ format: "jpg", base64: "aGk=", width: 10, height: 20 }); + }); + + it("rejects invalid camera.snap payload", () => { + expect(() => parseCameraSnapPayload({ format: "jpg" })).toThrow( + /invalid camera\.snap payload/i, + ); + }); + + it("parses camera.clip payload", () => { + expect( + parseCameraClipPayload({ + format: "mp4", + base64: "AAEC", + durationMs: 1234, + hasAudio: true, + }), + ).toEqual({ + format: "mp4", + base64: "AAEC", + durationMs: 1234, + hasAudio: true, + }); + }); + + it("builds stable temp paths when id provided", () => { + const p = cameraTempPath({ + kind: "snap", + facing: "front", + ext: "jpg", + tmpDir: "/tmp", + id: "id1", + }); + expect(p).toBe(path.join("/tmp", "clawdis-camera-snap-front-id1.jpg")); + }); + + it("writes base64 to file", async () => { + const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdis-test-")); + const out = path.join(dir, "x.bin"); + await writeBase64ToFile(out, "aGk="); + await expect(fs.readFile(out, "utf8")).resolves.toBe("hi"); + await fs.rm(dir, { recursive: true, force: true }); + }); +}); diff --git 
a/src/cli/nodes-camera.ts b/src/cli/nodes-camera.ts new file mode 100644 index 000000000..a7a8150ab --- /dev/null +++ b/src/cli/nodes-camera.ts @@ -0,0 +1,92 @@ +import { randomUUID } from "node:crypto"; +import * as fs from "node:fs/promises"; +import * as os from "node:os"; +import * as path from "node:path"; + +export type CameraFacing = "front" | "back"; + +export type CameraSnapPayload = { + format: string; + base64: string; + width: number; + height: number; +}; + +export type CameraClipPayload = { + format: string; + base64: string; + durationMs: number; + hasAudio: boolean; +}; + +function asRecord(value: unknown): Record { + return typeof value === "object" && value !== null + ? (value as Record) + : {}; +} + +function asString(value: unknown): string | undefined { + return typeof value === "string" ? value : undefined; +} + +function asNumber(value: unknown): number | undefined { + return typeof value === "number" && Number.isFinite(value) + ? value + : undefined; +} + +function asBoolean(value: unknown): boolean | undefined { + return typeof value === "boolean" ? 
value : undefined; +} + +export function parseCameraSnapPayload(value: unknown): CameraSnapPayload { + const obj = asRecord(value); + const format = asString(obj.format); + const base64 = asString(obj.base64); + const width = asNumber(obj.width); + const height = asNumber(obj.height); + if (!format || !base64 || width === undefined || height === undefined) { + throw new Error("invalid camera.snap payload"); + } + return { format, base64, width, height }; +} + +export function parseCameraClipPayload(value: unknown): CameraClipPayload { + const obj = asRecord(value); + const format = asString(obj.format); + const base64 = asString(obj.base64); + const durationMs = asNumber(obj.durationMs); + const hasAudio = asBoolean(obj.hasAudio); + if ( + !format || + !base64 || + durationMs === undefined || + hasAudio === undefined + ) { + throw new Error("invalid camera.clip payload"); + } + return { format, base64, durationMs, hasAudio }; +} + +export function cameraTempPath(opts: { + kind: "snap" | "clip"; + facing?: CameraFacing; + ext: string; + tmpDir?: string; + id?: string; +}) { + const tmpDir = opts.tmpDir ?? os.tmpdir(); + const id = opts.id ?? randomUUID(); + const facingPart = opts.facing ? `-${opts.facing}` : ""; + const ext = opts.ext.startsWith(".") ? 
opts.ext : `.${opts.ext}`; + return path.join( + tmpDir, + `clawdis-camera-${opts.kind}${facingPart}-${id}${ext}`, + ); +} + +export async function writeBase64ToFile(filePath: string, base64: string) { + const buf = Buffer.from(base64, "base64"); + await fs.writeFile(filePath, buf); + return { path: filePath, bytes: buf.length }; +} diff --git a/src/cli/nodes-cli.ts b/src/cli/nodes-cli.ts index 669ab5c51..1bfa931b9 100644 --- a/src/cli/nodes-cli.ts +++ b/src/cli/nodes-cli.ts @@ -1,6 +1,13 @@ import type { Command } from "commander"; import { callGateway, randomIdempotencyKey } from "../gateway/call.js"; import { defaultRuntime } from "../runtime.js"; +import { + type CameraFacing, + cameraTempPath, + parseCameraClipPayload, + parseCameraSnapPayload, + writeBase64ToFile, +} from "./nodes-camera.js"; type NodesRpcOpts = { url?: string; @@ -12,6 +19,11 @@ type NodesRpcOpts = { params?: string; invokeTimeout?: string; idempotencyKey?: string; + facing?: string; + maxWidth?: string; + quality?: string; + duration?: string; + audio?: boolean; }; type NodeListNode = { @@ -340,4 +352,203 @@ export function registerNodesCli(program: Command) { }), { timeoutMs: 30_000 }, ); + + const parseFacing = (value: string): CameraFacing => { + const v = String(value ?? 
"") + .trim() + .toLowerCase(); + if (v === "front" || v === "back") return v; + throw new Error(`invalid facing: ${value} (expected front|back)`); + }; + + const camera = nodes + .command("camera") + .description("Capture camera media from a paired node"); + + nodesCallOpts( + camera + .command("snap") + .description("Capture a photo from a node camera (prints MEDIA:)") + .requiredOption("--node ", "Node id, name, or IP") + .option("--facing ", "Camera facing", "both") + .option("--max-width ", "Max width in px (optional)") + .option("--quality <0-1>", "JPEG quality (default 0.9)") + .option( + "--invoke-timeout ", + "Node invoke timeout in ms (default 20000)", + "20000", + ) + .action(async (opts: NodesRpcOpts) => { + try { + const nodeId = await resolveNodeId(opts, String(opts.node ?? "")); + const facingOpt = String(opts.facing ?? "both") + .trim() + .toLowerCase(); + const facings: CameraFacing[] = + facingOpt === "both" + ? ["front", "back"] + : facingOpt === "front" || facingOpt === "back" + ? [facingOpt] + : (() => { + throw new Error( + `invalid facing: ${String(opts.facing)} (expected front|back|both)`, + ); + })(); + + const maxWidth = opts.maxWidth + ? Number.parseInt(String(opts.maxWidth), 10) + : undefined; + const quality = opts.quality + ? Number.parseFloat(String(opts.quality)) + : undefined; + const timeoutMs = opts.invokeTimeout + ? Number.parseInt(String(opts.invokeTimeout), 10) + : undefined; + + const results: Array<{ + facing: CameraFacing; + path: string; + width: number; + height: number; + }> = []; + + for (const facing of facings) { + const invokeParams: Record = { + nodeId, + command: "camera.snap", + params: { + facing, + maxWidth: Number.isFinite(maxWidth) ? maxWidth : undefined, + quality: Number.isFinite(quality) ? 
quality : undefined, + format: "jpg", + }, + idempotencyKey: randomIdempotencyKey(), + }; + if (typeof timeoutMs === "number" && Number.isFinite(timeoutMs)) { + invokeParams.timeoutMs = timeoutMs; + } + + const raw = (await callGatewayCli( + "node.invoke", + opts, + invokeParams, + )) as unknown; + + const res = + typeof raw === "object" && raw !== null + ? (raw as { payload?: unknown }) + : {}; + const payload = parseCameraSnapPayload(res.payload); + const filePath = cameraTempPath({ + kind: "snap", + facing, + ext: payload.format === "jpeg" ? "jpg" : payload.format, + }); + await writeBase64ToFile(filePath, payload.base64); + results.push({ + facing, + path: filePath, + width: payload.width, + height: payload.height, + }); + } + + if (opts.json) { + defaultRuntime.log(JSON.stringify({ files: results }, null, 2)); + return; + } + defaultRuntime.log(results.map((r) => `MEDIA:${r.path}`).join("\n")); + } catch (err) { + defaultRuntime.error(`nodes camera snap failed: ${String(err)}`); + defaultRuntime.exit(1); + } + }), + { timeoutMs: 60_000 }, + ); + + nodesCallOpts( + camera + .command("clip") + .description( + "Capture a short video clip from a node camera (prints MEDIA:)", + ) + .requiredOption("--node ", "Node id, name, or IP") + .option("--facing ", "Camera facing", "front") + .option("--duration ", "Duration in ms (default 3000)", "3000") + .option("--no-audio", "Disable audio capture") + .option( + "--invoke-timeout ", + "Node invoke timeout in ms (default 45000)", + "45000", + ) + .action(async (opts: NodesRpcOpts & { audio?: boolean }) => { + try { + const nodeId = await resolveNodeId(opts, String(opts.node ?? "")); + const facing = parseFacing(String(opts.facing ?? "front")); + const durationMs = Number.parseInt( + String(opts.duration ?? "3000"), + 10, + ); + const includeAudio = opts.audio !== false; + const timeoutMs = opts.invokeTimeout + ? 
Number.parseInt(String(opts.invokeTimeout), 10) + : undefined; + + const invokeParams: Record = { + nodeId, + command: "camera.clip", + params: { + facing, + durationMs: Number.isFinite(durationMs) ? durationMs : undefined, + includeAudio, + format: "mp4", + }, + idempotencyKey: randomIdempotencyKey(), + }; + if (typeof timeoutMs === "number" && Number.isFinite(timeoutMs)) { + invokeParams.timeoutMs = timeoutMs; + } + + const raw = (await callGatewayCli( + "node.invoke", + opts, + invokeParams, + )) as unknown; + const res = + typeof raw === "object" && raw !== null + ? (raw as { payload?: unknown }) + : {}; + const payload = parseCameraClipPayload(res.payload); + const filePath = cameraTempPath({ + kind: "clip", + facing, + ext: payload.format, + }); + await writeBase64ToFile(filePath, payload.base64); + + if (opts.json) { + defaultRuntime.log( + JSON.stringify( + { + file: { + facing, + path: filePath, + durationMs: payload.durationMs, + hasAudio: payload.hasAudio, + }, + }, + null, + 2, + ), + ); + return; + } + defaultRuntime.log(`MEDIA:${filePath}`); + } catch (err) { + defaultRuntime.error(`nodes camera clip failed: ${String(err)}`); + defaultRuntime.exit(1); + } + }), + { timeoutMs: 90_000 }, + ); } diff --git a/src/cli/program.test.ts b/src/cli/program.test.ts index 84631a313..e3b13c741 100644 --- a/src/cli/program.test.ts +++ b/src/cli/program.test.ts @@ -1,3 +1,4 @@ +import * as fs from "node:fs/promises"; import { beforeEach, describe, expect, it, vi } from "vitest"; const sendCommand = vi.fn(); @@ -148,4 +149,145 @@ describe("cli program", () => { ); expect(runtime.log).toHaveBeenCalled(); }); + + it("runs nodes camera snap and prints two MEDIA paths", async () => { + callGateway + .mockResolvedValueOnce({ + ts: Date.now(), + nodes: [ + { + nodeId: "ios-node", + displayName: "iOS Node", + remoteIp: "192.168.0.88", + connected: true, + }, + ], + }) + .mockResolvedValueOnce({ + ok: true, + nodeId: "ios-node", + command: "camera.snap", + payload: { format: 
"jpg", base64: "aGk=", width: 1, height: 1 }, + }) + .mockResolvedValueOnce({ + ok: true, + nodeId: "ios-node", + command: "camera.snap", + payload: { format: "jpg", base64: "aGk=", width: 1, height: 1 }, + }); + + const program = buildProgram(); + runtime.log.mockClear(); + await program.parseAsync( + ["nodes", "camera", "snap", "--node", "ios-node"], + { + from: "user", + }, + ); + + expect(callGateway).toHaveBeenNthCalledWith( + 2, + expect.objectContaining({ + method: "node.invoke", + params: expect.objectContaining({ + nodeId: "ios-node", + command: "camera.snap", + timeoutMs: 20000, + idempotencyKey: "idem-test", + params: expect.objectContaining({ facing: "front", format: "jpg" }), + }), + }), + ); + expect(callGateway).toHaveBeenNthCalledWith( + 3, + expect.objectContaining({ + method: "node.invoke", + params: expect.objectContaining({ + nodeId: "ios-node", + command: "camera.snap", + timeoutMs: 20000, + idempotencyKey: "idem-test", + params: expect.objectContaining({ facing: "back", format: "jpg" }), + }), + }), + ); + + const out = String(runtime.log.mock.calls[0]?.[0] ?? 
""); + const mediaPaths = out + .split("\n") + .filter((l) => l.startsWith("MEDIA:")) + .map((l) => l.replace(/^MEDIA:/, "")) + .filter(Boolean); + expect(mediaPaths).toHaveLength(2); + + try { + for (const p of mediaPaths) { + await expect(fs.readFile(p, "utf8")).resolves.toBe("hi"); + } + } finally { + await Promise.all(mediaPaths.map((p) => fs.unlink(p).catch(() => {}))); + } + }); + + it("runs nodes camera clip and prints one MEDIA path", async () => { + callGateway + .mockResolvedValueOnce({ + ts: Date.now(), + nodes: [ + { + nodeId: "ios-node", + displayName: "iOS Node", + remoteIp: "192.168.0.88", + connected: true, + }, + ], + }) + .mockResolvedValueOnce({ + ok: true, + nodeId: "ios-node", + command: "camera.clip", + payload: { + format: "mp4", + base64: "aGk=", + durationMs: 3000, + hasAudio: true, + }, + }); + + const program = buildProgram(); + runtime.log.mockClear(); + await program.parseAsync( + ["nodes", "camera", "clip", "--node", "ios-node", "--duration", "3000"], + { from: "user" }, + ); + + expect(callGateway).toHaveBeenNthCalledWith( + 2, + expect.objectContaining({ + method: "node.invoke", + params: expect.objectContaining({ + nodeId: "ios-node", + command: "camera.clip", + timeoutMs: 45000, + idempotencyKey: "idem-test", + params: expect.objectContaining({ + facing: "front", + durationMs: 3000, + includeAudio: true, + format: "mp4", + }), + }), + }), + ); + + const out = String(runtime.log.mock.calls[0]?.[0] ?? 
""); + const mediaPath = out.replace(/^MEDIA:/, "").trim(); + expect(mediaPath).toMatch(/clawdis-camera-clip-front-.*\.mp4$/); + + try { + await expect(fs.readFile(mediaPath, "utf8")).resolves.toBe("hi"); + } finally { + await fs.unlink(mediaPath).catch(() => {}); + } + }); }); From 8fb064ed703f0dd30e294973c2ea2857c8aece36 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sun, 14 Dec 2025 00:49:18 +0000 Subject: [PATCH 04/10] docs(telegram): clarify polling + webhook config --- docs/telegram.md | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/docs/telegram.md b/docs/telegram.md index cd5250278..94ad4bc68 100644 --- a/docs/telegram.md +++ b/docs/telegram.md @@ -7,7 +7,7 @@ read_when: Updated: 2025-12-07 -Status: ready for bot-mode use with grammY (long-poll + webhook). Text + media send, proxy, and webhook helpers all ship in-tree. +Status: ready for bot-mode use with grammY (long-polling by default; webhook supported when configured). Text + media send, mention-gated group replies, and optional proxy support are implemented. ## Goals - Let you talk to Clawdis via a Telegram bot in DMs and groups. @@ -17,7 +17,11 @@ Status: ready for bot-mode use with grammY (long-poll + webhook). Text + media s ## How it will work (Bot API) 1) Create a bot with @BotFather and grab the token. 2) Configure Clawdis with `TELEGRAM_BOT_TOKEN` (or `telegram.botToken` in `~/.clawdis/clawdis.json`). -3) Run the gateway; it auto-starts Telegram when the bot token is set. To force Telegram-only: `clawdis gateway --provider telegram`. Webhook mode: `clawdis gateway --provider telegram --webhook --port 8787 --webhook-secret ` (optionally `--webhook-url` when the public URL differs). +3) Run the gateway; it auto-starts Telegram when the bot token is set. + - **Long-polling** is the default. + - **Webhook mode** is enabled by setting `telegram.webhookUrl` (optionally `telegram.webhookSecret` / `telegram.webhookPath`). 
+ - The webhook listener currently binds to `0.0.0.0:8787` and serves `POST /telegram-webhook` by default. + - If you need a different public port/host, set `telegram.webhookUrl` to the externally reachable URL and use a reverse proxy to forward to `:8787`. 4) Direct chats: user sends the first message; all subsequent turns land in the shared `main` session (default, no extra config). 5) Groups: add the bot, disable privacy mode (or make it admin) so it can read messages; group threads stay on `group:` and require mention/command to trigger replies. 6) Optional allowlist: reuse `inbound.allowFrom` for direct chats by chat id (`123456789` or `telegram:123456789`). @@ -32,7 +36,7 @@ Status: ready for bot-mode use with grammY (long-poll + webhook). Text + media s - Library: grammY is the only client for send + gateway (fetch fallback removed); grammY throttler is enabled by default to stay under Bot API limits. - Inbound normalization: maps Bot API updates to `MsgContext` with `Surface: "telegram"`, `ChatType: direct|group`, `SenderName`, `MediaPath`/`MediaType` when attachments arrive, and `Timestamp`; groups require @bot mention by default. - Outbound: text and media (photo/video/audio/document) with optional caption; chunked to limits. Typing cue sent best-effort. -- Config: `TELEGRAM_BOT_TOKEN` env or `telegram.botToken` required; `telegram.requireMention`, `telegram.allowFrom`, `telegram.mediaMaxMb`, `telegram.proxy`, `telegram.webhookSecret`, `telegram.webhookUrl` supported. +- Config: `TELEGRAM_BOT_TOKEN` env or `telegram.botToken` required; `telegram.requireMention`, `telegram.allowFrom`, `telegram.mediaMaxMb`, `telegram.proxy`, `telegram.webhookSecret`, `telegram.webhookUrl`, `telegram.webhookPath` supported. 
Example config: ```json5 @@ -44,6 +48,7 @@ Example config: mediaMaxMb: 5, proxy: "socks5://localhost:9050", webhookSecret: "mysecret", + webhookPath: "/telegram-webhook", webhookUrl: "https://yourdomain.com/telegram-webhook" } } @@ -62,6 +67,6 @@ Example config: - ⏳ Add more grammY coverage (webhook payloads, media edge cases) ## Safety & ops -- Treat the bot token as a secret (equivalent to account control); store under `~/.clawdis/credentials/` with 0600 perms. -- Respect Telegram rate limits (429s); we’ll add throttling in the provider to stay below flood thresholds. +- Treat the bot token as a secret (equivalent to account control); prefer `TELEGRAM_BOT_TOKEN` or a locked-down config file (`chmod 600 ~/.clawdis/clawdis.json`). +- Respect Telegram rate limits (429s); grammY throttling is enabled by default. - Use a test bot for development to avoid hitting production chats. From 700212608a49d9e14b88fb3eedef584f61961a2a Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sun, 14 Dec 2025 00:49:34 +0000 Subject: [PATCH 05/10] docs(remote): clarify ssh tunneling --- docs/remote.md | 72 ++++++++++++++++++++++++++------------------------ 1 file changed, 37 insertions(+), 35 deletions(-) diff --git a/docs/remote.md b/docs/remote.md index c61541613..4e1779a82 100644 --- a/docs/remote.md +++ b/docs/remote.md @@ -3,48 +3,50 @@ summary: "Remote mode topology using SSH control channels between gateway and ma read_when: - Running or troubleshooting remote gateway setups --- -# Remote mode with control channel +# Remote access (SSH, tunnels, and tailnets) -This repo supports “remote over SSH” by keeping a single gateway (the master) running on a host (e.g., your Mac Studio) and connecting one or more macOS menu bar clients to it. The menu app no longer shells out to `pnpm clawdis …`; it talks to the gateway over a persistent control channel that is tunneled through SSH. 
+This repo supports “remote over SSH” by keeping a single Gateway (the master) running on a host (e.g., your Mac Studio) and connecting clients to it. -Remote mode is the SSH fallback transport. As Clawdis adds a direct “bridge” transport for LAN/tailnet setups, SSH remains supported for universal reach. -See `docs/discovery.md` for how clients choose between direct vs SSH. +- For **operators (you / the macOS app)**: SSH tunneling is the universal fallback. +- For **nodes (Iris/iOS and future devices)**: prefer the Gateway **Bridge** when on the same LAN/tailnet (see `docs/discovery.md`). -## Topology -- Master: runs the gateway + control server on `127.0.0.1:18789` (in-process TCP server). -- Clients: when “Remote over SSH” is selected, the app opens one SSH tunnel: - - `ssh -N -L :127.0.0.1:18789 @` - - The app then connects to `localhost:` and keeps that socket open. -- Messages are newline-delimited JSON (documented in `docs/control-api.md`). +## The core idea -## Connection flow (clients) -1) Establish SSH tunnel. -2) Open TCP socket to the local forwarded port. -3) Send `ping` to verify connectivity. -4) Issue `health`, `status`, and `last-heartbeat` requests to seed UI. -5) Listen for `event` frames (heartbeat updates, gateway status). +- The Gateway WebSocket binds to **loopback**: `ws://127.0.0.1:18789`. +- For remote use, you forward that loopback port over SSH (or connect over a tailnet/VPN, which reduces how much you need to tunnel). -## Heartbeats -- Heartbeats always run on the master gateway. -- The control server emits `event: "heartbeat"` after each heartbeat attempt and keeps the latest in memory for `last-heartbeat` requests. -- No file-based heartbeat logs/state are required when the control stream is available. +## SSH tunnel (CLI + tools) -## Local mode -- The menu app skips SSH and connects directly to `127.0.0.1:18789` with the same protocol. 
+Create a local tunnel to the remote Gateway WS: -## Failure handling -- If the tunnel drops, the client reconnects and re-issues `ping`, `health`, and `last-heartbeat` to refresh state (the mac app shows “Control channel disconnected”). -- If the control port is unavailable (older gateway), the app can optionally fall back to the legacy CLI path, but the goal is to rely solely on the control channel. +```bash +ssh -N -L 18789:127.0.0.1:18789 user@host +``` -## Test Remote (in the mac app) -1) SSH reachability check (`ssh -o BatchMode=yes … echo ok`). -2) If SSH succeeds, the app opens the control tunnel and issues a `health` request; success marks the remote as ready. +With the tunnel up: +- `clawdis health` and `clawdis status --deep` now reach the remote gateway via `ws://127.0.0.1:18789`. +- `clawdis gateway {status,health,send,agent,call}` can also target the forwarded URL via `--url` when needed. -## Security -- Control server listens only on localhost. -- SSH tunneling reuses existing keys/agent; no additional auth is added by the control server. +## WebChat over SSH -## Files to keep in sync -- Protocol definition: `docs/control-api.md`. -- App connection logic: macOS `Remote over SSH` plumbing. -- Gateway control server: lives inside the Node gateway process. +Forward both the WebChat HTTP port and the Gateway WS port: + +```bash +ssh -N \ + -L 18788:127.0.0.1:18788 \ + -L 18789:127.0.0.1:18789 \ + user@host +``` + +Then open `http://127.0.0.1:18788/webchat/` locally. (Details: `docs/webchat.md`.) + +## macOS app “Remote over SSH” + +The macOS menu bar app can drive the same setup end-to-end (remote status checks, WebChat, and Voice Wake forwarding). + +Runbook: `docs/mac/remote.md`. + +## Legacy control channel + +Older builds experimented with a newline-delimited TCP control channel on the same port. +That API is deprecated and should not be relied on. (Historical reference: `docs/control-api.md`.) 
From a80cd26341804924bba3fc73321a966e0ab67ff6 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sun, 14 Dec 2025 00:49:54 +0000 Subject: [PATCH 06/10] docs: clarify legacy control + sessions path --- docs/control-api.md | 7 ++++--- docs/health.md | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/docs/control-api.md b/docs/control-api.md index 6458852f0..adca0dceb 100644 --- a/docs/control-api.md +++ b/docs/control-api.md @@ -5,9 +5,10 @@ read_when: --- # Control channel API (newline-delimited JSON) -**Deprecated:** superseded by the WebSocket Gateway protocol (`clawdis gateway`, see `docs/architecture.md` and `docs/gateway.md`). Use only for legacy builds predating the Gateway rollout. +**Deprecated (historical):** superseded by the WebSocket Gateway protocol (`clawdis gateway`, see `docs/architecture.md` and `docs/gateway.md`). +Current builds use a WebSocket server on `ws://127.0.0.1:18789` and do **not** expose this TCP control channel. -Endpoint: `127.0.0.1:18789` (TCP, localhost only). Clients reach it via SSH port forward in remote mode. +Legacy endpoint (if present in an older build): `127.0.0.1:18789` (TCP, localhost only), typically reached via SSH port forward in remote mode. ## Frame format Each line is a JSON object. Two shapes exist: @@ -45,4 +46,4 @@ Each line is a JSON object. Two shapes exist: 4) For user toggles, send `set-heartbeats` and await response. ## Backward compatibility -- If the control port is unavailable (older gateway), the client may fall back to the legacy CLI path, but the intended path is to rely solely on this API. +- If the control channel is unavailable: that’s expected on modern builds. Use the Gateway WS protocol instead. diff --git a/docs/health.md b/docs/health.md index b155decc3..d57a3bf6e 100644 --- a/docs/health.md +++ b/docs/health.md @@ -16,7 +16,7 @@ Short guide to verify the WhatsApp Web / Baileys stack without guessing. 
## Deep diagnostics - Creds on disk: `ls -l ~/.clawdis/credentials/creds.json` (mtime should be recent). -- Session store: `ls -l ~/.clawdis/sessions.json` (path can be overridden in config). Count and recent recipients are surfaced via `status`. +- Session store: `ls -l ~/.clawdis/sessions/sessions.json` (legacy: `~/.clawdis/sessions.json`; path can be overridden in config). Count and recent recipients are surfaced via `status`. - Relink flow: `clawdis logout && clawdis login --verbose` when status codes 409–515 or `loggedOut` appear in logs. ## When something fails From 128df570055d17b6914be8a2f1688836341d8e4d Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sun, 14 Dec 2025 00:50:12 +0000 Subject: [PATCH 07/10] docs: refer to session store --- docs/group-messages.md | 2 +- docs/refactor/webagent-session.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/group-messages.md b/docs/group-messages.md index 96e7f2237..0317d7f18 100644 --- a/docs/group-messages.md +++ b/docs/group-messages.md @@ -56,4 +56,4 @@ Notes: ## Known considerations - Heartbeats are intentionally skipped for groups to avoid noisy broadcasts. - Echo suppression uses the combined batch string; if you send identical text twice without mentions, only the first will get a response. -- Session store entries will appear as `group:` in `sessions.json`; a missing entry just means the group hasn’t triggered a run yet. +- Session store entries will appear as `group:` in the session store (`~/.clawdis/sessions/sessions.json` by default); a missing entry just means the group hasn’t triggered a run yet. diff --git a/docs/refactor/webagent-session.md b/docs/refactor/webagent-session.md index 0061a4401..13f0145d9 100644 --- a/docs/refactor/webagent-session.md +++ b/docs/refactor/webagent-session.md @@ -10,7 +10,7 @@ Context: web chat currently lives in a WKWebView that loads the pi-web bundle. 
S ## Target state - Gateway WS adds methods: - - `chat.history { sessionKey }` → `{ sessionKey, messages[], thinkingLevel }` (reads the existing JSONL + sessions.json). + - `chat.history { sessionKey }` → `{ sessionKey, messages[], thinkingLevel }` (reads the existing JSONL + session store). - `chat.send { sessionKey, message, attachments?, thinking?, deliver?, timeoutMs<=30000, idempotencyKey }` → `res { runId, status:"accepted" }` or `res ok:false` on validation/timeout. - Gateway WS emits `chat` events `{ runId, sessionKey, seq, state:"delta"|"final"|"error", message?, errorMessage?, usage?, stopReason? }`. Streaming is optional; minimum is a single `state:"final"` per send. - Client consumes only WS: bootstrap via `chat.history`, send via `chat.send`, live updates via `chat` events. No file watchers. From 441bd25f906e4a7275e8e5fc7bd97c230f660762 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sun, 14 Dec 2025 00:50:26 +0000 Subject: [PATCH 08/10] docs(clawd): update install + session store path --- docs/clawd.md | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/docs/clawd.md b/docs/clawd.md index 318dd0885..6babbad42 100644 --- a/docs/clawd.md +++ b/docs/clawd.md @@ -24,9 +24,17 @@ Start conservative: ## Prerequisites - Node **22+** -- CLAWDIS installed: `npm install -g clawdis` +- CLAWDIS available on PATH (recommended during development: from source + global link) - A second phone number (SIM/eSIM/prepaid) for the assistant +From source (recommended while the npm package is still settling): + +```bash +pnpm install +pnpm build +pnpm link --global +``` + ## The two-phone setup (recommended) You want this: @@ -121,7 +129,7 @@ Example: ## Sessions and memory - Session files: `~/.clawdis/sessions/{{SessionId}}.jsonl` -- Session metadata (token usage, last route, etc): `~/.clawdis/sessions.json` +- Session metadata (token usage, last route, etc): `~/.clawdis/sessions/sessions.json` (legacy: `~/.clawdis/sessions.json`) - `/new` 
starts a fresh session for that chat (configurable via `resetTriggers`) ## Heartbeats (proactive mode) From 00f83ca7af906ac84a700d599b957f140790adfb Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sun, 14 Dec 2025 00:50:41 +0000 Subject: [PATCH 09/10] docs(index): update architecture + quickstart --- docs/index.md | 67 ++++++++++++++++++++++++++++++++++----------------- 1 file changed, 45 insertions(+), 22 deletions(-) diff --git a/docs/index.md b/docs/index.md index 086a6b91b..33aa0d777 100644 --- a/docs/index.md +++ b/docs/index.md @@ -19,7 +19,7 @@ read_when:

GitHub · - npm · + Releases · Clawd setup

@@ -29,25 +29,41 @@ It’s built for [Clawd](https://clawd.me), a space lobster who needed a TARDIS. ## How it works ``` -┌─────────────┐ ┌──────────┐ ┌─────────────┐ -│ WhatsApp │ ───▶ │ CLAWDIS │ ───▶ │ AI Agent │ -│ Telegram │ ───▶ │ 🦞⏱️💙 │ ◀─── │ (Pi) │ -│ (You) │ ◀─── │ │ │ │ -└─────────────┘ └──────────┘ └─────────────┘ +WhatsApp / Telegram + │ + ▼ + ┌──────────────────────────┐ + │ Gateway │ ws://127.0.0.1:18789 (loopback-only) + │ (single source) │ tcp://0.0.0.0:18790 (optional Bridge) + └───────────┬───────────────┘ + │ + ├─ Pi agent (RPC) + ├─ CLI (clawdis …) + ├─ WebChat (loopback UI) + ├─ macOS app (Clawdis.app) + └─ iOS node (Iris) via Bridge + pairing ``` Most operations flow through the **Gateway** (`clawdis gateway`), a single long-running process that owns provider connections and the WebSocket control plane. +## Network model + +- **One Gateway per host**: it is the only process allowed to own the WhatsApp Web session. +- **Loopback-first**: Gateway WS is `ws://127.0.0.1:18789` (not exposed on the LAN). +- **Bridge for nodes**: optional LAN/tailnet-facing bridge on `tcp://0.0.0.0:18790` for paired nodes (Bonjour-discoverable). +- **Remote use**: SSH tunnel or tailnet/VPN; see `docs/remote.md` and `docs/discovery.md`. 
+ ## Features (high level) - 📱 **WhatsApp Integration** — Uses Baileys for WhatsApp Web protocol - ✈️ **Telegram Bot** — DMs + groups via grammY - 🤖 **Agent bridge** — Pi (RPC mode) with tool streaming -- 💬 **Sessions** — Per-sender (or shared `main`) conversation context +- 💬 **Sessions** — Direct chats collapse into shared `main` (default); groups are isolated - 👥 **Group Chat Support** — Mention-based triggering in group chats - 📎 **Media Support** — Send and receive images, audio, documents - 🎤 **Voice notes** — Optional transcription hook -- 🖥️ **WebChat + macOS app** — A local UI + menu bar companion for ops and voice wake +- 🖥️ **WebChat + macOS app** — Local UI + menu bar companion for ops and voice wake +- 📱 **iOS node (Iris)** — Pairs as a node and exposes a Canvas surface Note: legacy Claude/Codex/Gemini/Opencode paths have been removed; Pi is the only coding-agent path. @@ -56,8 +72,10 @@ Note: legacy Claude/Codex/Gemini/Opencode paths have been removed; Pi is the onl Runtime requirement: **Node ≥ 22**. 
```bash -# Install -npm install -g clawdis +# From source (recommended while the npm package is still settling) +pnpm install +pnpm build +pnpm link --global # Pair WhatsApp Web (shows QR) clawdis login @@ -95,18 +113,23 @@ Example: ## Docs -- [Configuration](./configuration.md) -- [Gateway runbook](./gateway.md) -- [WebChat](./webchat.md) -- [Agent integration](./agents.md) -- [Telegram](./telegram.md) -- [Group messages](./group-messages.md) -- [Media: images](./images.md) -- [Media: audio](./audio.md) -- [Sessions](./session.md) -- [Cron + wakeups](./cron.md) -- [Security](./security.md) -- [Troubleshooting](./troubleshooting.md) +- Start here: + - [Configuration](./configuration.md) + - [Clawd personal assistant setup](./clawd.md) + - [Gateway runbook](./gateway.md) + - [Discovery + transports](./discovery.md) + - [Remote access](./remote.md) +- Providers and UX: + - [WebChat](./webchat.md) + - [Telegram](./telegram.md) + - [Group messages](./group-messages.md) + - [Media: images](./images.md) + - [Media: audio](./audio.md) +- Ops and safety: + - [Sessions](./session.md) + - [Cron + wakeups](./cron.md) + - [Security](./security.md) + - [Troubleshooting](./troubleshooting.md) ## The name From affbd48a3fbd10d20e21de521632ba747901cd58 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sun, 14 Dec 2025 00:50:57 +0000 Subject: [PATCH 10/10] docs(site): refresh footer + agent blurb --- docs/AGENTS.default.md | 2 +- docs/_layouts/default.html | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/AGENTS.default.md b/docs/AGENTS.default.md index 9a1d711a4..9b5ce1a5c 100644 --- a/docs/AGENTS.default.md +++ b/docs/AGENTS.default.md @@ -9,7 +9,7 @@ read_when: ## What Clawdis Does - Runs WhatsApp gateway + Pi coding agent so the assistant can read/write chats, fetch context, and run tools via the host Mac. - macOS app manages permissions (screen recording, notifications, microphone) and exposes a CLI helper `clawdis-mac` for scripts. 
-- Sessions are per-sender; heartbeats keep background tasks alive. +- Direct chats collapse into the shared `main` session by default; groups stay isolated as `group:<jid>`; heartbeats keep background tasks alive. ## Core Tools (enable in Settings → Tools) - **mcporter** — MCP runtime/CLI to list, call, and sync Model Context Protocol servers. diff --git a/docs/_layouts/default.html b/docs/_layouts/default.html index 63eedc126..a56aafc07 100644 --- a/docs/_layouts/default.html +++ b/docs/_layouts/default.html @@ -122,8 +122,8 @@ · source · - npm - + releases +