diff --git a/apps/ios/Sources/Camera/CameraController.swift b/apps/ios/Sources/Camera/CameraController.swift new file mode 100644 index 000000000..5579307b9 --- /dev/null +++ b/apps/ios/Sources/Camera/CameraController.swift @@ -0,0 +1,319 @@ +import AVFoundation +import ClawdisKit +import Foundation +import UIKit + +actor CameraController { + enum CameraError: LocalizedError, Sendable { + case cameraUnavailable + case microphoneUnavailable + case permissionDenied(kind: String) + case invalidParams(String) + case captureFailed(String) + case exportFailed(String) + + var errorDescription: String? { + switch self { + case .cameraUnavailable: + "Camera unavailable" + case .microphoneUnavailable: + "Microphone unavailable" + case let .permissionDenied(kind): + "\(kind) permission denied" + case let .invalidParams(msg): + msg + case let .captureFailed(msg): + msg + case let .exportFailed(msg): + msg + } + } + } + + func snap(params: ClawdisCameraSnapParams) async throws -> ( + format: String, + base64: String, + width: Int, + height: Int) + { + let facing = params.facing ?? .front + let maxWidth = params.maxWidth.flatMap { $0 > 0 ? $0 : nil } + let quality = Self.clampQuality(params.quality) + + try await self.ensureAccess(for: .video) + + let session = AVCaptureSession() + session.sessionPreset = .photo + + guard let device = Self.pickCamera(facing: facing) else { + throw CameraError.cameraUnavailable + } + + let input = try AVCaptureDeviceInput(device: device) + guard session.canAddInput(input) else { + throw CameraError.captureFailed("Failed to add camera input") + } + session.addInput(input) + + let output = AVCapturePhotoOutput() + guard session.canAddOutput(output) else { + throw CameraError.captureFailed("Failed to add photo output") + } + session.addOutput(output) + output.maxPhotoQualityPrioritization = .quality + + session.startRunning() + defer { session.stopRunning() } + + let settings: AVCapturePhotoSettings = { + if output.availablePhotoCodecTypes.contains(.jpeg) { + return AVCapturePhotoSettings(format: [AVVideoCodecKey: AVVideoCodecType.jpeg]) + } + return AVCapturePhotoSettings() + }() + settings.photoQualityPrioritization = .quality + + let rawData: Data = try await withCheckedThrowingContinuation { cont in + output.capturePhoto(with: settings, delegate: PhotoCaptureDelegate(cont)) + } + + let (finalData, size) = try Self.reencodeJPEG( + imageData: rawData, + maxWidth: maxWidth, + quality: quality) + + return ( + format: "jpg", + base64: finalData.base64EncodedString(), + width: Int(size.width.rounded()), + height: Int(size.height.rounded())) + } + + func clip(params: ClawdisCameraClipParams) async throws -> ( + format: String, + base64: String, + durationMs: Int, + hasAudio: Bool) + { + let facing = params.facing ?? .front + let durationMs = Self.clampDurationMs(params.durationMs) + let includeAudio = params.includeAudio ?? 
true + + try await self.ensureAccess(for: .video) + if includeAudio { + try await self.ensureAccess(for: .audio) + } + + let session = AVCaptureSession() + session.sessionPreset = .high + + guard let camera = Self.pickCamera(facing: facing) else { + throw CameraError.cameraUnavailable + } + let cameraInput = try AVCaptureDeviceInput(device: camera) + guard session.canAddInput(cameraInput) else { + throw CameraError.captureFailed("Failed to add camera input") + } + session.addInput(cameraInput) + + if includeAudio { + guard let mic = AVCaptureDevice.default(for: .audio) else { + throw CameraError.microphoneUnavailable + } + let micInput = try AVCaptureDeviceInput(device: mic) + if session.canAddInput(micInput) { + session.addInput(micInput) + } else { + throw CameraError.captureFailed("Failed to add microphone input") + } + } + + let output = AVCaptureMovieFileOutput() + guard session.canAddOutput(output) else { + throw CameraError.captureFailed("Failed to add movie output") + } + session.addOutput(output) + output.maxRecordedDuration = CMTime(value: Int64(durationMs), timescale: 1000) + + session.startRunning() + defer { session.stopRunning() } + + let movURL = FileManager.default.temporaryDirectory + .appendingPathComponent("clawdis-camera-\(UUID().uuidString).mov") + let mp4URL = FileManager.default.temporaryDirectory + .appendingPathComponent("clawdis-camera-\(UUID().uuidString).mp4") + + defer { + try? FileManager.default.removeItem(at: movURL) + try? FileManager.default.removeItem(at: mp4URL) + } + + let recordedURL: URL = try await withCheckedThrowingContinuation { cont in + let delegate = MovieFileDelegate(cont) + output.startRecording(to: movURL, recordingDelegate: delegate) + } + + // Transcode .mov -> .mp4 for easier downstream handling. + try await Self.exportToMP4(inputURL: recordedURL, outputURL: mp4URL) + + let data = try Data(contentsOf: mp4URL) + return (format: "mp4", base64: data.base64EncodedString(), durationMs: durationMs, hasAudio: includeAudio) + } + + private func ensureAccess(for mediaType: AVMediaType) async throws { + let status = AVCaptureDevice.authorizationStatus(for: mediaType) + switch status { + case .authorized: + return + case .notDetermined: + let ok = await withCheckedContinuation(isolation: nil) { cont in + AVCaptureDevice.requestAccess(for: mediaType) { granted in + cont.resume(returning: granted) + } + } + if !ok { + throw CameraError.permissionDenied(kind: mediaType == .video ? "Camera" : "Microphone") + } + case .denied, .restricted: + throw CameraError.permissionDenied(kind: mediaType == .video ? "Camera" : "Microphone") + @unknown default: + throw CameraError.permissionDenied(kind: mediaType == .video ? "Camera" : "Microphone") + } + } + + private nonisolated static func pickCamera(facing: ClawdisCameraFacing) -> AVCaptureDevice? { + let position: AVCaptureDevice.Position = (facing == .front) ? .front : .back + return AVCaptureDevice.default(.builtInWideAngleCamera, for: .video, position: position) + } + + private nonisolated static func clampQuality(_ quality: Double?) -> Double { + let q = quality ?? 0.9 + return min(1.0, max(0.05, q)) + } + + private nonisolated static func clampDurationMs(_ ms: Int?) -> Int { + let v = ms ?? 3000 + // Keep clips short by default; avoid huge base64 payloads on the bridge. 
+ return min(15000, max(250, v)) + } + + private nonisolated static func reencodeJPEG( + imageData: Data, + maxWidth: Int?, + quality: Double) throws -> (data: Data, size: CGSize) + { + guard let image = UIImage(data: imageData) else { + throw CameraError.captureFailed("Failed to decode captured image") + } + + let finalImage: UIImage = if let maxWidth, maxWidth > 0 { + Self.downscale(image: image, maxWidth: CGFloat(maxWidth)) + } else { + image + } + + guard let out = finalImage.jpegData(compressionQuality: quality) else { + throw CameraError.captureFailed("Failed to encode JPEG") + } + + return (out, finalImage.size) + } + + private nonisolated static func downscale(image: UIImage, maxWidth: CGFloat) -> UIImage { + let w = image.size.width + let h = image.size.height + guard w > 0, h > 0 else { return image } + guard w > maxWidth else { return image } + + let scale = maxWidth / w + let target = CGSize(width: maxWidth, height: max(1, h * scale)) + + let format = UIGraphicsImageRendererFormat.default() + format.opaque = false + let renderer = UIGraphicsImageRenderer(size: target, format: format) + return renderer.image { _ in + image.draw(in: CGRect(origin: .zero, size: target)) + } + } + + private nonisolated static func exportToMP4(inputURL: URL, outputURL: URL) async throws { + let asset = AVAsset(url: inputURL) + guard let exporter = AVAssetExportSession(asset: asset, presetName: AVAssetExportPresetHighestQuality) else { + throw CameraError.exportFailed("Failed to create export session") + } + exporter.outputURL = outputURL + exporter.outputFileType = .mp4 + exporter.shouldOptimizeForNetworkUse = true + + try await withCheckedThrowingContinuation(isolation: nil) { cont in + exporter.exportAsynchronously { + switch exporter.status { + case .completed: + cont.resume(returning: ()) + case .failed: + cont.resume(throwing: exporter.error ?? CameraError.exportFailed("Export failed")) + case .cancelled: + cont.resume(throwing: CameraError.exportFailed("Export cancelled")) + default: + cont.resume(throwing: CameraError.exportFailed("Export did not complete")) + } + } + } + } +} + +private final class PhotoCaptureDelegate: NSObject, AVCapturePhotoCaptureDelegate { + private let continuation: CheckedContinuation<Data, Error> + private var didResume = false + + init(_ continuation: CheckedContinuation<Data, Error>) { + self.continuation = continuation + } + + func photoOutput( + _ output: AVCapturePhotoOutput, + didFinishProcessingPhoto photo: AVCapturePhoto, + error: Error?) + { + guard !self.didResume else { return } + self.didResume = true + + if let error { + self.continuation.resume(throwing: error) + return + } + guard let data = photo.fileDataRepresentation() else { + self.continuation.resume( + throwing: NSError(domain: "Camera", code: 1, userInfo: [ + NSLocalizedDescriptionKey: "photo data missing", + ])) + return + } + self.continuation.resume(returning: data) + } +} + +private final class MovieFileDelegate: NSObject, AVCaptureFileOutputRecordingDelegate { + private let continuation: CheckedContinuation<URL, Error> + private var didResume = false + + init(_ continuation: CheckedContinuation<URL, Error>) { + self.continuation = continuation + } + + func fileOutput( + _ output: AVCaptureFileOutput, + didFinishRecordingTo outputFileURL: URL, + from connections: [AVCaptureConnection], + error: Error?)
+ { + guard !self.didResume else { return } + self.didResume = true + + if let error { + // maxRecordedDuration ends the recording through this error path; the file is still valid + // (mirrors the macOS MovieFileDelegate below). + let ns = error as NSError + if ns.domain == AVFoundationErrorDomain, ns.code == AVError.maximumDurationReached.rawValue { + self.continuation.resume(returning: outputFileURL) + return + } + self.continuation.resume(throwing: error) + return + } + self.continuation.resume(returning: outputFileURL) + } +} diff --git a/apps/ios/Sources/Info.plist b/apps/ios/Sources/Info.plist index 78f4b34aa..6ed6968b2 100644 --- a/apps/ios/Sources/Info.plist +++ b/apps/ios/Sources/Info.plist @@ -26,6 +26,8 @@ <key>NSLocalNetworkUsageDescription</key> <string>Clawdis discovers and connects to your Clawdis bridge on the local network.</string> + <key>NSCameraUsageDescription</key> + <string>Clawdis can capture photos or short video clips when requested via the bridge.</string> <key>NSMicrophoneUsageDescription</key> <string>Clawdis needs microphone access for voice wake.</string> <key>NSSpeechRecognitionUsageDescription</key> diff --git a/apps/ios/Sources/Model/NodeAppModel.swift b/apps/ios/Sources/Model/NodeAppModel.swift index de20a117b..c581c6b30 100644 --- a/apps/ios/Sources/Model/NodeAppModel.swift +++ b/apps/ios/Sources/Model/NodeAppModel.swift @@ -6,6 +6,7 @@ import SwiftUI final class NodeAppModel: ObservableObject { @Published var isBackgrounded: Bool = false let screen = ScreenController() + let camera = CameraController() @Published var bridgeStatusText: String = "Not connected" @Published var bridgeServerName: String? @Published var bridgeRemoteAddress: String? @@ -182,13 +183,22 @@ final class NodeAppModel: ObservableObject { } private func handleInvoke(_ req: BridgeInvokeRequest) async -> BridgeInvokeResponse { - if req.command.hasPrefix("screen."), self.isBackgrounded { + if req.command.hasPrefix("screen.") || req.command.hasPrefix("camera."), self.isBackgrounded { return BridgeInvokeResponse( id: req.id, ok: false, error: ClawdisNodeError( code: .backgroundUnavailable, - message: "NODE_BACKGROUND_UNAVAILABLE: screen commands require foreground")) + message: "NODE_BACKGROUND_UNAVAILABLE: screen/camera commands require foreground")) + } + + if req.command.hasPrefix("camera."), !self.isCameraEnabled() { + return BridgeInvokeResponse( + id: req.id, + ok: false, + error: ClawdisNodeError( + code: .unavailable, + message: "CAMERA_DISABLED: enable Camera in iOS Settings → Camera → Allow Camera")) } do { @@ -222,6 +232,46 @@ final class NodeAppModel: ObservableObject { let payload = try Self.encodePayload(["format": "png", "base64": base64]) return BridgeInvokeResponse(id: req.id, ok: true, payloadJSON: payload) + case ClawdisCameraCommand.snap.rawValue: + let params = (try? Self.decodeParams(ClawdisCameraSnapParams.self, from: req.paramsJSON)) ?? + ClawdisCameraSnapParams() + let res = try await self.camera.snap(params: params) + + struct Payload: Codable { + var format: String + var base64: String + var width: Int + var height: Int + } + let payload = try Self.encodePayload(Payload( + format: res.format, + base64: res.base64, + width: res.width, + height: res.height)) + return BridgeInvokeResponse(id: req.id, ok: true, payloadJSON: payload) + + case ClawdisCameraCommand.clip.rawValue: + let params = (try? Self.decodeParams(ClawdisCameraClipParams.self, from: req.paramsJSON)) ?? + ClawdisCameraClipParams() + + let suspended = (params.includeAudio ?? true) ? 
self.voiceWake.suspendForExternalAudioCapture() : false + defer { self.voiceWake.resumeAfterExternalAudioCapture(wasSuspended: suspended) } + + let res = try await self.camera.clip(params: params) + + struct Payload: Codable { + var format: String + var base64: String + var durationMs: Int + var hasAudio: Bool + } + let payload = try Self.encodePayload(Payload( + format: res.format, + base64: res.base64, + durationMs: res.durationMs, + hasAudio: res.hasAudio)) + return BridgeInvokeResponse(id: req.id, ok: true, payloadJSON: payload) + default: return BridgeInvokeResponse( id: req.id, @@ -254,4 +304,10 @@ final class NodeAppModel: ObservableObject { } return json } + + private func isCameraEnabled() -> Bool { + // Default-on: if the key doesn't exist yet, treat it as enabled. + if UserDefaults.standard.object(forKey: "camera.enabled") == nil { return true } + return UserDefaults.standard.bool(forKey: "camera.enabled") + } } diff --git a/apps/ios/Sources/Voice/VoiceWakeManager.swift b/apps/ios/Sources/Voice/VoiceWakeManager.swift index 348d0bd78..2b46c5490 100644 --- a/apps/ios/Sources/Voice/VoiceWakeManager.swift +++ b/apps/ios/Sources/Voice/VoiceWakeManager.swift @@ -205,6 +205,37 @@ final class VoiceWakeManager: NSObject, ObservableObject { try? AVAudioSession.sharedInstance().setActive(false, options: .notifyOthersOnDeactivation) } + /// Temporarily releases the microphone so other subsystems (e.g. camera video capture) can record audio. + /// Returns `true` when listening was active and was suspended. + func suspendForExternalAudioCapture() -> Bool { + guard self.isEnabled, self.isListening else { return false } + + self.isListening = false + self.statusText = "Paused" + + self.tapDrainTask?.cancel() + self.tapDrainTask = nil + self.tapQueue?.clear() + self.tapQueue = nil + + self.recognitionTask?.cancel() + self.recognitionTask = nil + self.recognitionRequest = nil + + if self.audioEngine.isRunning { + self.audioEngine.stop() + self.audioEngine.inputNode.removeTap(onBus: 0) + } + + try? AVAudioSession.sharedInstance().setActive(false, options: .notifyOthersOnDeactivation) + return true + } + + func resumeAfterExternalAudioCapture(wasSuspended: Bool) { + guard wasSuspended else { return } + Task { await self.start() } + } + private func startRecognition() throws { self.recognitionTask?.cancel() self.recognitionTask = nil diff --git a/apps/ios/project.yml b/apps/ios/project.yml index bd8d1479e..8281aac17 100644 --- a/apps/ios/project.yml +++ b/apps/ios/project.yml @@ -54,5 +54,6 @@ targets: NSLocalNetworkUsageDescription: Clawdis discovers and connects to your Clawdis bridge on the local network. NSBonjourServices: - _clawdis-bridge._tcp + NSCameraUsageDescription: Clawdis can capture photos or short video clips when requested via the bridge. NSMicrophoneUsageDescription: Clawdis needs microphone access for voice wake. NSSpeechRecognitionUsageDescription: Clawdis uses on-device speech recognition for voice wake. 
diff --git a/apps/macos/Sources/Clawdis/CameraCaptureService.swift b/apps/macos/Sources/Clawdis/CameraCaptureService.swift new file mode 100644 index 000000000..52ce4d53d --- /dev/null +++ b/apps/macos/Sources/Clawdis/CameraCaptureService.swift @@ -0,0 +1,341 @@ +import AVFoundation +import ClawdisIPC +import CoreGraphics +import Foundation +import ImageIO +import OSLog +import UniformTypeIdentifiers + +actor CameraCaptureService { + enum CameraError: LocalizedError, Sendable { + case cameraUnavailable + case microphoneUnavailable + case permissionDenied(kind: String) + case captureFailed(String) + case exportFailed(String) + + var errorDescription: String? { + switch self { + case .cameraUnavailable: + "Camera unavailable" + case .microphoneUnavailable: + "Microphone unavailable" + case let .permissionDenied(kind): + "\(kind) permission denied" + case let .captureFailed(msg): + msg + case let .exportFailed(msg): + msg + } + } + } + + private let logger = Logger(subsystem: "com.steipete.clawdis", category: "camera") + + func snap(facing: CameraFacing?, maxWidth: Int?, quality: Double?) async throws -> (data: Data, size: CGSize) { + let facing = facing ?? .front + let maxWidth = maxWidth.flatMap { $0 > 0 ? $0 : nil } + let quality = Self.clampQuality(quality) + + try await self.ensureAccess(for: .video) + + let session = AVCaptureSession() + session.sessionPreset = .photo + + guard let device = Self.pickCamera(facing: facing) else { + throw CameraError.cameraUnavailable + } + + let input = try AVCaptureDeviceInput(device: device) + guard session.canAddInput(input) else { + throw CameraError.captureFailed("Failed to add camera input") + } + session.addInput(input) + + let output = AVCapturePhotoOutput() + guard session.canAddOutput(output) else { + throw CameraError.captureFailed("Failed to add photo output") + } + session.addOutput(output) + output.maxPhotoQualityPrioritization = .quality + + session.startRunning() + defer { session.stopRunning() } + + let settings: AVCapturePhotoSettings = { + if output.availablePhotoCodecTypes.contains(.jpeg) { + return AVCapturePhotoSettings(format: [AVVideoCodecKey: AVVideoCodecType.jpeg]) + } + return AVCapturePhotoSettings() + }() + settings.photoQualityPrioritization = .quality + + let rawData: Data = try await withCheckedThrowingContinuation(isolation: nil) { cont in + output.capturePhoto(with: settings, delegate: PhotoCaptureDelegate(cont)) + } + + return try Self.reencodeJPEG(imageData: rawData, maxWidth: maxWidth, quality: quality) + } + + func clip( + facing: CameraFacing?, + durationMs: Int?, + includeAudio: Bool, + outPath: String?) async throws -> (path: String, durationMs: Int, hasAudio: Bool) + { + let facing = facing ?? 
.front + let durationMs = Self.clampDurationMs(durationMs) + + try await self.ensureAccess(for: .video) + if includeAudio { + try await self.ensureAccess(for: .audio) + } + + let session = AVCaptureSession() + session.sessionPreset = .high + + guard let camera = Self.pickCamera(facing: facing) else { + throw CameraError.cameraUnavailable + } + let cameraInput = try AVCaptureDeviceInput(device: camera) + guard session.canAddInput(cameraInput) else { + throw CameraError.captureFailed("Failed to add camera input") + } + session.addInput(cameraInput) + + if includeAudio { + guard let mic = AVCaptureDevice.default(for: .audio) else { + throw CameraError.microphoneUnavailable + } + let micInput = try AVCaptureDeviceInput(device: mic) + guard session.canAddInput(micInput) else { + throw CameraError.captureFailed("Failed to add microphone input") + } + session.addInput(micInput) + } + + let output = AVCaptureMovieFileOutput() + guard session.canAddOutput(output) else { + throw CameraError.captureFailed("Failed to add movie output") + } + session.addOutput(output) + output.maxRecordedDuration = CMTime(value: Int64(durationMs), timescale: 1000) + + session.startRunning() + defer { session.stopRunning() } + + let tmpMovURL = FileManager.default.temporaryDirectory + .appendingPathComponent("clawdis-camera-\(UUID().uuidString).mov") + defer { try? FileManager.default.removeItem(at: tmpMovURL) } + + let outputURL: URL = { + if let outPath, !outPath.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty { + return URL(fileURLWithPath: outPath) + } + return FileManager.default.temporaryDirectory + .appendingPathComponent("clawdis-camera-\(UUID().uuidString).mp4") + }() + + // Ensure we don't fail exporting due to an existing file. + try? FileManager.default.removeItem(at: outputURL) + + let logger = self.logger + let recordedURL: URL = try await withCheckedThrowingContinuation(isolation: nil) { cont in + output.startRecording(to: tmpMovURL, recordingDelegate: MovieFileDelegate(cont, logger: logger)) + } + + try await Self.exportToMP4(inputURL: recordedURL, outputURL: outputURL) + return (path: outputURL.path, durationMs: durationMs, hasAudio: includeAudio) + } + + private func ensureAccess(for mediaType: AVMediaType) async throws { + let status = AVCaptureDevice.authorizationStatus(for: mediaType) + switch status { + case .authorized: + return + case .notDetermined: + let ok = await withCheckedContinuation(isolation: nil) { cont in + AVCaptureDevice.requestAccess(for: mediaType) { granted in + cont.resume(returning: granted) + } + } + if !ok { + throw CameraError.permissionDenied(kind: mediaType == .video ? "Camera" : "Microphone") + } + case .denied, .restricted: + throw CameraError.permissionDenied(kind: mediaType == .video ? "Camera" : "Microphone") + @unknown default: + throw CameraError.permissionDenied(kind: mediaType == .video ? "Camera" : "Microphone") + } + } + + private nonisolated static func pickCamera(facing: CameraFacing) -> AVCaptureDevice? { + let position: AVCaptureDevice.Position = (facing == .front) ? .front : .back + + if let device = AVCaptureDevice.default(.builtInWideAngleCamera, for: .video, position: position) { + return device + } + + // Many macOS cameras report `unspecified` position; fall back to any default. + return AVCaptureDevice.default(for: .video) + } + + private nonisolated static func clampQuality(_ quality: Double?) -> Double { + let q = quality ?? 0.9 + return min(1.0, max(0.05, q)) + } + + private nonisolated static func clampDurationMs(_ ms: Int?) 
-> Int { + let v = ms ?? 3000 + return min(15_000, max(250, v)) + } + + private nonisolated static func reencodeJPEG( + imageData: Data, + maxWidth: Int?, + quality: Double) throws -> (data: Data, size: CGSize) + { + guard let src = CGImageSourceCreateWithData(imageData as CFData, nil), + let img = CGImageSourceCreateImageAtIndex(src, 0, nil) + else { + throw CameraError.captureFailed("Failed to decode captured image") + } + + let finalImage: CGImage + if let maxWidth, img.width > maxWidth { + guard let scaled = self.downscale(image: img, maxWidth: maxWidth) else { + throw CameraError.captureFailed("Failed to downscale image") + } + finalImage = scaled + } else { + finalImage = img + } + + let out = NSMutableData() + guard let dest = CGImageDestinationCreateWithData(out, UTType.jpeg.identifier as CFString, 1, nil) else { + throw CameraError.captureFailed("Failed to create JPEG destination") + } + + let props = [kCGImageDestinationLossyCompressionQuality: quality] as CFDictionary + CGImageDestinationAddImage(dest, finalImage, props) + guard CGImageDestinationFinalize(dest) else { + throw CameraError.captureFailed("Failed to encode JPEG") + } + + return (out as Data, CGSize(width: finalImage.width, height: finalImage.height)) + } + + private nonisolated static func downscale(image: CGImage, maxWidth: Int) -> CGImage? { + guard image.width > 0, image.height > 0 else { return image } + guard image.width > maxWidth else { return image } + + let scale = Double(maxWidth) / Double(image.width) + let targetW = maxWidth + let targetH = max(1, Int((Double(image.height) * scale).rounded())) + + let cs = CGColorSpaceCreateDeviceRGB() + let bitmapInfo = CGImageAlphaInfo.premultipliedLast.rawValue + guard let ctx = CGContext( + data: nil, + width: targetW, + height: targetH, + bitsPerComponent: 8, + bytesPerRow: 0, + space: cs, + bitmapInfo: bitmapInfo) + else { return nil } + + ctx.interpolationQuality = .high + ctx.draw(image, in: CGRect(x: 0, y: 0, width: targetW, height: targetH)) + return ctx.makeImage() + } + + private nonisolated static func exportToMP4(inputURL: URL, outputURL: URL) async throws { + let asset = AVAsset(url: inputURL) + guard let export = AVAssetExportSession(asset: asset, presetName: AVAssetExportPresetMediumQuality) else { + throw CameraError.exportFailed("Failed to create export session") + } + export.outputURL = outputURL + export.outputFileType = .mp4 + export.shouldOptimizeForNetworkUse = true + + await withCheckedContinuation { cont in + export.exportAsynchronously { + cont.resume() + } + } + + switch export.status { + case .completed: + return + case .failed: + throw CameraError.exportFailed(export.error?.localizedDescription ?? "export failed") + case .cancelled: + throw CameraError.exportFailed("export cancelled") + default: + throw CameraError.exportFailed("export did not complete (\(export.status.rawValue))") + } + } +} + +private final class PhotoCaptureDelegate: NSObject, AVCapturePhotoCaptureDelegate { + private var cont: CheckedContinuation<Data, Error>? + + init(_ cont: CheckedContinuation<Data, Error>) { + self.cont = cont + } + + func photoOutput( + _ output: AVCapturePhotoOutput, + didFinishProcessingPhoto photo: AVCapturePhoto, + error: Error?)
+ { + guard let cont else { return } + self.cont = nil + if let error { + cont.resume(throwing: error) + return + } + guard let data = photo.fileDataRepresentation() else { + cont.resume(throwing: CameraCaptureService.CameraError.captureFailed("No photo data")) + return + } + cont.resume(returning: data) + } +} + +private final class MovieFileDelegate: NSObject, AVCaptureFileOutputRecordingDelegate { + private var cont: CheckedContinuation<URL, Error>? + private let logger: Logger + + init(_ cont: CheckedContinuation<URL, Error>, logger: Logger) { + self.cont = cont + self.logger = logger + } + + func fileOutput( + _ output: AVCaptureFileOutput, + didFinishRecordingTo outputFileURL: URL, + from connections: [AVCaptureConnection], + error: Error?) + { + guard let cont else { return } + self.cont = nil + + if let error { + let ns = error as NSError + if ns.domain == AVFoundationErrorDomain, + ns.code == AVError.maximumDurationReached.rawValue + { + cont.resume(returning: outputFileURL) + return + } + + self.logger.error("camera record failed: \(error.localizedDescription, privacy: .public)") + cont.resume(throwing: error) + return + } + + cont.resume(returning: outputFileURL) + } +} diff --git a/apps/macos/Sources/Clawdis/Constants.swift b/apps/macos/Sources/Clawdis/Constants.swift index c4538365c..dc0965425 100644 --- a/apps/macos/Sources/Clawdis/Constants.swift +++ b/apps/macos/Sources/Clawdis/Constants.swift @@ -24,6 +24,7 @@ let webChatEnabledKey = "clawdis.webChatEnabled" let webChatSwiftUIEnabledKey = "clawdis.webChatSwiftUIEnabled" let webChatPortKey = "clawdis.webChatPort" let canvasEnabledKey = "clawdis.canvasEnabled" +let cameraEnabledKey = "clawdis.cameraEnabled" let peekabooBridgeEnabledKey = "clawdis.peekabooBridgeEnabled" let deepLinkAgentEnabledKey = "clawdis.deepLinkAgentEnabled" let deepLinkKeyKey = "clawdis.deepLinkKey" diff --git a/apps/macos/Sources/Clawdis/ControlRequestHandler.swift b/apps/macos/Sources/Clawdis/ControlRequestHandler.swift index a847ce62c..e17dddafa 100644 --- a/apps/macos/Sources/Clawdis/ControlRequestHandler.swift +++ b/apps/macos/Sources/Clawdis/ControlRequestHandler.swift @@ -3,6 +3,8 @@ import Foundation import OSLog enum ControlRequestHandler { + private static let cameraCapture = CameraCaptureService() + static func process( request: Request, notifier: NotificationManager = NotificationManager(), @@ -77,6 +79,16 @@ enum ControlRequestHandler { command: command, paramsJSON: paramsJSON, logger: logger) + + case let .cameraSnap(facing, maxWidth, quality, outPath): + return await self.handleCameraSnap(facing: facing, maxWidth: maxWidth, quality: quality, outPath: outPath) + + case let .cameraClip(facing, durationMs, includeAudio, outPath): + return await self.handleCameraClip( + facing: facing, + durationMs: durationMs, + includeAudio: includeAudio, + outPath: outPath) } } @@ -173,6 +185,10 @@ enum ControlRequestHandler { UserDefaults.standard.object(forKey: canvasEnabledKey) as? Bool ?? true } + private static func cameraEnabled() -> Bool { + UserDefaults.standard.object(forKey: cameraEnabledKey) as? Bool ?? false + } + private static func handleCanvasShow( session: String, path: String?, @@ -254,4 +270,46 @@ enum ControlRequestHandler { return Response(ok: false, message: error.localizedDescription) } } + + private static func handleCameraSnap( + facing: CameraFacing?, + maxWidth: Int?, + quality: Double?, + outPath: String?)
async -> Response + { + guard self.cameraEnabled() else { return Response(ok: false, message: "Camera disabled by user") } + do { + let res = try await self.cameraCapture.snap(facing: facing, maxWidth: maxWidth, quality: quality) + let url: URL = if let outPath, !outPath.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty { + URL(fileURLWithPath: outPath) + } else { + FileManager.default.temporaryDirectory + .appendingPathComponent("clawdis-camera-snap-\(UUID().uuidString).jpg") + } + + try res.data.write(to: url, options: [.atomic]) + return Response(ok: true, message: url.path) + } catch { + return Response(ok: false, message: error.localizedDescription) + } + } + + private static func handleCameraClip( + facing: CameraFacing?, + durationMs: Int?, + includeAudio: Bool, + outPath: String?) async -> Response + { + guard self.cameraEnabled() else { return Response(ok: false, message: "Camera disabled by user") } + do { + let res = try await self.cameraCapture.clip( + facing: facing, + durationMs: durationMs, + includeAudio: includeAudio, + outPath: outPath) + return Response(ok: true, message: res.path) + } catch { + return Response(ok: false, message: error.localizedDescription) + } + } } diff --git a/apps/macos/Sources/Clawdis/DebugSettings.swift b/apps/macos/Sources/Clawdis/DebugSettings.swift index 6b79e24b7..abd5d0e44 100644 --- a/apps/macos/Sources/Clawdis/DebugSettings.swift +++ b/apps/macos/Sources/Clawdis/DebugSettings.swift @@ -9,6 +9,7 @@ struct DebugSettings: View { @AppStorage(modelCatalogReloadKey) private var modelCatalogReloadBump: Int = 0 @AppStorage(iconOverrideKey) private var iconOverrideRaw: String = IconOverrideSelection.system.rawValue @AppStorage(canvasEnabledKey) private var canvasEnabled: Bool = true + @AppStorage(cameraEnabledKey) private var cameraEnabled: Bool = false @AppStorage(deepLinkAgentEnabledKey) private var deepLinkAgentEnabled: Bool = false @State private var modelsCount: Int? 
@State private var modelsLoading = false @@ -48,6 +49,7 @@ struct DebugSettings: View { self.pathsSection self.quickActionsSection self.canvasSection + self.cameraSection self.experimentsSection Spacer(minLength: 0) @@ -571,6 +573,20 @@ struct DebugSettings: View { } } + private var cameraSection: some View { + GroupBox("Camera") { + VStack(alignment: .leading, spacing: 10) { + Toggle("Allow Camera (agent)", isOn: self.$cameraEnabled) + .toggleStyle(.checkbox) + .help("When off, camera requests return “Camera disabled by user”.") + + Text("Allows Clawdis to capture a photo or short video via the built-in camera.") + .font(.caption) + .foregroundStyle(.secondary) + } + } + } + private var experimentsSection: some View { GroupBox("Experiments") { Grid(alignment: .leadingFirstTextBaseline, horizontalSpacing: 14, verticalSpacing: 10) { diff --git a/apps/macos/Sources/ClawdisCLI/ClawdisCLI.swift b/apps/macos/Sources/ClawdisCLI/ClawdisCLI.swift index fa4100422..b1c203ef1 100644 --- a/apps/macos/Sources/ClawdisCLI/ClawdisCLI.swift +++ b/apps/macos/Sources/ClawdisCLI/ClawdisCLI.swift @@ -52,6 +52,7 @@ struct ClawdisCLI { enum Kind { case generic + case mediaPath } } @@ -91,6 +92,9 @@ struct ClawdisCLI { case "canvas": return try self.parseCanvas(args: &args) + case "camera": + return try self.parseCamera(args: &args) + default: throw CLIError.help } @@ -292,6 +296,62 @@ struct ClawdisCLI { } } + private static func parseCamera(args: inout [String]) throws -> ParsedCLIRequest { + guard let sub = args.popFirst() else { throw CLIError.help } + switch sub { + case "snap": + var facing: CameraFacing? + var maxWidth: Int? + var quality: Double? + var outPath: String? + while !args.isEmpty { + let arg = args.removeFirst() + switch arg { + case "--facing": + if let val = args.popFirst(), let f = CameraFacing(rawValue: val) { facing = f } + case "--max-width": + maxWidth = args.popFirst().flatMap(Int.init) + case "--quality": + quality = args.popFirst().flatMap(Double.init) + case "--out": + outPath = args.popFirst() + default: + break + } + } + return ParsedCLIRequest( + request: .cameraSnap(facing: facing, maxWidth: maxWidth, quality: quality, outPath: outPath), + kind: .mediaPath) + + case "clip": + var facing: CameraFacing? + var durationMs: Int? + var includeAudio = true + var outPath: String? 
+ while !args.isEmpty { + let arg = args.removeFirst() + switch arg { + case "--facing": + if let val = args.popFirst(), let f = CameraFacing(rawValue: val) { facing = f } + case "--duration-ms": + durationMs = args.popFirst().flatMap(Int.init) + case "--no-audio": + includeAudio = false + case "--out": + outPath = args.popFirst() + default: + break + } + } + return ParsedCLIRequest( + request: .cameraClip(facing: facing, durationMs: durationMs, includeAudio: includeAudio, outPath: outPath), + kind: .mediaPath) + + default: + throw CLIError.help + } + } + private static func parseCanvasPlacement( args: inout [String], session: inout String, @@ -334,6 +394,10 @@ struct ClawdisCLI { if let message = response.message, !message.isEmpty { FileHandle.standardOutput.write(Data((message + "\n").utf8)) } + case .mediaPath: + if let message = response.message, !message.isEmpty { + print("MEDIA:\(message)") + } } } @@ -352,6 +416,8 @@ struct ClawdisCLI { output["payload"] = text } } + case .mediaPath: + break } let json = try JSONSerialization.data(withJSONObject: output, options: [.prettyPrinted]) @@ -406,6 +472,10 @@ struct ClawdisCLI { clawdis-mac canvas eval --js <code> [--session <id>] clawdis-mac canvas snapshot [--out <path>] [--session <id>] + Camera: + clawdis-mac camera snap [--facing <front|back>] [--max-width <px>] [--quality <0-1>] [--out <path>] + clawdis-mac camera clip [--facing <front|back>] [--duration-ms <ms>] [--no-audio] [--out <path>] + Browser (clawd): clawdis-mac browser status|start|stop|tabs|open|focus|close|screenshot|eval|query|dom|snapshot @@ -433,6 +503,7 @@ struct ClawdisCLI { Output: Default output is text. Use --json for machine-readable output. In text mode, `browser screenshot` prints MEDIA:<path>. + In text mode, `camera snap` and `camera clip` print MEDIA:<path>. """ print(usage) } diff --git a/apps/macos/Sources/ClawdisIPC/IPC.swift b/apps/macos/Sources/ClawdisIPC/IPC.swift index feb062bdc..6aadc0185 100644 --- a/apps/macos/Sources/ClawdisIPC/IPC.swift +++ b/apps/macos/Sources/ClawdisIPC/IPC.swift @@ -13,6 +13,11 @@ public enum Capability: String, Codable, CaseIterable, Sendable { case speechRecognition } +public enum CameraFacing: String, Codable, Sendable { + case front + case back +} + // MARK: - Requests /// Notification interruption level (maps to UNNotificationInterruptionLevel) @@ -74,6 +79,8 @@ public enum Request: Sendable { case canvasSnapshot(session: String, outPath: String?) case nodeList case nodeInvoke(nodeId: String, command: String, paramsJSON: String?) + case cameraSnap(facing: CameraFacing?, maxWidth: Int?, quality: Double?, outPath: String?) + case cameraClip(facing: CameraFacing?, durationMs: Int?, includeAudio: Bool, outPath: String?)
} // MARK: - Responses @@ -104,6 +111,11 @@ extension Request: Codable { case path case javaScript case outPath + case facing + case maxWidth + case quality + case durationMs + case includeAudio case placement case nodeId case nodeCommand @@ -124,6 +136,8 @@ extension Request: Codable { case canvasSnapshot case nodeList case nodeInvoke + case cameraSnap + case cameraClip } public func encode(to encoder: Encoder) throws { @@ -198,6 +212,20 @@ extension Request: Codable { try container.encode(nodeId, forKey: .nodeId) try container.encode(command, forKey: .nodeCommand) try container.encodeIfPresent(paramsJSON, forKey: .paramsJSON) + + case let .cameraSnap(facing, maxWidth, quality, outPath): + try container.encode(Kind.cameraSnap, forKey: .type) + try container.encodeIfPresent(facing, forKey: .facing) + try container.encodeIfPresent(maxWidth, forKey: .maxWidth) + try container.encodeIfPresent(quality, forKey: .quality) + try container.encodeIfPresent(outPath, forKey: .outPath) + + case let .cameraClip(facing, durationMs, includeAudio, outPath): + try container.encode(Kind.cameraClip, forKey: .type) + try container.encodeIfPresent(facing, forKey: .facing) + try container.encodeIfPresent(durationMs, forKey: .durationMs) + try container.encode(includeAudio, forKey: .includeAudio) + try container.encodeIfPresent(outPath, forKey: .outPath) } } @@ -274,6 +302,20 @@ extension Request: Codable { let command = try container.decode(String.self, forKey: .nodeCommand) let paramsJSON = try container.decodeIfPresent(String.self, forKey: .paramsJSON) self = .nodeInvoke(nodeId: nodeId, command: command, paramsJSON: paramsJSON) + + case .cameraSnap: + let facing = try container.decodeIfPresent(CameraFacing.self, forKey: .facing) + let maxWidth = try container.decodeIfPresent(Int.self, forKey: .maxWidth) + let quality = try container.decodeIfPresent(Double.self, forKey: .quality) + let outPath = try container.decodeIfPresent(String.self, forKey: .outPath) + self = .cameraSnap(facing: facing, maxWidth: maxWidth, quality: quality, outPath: outPath) + + case .cameraClip: + let facing = try container.decodeIfPresent(CameraFacing.self, forKey: .facing) + let durationMs = try container.decodeIfPresent(Int.self, forKey: .durationMs) + let includeAudio = (try? container.decode(Bool.self, forKey: .includeAudio)) ?? 
true + let outPath = try container.decodeIfPresent(String.self, forKey: .outPath) + self = .cameraClip(facing: facing, durationMs: durationMs, includeAudio: includeAudio, outPath: outPath) } } } diff --git a/apps/macos/Tests/ClawdisIPCTests/CameraIPCTests.swift b/apps/macos/Tests/ClawdisIPCTests/CameraIPCTests.swift new file mode 100644 index 000000000..a5121ab56 --- /dev/null +++ b/apps/macos/Tests/ClawdisIPCTests/CameraIPCTests.swift @@ -0,0 +1,62 @@ +import ClawdisIPC +import Foundation +import Testing + +@Suite struct CameraIPCTests { + @Test func cameraSnapCodableRoundtrip() throws { + let req: Request = .cameraSnap( + facing: .front, + maxWidth: 640, + quality: 0.85, + outPath: "/tmp/test.jpg") + + let data = try JSONEncoder().encode(req) + let decoded = try JSONDecoder().decode(Request.self, from: data) + + switch decoded { + case let .cameraSnap(facing, maxWidth, quality, outPath): + #expect(facing == .front) + #expect(maxWidth == 640) + #expect(quality == 0.85) + #expect(outPath == "/tmp/test.jpg") + default: + Issue.record("expected cameraSnap, got \(decoded)") + } + } + + @Test func cameraClipCodableRoundtrip() throws { + let req: Request = .cameraClip( + facing: .back, + durationMs: 3000, + includeAudio: false, + outPath: "/tmp/test.mp4") + + let data = try JSONEncoder().encode(req) + let decoded = try JSONDecoder().decode(Request.self, from: data) + + switch decoded { + case let .cameraClip(facing, durationMs, includeAudio, outPath): + #expect(facing == .back) + #expect(durationMs == 3000) + #expect(includeAudio == false) + #expect(outPath == "/tmp/test.mp4") + default: + Issue.record("expected cameraClip, got \(decoded)") + } + } + + @Test func cameraClipDefaultsIncludeAudioToTrueWhenMissing() throws { + let json = """ + {"type":"cameraClip","durationMs":1234} + """ + let decoded = try JSONDecoder().decode(Request.self, from: Data(json.utf8)) + switch decoded { + case let .cameraClip(_, durationMs, includeAudio, _): + #expect(durationMs == 1234) + #expect(includeAudio == true) + default: + Issue.record("expected cameraClip, got \(decoded)") + } + } +} + diff --git a/apps/shared/ClawdisKit/Sources/ClawdisKit/CameraCommands.swift b/apps/shared/ClawdisKit/Sources/ClawdisKit/CameraCommands.swift new file mode 100644 index 000000000..dd2c2015d --- /dev/null +++ b/apps/shared/ClawdisKit/Sources/ClawdisKit/CameraCommands.swift @@ -0,0 +1,58 @@ +import Foundation + +public enum ClawdisCameraCommand: String, Codable, Sendable { + case snap = "camera.snap" + case clip = "camera.clip" +} + +public enum ClawdisCameraFacing: String, Codable, Sendable { + case back + case front +} + +public enum ClawdisCameraImageFormat: String, Codable, Sendable { + case jpg + case jpeg +} + +public enum ClawdisCameraVideoFormat: String, Codable, Sendable { + case mp4 +} + +public struct ClawdisCameraSnapParams: Codable, Sendable, Equatable { + public var facing: ClawdisCameraFacing? + public var maxWidth: Int? + public var quality: Double? + public var format: ClawdisCameraImageFormat? + + public init( + facing: ClawdisCameraFacing? = nil, + maxWidth: Int? = nil, + quality: Double? = nil, + format: ClawdisCameraImageFormat? = nil) + { + self.facing = facing + self.maxWidth = maxWidth + self.quality = quality + self.format = format + } +} + +public struct ClawdisCameraClipParams: Codable, Sendable, Equatable { + public var facing: ClawdisCameraFacing? + public var durationMs: Int? + public var includeAudio: Bool? + public var format: ClawdisCameraVideoFormat? 
+ + public init( + facing: ClawdisCameraFacing? = nil, + durationMs: Int? = nil, + includeAudio: Bool? = nil, + format: ClawdisCameraVideoFormat? = nil) + { + self.facing = facing + self.durationMs = durationMs + self.includeAudio = includeAudio + self.format = format + } +} diff --git a/docs/camera.md b/docs/camera.md new file mode 100644 index 000000000..dc5ab93db --- /dev/null +++ b/docs/camera.md @@ -0,0 +1,98 @@ +--- +summary: "Camera capture (iOS node + macOS app) for agent use: photos (jpg) and short video clips (mp4)" +read_when: + - Adding or modifying camera capture on iOS nodes or macOS + - Extending agent-accessible MEDIA temp-file workflows +--- + +# Camera capture (agent) + +Clawdis supports **camera capture** for agent workflows: + +- **iOS node** (paired via Gateway): capture a **photo** (`jpg`) or **short video clip** (`mp4`, with optional audio) via `node.invoke`. +- **macOS app** (local control socket): capture a **photo** (`jpg`) or **short video clip** (`mp4`, with optional audio) via `clawdis-mac`. + +All camera access is gated behind **user-controlled settings**. + +## iOS node + +### User setting (default on) + +- iOS Settings tab → **Camera** → **Allow Camera** (`camera.enabled`) + - Default: **on** (missing key is treated as enabled). + - When off: `camera.*` commands return `CAMERA_DISABLED`. + +### Commands (via Gateway `node.invoke`) + +- `camera.snap` + - Params: + - `facing`: `front|back` (default: `front`) + - `maxWidth`: number (optional) + - `quality`: `0..1` (optional; default `0.9`) + - `format`: currently `jpg` + - Response payload: + - `format: "jpg"` + - `base64: "<...>"` + - `width`, `height` + +- `camera.clip` + - Params: + - `facing`: `front|back` (default: `front`) + - `durationMs`: number (default `3000`, clamped to a max) + - `includeAudio`: boolean (default `true`) + - `format`: currently `mp4` + - Response payload: + - `format: "mp4"` + - `base64: "<...>"` + - `durationMs` + - `hasAudio` + +### Foreground requirement + +Like `screen.*`, the iOS node only allows `camera.*` commands in the **foreground**. Background invocations return `NODE_BACKGROUND_UNAVAILABLE`. + +### CLI helper (temp files + MEDIA) + +The easiest way to get attachments is via the CLI helper, which writes decoded media to a temp file and prints `MEDIA:<path>`. + +Examples: + +```bash +clawdis nodes camera snap --node <id> # default: both front + back (2 MEDIA lines) +clawdis nodes camera snap --node <id> --facing front +clawdis nodes camera clip --node <id> --duration 3000 +clawdis nodes camera clip --node <id> --no-audio +``` + +Notes: +- `nodes camera snap` defaults to **both** facings to give the agent both views. +- Output files are temporary (in the OS temp directory) unless you build your own wrapper. + +## macOS app + +### User setting (default off) + +The macOS companion app exposes a checkbox: + +- **Settings → Debug → Camera → Allow Camera (agent)** (`clawdis.cameraEnabled`) + - Default: **off** + - When off: camera requests return “Camera disabled by user”. + +### CLI helper (local control socket) + +The `clawdis-mac` helper talks to the running menu bar app over the local control socket. + +Examples: + +```bash +clawdis-mac camera snap # prints MEDIA:<path> +clawdis-mac camera snap --max-width 1280 +clawdis-mac camera clip --duration-ms 3000 # prints MEDIA:<path> +clawdis-mac camera clip --no-audio +``` + +## Safety + practical limits + +- Camera and microphone access trigger the usual OS permission prompts (and require usage strings in Info.plist).
+- Video clips are intentionally short to avoid oversized bridge payloads (base64 overhead + WebSocket message limits). + diff --git a/scripts/package-mac-app.sh b/scripts/package-mac-app.sh index a3bd8b56d..fc084f1e4 100755 --- a/scripts/package-mac-app.sh +++ b/scripts/package-mac-app.sh @@ -98,6 +98,8 @@ cat > "$APP_ROOT/Contents/Info.plist" <<EOF <key>NSUserNotificationsUsageDescription</key> <string>Clawdis needs notification permission to show alerts for agent actions.</string> <key>NSScreenCaptureDescription</key> <string>Clawdis captures the screen when the agent needs screenshots for context.</string> + <key>NSCameraUsageDescription</key> + <string>Clawdis can capture photos or short video clips when requested by the agent.</string> <key>NSMicrophoneUsageDescription</key> <string>Clawdis needs the mic for Voice Wake tests and agent audio capture.</string> <key>NSSpeechRecognitionUsageDescription</key> diff --git a/src/cli/nodes-camera.test.ts b/src/cli/nodes-camera.test.ts new file mode 100644 index 000000000..cab0a91c1 --- /dev/null +++ b/src/cli/nodes-camera.test.ts @@ -0,0 +1,64 @@ +import * as fs from "node:fs/promises"; +import * as os from "node:os"; +import * as path from "node:path"; +import { describe, expect, it } from "vitest"; +import { + cameraTempPath, + parseCameraClipPayload, + parseCameraSnapPayload, + writeBase64ToFile, +} from "./nodes-camera.js"; + +describe("nodes camera helpers", () => { + it("parses camera.snap payload", () => { + expect( + parseCameraSnapPayload({ + format: "jpg", + base64: "aGk=", + width: 10, + height: 20, + }), + ).toEqual({ format: "jpg", base64: "aGk=", width: 10, height: 20 }); + }); + + it("rejects invalid camera.snap payload", () => { + expect(() => parseCameraSnapPayload({ format: "jpg" })).toThrow( + /invalid camera\.snap payload/i, + ); + }); + + it("parses camera.clip payload", () => { + expect( + parseCameraClipPayload({ + format: "mp4", + base64: "AAEC", + durationMs: 1234, + hasAudio: true, + }), + ).toEqual({ + format: "mp4", + base64: "AAEC", + durationMs: 1234, + hasAudio: true, + }); + }); + + it("builds stable temp paths when id provided", () => { + const p = cameraTempPath({ + kind: "snap", + facing: "front", + ext: "jpg", + tmpDir: "/tmp", + id: "id1", + }); + expect(p).toBe(path.join("/tmp", "clawdis-camera-snap-front-id1.jpg")); + }); + + it("writes base64 to file", async () => { + const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdis-test-")); + const out = path.join(dir, "x.bin"); + await writeBase64ToFile(out, "aGk="); + await expect(fs.readFile(out, "utf8")).resolves.toBe("hi"); + await fs.rm(dir, { recursive: true, force: true }); + }); +}); diff --git a/src/cli/nodes-camera.ts b/src/cli/nodes-camera.ts new file mode 100644 index 000000000..a7a8150ab --- /dev/null +++ b/src/cli/nodes-camera.ts @@ -0,0 +1,92 @@ +import { randomUUID } from "node:crypto"; +import * as fs from "node:fs/promises"; +import * as os from "node:os"; +import * as path from "node:path"; + +export type CameraFacing = "front" | "back"; + +export type CameraSnapPayload = { + format: string; + base64: string; + width: number; + height: number; +}; + +export type CameraClipPayload = { + format: string; + base64: string; + durationMs: number; + hasAudio: boolean; +}; + +function asRecord(value: unknown): Record<string, unknown> { + return typeof value === "object" && value !== null + ? (value as Record<string, unknown>) + : {}; +} + +function asString(value: unknown): string | undefined { + return typeof value === "string" ? value : undefined; } + +function asNumber(value: unknown): number | undefined { + return typeof value === "number" && Number.isFinite(value) + ? 
value + : undefined; +} + +function asBoolean(value: unknown): boolean | undefined { + return typeof value === "boolean" ? value : undefined; +} + +export function parseCameraSnapPayload(value: unknown): CameraSnapPayload { + const obj = asRecord(value); + const format = asString(obj.format); + const base64 = asString(obj.base64); + const width = asNumber(obj.width); + const height = asNumber(obj.height); + if (!format || !base64 || width === undefined || height === undefined) { + throw new Error("invalid camera.snap payload"); + } + return { format, base64, width, height }; +} + +export function parseCameraClipPayload(value: unknown): CameraClipPayload { + const obj = asRecord(value); + const format = asString(obj.format); + const base64 = asString(obj.base64); + const durationMs = asNumber(obj.durationMs); + const hasAudio = asBoolean(obj.hasAudio); + if ( + !format || + !base64 || + durationMs === undefined || + hasAudio === undefined + ) { + throw new Error("invalid camera.clip payload"); + } + return { format, base64, durationMs, hasAudio }; +} + +export function cameraTempPath(opts: { + kind: "snap" | "clip"; + facing?: CameraFacing; + ext: string; + tmpDir?: string; + id?: string; +}) { + const tmpDir = opts.tmpDir ?? os.tmpdir(); + const id = opts.id ?? randomUUID(); + const facingPart = opts.facing ? `-${opts.facing}` : ""; + const ext = opts.ext.startsWith(".") ? opts.ext : `.${opts.ext}`; + return path.join( + tmpDir, + `clawdis-camera-${opts.kind}${facingPart}-${id}${ext}`, + ); +} + +export async function writeBase64ToFile(filePath: string, base64: string) { + const buf = Buffer.from(base64, "base64"); + await fs.writeFile(filePath, buf); + return { path: filePath, bytes: buf.length }; +} diff --git a/src/cli/nodes-cli.ts b/src/cli/nodes-cli.ts index 669ab5c51..1bfa931b9 100644 --- a/src/cli/nodes-cli.ts +++ b/src/cli/nodes-cli.ts @@ -1,6 +1,13 @@ import type { Command } from "commander"; import { callGateway, randomIdempotencyKey } from "../gateway/call.js"; import { defaultRuntime } from "../runtime.js"; +import { + type CameraFacing, + cameraTempPath, + parseCameraClipPayload, + parseCameraSnapPayload, + writeBase64ToFile, +} from "./nodes-camera.js"; type NodesRpcOpts = { url?: string; @@ -12,6 +19,11 @@ params?: string; invokeTimeout?: string; idempotencyKey?: string; + facing?: string; + maxWidth?: string; + quality?: string; + duration?: string; + audio?: boolean; }; type NodeListNode = { @@ -340,4 +352,203 @@ export function registerNodesCli(program: Command) { }), { timeoutMs: 30_000 }, ); + + const parseFacing = (value: string): CameraFacing => { + const v = String(value ?? "") + .trim() + .toLowerCase(); + if (v === "front" || v === "back") return v; + throw new Error(`invalid facing: ${value} (expected front|back)`); + }; + + const camera = nodes + .command("camera") + .description("Capture camera media from a paired node"); + + nodesCallOpts( + camera + .command("snap") + .description("Capture a photo from a node camera (prints MEDIA:<path>)") + .requiredOption("--node <id>", "Node id, name, or IP") + .option("--facing <front|back|both>", "Camera facing", "both") + .option("--max-width <px>", "Max width in px (optional)") + .option("--quality <0-1>", "JPEG quality (default 0.9)") + .option( + "--invoke-timeout <ms>", + "Node invoke timeout in ms (default 20000)", + "20000", + ) + .action(async (opts: NodesRpcOpts) => { + try { + const nodeId = await resolveNodeId(opts, String(opts.node ?? "")); + const facingOpt = String(opts.facing ?? 
"both") + .trim() + .toLowerCase(); + const facings: CameraFacing[] = + facingOpt === "both" + ? ["front", "back"] + : facingOpt === "front" || facingOpt === "back" + ? [facingOpt] + : (() => { + throw new Error( + `invalid facing: ${String(opts.facing)} (expected front|back|both)`, + ); + })(); + + const maxWidth = opts.maxWidth + ? Number.parseInt(String(opts.maxWidth), 10) + : undefined; + const quality = opts.quality + ? Number.parseFloat(String(opts.quality)) + : undefined; + const timeoutMs = opts.invokeTimeout + ? Number.parseInt(String(opts.invokeTimeout), 10) + : undefined; + + const results: Array<{ + facing: CameraFacing; + path: string; + width: number; + height: number; + }> = []; + + for (const facing of facings) { + const invokeParams: Record = { + nodeId, + command: "camera.snap", + params: { + facing, + maxWidth: Number.isFinite(maxWidth) ? maxWidth : undefined, + quality: Number.isFinite(quality) ? quality : undefined, + format: "jpg", + }, + idempotencyKey: randomIdempotencyKey(), + }; + if (typeof timeoutMs === "number" && Number.isFinite(timeoutMs)) { + invokeParams.timeoutMs = timeoutMs; + } + + const raw = (await callGatewayCli( + "node.invoke", + opts, + invokeParams, + )) as unknown; + + const res = + typeof raw === "object" && raw !== null + ? (raw as { payload?: unknown }) + : {}; + const payload = parseCameraSnapPayload(res.payload); + const filePath = cameraTempPath({ + kind: "snap", + facing, + ext: payload.format === "jpeg" ? "jpg" : payload.format, + }); + await writeBase64ToFile(filePath, payload.base64); + results.push({ + facing, + path: filePath, + width: payload.width, + height: payload.height, + }); + } + + if (opts.json) { + defaultRuntime.log(JSON.stringify({ files: results }, null, 2)); + return; + } + defaultRuntime.log(results.map((r) => `MEDIA:${r.path}`).join("\n")); + } catch (err) { + defaultRuntime.error(`nodes camera snap failed: ${String(err)}`); + defaultRuntime.exit(1); + } + }), + { timeoutMs: 60_000 }, + ); + + nodesCallOpts( + camera + .command("clip") + .description( + "Capture a short video clip from a node camera (prints MEDIA:)", + ) + .requiredOption("--node ", "Node id, name, or IP") + .option("--facing ", "Camera facing", "front") + .option("--duration ", "Duration in ms (default 3000)", "3000") + .option("--no-audio", "Disable audio capture") + .option( + "--invoke-timeout ", + "Node invoke timeout in ms (default 45000)", + "45000", + ) + .action(async (opts: NodesRpcOpts & { audio?: boolean }) => { + try { + const nodeId = await resolveNodeId(opts, String(opts.node ?? "")); + const facing = parseFacing(String(opts.facing ?? "front")); + const durationMs = Number.parseInt( + String(opts.duration ?? "3000"), + 10, + ); + const includeAudio = opts.audio !== false; + const timeoutMs = opts.invokeTimeout + ? Number.parseInt(String(opts.invokeTimeout), 10) + : undefined; + + const invokeParams: Record = { + nodeId, + command: "camera.clip", + params: { + facing, + durationMs: Number.isFinite(durationMs) ? durationMs : undefined, + includeAudio, + format: "mp4", + }, + idempotencyKey: randomIdempotencyKey(), + }; + if (typeof timeoutMs === "number" && Number.isFinite(timeoutMs)) { + invokeParams.timeoutMs = timeoutMs; + } + + const raw = (await callGatewayCli( + "node.invoke", + opts, + invokeParams, + )) as unknown; + const res = + typeof raw === "object" && raw !== null + ? 
(raw as { payload?: unknown }) + : {}; + const payload = parseCameraClipPayload(res.payload); + const filePath = cameraTempPath({ + kind: "clip", + facing, + ext: payload.format, + }); + await writeBase64ToFile(filePath, payload.base64); + + if (opts.json) { + defaultRuntime.log( + JSON.stringify( + { + file: { + facing, + path: filePath, + durationMs: payload.durationMs, + hasAudio: payload.hasAudio, + }, + }, + null, + 2, + ), + ); + return; + } + defaultRuntime.log(`MEDIA:${filePath}`); + } catch (err) { + defaultRuntime.error(`nodes camera clip failed: ${String(err)}`); + defaultRuntime.exit(1); + } + }), + { timeoutMs: 90_000 }, + ); } diff --git a/src/cli/program.test.ts b/src/cli/program.test.ts index 84631a313..e3b13c741 100644 --- a/src/cli/program.test.ts +++ b/src/cli/program.test.ts @@ -1,3 +1,4 @@ +import * as fs from "node:fs/promises"; import { beforeEach, describe, expect, it, vi } from "vitest"; const sendCommand = vi.fn(); @@ -148,4 +149,145 @@ describe("cli program", () => { ); expect(runtime.log).toHaveBeenCalled(); }); + + it("runs nodes camera snap and prints two MEDIA paths", async () => { + callGateway + .mockResolvedValueOnce({ + ts: Date.now(), + nodes: [ + { + nodeId: "ios-node", + displayName: "iOS Node", + remoteIp: "192.168.0.88", + connected: true, + }, + ], + }) + .mockResolvedValueOnce({ + ok: true, + nodeId: "ios-node", + command: "camera.snap", + payload: { format: "jpg", base64: "aGk=", width: 1, height: 1 }, + }) + .mockResolvedValueOnce({ + ok: true, + nodeId: "ios-node", + command: "camera.snap", + payload: { format: "jpg", base64: "aGk=", width: 1, height: 1 }, + }); + + const program = buildProgram(); + runtime.log.mockClear(); + await program.parseAsync( + ["nodes", "camera", "snap", "--node", "ios-node"], + { + from: "user", + }, + ); + + expect(callGateway).toHaveBeenNthCalledWith( + 2, + expect.objectContaining({ + method: "node.invoke", + params: expect.objectContaining({ + nodeId: "ios-node", + command: "camera.snap", + timeoutMs: 20000, + idempotencyKey: "idem-test", + params: expect.objectContaining({ facing: "front", format: "jpg" }), + }), + }), + ); + expect(callGateway).toHaveBeenNthCalledWith( + 3, + expect.objectContaining({ + method: "node.invoke", + params: expect.objectContaining({ + nodeId: "ios-node", + command: "camera.snap", + timeoutMs: 20000, + idempotencyKey: "idem-test", + params: expect.objectContaining({ facing: "back", format: "jpg" }), + }), + }), + ); + + const out = String(runtime.log.mock.calls[0]?.[0] ?? 
""); + const mediaPaths = out + .split("\n") + .filter((l) => l.startsWith("MEDIA:")) + .map((l) => l.replace(/^MEDIA:/, "")) + .filter(Boolean); + expect(mediaPaths).toHaveLength(2); + + try { + for (const p of mediaPaths) { + await expect(fs.readFile(p, "utf8")).resolves.toBe("hi"); + } + } finally { + await Promise.all(mediaPaths.map((p) => fs.unlink(p).catch(() => {}))); + } + }); + + it("runs nodes camera clip and prints one MEDIA path", async () => { + callGateway + .mockResolvedValueOnce({ + ts: Date.now(), + nodes: [ + { + nodeId: "ios-node", + displayName: "iOS Node", + remoteIp: "192.168.0.88", + connected: true, + }, + ], + }) + .mockResolvedValueOnce({ + ok: true, + nodeId: "ios-node", + command: "camera.clip", + payload: { + format: "mp4", + base64: "aGk=", + durationMs: 3000, + hasAudio: true, + }, + }); + + const program = buildProgram(); + runtime.log.mockClear(); + await program.parseAsync( + ["nodes", "camera", "clip", "--node", "ios-node", "--duration", "3000"], + { from: "user" }, + ); + + expect(callGateway).toHaveBeenNthCalledWith( + 2, + expect.objectContaining({ + method: "node.invoke", + params: expect.objectContaining({ + nodeId: "ios-node", + command: "camera.clip", + timeoutMs: 45000, + idempotencyKey: "idem-test", + params: expect.objectContaining({ + facing: "front", + durationMs: 3000, + includeAudio: true, + format: "mp4", + }), + }), + }), + ); + + const out = String(runtime.log.mock.calls[0]?.[0] ?? ""); + const mediaPath = out.replace(/^MEDIA:/, "").trim(); + expect(mediaPath).toMatch(/clawdis-camera-clip-front-.*\.mp4$/); + + try { + await expect(fs.readFile(mediaPath, "utf8")).resolves.toBe("hi"); + } finally { + await fs.unlink(mediaPath).catch(() => {}); + } + }); });