feat(camera): add snap/clip capture

This commit is contained in:
Peter Steinberger
2025-12-14 00:48:58 +00:00
parent 2454e67e09
commit a92eb1f33d
19 changed files with 1669 additions and 2 deletions

View File

@@ -0,0 +1,319 @@
import AVFoundation
import ClawdisKit
import Foundation
import UIKit
/// Captures still photos ("snap") and short movie clips ("clip") from the built-in
/// camera on behalf of the bridge. Implemented as an actor so concurrent bridge
/// requests cannot run two AVCaptureSessions at the same time.
actor CameraController {
    /// Failure modes surfaced to the bridge; `errorDescription` carries the
    /// user-facing message.
    enum CameraError: LocalizedError, Sendable {
        case cameraUnavailable
        case microphoneUnavailable
        case permissionDenied(kind: String)
        case invalidParams(String)
        case captureFailed(String)
        case exportFailed(String)

        var errorDescription: String? {
            switch self {
            case .cameraUnavailable:
                "Camera unavailable"
            case .microphoneUnavailable:
                "Microphone unavailable"
            case let .permissionDenied(kind):
                "\(kind) permission denied"
            case let .invalidParams(msg):
                msg
            case let .captureFailed(msg):
                msg
            case let .exportFailed(msg):
                msg
            }
        }
    }

    /// Takes a single photo and returns it as base64-encoded JPEG.
    ///
    /// - Parameter params: Optional facing (defaults to `.front`), max width
    ///   (non-positive values are ignored) and JPEG quality (clamped by
    ///   `clampQuality`).
    /// - Returns: Format tag ("jpg"), base64 payload, and pixel dimensions as
    ///   reported by the re-encoded image's size.
    /// - Throws: `CameraError` on permission, configuration, or encoding failure.
    func snap(params: ClawdisCameraSnapParams) async throws -> (
        format: String,
        base64: String,
        width: Int,
        height: Int)
    {
        let facing = params.facing ?? .front
        // Normalize: zero/negative widths mean "no resize".
        let maxWidth = params.maxWidth.flatMap { $0 > 0 ? $0 : nil }
        let quality = Self.clampQuality(params.quality)
        try await self.ensureAccess(for: .video)

        // Throwaway session for this one capture; torn down via the defer below.
        let session = AVCaptureSession()
        session.sessionPreset = .photo
        guard let device = Self.pickCamera(facing: facing) else {
            throw CameraError.cameraUnavailable
        }
        let input = try AVCaptureDeviceInput(device: device)
        guard session.canAddInput(input) else {
            throw CameraError.captureFailed("Failed to add camera input")
        }
        session.addInput(input)
        let output = AVCapturePhotoOutput()
        guard session.canAddOutput(output) else {
            throw CameraError.captureFailed("Failed to add photo output")
        }
        session.addOutput(output)
        output.maxPhotoQualityPrioritization = .quality
        session.startRunning()
        defer { session.stopRunning() }

        // Prefer the JPEG codec when the device offers it; otherwise capture in the
        // default format and rely on the re-encode below.
        let settings: AVCapturePhotoSettings = {
            if output.availablePhotoCodecTypes.contains(.jpeg) {
                return AVCapturePhotoSettings(format: [AVVideoCodecKey: AVVideoCodecType.jpeg])
            }
            return AVCapturePhotoSettings()
        }()
        settings.photoQualityPrioritization = .quality
        // NOTE(review): the delegate is not stored locally; this relies on
        // AVCapturePhotoOutput retaining it for the duration of the capture —
        // confirm against the AVFoundation documentation.
        let rawData: Data = try await withCheckedThrowingContinuation { cont in
            output.capturePhoto(with: settings, delegate: PhotoCaptureDelegate(cont))
        }
        // Re-encode unconditionally so maxWidth/quality are honored regardless of
        // which codec the capture produced.
        let (finalData, size) = try Self.reencodeJPEG(
            imageData: rawData,
            maxWidth: maxWidth,
            quality: quality)
        return (
            format: "jpg",
            base64: finalData.base64EncodedString(),
            width: Int(size.width.rounded()),
            height: Int(size.height.rounded()))
    }

    /// Records a short clip, transcodes it to MP4, and returns it as base64.
    ///
    /// - Parameter params: Optional facing (defaults to `.front`), duration in ms
    ///   (clamped by `clampDurationMs`), and whether to record audio
    ///   (defaults to `true`).
    /// - Returns: Format tag ("mp4"), base64 payload, the clamped duration, and
    ///   whether audio was requested.
    /// - Throws: `CameraError` on permission, configuration, recording, or
    ///   export failure.
    func clip(params: ClawdisCameraClipParams) async throws -> (
        format: String,
        base64: String,
        durationMs: Int,
        hasAudio: Bool)
    {
        let facing = params.facing ?? .front
        let durationMs = Self.clampDurationMs(params.durationMs)
        let includeAudio = params.includeAudio ?? true
        try await self.ensureAccess(for: .video)
        if includeAudio {
            try await self.ensureAccess(for: .audio)
        }

        let session = AVCaptureSession()
        session.sessionPreset = .high
        guard let camera = Self.pickCamera(facing: facing) else {
            throw CameraError.cameraUnavailable
        }
        let cameraInput = try AVCaptureDeviceInput(device: camera)
        guard session.canAddInput(cameraInput) else {
            throw CameraError.captureFailed("Failed to add camera input")
        }
        session.addInput(cameraInput)
        if includeAudio {
            guard let mic = AVCaptureDevice.default(for: .audio) else {
                throw CameraError.microphoneUnavailable
            }
            let micInput = try AVCaptureDeviceInput(device: mic)
            if session.canAddInput(micInput) {
                session.addInput(micInput)
            } else {
                throw CameraError.captureFailed("Failed to add microphone input")
            }
        }
        let output = AVCaptureMovieFileOutput()
        guard session.canAddOutput(output) else {
            throw CameraError.captureFailed("Failed to add movie output")
        }
        session.addOutput(output)
        // The recording is stopped solely by this duration cap; nothing calls
        // stopRecording() explicitly.
        output.maxRecordedDuration = CMTime(value: Int64(durationMs), timescale: 1000)
        session.startRunning()
        defer { session.stopRunning() }

        // Both temp files are cleaned up on every exit path; the MP4 data is read
        // into memory before the defer runs.
        let movURL = FileManager.default.temporaryDirectory
            .appendingPathComponent("clawdis-camera-\(UUID().uuidString).mov")
        let mp4URL = FileManager.default.temporaryDirectory
            .appendingPathComponent("clawdis-camera-\(UUID().uuidString).mp4")
        defer {
            try? FileManager.default.removeItem(at: movURL)
            try? FileManager.default.removeItem(at: mp4URL)
        }
        // NOTE(review): when the duration cap fires, AVFoundation delivers
        // AVError.maximumDurationReached to the recording delegate even though the
        // file is valid — verify MovieFileDelegate maps that case to success,
        // otherwise every clip would fail here.
        let recordedURL: URL = try await withCheckedThrowingContinuation { cont in
            let delegate = MovieFileDelegate(cont)
            output.startRecording(to: movURL, recordingDelegate: delegate)
        }
        // Transcode .mov -> .mp4 for easier downstream handling.
        try await Self.exportToMP4(inputURL: recordedURL, outputURL: mp4URL)
        let data = try Data(contentsOf: mp4URL)
        return (format: "mp4", base64: data.base64EncodedString(), durationMs: durationMs, hasAudio: includeAudio)
    }

    /// Ensures camera or microphone permission, prompting the user when the
    /// status is not yet determined. Throws `permissionDenied` otherwise.
    private func ensureAccess(for mediaType: AVMediaType) async throws {
        let status = AVCaptureDevice.authorizationStatus(for: mediaType)
        switch status {
        case .authorized:
            return
        case .notDetermined:
            let ok = await withCheckedContinuation(isolation: nil) { cont in
                AVCaptureDevice.requestAccess(for: mediaType) { granted in
                    cont.resume(returning: granted)
                }
            }
            if !ok {
                throw CameraError.permissionDenied(kind: mediaType == .video ? "Camera" : "Microphone")
            }
        case .denied, .restricted:
            throw CameraError.permissionDenied(kind: mediaType == .video ? "Camera" : "Microphone")
        @unknown default:
            throw CameraError.permissionDenied(kind: mediaType == .video ? "Camera" : "Microphone")
        }
    }

    /// Returns the built-in wide-angle camera at the requested position, or nil
    /// when no such device exists.
    private nonisolated static func pickCamera(facing: ClawdisCameraFacing) -> AVCaptureDevice? {
        let position: AVCaptureDevice.Position = (facing == .front) ? .front : .back
        return AVCaptureDevice.default(.builtInWideAngleCamera, for: .video, position: position)
    }

    /// Clamps JPEG quality to [0.05, 1.0]; nil defaults to 0.9.
    private nonisolated static func clampQuality(_ quality: Double?) -> Double {
        let q = quality ?? 0.9
        return min(1.0, max(0.05, q))
    }

    /// Clamps clip duration to [250, 15000] ms; nil defaults to 3000.
    private nonisolated static func clampDurationMs(_ ms: Int?) -> Int {
        let v = ms ?? 3000
        // Keep clips short by default; avoid huge base64 payloads on the bridge.
        return min(15000, max(250, v))
    }

    /// Decodes captured image data, optionally downscales to `maxWidth`, and
    /// re-encodes as JPEG at the given quality.
    ///
    /// - Returns: The JPEG bytes and the final image's `size`.
    /// - Throws: `captureFailed` when decode or encode fails.
    private nonisolated static func reencodeJPEG(
        imageData: Data,
        maxWidth: Int?,
        quality: Double) throws -> (data: Data, size: CGSize)
    {
        guard let image = UIImage(data: imageData) else {
            throw CameraError.captureFailed("Failed to decode captured image")
        }
        let finalImage: UIImage = if let maxWidth, maxWidth > 0 {
            Self.downscale(image: image, maxWidth: CGFloat(maxWidth))
        } else {
            image
        }
        guard let out = finalImage.jpegData(compressionQuality: quality) else {
            throw CameraError.captureFailed("Failed to encode JPEG")
        }
        return (out, finalImage.size)
    }

    /// Proportionally downscales `image` so its width is at most `maxWidth`.
    /// Returns the original image unchanged when it is already small enough or
    /// has degenerate dimensions.
    private nonisolated static func downscale(image: UIImage, maxWidth: CGFloat) -> UIImage {
        let w = image.size.width
        let h = image.size.height
        guard w > 0, h > 0 else { return image }
        guard w > maxWidth else { return image }
        let scale = maxWidth / w
        let target = CGSize(width: maxWidth, height: max(1, h * scale))
        let format = UIGraphicsImageRendererFormat.default()
        format.opaque = false
        // NOTE(review): format.scale is left at the device default, so on 2x/3x
        // screens the rendered bitmap may have more pixels than `target` suggests
        // (and `size` is in points) — consider setting format.scale = 1; verify.
        let renderer = UIGraphicsImageRenderer(size: target, format: format)
        return renderer.image { _ in
            image.draw(in: CGRect(origin: .zero, size: target))
        }
    }

    /// Transcodes the recorded QuickTime movie to MP4 using
    /// AVAssetExportPresetHighestQuality, optimized for network use.
    /// - Throws: `exportFailed` (or the exporter's own error) on any
    ///   non-completed outcome.
    private nonisolated static func exportToMP4(inputURL: URL, outputURL: URL) async throws {
        let asset = AVAsset(url: inputURL)
        guard let exporter = AVAssetExportSession(asset: asset, presetName: AVAssetExportPresetHighestQuality) else {
            throw CameraError.exportFailed("Failed to create export session")
        }
        exporter.outputURL = outputURL
        exporter.outputFileType = .mp4
        exporter.shouldOptimizeForNetworkUse = true
        try await withCheckedThrowingContinuation(isolation: nil) { cont in
            exporter.exportAsynchronously {
                switch exporter.status {
                case .completed:
                    cont.resume(returning: ())
                case .failed:
                    cont.resume(throwing: exporter.error ?? CameraError.exportFailed("Export failed"))
                case .cancelled:
                    cont.resume(throwing: CameraError.exportFailed("Export cancelled"))
                default:
                    cont.resume(throwing: CameraError.exportFailed("Export did not complete"))
                }
            }
        }
    }
}
/// Bridges the photo-capture callback into a checked continuation, guaranteeing
/// the continuation is resumed at most once.
private final class PhotoCaptureDelegate: NSObject, AVCapturePhotoCaptureDelegate {
    /// Pending continuation; cleared the moment it is resumed so a repeated
    /// callback can never resume it twice.
    private var pending: CheckedContinuation<Data, Error>?

    init(_ continuation: CheckedContinuation<Data, Error>) {
        self.pending = continuation
    }

    func photoOutput(
        _ output: AVCapturePhotoOutput,
        didFinishProcessingPhoto photo: AVCapturePhoto,
        error: Error?)
    {
        guard let pending else { return }
        self.pending = nil

        if let error {
            pending.resume(throwing: error)
        } else if let data = photo.fileDataRepresentation() {
            pending.resume(returning: data)
        } else {
            // No error, but also no bytes — surface a descriptive failure.
            pending.resume(
                throwing: NSError(domain: "Camera", code: 1, userInfo: [
                    NSLocalizedDescriptionKey: "photo data missing",
                ]))
        }
    }
}
/// Bridges AVCaptureFileOutput's finish-recording callback into a checked
/// continuation, guaranteeing it resumes exactly once.
private final class MovieFileDelegate: NSObject, AVCaptureFileOutputRecordingDelegate {
    private let continuation: CheckedContinuation<URL, Error>
    private var didResume = false

    init(_ continuation: CheckedContinuation<URL, Error>) {
        self.continuation = continuation
    }

    func fileOutput(
        _ output: AVCaptureFileOutput,
        didFinishRecordingTo outputFileURL: URL,
        from connections: [AVCaptureConnection],
        error: Error?)
    {
        guard !self.didResume else { return }
        self.didResume = true
        if let error {
            // A recording stopped by `maxRecordedDuration` is reported as
            // AVError.maximumDurationReached even though the movie file is
            // complete and valid. clip() always runs into the duration cap, so
            // without this check every clip would fail — treat it as success,
            // matching the macOS CameraCaptureService delegate.
            let ns = error as NSError
            if ns.domain == AVFoundationErrorDomain,
               ns.code == AVError.maximumDurationReached.rawValue
            {
                self.continuation.resume(returning: outputFileURL)
                return
            }
            self.continuation.resume(throwing: error)
            return
        }
        self.continuation.resume(returning: outputFileURL)
    }
}

View File

@@ -26,6 +26,8 @@
</array> </array>
<key>NSLocalNetworkUsageDescription</key> <key>NSLocalNetworkUsageDescription</key>
<string>Clawdis discovers and connects to your Clawdis bridge on the local network.</string> <string>Clawdis discovers and connects to your Clawdis bridge on the local network.</string>
<key>NSCameraUsageDescription</key>
<string>Clawdis can capture photos or short video clips when requested via the bridge.</string>
<key>NSMicrophoneUsageDescription</key> <key>NSMicrophoneUsageDescription</key>
<string>Clawdis needs microphone access for voice wake.</string> <string>Clawdis needs microphone access for voice wake.</string>
<key>NSSpeechRecognitionUsageDescription</key> <key>NSSpeechRecognitionUsageDescription</key>

View File

@@ -6,6 +6,7 @@ import SwiftUI
final class NodeAppModel: ObservableObject { final class NodeAppModel: ObservableObject {
@Published var isBackgrounded: Bool = false @Published var isBackgrounded: Bool = false
let screen = ScreenController() let screen = ScreenController()
let camera = CameraController()
@Published var bridgeStatusText: String = "Not connected" @Published var bridgeStatusText: String = "Not connected"
@Published var bridgeServerName: String? @Published var bridgeServerName: String?
@Published var bridgeRemoteAddress: String? @Published var bridgeRemoteAddress: String?
@@ -182,13 +183,22 @@ final class NodeAppModel: ObservableObject {
} }
private func handleInvoke(_ req: BridgeInvokeRequest) async -> BridgeInvokeResponse { private func handleInvoke(_ req: BridgeInvokeRequest) async -> BridgeInvokeResponse {
if req.command.hasPrefix("screen."), self.isBackgrounded { if req.command.hasPrefix("screen.") || req.command.hasPrefix("camera."), self.isBackgrounded {
return BridgeInvokeResponse( return BridgeInvokeResponse(
id: req.id, id: req.id,
ok: false, ok: false,
error: ClawdisNodeError( error: ClawdisNodeError(
code: .backgroundUnavailable, code: .backgroundUnavailable,
message: "NODE_BACKGROUND_UNAVAILABLE: screen commands require foreground")) message: "NODE_BACKGROUND_UNAVAILABLE: screen/camera commands require foreground"))
}
if req.command.hasPrefix("camera."), !self.isCameraEnabled() {
return BridgeInvokeResponse(
id: req.id,
ok: false,
error: ClawdisNodeError(
code: .unavailable,
message: "CAMERA_DISABLED: enable Camera in iOS Settings → Camera → Allow Camera"))
} }
do { do {
@@ -222,6 +232,46 @@ final class NodeAppModel: ObservableObject {
let payload = try Self.encodePayload(["format": "png", "base64": base64]) let payload = try Self.encodePayload(["format": "png", "base64": base64])
return BridgeInvokeResponse(id: req.id, ok: true, payloadJSON: payload) return BridgeInvokeResponse(id: req.id, ok: true, payloadJSON: payload)
case ClawdisCameraCommand.snap.rawValue:
let params = (try? Self.decodeParams(ClawdisCameraSnapParams.self, from: req.paramsJSON)) ??
ClawdisCameraSnapParams()
let res = try await self.camera.snap(params: params)
struct Payload: Codable {
var format: String
var base64: String
var width: Int
var height: Int
}
let payload = try Self.encodePayload(Payload(
format: res.format,
base64: res.base64,
width: res.width,
height: res.height))
return BridgeInvokeResponse(id: req.id, ok: true, payloadJSON: payload)
case ClawdisCameraCommand.clip.rawValue:
let params = (try? Self.decodeParams(ClawdisCameraClipParams.self, from: req.paramsJSON)) ??
ClawdisCameraClipParams()
let suspended = (params.includeAudio ?? true) ? self.voiceWake.suspendForExternalAudioCapture() : false
defer { self.voiceWake.resumeAfterExternalAudioCapture(wasSuspended: suspended) }
let res = try await self.camera.clip(params: params)
struct Payload: Codable {
var format: String
var base64: String
var durationMs: Int
var hasAudio: Bool
}
let payload = try Self.encodePayload(Payload(
format: res.format,
base64: res.base64,
durationMs: res.durationMs,
hasAudio: res.hasAudio))
return BridgeInvokeResponse(id: req.id, ok: true, payloadJSON: payload)
default: default:
return BridgeInvokeResponse( return BridgeInvokeResponse(
id: req.id, id: req.id,
@@ -254,4 +304,10 @@ final class NodeAppModel: ObservableObject {
} }
return json return json
} }
private func isCameraEnabled() -> Bool {
// Default-on: if the key doesn't exist yet, treat it as enabled.
if UserDefaults.standard.object(forKey: "camera.enabled") == nil { return true }
return UserDefaults.standard.bool(forKey: "camera.enabled")
}
} }

View File

@@ -205,6 +205,37 @@ final class VoiceWakeManager: NSObject, ObservableObject {
try? AVAudioSession.sharedInstance().setActive(false, options: .notifyOthersOnDeactivation) try? AVAudioSession.sharedInstance().setActive(false, options: .notifyOthersOnDeactivation)
} }
/// Temporarily releases the microphone so other subsystems (e.g. camera video capture) can record audio.
/// Returns `true` when listening was active and was suspended.
func suspendForExternalAudioCapture() -> Bool {
guard self.isEnabled, self.isListening else { return false }
self.isListening = false
self.statusText = "Paused"
self.tapDrainTask?.cancel()
self.tapDrainTask = nil
self.tapQueue?.clear()
self.tapQueue = nil
self.recognitionTask?.cancel()
self.recognitionTask = nil
self.recognitionRequest = nil
if self.audioEngine.isRunning {
self.audioEngine.stop()
self.audioEngine.inputNode.removeTap(onBus: 0)
}
try? AVAudioSession.sharedInstance().setActive(false, options: .notifyOthersOnDeactivation)
return true
}
func resumeAfterExternalAudioCapture(wasSuspended: Bool) {
guard wasSuspended else { return }
Task { await self.start() }
}
private func startRecognition() throws { private func startRecognition() throws {
self.recognitionTask?.cancel() self.recognitionTask?.cancel()
self.recognitionTask = nil self.recognitionTask = nil

View File

@@ -54,5 +54,6 @@ targets:
NSLocalNetworkUsageDescription: Clawdis discovers and connects to your Clawdis bridge on the local network. NSLocalNetworkUsageDescription: Clawdis discovers and connects to your Clawdis bridge on the local network.
NSBonjourServices: NSBonjourServices:
- _clawdis-bridge._tcp - _clawdis-bridge._tcp
NSCameraUsageDescription: Clawdis can capture photos or short video clips when requested via the bridge.
NSMicrophoneUsageDescription: Clawdis needs microphone access for voice wake. NSMicrophoneUsageDescription: Clawdis needs microphone access for voice wake.
NSSpeechRecognitionUsageDescription: Clawdis uses on-device speech recognition for voice wake. NSSpeechRecognitionUsageDescription: Clawdis uses on-device speech recognition for voice wake.

View File

@@ -0,0 +1,341 @@
import AVFoundation
import ClawdisIPC
import CoreGraphics
import Foundation
import ImageIO
import OSLog
import UniformTypeIdentifiers
/// Captures still photos ("snap") and short movie clips ("clip") from the
/// built-in camera for the control channel. An actor so concurrent requests
/// never run two AVCaptureSessions at once.
actor CameraCaptureService {
    /// Failure modes surfaced to callers; `errorDescription` carries the
    /// user-facing message.
    enum CameraError: LocalizedError, Sendable {
        case cameraUnavailable
        case microphoneUnavailable
        case permissionDenied(kind: String)
        case captureFailed(String)
        case exportFailed(String)

        var errorDescription: String? {
            switch self {
            case .cameraUnavailable:
                "Camera unavailable"
            case .microphoneUnavailable:
                "Microphone unavailable"
            case let .permissionDenied(kind):
                "\(kind) permission denied"
            case let .captureFailed(msg):
                msg
            case let .exportFailed(msg):
                msg
            }
        }
    }

    private let logger = Logger(subsystem: "com.steipete.clawdis", category: "camera")

    /// Takes a single JPEG photo.
    ///
    /// - Parameters:
    ///   - facing: Camera side; nil defaults to `.front`.
    ///   - maxWidth: Optional maximum pixel width; non-positive values are ignored.
    ///   - quality: JPEG quality; clamped by `clampQuality` (nil -> 0.9).
    /// - Returns: The JPEG bytes and the final pixel dimensions.
    /// - Throws: `CameraError` on permission, configuration, or encoding failure.
    func snap(facing: CameraFacing?, maxWidth: Int?, quality: Double?) async throws -> (data: Data, size: CGSize) {
        let facing = facing ?? .front
        // Normalize: zero/negative widths mean "no resize".
        let maxWidth = maxWidth.flatMap { $0 > 0 ? $0 : nil }
        let quality = Self.clampQuality(quality)
        try await self.ensureAccess(for: .video)

        // Throwaway session for this single capture; torn down via defer.
        let session = AVCaptureSession()
        session.sessionPreset = .photo
        guard let device = Self.pickCamera(facing: facing) else {
            throw CameraError.cameraUnavailable
        }
        let input = try AVCaptureDeviceInput(device: device)
        guard session.canAddInput(input) else {
            throw CameraError.captureFailed("Failed to add camera input")
        }
        session.addInput(input)
        let output = AVCapturePhotoOutput()
        guard session.canAddOutput(output) else {
            throw CameraError.captureFailed("Failed to add photo output")
        }
        session.addOutput(output)
        output.maxPhotoQualityPrioritization = .quality
        session.startRunning()
        defer { session.stopRunning() }

        // Prefer the JPEG codec when available; otherwise capture in the default
        // format — the re-encode below normalizes the result either way.
        let settings: AVCapturePhotoSettings = {
            if output.availablePhotoCodecTypes.contains(.jpeg) {
                return AVCapturePhotoSettings(format: [AVVideoCodecKey: AVVideoCodecType.jpeg])
            }
            return AVCapturePhotoSettings()
        }()
        settings.photoQualityPrioritization = .quality
        // NOTE(review): the delegate is not stored locally; this relies on
        // AVCapturePhotoOutput retaining it until the capture completes — confirm.
        let rawData: Data = try await withCheckedThrowingContinuation(isolation: nil) { cont in
            output.capturePhoto(with: settings, delegate: PhotoCaptureDelegate(cont))
        }
        return try Self.reencodeJPEG(imageData: rawData, maxWidth: maxWidth, quality: quality)
    }

    /// Records a short clip and writes it as MP4 to `outPath` (or a temp file).
    ///
    /// - Parameters:
    ///   - facing: Camera side; nil defaults to `.front`.
    ///   - durationMs: Clip length in ms; clamped by `clampDurationMs`.
    ///   - includeAudio: When true, also requests and records microphone audio.
    ///   - outPath: Destination path; blank/nil means a temp-directory .mp4.
    /// - Returns: The written file's path, the clamped duration, and whether
    ///   audio was requested. The caller owns (and must delete) the output file.
    /// - Throws: `CameraError` on permission, configuration, recording, or
    ///   export failure.
    func clip(
        facing: CameraFacing?,
        durationMs: Int?,
        includeAudio: Bool,
        outPath: String?) async throws -> (path: String, durationMs: Int, hasAudio: Bool)
    {
        let facing = facing ?? .front
        let durationMs = Self.clampDurationMs(durationMs)
        try await self.ensureAccess(for: .video)
        if includeAudio {
            try await self.ensureAccess(for: .audio)
        }

        let session = AVCaptureSession()
        session.sessionPreset = .high
        guard let camera = Self.pickCamera(facing: facing) else {
            throw CameraError.cameraUnavailable
        }
        let cameraInput = try AVCaptureDeviceInput(device: camera)
        guard session.canAddInput(cameraInput) else {
            throw CameraError.captureFailed("Failed to add camera input")
        }
        session.addInput(cameraInput)
        if includeAudio {
            guard let mic = AVCaptureDevice.default(for: .audio) else {
                throw CameraError.microphoneUnavailable
            }
            let micInput = try AVCaptureDeviceInput(device: mic)
            guard session.canAddInput(micInput) else {
                throw CameraError.captureFailed("Failed to add microphone input")
            }
            session.addInput(micInput)
        }
        let output = AVCaptureMovieFileOutput()
        guard session.canAddOutput(output) else {
            throw CameraError.captureFailed("Failed to add movie output")
        }
        session.addOutput(output)
        // Recording is stopped solely by this duration cap; the delegate maps the
        // resulting maximumDurationReached "error" to success.
        output.maxRecordedDuration = CMTime(value: Int64(durationMs), timescale: 1000)
        session.startRunning()
        defer { session.stopRunning() }

        // Only the intermediate .mov is cleaned up here; the final MP4 belongs to
        // the caller.
        let tmpMovURL = FileManager.default.temporaryDirectory
            .appendingPathComponent("clawdis-camera-\(UUID().uuidString).mov")
        defer { try? FileManager.default.removeItem(at: tmpMovURL) }
        let outputURL: URL = {
            if let outPath, !outPath.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty {
                return URL(fileURLWithPath: outPath)
            }
            return FileManager.default.temporaryDirectory
                .appendingPathComponent("clawdis-camera-\(UUID().uuidString).mp4")
        }()
        // Ensure we don't fail exporting due to an existing file.
        try? FileManager.default.removeItem(at: outputURL)
        // Capture the logger into a local so the non-isolated delegate does not
        // touch actor state.
        let logger = self.logger
        let recordedURL: URL = try await withCheckedThrowingContinuation(isolation: nil) { cont in
            output.startRecording(to: tmpMovURL, recordingDelegate: MovieFileDelegate(cont, logger: logger))
        }
        try await Self.exportToMP4(inputURL: recordedURL, outputURL: outputURL)
        return (path: outputURL.path, durationMs: durationMs, hasAudio: includeAudio)
    }

    /// Ensures camera or microphone permission, prompting the user when the
    /// status is not yet determined. Throws `permissionDenied` otherwise.
    private func ensureAccess(for mediaType: AVMediaType) async throws {
        let status = AVCaptureDevice.authorizationStatus(for: mediaType)
        switch status {
        case .authorized:
            return
        case .notDetermined:
            let ok = await withCheckedContinuation(isolation: nil) { cont in
                AVCaptureDevice.requestAccess(for: mediaType) { granted in
                    cont.resume(returning: granted)
                }
            }
            if !ok {
                throw CameraError.permissionDenied(kind: mediaType == .video ? "Camera" : "Microphone")
            }
        case .denied, .restricted:
            throw CameraError.permissionDenied(kind: mediaType == .video ? "Camera" : "Microphone")
        @unknown default:
            throw CameraError.permissionDenied(kind: mediaType == .video ? "Camera" : "Microphone")
        }
    }

    /// Returns the built-in wide-angle camera at the requested position, falling
    /// back to the system default video device.
    private nonisolated static func pickCamera(facing: CameraFacing) -> AVCaptureDevice? {
        let position: AVCaptureDevice.Position = (facing == .front) ? .front : .back
        if let device = AVCaptureDevice.default(.builtInWideAngleCamera, for: .video, position: position) {
            return device
        }
        // Many macOS cameras report `unspecified` position; fall back to any default.
        return AVCaptureDevice.default(for: .video)
    }

    /// Clamps JPEG quality to [0.05, 1.0]; nil defaults to 0.9.
    private nonisolated static func clampQuality(_ quality: Double?) -> Double {
        let q = quality ?? 0.9
        return min(1.0, max(0.05, q))
    }

    /// Clamps clip duration to [250, 15_000] ms; nil defaults to 3000.
    private nonisolated static func clampDurationMs(_ ms: Int?) -> Int {
        let v = ms ?? 3000
        return min(15_000, max(250, v))
    }

    /// Decodes captured image data via ImageIO, optionally downscales to
    /// `maxWidth`, and re-encodes as JPEG at the given quality.
    ///
    /// - Returns: The JPEG bytes and the final pixel dimensions.
    /// - Throws: `captureFailed` on decode, scale, or encode failure.
    private nonisolated static func reencodeJPEG(
        imageData: Data,
        maxWidth: Int?,
        quality: Double) throws -> (data: Data, size: CGSize)
    {
        guard let src = CGImageSourceCreateWithData(imageData as CFData, nil),
              let img = CGImageSourceCreateImageAtIndex(src, 0, nil)
        else {
            throw CameraError.captureFailed("Failed to decode captured image")
        }
        let finalImage: CGImage
        if let maxWidth, img.width > maxWidth {
            guard let scaled = self.downscale(image: img, maxWidth: maxWidth) else {
                throw CameraError.captureFailed("Failed to downscale image")
            }
            finalImage = scaled
        } else {
            finalImage = img
        }
        let out = NSMutableData()
        guard let dest = CGImageDestinationCreateWithData(out, UTType.jpeg.identifier as CFString, 1, nil) else {
            throw CameraError.captureFailed("Failed to create JPEG destination")
        }
        let props = [kCGImageDestinationLossyCompressionQuality: quality] as CFDictionary
        CGImageDestinationAddImage(dest, finalImage, props)
        guard CGImageDestinationFinalize(dest) else {
            throw CameraError.captureFailed("Failed to encode JPEG")
        }
        return (out as Data, CGSize(width: finalImage.width, height: finalImage.height))
    }

    /// Proportionally downscales `image` so its width is at most `maxWidth`,
    /// drawing into an RGBA bitmap context with high interpolation. Returns the
    /// input unchanged when already small enough; nil when the context cannot be
    /// created.
    private nonisolated static func downscale(image: CGImage, maxWidth: Int) -> CGImage? {
        guard image.width > 0, image.height > 0 else { return image }
        guard image.width > maxWidth else { return image }
        let scale = Double(maxWidth) / Double(image.width)
        let targetW = maxWidth
        let targetH = max(1, Int((Double(image.height) * scale).rounded()))
        let cs = CGColorSpaceCreateDeviceRGB()
        let bitmapInfo = CGImageAlphaInfo.premultipliedLast.rawValue
        guard let ctx = CGContext(
            data: nil,
            width: targetW,
            height: targetH,
            bitsPerComponent: 8,
            bytesPerRow: 0,
            space: cs,
            bitmapInfo: bitmapInfo)
        else { return nil }
        ctx.interpolationQuality = .high
        ctx.draw(image, in: CGRect(x: 0, y: 0, width: targetW, height: targetH))
        return ctx.makeImage()
    }

    /// Transcodes the recorded QuickTime movie to MP4 using the medium-quality
    /// preset, optimized for network use.
    /// - Throws: `exportFailed` on any non-completed export status.
    private nonisolated static func exportToMP4(inputURL: URL, outputURL: URL) async throws {
        let asset = AVAsset(url: inputURL)
        guard let export = AVAssetExportSession(asset: asset, presetName: AVAssetExportPresetMediumQuality) else {
            throw CameraError.exportFailed("Failed to create export session")
        }
        export.outputURL = outputURL
        export.outputFileType = .mp4
        export.shouldOptimizeForNetworkUse = true
        // Wait for the callback, then inspect status outside the continuation so
        // every outcome is handled in one place.
        await withCheckedContinuation { cont in
            export.exportAsynchronously {
                cont.resume()
            }
        }
        switch export.status {
        case .completed:
            return
        case .failed:
            throw CameraError.exportFailed(export.error?.localizedDescription ?? "export failed")
        case .cancelled:
            throw CameraError.exportFailed("export cancelled")
        default:
            throw CameraError.exportFailed("export did not complete (\(export.status.rawValue))")
        }
    }
}
/// Bridges the photo-capture callback into a checked continuation, resuming it
/// exactly once.
private final class PhotoCaptureDelegate: NSObject, AVCapturePhotoCaptureDelegate {
    private let continuation: CheckedContinuation<Data, Error>
    // Guards against a double resume if the callback ever fires twice.
    private var didResume = false

    init(_ cont: CheckedContinuation<Data, Error>) {
        self.continuation = cont
    }

    func photoOutput(
        _ output: AVCapturePhotoOutput,
        didFinishProcessingPhoto photo: AVCapturePhoto,
        error: Error?)
    {
        guard !self.didResume else { return }
        self.didResume = true

        if let error {
            self.continuation.resume(throwing: error)
            return
        }
        guard let data = photo.fileDataRepresentation() else {
            // Completed without an error yet produced no bytes.
            self.continuation.resume(throwing: CameraCaptureService.CameraError.captureFailed("No photo data"))
            return
        }
        self.continuation.resume(returning: data)
    }
}
/// Bridges AVCaptureFileOutput's finish-recording callback into a checked
/// continuation, resuming it exactly once and logging real failures.
private final class MovieFileDelegate: NSObject, AVCaptureFileOutputRecordingDelegate {
    private let continuation: CheckedContinuation<URL, Error>
    private let logger: Logger
    // Guards against a double resume if the callback ever fires twice.
    private var didResume = false

    init(_ cont: CheckedContinuation<URL, Error>, logger: Logger) {
        self.continuation = cont
        self.logger = logger
    }

    func fileOutput(
        _ output: AVCaptureFileOutput,
        didFinishRecordingTo outputFileURL: URL,
        from connections: [AVCaptureConnection],
        error: Error?)
    {
        guard !self.didResume else { return }
        self.didResume = true

        guard let error else {
            self.continuation.resume(returning: outputFileURL)
            return
        }
        // Hitting `maxRecordedDuration` is reported as an error, but the movie
        // file is complete and valid — treat it as success.
        let nsError = error as NSError
        if nsError.domain == AVFoundationErrorDomain,
           nsError.code == AVError.maximumDurationReached.rawValue
        {
            self.continuation.resume(returning: outputFileURL)
            return
        }
        self.logger.error("camera record failed: \(error.localizedDescription, privacy: .public)")
        self.continuation.resume(throwing: error)
    }
}

View File

@@ -24,6 +24,7 @@ let webChatEnabledKey = "clawdis.webChatEnabled"
let webChatSwiftUIEnabledKey = "clawdis.webChatSwiftUIEnabled" let webChatSwiftUIEnabledKey = "clawdis.webChatSwiftUIEnabled"
let webChatPortKey = "clawdis.webChatPort" let webChatPortKey = "clawdis.webChatPort"
let canvasEnabledKey = "clawdis.canvasEnabled" let canvasEnabledKey = "clawdis.canvasEnabled"
let cameraEnabledKey = "clawdis.cameraEnabled"
let peekabooBridgeEnabledKey = "clawdis.peekabooBridgeEnabled" let peekabooBridgeEnabledKey = "clawdis.peekabooBridgeEnabled"
let deepLinkAgentEnabledKey = "clawdis.deepLinkAgentEnabled" let deepLinkAgentEnabledKey = "clawdis.deepLinkAgentEnabled"
let deepLinkKeyKey = "clawdis.deepLinkKey" let deepLinkKeyKey = "clawdis.deepLinkKey"

View File

@@ -3,6 +3,8 @@ import Foundation
import OSLog import OSLog
enum ControlRequestHandler { enum ControlRequestHandler {
private static let cameraCapture = CameraCaptureService()
static func process( static func process(
request: Request, request: Request,
notifier: NotificationManager = NotificationManager(), notifier: NotificationManager = NotificationManager(),
@@ -77,6 +79,16 @@ enum ControlRequestHandler {
command: command, command: command,
paramsJSON: paramsJSON, paramsJSON: paramsJSON,
logger: logger) logger: logger)
case let .cameraSnap(facing, maxWidth, quality, outPath):
return await self.handleCameraSnap(facing: facing, maxWidth: maxWidth, quality: quality, outPath: outPath)
case let .cameraClip(facing, durationMs, includeAudio, outPath):
return await self.handleCameraClip(
facing: facing,
durationMs: durationMs,
includeAudio: includeAudio,
outPath: outPath)
} }
} }
@@ -173,6 +185,10 @@ enum ControlRequestHandler {
UserDefaults.standard.object(forKey: canvasEnabledKey) as? Bool ?? true UserDefaults.standard.object(forKey: canvasEnabledKey) as? Bool ?? true
} }
private static func cameraEnabled() -> Bool {
UserDefaults.standard.object(forKey: cameraEnabledKey) as? Bool ?? false
}
private static func handleCanvasShow( private static func handleCanvasShow(
session: String, session: String,
path: String?, path: String?,
@@ -254,4 +270,46 @@ enum ControlRequestHandler {
return Response(ok: false, message: error.localizedDescription) return Response(ok: false, message: error.localizedDescription)
} }
} }
private static func handleCameraSnap(
facing: CameraFacing?,
maxWidth: Int?,
quality: Double?,
outPath: String?) async -> Response
{
guard self.cameraEnabled() else { return Response(ok: false, message: "Camera disabled by user") }
do {
let res = try await self.cameraCapture.snap(facing: facing, maxWidth: maxWidth, quality: quality)
let url: URL = if let outPath, !outPath.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty {
URL(fileURLWithPath: outPath)
} else {
FileManager.default.temporaryDirectory
.appendingPathComponent("clawdis-camera-snap-\(UUID().uuidString).jpg")
}
try res.data.write(to: url, options: [.atomic])
return Response(ok: true, message: url.path)
} catch {
return Response(ok: false, message: error.localizedDescription)
}
}
private static func handleCameraClip(
facing: CameraFacing?,
durationMs: Int?,
includeAudio: Bool,
outPath: String?) async -> Response
{
guard self.cameraEnabled() else { return Response(ok: false, message: "Camera disabled by user") }
do {
let res = try await self.cameraCapture.clip(
facing: facing,
durationMs: durationMs,
includeAudio: includeAudio,
outPath: outPath)
return Response(ok: true, message: res.path)
} catch {
return Response(ok: false, message: error.localizedDescription)
}
}
} }

View File

@@ -9,6 +9,7 @@ struct DebugSettings: View {
@AppStorage(modelCatalogReloadKey) private var modelCatalogReloadBump: Int = 0 @AppStorage(modelCatalogReloadKey) private var modelCatalogReloadBump: Int = 0
@AppStorage(iconOverrideKey) private var iconOverrideRaw: String = IconOverrideSelection.system.rawValue @AppStorage(iconOverrideKey) private var iconOverrideRaw: String = IconOverrideSelection.system.rawValue
@AppStorage(canvasEnabledKey) private var canvasEnabled: Bool = true @AppStorage(canvasEnabledKey) private var canvasEnabled: Bool = true
@AppStorage(cameraEnabledKey) private var cameraEnabled: Bool = false
@AppStorage(deepLinkAgentEnabledKey) private var deepLinkAgentEnabled: Bool = false @AppStorage(deepLinkAgentEnabledKey) private var deepLinkAgentEnabled: Bool = false
@State private var modelsCount: Int? @State private var modelsCount: Int?
@State private var modelsLoading = false @State private var modelsLoading = false
@@ -48,6 +49,7 @@ struct DebugSettings: View {
self.pathsSection self.pathsSection
self.quickActionsSection self.quickActionsSection
self.canvasSection self.canvasSection
self.cameraSection
self.experimentsSection self.experimentsSection
Spacer(minLength: 0) Spacer(minLength: 0)
@@ -571,6 +573,20 @@ struct DebugSettings: View {
} }
} }
private var cameraSection: some View {
GroupBox("Camera") {
VStack(alignment: .leading, spacing: 10) {
Toggle("Allow Camera (agent)", isOn: self.$cameraEnabled)
.toggleStyle(.checkbox)
.help("When off, camera requests return “Camera disabled by user”.")
Text("Allows Clawdis to capture a photo or short video via the built-in camera.")
.font(.caption)
.foregroundStyle(.secondary)
}
}
}
private var experimentsSection: some View { private var experimentsSection: some View {
GroupBox("Experiments") { GroupBox("Experiments") {
Grid(alignment: .leadingFirstTextBaseline, horizontalSpacing: 14, verticalSpacing: 10) { Grid(alignment: .leadingFirstTextBaseline, horizontalSpacing: 14, verticalSpacing: 10) {

View File

@@ -52,6 +52,7 @@ struct ClawdisCLI {
enum Kind { enum Kind {
case generic case generic
case mediaPath
} }
} }
@@ -91,6 +92,9 @@ struct ClawdisCLI {
case "canvas": case "canvas":
return try self.parseCanvas(args: &args) return try self.parseCanvas(args: &args)
case "camera":
return try self.parseCamera(args: &args)
default: default:
throw CLIError.help throw CLIError.help
} }
@@ -292,6 +296,62 @@ struct ClawdisCLI {
} }
} }
/// Parses `camera snap|clip` subcommand flags into a `ParsedCLIRequest`.
/// Unknown flags and malformed option values are silently ignored here;
/// validation and clamping happen on the capture side.
/// - Throws: `CLIError.help` when the subcommand is missing or unrecognized.
private static func parseCamera(args: inout [String]) throws -> ParsedCLIRequest {
    guard let subcommand = args.popFirst() else { throw CLIError.help }
    switch subcommand {
    case "snap":
        var selectedFacing: CameraFacing?
        var widthLimit: Int?
        var jpegQuality: Double?
        var destination: String?
        while let flag = args.first {
            args.removeFirst()
            switch flag {
            case "--facing":
                // Keep any previously parsed value when the argument is
                // missing or not a valid facing.
                if let raw = args.popFirst(), let parsed = CameraFacing(rawValue: raw) {
                    selectedFacing = parsed
                }
            case "--max-width":
                widthLimit = args.popFirst().flatMap(Int.init)
            case "--quality":
                jpegQuality = args.popFirst().flatMap(Double.init)
            case "--out":
                destination = args.popFirst()
            default:
                break
            }
        }
        return ParsedCLIRequest(
            request: .cameraSnap(
                facing: selectedFacing,
                maxWidth: widthLimit,
                quality: jpegQuality,
                outPath: destination),
            kind: .mediaPath)
    case "clip":
        var selectedFacing: CameraFacing?
        var clipDurationMs: Int?
        var captureAudio = true
        var destination: String?
        while let flag = args.first {
            args.removeFirst()
            switch flag {
            case "--facing":
                if let raw = args.popFirst(), let parsed = CameraFacing(rawValue: raw) {
                    selectedFacing = parsed
                }
            case "--duration-ms":
                clipDurationMs = args.popFirst().flatMap(Int.init)
            case "--no-audio":
                captureAudio = false
            case "--out":
                destination = args.popFirst()
            default:
                break
            }
        }
        return ParsedCLIRequest(
            request: .cameraClip(
                facing: selectedFacing,
                durationMs: clipDurationMs,
                includeAudio: captureAudio,
                outPath: destination),
            kind: .mediaPath)
    default:
        throw CLIError.help
    }
}
private static func parseCanvasPlacement( private static func parseCanvasPlacement(
args: inout [String], args: inout [String],
session: inout String, session: inout String,
@@ -334,6 +394,10 @@ struct ClawdisCLI {
if let message = response.message, !message.isEmpty { if let message = response.message, !message.isEmpty {
FileHandle.standardOutput.write(Data((message + "\n").utf8)) FileHandle.standardOutput.write(Data((message + "\n").utf8))
} }
case .mediaPath:
if let message = response.message, !message.isEmpty {
print("MEDIA:\(message)")
}
} }
} }
@@ -352,6 +416,8 @@ struct ClawdisCLI {
output["payload"] = text output["payload"] = text
} }
} }
case .mediaPath:
break
} }
let json = try JSONSerialization.data(withJSONObject: output, options: [.prettyPrinted]) let json = try JSONSerialization.data(withJSONObject: output, options: [.prettyPrinted])
@@ -406,6 +472,10 @@ struct ClawdisCLI {
clawdis-mac canvas eval --js <code> [--session <key>] clawdis-mac canvas eval --js <code> [--session <key>]
clawdis-mac canvas snapshot [--out <path>] [--session <key>] clawdis-mac canvas snapshot [--out <path>] [--session <key>]
Camera:
clawdis-mac camera snap [--facing <front|back>] [--max-width <px>] [--quality <0-1>] [--out <path>]
clawdis-mac camera clip [--facing <front|back>] [--duration-ms <ms>] [--no-audio] [--out <path>]
Browser (clawd): Browser (clawd):
clawdis-mac browser status|start|stop|tabs|open|focus|close|screenshot|eval|query|dom|snapshot clawdis-mac browser status|start|stop|tabs|open|focus|close|screenshot|eval|query|dom|snapshot
@@ -433,6 +503,7 @@ struct ClawdisCLI {
Output: Output:
Default output is text. Use --json for machine-readable output. Default output is text. Use --json for machine-readable output.
In text mode, `browser screenshot` prints MEDIA:<path>. In text mode, `browser screenshot` prints MEDIA:<path>.
In text mode, `camera snap` and `camera clip` print MEDIA:<path>.
""" """
print(usage) print(usage)
} }

View File

@@ -13,6 +13,11 @@ public enum Capability: String, Codable, CaseIterable, Sendable {
case speechRecognition case speechRecognition
} }
/// Which physical camera a capture request should use.
/// Raw values ("front"/"back") are part of the wire format — do not rename.
public enum CameraFacing: String, Codable, Sendable {
    case front, back
}
// MARK: - Requests // MARK: - Requests
/// Notification interruption level (maps to UNNotificationInterruptionLevel) /// Notification interruption level (maps to UNNotificationInterruptionLevel)
@@ -74,6 +79,8 @@ public enum Request: Sendable {
case canvasSnapshot(session: String, outPath: String?) case canvasSnapshot(session: String, outPath: String?)
case nodeList case nodeList
case nodeInvoke(nodeId: String, command: String, paramsJSON: String?) case nodeInvoke(nodeId: String, command: String, paramsJSON: String?)
case cameraSnap(facing: CameraFacing?, maxWidth: Int?, quality: Double?, outPath: String?)
case cameraClip(facing: CameraFacing?, durationMs: Int?, includeAudio: Bool, outPath: String?)
} }
// MARK: - Responses // MARK: - Responses
@@ -104,6 +111,11 @@ extension Request: Codable {
case path case path
case javaScript case javaScript
case outPath case outPath
case facing
case maxWidth
case quality
case durationMs
case includeAudio
case placement case placement
case nodeId case nodeId
case nodeCommand case nodeCommand
@@ -124,6 +136,8 @@ extension Request: Codable {
case canvasSnapshot case canvasSnapshot
case nodeList case nodeList
case nodeInvoke case nodeInvoke
case cameraSnap
case cameraClip
} }
public func encode(to encoder: Encoder) throws { public func encode(to encoder: Encoder) throws {
@@ -198,6 +212,20 @@ extension Request: Codable {
try container.encode(nodeId, forKey: .nodeId) try container.encode(nodeId, forKey: .nodeId)
try container.encode(command, forKey: .nodeCommand) try container.encode(command, forKey: .nodeCommand)
try container.encodeIfPresent(paramsJSON, forKey: .paramsJSON) try container.encodeIfPresent(paramsJSON, forKey: .paramsJSON)
case let .cameraSnap(facing, maxWidth, quality, outPath):
try container.encode(Kind.cameraSnap, forKey: .type)
try container.encodeIfPresent(facing, forKey: .facing)
try container.encodeIfPresent(maxWidth, forKey: .maxWidth)
try container.encodeIfPresent(quality, forKey: .quality)
try container.encodeIfPresent(outPath, forKey: .outPath)
case let .cameraClip(facing, durationMs, includeAudio, outPath):
try container.encode(Kind.cameraClip, forKey: .type)
try container.encodeIfPresent(facing, forKey: .facing)
try container.encodeIfPresent(durationMs, forKey: .durationMs)
try container.encode(includeAudio, forKey: .includeAudio)
try container.encodeIfPresent(outPath, forKey: .outPath)
} }
} }
@@ -274,6 +302,20 @@ extension Request: Codable {
let command = try container.decode(String.self, forKey: .nodeCommand) let command = try container.decode(String.self, forKey: .nodeCommand)
let paramsJSON = try container.decodeIfPresent(String.self, forKey: .paramsJSON) let paramsJSON = try container.decodeIfPresent(String.self, forKey: .paramsJSON)
self = .nodeInvoke(nodeId: nodeId, command: command, paramsJSON: paramsJSON) self = .nodeInvoke(nodeId: nodeId, command: command, paramsJSON: paramsJSON)
case .cameraSnap:
let facing = try container.decodeIfPresent(CameraFacing.self, forKey: .facing)
let maxWidth = try container.decodeIfPresent(Int.self, forKey: .maxWidth)
let quality = try container.decodeIfPresent(Double.self, forKey: .quality)
let outPath = try container.decodeIfPresent(String.self, forKey: .outPath)
self = .cameraSnap(facing: facing, maxWidth: maxWidth, quality: quality, outPath: outPath)
case .cameraClip:
let facing = try container.decodeIfPresent(CameraFacing.self, forKey: .facing)
let durationMs = try container.decodeIfPresent(Int.self, forKey: .durationMs)
let includeAudio = (try? container.decode(Bool.self, forKey: .includeAudio)) ?? true
let outPath = try container.decodeIfPresent(String.self, forKey: .outPath)
self = .cameraClip(facing: facing, durationMs: durationMs, includeAudio: includeAudio, outPath: outPath)
} }
} }
} }

View File

@@ -0,0 +1,62 @@
import ClawdisIPC
import Foundation
import Testing
@Suite struct CameraIPCTests {
    /// Encoding then decoding a `.cameraSnap` request preserves every field.
    @Test func cameraSnapCodableRoundtrip() throws {
        let original: Request = .cameraSnap(
            facing: .front,
            maxWidth: 640,
            quality: 0.85,
            outPath: "/tmp/test.jpg")
        let roundTripped = try JSONDecoder().decode(
            Request.self,
            from: JSONEncoder().encode(original))
        guard case let .cameraSnap(facing, maxWidth, quality, outPath) = roundTripped else {
            Issue.record("expected cameraSnap, got \(roundTripped)")
            return
        }
        #expect(facing == .front)
        #expect(maxWidth == 640)
        #expect(quality == 0.85)
        #expect(outPath == "/tmp/test.jpg")
    }

    /// Encoding then decoding a `.cameraClip` request preserves every field.
    @Test func cameraClipCodableRoundtrip() throws {
        let original: Request = .cameraClip(
            facing: .back,
            durationMs: 3000,
            includeAudio: false,
            outPath: "/tmp/test.mp4")
        let roundTripped = try JSONDecoder().decode(
            Request.self,
            from: JSONEncoder().encode(original))
        guard case let .cameraClip(facing, durationMs, includeAudio, outPath) = roundTripped else {
            Issue.record("expected cameraClip, got \(roundTripped)")
            return
        }
        #expect(facing == .back)
        #expect(durationMs == 3000)
        #expect(includeAudio == false)
        #expect(outPath == "/tmp/test.mp4")
    }

    /// A clip request with no `includeAudio` key decodes with audio enabled
    /// (backward compatibility with older senders).
    @Test func cameraClipDefaultsIncludeAudioToTrueWhenMissing() throws {
        let json = """
        {"type":"cameraClip","durationMs":1234}
        """
        let decoded = try JSONDecoder().decode(Request.self, from: Data(json.utf8))
        guard case let .cameraClip(_, durationMs, includeAudio, _) = decoded else {
            Issue.record("expected cameraClip, got \(decoded)")
            return
        }
        #expect(durationMs == 1234)
        #expect(includeAudio == true)
    }
}

View File

@@ -0,0 +1,58 @@
import Foundation
/// Camera commands a node can be asked to run via `node.invoke`.
/// Raw values are the on-the-wire command names — do not rename.
public enum ClawdisCameraCommand: String, Codable, Sendable {
    case snap = "camera.snap"
    case clip = "camera.clip"
}

/// Which physical camera to capture from.
public enum ClawdisCameraFacing: String, Codable, Sendable {
    case back, front
}

/// Still-image output formats accepted on the wire.
/// `jpg` and `jpeg` are distinct raw values for caller convenience.
public enum ClawdisCameraImageFormat: String, Codable, Sendable {
    case jpg, jpeg
}

/// Video container formats accepted on the wire.
public enum ClawdisCameraVideoFormat: String, Codable, Sendable {
    case mp4
}

/// Parameters for a `camera.snap` request.
/// Every field is optional; defaults are applied by the capture side.
public struct ClawdisCameraSnapParams: Codable, Sendable, Equatable {
    public var facing: ClawdisCameraFacing?
    public var maxWidth: Int?
    public var quality: Double?
    public var format: ClawdisCameraImageFormat?

    public init(
        facing: ClawdisCameraFacing? = nil,
        maxWidth: Int? = nil,
        quality: Double? = nil,
        format: ClawdisCameraImageFormat? = nil)
    {
        (self.facing, self.maxWidth) = (facing, maxWidth)
        (self.quality, self.format) = (quality, format)
    }
}

/// Parameters for a `camera.clip` request.
/// Every field is optional; defaults are applied by the capture side.
public struct ClawdisCameraClipParams: Codable, Sendable, Equatable {
    public var facing: ClawdisCameraFacing?
    public var durationMs: Int?
    public var includeAudio: Bool?
    public var format: ClawdisCameraVideoFormat?

    public init(
        facing: ClawdisCameraFacing? = nil,
        durationMs: Int? = nil,
        includeAudio: Bool? = nil,
        format: ClawdisCameraVideoFormat? = nil)
    {
        (self.facing, self.durationMs) = (facing, durationMs)
        (self.includeAudio, self.format) = (includeAudio, format)
    }
}

98
docs/camera.md Normal file
View File

@@ -0,0 +1,98 @@
---
summary: "Camera capture (iOS node + macOS app) for agent use: photos (jpg) and short video clips (mp4)"
read_when:
- Adding or modifying camera capture on iOS nodes or macOS
- Extending agent-accessible MEDIA temp-file workflows
---
# Camera capture (agent)
Clawdis supports **camera capture** for agent workflows:
- **iOS node** (paired via Gateway): capture a **photo** (`jpg`) or **short video clip** (`mp4`, with optional audio) via `node.invoke`.
- **macOS app** (local control socket): capture a **photo** (`jpg`) or **short video clip** (`mp4`, with optional audio) via `clawdis-mac`.
All camera access is gated behind **user-controlled settings**.
## iOS node
### User setting (default on)
- iOS Settings tab → **Camera****Allow Camera** (`camera.enabled`)
- Default: **on** (missing key is treated as enabled).
- When off: `camera.*` commands return `CAMERA_DISABLED`.
### Commands (via Gateway `node.invoke`)
- `camera.snap`
- Params:
- `facing`: `front|back` (default: `front`)
- `maxWidth`: number (optional)
- `quality`: `0..1` (optional; default `0.9`)
- `format`: currently `jpg`
- Response payload:
- `format: "jpg"`
- `base64: "<...>"`
- `width`, `height`
- `camera.clip`
- Params:
- `facing`: `front|back` (default: `front`)
- `durationMs`: number (default `3000`; values above the implementation-defined maximum are clamped down)
- `includeAudio`: boolean (default `true`)
- `format`: currently `mp4`
- Response payload:
- `format: "mp4"`
- `base64: "<...>"`
- `durationMs`
- `hasAudio`
### Foreground requirement
Like `screen.*`, the iOS node only allows `camera.*` commands in the **foreground**. Background invocations return `NODE_BACKGROUND_UNAVAILABLE`.
### CLI helper (temp files + MEDIA)
The easiest way to get attachments is via the CLI helper, which writes decoded media to a temp file and prints `MEDIA:<path>`.
Examples:
```bash
clawdis nodes camera snap --node <id> # default: both front + back (2 MEDIA lines)
clawdis nodes camera snap --node <id> --facing front
clawdis nodes camera clip --node <id> --duration 3000
clawdis nodes camera clip --node <id> --no-audio
```
Notes:
- `nodes camera snap` defaults to **both** facings to give the agent both views.
- Output files are temporary (in the OS temp directory) unless you build your own wrapper.
## macOS app
### User setting (default off)
The macOS companion app exposes a checkbox:
- **Settings → Debug → Camera → Allow Camera (agent)** (`clawdis.cameraEnabled`)
- Default: **off**
- When off: camera requests return “Camera disabled by user”.
### CLI helper (local control socket)
The `clawdis-mac` helper talks to the running menu bar app over the local control socket.
Examples:
```bash
clawdis-mac camera snap # prints MEDIA:<path>
clawdis-mac camera snap --max-width 1280
clawdis-mac camera clip --duration-ms 3000 # prints MEDIA:<path>
clawdis-mac camera clip --no-audio
```
## Safety + practical limits
- Camera and microphone access trigger the usual OS permission prompts (and require usage strings in Info.plist).
- Video clips are intentionally short to avoid oversized bridge payloads (base64 overhead + WebSocket message limits).

View File

@@ -98,6 +98,8 @@ cat > "$APP_ROOT/Contents/Info.plist" <<PLIST
<string>Clawdis needs notification permission to show alerts for agent actions.</string> <string>Clawdis needs notification permission to show alerts for agent actions.</string>
<key>NSScreenCaptureDescription</key> <key>NSScreenCaptureDescription</key>
<string>Clawdis captures the screen when the agent needs screenshots for context.</string> <string>Clawdis captures the screen when the agent needs screenshots for context.</string>
<key>NSCameraUsageDescription</key>
<string>Clawdis can capture photos or short video clips when requested by the agent.</string>
<key>NSMicrophoneUsageDescription</key> <key>NSMicrophoneUsageDescription</key>
<string>Clawdis needs the mic for Voice Wake tests and agent audio capture.</string> <string>Clawdis needs the mic for Voice Wake tests and agent audio capture.</string>
<key>NSSpeechRecognitionUsageDescription</key> <key>NSSpeechRecognitionUsageDescription</key>

View File

@@ -0,0 +1,64 @@
import * as fs from "node:fs/promises";
import * as os from "node:os";
import * as path from "node:path";
import { describe, expect, it } from "vitest";
import {
  cameraTempPath,
  parseCameraClipPayload,
  parseCameraSnapPayload,
  writeBase64ToFile,
} from "./nodes-camera.js";

// Unit tests for the pure helpers behind `nodes camera snap|clip`.
describe("nodes camera helpers", () => {
  // A fully-populated snap payload passes through unchanged.
  it("parses camera.snap payload", () => {
    expect(
      parseCameraSnapPayload({
        format: "jpg",
        base64: "aGk=",
        width: 10,
        height: 20,
      }),
    ).toEqual({ format: "jpg", base64: "aGk=", width: 10, height: 20 });
  });
  // Missing required fields (base64/width/height here) must throw.
  it("rejects invalid camera.snap payload", () => {
    expect(() => parseCameraSnapPayload({ format: "jpg" })).toThrow(
      /invalid camera\.snap payload/i,
    );
  });
  it("parses camera.clip payload", () => {
    expect(
      parseCameraClipPayload({
        format: "mp4",
        base64: "AAEC",
        durationMs: 1234,
        hasAudio: true,
      }),
    ).toEqual({
      format: "mp4",
      base64: "AAEC",
      durationMs: 1234,
      hasAudio: true,
    });
  });
  // Passing an explicit id (instead of the default random UUID) makes the
  // temp path deterministic.
  it("builds stable temp paths when id provided", () => {
    const p = cameraTempPath({
      kind: "snap",
      facing: "front",
      ext: "jpg",
      tmpDir: "/tmp",
      id: "id1",
    });
    expect(p).toBe(path.join("/tmp", "clawdis-camera-snap-front-id1.jpg"));
  });
  // "aGk=" is base64 for "hi"; the decoded bytes must land on disk.
  it("writes base64 to file", async () => {
    const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdis-test-"));
    const out = path.join(dir, "x.bin");
    await writeBase64ToFile(out, "aGk=");
    await expect(fs.readFile(out, "utf8")).resolves.toBe("hi");
    await fs.rm(dir, { recursive: true, force: true });
  });
});

92
src/cli/nodes-camera.ts Normal file
View File

@@ -0,0 +1,92 @@
import { randomUUID } from "node:crypto";
import * as fs from "node:fs/promises";
import * as os from "node:os";
import * as path from "node:path";
// Facing values accepted by the node's camera.* commands.
export type CameraFacing = "front" | "back";

// Shape of the payload returned by a node for `camera.snap`.
export type CameraSnapPayload = {
  format: string; // image format, e.g. "jpg"
  base64: string; // base64-encoded image bytes
  width: number;
  height: number;
};

// Shape of the payload returned by a node for `camera.clip`.
export type CameraClipPayload = {
  format: string; // video container, e.g. "mp4"
  base64: string; // base64-encoded video bytes
  durationMs: number;
  hasAudio: boolean;
};
/** Narrow an unknown value to a plain object record; anything else becomes {}. */
function asRecord(value: unknown): Record<string, unknown> {
  if (typeof value !== "object" || value === null) return {};
  return value as Record<string, unknown>;
}

/** Return the value when it is a string, otherwise undefined. */
function asString(value: unknown): string | undefined {
  if (typeof value === "string") return value;
  return undefined;
}

/** Return the value when it is a finite number (rejects NaN/Infinity). */
function asNumber(value: unknown): number | undefined {
  if (typeof value !== "number" || !Number.isFinite(value)) return undefined;
  return value;
}

/** Return the value when it is a boolean, otherwise undefined. */
function asBoolean(value: unknown): boolean | undefined {
  if (typeof value === "boolean") return value;
  return undefined;
}
export function parseCameraSnapPayload(value: unknown): CameraSnapPayload {
const obj = asRecord(value);
const format = asString(obj.format);
const base64 = asString(obj.base64);
const width = asNumber(obj.width);
const height = asNumber(obj.height);
if (!format || !base64 || width === undefined || height === undefined) {
throw new Error("invalid camera.snap payload");
}
return { format, base64, width, height };
}
export function parseCameraClipPayload(value: unknown): CameraClipPayload {
const obj = asRecord(value);
const format = asString(obj.format);
const base64 = asString(obj.base64);
const durationMs = asNumber(obj.durationMs);
const hasAudio = asBoolean(obj.hasAudio);
if (
!format ||
!base64 ||
durationMs === undefined ||
hasAudio === undefined
) {
throw new Error("invalid camera.clip payload");
}
return { format, base64, durationMs, hasAudio };
}
/**
 * Build a temp-file path for captured camera media of the form
 * `<tmpDir>/clawdis-camera-<kind>[-<facing>]-<id><ext>`.
 * Defaults: the OS temp dir and a random UUID id; a leading dot is added
 * to `ext` when missing.
 */
export function cameraTempPath(opts: {
  kind: "snap" | "clip";
  facing?: CameraFacing;
  ext: string;
  tmpDir?: string;
  id?: string;
}) {
  const dir = opts.tmpDir ?? os.tmpdir();
  const uniqueId = opts.id ?? randomUUID();
  const extension = opts.ext.startsWith(".") ? opts.ext : `.${opts.ext}`;
  const pieces = ["clawdis-camera-", opts.kind];
  if (opts.facing) pieces.push(`-${opts.facing}`);
  pieces.push(`-${uniqueId}`, extension);
  return path.join(dir, pieces.join(""));
}
/**
 * Decode a base64 string and write the bytes to `filePath`.
 * Returns the path along with the decoded byte count.
 * NOTE(review): Buffer.from silently skips invalid base64 characters —
 * callers get a short/empty file rather than an error in that case.
 */
export async function writeBase64ToFile(filePath: string, base64: string) {
  const decoded = Buffer.from(base64, "base64");
  await fs.writeFile(filePath, decoded);
  return { path: filePath, bytes: decoded.length };
}

View File

@@ -1,6 +1,13 @@
import type { Command } from "commander"; import type { Command } from "commander";
import { callGateway, randomIdempotencyKey } from "../gateway/call.js"; import { callGateway, randomIdempotencyKey } from "../gateway/call.js";
import { defaultRuntime } from "../runtime.js"; import { defaultRuntime } from "../runtime.js";
import {
type CameraFacing,
cameraTempPath,
parseCameraClipPayload,
parseCameraSnapPayload,
writeBase64ToFile,
} from "./nodes-camera.js";
type NodesRpcOpts = { type NodesRpcOpts = {
url?: string; url?: string;
@@ -12,6 +19,11 @@ type NodesRpcOpts = {
params?: string; params?: string;
invokeTimeout?: string; invokeTimeout?: string;
idempotencyKey?: string; idempotencyKey?: string;
facing?: string;
maxWidth?: string;
quality?: string;
duration?: string;
audio?: boolean;
}; };
type NodeListNode = { type NodeListNode = {
@@ -340,4 +352,203 @@ export function registerNodesCli(program: Command) {
}), }),
{ timeoutMs: 30_000 }, { timeoutMs: 30_000 },
); );
// Normalize a user-supplied facing flag; only "front"/"back" are accepted
// (case-insensitive, surrounding whitespace ignored).
const parseFacing = (value: string): CameraFacing => {
  const normalized = String(value ?? "")
    .trim()
    .toLowerCase();
  if (normalized !== "front" && normalized !== "back") {
    throw new Error(`invalid facing: ${value} (expected front|back)`);
  }
  return normalized;
};
// `nodes camera …` subcommands capture media from a paired node's camera,
// write the decoded bytes to a temp file, and print MEDIA:<path> so the
// agent can pick the file up as an attachment.
const camera = nodes
  .command("camera")
  .description("Capture camera media from a paired node");
nodesCallOpts(
  camera
    .command("snap")
    .description("Capture a photo from a node camera (prints MEDIA:<path>)")
    .requiredOption("--node <idOrNameOrIp>", "Node id, name, or IP")
    .option("--facing <front|back|both>", "Camera facing", "both")
    .option("--max-width <px>", "Max width in px (optional)")
    .option("--quality <0-1>", "JPEG quality (default 0.9)")
    .option(
      "--invoke-timeout <ms>",
      "Node invoke timeout in ms (default 20000)",
      "20000",
    )
    .action(async (opts: NodesRpcOpts) => {
      try {
        const nodeId = await resolveNodeId(opts, String(opts.node ?? ""));
        const facingOpt = String(opts.facing ?? "both")
          .trim()
          .toLowerCase();
        // "both" (the default) fans out to two sequential captures:
        // front first, then back.
        const facings: CameraFacing[] =
          facingOpt === "both"
            ? ["front", "back"]
            : facingOpt === "front" || facingOpt === "back"
              ? [facingOpt]
              : (() => {
                  throw new Error(
                    `invalid facing: ${String(opts.facing)} (expected front|back|both)`,
                  );
                })();
        const maxWidth = opts.maxWidth
          ? Number.parseInt(String(opts.maxWidth), 10)
          : undefined;
        const quality = opts.quality
          ? Number.parseFloat(String(opts.quality))
          : undefined;
        const timeoutMs = opts.invokeTimeout
          ? Number.parseInt(String(opts.invokeTimeout), 10)
          : undefined;
        const results: Array<{
          facing: CameraFacing;
          path: string;
          width: number;
          height: number;
        }> = [];
        for (const facing of facings) {
          const invokeParams: Record<string, unknown> = {
            nodeId,
            command: "camera.snap",
            params: {
              facing,
              // NaN from a malformed flag is dropped, not forwarded.
              maxWidth: Number.isFinite(maxWidth) ? maxWidth : undefined,
              quality: Number.isFinite(quality) ? quality : undefined,
              format: "jpg",
            },
            idempotencyKey: randomIdempotencyKey(),
          };
          if (typeof timeoutMs === "number" && Number.isFinite(timeoutMs)) {
            invokeParams.timeoutMs = timeoutMs;
          }
          const raw = (await callGatewayCli(
            "node.invoke",
            opts,
            invokeParams,
          )) as unknown;
          const res =
            typeof raw === "object" && raw !== null
              ? (raw as { payload?: unknown })
              : {};
          const payload = parseCameraSnapPayload(res.payload);
          const filePath = cameraTempPath({
            kind: "snap",
            facing,
            // Normalize "jpeg" to the "jpg" extension for consistency.
            ext: payload.format === "jpeg" ? "jpg" : payload.format,
          });
          await writeBase64ToFile(filePath, payload.base64);
          results.push({
            facing,
            path: filePath,
            width: payload.width,
            height: payload.height,
          });
        }
        if (opts.json) {
          defaultRuntime.log(JSON.stringify({ files: results }, null, 2));
          return;
        }
        // Text mode: one MEDIA:<path> line per captured facing.
        defaultRuntime.log(results.map((r) => `MEDIA:${r.path}`).join("\n"));
      } catch (err) {
        defaultRuntime.error(`nodes camera snap failed: ${String(err)}`);
        defaultRuntime.exit(1);
      }
    }),
  { timeoutMs: 60_000 },
);
// Clip capture: one facing, short duration; audio stays on unless --no-audio.
nodesCallOpts(
  camera
    .command("clip")
    .description(
      "Capture a short video clip from a node camera (prints MEDIA:<path>)",
    )
    .requiredOption("--node <idOrNameOrIp>", "Node id, name, or IP")
    .option("--facing <front|back>", "Camera facing", "front")
    .option("--duration <ms>", "Duration in ms (default 3000)", "3000")
    .option("--no-audio", "Disable audio capture")
    .option(
      "--invoke-timeout <ms>",
      "Node invoke timeout in ms (default 45000)",
      "45000",
    )
    .action(async (opts: NodesRpcOpts & { audio?: boolean }) => {
      try {
        const nodeId = await resolveNodeId(opts, String(opts.node ?? ""));
        const facing = parseFacing(String(opts.facing ?? "front"));
        const durationMs = Number.parseInt(
          String(opts.duration ?? "3000"),
          10,
        );
        // Commander's --no-audio flag surfaces as opts.audio === false.
        const includeAudio = opts.audio !== false;
        const timeoutMs = opts.invokeTimeout
          ? Number.parseInt(String(opts.invokeTimeout), 10)
          : undefined;
        const invokeParams: Record<string, unknown> = {
          nodeId,
          command: "camera.clip",
          params: {
            facing,
            // NaN from a malformed flag is dropped, not forwarded.
            durationMs: Number.isFinite(durationMs) ? durationMs : undefined,
            includeAudio,
            format: "mp4",
          },
          idempotencyKey: randomIdempotencyKey(),
        };
        if (typeof timeoutMs === "number" && Number.isFinite(timeoutMs)) {
          invokeParams.timeoutMs = timeoutMs;
        }
        const raw = (await callGatewayCli(
          "node.invoke",
          opts,
          invokeParams,
        )) as unknown;
        const res =
          typeof raw === "object" && raw !== null
            ? (raw as { payload?: unknown })
            : {};
        const payload = parseCameraClipPayload(res.payload);
        const filePath = cameraTempPath({
          kind: "clip",
          facing,
          ext: payload.format,
        });
        await writeBase64ToFile(filePath, payload.base64);
        if (opts.json) {
          defaultRuntime.log(
            JSON.stringify(
              {
                file: {
                  facing,
                  path: filePath,
                  durationMs: payload.durationMs,
                  hasAudio: payload.hasAudio,
                },
              },
              null,
              2,
            ),
          );
          return;
        }
        // Text mode: single MEDIA:<path> line.
        defaultRuntime.log(`MEDIA:${filePath}`);
      } catch (err) {
        defaultRuntime.error(`nodes camera clip failed: ${String(err)}`);
        defaultRuntime.exit(1);
      }
    }),
  { timeoutMs: 90_000 },
);
} }

View File

@@ -1,3 +1,4 @@
import * as fs from "node:fs/promises";
import { beforeEach, describe, expect, it, vi } from "vitest"; import { beforeEach, describe, expect, it, vi } from "vitest";
const sendCommand = vi.fn(); const sendCommand = vi.fn();
@@ -148,4 +149,145 @@ describe("cli program", () => {
); );
expect(runtime.log).toHaveBeenCalled(); expect(runtime.log).toHaveBeenCalled();
}); });
// End-to-end CLI test: `nodes camera snap` defaults to both facings, so the
// gateway mock sees three calls (node.list, then camera.snap front and back)
// and the command prints one MEDIA line per captured file.
it("runs nodes camera snap and prints two MEDIA paths", async () => {
  callGateway
    // 1st call: node.list, used by resolveNodeId.
    .mockResolvedValueOnce({
      ts: Date.now(),
      nodes: [
        {
          nodeId: "ios-node",
          displayName: "iOS Node",
          remoteIp: "192.168.0.88",
          connected: true,
        },
      ],
    })
    // 2nd/3rd calls: camera.snap responses for front, then back.
    .mockResolvedValueOnce({
      ok: true,
      nodeId: "ios-node",
      command: "camera.snap",
      payload: { format: "jpg", base64: "aGk=", width: 1, height: 1 },
    })
    .mockResolvedValueOnce({
      ok: true,
      nodeId: "ios-node",
      command: "camera.snap",
      payload: { format: "jpg", base64: "aGk=", width: 1, height: 1 },
    });
  const program = buildProgram();
  runtime.log.mockClear();
  await program.parseAsync(
    ["nodes", "camera", "snap", "--node", "ios-node"],
    {
      from: "user",
    },
  );
  expect(callGateway).toHaveBeenNthCalledWith(
    2,
    expect.objectContaining({
      method: "node.invoke",
      params: expect.objectContaining({
        nodeId: "ios-node",
        command: "camera.snap",
        timeoutMs: 20000,
        idempotencyKey: "idem-test",
        params: expect.objectContaining({ facing: "front", format: "jpg" }),
      }),
    }),
  );
  expect(callGateway).toHaveBeenNthCalledWith(
    3,
    expect.objectContaining({
      method: "node.invoke",
      params: expect.objectContaining({
        nodeId: "ios-node",
        command: "camera.snap",
        timeoutMs: 20000,
        idempotencyKey: "idem-test",
        params: expect.objectContaining({ facing: "back", format: "jpg" }),
      }),
    }),
  );
  // "aGk=" decodes to "hi"; confirm each MEDIA file was written, then clean up.
  const out = String(runtime.log.mock.calls[0]?.[0] ?? "");
  const mediaPaths = out
    .split("\n")
    .filter((l) => l.startsWith("MEDIA:"))
    .map((l) => l.replace(/^MEDIA:/, ""))
    .filter(Boolean);
  expect(mediaPaths).toHaveLength(2);
  try {
    for (const p of mediaPaths) {
      await expect(fs.readFile(p, "utf8")).resolves.toBe("hi");
    }
  } finally {
    await Promise.all(mediaPaths.map((p) => fs.unlink(p).catch(() => {})));
  }
});
// `nodes camera clip` performs a single camera.clip invoke and prints one
// MEDIA line; the temp filename encodes kind and facing.
it("runs nodes camera clip and prints one MEDIA path", async () => {
  callGateway
    .mockResolvedValueOnce({
      ts: Date.now(),
      nodes: [
        {
          nodeId: "ios-node",
          displayName: "iOS Node",
          remoteIp: "192.168.0.88",
          connected: true,
        },
      ],
    })
    .mockResolvedValueOnce({
      ok: true,
      nodeId: "ios-node",
      command: "camera.clip",
      payload: {
        format: "mp4",
        base64: "aGk=",
        durationMs: 3000,
        hasAudio: true,
      },
    });
  const program = buildProgram();
  runtime.log.mockClear();
  await program.parseAsync(
    ["nodes", "camera", "clip", "--node", "ios-node", "--duration", "3000"],
    { from: "user" },
  );
  expect(callGateway).toHaveBeenNthCalledWith(
    2,
    expect.objectContaining({
      method: "node.invoke",
      params: expect.objectContaining({
        nodeId: "ios-node",
        command: "camera.clip",
        timeoutMs: 45000,
        idempotencyKey: "idem-test",
        params: expect.objectContaining({
          facing: "front",
          durationMs: 3000,
          includeAudio: true,
          format: "mp4",
        }),
      }),
    }),
  );
  const out = String(runtime.log.mock.calls[0]?.[0] ?? "");
  const mediaPath = out.replace(/^MEDIA:/, "").trim();
  expect(mediaPath).toMatch(/clawdis-camera-clip-front-.*\.mp4$/);
  try {
    await expect(fs.readFile(mediaPath, "utf8")).resolves.toBe("hi");
  } finally {
    await fs.unlink(mediaPath).catch(() => {});
  }
});
}); });