feat(camera): add snap/clip capture

This commit is contained in:
Peter Steinberger
2025-12-14 00:48:58 +00:00
parent 2454e67e09
commit a92eb1f33d
19 changed files with 1669 additions and 2 deletions

View File

@@ -0,0 +1,319 @@
import AVFoundation
import ClawdisKit
import Foundation
import UIKit
actor CameraController {
/// Failures reported by camera capture on iOS.
///
/// `errorDescription` supplies the text shown to callers: fixed strings for the
/// availability cases, `"<kind> permission denied"` for permission failures, and the
/// associated message verbatim for the remaining cases.
enum CameraError: LocalizedError, Sendable {
    case cameraUnavailable
    case microphoneUnavailable
    case permissionDenied(kind: String)
    case invalidParams(String)
    case captureFailed(String)
    case exportFailed(String)

    var errorDescription: String? {
        switch self {
        case .cameraUnavailable:
            return "Camera unavailable"
        case .microphoneUnavailable:
            return "Microphone unavailable"
        case let .permissionDenied(kind):
            return "\(kind) permission denied"
        case let .invalidParams(message),
             let .captureFailed(message),
             let .exportFailed(message):
            // These cases carry their own human-readable message.
            return message
        }
    }
}
/// Captures a single still photo and returns it as a base64-encoded JPEG.
///
/// - Parameter params: Optional `facing` (defaults to `.front`), `maxWidth` (non-positive
///   values are ignored) and `quality` (clamped by `clampQuality`).
/// - Returns: The format tag `"jpg"`, the base64 payload, and the rounded width/height of the
///   re-encoded image.
/// - Throws: `CameraError` when permission is missing or the session cannot be configured,
///   or whatever error the capture delegate reports.
func snap(params: ClawdisCameraSnapParams) async throws -> (
    format: String,
    base64: String,
    width: Int,
    height: Int)
{
    let facing = params.facing ?? .front
    let maxWidth = params.maxWidth.flatMap { $0 > 0 ? $0 : nil }
    let quality = Self.clampQuality(params.quality)

    try await self.ensureAccess(for: .video)

    let session = AVCaptureSession()
    session.sessionPreset = .photo

    guard let device = Self.pickCamera(facing: facing) else {
        throw CameraError.cameraUnavailable
    }
    let input = try AVCaptureDeviceInput(device: device)
    guard session.canAddInput(input) else {
        throw CameraError.captureFailed("Failed to add camera input")
    }
    session.addInput(input)

    let output = AVCapturePhotoOutput()
    guard session.canAddOutput(output) else {
        throw CameraError.captureFailed("Failed to add photo output")
    }
    session.addOutput(output)
    output.maxPhotoQualityPrioritization = .quality

    session.startRunning()
    defer { session.stopRunning() }

    let settings: AVCapturePhotoSettings = {
        if output.availablePhotoCodecTypes.contains(.jpeg) {
            return AVCapturePhotoSettings(format: [AVVideoCodecKey: AVVideoCodecType.jpeg])
        }
        return AVCapturePhotoSettings()
    }()
    settings.photoQualityPrioritization = .quality

    // Hold our own strong reference to the delegate until the capture has finished instead of
    // relying on AVCapturePhotoOutput to keep it alive: if the delegate were released before
    // the callback fired, the continuation would never be resumed and this call would hang.
    var delegate: PhotoCaptureDelegate?
    let rawData: Data = try await withCheckedThrowingContinuation { cont in
        let captureDelegate = PhotoCaptureDelegate(cont)
        delegate = captureDelegate
        output.capturePhoto(with: settings, delegate: captureDelegate)
    }
    withExtendedLifetime(delegate) {}

    let (finalData, size) = try Self.reencodeJPEG(
        imageData: rawData,
        maxWidth: maxWidth,
        quality: quality)

    return (
        format: "jpg",
        base64: finalData.base64EncodedString(),
        width: Int(size.width.rounded()),
        height: Int(size.height.rounded()))
}
/// Records a short video clip and returns it as a base64-encoded MP4.
///
/// - Parameter params: Optional `facing` (defaults to `.front`), `durationMs` (clamped by
///   `clampDurationMs`) and `includeAudio` (defaults to `true`).
/// - Returns: The format tag `"mp4"`, the base64 payload, the clamped duration that was
///   requested from the recorder, and whether a microphone input was attached.
/// - Throws: `CameraError` when permission is missing, a device is unavailable, the session
///   cannot be configured or the export fails; otherwise the recording delegate's error.
func clip(params: ClawdisCameraClipParams) async throws -> (
    format: String,
    base64: String,
    durationMs: Int,
    hasAudio: Bool)
{
    let facing = params.facing ?? .front
    let durationMs = Self.clampDurationMs(params.durationMs)
    let includeAudio = params.includeAudio ?? true

    try await self.ensureAccess(for: .video)
    if includeAudio {
        try await self.ensureAccess(for: .audio)
    }

    let session = AVCaptureSession()
    session.sessionPreset = .high

    guard let camera = Self.pickCamera(facing: facing) else {
        throw CameraError.cameraUnavailable
    }
    let cameraInput = try AVCaptureDeviceInput(device: camera)
    guard session.canAddInput(cameraInput) else {
        throw CameraError.captureFailed("Failed to add camera input")
    }
    session.addInput(cameraInput)

    if includeAudio {
        guard let mic = AVCaptureDevice.default(for: .audio) else {
            throw CameraError.microphoneUnavailable
        }
        let micInput = try AVCaptureDeviceInput(device: mic)
        guard session.canAddInput(micInput) else {
            throw CameraError.captureFailed("Failed to add microphone input")
        }
        session.addInput(micInput)
    }

    let output = AVCaptureMovieFileOutput()
    guard session.canAddOutput(output) else {
        throw CameraError.captureFailed("Failed to add movie output")
    }
    session.addOutput(output)
    // The recorder stops by itself once this duration has elapsed.
    output.maxRecordedDuration = CMTime(value: Int64(durationMs), timescale: 1000)

    session.startRunning()
    defer { session.stopRunning() }

    let movURL = FileManager.default.temporaryDirectory
        .appendingPathComponent("clawdis-camera-\(UUID().uuidString).mov")
    let mp4URL = FileManager.default.temporaryDirectory
        .appendingPathComponent("clawdis-camera-\(UUID().uuidString).mp4")
    defer {
        // Best-effort cleanup; the payload is returned in memory.
        try? FileManager.default.removeItem(at: movURL)
        try? FileManager.default.removeItem(at: mp4URL)
    }

    // Hold our own strong reference to the delegate for the whole recording instead of relying
    // on the output to keep it alive: if it were released before the completion callback, the
    // continuation would never be resumed and this call would hang.
    var delegate: MovieFileDelegate?
    let recordedURL: URL = try await withCheckedThrowingContinuation { cont in
        let recordingDelegate = MovieFileDelegate(cont)
        delegate = recordingDelegate
        output.startRecording(to: movURL, recordingDelegate: recordingDelegate)
    }
    withExtendedLifetime(delegate) {}

    // Transcode .mov -> .mp4 for easier downstream handling.
    try await Self.exportToMP4(inputURL: recordedURL, outputURL: mp4URL)

    let data = try Data(contentsOf: mp4URL)
    return (format: "mp4", base64: data.base64EncodedString(), durationMs: durationMs, hasAudio: includeAudio)
}
/// Ensures capture permission for `mediaType`, prompting the user when it is undetermined.
///
/// - Throws: `CameraError.permissionDenied` (kind `"Camera"` for `.video`, otherwise
///   `"Microphone"`) unless access is, or becomes, authorized.
private func ensureAccess(for mediaType: AVMediaType) async throws {
    let kind = mediaType == .video ? "Camera" : "Microphone"

    switch AVCaptureDevice.authorizationStatus(for: mediaType) {
    case .authorized:
        return
    case .notDetermined:
        // Bridge the callback-based permission prompt into async/await.
        let granted: Bool = await withCheckedContinuation(isolation: nil) { cont in
            AVCaptureDevice.requestAccess(for: mediaType) { allowed in
                cont.resume(returning: allowed)
            }
        }
        guard granted else {
            throw CameraError.permissionDenied(kind: kind)
        }
    case .denied, .restricted:
        throw CameraError.permissionDenied(kind: kind)
    @unknown default:
        // Treat statuses added by future SDKs as "not allowed".
        throw CameraError.permissionDenied(kind: kind)
    }
}
/// Returns the built-in wide-angle video camera for the requested facing, or `nil` if the
/// system reports none. Any facing other than `.front` selects the back position.
private nonisolated static func pickCamera(facing: ClawdisCameraFacing) -> AVCaptureDevice? {
    AVCaptureDevice.default(
        .builtInWideAngleCamera,
        for: .video,
        position: facing == .front ? .front : .back)
}
/// Returns the JPEG quality to use: `quality` limited to `0.05...1.0`, or `0.9` when `nil`.
private nonisolated static func clampQuality(_ quality: Double?) -> Double {
    let requested = quality ?? 0.9
    let atLeastMinimum = max(0.05, requested)
    return min(1.0, atLeastMinimum)
}
/// Returns the clip duration to use: `ms` limited to 250...15000 ms, or 3000 ms when `nil`.
private nonisolated static func clampDurationMs(_ ms: Int?) -> Int {
    let requested = ms ?? 3000
    // Keep clips short by default; avoid huge base64 payloads on the bridge.
    let atLeastMinimum = max(250, requested)
    return min(15000, atLeastMinimum)
}
/// Decodes captured image data, optionally downscales it, and re-encodes it as JPEG.
///
/// - Parameters:
///   - imageData: Encoded image bytes from the capture.
///   - maxWidth: When present and positive, the image is passed through `downscale`.
///   - quality: JPEG compression quality handed to `jpegData(compressionQuality:)`.
/// - Returns: The JPEG bytes and the `size` of the image that was encoded.
/// - Throws: `CameraError.captureFailed` if decoding or encoding fails.
private nonisolated static func reencodeJPEG(
    imageData: Data,
    maxWidth: Int?,
    quality: Double) throws -> (data: Data, size: CGSize)
{
    guard let decoded = UIImage(data: imageData) else {
        throw CameraError.captureFailed("Failed to decode captured image")
    }

    var result = decoded
    if let maxWidth, maxWidth > 0 {
        result = Self.downscale(image: decoded, maxWidth: CGFloat(maxWidth))
    }

    guard let encoded = result.jpegData(compressionQuality: quality) else {
        throw CameraError.captureFailed("Failed to encode JPEG")
    }
    return (data: encoded, size: result.size)
}
/// Returns `image` scaled down proportionally so that it is at most `maxWidth` wide.
///
/// Images that are already narrow enough, or that have a non-positive dimension, are
/// returned unchanged.
///
/// - Parameters:
///   - image: The source image.
///   - maxWidth: Maximum width of the result, in the same units as `image.size`.
/// - Returns: The original image or a redrawn copy of size `maxWidth` x proportional height.
private nonisolated static func downscale(image: UIImage, maxWidth: CGFloat) -> UIImage {
    let w = image.size.width
    let h = image.size.height
    guard w > 0, h > 0 else { return image }
    guard w > maxWidth else { return image }

    let scale = maxWidth / w
    // Round to a whole number so the reported size matches the rendered bitmap.
    let target = CGSize(width: maxWidth, height: max(1, (h * scale).rounded()))

    let format = UIGraphicsImageRendererFormat.default()
    // The default format renders at the device's screen scale (2x/3x), which would make the
    // encoded JPEG several times wider than `maxWidth` while `size` still reported `maxWidth`.
    // Render one pixel per point so the output honors the requested width.
    format.scale = 1
    format.opaque = false

    let renderer = UIGraphicsImageRenderer(size: target, format: format)
    return renderer.image { _ in
        image.draw(in: CGRect(origin: .zero, size: target))
    }
}
/// Transcodes the movie at `inputURL` into an MP4 file at `outputURL`.
///
/// Uses the highest-quality export preset with network optimization enabled.
///
/// - Throws: `CameraError.exportFailed` when the export session cannot be created, is
///   cancelled or ends in an unexpected state; on failure the session's own error is thrown
///   when it provides one.
private nonisolated static func exportToMP4(inputURL: URL, outputURL: URL) async throws {
    guard let session = AVAssetExportSession(
        asset: AVAsset(url: inputURL),
        presetName: AVAssetExportPresetHighestQuality)
    else {
        throw CameraError.exportFailed("Failed to create export session")
    }
    session.outputURL = outputURL
    session.outputFileType = .mp4
    session.shouldOptimizeForNetworkUse = true

    try await withCheckedThrowingContinuation(isolation: nil) { (cont: CheckedContinuation<Void, Error>) in
        session.exportAsynchronously {
            // Map the final session status onto exactly one continuation outcome.
            let outcome: Result<Void, Error> = switch session.status {
            case .completed:
                .success(())
            case .failed:
                .failure(session.error ?? CameraError.exportFailed("Export failed"))
            case .cancelled:
                .failure(CameraError.exportFailed("Export cancelled"))
            default:
                .failure(CameraError.exportFailed("Export did not complete"))
            }
            cont.resume(with: outcome)
        }
    }
}
}
/// Bridges the photo-capture callback to a checked continuation.
///
/// The continuation is resumed at most once: with the photo's file data, with the capture
/// error, or with a "photo data missing" error when no data representation is available.
private final class PhotoCaptureDelegate: NSObject, AVCapturePhotoCaptureDelegate {
    /// Set until the first callback arrives; `nil` afterwards so later callbacks are ignored.
    private var pending: CheckedContinuation<Data, Error>?

    init(_ continuation: CheckedContinuation<Data, Error>) {
        self.pending = continuation
    }

    func photoOutput(
        _ output: AVCapturePhotoOutput,
        didFinishProcessingPhoto photo: AVCapturePhoto,
        error: Error?)
    {
        guard let continuation = self.pending else { return }
        self.pending = nil

        if let error {
            continuation.resume(throwing: error)
        } else if let data = photo.fileDataRepresentation() {
            continuation.resume(returning: data)
        } else {
            continuation.resume(
                throwing: NSError(domain: "Camera", code: 1, userInfo: [
                    NSLocalizedDescriptionKey: "photo data missing",
                ]))
        }
    }
}
/// Bridges the movie-recording completion callback to a checked continuation.
///
/// The continuation is resumed at most once, with the recorded file URL on success or the
/// recording error on failure.
private final class MovieFileDelegate: NSObject, AVCaptureFileOutputRecordingDelegate {
    private let continuation: CheckedContinuation<URL, Error>
    /// Guards against resuming the continuation more than once.
    private var didResume = false

    init(_ continuation: CheckedContinuation<URL, Error>) {
        self.continuation = continuation
    }

    func fileOutput(
        _ output: AVCaptureFileOutput,
        didFinishRecordingTo outputFileURL: URL,
        from connections: [AVCaptureConnection],
        error: Error?)
    {
        guard !self.didResume else { return }
        self.didResume = true

        if let error {
            let ns = error as NSError
            // Clips are stopped via `maxRecordedDuration`, so the recorder finishes with
            // `maximumDurationReached`. That is the expected way for a clip to end and the
            // file is usable, so report it as success instead of failing every recording.
            if ns.domain == AVFoundationErrorDomain,
               ns.code == AVError.maximumDurationReached.rawValue
            {
                self.continuation.resume(returning: outputFileURL)
                return
            }
            self.continuation.resume(throwing: error)
            return
        }
        self.continuation.resume(returning: outputFileURL)
    }
}

View File

@@ -26,6 +26,8 @@
</array>
<key>NSLocalNetworkUsageDescription</key>
<string>Clawdis discovers and connects to your Clawdis bridge on the local network.</string>
<key>NSCameraUsageDescription</key>
<string>Clawdis can capture photos or short video clips when requested via the bridge.</string>
<key>NSMicrophoneUsageDescription</key>
<string>Clawdis needs microphone access for voice wake.</string>
<key>NSSpeechRecognitionUsageDescription</key>

View File

@@ -6,6 +6,7 @@ import SwiftUI
final class NodeAppModel: ObservableObject {
@Published var isBackgrounded: Bool = false
let screen = ScreenController()
let camera = CameraController()
@Published var bridgeStatusText: String = "Not connected"
@Published var bridgeServerName: String?
@Published var bridgeRemoteAddress: String?
@@ -182,13 +183,22 @@ final class NodeAppModel: ObservableObject {
}
private func handleInvoke(_ req: BridgeInvokeRequest) async -> BridgeInvokeResponse {
if req.command.hasPrefix("screen."), self.isBackgrounded {
if req.command.hasPrefix("screen.") || req.command.hasPrefix("camera."), self.isBackgrounded {
return BridgeInvokeResponse(
id: req.id,
ok: false,
error: ClawdisNodeError(
code: .backgroundUnavailable,
message: "NODE_BACKGROUND_UNAVAILABLE: screen commands require foreground"))
message: "NODE_BACKGROUND_UNAVAILABLE: screen/camera commands require foreground"))
}
if req.command.hasPrefix("camera."), !self.isCameraEnabled() {
return BridgeInvokeResponse(
id: req.id,
ok: false,
error: ClawdisNodeError(
code: .unavailable,
message: "CAMERA_DISABLED: enable Camera in iOS Settings → Camera → Allow Camera"))
}
do {
@@ -222,6 +232,46 @@ final class NodeAppModel: ObservableObject {
let payload = try Self.encodePayload(["format": "png", "base64": base64])
return BridgeInvokeResponse(id: req.id, ok: true, payloadJSON: payload)
case ClawdisCameraCommand.snap.rawValue:
let params = (try? Self.decodeParams(ClawdisCameraSnapParams.self, from: req.paramsJSON)) ??
ClawdisCameraSnapParams()
let res = try await self.camera.snap(params: params)
struct Payload: Codable {
var format: String
var base64: String
var width: Int
var height: Int
}
let payload = try Self.encodePayload(Payload(
format: res.format,
base64: res.base64,
width: res.width,
height: res.height))
return BridgeInvokeResponse(id: req.id, ok: true, payloadJSON: payload)
case ClawdisCameraCommand.clip.rawValue:
let params = (try? Self.decodeParams(ClawdisCameraClipParams.self, from: req.paramsJSON)) ??
ClawdisCameraClipParams()
let suspended = (params.includeAudio ?? true) ? self.voiceWake.suspendForExternalAudioCapture() : false
defer { self.voiceWake.resumeAfterExternalAudioCapture(wasSuspended: suspended) }
let res = try await self.camera.clip(params: params)
struct Payload: Codable {
var format: String
var base64: String
var durationMs: Int
var hasAudio: Bool
}
let payload = try Self.encodePayload(Payload(
format: res.format,
base64: res.base64,
durationMs: res.durationMs,
hasAudio: res.hasAudio))
return BridgeInvokeResponse(id: req.id, ok: true, payloadJSON: payload)
default:
return BridgeInvokeResponse(
id: req.id,
@@ -254,4 +304,10 @@ final class NodeAppModel: ObservableObject {
}
return json
}
/// Reads the `camera.enabled` preference from standard user defaults.
/// A missing key counts as enabled (default-on).
private func isCameraEnabled() -> Bool {
    let defaults = UserDefaults.standard
    guard defaults.object(forKey: "camera.enabled") != nil else {
        // Default-on: if the key doesn't exist yet, treat it as enabled.
        return true
    }
    return defaults.bool(forKey: "camera.enabled")
}
}

View File

@@ -205,6 +205,37 @@ final class VoiceWakeManager: NSObject, ObservableObject {
try? AVAudioSession.sharedInstance().setActive(false, options: .notifyOthersOnDeactivation)
}
/// Temporarily releases the microphone so other subsystems (e.g. camera video capture) can record audio.
/// Returns `true` when listening was active and was suspended.
///
/// Pass the returned flag to `resumeAfterExternalAudioCapture(wasSuspended:)` once the external
/// capture has finished.
func suspendForExternalAudioCapture() -> Bool {
// Nothing to suspend unless voice wake is enabled and currently listening.
guard self.isEnabled, self.isListening else { return false }
self.isListening = false
self.statusText = "Paused"
// Stop draining the tap and drop whatever is still queued.
self.tapDrainTask?.cancel()
self.tapDrainTask = nil
self.tapQueue?.clear()
self.tapQueue = nil
// Cancel the in-flight recognition and release its request.
self.recognitionTask?.cancel()
self.recognitionTask = nil
self.recognitionRequest = nil
// The engine is stopped and the input tap removed only when the engine is running.
if self.audioEngine.isRunning {
self.audioEngine.stop()
self.audioEngine.inputNode.removeTap(onBus: 0)
}
// Best-effort: deactivation errors are ignored.
try? AVAudioSession.sharedInstance().setActive(false, options: .notifyOthersOnDeactivation)
return true
}
/// Restarts listening after an external audio capture, but only if
/// `suspendForExternalAudioCapture()` reported that it actually suspended it.
///
/// - Parameter wasSuspended: The value returned by `suspendForExternalAudioCapture()`.
func resumeAfterExternalAudioCapture(wasSuspended: Bool) {
    if wasSuspended {
        // Restart asynchronously; this method does not wait for `start()` to finish.
        Task { await self.start() }
    }
}
private func startRecognition() throws {
self.recognitionTask?.cancel()
self.recognitionTask = nil

View File

@@ -54,5 +54,6 @@ targets:
NSLocalNetworkUsageDescription: Clawdis discovers and connects to your Clawdis bridge on the local network.
NSBonjourServices:
- _clawdis-bridge._tcp
NSCameraUsageDescription: Clawdis can capture photos or short video clips when requested via the bridge.
NSMicrophoneUsageDescription: Clawdis needs microphone access for voice wake.
NSSpeechRecognitionUsageDescription: Clawdis uses on-device speech recognition for voice wake.

View File

@@ -0,0 +1,341 @@
import AVFoundation
import ClawdisIPC
import CoreGraphics
import Foundation
import ImageIO
import OSLog
import UniformTypeIdentifiers
actor CameraCaptureService {
/// Failures reported by camera capture on macOS.
///
/// `errorDescription` supplies the text shown to callers: fixed strings for the
/// availability cases, `"<kind> permission denied"` for permission failures, and the
/// associated message verbatim for capture/export failures.
enum CameraError: LocalizedError, Sendable {
    case cameraUnavailable
    case microphoneUnavailable
    case permissionDenied(kind: String)
    case captureFailed(String)
    case exportFailed(String)

    var errorDescription: String? {
        switch self {
        case .cameraUnavailable:
            return "Camera unavailable"
        case .microphoneUnavailable:
            return "Microphone unavailable"
        case let .permissionDenied(kind):
            return "\(kind) permission denied"
        case let .captureFailed(message),
             let .exportFailed(message):
            // These cases carry their own human-readable message.
            return message
        }
    }
}
private let logger = Logger(subsystem: "com.steipete.clawdis", category: "camera")
/// Captures a single still photo and returns it re-encoded as JPEG.
///
/// - Parameters:
///   - facing: Preferred camera; defaults to `.front`.
///   - maxWidth: Optional maximum width; non-positive values are ignored.
///   - quality: Optional JPEG quality, clamped by `clampQuality`.
/// - Returns: The JPEG bytes and the size of the encoded image.
/// - Throws: `CameraError` when permission is missing or the session cannot be configured,
///   or whatever error the capture delegate reports.
func snap(facing: CameraFacing?, maxWidth: Int?, quality: Double?) async throws -> (data: Data, size: CGSize) {
    let facing = facing ?? .front
    let maxWidth = maxWidth.flatMap { $0 > 0 ? $0 : nil }
    let quality = Self.clampQuality(quality)

    try await self.ensureAccess(for: .video)

    let session = AVCaptureSession()
    session.sessionPreset = .photo

    guard let device = Self.pickCamera(facing: facing) else {
        throw CameraError.cameraUnavailable
    }
    let input = try AVCaptureDeviceInput(device: device)
    guard session.canAddInput(input) else {
        throw CameraError.captureFailed("Failed to add camera input")
    }
    session.addInput(input)

    let output = AVCapturePhotoOutput()
    guard session.canAddOutput(output) else {
        throw CameraError.captureFailed("Failed to add photo output")
    }
    session.addOutput(output)
    output.maxPhotoQualityPrioritization = .quality

    session.startRunning()
    defer { session.stopRunning() }

    let settings: AVCapturePhotoSettings = {
        if output.availablePhotoCodecTypes.contains(.jpeg) {
            return AVCapturePhotoSettings(format: [AVVideoCodecKey: AVVideoCodecType.jpeg])
        }
        return AVCapturePhotoSettings()
    }()
    settings.photoQualityPrioritization = .quality

    // Hold our own strong reference to the delegate until the capture has finished instead of
    // relying on AVCapturePhotoOutput to keep it alive: if the delegate were released before
    // the callback fired, the continuation would never be resumed and this call would hang.
    var delegate: PhotoCaptureDelegate?
    let rawData: Data = try await withCheckedThrowingContinuation(isolation: nil) { cont in
        let captureDelegate = PhotoCaptureDelegate(cont)
        delegate = captureDelegate
        output.capturePhoto(with: settings, delegate: captureDelegate)
    }
    withExtendedLifetime(delegate) {}

    return try Self.reencodeJPEG(imageData: rawData, maxWidth: maxWidth, quality: quality)
}
/// Records a short video clip and exports it as an MP4 file.
///
/// - Parameters:
///   - facing: Preferred camera; defaults to `.front`.
///   - durationMs: Optional clip length, clamped by `clampDurationMs`.
///   - includeAudio: Whether to request microphone access and attach a microphone input.
///   - outPath: Destination file path; when `nil` or blank a file in the temporary
///     directory is used. An existing file at the destination is removed first.
/// - Returns: The path of the exported MP4, the clamped duration requested from the
///   recorder, and whether a microphone input was attached.
/// - Throws: `CameraError` when permission is missing, a device is unavailable, the session
///   cannot be configured or the export fails; otherwise the recording delegate's error.
func clip(
    facing: CameraFacing?,
    durationMs: Int?,
    includeAudio: Bool,
    outPath: String?) async throws -> (path: String, durationMs: Int, hasAudio: Bool)
{
    let facing = facing ?? .front
    let durationMs = Self.clampDurationMs(durationMs)

    try await self.ensureAccess(for: .video)
    if includeAudio {
        try await self.ensureAccess(for: .audio)
    }

    let session = AVCaptureSession()
    session.sessionPreset = .high

    guard let camera = Self.pickCamera(facing: facing) else {
        throw CameraError.cameraUnavailable
    }
    let cameraInput = try AVCaptureDeviceInput(device: camera)
    guard session.canAddInput(cameraInput) else {
        throw CameraError.captureFailed("Failed to add camera input")
    }
    session.addInput(cameraInput)

    if includeAudio {
        guard let mic = AVCaptureDevice.default(for: .audio) else {
            throw CameraError.microphoneUnavailable
        }
        let micInput = try AVCaptureDeviceInput(device: mic)
        guard session.canAddInput(micInput) else {
            throw CameraError.captureFailed("Failed to add microphone input")
        }
        session.addInput(micInput)
    }

    let output = AVCaptureMovieFileOutput()
    guard session.canAddOutput(output) else {
        throw CameraError.captureFailed("Failed to add movie output")
    }
    session.addOutput(output)
    // The recorder stops by itself once this duration has elapsed.
    output.maxRecordedDuration = CMTime(value: Int64(durationMs), timescale: 1000)

    session.startRunning()
    defer { session.stopRunning() }

    let tmpMovURL = FileManager.default.temporaryDirectory
        .appendingPathComponent("clawdis-camera-\(UUID().uuidString).mov")
    defer { try? FileManager.default.removeItem(at: tmpMovURL) }

    let outputURL: URL = {
        if let outPath, !outPath.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty {
            return URL(fileURLWithPath: outPath)
        }
        return FileManager.default.temporaryDirectory
            .appendingPathComponent("clawdis-camera-\(UUID().uuidString).mp4")
    }()

    // Ensure we don't fail exporting due to an existing file.
    try? FileManager.default.removeItem(at: outputURL)

    let logger = self.logger
    // Hold our own strong reference to the delegate for the whole recording instead of relying
    // on the output to keep it alive: if it were released before the completion callback, the
    // continuation would never be resumed and this call would hang.
    var delegate: MovieFileDelegate?
    let recordedURL: URL = try await withCheckedThrowingContinuation(isolation: nil) { cont in
        let recordingDelegate = MovieFileDelegate(cont, logger: logger)
        delegate = recordingDelegate
        output.startRecording(to: tmpMovURL, recordingDelegate: recordingDelegate)
    }
    withExtendedLifetime(delegate) {}

    try await Self.exportToMP4(inputURL: recordedURL, outputURL: outputURL)
    return (path: outputURL.path, durationMs: durationMs, hasAudio: includeAudio)
}
/// Ensures capture permission for `mediaType`, prompting the user when it is undetermined.
///
/// - Throws: `CameraError.permissionDenied` (kind `"Camera"` for `.video`, otherwise
///   `"Microphone"`) unless access is, or becomes, authorized.
private func ensureAccess(for mediaType: AVMediaType) async throws {
    let kind = mediaType == .video ? "Camera" : "Microphone"

    switch AVCaptureDevice.authorizationStatus(for: mediaType) {
    case .authorized:
        return
    case .notDetermined:
        // Bridge the callback-based permission prompt into async/await.
        let granted: Bool = await withCheckedContinuation(isolation: nil) { cont in
            AVCaptureDevice.requestAccess(for: mediaType) { allowed in
                cont.resume(returning: allowed)
            }
        }
        guard granted else {
            throw CameraError.permissionDenied(kind: kind)
        }
    case .denied, .restricted:
        throw CameraError.permissionDenied(kind: kind)
    @unknown default:
        // Treat statuses added by future SDKs as "not allowed".
        throw CameraError.permissionDenied(kind: kind)
    }
}
/// Returns the camera to capture from: the built-in wide-angle camera at the requested
/// position when one exists, otherwise the system's default video device (which may be `nil`).
private nonisolated static func pickCamera(facing: CameraFacing) -> AVCaptureDevice? {
    let wanted: AVCaptureDevice.Position = facing == .front ? .front : .back
    // Many macOS cameras report `unspecified` position; fall back to any default.
    return AVCaptureDevice.default(.builtInWideAngleCamera, for: .video, position: wanted)
        ?? AVCaptureDevice.default(for: .video)
}
/// Returns the JPEG quality to use: `quality` limited to `0.05...1.0`, or `0.9` when `nil`.
private nonisolated static func clampQuality(_ quality: Double?) -> Double {
    let requested = quality ?? 0.9
    let atLeastMinimum = max(0.05, requested)
    return min(1.0, atLeastMinimum)
}
/// Returns the clip duration to use: `ms` limited to 250...15000 ms, or 3000 ms when `nil`.
private nonisolated static func clampDurationMs(_ ms: Int?) -> Int {
    let requested = ms ?? 3000
    let atLeastMinimum = max(250, requested)
    return min(15_000, atLeastMinimum)
}
/// Decodes captured image data, optionally downscales it, and re-encodes it as JPEG.
///
/// - Parameters:
///   - imageData: Encoded image bytes from the capture; the first image in the data is used.
///   - maxWidth: When present and smaller than the decoded width, the image is downscaled.
///   - quality: Lossy compression quality passed to the JPEG destination.
/// - Returns: The JPEG bytes and the width/height of the `CGImage` that was encoded.
/// - Throws: `CameraError.captureFailed` if decoding, downscaling or encoding fails.
private nonisolated static func reencodeJPEG(
    imageData: Data,
    maxWidth: Int?,
    quality: Double) throws -> (data: Data, size: CGSize)
{
    guard let source = CGImageSourceCreateWithData(imageData as CFData, nil),
          let decoded = CGImageSourceCreateImageAtIndex(source, 0, nil)
    else {
        throw CameraError.captureFailed("Failed to decode captured image")
    }

    var result = decoded
    if let maxWidth, decoded.width > maxWidth {
        guard let scaled = Self.downscale(image: decoded, maxWidth: maxWidth) else {
            throw CameraError.captureFailed("Failed to downscale image")
        }
        result = scaled
    }

    let buffer = NSMutableData()
    guard let destination = CGImageDestinationCreateWithData(
        buffer,
        UTType.jpeg.identifier as CFString,
        1,
        nil)
    else {
        throw CameraError.captureFailed("Failed to create JPEG destination")
    }

    let options = [kCGImageDestinationLossyCompressionQuality: quality] as CFDictionary
    CGImageDestinationAddImage(destination, result, options)
    guard CGImageDestinationFinalize(destination) else {
        throw CameraError.captureFailed("Failed to encode JPEG")
    }

    return (data: buffer as Data, size: CGSize(width: result.width, height: result.height))
}
/// Returns `image` redrawn at `maxWidth` pixels wide with a proportional height.
///
/// Images that already fit, or that have a non-positive dimension, are returned unchanged.
/// Returns `nil` when the bitmap context or the resulting image cannot be created.
private nonisolated static func downscale(image: CGImage, maxWidth: Int) -> CGImage? {
    // Nothing to do for degenerate images or ones that are already narrow enough.
    if image.width <= 0 || image.height <= 0 || image.width <= maxWidth {
        return image
    }

    let ratio = Double(maxWidth) / Double(image.width)
    let width = maxWidth
    let height = max(1, Int((Double(image.height) * ratio).rounded()))

    guard let context = CGContext(
        data: nil,
        width: width,
        height: height,
        bitsPerComponent: 8,
        bytesPerRow: 0,
        space: CGColorSpaceCreateDeviceRGB(),
        bitmapInfo: CGImageAlphaInfo.premultipliedLast.rawValue)
    else {
        return nil
    }

    context.interpolationQuality = .high
    context.draw(image, in: CGRect(x: 0, y: 0, width: width, height: height))
    return context.makeImage()
}
/// Transcodes the movie at `inputURL` into an MP4 file at `outputURL`.
///
/// Uses the medium-quality export preset with network optimization enabled.
///
/// - Throws: `CameraError.exportFailed` when the export session cannot be created or does
///   not finish in the `.completed` state.
private nonisolated static func exportToMP4(inputURL: URL, outputURL: URL) async throws {
    guard let session = AVAssetExportSession(
        asset: AVAsset(url: inputURL),
        presetName: AVAssetExportPresetMediumQuality)
    else {
        throw CameraError.exportFailed("Failed to create export session")
    }
    session.outputURL = outputURL
    session.outputFileType = .mp4
    session.shouldOptimizeForNetworkUse = true

    // Wait for the export to finish; the outcome is read from the session afterwards.
    await withCheckedContinuation { (cont: CheckedContinuation<Void, Never>) in
        session.exportAsynchronously {
            cont.resume()
        }
    }

    let status = session.status
    if status == .completed { return }

    let reason: String = switch status {
    case .failed:
        session.error?.localizedDescription ?? "export failed"
    case .cancelled:
        "export cancelled"
    default:
        "export did not complete (\(status.rawValue))"
    }
    throw CameraError.exportFailed(reason)
}
}
/// Bridges the photo-capture callback to a checked continuation.
///
/// The continuation is resumed at most once: with the photo's file data, with the capture
/// error, or with a "No photo data" capture failure when no data representation exists.
private final class PhotoCaptureDelegate: NSObject, AVCapturePhotoCaptureDelegate {
    /// Cleared after the first callback so later callbacks are ignored.
    private var cont: CheckedContinuation<Data, Error>?

    init(_ cont: CheckedContinuation<Data, Error>) {
        self.cont = cont
    }

    func photoOutput(
        _ output: AVCapturePhotoOutput,
        didFinishProcessingPhoto photo: AVCapturePhoto,
        error: Error?)
    {
        guard let continuation = self.cont else { return }
        self.cont = nil

        if let error {
            continuation.resume(throwing: error)
        } else if let data = photo.fileDataRepresentation() {
            continuation.resume(returning: data)
        } else {
            continuation.resume(throwing: CameraCaptureService.CameraError.captureFailed("No photo data"))
        }
    }
}
/// Bridges the movie-recording completion callback to a checked continuation.
///
/// The continuation is resumed at most once. Finishing because the maximum duration was
/// reached counts as success; any other error is logged and rethrown.
private final class MovieFileDelegate: NSObject, AVCaptureFileOutputRecordingDelegate {
    /// Cleared after the first callback so later callbacks are ignored.
    private var cont: CheckedContinuation<URL, Error>?
    private let logger: Logger

    init(_ cont: CheckedContinuation<URL, Error>, logger: Logger) {
        self.cont = cont
        self.logger = logger
    }

    func fileOutput(
        _ output: AVCaptureFileOutput,
        didFinishRecordingTo outputFileURL: URL,
        from connections: [AVCaptureConnection],
        error: Error?)
    {
        guard let continuation = self.cont else { return }
        self.cont = nil

        guard let error else {
            continuation.resume(returning: outputFileURL)
            return
        }

        let nsError = error as NSError
        let reachedMaxDuration = nsError.domain == AVFoundationErrorDomain
            && nsError.code == AVError.maximumDurationReached.rawValue
        if reachedMaxDuration {
            // Hitting the duration limit still yields the recorded file.
            continuation.resume(returning: outputFileURL)
            return
        }

        self.logger.error("camera record failed: \(error.localizedDescription, privacy: .public)")
        continuation.resume(throwing: error)
    }
}

View File

@@ -24,6 +24,7 @@ let webChatEnabledKey = "clawdis.webChatEnabled"
let webChatSwiftUIEnabledKey = "clawdis.webChatSwiftUIEnabled"
let webChatPortKey = "clawdis.webChatPort"
let canvasEnabledKey = "clawdis.canvasEnabled"
let cameraEnabledKey = "clawdis.cameraEnabled"
let peekabooBridgeEnabledKey = "clawdis.peekabooBridgeEnabled"
let deepLinkAgentEnabledKey = "clawdis.deepLinkAgentEnabled"
let deepLinkKeyKey = "clawdis.deepLinkKey"

View File

@@ -3,6 +3,8 @@ import Foundation
import OSLog
enum ControlRequestHandler {
private static let cameraCapture = CameraCaptureService()
static func process(
request: Request,
notifier: NotificationManager = NotificationManager(),
@@ -77,6 +79,16 @@ enum ControlRequestHandler {
command: command,
paramsJSON: paramsJSON,
logger: logger)
case let .cameraSnap(facing, maxWidth, quality, outPath):
return await self.handleCameraSnap(facing: facing, maxWidth: maxWidth, quality: quality, outPath: outPath)
case let .cameraClip(facing, durationMs, includeAudio, outPath):
return await self.handleCameraClip(
facing: facing,
durationMs: durationMs,
includeAudio: includeAudio,
outPath: outPath)
}
}
@@ -173,6 +185,10 @@ enum ControlRequestHandler {
UserDefaults.standard.object(forKey: canvasEnabledKey) as? Bool ?? true
}
/// Reads the camera toggle stored under `cameraEnabledKey`.
/// A missing or non-boolean value counts as disabled.
private static func cameraEnabled() -> Bool {
    guard let stored = UserDefaults.standard.object(forKey: cameraEnabledKey) as? Bool else {
        return false
    }
    return stored
}
private static func handleCanvasShow(
session: String,
path: String?,
@@ -254,4 +270,46 @@ enum ControlRequestHandler {
return Response(ok: false, message: error.localizedDescription)
}
}
/// Handles a camera snapshot request: captures a photo and writes the JPEG to disk.
///
/// The file goes to `outPath` when it is non-blank, otherwise to a uniquely named file in
/// the temporary directory.
///
/// - Returns: A successful response whose message is the written file's path, or a failed
///   response carrying the error description (or "Camera disabled by user").
private static func handleCameraSnap(
    facing: CameraFacing?,
    maxWidth: Int?,
    quality: Double?,
    outPath: String?) async -> Response
{
    guard self.cameraEnabled() else {
        return Response(ok: false, message: "Camera disabled by user")
    }

    do {
        let capture = try await self.cameraCapture.snap(facing: facing, maxWidth: maxWidth, quality: quality)

        let destination: URL
        if let outPath, !outPath.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty {
            destination = URL(fileURLWithPath: outPath)
        } else {
            let fileName = "clawdis-camera-snap-\(UUID().uuidString).jpg"
            destination = FileManager.default.temporaryDirectory.appendingPathComponent(fileName)
        }

        try capture.data.write(to: destination, options: [.atomic])
        return Response(ok: true, message: destination.path)
    } catch {
        return Response(ok: false, message: error.localizedDescription)
    }
}
/// Handles a camera clip request by delegating to the capture service.
///
/// - Returns: A successful response whose message is the recorded file's path, or a failed
///   response carrying the error description (or "Camera disabled by user").
private static func handleCameraClip(
    facing: CameraFacing?,
    durationMs: Int?,
    includeAudio: Bool,
    outPath: String?) async -> Response
{
    guard self.cameraEnabled() else {
        return Response(ok: false, message: "Camera disabled by user")
    }

    do {
        let recording = try await self.cameraCapture.clip(
            facing: facing,
            durationMs: durationMs,
            includeAudio: includeAudio,
            outPath: outPath)
        return Response(ok: true, message: recording.path)
    } catch {
        return Response(ok: false, message: error.localizedDescription)
    }
}
}

View File

@@ -9,6 +9,7 @@ struct DebugSettings: View {
@AppStorage(modelCatalogReloadKey) private var modelCatalogReloadBump: Int = 0
@AppStorage(iconOverrideKey) private var iconOverrideRaw: String = IconOverrideSelection.system.rawValue
@AppStorage(canvasEnabledKey) private var canvasEnabled: Bool = true
@AppStorage(cameraEnabledKey) private var cameraEnabled: Bool = false
@AppStorage(deepLinkAgentEnabledKey) private var deepLinkAgentEnabled: Bool = false
@State private var modelsCount: Int?
@State private var modelsLoading = false
@@ -48,6 +49,7 @@ struct DebugSettings: View {
self.pathsSection
self.quickActionsSection
self.canvasSection
self.cameraSection
self.experimentsSection
Spacer(minLength: 0)
@@ -571,6 +573,20 @@ struct DebugSettings: View {
}
}
/// Debug settings group containing the "Allow Camera (agent)" checkbox and a short caption.
/// The checkbox is bound to the `cameraEnabled` app-storage value.
private var cameraSection: some View {
GroupBox("Camera") {
VStack(alignment: .leading, spacing: 10) {
// Toggle bound to the persisted camera setting.
Toggle("Allow Camera (agent)", isOn: self.$cameraEnabled)
.toggleStyle(.checkbox)
.help("When off, camera requests return “Camera disabled by user”.")
// Secondary caption describing what the toggle permits.
Text("Allows Clawdis to capture a photo or short video via the built-in camera.")
.font(.caption)
.foregroundStyle(.secondary)
}
}
}
private var experimentsSection: some View {
GroupBox("Experiments") {
Grid(alignment: .leadingFirstTextBaseline, horizontalSpacing: 14, verticalSpacing: 10) {

View File

@@ -52,6 +52,7 @@ struct ClawdisCLI {
enum Kind {
case generic
case mediaPath
}
}
@@ -91,6 +92,9 @@ struct ClawdisCLI {
case "canvas":
return try self.parseCanvas(args: &args)
case "camera":
return try self.parseCamera(args: &args)
default:
throw CLIError.help
}
@@ -292,6 +296,62 @@ struct ClawdisCLI {
}
}
/// Parses the `camera` subcommands (`snap` and `clip`) into a request.
///
/// Unrecognized arguments are skipped, and option values that fail to parse leave the
/// corresponding option unset.
///
/// - Parameter args: Remaining command-line arguments; consumed while parsing.
/// - Returns: A `.cameraSnap` or `.cameraClip` request with the `.mediaPath` output kind.
/// - Throws: `CLIError.help` when the subcommand is missing or unknown.
private static func parseCamera(args: inout [String]) throws -> ParsedCLIRequest {
    guard let subcommand = args.popFirst() else { throw CLIError.help }

    if subcommand == "snap" {
        var facing: CameraFacing?
        var maxWidth: Int?
        var quality: Double?
        var outPath: String?

        while !args.isEmpty {
            let flag = args.removeFirst()
            if flag == "--facing" {
                if let raw = args.popFirst(), let parsed = CameraFacing(rawValue: raw) { facing = parsed }
            } else if flag == "--max-width" {
                maxWidth = args.popFirst().flatMap(Int.init)
            } else if flag == "--quality" {
                quality = args.popFirst().flatMap(Double.init)
            } else if flag == "--out" {
                outPath = args.popFirst()
            }
            // Anything else is ignored.
        }

        return ParsedCLIRequest(
            request: .cameraSnap(facing: facing, maxWidth: maxWidth, quality: quality, outPath: outPath),
            kind: .mediaPath)
    }

    if subcommand == "clip" {
        var facing: CameraFacing?
        var durationMs: Int?
        var includeAudio = true
        var outPath: String?

        while !args.isEmpty {
            let flag = args.removeFirst()
            if flag == "--facing" {
                if let raw = args.popFirst(), let parsed = CameraFacing(rawValue: raw) { facing = parsed }
            } else if flag == "--duration-ms" {
                durationMs = args.popFirst().flatMap(Int.init)
            } else if flag == "--no-audio" {
                includeAudio = false
            } else if flag == "--out" {
                outPath = args.popFirst()
            }
            // Anything else is ignored.
        }

        return ParsedCLIRequest(
            request: .cameraClip(facing: facing, durationMs: durationMs, includeAudio: includeAudio, outPath: outPath),
            kind: .mediaPath)
    }

    throw CLIError.help
}
private static func parseCanvasPlacement(
args: inout [String],
session: inout String,
@@ -334,6 +394,10 @@ struct ClawdisCLI {
if let message = response.message, !message.isEmpty {
FileHandle.standardOutput.write(Data((message + "\n").utf8))
}
case .mediaPath:
if let message = response.message, !message.isEmpty {
print("MEDIA:\(message)")
}
}
}
@@ -352,6 +416,8 @@ struct ClawdisCLI {
output["payload"] = text
}
}
case .mediaPath:
break
}
let json = try JSONSerialization.data(withJSONObject: output, options: [.prettyPrinted])
@@ -406,6 +472,10 @@ struct ClawdisCLI {
clawdis-mac canvas eval --js <code> [--session <key>]
clawdis-mac canvas snapshot [--out <path>] [--session <key>]
Camera:
clawdis-mac camera snap [--facing <front|back>] [--max-width <px>] [--quality <0-1>] [--out <path>]
clawdis-mac camera clip [--facing <front|back>] [--duration-ms <ms>] [--no-audio] [--out <path>]
Browser (clawd):
clawdis-mac browser status|start|stop|tabs|open|focus|close|screenshot|eval|query|dom|snapshot
@@ -433,6 +503,7 @@ struct ClawdisCLI {
Output:
Default output is text. Use --json for machine-readable output.
In text mode, `browser screenshot` prints MEDIA:<path>.
In text mode, `camera snap` and `camera clip` print MEDIA:<path>.
"""
print(usage)
}

View File

@@ -13,6 +13,11 @@ public enum Capability: String, Codable, CaseIterable, Sendable {
case speechRecognition
}
/// Which camera a capture request should use. Raw values are `"front"` and `"back"`.
public enum CameraFacing: String, Codable, Sendable {
    case front, back
}
// MARK: - Requests
/// Notification interruption level (maps to UNNotificationInterruptionLevel)
@@ -74,6 +79,8 @@ public enum Request: Sendable {
case canvasSnapshot(session: String, outPath: String?)
case nodeList
case nodeInvoke(nodeId: String, command: String, paramsJSON: String?)
case cameraSnap(facing: CameraFacing?, maxWidth: Int?, quality: Double?, outPath: String?)
case cameraClip(facing: CameraFacing?, durationMs: Int?, includeAudio: Bool, outPath: String?)
}
// MARK: - Responses
@@ -104,6 +111,11 @@ extension Request: Codable {
case path
case javaScript
case outPath
case facing
case maxWidth
case quality
case durationMs
case includeAudio
case placement
case nodeId
case nodeCommand
@@ -124,6 +136,8 @@ extension Request: Codable {
case canvasSnapshot
case nodeList
case nodeInvoke
case cameraSnap
case cameraClip
}
public func encode(to encoder: Encoder) throws {
@@ -198,6 +212,20 @@ extension Request: Codable {
try container.encode(nodeId, forKey: .nodeId)
try container.encode(command, forKey: .nodeCommand)
try container.encodeIfPresent(paramsJSON, forKey: .paramsJSON)
case let .cameraSnap(facing, maxWidth, quality, outPath):
try container.encode(Kind.cameraSnap, forKey: .type)
try container.encodeIfPresent(facing, forKey: .facing)
try container.encodeIfPresent(maxWidth, forKey: .maxWidth)
try container.encodeIfPresent(quality, forKey: .quality)
try container.encodeIfPresent(outPath, forKey: .outPath)
case let .cameraClip(facing, durationMs, includeAudio, outPath):
try container.encode(Kind.cameraClip, forKey: .type)
try container.encodeIfPresent(facing, forKey: .facing)
try container.encodeIfPresent(durationMs, forKey: .durationMs)
try container.encode(includeAudio, forKey: .includeAudio)
try container.encodeIfPresent(outPath, forKey: .outPath)
}
}
@@ -274,6 +302,20 @@ extension Request: Codable {
let command = try container.decode(String.self, forKey: .nodeCommand)
let paramsJSON = try container.decodeIfPresent(String.self, forKey: .paramsJSON)
self = .nodeInvoke(nodeId: nodeId, command: command, paramsJSON: paramsJSON)
case .cameraSnap:
let facing = try container.decodeIfPresent(CameraFacing.self, forKey: .facing)
let maxWidth = try container.decodeIfPresent(Int.self, forKey: .maxWidth)
let quality = try container.decodeIfPresent(Double.self, forKey: .quality)
let outPath = try container.decodeIfPresent(String.self, forKey: .outPath)
self = .cameraSnap(facing: facing, maxWidth: maxWidth, quality: quality, outPath: outPath)
case .cameraClip:
    // All clip fields are optional on the wire; absent (or null) keys decode to nil.
    let facing = try container.decodeIfPresent(CameraFacing.self, forKey: .facing)
    let durationMs = try container.decodeIfPresent(Int.self, forKey: .durationMs)
    // Audio defaults to on when the key is missing. `decodeIfPresent` (instead of `try?`)
    // keeps that default but still throws on a type mismatch, so a malformed
    // `includeAudio` value can no longer silently enable audio.
    let includeAudio = try container.decodeIfPresent(Bool.self, forKey: .includeAudio) ?? true
    let outPath = try container.decodeIfPresent(String.self, forKey: .outPath)
    self = .cameraClip(facing: facing, durationMs: durationMs, includeAudio: includeAudio, outPath: outPath)
}
}
}

View File

@@ -0,0 +1,62 @@
import ClawdisIPC
import Foundation
import Testing
/// Codable coverage for the camera-related `Request` cases.
@Suite struct CameraIPCTests {
    /// A fully populated `cameraSnap` request survives a JSON encode/decode round trip.
    @Test func cameraSnapCodableRoundtrip() throws {
        let original = Request.cameraSnap(
            facing: .front,
            maxWidth: 640,
            quality: 0.85,
            outPath: "/tmp/test.jpg")
        let encoded = try JSONEncoder().encode(original)
        let decoded = try JSONDecoder().decode(Request.self, from: encoded)

        guard case let .cameraSnap(facing, maxWidth, quality, outPath) = decoded else {
            Issue.record("expected cameraSnap, got \(decoded)")
            return
        }
        #expect(facing == .front)
        #expect(maxWidth == 640)
        #expect(quality == 0.85)
        #expect(outPath == "/tmp/test.jpg")
    }

    /// A fully populated `cameraClip` request survives a JSON encode/decode round trip.
    @Test func cameraClipCodableRoundtrip() throws {
        let original = Request.cameraClip(
            facing: .back,
            durationMs: 3000,
            includeAudio: false,
            outPath: "/tmp/test.mp4")
        let encoded = try JSONEncoder().encode(original)
        let decoded = try JSONDecoder().decode(Request.self, from: encoded)

        guard case let .cameraClip(facing, durationMs, includeAudio, outPath) = decoded else {
            Issue.record("expected cameraClip, got \(decoded)")
            return
        }
        #expect(facing == .back)
        #expect(durationMs == 3000)
        #expect(!includeAudio)
        #expect(outPath == "/tmp/test.mp4")
    }

    /// When the payload omits `includeAudio`, decoding falls back to `true`.
    @Test func cameraClipDefaultsIncludeAudioToTrueWhenMissing() throws {
        let payload = Data(#"{"type":"cameraClip","durationMs":1234}"#.utf8)
        let decoded = try JSONDecoder().decode(Request.self, from: payload)

        guard case let .cameraClip(_, durationMs, includeAudio, _) = decoded else {
            Issue.record("expected cameraClip, got \(decoded)")
            return
        }
        #expect(durationMs == 1234)
        #expect(includeAudio)
    }
}

View File

@@ -0,0 +1,58 @@
import Foundation
/// Command identifiers for the camera feature.
///
/// Each raw value is the dotted command name used when the command is encoded as a string.
public enum ClawdisCameraCommand: String, Codable, Sendable {
    /// The snap command, spelled `camera.snap`.
    case snap = "camera.snap"

    /// The clip command, spelled `camera.clip`.
    case clip = "camera.clip"
}
/// Camera selection for camera command parameters.
///
/// Encoded as the plain strings `"back"` and `"front"`.
public enum ClawdisCameraFacing: String, Codable, Sendable {
    case back, front
}
/// Image format names accepted in snap parameters.
///
/// Both `"jpg"` and `"jpeg"` are valid and stay distinct cases.
public enum ClawdisCameraImageFormat: String, Codable, Sendable {
    case jpg, jpeg
}
/// Video format names accepted in clip parameters.
public enum ClawdisCameraVideoFormat: String, Codable, Sendable {
    /// The only format declared here, encoded as `"mp4"`.
    case mp4
}
/// Parameters for a camera snap request.
///
/// Every field is optional. This type defines no defaults of its own; what a `nil`
/// field means is decided by the code that handles the request.
public struct ClawdisCameraSnapParams: Codable, Sendable, Equatable {
    /// Camera to capture from.
    public var facing: ClawdisCameraFacing?

    /// Requested maximum width of the result.
    /// NOTE(review): presumably pixels (the CLI help shows `--max-width <px>`) — confirm.
    public var maxWidth: Int?

    /// Requested quality.
    /// NOTE(review): the CLI help advertises a 0–1 range; this type does not enforce it.
    public var quality: Double?

    /// Requested image format.
    public var format: ClawdisCameraImageFormat?

    /// Creates snap parameters; every argument may be omitted and then stays `nil`.
    public init(
        facing: ClawdisCameraFacing? = nil,
        maxWidth: Int? = nil,
        quality: Double? = nil,
        format: ClawdisCameraImageFormat? = nil
    ) {
        self.facing = facing
        self.maxWidth = maxWidth
        self.quality = quality
        self.format = format
    }
}
/// Parameters for a camera clip request.
///
/// Every field is optional. This type defines no defaults of its own; what a `nil`
/// field means is decided by the code that handles the request.
public struct ClawdisCameraClipParams: Codable, Sendable, Equatable {
    /// Camera to record from.
    public var facing: ClawdisCameraFacing?

    /// Requested clip length, in milliseconds as the name indicates.
    public var durationMs: Int?

    /// Whether audio should be included; `nil` leaves the decision to the handler.
    public var includeAudio: Bool?

    /// Requested video format.
    public var format: ClawdisCameraVideoFormat?

    /// Creates clip parameters; every argument may be omitted and then stays `nil`.
    public init(
        facing: ClawdisCameraFacing? = nil,
        durationMs: Int? = nil,
        includeAudio: Bool? = nil,
        format: ClawdisCameraVideoFormat? = nil
    ) {
        self.facing = facing
        self.durationMs = durationMs
        self.includeAudio = includeAudio
        self.format = format
    }
}