import AVFoundation
import ClawdbotKit
import Foundation

/// Captures still photos and short video clips with AVFoundation and returns them
/// base64-encoded, together with basic metadata.
actor CameraController {
    /// Serializable description of a capture device, as produced by `listDevices()`.
    struct CameraDeviceInfo: Codable, Sendable {
        var id: String
        var name: String
        var position: String
        var deviceType: String
    }

    /// Errors thrown by the capture and export paths of this controller.
    enum CameraError: LocalizedError, Sendable {
        case cameraUnavailable
        case microphoneUnavailable
        case permissionDenied(kind: String)
        case invalidParams(String)
        case captureFailed(String)
        case exportFailed(String)

        var errorDescription: String? {
            switch self {
            case .cameraUnavailable:
                "Camera unavailable"
            case .microphoneUnavailable:
                "Microphone unavailable"
            case let .permissionDenied(kind):
                "\(kind) permission denied"
            case let .invalidParams(msg):
                msg
            case let .captureFailed(msg):
                msg
            case let .exportFailed(msg):
                msg
            }
        }
    }

    /// Captures a single photo and returns it as base64-encoded JPEG data.
    ///
    /// Defaults: front camera, `.jpg` format label, max width 1600, quality 0.9, no delay.
    ///
    /// - Parameter params: Requested facing/device, format, max width, quality and delay.
    /// - Returns: The format label, the base64 payload and the transcoded pixel dimensions.
    /// - Throws: `CameraError` for permission/configuration problems, plus any error raised by
    ///   AVFoundation or by `JPEGTranscoder`.
    func snap(params: ClawdbotCameraSnapParams) async throws -> (
        format: String,
        base64: String,
        width: Int,
        height: Int)
    {
        let facing = params.facing ?? .front
        let format = params.format ?? .jpg
        // Default to a reasonable max width to keep bridge payload sizes manageable.
        // If you need the full-res photo, explicitly request a larger maxWidth.
        let maxWidth = params.maxWidth.flatMap { $0 > 0 ? $0 : nil } ?? 1600
        let quality = Self.clampQuality(params.quality)
        let delayMs = max(0, params.delayMs ?? 0)

        try await self.ensureAccess(for: .video)

        let session = AVCaptureSession()
        session.sessionPreset = .photo

        guard let device = Self.pickCamera(facing: facing, deviceId: params.deviceId) else {
            throw CameraError.cameraUnavailable
        }

        let input = try AVCaptureDeviceInput(device: device)
        guard session.canAddInput(input) else {
            throw CameraError.captureFailed("Failed to add camera input")
        }
        session.addInput(input)

        let output = AVCapturePhotoOutput()
        guard session.canAddOutput(output) else {
            throw CameraError.captureFailed("Failed to add photo output")
        }
        session.addOutput(output)
        output.maxPhotoQualityPrioritization = .quality

        session.startRunning()
        defer { session.stopRunning() }
        await Self.warmUpCaptureSession()
        await Self.sleepDelayMs(delayMs)

        // Prefer JPEG when the output offers it; otherwise use the output's default settings.
        let settings: AVCapturePhotoSettings = {
            if output.availablePhotoCodecTypes.contains(.jpeg) {
                return AVCapturePhotoSettings(format: [AVVideoCodecKey: AVVideoCodecType.jpeg])
            }
            return AVCapturePhotoSettings()
        }()
        settings.photoQualityPrioritization = .quality

        // Hold a local reference to the delegate until the continuation has resumed.
        var delegate: PhotoCaptureDelegate?
        let rawData: Data = try await withCheckedThrowingContinuation { cont in
            let d = PhotoCaptureDelegate(cont)
            delegate = d
            output.capturePhoto(with: settings, delegate: d)
        }
        withExtendedLifetime(delegate) {}

        let maxPayloadBytes = 5 * 1024 * 1024
        // Base64 inflates payloads by ~4/3; cap encoded bytes so the payload stays under 5MB (API limit).
        let maxEncodedBytes = (maxPayloadBytes / 4) * 3
        let res = try JPEGTranscoder.transcodeToJPEG(
            imageData: rawData,
            maxWidthPx: maxWidth,
            quality: quality,
            maxBytes: maxEncodedBytes)

        return (
            format: format.rawValue,
            base64: res.data.base64EncodedString(),
            width: res.widthPx,
            height: res.heightPx)
    }

    /// Records a short movie clip and returns it as base64-encoded MP4 data.
    ///
    /// Defaults: front camera, 3000 ms (clamped to 250...60000 ms), audio included, `.mp4` label.
    ///
    /// - Parameter params: Requested facing/device, duration, audio flag and format.
    /// - Returns: The format label, the base64 payload, the clamped requested duration and
    ///   whether an audio input was attached.
    /// - Throws: `CameraError` for permission/configuration/export problems, plus any error
    ///   raised by AVFoundation or while reading the exported file.
    func clip(params: ClawdbotCameraClipParams) async throws -> (
        format: String,
        base64: String,
        durationMs: Int,
        hasAudio: Bool)
    {
        let facing = params.facing ?? .front
        let durationMs = Self.clampDurationMs(params.durationMs)
        let includeAudio = params.includeAudio ?? true
        let format = params.format ?? .mp4

        try await self.ensureAccess(for: .video)
        if includeAudio {
            try await self.ensureAccess(for: .audio)
        }

        let session = AVCaptureSession()
        session.sessionPreset = .high

        guard let camera = Self.pickCamera(facing: facing, deviceId: params.deviceId) else {
            throw CameraError.cameraUnavailable
        }
        let cameraInput = try AVCaptureDeviceInput(device: camera)
        guard session.canAddInput(cameraInput) else {
            throw CameraError.captureFailed("Failed to add camera input")
        }
        session.addInput(cameraInput)

        if includeAudio {
            guard let mic = AVCaptureDevice.default(for: .audio) else {
                throw CameraError.microphoneUnavailable
            }
            let micInput = try AVCaptureDeviceInput(device: mic)
            guard session.canAddInput(micInput) else {
                throw CameraError.captureFailed("Failed to add microphone input")
            }
            session.addInput(micInput)
        }

        let output = AVCaptureMovieFileOutput()
        guard session.canAddOutput(output) else {
            throw CameraError.captureFailed("Failed to add movie output")
        }
        session.addOutput(output)
        // Recording is never stopped explicitly; it ends when this duration limit is hit.
        output.maxRecordedDuration = CMTime(value: Int64(durationMs), timescale: 1000)

        session.startRunning()
        defer { session.stopRunning() }
        await Self.warmUpCaptureSession()

        let movURL = FileManager.default.temporaryDirectory
            .appendingPathComponent("clawdbot-camera-\(UUID().uuidString).mov")
        let mp4URL = FileManager.default.temporaryDirectory
            .appendingPathComponent("clawdbot-camera-\(UUID().uuidString).mp4")

        // Best-effort cleanup of both temporary files on every exit path.
        defer {
            try? FileManager.default.removeItem(at: movURL)
            try? FileManager.default.removeItem(at: mp4URL)
        }

        // Hold a local reference to the delegate until the continuation has resumed.
        var delegate: MovieFileDelegate?
        let recordedURL: URL = try await withCheckedThrowingContinuation { cont in
            let d = MovieFileDelegate(cont)
            delegate = d
            output.startRecording(to: movURL, recordingDelegate: d)
        }
        withExtendedLifetime(delegate) {}

        // Transcode .mov -> .mp4 for easier downstream handling.
        try await Self.exportToMP4(inputURL: recordedURL, outputURL: mp4URL)

        let data = try Data(contentsOf: mp4URL)
        return (
            format: format.rawValue,
            base64: data.base64EncodedString(),
            durationMs: durationMs,
            hasAudio: includeAudio)
    }

    /// Lists the built-in wide-angle video capture devices at any position.
    func listDevices() -> [CameraDeviceInfo] {
        let types: [AVCaptureDevice.DeviceType] = [
            .builtInWideAngleCamera,
        ]
        let session = AVCaptureDevice.DiscoverySession(
            deviceTypes: types,
            mediaType: .video,
            position: .unspecified)

        return session.devices.map { device in
            CameraDeviceInfo(
                id: device.uniqueID,
                name: device.localizedName,
                position: Self.positionLabel(device.position),
                deviceType: device.deviceType.rawValue)
        }
    }

    /// Ensures capture authorization for `mediaType`, prompting when the status is undetermined.
    ///
    /// - Throws: `CameraError.permissionDenied` when access is denied, restricted, refused at
    ///   the prompt, or the status is not one this code knows about.
    private func ensureAccess(for mediaType: AVMediaType) async throws {
        // Label used in the error; anything that is not `.video` is reported as "Microphone".
        let kind = mediaType == .video ? "Camera" : "Microphone"

        switch AVCaptureDevice.authorizationStatus(for: mediaType) {
        case .authorized:
            return
        case .notDetermined:
            let ok = await withCheckedContinuation(isolation: nil) { cont in
                AVCaptureDevice.requestAccess(for: mediaType) { granted in
                    cont.resume(returning: granted)
                }
            }
            if !ok {
                throw CameraError.permissionDenied(kind: kind)
            }
        case .denied, .restricted:
            throw CameraError.permissionDenied(kind: kind)
        @unknown default:
            throw CameraError.permissionDenied(kind: kind)
        }
    }

    /// Chooses the capture device to use.
    ///
    /// Order of preference: a video device whose `uniqueID` equals a non-empty `deviceId`, then
    /// the built-in wide-angle camera for `facing`, then the default video device.
    private nonisolated static func pickCamera(
        facing: ClawdbotCameraFacing,
        deviceId: String?) -> AVCaptureDevice?
    {
        if let deviceId, !deviceId.isEmpty {
            if let match = AVCaptureDevice.devices(for: .video).first(where: { $0.uniqueID == deviceId }) {
                return match
            }
        }
        let position: AVCaptureDevice.Position = (facing == .front) ? .front : .back
        if let device = AVCaptureDevice.default(.builtInWideAngleCamera, for: .video, position: position) {
            return device
        }
        // Fall back to any default camera (e.g. simulator / unusual device configurations).
        return AVCaptureDevice.default(for: .video)
    }

    /// Maps a device position to the string label reported in `CameraDeviceInfo`.
    private nonisolated static func positionLabel(_ position: AVCaptureDevice.Position) -> String {
        switch position {
        case .front: "front"
        case .back: "back"
        default: "unspecified"
        }
    }

    /// Clamps a quality value to `0.05...1.0`, defaulting to `0.9` when `nil`.
    nonisolated static func clampQuality(_ quality: Double?) -> Double {
        let q = quality ?? 0.9
        return min(1.0, max(0.05, q))
    }

    /// Clamps a clip duration to `250...60000` ms, defaulting to `3000` ms when `nil`.
    nonisolated static func clampDurationMs(_ ms: Int?) -> Int {
        let v = ms ?? 3000
        // Keep clips short by default; avoid huge base64 payloads on the bridge.
        return min(60000, max(250, v))
    }

    /// Exports the asset at `inputURL` to an MP4 file at `outputURL` using the medium-quality preset.
    ///
    /// - Throws: `CameraError.exportFailed` when the export session cannot be created or the
    ///   export does not complete.
    private nonisolated static func exportToMP4(inputURL: URL, outputURL: URL) async throws {
        let asset = AVURLAsset(url: inputURL)
        guard let exporter = AVAssetExportSession(asset: asset, presetName: AVAssetExportPresetMediumQuality) else {
            throw CameraError.exportFailed("Failed to create export session")
        }
        exporter.shouldOptimizeForNetworkUse = true

        if #available(iOS 18.0, tvOS 18.0, visionOS 2.0, *) {
            do {
                try await exporter.export(to: outputURL, as: .mp4)
                return
            } catch {
                throw CameraError.exportFailed(error.localizedDescription)
            }
        } else {
            exporter.outputURL = outputURL
            exporter.outputFileType = .mp4

            // The completion handler carries no result; the outcome is read from `status` below.
            try await withCheckedThrowingContinuation(isolation: nil) { (cont: CheckedContinuation<Void, Error>) in
                exporter.exportAsynchronously {
                    cont.resume(returning: ())
                }
            }

            switch exporter.status {
            case .completed:
                return
            case .failed:
                throw CameraError.exportFailed(exporter.error?.localizedDescription ?? "export failed")
            case .cancelled:
                throw CameraError.exportFailed("export cancelled")
            default:
                throw CameraError.exportFailed("export did not complete")
            }
        }
    }

    /// Pauses briefly after the capture session has been started.
    private nonisolated static func warmUpCaptureSession() async {
        // A short delay after `startRunning()` significantly reduces "blank first frame" captures on some devices.
        try? await Task.sleep(nanoseconds: 150_000_000) // 150ms
    }

    /// Sleeps for `delayMs` milliseconds, capped at 10 seconds; returns immediately when `delayMs <= 0`.
    private nonisolated static func sleepDelayMs(_ delayMs: Int) async {
        guard delayMs > 0 else { return }
        let maxDelayMs = 10 * 1000
        let ns = UInt64(min(delayMs, maxDelayMs)) * UInt64(NSEC_PER_MSEC)
        try? await Task.sleep(nanoseconds: ns)
    }
}

/// Bridges `AVCapturePhotoCaptureDelegate` callbacks to a checked continuation that yields the
/// captured photo's file data. The continuation is resumed at most once.
private final class PhotoCaptureDelegate: NSObject, AVCapturePhotoCaptureDelegate {
    private let continuation: CheckedContinuation<Data, Error>
    private var didResume = false

    init(_ continuation: CheckedContinuation<Data, Error>) {
        self.continuation = continuation
    }

    /// Resumes with the photo's file data, or throws when the capture failed or produced no bytes.
    func photoOutput(
        _ output: AVCapturePhotoOutput,
        didFinishProcessingPhoto photo: AVCapturePhoto,
        error: Error?)
    {
        guard !self.didResume else { return }
        self.didResume = true

        if let error {
            self.continuation.resume(throwing: error)
            return
        }
        guard let data = photo.fileDataRepresentation() else {
            self.continuation.resume(
                throwing: NSError(domain: "Camera", code: 1, userInfo: [
                    NSLocalizedDescriptionKey: "photo data missing",
                ]))
            return
        }
        if data.isEmpty {
            self.continuation.resume(
                throwing: NSError(domain: "Camera", code: 2, userInfo: [
                    NSLocalizedDescriptionKey: "photo data empty",
                ]))
            return
        }
        self.continuation.resume(returning: data)
    }

    /// Resumes with an error if the capture request ends before any photo was delivered.
    func photoOutput(
        _ output: AVCapturePhotoOutput,
        didFinishCaptureFor resolvedSettings: AVCaptureResolvedPhotoSettings,
        error: Error?)
    {
        guard !self.didResume else { return }
        self.didResume = true

        if let error {
            self.continuation.resume(throwing: error)
            return
        }
        // AVFoundation documents this as the last callback for a capture request. Reaching it
        // without having resumed means no photo arrived; fail instead of suspending forever.
        self.continuation.resume(
            throwing: NSError(domain: "Camera", code: 3, userInfo: [
                NSLocalizedDescriptionKey: "photo capture finished without data",
            ]))
    }
}

/// Bridges `AVCaptureFileOutputRecordingDelegate` callbacks to a checked continuation that
/// yields the URL of the recorded movie file. The continuation is resumed at most once.
private final class MovieFileDelegate: NSObject, AVCaptureFileOutputRecordingDelegate {
    private let continuation: CheckedContinuation<URL, Error>
    private var didResume = false

    init(_ continuation: CheckedContinuation<URL, Error>) {
        self.continuation = continuation
    }

    /// Resumes with the recorded file URL; "maximum duration reached" counts as success.
    func fileOutput(
        _ output: AVCaptureFileOutput,
        didFinishRecordingTo outputFileURL: URL,
        from connections: [AVCaptureConnection],
        error: Error?)
    {
        guard !self.didResume else { return }
        self.didResume = true

        if let error {
            let ns = error as NSError
            // Hitting the configured maximum duration is reported as an error, but the file is
            // still treated as a finished recording.
            if ns.domain == AVFoundationErrorDomain,
               ns.code == AVError.maximumDurationReached.rawValue
            {
                self.continuation.resume(returning: outputFileURL)
                return
            }
            self.continuation.resume(throwing: error)
            return
        }
        self.continuation.resume(returning: outputFileURL)
    }
}