Merge origin/main

This commit is contained in:
Peter Steinberger
2025-12-14 00:52:40 +00:00
34 changed files with 1862 additions and 81 deletions

View File

@@ -13,6 +13,7 @@ final class BridgeConnectionController: ObservableObject {
private weak var appModel: NodeAppModel?
private var cancellables = Set<AnyCancellable>()
private var didAutoConnect = false
private var seenStableIDs = Set<String>()
init(appModel: NodeAppModel) {
self.appModel = appModel
@@ -23,6 +24,7 @@ final class BridgeConnectionController: ObservableObject {
.sink { [weak self] newValue in
guard let self else { return }
self.bridges = newValue
self.updateLastDiscoveredBridge(from: newValue)
self.maybeAutoConnect()
}
.store(in: &self.cancellables)
@@ -50,9 +52,9 @@ final class BridgeConnectionController: ObservableObject {
guard appModel.bridgeServerName == nil else { return }
let defaults = UserDefaults.standard
let preferredStableID = defaults.string(forKey: "bridge.preferredStableID")?
let targetStableID = defaults.string(forKey: "bridge.lastDiscoveredStableID")?
.trimmingCharacters(in: .whitespacesAndNewlines) ?? ""
guard !preferredStableID.isEmpty else { return }
guard !targetStableID.isEmpty else { return }
let instanceId = defaults.string(forKey: "node.instanceId")?
.trimmingCharacters(in: .whitespacesAndNewlines) ?? ""
@@ -64,12 +66,20 @@ final class BridgeConnectionController: ObservableObject {
.trimmingCharacters(in: .whitespacesAndNewlines) ?? ""
guard !token.isEmpty else { return }
guard let target = self.bridges.first(where: { $0.stableID == preferredStableID }) else { return }
guard let target = self.bridges.first(where: { $0.stableID == targetStableID }) else { return }
self.didAutoConnect = true
appModel.connectToBridge(endpoint: target.endpoint, hello: self.makeHello(token: token))
}
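/// Persists the most recently discovered bridge so auto-connect can target it on the next launch.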
private func updateLastDiscoveredBridge(from bridges: [BridgeDiscoveryModel.DiscoveredBridge]) {
let newlyDiscovered = bridges.filter { self.seenStableIDs.insert($0.stableID).inserted }
guard let last = newlyDiscovered.last else { return }
UserDefaults.standard.set(last.stableID, forKey: "bridge.lastDiscoveredStableID")
BridgeSettingsStore.saveLastDiscoveredBridgeStableID(last.stableID)
}
private func makeHello(token: String) -> BridgeHello {
let defaults = UserDefaults.standard
let nodeId = defaults.string(forKey: "node.instanceId") ?? "ios-node"

View File

@@ -6,13 +6,16 @@ enum BridgeSettingsStore {
private static let instanceIdDefaultsKey = "node.instanceId"
private static let preferredBridgeStableIDDefaultsKey = "bridge.preferredStableID"
private static let lastDiscoveredBridgeStableIDDefaultsKey = "bridge.lastDiscoveredStableID"
private static let instanceIdAccount = "instanceId"
private static let preferredBridgeStableIDAccount = "preferredStableID"
private static let lastDiscoveredBridgeStableIDAccount = "lastDiscoveredStableID"
static func bootstrapPersistence() {
self.ensureStableInstanceID()
self.ensurePreferredBridgeStableID()
self.ensureLastDiscoveredBridgeStableID()
}
static func loadStableInstanceID() -> String? {
@@ -36,6 +39,18 @@ enum BridgeSettingsStore {
account: self.preferredBridgeStableIDAccount)
}
static func loadLastDiscoveredBridgeStableID() -> String? {
KeychainStore.loadString(service: self.bridgeService, account: self.lastDiscoveredBridgeStableIDAccount)?
.trimmingCharacters(in: .whitespacesAndNewlines)
}
static func saveLastDiscoveredBridgeStableID(_ stableID: String) {
_ = KeychainStore.saveString(
stableID,
service: self.bridgeService,
account: self.lastDiscoveredBridgeStableIDAccount)
}
private static func ensureStableInstanceID() {
let defaults = UserDefaults.standard
@@ -76,4 +91,22 @@ enum BridgeSettingsStore {
defaults.set(stored, forKey: self.preferredBridgeStableIDDefaultsKey)
}
}
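/// Keeps the last-discovered bridge ID mirrored between UserDefaults (fast reads) and the Keychain (survives reinstalls).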
private static func ensureLastDiscoveredBridgeStableID() {
let defaults = UserDefaults.standard
if let existing = defaults.string(forKey: self.lastDiscoveredBridgeStableIDDefaultsKey)?
.trimmingCharacters(in: .whitespacesAndNewlines),
!existing.isEmpty
{
if self.loadLastDiscoveredBridgeStableID() == nil {
self.saveLastDiscoveredBridgeStableID(existing)
}
return
}
if let stored = self.loadLastDiscoveredBridgeStableID(), !stored.isEmpty {
defaults.set(stored, forKey: self.lastDiscoveredBridgeStableIDDefaultsKey)
}
}
}

View File

@@ -0,0 +1,319 @@
import AVFoundation
import ClawdisKit
import Foundation
import UIKit
actor CameraController {
enum CameraError: LocalizedError, Sendable {
case cameraUnavailable
case microphoneUnavailable
case permissionDenied(kind: String)
case invalidParams(String)
case captureFailed(String)
case exportFailed(String)
var errorDescription: String? {
switch self {
case .cameraUnavailable:
"Camera unavailable"
case .microphoneUnavailable:
"Microphone unavailable"
case let .permissionDenied(kind):
"\(kind) permission denied"
case let .invalidParams(msg):
msg
case let .captureFailed(msg):
msg
case let .exportFailed(msg):
msg
}
}
}
func snap(params: ClawdisCameraSnapParams) async throws -> (
format: String,
base64: String,
width: Int,
height: Int)
{
let facing = params.facing ?? .front
let maxWidth = params.maxWidth.flatMap { $0 > 0 ? $0 : nil }
let quality = Self.clampQuality(params.quality)
try await self.ensureAccess(for: .video)
let session = AVCaptureSession()
session.sessionPreset = .photo
guard let device = Self.pickCamera(facing: facing) else {
throw CameraError.cameraUnavailable
}
let input = try AVCaptureDeviceInput(device: device)
guard session.canAddInput(input) else {
throw CameraError.captureFailed("Failed to add camera input")
}
session.addInput(input)
let output = AVCapturePhotoOutput()
guard session.canAddOutput(output) else {
throw CameraError.captureFailed("Failed to add photo output")
}
session.addOutput(output)
output.maxPhotoQualityPrioritization = .quality
session.startRunning()
defer { session.stopRunning() }
let settings: AVCapturePhotoSettings = {
if output.availablePhotoCodecTypes.contains(.jpeg) {
return AVCapturePhotoSettings(format: [AVVideoCodecKey: AVVideoCodecType.jpeg])
}
return AVCapturePhotoSettings()
}()
settings.photoQualityPrioritization = .quality
let rawData: Data = try await withCheckedThrowingContinuation { cont in
output.capturePhoto(with: settings, delegate: PhotoCaptureDelegate(cont))
}
let (finalData, size) = try Self.reencodeJPEG(
imageData: rawData,
maxWidth: maxWidth,
quality: quality)
return (
format: "jpg",
base64: finalData.base64EncodedString(),
width: Int(size.width.rounded()),
height: Int(size.height.rounded()))
}
func clip(params: ClawdisCameraClipParams) async throws -> (
format: String,
base64: String,
durationMs: Int,
hasAudio: Bool)
{
let facing = params.facing ?? .front
let durationMs = Self.clampDurationMs(params.durationMs)
let includeAudio = params.includeAudio ?? true
try await self.ensureAccess(for: .video)
if includeAudio {
try await self.ensureAccess(for: .audio)
}
let session = AVCaptureSession()
session.sessionPreset = .high
guard let camera = Self.pickCamera(facing: facing) else {
throw CameraError.cameraUnavailable
}
let cameraInput = try AVCaptureDeviceInput(device: camera)
guard session.canAddInput(cameraInput) else {
throw CameraError.captureFailed("Failed to add camera input")
}
session.addInput(cameraInput)
if includeAudio {
guard let mic = AVCaptureDevice.default(for: .audio) else {
throw CameraError.microphoneUnavailable
}
let micInput = try AVCaptureDeviceInput(device: mic)
if session.canAddInput(micInput) {
session.addInput(micInput)
} else {
throw CameraError.captureFailed("Failed to add microphone input")
}
}
let output = AVCaptureMovieFileOutput()
guard session.canAddOutput(output) else {
throw CameraError.captureFailed("Failed to add movie output")
}
session.addOutput(output)
output.maxRecordedDuration = CMTime(value: Int64(durationMs), timescale: 1000)
session.startRunning()
defer { session.stopRunning() }
let movURL = FileManager.default.temporaryDirectory
.appendingPathComponent("clawdis-camera-\(UUID().uuidString).mov")
let mp4URL = FileManager.default.temporaryDirectory
.appendingPathComponent("clawdis-camera-\(UUID().uuidString).mp4")
defer {
try? FileManager.default.removeItem(at: movURL)
try? FileManager.default.removeItem(at: mp4URL)
}
let recordedURL: URL = try await withCheckedThrowingContinuation { cont in
let delegate = MovieFileDelegate(cont)
output.startRecording(to: movURL, recordingDelegate: delegate)
}
// Transcode .mov -> .mp4 for easier downstream handling.
try await Self.exportToMP4(inputURL: recordedURL, outputURL: mp4URL)
let data = try Data(contentsOf: mp4URL)
return (format: "mp4", base64: data.base64EncodedString(), durationMs: durationMs, hasAudio: includeAudio)
}
private func ensureAccess(for mediaType: AVMediaType) async throws {
let status = AVCaptureDevice.authorizationStatus(for: mediaType)
switch status {
case .authorized:
return
case .notDetermined:
let ok = await withCheckedContinuation(isolation: nil) { cont in
AVCaptureDevice.requestAccess(for: mediaType) { granted in
cont.resume(returning: granted)
}
}
if !ok {
throw CameraError.permissionDenied(kind: mediaType == .video ? "Camera" : "Microphone")
}
case .denied, .restricted:
throw CameraError.permissionDenied(kind: mediaType == .video ? "Camera" : "Microphone")
@unknown default:
throw CameraError.permissionDenied(kind: mediaType == .video ? "Camera" : "Microphone")
}
}
private nonisolated static func pickCamera(facing: ClawdisCameraFacing) -> AVCaptureDevice? {
let position: AVCaptureDevice.Position = (facing == .front) ? .front : .back
return AVCaptureDevice.default(.builtInWideAngleCamera, for: .video, position: position)
}
private nonisolated static func clampQuality(_ quality: Double?) -> Double {
let q = quality ?? 0.9
return min(1.0, max(0.05, q))
}
private nonisolated static func clampDurationMs(_ ms: Int?) -> Int {
let v = ms ?? 3000
// Keep clips short by default; avoid huge base64 payloads on the bridge.
return min(15000, max(250, v))
}
private nonisolated static func reencodeJPEG(
imageData: Data,
maxWidth: Int?,
quality: Double) throws -> (data: Data, size: CGSize)
{
guard let image = UIImage(data: imageData) else {
throw CameraError.captureFailed("Failed to decode captured image")
}
let finalImage: UIImage = if let maxWidth, maxWidth > 0 {
Self.downscale(image: image, maxWidth: CGFloat(maxWidth))
} else {
image
}
guard let out = finalImage.jpegData(compressionQuality: quality) else {
throw CameraError.captureFailed("Failed to encode JPEG")
}
return (out, finalImage.size)
}
private nonisolated static func downscale(image: UIImage, maxWidth: CGFloat) -> UIImage {
let w = image.size.width
let h = image.size.height
guard w > 0, h > 0 else { return image }
guard w > maxWidth else { return image }
let scale = maxWidth / w
let target = CGSize(width: maxWidth, height: max(1, h * scale))
let format = UIGraphicsImageRendererFormat.default()
format.opaque = false
let renderer = UIGraphicsImageRenderer(size: target, format: format)
return renderer.image { _ in
image.draw(in: CGRect(origin: .zero, size: target))
}
}
private nonisolated static func exportToMP4(inputURL: URL, outputURL: URL) async throws {
let asset = AVAsset(url: inputURL)
guard let exporter = AVAssetExportSession(asset: asset, presetName: AVAssetExportPresetHighestQuality) else {
throw CameraError.exportFailed("Failed to create export session")
}
exporter.outputURL = outputURL
exporter.outputFileType = .mp4
exporter.shouldOptimizeForNetworkUse = true
try await withCheckedThrowingContinuation(isolation: nil) { cont in
exporter.exportAsynchronously {
switch exporter.status {
case .completed:
cont.resume(returning: ())
case .failed:
cont.resume(throwing: exporter.error ?? CameraError.exportFailed("Export failed"))
case .cancelled:
cont.resume(throwing: CameraError.exportFailed("Export cancelled"))
default:
cont.resume(throwing: CameraError.exportFailed("Export did not complete"))
}
}
}
}
}
private final class PhotoCaptureDelegate: NSObject, AVCapturePhotoCaptureDelegate {
private let continuation: CheckedContinuation<Data, Error>
private var didResume = false
init(_ continuation: CheckedContinuation<Data, Error>) {
self.continuation = continuation
}
func photoOutput(
_ output: AVCapturePhotoOutput,
didFinishProcessingPhoto photo: AVCapturePhoto,
error: Error?)
{
guard !self.didResume else { return }
self.didResume = true
if let error {
self.continuation.resume(throwing: error)
return
}
guard let data = photo.fileDataRepresentation() else {
self.continuation.resume(
throwing: NSError(domain: "Camera", code: 1, userInfo: [
NSLocalizedDescriptionKey: "photo data missing",
]))
return
}
self.continuation.resume(returning: data)
}
}
private final class MovieFileDelegate: NSObject, AVCaptureFileOutputRecordingDelegate {
private let continuation: CheckedContinuation<URL, Error>
private var didResume = false
init(_ continuation: CheckedContinuation<URL, Error>) {
self.continuation = continuation
}
func fileOutput(
_ output: AVCaptureFileOutput,
didFinishRecordingTo outputFileURL: URL,
from connections: [AVCaptureConnection],
error: Error?)
{
guard !self.didResume else { return }
self.didResume = true
if let error {
let ns = error as NSError
// maxRecordedDuration stops the recording with this error; treat it as a successful stop.
if ns.domain == AVFoundationErrorDomain,
ns.code == AVError.maximumDurationReached.rawValue
{
self.continuation.resume(returning: outputFileURL)
return
}
self.continuation.resume(throwing: error)
return
}
self.continuation.resume(returning: outputFileURL)
}
}

View File

@@ -26,6 +26,8 @@
</array>
<key>NSLocalNetworkUsageDescription</key>
<string>Clawdis discovers and connects to your Clawdis bridge on the local network.</string>
<key>NSCameraUsageDescription</key>
<string>Clawdis can capture photos or short video clips when requested via the bridge.</string>
<key>NSMicrophoneUsageDescription</key>
<string>Clawdis needs microphone access for voice wake.</string>
<key>NSSpeechRecognitionUsageDescription</key>

View File

@@ -6,6 +6,7 @@ import SwiftUI
final class NodeAppModel: ObservableObject {
@Published var isBackgrounded: Bool = false
let screen = ScreenController()
let camera = CameraController()
@Published var bridgeStatusText: String = "Not connected"
@Published var bridgeServerName: String?
@Published var bridgeRemoteAddress: String?
@@ -182,13 +183,22 @@ final class NodeAppModel: ObservableObject {
}
private func handleInvoke(_ req: BridgeInvokeRequest) async -> BridgeInvokeResponse {
if req.command.hasPrefix("screen."), self.isBackgrounded {
if req.command.hasPrefix("screen.") || req.command.hasPrefix("camera."), self.isBackgrounded {
return BridgeInvokeResponse(
id: req.id,
ok: false,
error: ClawdisNodeError(
code: .backgroundUnavailable,
message: "NODE_BACKGROUND_UNAVAILABLE: screen commands require foreground"))
message: "NODE_BACKGROUND_UNAVAILABLE: screen/camera commands require foreground"))
}
if req.command.hasPrefix("camera."), !self.isCameraEnabled() {
return BridgeInvokeResponse(
id: req.id,
ok: false,
error: ClawdisNodeError(
code: .unavailable,
message: "CAMERA_DISABLED: enable Camera in iOS Settings → Camera → Allow Camera"))
}
do {
@@ -222,6 +232,46 @@ final class NodeAppModel: ObservableObject {
let payload = try Self.encodePayload(["format": "png", "base64": base64])
return BridgeInvokeResponse(id: req.id, ok: true, payloadJSON: payload)
case ClawdisCameraCommand.snap.rawValue:
let params = (try? Self.decodeParams(ClawdisCameraSnapParams.self, from: req.paramsJSON)) ??
ClawdisCameraSnapParams()
let res = try await self.camera.snap(params: params)
struct Payload: Codable {
var format: String
var base64: String
var width: Int
var height: Int
}
let payload = try Self.encodePayload(Payload(
format: res.format,
base64: res.base64,
width: res.width,
height: res.height))
return BridgeInvokeResponse(id: req.id, ok: true, payloadJSON: payload)
case ClawdisCameraCommand.clip.rawValue:
let params = (try? Self.decodeParams(ClawdisCameraClipParams.self, from: req.paramsJSON)) ??
ClawdisCameraClipParams()
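// Hand off the microphone: voice wake's audio tap must stop before the clip records audio; the defer resumes it.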
let suspended = (params.includeAudio ?? true) ? self.voiceWake.suspendForExternalAudioCapture() : false
defer { self.voiceWake.resumeAfterExternalAudioCapture(wasSuspended: suspended) }
let res = try await self.camera.clip(params: params)
struct Payload: Codable {
var format: String
var base64: String
var durationMs: Int
var hasAudio: Bool
}
let payload = try Self.encodePayload(Payload(
format: res.format,
base64: res.base64,
durationMs: res.durationMs,
hasAudio: res.hasAudio))
return BridgeInvokeResponse(id: req.id, ok: true, payloadJSON: payload)
default:
return BridgeInvokeResponse(
id: req.id,
@@ -254,4 +304,10 @@ final class NodeAppModel: ObservableObject {
}
return json
}
private func isCameraEnabled() -> Bool {
// Default-on: if the key doesn't exist yet, treat it as enabled.
if UserDefaults.standard.object(forKey: "camera.enabled") == nil { return true }
return UserDefaults.standard.bool(forKey: "camera.enabled")
}
}

View File

@@ -2,6 +2,7 @@ import SwiftUI
struct RootTabs: View {
@EnvironmentObject private var appModel: NodeAppModel
@State private var isConnectingPulse: Bool = false
var body: some View {
TabView {
@@ -27,12 +28,18 @@ struct RootTabs: View {
radius: self.settingsIndicatorGlowRadius,
x: 0,
y: 0)
.scaleEffect(self.settingsIndicatorScale)
.opacity(self.settingsIndicatorOpacity)
.offset(x: 7, y: -2)
}
Text("Settings")
}
}
}
.onAppear { self.updateConnectingPulse(for: self.bridgeIndicatorState) }
.onChange(of: self.bridgeIndicatorState) { _, newValue in
self.updateConnectingPulse(for: newValue)
}
}
private enum BridgeIndicatorState {
@@ -74,9 +81,31 @@ struct RootTabs: View {
case .connected:
6
case .connecting:
4
self.isConnectingPulse ? 6 : 3
case .disconnected:
0
}
}
private var settingsIndicatorScale: CGFloat {
guard self.bridgeIndicatorState == .connecting else { return 1 }
return self.isConnectingPulse ? 1.12 : 0.96
}
private var settingsIndicatorOpacity: Double {
guard self.bridgeIndicatorState == .connecting else { return 1 }
return self.isConnectingPulse ? 1.0 : 0.75
}
private func updateConnectingPulse(for state: BridgeIndicatorState) {
guard state == .connecting else {
withAnimation(.easeOut(duration: 0.2)) { self.isConnectingPulse = false }
return
}
guard !self.isConnectingPulse else { return }
withAnimation(.easeInOut(duration: 0.9).repeatForever(autoreverses: true)) {
self.isConnectingPulse = true
}
}
}

View File

@@ -19,6 +19,7 @@ struct SettingsTab: View {
@AppStorage("voiceWake.enabled") private var voiceWakeEnabled: Bool = false
@AppStorage("camera.enabled") private var cameraEnabled: Bool = true
@AppStorage("bridge.preferredStableID") private var preferredBridgeStableID: String = ""
@AppStorage("bridge.lastDiscoveredStableID") private var lastDiscoveredBridgeStableID: String = ""
@StateObject private var connectStatus = ConnectStatusStore()
@State private var connectingBridgeID: String?
@State private var localIPAddress: String?
@@ -207,6 +208,8 @@ struct SettingsTab: View {
self.connectingBridgeID = bridge.id
self.preferredBridgeStableID = bridge.stableID
BridgeSettingsStore.savePreferredBridgeStableID(bridge.stableID)
self.lastDiscoveredBridgeStableID = bridge.stableID
BridgeSettingsStore.saveLastDiscoveredBridgeStableID(bridge.stableID)
defer { self.connectingBridgeID = nil }
do {

View File

@@ -205,6 +205,37 @@ final class VoiceWakeManager: NSObject, ObservableObject {
try? AVAudioSession.sharedInstance().setActive(false, options: .notifyOthersOnDeactivation)
}
/// Temporarily releases the microphone so other subsystems (e.g. camera video capture) can record audio.
/// Returns `true` when listening was active and was suspended.
func suspendForExternalAudioCapture() -> Bool {
guard self.isEnabled, self.isListening else { return false }
self.isListening = false
self.statusText = "Paused"
self.tapDrainTask?.cancel()
self.tapDrainTask = nil
self.tapQueue?.clear()
self.tapQueue = nil
self.recognitionTask?.cancel()
self.recognitionTask = nil
self.recognitionRequest = nil
if self.audioEngine.isRunning {
self.audioEngine.stop()
self.audioEngine.inputNode.removeTap(onBus: 0)
}
try? AVAudioSession.sharedInstance().setActive(false, options: .notifyOthersOnDeactivation)
return true
}
func resumeAfterExternalAudioCapture(wasSuspended: Bool) {
guard wasSuspended else { return }
Task { await self.start() }
}
private func startRecognition() throws {
self.recognitionTask?.cancel()
self.recognitionTask = nil

View File

@@ -54,5 +54,6 @@ targets:
NSLocalNetworkUsageDescription: Clawdis discovers and connects to your Clawdis bridge on the local network.
NSBonjourServices:
- _clawdis-bridge._tcp
NSCameraUsageDescription: Clawdis can capture photos or short video clips when requested via the bridge.
NSMicrophoneUsageDescription: Clawdis needs microphone access for voice wake.
NSSpeechRecognitionUsageDescription: Clawdis uses on-device speech recognition for voice wake.

View File

@@ -0,0 +1,341 @@
import AVFoundation
import ClawdisIPC
import CoreGraphics
import Foundation
import ImageIO
import OSLog
import UniformTypeIdentifiers
actor CameraCaptureService {
enum CameraError: LocalizedError, Sendable {
case cameraUnavailable
case microphoneUnavailable
case permissionDenied(kind: String)
case captureFailed(String)
case exportFailed(String)
var errorDescription: String? {
switch self {
case .cameraUnavailable:
"Camera unavailable"
case .microphoneUnavailable:
"Microphone unavailable"
case let .permissionDenied(kind):
"\(kind) permission denied"
case let .captureFailed(msg):
msg
case let .exportFailed(msg):
msg
}
}
}
private let logger = Logger(subsystem: "com.steipete.clawdis", category: "camera")
func snap(facing: CameraFacing?, maxWidth: Int?, quality: Double?) async throws -> (data: Data, size: CGSize) {
let facing = facing ?? .front
let maxWidth = maxWidth.flatMap { $0 > 0 ? $0 : nil }
let quality = Self.clampQuality(quality)
try await self.ensureAccess(for: .video)
let session = AVCaptureSession()
session.sessionPreset = .photo
guard let device = Self.pickCamera(facing: facing) else {
throw CameraError.cameraUnavailable
}
let input = try AVCaptureDeviceInput(device: device)
guard session.canAddInput(input) else {
throw CameraError.captureFailed("Failed to add camera input")
}
session.addInput(input)
let output = AVCapturePhotoOutput()
guard session.canAddOutput(output) else {
throw CameraError.captureFailed("Failed to add photo output")
}
session.addOutput(output)
output.maxPhotoQualityPrioritization = .quality
session.startRunning()
defer { session.stopRunning() }
let settings: AVCapturePhotoSettings = {
if output.availablePhotoCodecTypes.contains(.jpeg) {
return AVCapturePhotoSettings(format: [AVVideoCodecKey: AVVideoCodecType.jpeg])
}
return AVCapturePhotoSettings()
}()
settings.photoQualityPrioritization = .quality
let rawData: Data = try await withCheckedThrowingContinuation(isolation: nil) { cont in
output.capturePhoto(with: settings, delegate: PhotoCaptureDelegate(cont))
}
return try Self.reencodeJPEG(imageData: rawData, maxWidth: maxWidth, quality: quality)
}
func clip(
facing: CameraFacing?,
durationMs: Int?,
includeAudio: Bool,
outPath: String?) async throws -> (path: String, durationMs: Int, hasAudio: Bool)
{
let facing = facing ?? .front
let durationMs = Self.clampDurationMs(durationMs)
try await self.ensureAccess(for: .video)
if includeAudio {
try await self.ensureAccess(for: .audio)
}
let session = AVCaptureSession()
session.sessionPreset = .high
guard let camera = Self.pickCamera(facing: facing) else {
throw CameraError.cameraUnavailable
}
let cameraInput = try AVCaptureDeviceInput(device: camera)
guard session.canAddInput(cameraInput) else {
throw CameraError.captureFailed("Failed to add camera input")
}
session.addInput(cameraInput)
if includeAudio {
guard let mic = AVCaptureDevice.default(for: .audio) else {
throw CameraError.microphoneUnavailable
}
let micInput = try AVCaptureDeviceInput(device: mic)
guard session.canAddInput(micInput) else {
throw CameraError.captureFailed("Failed to add microphone input")
}
session.addInput(micInput)
}
let output = AVCaptureMovieFileOutput()
guard session.canAddOutput(output) else {
throw CameraError.captureFailed("Failed to add movie output")
}
session.addOutput(output)
output.maxRecordedDuration = CMTime(value: Int64(durationMs), timescale: 1000)
session.startRunning()
defer { session.stopRunning() }
let tmpMovURL = FileManager.default.temporaryDirectory
.appendingPathComponent("clawdis-camera-\(UUID().uuidString).mov")
defer { try? FileManager.default.removeItem(at: tmpMovURL) }
let outputURL: URL = {
if let outPath, !outPath.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty {
return URL(fileURLWithPath: outPath)
}
return FileManager.default.temporaryDirectory
.appendingPathComponent("clawdis-camera-\(UUID().uuidString).mp4")
}()
// Ensure we don't fail exporting due to an existing file.
try? FileManager.default.removeItem(at: outputURL)
let logger = self.logger
let recordedURL: URL = try await withCheckedThrowingContinuation(isolation: nil) { cont in
output.startRecording(to: tmpMovURL, recordingDelegate: MovieFileDelegate(cont, logger: logger))
}
try await Self.exportToMP4(inputURL: recordedURL, outputURL: outputURL)
return (path: outputURL.path, durationMs: durationMs, hasAudio: includeAudio)
}
private func ensureAccess(for mediaType: AVMediaType) async throws {
let status = AVCaptureDevice.authorizationStatus(for: mediaType)
switch status {
case .authorized:
return
case .notDetermined:
let ok = await withCheckedContinuation(isolation: nil) { cont in
AVCaptureDevice.requestAccess(for: mediaType) { granted in
cont.resume(returning: granted)
}
}
if !ok {
throw CameraError.permissionDenied(kind: mediaType == .video ? "Camera" : "Microphone")
}
case .denied, .restricted:
throw CameraError.permissionDenied(kind: mediaType == .video ? "Camera" : "Microphone")
@unknown default:
throw CameraError.permissionDenied(kind: mediaType == .video ? "Camera" : "Microphone")
}
}
private nonisolated static func pickCamera(facing: CameraFacing) -> AVCaptureDevice? {
let position: AVCaptureDevice.Position = (facing == .front) ? .front : .back
if let device = AVCaptureDevice.default(.builtInWideAngleCamera, for: .video, position: position) {
return device
}
// Many macOS cameras report `unspecified` position; fall back to any default.
return AVCaptureDevice.default(for: .video)
}
private nonisolated static func clampQuality(_ quality: Double?) -> Double {
let q = quality ?? 0.9
return min(1.0, max(0.05, q))
}
private nonisolated static func clampDurationMs(_ ms: Int?) -> Int {
let v = ms ?? 3000
return min(15_000, max(250, v))
}
private nonisolated static func reencodeJPEG(
imageData: Data,
maxWidth: Int?,
quality: Double) throws -> (data: Data, size: CGSize)
{
guard let src = CGImageSourceCreateWithData(imageData as CFData, nil),
let img = CGImageSourceCreateImageAtIndex(src, 0, nil)
else {
throw CameraError.captureFailed("Failed to decode captured image")
}
let finalImage: CGImage
if let maxWidth, img.width > maxWidth {
guard let scaled = self.downscale(image: img, maxWidth: maxWidth) else {
throw CameraError.captureFailed("Failed to downscale image")
}
finalImage = scaled
} else {
finalImage = img
}
let out = NSMutableData()
guard let dest = CGImageDestinationCreateWithData(out, UTType.jpeg.identifier as CFString, 1, nil) else {
throw CameraError.captureFailed("Failed to create JPEG destination")
}
let props = [kCGImageDestinationLossyCompressionQuality: quality] as CFDictionary
CGImageDestinationAddImage(dest, finalImage, props)
guard CGImageDestinationFinalize(dest) else {
throw CameraError.captureFailed("Failed to encode JPEG")
}
return (out as Data, CGSize(width: finalImage.width, height: finalImage.height))
}
private nonisolated static func downscale(image: CGImage, maxWidth: Int) -> CGImage? {
guard image.width > 0, image.height > 0 else { return image }
guard image.width > maxWidth else { return image }
let scale = Double(maxWidth) / Double(image.width)
let targetW = maxWidth
let targetH = max(1, Int((Double(image.height) * scale).rounded()))
let cs = CGColorSpaceCreateDeviceRGB()
let bitmapInfo = CGImageAlphaInfo.premultipliedLast.rawValue
guard let ctx = CGContext(
data: nil,
width: targetW,
height: targetH,
bitsPerComponent: 8,
bytesPerRow: 0,
space: cs,
bitmapInfo: bitmapInfo)
else { return nil }
ctx.interpolationQuality = .high
ctx.draw(image, in: CGRect(x: 0, y: 0, width: targetW, height: targetH))
return ctx.makeImage()
}
private nonisolated static func exportToMP4(inputURL: URL, outputURL: URL) async throws {
let asset = AVAsset(url: inputURL)
guard let export = AVAssetExportSession(asset: asset, presetName: AVAssetExportPresetMediumQuality) else {
throw CameraError.exportFailed("Failed to create export session")
}
export.outputURL = outputURL
export.outputFileType = .mp4
export.shouldOptimizeForNetworkUse = true
await withCheckedContinuation { cont in
export.exportAsynchronously {
cont.resume()
}
}
switch export.status {
case .completed:
return
case .failed:
throw CameraError.exportFailed(export.error?.localizedDescription ?? "export failed")
case .cancelled:
throw CameraError.exportFailed("export cancelled")
default:
throw CameraError.exportFailed("export did not complete (\(export.status.rawValue))")
}
}
}
private final class PhotoCaptureDelegate: NSObject, AVCapturePhotoCaptureDelegate {
private var cont: CheckedContinuation<Data, Error>?
init(_ cont: CheckedContinuation<Data, Error>) {
self.cont = cont
}
func photoOutput(
_ output: AVCapturePhotoOutput,
didFinishProcessingPhoto photo: AVCapturePhoto,
error: Error?)
{
guard let cont else { return }
self.cont = nil
if let error {
cont.resume(throwing: error)
return
}
guard let data = photo.fileDataRepresentation() else {
cont.resume(throwing: CameraCaptureService.CameraError.captureFailed("No photo data"))
return
}
cont.resume(returning: data)
}
}
private final class MovieFileDelegate: NSObject, AVCaptureFileOutputRecordingDelegate {
private var cont: CheckedContinuation<URL, Error>?
private let logger: Logger
init(_ cont: CheckedContinuation<URL, Error>, logger: Logger) {
self.cont = cont
self.logger = logger
}
func fileOutput(
_ output: AVCaptureFileOutput,
didFinishRecordingTo outputFileURL: URL,
from connections: [AVCaptureConnection],
error: Error?)
{
guard let cont else { return }
self.cont = nil
if let error {
let ns = error as NSError
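// Reaching maxRecordedDuration is delivered as an error; treat it as a successful stop.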
if ns.domain == AVFoundationErrorDomain,
ns.code == AVError.maximumDurationReached.rawValue
{
cont.resume(returning: outputFileURL)
return
}
self.logger.error("camera record failed: \(error.localizedDescription, privacy: .public)")
cont.resume(throwing: error)
return
}
cont.resume(returning: outputFileURL)
}
}

View File

@@ -24,6 +24,7 @@ let webChatEnabledKey = "clawdis.webChatEnabled"
let webChatSwiftUIEnabledKey = "clawdis.webChatSwiftUIEnabled"
let webChatPortKey = "clawdis.webChatPort"
let canvasEnabledKey = "clawdis.canvasEnabled"
let cameraEnabledKey = "clawdis.cameraEnabled"
let peekabooBridgeEnabledKey = "clawdis.peekabooBridgeEnabled"
let deepLinkAgentEnabledKey = "clawdis.deepLinkAgentEnabled"
let deepLinkKeyKey = "clawdis.deepLinkKey"

View File

@@ -3,6 +3,8 @@ import Foundation
import OSLog
enum ControlRequestHandler {
private static let cameraCapture = CameraCaptureService()
static func process(
request: Request,
notifier: NotificationManager = NotificationManager(),
@@ -77,6 +79,16 @@ enum ControlRequestHandler {
command: command,
paramsJSON: paramsJSON,
logger: logger)
case let .cameraSnap(facing, maxWidth, quality, outPath):
return await self.handleCameraSnap(facing: facing, maxWidth: maxWidth, quality: quality, outPath: outPath)
case let .cameraClip(facing, durationMs, includeAudio, outPath):
return await self.handleCameraClip(
facing: facing,
durationMs: durationMs,
includeAudio: includeAudio,
outPath: outPath)
}
}
@@ -173,6 +185,10 @@ enum ControlRequestHandler {
UserDefaults.standard.object(forKey: canvasEnabledKey) as? Bool ?? true
}
private static func cameraEnabled() -> Bool {
UserDefaults.standard.object(forKey: cameraEnabledKey) as? Bool ?? false
}
private static func handleCanvasShow(
session: String,
path: String?,
@@ -254,4 +270,46 @@ enum ControlRequestHandler {
return Response(ok: false, message: error.localizedDescription)
}
}
private static func handleCameraSnap(
facing: CameraFacing?,
maxWidth: Int?,
quality: Double?,
outPath: String?) async -> Response
{
guard self.cameraEnabled() else { return Response(ok: false, message: "Camera disabled by user") }
do {
let res = try await self.cameraCapture.snap(facing: facing, maxWidth: maxWidth, quality: quality)
let url: URL = if let outPath, !outPath.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty {
URL(fileURLWithPath: outPath)
} else {
FileManager.default.temporaryDirectory
.appendingPathComponent("clawdis-camera-snap-\(UUID().uuidString).jpg")
}
try res.data.write(to: url, options: [.atomic])
return Response(ok: true, message: url.path)
} catch {
return Response(ok: false, message: error.localizedDescription)
}
}
private static func handleCameraClip(
facing: CameraFacing?,
durationMs: Int?,
includeAudio: Bool,
outPath: String?) async -> Response
{
guard self.cameraEnabled() else { return Response(ok: false, message: "Camera disabled by user") }
do {
let res = try await self.cameraCapture.clip(
facing: facing,
durationMs: durationMs,
includeAudio: includeAudio,
outPath: outPath)
return Response(ok: true, message: res.path)
} catch {
return Response(ok: false, message: error.localizedDescription)
}
}
}

View File

@@ -9,6 +9,7 @@ struct DebugSettings: View {
@AppStorage(modelCatalogReloadKey) private var modelCatalogReloadBump: Int = 0
@AppStorage(iconOverrideKey) private var iconOverrideRaw: String = IconOverrideSelection.system.rawValue
@AppStorage(canvasEnabledKey) private var canvasEnabled: Bool = true
@AppStorage(cameraEnabledKey) private var cameraEnabled: Bool = false
@AppStorage(deepLinkAgentEnabledKey) private var deepLinkAgentEnabled: Bool = false
@State private var modelsCount: Int?
@State private var modelsLoading = false
@@ -48,6 +49,7 @@ struct DebugSettings: View {
self.pathsSection
self.quickActionsSection
self.canvasSection
self.cameraSection
self.experimentsSection
Spacer(minLength: 0)
@@ -571,6 +573,20 @@ struct DebugSettings: View {
}
}
private var cameraSection: some View {
GroupBox("Camera") {
VStack(alignment: .leading, spacing: 10) {
Toggle("Allow Camera (agent)", isOn: self.$cameraEnabled)
.toggleStyle(.checkbox)
.help("When off, camera requests return “Camera disabled by user”.")
Text("Allows Clawdis to capture a photo or short video via the built-in camera.")
.font(.caption)
.foregroundStyle(.secondary)
}
}
}
private var experimentsSection: some View {
GroupBox("Experiments") {
Grid(alignment: .leadingFirstTextBaseline, horizontalSpacing: 14, verticalSpacing: 10) {

View File

@@ -52,6 +52,7 @@ struct ClawdisCLI {
enum Kind {
case generic
case mediaPath
}
}
@@ -91,6 +92,9 @@ struct ClawdisCLI {
case "canvas":
return try self.parseCanvas(args: &args)
case "camera":
return try self.parseCamera(args: &args)
default:
throw CLIError.help
}
@@ -292,6 +296,62 @@ struct ClawdisCLI {
}
}
private static func parseCamera(args: inout [String]) throws -> ParsedCLIRequest {
guard let sub = args.popFirst() else { throw CLIError.help }
switch sub {
case "snap":
var facing: CameraFacing?
var maxWidth: Int?
var quality: Double?
var outPath: String?
while !args.isEmpty {
let arg = args.removeFirst()
switch arg {
case "--facing":
if let val = args.popFirst(), let f = CameraFacing(rawValue: val) { facing = f }
case "--max-width":
maxWidth = args.popFirst().flatMap(Int.init)
case "--quality":
quality = args.popFirst().flatMap(Double.init)
case "--out":
outPath = args.popFirst()
default:
break
}
}
return ParsedCLIRequest(
request: .cameraSnap(facing: facing, maxWidth: maxWidth, quality: quality, outPath: outPath),
kind: .mediaPath)
case "clip":
var facing: CameraFacing?
var durationMs: Int?
var includeAudio = true
var outPath: String?
while !args.isEmpty {
let arg = args.removeFirst()
switch arg {
case "--facing":
if let val = args.popFirst(), let f = CameraFacing(rawValue: val) { facing = f }
case "--duration-ms":
durationMs = args.popFirst().flatMap(Int.init)
case "--no-audio":
includeAudio = false
case "--out":
outPath = args.popFirst()
default:
break
}
}
return ParsedCLIRequest(
request: .cameraClip(facing: facing, durationMs: durationMs, includeAudio: includeAudio, outPath: outPath),
kind: .mediaPath)
default:
throw CLIError.help
}
}
private static func parseCanvasPlacement(
args: inout [String],
session: inout String,
@@ -334,6 +394,10 @@ struct ClawdisCLI {
if let message = response.message, !message.isEmpty {
FileHandle.standardOutput.write(Data((message + "\n").utf8))
}
case .mediaPath:
if let message = response.message, !message.isEmpty {
print("MEDIA:\(message)")
}
}
}
@@ -352,6 +416,8 @@ struct ClawdisCLI {
output["payload"] = text
}
}
case .mediaPath:
break
}
let json = try JSONSerialization.data(withJSONObject: output, options: [.prettyPrinted])
@@ -406,6 +472,10 @@ struct ClawdisCLI {
clawdis-mac canvas eval --js <code> [--session <key>]
clawdis-mac canvas snapshot [--out <path>] [--session <key>]
Camera:
clawdis-mac camera snap [--facing <front|back>] [--max-width <px>] [--quality <0-1>] [--out <path>]
clawdis-mac camera clip [--facing <front|back>] [--duration-ms <ms>] [--no-audio] [--out <path>]
Browser (clawd):
clawdis-mac browser status|start|stop|tabs|open|focus|close|screenshot|eval|query|dom|snapshot
@@ -433,6 +503,7 @@ struct ClawdisCLI {
Output:
Default output is text. Use --json for machine-readable output.
In text mode, `browser screenshot` prints MEDIA:<path>.
In text mode, `camera snap` and `camera clip` print MEDIA:<path>.
"""
print(usage)
}

View File

@@ -13,6 +13,11 @@ public enum Capability: String, Codable, CaseIterable, Sendable {
case speechRecognition
}
public enum CameraFacing: String, Codable, Sendable {
case front
case back
}
// MARK: - Requests
/// Notification interruption level (maps to UNNotificationInterruptionLevel)
@@ -74,6 +79,8 @@ public enum Request: Sendable {
case canvasSnapshot(session: String, outPath: String?)
case nodeList
case nodeInvoke(nodeId: String, command: String, paramsJSON: String?)
case cameraSnap(facing: CameraFacing?, maxWidth: Int?, quality: Double?, outPath: String?)
case cameraClip(facing: CameraFacing?, durationMs: Int?, includeAudio: Bool, outPath: String?)
}
// MARK: - Responses
@@ -104,6 +111,11 @@ extension Request: Codable {
case path
case javaScript
case outPath
case facing
case maxWidth
case quality
case durationMs
case includeAudio
case placement
case nodeId
case nodeCommand
@@ -124,6 +136,8 @@ extension Request: Codable {
case canvasSnapshot
case nodeList
case nodeInvoke
case cameraSnap
case cameraClip
}
public func encode(to encoder: Encoder) throws {
@@ -198,6 +212,20 @@ extension Request: Codable {
try container.encode(nodeId, forKey: .nodeId)
try container.encode(command, forKey: .nodeCommand)
try container.encodeIfPresent(paramsJSON, forKey: .paramsJSON)
case let .cameraSnap(facing, maxWidth, quality, outPath):
try container.encode(Kind.cameraSnap, forKey: .type)
try container.encodeIfPresent(facing, forKey: .facing)
try container.encodeIfPresent(maxWidth, forKey: .maxWidth)
try container.encodeIfPresent(quality, forKey: .quality)
try container.encodeIfPresent(outPath, forKey: .outPath)
case let .cameraClip(facing, durationMs, includeAudio, outPath):
try container.encode(Kind.cameraClip, forKey: .type)
try container.encodeIfPresent(facing, forKey: .facing)
try container.encodeIfPresent(durationMs, forKey: .durationMs)
try container.encode(includeAudio, forKey: .includeAudio)
try container.encodeIfPresent(outPath, forKey: .outPath)
}
}
@@ -274,6 +302,20 @@ extension Request: Codable {
let command = try container.decode(String.self, forKey: .nodeCommand)
let paramsJSON = try container.decodeIfPresent(String.self, forKey: .paramsJSON)
self = .nodeInvoke(nodeId: nodeId, command: command, paramsJSON: paramsJSON)
case .cameraSnap:
let facing = try container.decodeIfPresent(CameraFacing.self, forKey: .facing)
let maxWidth = try container.decodeIfPresent(Int.self, forKey: .maxWidth)
let quality = try container.decodeIfPresent(Double.self, forKey: .quality)
let outPath = try container.decodeIfPresent(String.self, forKey: .outPath)
self = .cameraSnap(facing: facing, maxWidth: maxWidth, quality: quality, outPath: outPath)
case .cameraClip:
let facing = try container.decodeIfPresent(CameraFacing.self, forKey: .facing)
let durationMs = try container.decodeIfPresent(Int.self, forKey: .durationMs)
let includeAudio = (try? container.decode(Bool.self, forKey: .includeAudio)) ?? true
let outPath = try container.decodeIfPresent(String.self, forKey: .outPath)
self = .cameraClip(facing: facing, durationMs: durationMs, includeAudio: includeAudio, outPath: outPath)
}
}
}

View File

@@ -0,0 +1,62 @@
import ClawdisIPC
import Foundation
import Testing
@Suite struct CameraIPCTests {
@Test func cameraSnapCodableRoundtrip() throws {
let req: Request = .cameraSnap(
facing: .front,
maxWidth: 640,
quality: 0.85,
outPath: "/tmp/test.jpg")
let data = try JSONEncoder().encode(req)
let decoded = try JSONDecoder().decode(Request.self, from: data)
switch decoded {
case let .cameraSnap(facing, maxWidth, quality, outPath):
#expect(facing == .front)
#expect(maxWidth == 640)
#expect(quality == 0.85)
#expect(outPath == "/tmp/test.jpg")
default:
Issue.record("expected cameraSnap, got \(decoded)")
}
}
@Test func cameraClipCodableRoundtrip() throws {
let req: Request = .cameraClip(
facing: .back,
durationMs: 3000,
includeAudio: false,
outPath: "/tmp/test.mp4")
let data = try JSONEncoder().encode(req)
let decoded = try JSONDecoder().decode(Request.self, from: data)
switch decoded {
case let .cameraClip(facing, durationMs, includeAudio, outPath):
#expect(facing == .back)
#expect(durationMs == 3000)
#expect(includeAudio == false)
#expect(outPath == "/tmp/test.mp4")
default:
Issue.record("expected cameraClip, got \(decoded)")
}
}
@Test func cameraClipDefaultsIncludeAudioToTrueWhenMissing() throws {
let json = """
{"type":"cameraClip","durationMs":1234}
"""
let decoded = try JSONDecoder().decode(Request.self, from: Data(json.utf8))
switch decoded {
case let .cameraClip(_, durationMs, includeAudio, _):
#expect(durationMs == 1234)
#expect(includeAudio == true)
default:
Issue.record("expected cameraClip, got \(decoded)")
}
}
}

View File

@@ -0,0 +1,58 @@
import Foundation
public enum ClawdisCameraCommand: String, Codable, Sendable {
case snap = "camera.snap"
case clip = "camera.clip"
}
public enum ClawdisCameraFacing: String, Codable, Sendable {
case back
case front
}
public enum ClawdisCameraImageFormat: String, Codable, Sendable {
case jpg
case jpeg
}
public enum ClawdisCameraVideoFormat: String, Codable, Sendable {
case mp4
}
public struct ClawdisCameraSnapParams: Codable, Sendable, Equatable {
public var facing: ClawdisCameraFacing?
public var maxWidth: Int?
public var quality: Double?
public var format: ClawdisCameraImageFormat?
public init(
facing: ClawdisCameraFacing? = nil,
maxWidth: Int? = nil,
quality: Double? = nil,
format: ClawdisCameraImageFormat? = nil)
{
self.facing = facing
self.maxWidth = maxWidth
self.quality = quality
self.format = format
}
}
public struct ClawdisCameraClipParams: Codable, Sendable, Equatable {
public var facing: ClawdisCameraFacing?
public var durationMs: Int?
public var includeAudio: Bool?
public var format: ClawdisCameraVideoFormat?
public init(
facing: ClawdisCameraFacing? = nil,
durationMs: Int? = nil,
includeAudio: Bool? = nil,
format: ClawdisCameraVideoFormat? = nil)
{
self.facing = facing
self.durationMs = durationMs
self.includeAudio = includeAudio
self.format = format
}
}

View File

@@ -9,7 +9,7 @@ read_when:
## What Clawdis Does
- Runs WhatsApp gateway + Pi coding agent so the assistant can read/write chats, fetch context, and run tools via the host Mac.
- macOS app manages permissions (screen recording, notifications, microphone) and exposes a CLI helper `clawdis-mac` for scripts.
- Sessions are per-sender; heartbeats keep background tasks alive.
- Direct chats collapse into the shared `main` session by default; groups stay isolated as `group:<jid>`; heartbeats keep background tasks alive.
## Core Tools (enable in Settings → Tools)
- **mcporter** — MCP runtime/CLI to list, call, and sync Model Context Protocol servers.

View File

@@ -122,8 +122,8 @@
<span class="footer__sep">·</span>
<a href="https://github.com/steipete/clawdis">source</a>
<span class="footer__sep">·</span>
<a href="https://www.npmjs.com/package/clawdis">npm</a>
</div>
<a href="https://github.com/steipete/clawdis/releases">releases</a>
</div>
<div class="footer__hint" aria-hidden="true">
tip: press <kbd>F2</kbd> (Mac: <kbd>fn</kbd>+<kbd>F2</kbd>) to flip
the universe

98
docs/camera.md Normal file
View File

@@ -0,0 +1,98 @@
---
summary: "Camera capture (iOS node + macOS app) for agent use: photos (jpg) and short video clips (mp4)"
read_when:
- Adding or modifying camera capture on iOS nodes or macOS
- Extending agent-accessible MEDIA temp-file workflows
---
# Camera capture (agent)
Clawdis supports **camera capture** for agent workflows:
- **iOS node** (paired via Gateway): capture a **photo** (`jpg`) or **short video clip** (`mp4`, with optional audio) via `node.invoke`.
- **macOS app** (local control socket): capture a **photo** (`jpg`) or **short video clip** (`mp4`, with optional audio) via `clawdis-mac`.
All camera access is gated behind **user-controlled settings**.
## iOS node
### User setting (default on)
- iOS Settings tab → **Camera** → **Allow Camera** (`camera.enabled`)
- Default: **on** (missing key is treated as enabled).
- When off: `camera.*` commands return `CAMERA_DISABLED`.
### Commands (via Gateway `node.invoke`)
- `camera.snap`
- Params:
- `facing`: `front|back` (default: `front`)
- `maxWidth`: number (optional)
- `quality`: `0..1` (optional; default `0.9`)
- `format`: currently `jpg`
- Response payload:
- `format: "jpg"`
- `base64: "<...>"`
- `width`, `height`
- `camera.clip`
- Params:
- `facing`: `front|back` (default: `front`)
- `durationMs`: number (default `3000`, clamped to a max)
- `includeAudio`: boolean (default `true`)
- `format`: currently `mp4`
- Response payload:
- `format: "mp4"`
- `base64: "<...>"`
- `durationMs`
- `hasAudio`
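As a concrete sketch of the param shape, using the `ClawdisCameraSnapParams` type from `ClawdisKit` (key order in the encoded JSON is illustrative):
```swift
import ClawdisKit
import Foundation

// camera.snap params: back camera, downscale to 1024 px wide, JPEG quality 0.8.
let params = ClawdisCameraSnapParams(facing: .back, maxWidth: 1024, quality: 0.8)
let paramsJSON = String(data: try! JSONEncoder().encode(params), encoding: .utf8)!
// e.g. {"facing":"back","maxWidth":1024,"quality":0.8}
// Pass paramsJSON with command "camera.snap" via node.invoke; the response
// payload carries format ("jpg"), base64, width, and height.
```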
### Foreground requirement
As with `screen.*`, the iOS node only allows `camera.*` commands in the **foreground**. Background invocations return `NODE_BACKGROUND_UNAVAILABLE`.
### CLI helper (temp files + MEDIA)
The easiest way to get attachments is via the CLI helper, which writes decoded media to a temp file and prints `MEDIA:<path>`.
Examples:
```bash
clawdis nodes camera snap --node <id> # default: both front + back (2 MEDIA lines)
clawdis nodes camera snap --node <id> --facing front
clawdis nodes camera clip --node <id> --duration 3000
clawdis nodes camera clip --node <id> --no-audio
```
Notes:
- `nodes camera snap` defaults to **both** facings to give the agent both views.
- Output files are temporary (in the OS temp directory) unless you build your own wrapper.
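If you build your own wrapper, the convention is easy to parse; a minimal sketch (the helper name is illustrative):
```swift
import Foundation

// Collect file paths from CLI text output lines of the form "MEDIA:<path>".
func mediaPaths(in cliOutput: String) -> [String] {
    cliOutput.split(separator: "\n")
        .filter { $0.hasPrefix("MEDIA:") }
        .map { String($0.dropFirst("MEDIA:".count)) }
}
```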
## macOS app
### User setting (default off)
The macOS companion app exposes a checkbox:
- **Settings → Debug → Camera → Allow Camera (agent)** (`clawdis.cameraEnabled`)
- Default: **off**
- When off: camera requests return “Camera disabled by user”.
### CLI helper (local control socket)
The `clawdis-mac` helper talks to the running menu bar app over the local control socket.
Examples:
```bash
clawdis-mac camera snap # prints MEDIA:<path>
clawdis-mac camera snap --max-width 1280
clawdis-mac camera clip --duration-ms 3000 # prints MEDIA:<path>
clawdis-mac camera clip --no-audio
```
## Safety + practical limits
- Camera and microphone access trigger the usual OS permission prompts (and require usage strings in Info.plist).
- Video clips are intentionally short to avoid oversized bridge payloads: base64 adds roughly 4/3 overhead, so a 1.5 MB clip is about 2 MB on the wire, on top of WebSocket message limits.

View File

@@ -24,9 +24,17 @@ Start conservative:
## Prerequisites
- Node **22+**
- CLAWDIS installed: `npm install -g clawdis`
- CLAWDIS available on PATH (recommended during development: from source + global link)
- A second phone number (SIM/eSIM/prepaid) for the assistant
From source (recommended while the npm package is still settling):
```bash
pnpm install
pnpm build
pnpm link --global
```
## The two-phone setup (recommended)
You want this:
@@ -121,7 +129,7 @@ Example:
## Sessions and memory
- Session files: `~/.clawdis/sessions/{{SessionId}}.jsonl`
- Session metadata (token usage, last route, etc): `~/.clawdis/sessions.json`
- Session metadata (token usage, last route, etc): `~/.clawdis/sessions/sessions.json` (legacy: `~/.clawdis/sessions.json`)
- `/new` starts a fresh session for that chat (configurable via `resetTriggers`)
## Heartbeats (proactive mode)

View File

@@ -5,9 +5,10 @@ read_when:
---
# Control channel API (newline-delimited JSON)
**Deprecated:** superseded by the WebSocket Gateway protocol (`clawdis gateway`, see `docs/architecture.md` and `docs/gateway.md`). Use only for legacy builds predating the Gateway rollout.
**Deprecated (historical):** superseded by the WebSocket Gateway protocol (`clawdis gateway`, see `docs/architecture.md` and `docs/gateway.md`).
Current builds use a WebSocket server on `ws://127.0.0.1:18789` and do **not** expose this TCP control channel.
Endpoint: `127.0.0.1:18789` (TCP, localhost only). Clients reach it via SSH port forward in remote mode.
Legacy endpoint (if present in an older build): `127.0.0.1:18789` (TCP, localhost only), typically reached via SSH port forward in remote mode.
## Frame format
Each line is a JSON object. Two shapes exist:
@@ -45,4 +46,4 @@ Each line is a JSON object. Two shapes exist:
4) For user toggles, send `set-heartbeats` and await response.
## Backward compatibility
- If the control port is unavailable (older gateway), the client may fall back to the legacy CLI path, but the intended path is to rely solely on this API.
- If the control channel is unavailable: that's expected on modern builds. Use the Gateway WS protocol instead.

View File

@@ -56,4 +56,4 @@ Notes:
## Known considerations
- Heartbeats are intentionally skipped for groups to avoid noisy broadcasts.
- Echo suppression uses the combined batch string; if you send identical text twice without mentions, only the first will get a response.
- Session store entries will appear as `group:<jid>` in `sessions.json`; a missing entry just means the group hasn't triggered a run yet.
- Session store entries will appear as `group:<jid>` in the session store (`~/.clawdis/sessions/sessions.json` by default); a missing entry just means the group hasn't triggered a run yet.

View File

@@ -16,7 +16,7 @@ Short guide to verify the WhatsApp Web / Baileys stack without guessing.
## Deep diagnostics
- Creds on disk: `ls -l ~/.clawdis/credentials/creds.json` (mtime should be recent).
- Session store: `ls -l ~/.clawdis/sessions.json` (path can be overridden in config). Count and recent recipients are surfaced via `status`.
- Session store: `ls -l ~/.clawdis/sessions/sessions.json` (legacy: `~/.clawdis/sessions.json`; path can be overridden in config). Count and recent recipients are surfaced via `status`.
- Relink flow: `clawdis logout && clawdis login --verbose` when status codes 409/515 or `loggedOut` appear in logs.
## When something fails

View File

@@ -19,7 +19,7 @@ read_when:
<p align="center">
<a href="https://github.com/steipete/clawdis">GitHub</a> ·
<a href="https://www.npmjs.com/package/clawdis">npm</a> ·
<a href="https://github.com/steipete/clawdis/releases">Releases</a> ·
<a href="./clawd">Clawd setup</a>
</p>
@@ -29,25 +29,41 @@ It's built for [Clawd](https://clawd.me), a space lobster who needed a TARDIS.
## How it works
```
┌─────────────┐      ┌──────────┐      ┌─────────────┐
│  WhatsApp   │ ───▶ │ CLAWDIS  │ ───▶ │  AI Agent   │
│  Telegram   │ ───▶ │ 🦞⏱️💙   │ ◀─── │    (Pi)     │
│   (You)     │ ◀─── │          │      │             │
└─────────────┘      └──────────┘      └─────────────┘

   WhatsApp / Telegram
            │
┌───────────┴──────────────┐
│         Gateway          │  ws://127.0.0.1:18789 (loopback-only)
│     (single source)      │  tcp://0.0.0.0:18790 (optional Bridge)
└───────────┬──────────────┘
            ├─ Pi agent (RPC)
            ├─ CLI (clawdis …)
            ├─ WebChat (loopback UI)
            ├─ macOS app (Clawdis.app)
            └─ iOS node (Iris) via Bridge + pairing
```
Most operations flow through the **Gateway** (`clawdis gateway`), a single long-running process that owns provider connections and the WebSocket control plane.
## Network model
- **One Gateway per host**: it is the only process allowed to own the WhatsApp Web session.
- **Loopback-first**: Gateway WS is `ws://127.0.0.1:18789` (not exposed on the LAN).
- **Bridge for nodes**: optional LAN/tailnet-facing bridge on `tcp://0.0.0.0:18790` for paired nodes (Bonjour-discoverable).
- **Remote use**: SSH tunnel or tailnet/VPN; see `docs/remote.md` and `docs/discovery.md`.
## Features (high level)
- 📱 **WhatsApp Integration** — Uses Baileys for WhatsApp Web protocol
- ✈️ **Telegram Bot** — DMs + groups via grammY
- 🤖 **Agent bridge** — Pi (RPC mode) with tool streaming
- 💬 **Sessions**Per-sender (or shared `main`) conversation context
- 💬 **Sessions**Direct chats collapse into shared `main` (default); groups are isolated
- 👥 **Group Chat Support** — Mention-based triggering in group chats
- 📎 **Media Support** — Send and receive images, audio, documents
- 🎤 **Voice notes** — Optional transcription hook
- 🖥️ **WebChat + macOS app**A local UI + menu bar companion for ops and voice wake
- 🖥️ **WebChat + macOS app**Local UI + menu bar companion for ops and voice wake
- 📱 **iOS node (Iris)** — Pairs as a node and exposes a Canvas surface
Note: legacy Claude/Codex/Gemini/Opencode paths have been removed; Pi is the only coding-agent path.
@@ -56,8 +72,10 @@ Note: legacy Claude/Codex/Gemini/Opencode paths have been removed; Pi is the onl
Runtime requirement: **Node ≥ 22**.
```bash
# Install
npm install -g clawdis
# From source (recommended while the npm package is still settling)
pnpm install
pnpm build
pnpm link --global
# Pair WhatsApp Web (shows QR)
clawdis login
@@ -95,18 +113,23 @@ Example:
## Docs
- [Configuration](./configuration.md)
- [Gateway runbook](./gateway.md)
- [WebChat](./webchat.md)
- [Agent integration](./agents.md)
- [Telegram](./telegram.md)
- [Group messages](./group-messages.md)
- [Media: images](./images.md)
- [Media: audio](./audio.md)
- [Sessions](./session.md)
- [Cron + wakeups](./cron.md)
- [Security](./security.md)
- [Troubleshooting](./troubleshooting.md)
- Start here:
- [Configuration](./configuration.md)
- [Clawd personal assistant setup](./clawd.md)
- [Gateway runbook](./gateway.md)
- [Discovery + transports](./discovery.md)
- [Remote access](./remote.md)
- Providers and UX:
- [WebChat](./webchat.md)
- [Telegram](./telegram.md)
- [Group messages](./group-messages.md)
- [Media: images](./images.md)
- [Media: audio](./audio.md)
- Ops and safety:
- [Sessions](./session.md)
- [Cron + wakeups](./cron.md)
- [Security](./security.md)
- [Troubleshooting](./troubleshooting.md)
## The name

View File

@@ -54,13 +54,13 @@ More debugging notes: `docs/bonjour.md`.
In Iris:
- Pick the discovered bridge (or hit refresh).
- If not paired yet, Iris will initiate pairing automatically.
- After the first successful pairing, Iris will auto-reconnect to the **last bridge** on launch (including after reinstall), as long as the iOS Keychain entry is still present.
- After the first successful pairing, Iris will auto-reconnect **strictly to the last discovered gateway** on launch (including after reinstall), as long as the iOS Keychain entry is still present.
### Connection indicator (always visible)
The Settings tab icon shows a small status dot:
- **Green**: connected to the bridge
- **Yellow**: connecting
- **Yellow**: connecting (subtle pulse)
- **Red**: not connected / error
## 4) Approve pairing (CLI)

View File

@@ -10,7 +10,7 @@ Context: web chat currently lives in a WKWebView that loads the pi-web bundle. S
## Target state
- Gateway WS adds methods:
- `chat.history { sessionKey }``{ sessionKey, messages[], thinkingLevel }` (reads the existing JSONL + sessions.json).
- `chat.history { sessionKey }``{ sessionKey, messages[], thinkingLevel }` (reads the existing JSONL + session store).
- `chat.send { sessionKey, message, attachments?, thinking?, deliver?, timeoutMs<=30000, idempotencyKey }``res { runId, status:"accepted" }` or `res ok:false` on validation/timeout.
- Gateway WS emits `chat` events `{ runId, sessionKey, seq, state:"delta"|"final"|"error", message?, errorMessage?, usage?, stopReason? }`. Streaming is optional; minimum is a single `state:"final"` per send.
- Client consumes only WS: bootstrap via `chat.history`, send via `chat.send`, live updates via `chat` events. No file watchers.
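A minimal client sketch under stated assumptions: the frame envelope (`{ "method": …, "params": … }`) is hypothetical; only the method names and param shapes above come from this plan:
```swift
import Foundation

// Hypothetical envelope; method/param names follow the spec above.
let frame: [String: Any] = [
    "method": "chat.send",
    "params": [
        "sessionKey": "main",
        "message": "hello",
        "timeoutMs": 30_000,
        "idempotencyKey": UUID().uuidString,
    ] as [String: Any],
]
let ws = URLSession.shared.webSocketTask(with: URL(string: "ws://127.0.0.1:18789")!)
ws.resume()
ws.send(.data(try! JSONSerialization.data(withJSONObject: frame))) { error in
    if let error { print("send failed: \(error)") }
}
// A real client then loops on ws.receive to consume `chat` events
// ({ runId, sessionKey, seq, state: "delta" | "final" | "error", … }).
```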

View File

@@ -3,48 +3,50 @@ summary: "Remote mode topology using SSH control channels between gateway and ma
read_when:
- Running or troubleshooting remote gateway setups
---
# Remote mode with control channel
# Remote access (SSH, tunnels, and tailnets)
This repo supports “remote over SSH” by keeping a single gateway (the master) running on a host (e.g., your Mac Studio) and connecting one or more macOS menu bar clients to it. The menu app no longer shells out to `pnpm clawdis …`; it talks to the gateway over a persistent control channel that is tunneled through SSH.
This repo supports “remote over SSH” by keeping a single Gateway (the master) running on a host (e.g., your Mac Studio) and connecting clients to it.
Remote mode is the SSH fallback transport: as Clawdis adds a direct “bridge” transport for LAN/tailnet setups, SSH remains supported for universal reach.
See `docs/discovery.md` for how clients choose between the direct and SSH transports.
- For **operators (you / the macOS app)**: SSH tunneling is the universal fallback.
- For **nodes (Iris/iOS and future devices)**: prefer the Gateway **Bridge** when on the same LAN/tailnet (see `docs/discovery.md`).
## Topology
- Master: runs the gateway + control server on `127.0.0.1:18789` (in-process TCP server).
- Clients: when “Remote over SSH” is selected, the app opens one SSH tunnel:
- `ssh -N -L <localPort>:127.0.0.1:18789 <user>@<host>`
- The app then connects to `localhost:<localPort>` and keeps that socket open.
- Messages are newline-delimited JSON (documented in `docs/control-api.md`).
## The core idea
## Connection flow (clients)
1) Establish SSH tunnel.
2) Open TCP socket to the local forwarded port.
3) Send `ping` to verify connectivity.
4) Issue `health`, `status`, and `last-heartbeat` requests to seed UI.
5) Listen for `event` frames (heartbeat updates, gateway status).
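A sketch of that legacy flow, assuming newline-delimited JSON frames shaped like `{ "type": "ping" }` (illustrative only; the real schema lived in `docs/control-api.md`, and the channel is deprecated per the note at the end of this page):

```ts
import * as net from "node:net";
import * as readline from "node:readline";

// Legacy control-channel sketch: newline-delimited JSON over the SSH-forwarded
// port. Frame shapes here are assumptions for illustration, not the real schema.
const sock = net.connect({ host: "127.0.0.1", port: 18789 });
const lines = readline.createInterface({ input: sock });

lines.on("line", (line) => {
  const frame = JSON.parse(line);
  // Event frames (e.g. heartbeat updates) arrive interleaved with responses.
  if (frame.event === "heartbeat") console.log("heartbeat:", frame);
});

sock.on("connect", () => {
  // Steps 3–5 of the flow above: verify connectivity, then seed UI state.
  for (const type of ["ping", "health", "status", "last-heartbeat"]) {
    sock.write(`${JSON.stringify({ type })}\n`);
  }
});
```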
- The Gateway WebSocket binds to **loopback**: `ws://127.0.0.1:18789`.
- For remote use, you forward that loopback port over SSH (or use a tailnet/VPN and tunnel less).
## Heartbeats
- Heartbeats always run on the master gateway.
- The control server emits `event: "heartbeat"` after each heartbeat attempt and keeps the latest in memory for `last-heartbeat` requests.
- No file-based heartbeat logs/state are required when the control stream is available.
## SSH tunnel (CLI + tools)
## Local mode
- The menu app skips SSH and connects directly to `127.0.0.1:18789` with the same protocol.
Create a local tunnel to the remote Gateway WS:
## Failure handling
- If the tunnel drops, the client reconnects and re-issues `ping`, `health`, and `last-heartbeat` to refresh state (the mac app shows “Control channel disconnected”).
- If the control port is unavailable (older gateway), the app can optionally fall back to the legacy CLI path, but the goal is to rely solely on the control channel.
```bash
ssh -N -L 18789:127.0.0.1:18789 user@host
```
## Test Remote (in the mac app)
1) SSH reachability check (`ssh -o BatchMode=yes … echo ok`).
2) If SSH succeeds, the app opens the control tunnel and issues a `health` request; success marks the remote as ready.
With the tunnel up:
- `clawdis health` and `clawdis status --deep` now reach the remote gateway via `ws://127.0.0.1:18789`.
- `clawdis gateway {status,health,send,agent,call}` can also target the forwarded URL via `--url` when needed.
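For example, with the tunnel above in place:

```bash
clawdis gateway status --url ws://127.0.0.1:18789
```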
## Security
- Control server listens only on localhost.
- SSH tunneling reuses existing keys/agent; no additional auth is added by the control server.
## WebChat over SSH
## Files to keep in sync
- Protocol definition: `docs/control-api.md`.
- App connection logic: macOS `Remote over SSH` plumbing.
- Gateway control server: lives inside the Node gateway process.
Forward both the WebChat HTTP port and the Gateway WS port:
```bash
ssh -N \
-L 18788:127.0.0.1:18788 \
-L 18789:127.0.0.1:18789 \
user@host
```
Then open `http://127.0.0.1:18788/webchat/` locally. (Details: `docs/webchat.md`.)
## macOS app “Remote over SSH”
The macOS menu bar app can drive the same setup end-to-end (remote status checks, WebChat, and Voice Wake forwarding).
Runbook: `docs/mac/remote.md`.
## Legacy control channel
Older builds experimented with a newline-delimited TCP control channel on the same port.
That API is deprecated and should not be relied on. (Historical reference: `docs/control-api.md`.)

View File

@@ -7,7 +7,7 @@ read_when:
Updated: 2025-12-07
Status: ready for bot-mode use with grammY (long-poll + webhook). Text + media send, proxy, and webhook helpers all ship in-tree.
Status: ready for bot-mode use with grammY (long-polling by default; webhook supported when configured). Text + media send, mention-gated group replies, and optional proxy support are implemented.
## Goals
- Let you talk to Clawdis via a Telegram bot in DMs and groups.
@@ -17,7 +17,11 @@ Status: ready for bot-mode use with grammY (long-poll + webhook). Text + media s
## How it will work (Bot API)
1) Create a bot with @BotFather and grab the token.
2) Configure Clawdis with `TELEGRAM_BOT_TOKEN` (or `telegram.botToken` in `~/.clawdis/clawdis.json`).
3) Run the gateway; it auto-starts Telegram when the bot token is set. To force Telegram-only: `clawdis gateway --provider telegram`. Webhook mode: `clawdis gateway --provider telegram --webhook --port 8787 --webhook-secret <secret>` (optionally `--webhook-url` when the public URL differs).
3) Run the gateway; it auto-starts Telegram when the bot token is set.
- **Long-polling** is the default.
- **Webhook mode** is enabled by setting `telegram.webhookUrl` (optionally `telegram.webhookSecret` / `telegram.webhookPath`).
- The webhook listener currently binds to `0.0.0.0:8787` and serves `POST /telegram-webhook` by default.
   - If you need a different public port/host, set `telegram.webhookUrl` to the externally reachable URL and use a reverse proxy to forward to `:8787` (a quick local check follows this list).
4) Direct chats: user sends the first message; all subsequent turns land in the shared `main` session (default, no extra config).
5) Groups: add the bot, disable privacy mode (or make it admin) so it can read messages; group threads stay on `group:<chatId>` and require mention/command to trigger replies.
6) Optional allowlist: reuse `inbound.allowFrom` for direct chats by chat id (`123456789` or `telegram:123456789`).
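A quick local check that the webhook listener is up, assuming the default bind and path above (Telegram sends the real update bodies; an empty POST only confirms routing, and with `telegram.webhookSecret` set, expect a rejection unless the secret header is supplied):

```bash
curl -i -X POST http://127.0.0.1:8787/telegram-webhook \
  -H 'Content-Type: application/json' \
  -d '{}'
```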
@@ -32,7 +36,7 @@ Status: ready for bot-mode use with grammY (long-poll + webhook). Text + media s
- Library: grammY is the only client for send + gateway (fetch fallback removed); grammY throttler is enabled by default to stay under Bot API limits.
- Inbound normalization: maps Bot API updates to `MsgContext` with `Surface: "telegram"`, `ChatType: direct|group`, `SenderName`, `MediaPath`/`MediaType` when attachments arrive, and `Timestamp`; groups require @bot mention by default.
- Outbound: text and media (photo/video/audio/document) with optional caption; chunked to limits. Typing cue sent best-effort.
- Config: `TELEGRAM_BOT_TOKEN` env or `telegram.botToken` required; `telegram.requireMention`, `telegram.allowFrom`, `telegram.mediaMaxMb`, `telegram.proxy`, `telegram.webhookSecret`, `telegram.webhookUrl` supported.
- Config: `TELEGRAM_BOT_TOKEN` env or `telegram.botToken` required; `telegram.requireMention`, `telegram.allowFrom`, `telegram.mediaMaxMb`, `telegram.proxy`, `telegram.webhookSecret`, `telegram.webhookUrl`, `telegram.webhookPath` supported.
Example config:
```json5
@@ -44,6 +48,7 @@ Example config:
mediaMaxMb: 5,
proxy: "socks5://localhost:9050",
webhookSecret: "mysecret",
webhookPath: "/telegram-webhook",
webhookUrl: "https://yourdomain.com/telegram-webhook"
}
}
@@ -62,6 +67,6 @@ Example config:
- ⏳ Add more grammY coverage (webhook payloads, media edge cases)
## Safety & ops
- Treat the bot token as a secret (equivalent to account control); store under `~/.clawdis/credentials/` with 0600 perms.
- Respect Telegram rate limits (429s); we'll add throttling in the provider to stay below flood thresholds.
- Treat the bot token as a secret (equivalent to account control); prefer `TELEGRAM_BOT_TOKEN` or a locked-down config file (`chmod 600 ~/.clawdis/clawdis.json`).
- Respect Telegram rate limits (429s); grammY throttling is enabled by default.
- Use a test bot for development to avoid hitting production chats.

View File

@@ -98,6 +98,8 @@ cat > "$APP_ROOT/Contents/Info.plist" <<PLIST
<string>Clawdis needs notification permission to show alerts for agent actions.</string>
<key>NSScreenCaptureDescription</key>
<string>Clawdis captures the screen when the agent needs screenshots for context.</string>
<key>NSCameraUsageDescription</key>
<string>Clawdis can capture photos or short video clips when requested by the agent.</string>
<key>NSMicrophoneUsageDescription</key>
<string>Clawdis needs the mic for Voice Wake tests and agent audio capture.</string>
<key>NSSpeechRecognitionUsageDescription</key>

View File

@@ -0,0 +1,64 @@
import * as fs from "node:fs/promises";
import * as os from "node:os";
import * as path from "node:path";
import { describe, expect, it } from "vitest";
import {
cameraTempPath,
parseCameraClipPayload,
parseCameraSnapPayload,
writeBase64ToFile,
} from "./nodes-camera.js";
describe("nodes camera helpers", () => {
it("parses camera.snap payload", () => {
expect(
parseCameraSnapPayload({
format: "jpg",
base64: "aGk=",
width: 10,
height: 20,
}),
).toEqual({ format: "jpg", base64: "aGk=", width: 10, height: 20 });
});
it("rejects invalid camera.snap payload", () => {
expect(() => parseCameraSnapPayload({ format: "jpg" })).toThrow(
/invalid camera\.snap payload/i,
);
});
it("parses camera.clip payload", () => {
expect(
parseCameraClipPayload({
format: "mp4",
base64: "AAEC",
durationMs: 1234,
hasAudio: true,
}),
).toEqual({
format: "mp4",
base64: "AAEC",
durationMs: 1234,
hasAudio: true,
});
});
it("builds stable temp paths when id provided", () => {
const p = cameraTempPath({
kind: "snap",
facing: "front",
ext: "jpg",
tmpDir: "/tmp",
id: "id1",
});
expect(p).toBe(path.join("/tmp", "clawdis-camera-snap-front-id1.jpg"));
});
it("writes base64 to file", async () => {
const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdis-test-"));
const out = path.join(dir, "x.bin");
await writeBase64ToFile(out, "aGk=");
await expect(fs.readFile(out, "utf8")).resolves.toBe("hi");
await fs.rm(dir, { recursive: true, force: true });
});
});

src/cli/nodes-camera.ts (new file)
View File

@@ -0,0 +1,92 @@
import { randomUUID } from "node:crypto";
import * as fs from "node:fs/promises";
import * as os from "node:os";
import * as path from "node:path";
export type CameraFacing = "front" | "back";
export type CameraSnapPayload = {
format: string;
base64: string;
width: number;
height: number;
};
export type CameraClipPayload = {
format: string;
base64: string;
durationMs: number;
hasAudio: boolean;
};
function asRecord(value: unknown): Record<string, unknown> {
return typeof value === "object" && value !== null
? (value as Record<string, unknown>)
: {};
}
function asString(value: unknown): string | undefined {
return typeof value === "string" ? value : undefined;
}
function asNumber(value: unknown): number | undefined {
return typeof value === "number" && Number.isFinite(value)
? value
: undefined;
}
function asBoolean(value: unknown): boolean | undefined {
return typeof value === "boolean" ? value : undefined;
}
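// Validate a node's camera.snap result payload; throws on missing or mistyped fields.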
export function parseCameraSnapPayload(value: unknown): CameraSnapPayload {
const obj = asRecord(value);
const format = asString(obj.format);
const base64 = asString(obj.base64);
const width = asNumber(obj.width);
const height = asNumber(obj.height);
if (!format || !base64 || width === undefined || height === undefined) {
throw new Error("invalid camera.snap payload");
}
return { format, base64, width, height };
}
export function parseCameraClipPayload(value: unknown): CameraClipPayload {
const obj = asRecord(value);
const format = asString(obj.format);
const base64 = asString(obj.base64);
const durationMs = asNumber(obj.durationMs);
const hasAudio = asBoolean(obj.hasAudio);
if (
!format ||
!base64 ||
durationMs === undefined ||
hasAudio === undefined
) {
throw new Error("invalid camera.clip payload");
}
return { format, base64, durationMs, hasAudio };
}
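// Build a temp path like <tmpDir>/clawdis-camera-<kind>[-<facing>]-<id>.<ext>; id defaults to a random UUID.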
export function cameraTempPath(opts: {
kind: "snap" | "clip";
facing?: CameraFacing;
ext: string;
tmpDir?: string;
id?: string;
}) {
const tmpDir = opts.tmpDir ?? os.tmpdir();
const id = opts.id ?? randomUUID();
const facingPart = opts.facing ? `-${opts.facing}` : "";
const ext = opts.ext.startsWith(".") ? opts.ext : `.${opts.ext}`;
return path.join(
tmpDir,
`clawdis-camera-${opts.kind}${facingPart}-${id}${ext}`,
);
}
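// Decode base64 and write it to filePath; returns the path and byte count written.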
export async function writeBase64ToFile(filePath: string, base64: string) {
const buf = Buffer.from(base64, "base64");
await fs.writeFile(filePath, buf);
return { path: filePath, bytes: buf.length };
}

View File

@@ -1,6 +1,13 @@
import type { Command } from "commander";
import { callGateway, randomIdempotencyKey } from "../gateway/call.js";
import { defaultRuntime } from "../runtime.js";
import {
type CameraFacing,
cameraTempPath,
parseCameraClipPayload,
parseCameraSnapPayload,
writeBase64ToFile,
} from "./nodes-camera.js";
type NodesRpcOpts = {
url?: string;
@@ -12,6 +19,11 @@ type NodesRpcOpts = {
params?: string;
invokeTimeout?: string;
idempotencyKey?: string;
facing?: string;
maxWidth?: string;
quality?: string;
duration?: string;
audio?: boolean;
};
type NodeListNode = {
@@ -340,4 +352,203 @@ export function registerNodesCli(program: Command) {
}),
{ timeoutMs: 30_000 },
);
const parseFacing = (value: string): CameraFacing => {
const v = String(value ?? "")
.trim()
.toLowerCase();
if (v === "front" || v === "back") return v;
throw new Error(`invalid facing: ${value} (expected front|back)`);
};
const camera = nodes
.command("camera")
.description("Capture camera media from a paired node");
nodesCallOpts(
camera
.command("snap")
.description("Capture a photo from a node camera (prints MEDIA:<path>)")
.requiredOption("--node <idOrNameOrIp>", "Node id, name, or IP")
.option("--facing <front|back|both>", "Camera facing", "both")
.option("--max-width <px>", "Max width in px (optional)")
.option("--quality <0-1>", "JPEG quality (default 0.9)")
.option(
"--invoke-timeout <ms>",
"Node invoke timeout in ms (default 20000)",
"20000",
)
.action(async (opts: NodesRpcOpts) => {
try {
const nodeId = await resolveNodeId(opts, String(opts.node ?? ""));
const facingOpt = String(opts.facing ?? "both")
.trim()
.toLowerCase();
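// "both" (the default) captures front then back; anything else must name a single facing.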
const facings: CameraFacing[] =
facingOpt === "both"
? ["front", "back"]
: facingOpt === "front" || facingOpt === "back"
? [facingOpt]
: (() => {
throw new Error(
`invalid facing: ${String(opts.facing)} (expected front|back|both)`,
);
})();
const maxWidth = opts.maxWidth
? Number.parseInt(String(opts.maxWidth), 10)
: undefined;
const quality = opts.quality
? Number.parseFloat(String(opts.quality))
: undefined;
const timeoutMs = opts.invokeTimeout
? Number.parseInt(String(opts.invokeTimeout), 10)
: undefined;
const results: Array<{
facing: CameraFacing;
path: string;
width: number;
height: number;
}> = [];
for (const facing of facings) {
const invokeParams: Record<string, unknown> = {
nodeId,
command: "camera.snap",
params: {
facing,
maxWidth: Number.isFinite(maxWidth) ? maxWidth : undefined,
quality: Number.isFinite(quality) ? quality : undefined,
format: "jpg",
},
idempotencyKey: randomIdempotencyKey(),
};
if (typeof timeoutMs === "number" && Number.isFinite(timeoutMs)) {
invokeParams.timeoutMs = timeoutMs;
}
const raw = (await callGatewayCli(
"node.invoke",
opts,
invokeParams,
)) as unknown;
const res =
typeof raw === "object" && raw !== null
? (raw as { payload?: unknown })
: {};
const payload = parseCameraSnapPayload(res.payload);
const filePath = cameraTempPath({
kind: "snap",
facing,
ext: payload.format === "jpeg" ? "jpg" : payload.format,
});
await writeBase64ToFile(filePath, payload.base64);
results.push({
facing,
path: filePath,
width: payload.width,
height: payload.height,
});
}
if (opts.json) {
defaultRuntime.log(JSON.stringify({ files: results }, null, 2));
return;
}
defaultRuntime.log(results.map((r) => `MEDIA:${r.path}`).join("\n"));
} catch (err) {
defaultRuntime.error(`nodes camera snap failed: ${String(err)}`);
defaultRuntime.exit(1);
}
}),
{ timeoutMs: 60_000 },
);
nodesCallOpts(
camera
.command("clip")
.description(
"Capture a short video clip from a node camera (prints MEDIA:<path>)",
)
.requiredOption("--node <idOrNameOrIp>", "Node id, name, or IP")
.option("--facing <front|back>", "Camera facing", "front")
.option("--duration <ms>", "Duration in ms (default 3000)", "3000")
.option("--no-audio", "Disable audio capture")
.option(
"--invoke-timeout <ms>",
"Node invoke timeout in ms (default 45000)",
"45000",
)
.action(async (opts: NodesRpcOpts & { audio?: boolean }) => {
try {
const nodeId = await resolveNodeId(opts, String(opts.node ?? ""));
const facing = parseFacing(String(opts.facing ?? "front"));
const durationMs = Number.parseInt(
String(opts.duration ?? "3000"),
10,
);
const includeAudio = opts.audio !== false;
const timeoutMs = opts.invokeTimeout
? Number.parseInt(String(opts.invokeTimeout), 10)
: undefined;
const invokeParams: Record<string, unknown> = {
nodeId,
command: "camera.clip",
params: {
facing,
durationMs: Number.isFinite(durationMs) ? durationMs : undefined,
includeAudio,
format: "mp4",
},
idempotencyKey: randomIdempotencyKey(),
};
if (typeof timeoutMs === "number" && Number.isFinite(timeoutMs)) {
invokeParams.timeoutMs = timeoutMs;
}
const raw = (await callGatewayCli(
"node.invoke",
opts,
invokeParams,
)) as unknown;
const res =
typeof raw === "object" && raw !== null
? (raw as { payload?: unknown })
: {};
const payload = parseCameraClipPayload(res.payload);
const filePath = cameraTempPath({
kind: "clip",
facing,
ext: payload.format,
});
await writeBase64ToFile(filePath, payload.base64);
if (opts.json) {
defaultRuntime.log(
JSON.stringify(
{
file: {
facing,
path: filePath,
durationMs: payload.durationMs,
hasAudio: payload.hasAudio,
},
},
null,
2,
),
);
return;
}
defaultRuntime.log(`MEDIA:${filePath}`);
} catch (err) {
defaultRuntime.error(`nodes camera clip failed: ${String(err)}`);
defaultRuntime.exit(1);
}
}),
{ timeoutMs: 90_000 },
);
}
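Example invocations for the commands registered above (flags as defined in this file; output files land in the OS temp dir via `cameraTempPath`):

```bash
# Photo from both cameras (the default --facing both): prints one MEDIA:<path> per facing
clawdis nodes camera snap --node ios-node

# 5-second front-camera clip without audio, JSON output
clawdis nodes camera clip --node ios-node --duration 5000 --no-audio --json
```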

View File

@@ -1,3 +1,4 @@
import * as fs from "node:fs/promises";
import { beforeEach, describe, expect, it, vi } from "vitest";
const sendCommand = vi.fn();
@@ -148,4 +149,145 @@ describe("cli program", () => {
);
expect(runtime.log).toHaveBeenCalled();
});
it("runs nodes camera snap and prints two MEDIA paths", async () => {
callGateway
.mockResolvedValueOnce({
ts: Date.now(),
nodes: [
{
nodeId: "ios-node",
displayName: "iOS Node",
remoteIp: "192.168.0.88",
connected: true,
},
],
})
.mockResolvedValueOnce({
ok: true,
nodeId: "ios-node",
command: "camera.snap",
payload: { format: "jpg", base64: "aGk=", width: 1, height: 1 },
})
.mockResolvedValueOnce({
ok: true,
nodeId: "ios-node",
command: "camera.snap",
payload: { format: "jpg", base64: "aGk=", width: 1, height: 1 },
});
const program = buildProgram();
runtime.log.mockClear();
await program.parseAsync(
["nodes", "camera", "snap", "--node", "ios-node"],
{
from: "user",
},
);
expect(callGateway).toHaveBeenNthCalledWith(
2,
expect.objectContaining({
method: "node.invoke",
params: expect.objectContaining({
nodeId: "ios-node",
command: "camera.snap",
timeoutMs: 20000,
idempotencyKey: "idem-test",
params: expect.objectContaining({ facing: "front", format: "jpg" }),
}),
}),
);
expect(callGateway).toHaveBeenNthCalledWith(
3,
expect.objectContaining({
method: "node.invoke",
params: expect.objectContaining({
nodeId: "ios-node",
command: "camera.snap",
timeoutMs: 20000,
idempotencyKey: "idem-test",
params: expect.objectContaining({ facing: "back", format: "jpg" }),
}),
}),
);
const out = String(runtime.log.mock.calls[0]?.[0] ?? "");
const mediaPaths = out
.split("\n")
.filter((l) => l.startsWith("MEDIA:"))
.map((l) => l.replace(/^MEDIA:/, ""))
.filter(Boolean);
expect(mediaPaths).toHaveLength(2);
try {
for (const p of mediaPaths) {
await expect(fs.readFile(p, "utf8")).resolves.toBe("hi");
}
} finally {
await Promise.all(mediaPaths.map((p) => fs.unlink(p).catch(() => {})));
}
});
it("runs nodes camera clip and prints one MEDIA path", async () => {
callGateway
.mockResolvedValueOnce({
ts: Date.now(),
nodes: [
{
nodeId: "ios-node",
displayName: "iOS Node",
remoteIp: "192.168.0.88",
connected: true,
},
],
})
.mockResolvedValueOnce({
ok: true,
nodeId: "ios-node",
command: "camera.clip",
payload: {
format: "mp4",
base64: "aGk=",
durationMs: 3000,
hasAudio: true,
},
});
const program = buildProgram();
runtime.log.mockClear();
await program.parseAsync(
["nodes", "camera", "clip", "--node", "ios-node", "--duration", "3000"],
{ from: "user" },
);
expect(callGateway).toHaveBeenNthCalledWith(
2,
expect.objectContaining({
method: "node.invoke",
params: expect.objectContaining({
nodeId: "ios-node",
command: "camera.clip",
timeoutMs: 45000,
idempotencyKey: "idem-test",
params: expect.objectContaining({
facing: "front",
durationMs: 3000,
includeAudio: true,
format: "mp4",
}),
}),
}),
);
const out = String(runtime.log.mock.calls[0]?.[0] ?? "");
const mediaPath = out.replace(/^MEDIA:/, "").trim();
expect(mediaPath).toMatch(/clawdis-camera-clip-front-.*\.mp4$/);
try {
await expect(fs.readFile(mediaPath, "utf8")).resolves.toBe("hi");
} finally {
await fs.unlink(mediaPath).catch(() => {});
}
});
});