feat: route mac control via nodes

This commit is contained in:
Peter Steinberger
2025-12-19 03:16:25 +01:00
parent 7f3be083c1
commit 74cdc1cf3e
15 changed files with 470 additions and 293 deletions

View File

@@ -40,6 +40,7 @@ class MainActivity : ComponentActivity() {
viewModel.camera.attachLifecycleOwner(this)
viewModel.camera.attachPermissionRequester(permissionRequester)
viewModel.screenRecorder.attachScreenCaptureRequester(screenCaptureRequester)
viewModel.screenRecorder.attachPermissionRequester(permissionRequester)
lifecycleScope.launch {
repeatOnLifecycle(Lifecycle.State.STARTED) {

View File

@@ -16,11 +16,16 @@ class ScreenRecordManager(private val context: Context) {
data class Payload(val payloadJson: String)
@Volatile private var screenCaptureRequester: ScreenCaptureRequester? = null
@Volatile private var permissionRequester: com.steipete.clawdis.node.PermissionRequester? = null
/** Stores the requester later used to obtain the screen-capture (MediaProjection) consent. */
fun attachScreenCaptureRequester(requester: ScreenCaptureRequester) {
screenCaptureRequester = requester
}
/** Stores the requester used to prompt for runtime permissions (RECORD_AUDIO for mic capture). */
fun attachPermissionRequester(requester: com.steipete.clawdis.node.PermissionRequester) {
permissionRequester = requester
}
suspend fun record(paramsJson: String?): Payload =
withContext(Dispatchers.Default) {
val requester =
@@ -33,6 +38,7 @@ class ScreenRecordManager(private val context: Context) {
val fps = (parseFps(paramsJson) ?: 10.0).coerceIn(1.0, 60.0)
val fpsInt = fps.roundToInt().coerceIn(1, 60)
val screenIndex = parseScreenIndex(paramsJson)
val includeAudio = parseIncludeAudio(paramsJson) ?: true
val format = parseString(paramsJson, key = "format")
if (format != null && format.lowercase() != "mp4") {
throw IllegalArgumentException("INVALID_REQUEST: screen format must be mp4")
@@ -57,12 +63,23 @@ class ScreenRecordManager(private val context: Context) {
val densityDpi = metrics.densityDpi
val file = File.createTempFile("clawdis-screen-", ".mp4")
if (includeAudio) ensureMicPermission()
val recorder = MediaRecorder()
var virtualDisplay: android.hardware.display.VirtualDisplay? = null
try {
if (includeAudio) {
recorder.setAudioSource(MediaRecorder.AudioSource.MIC)
}
recorder.setVideoSource(MediaRecorder.VideoSource.SURFACE)
recorder.setOutputFormat(MediaRecorder.OutputFormat.MPEG_4)
recorder.setVideoEncoder(MediaRecorder.VideoEncoder.H264)
if (includeAudio) {
recorder.setAudioEncoder(MediaRecorder.AudioEncoder.AAC)
recorder.setAudioChannels(1)
recorder.setAudioSamplingRate(44_100)
recorder.setAudioEncodingBitRate(96_000)
}
recorder.setVideoSize(width, height)
recorder.setVideoFrameRate(fpsInt)
recorder.setVideoEncodingBitRate(estimateBitrate(width, height, fpsInt))
@@ -100,10 +117,27 @@ class ScreenRecordManager(private val context: Context) {
file.delete()
val base64 = Base64.encodeToString(bytes, Base64.NO_WRAP)
Payload(
"""{"format":"mp4","base64":"$base64","durationMs":$durationMs,"fps":$fpsInt,"screenIndex":0}""",
"""{"format":"mp4","base64":"$base64","durationMs":$durationMs,"fps":$fpsInt,"screenIndex":0,"hasAudio":$includeAudio}""",
)
}
/**
 * Ensures RECORD_AUDIO is granted before audio capture starts.
 *
 * Returns silently when the permission is already held. Otherwise asks the
 * attached [permissionRequester] for it, and throws IllegalStateException
 * ("MIC_PERMISSION_REQUIRED") when no requester is attached or the request
 * is denied.
 */
private suspend fun ensureMicPermission() {
    val micPermission = android.Manifest.permission.RECORD_AUDIO
    val alreadyGranted =
        androidx.core.content.ContextCompat.checkSelfPermission(context, micPermission) ==
            android.content.pm.PackageManager.PERMISSION_GRANTED
    if (alreadyGranted) return
    val requester = permissionRequester
        ?: throw IllegalStateException("MIC_PERMISSION_REQUIRED: grant Microphone permission")
    val outcome = requester.requestIfMissing(listOf(micPermission))
    if (outcome[micPermission] != true) {
        throw IllegalStateException("MIC_PERMISSION_REQUIRED: grant Microphone permission")
    }
}
// Best-effort extraction of the optional "durationMs" number from the raw params JSON.
private fun parseDurationMs(paramsJson: String?): Int? =
parseNumber(paramsJson, key = "durationMs")?.toIntOrNull()
@@ -113,6 +147,21 @@ class ScreenRecordManager(private val context: Context) {
// Best-effort extraction of the optional "screenIndex" number from the raw params JSON.
private fun parseScreenIndex(paramsJson: String?): Int? =
parseNumber(paramsJson, key = "screenIndex")?.toIntOrNull()
/**
 * Best-effort scan for the optional boolean "includeAudio" field in the raw
 * params JSON (no full JSON parse, matching the other parse* helpers).
 * Returns null when the key is absent or the value is neither a literal
 * `true` nor `false`.
 */
private fun parseIncludeAudio(paramsJson: String?): Boolean? {
    val raw = paramsJson ?: return null
    val needle = "\"includeAudio\""
    val keyPos = raw.indexOf(needle).takeIf { it >= 0 } ?: return null
    val colonPos = raw.indexOf(':', keyPos + needle.length).takeIf { it >= 0 } ?: return null
    val valueText = raw.substring(colonPos + 1).trimStart()
    return when {
        valueText.startsWith("true") -> true
        valueText.startsWith("false") -> false
        else -> null
    }
}
private fun parseNumber(paramsJson: String?, key: String): String? {
val raw = paramsJson ?: return null
val needle = "\"$key\""

View File

@@ -539,6 +539,7 @@ final class NodeAppModel {
screenIndex: params.screenIndex,
durationMs: params.durationMs,
fps: params.fps,
includeAudio: params.includeAudio,
outPath: nil)
defer { try? FileManager.default.removeItem(atPath: path) }
let data = try Data(contentsOf: URL(fileURLWithPath: path))
@@ -548,13 +549,15 @@ final class NodeAppModel {
var durationMs: Int?
var fps: Double?
var screenIndex: Int?
var hasAudio: Bool
}
let payload = try Self.encodePayload(Payload(
format: "mp4",
base64: data.base64EncodedString(),
durationMs: params.durationMs,
fps: params.fps,
screenIndex: params.screenIndex))
screenIndex: params.screenIndex,
hasAudio: params.includeAudio ?? true))
return BridgeInvokeResponse(id: req.id, ok: true, payloadJSON: payload)
default:

View File

@@ -1,20 +1,17 @@
import AVFoundation
import UIKit
import ReplayKit
@MainActor
final class ScreenRecordService {
enum ScreenRecordError: LocalizedError {
case noWindow
case invalidScreenIndex(Int)
case captureFailed(String)
case writeFailed(String)
var errorDescription: String? {
switch self {
case .noWindow:
return "Screen capture unavailable"
case let .invalidScreenIndex(idx):
return "Invalid screen index \(idx)"
case let .invalidScreenIndex(idx):
return "Invalid screen index \(idx)"
case let .captureFailed(msg):
return msg
case let .writeFailed(msg):
@@ -27,12 +24,18 @@ final class ScreenRecordService {
screenIndex: Int?,
durationMs: Int?,
fps: Double?,
includeAudio: Bool?,
outPath: String?) async throws -> String
{
let durationMs = Self.clampDurationMs(durationMs)
let fps = Self.clampFps(fps)
let fpsInt = Int32(fps.rounded())
let fpsValue = Double(fpsInt)
let includeAudio = includeAudio ?? true
if let idx = screenIndex, idx != 0 {
throw ScreenRecordError.invalidScreenIndex(idx)
}
let outURL: URL = {
if let outPath, !outPath.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty {
@@ -43,83 +46,124 @@ final class ScreenRecordService {
}()
try? FileManager.default.removeItem(at: outURL)
if let idx = screenIndex, idx != 0 {
throw ScreenRecordError.invalidScreenIndex(idx)
let recorder = RPScreenRecorder.shared()
recorder.isMicrophoneEnabled = includeAudio
var writer: AVAssetWriter?
var videoInput: AVAssetWriterInput?
var audioInput: AVAssetWriterInput?
var started = false
var sawVideo = false
var lastVideoTime: CMTime?
var handlerError: Error?
let lock = NSLock()
func setHandlerError(_ error: Error) {
lock.lock()
defer { lock.unlock() }
if handlerError == nil { handlerError = error }
}
guard let window = Self.resolveKeyWindow() else {
throw ScreenRecordError.noWindow
}
let size = window.bounds.size
let scale = window.screen.scale
let widthPx = max(1, Int(size.width * scale))
let heightPx = max(1, Int(size.height * scale))
let writer = try AVAssetWriter(outputURL: outURL, fileType: .mp4)
let settings: [String: Any] = [
AVVideoCodecKey: AVVideoCodecType.h264,
AVVideoWidthKey: widthPx,
AVVideoHeightKey: heightPx,
]
let input = AVAssetWriterInput(mediaType: .video, outputSettings: settings)
input.expectsMediaDataInRealTime = false
let attrs: [String: Any] = [
kCVPixelBufferPixelFormatTypeKey as String: kCVPixelFormatType_32BGRA,
kCVPixelBufferWidthKey as String: widthPx,
kCVPixelBufferHeightKey as String: heightPx,
kCVPixelBufferCGImageCompatibilityKey as String: true,
kCVPixelBufferCGBitmapContextCompatibilityKey as String: true,
]
let adaptor = AVAssetWriterInputPixelBufferAdaptor(
assetWriterInput: input,
sourcePixelBufferAttributes: attrs)
guard writer.canAdd(input) else {
throw ScreenRecordError.writeFailed("Cannot add video input")
}
writer.add(input)
guard writer.startWriting() else {
throw ScreenRecordError.writeFailed(writer.error?.localizedDescription ?? "Failed to start writer")
}
writer.startSession(atSourceTime: .zero)
let frameCount = max(1, Int((Double(durationMs) / 1000.0 * fpsValue).rounded(.up)))
let frameDuration = CMTime(value: 1, timescale: fpsInt)
let frameSleepNs = UInt64(1_000_000_000.0 / fpsValue)
for frame in 0..<frameCount {
while !input.isReadyForMoreMediaData {
try await Task.sleep(nanoseconds: 10_000_000)
}
var frameError: Error?
autoreleasepool {
do {
guard let image = Self.captureImage(window: window, size: size) else {
throw ScreenRecordError.captureFailed("Failed to capture frame")
}
guard let buffer = Self.pixelBuffer(from: image, width: widthPx, height: heightPx) else {
throw ScreenRecordError.captureFailed("Failed to render frame")
}
let time = CMTimeMultiply(frameDuration, multiplier: Int32(frame))
if !adaptor.append(buffer, withPresentationTime: time) {
throw ScreenRecordError.writeFailed("Failed to append frame")
}
} catch {
frameError = error
try await withCheckedThrowingContinuation { (cont: CheckedContinuation<Void, Error>) in
recorder.startCapture(handler: { sample, type, error in
if let error {
setHandlerError(error)
return
}
}
if let frameError { throw frameError }
guard CMSampleBufferDataIsReady(sample) else { return }
if frame < frameCount - 1 {
try await Task.sleep(nanoseconds: frameSleepNs)
}
switch type {
case .video:
let pts = CMSampleBufferGetPresentationTimeStamp(sample)
if let lastVideoTime {
let delta = CMTimeSubtract(pts, lastVideoTime)
if delta.seconds < (1.0 / fpsValue) { return }
}
if writer == nil {
guard let imageBuffer = CMSampleBufferGetImageBuffer(sample) else {
setHandlerError(ScreenRecordError.captureFailed("Missing image buffer"))
return
}
let width = CVPixelBufferGetWidth(imageBuffer)
let height = CVPixelBufferGetHeight(imageBuffer)
do {
let w = try AVAssetWriter(outputURL: outURL, fileType: .mp4)
let settings: [String: Any] = [
AVVideoCodecKey: AVVideoCodecType.h264,
AVVideoWidthKey: width,
AVVideoHeightKey: height,
]
let vInput = AVAssetWriterInput(mediaType: .video, outputSettings: settings)
vInput.expectsMediaDataInRealTime = true
guard w.canAdd(vInput) else {
throw ScreenRecordError.writeFailed("Cannot add video input")
}
w.add(vInput)
if includeAudio {
let aInput = AVAssetWriterInput(mediaType: .audio, outputSettings: nil)
aInput.expectsMediaDataInRealTime = true
if w.canAdd(aInput) {
w.add(aInput)
audioInput = aInput
}
}
guard w.startWriting() else {
throw ScreenRecordError.writeFailed(w.error?.localizedDescription ?? "Failed to start writer")
}
w.startSession(atSourceTime: pts)
writer = w
videoInput = vInput
started = true
} catch {
setHandlerError(error)
return
}
}
guard let vInput = videoInput, started else { return }
if vInput.isReadyForMoreMediaData {
if vInput.append(sample) {
sawVideo = true
lastVideoTime = pts
} else {
if let err = writer?.error {
setHandlerError(ScreenRecordError.writeFailed(err.localizedDescription))
}
}
}
case .audioApp, .audioMic:
guard includeAudio, let aInput = audioInput, started else { return }
if aInput.isReadyForMoreMediaData {
_ = aInput.append(sample)
}
@unknown default:
break
}
}, completionHandler: { error in
if let error { cont.resume(throwing: error) } else { cont.resume() }
})
}
input.markAsFinished()
try await Task.sleep(nanoseconds: UInt64(durationMs) * 1_000_000)
let stopError = await withCheckedContinuation { cont in
recorder.stopCapture { error in cont.resume(returning: error) }
}
if let stopError { throw stopError }
if let handlerError { throw handlerError }
guard let writer, let videoInput, sawVideo else {
throw ScreenRecordError.captureFailed("No frames captured")
}
videoInput.markAsFinished()
audioInput?.markAsFinished()
try await withCheckedThrowingContinuation { (cont: CheckedContinuation<Void, Error>) in
writer.finishWriting {
if let err = writer.error {
@@ -146,60 +190,4 @@ final class ScreenRecordService {
return min(30, max(1, v))
}
/// Picks a window to capture: the key window of any connected window scene,
/// falling back to that scene's first window; nil when no scene has a window.
private nonisolated static func resolveKeyWindow() -> UIWindow? {
    for case let windowScene as UIWindowScene in UIApplication.shared.connectedScenes {
        // Prefer the key window; any window from the same scene is an acceptable fallback.
        if let candidate = windowScene.windows.first(where: { $0.isKeyWindow }) ?? windowScene.windows.first {
            return candidate
        }
    }
    return nil
}
/// Renders the window's view hierarchy into a CGImage at the screen's scale.
/// Returns nil when the rendered UIImage has no backing CGImage.
private nonisolated static func captureImage(window: UIWindow, size: CGSize) -> CGImage? {
    let rendererFormat = UIGraphicsImageRendererFormat()
    rendererFormat.scale = window.screen.scale
    let snapshot = UIGraphicsImageRenderer(size: size, format: rendererFormat).image { _ in
        window.drawHierarchy(in: CGRect(origin: .zero, size: size), afterScreenUpdates: false)
    }
    return snapshot.cgImage
}
/// Draws `image` into a freshly created 32BGRA CVPixelBuffer of the given
/// pixel dimensions. Returns nil when buffer creation or CGContext setup fails.
private nonisolated static func pixelBuffer(from image: CGImage, width: Int, height: Int) -> CVPixelBuffer? {
var buffer: CVPixelBuffer?
let status = CVPixelBufferCreate(
kCFAllocatorDefault,
width,
height,
kCVPixelFormatType_32BGRA,
[
// CG compatibility flags so the buffer's memory can back the CGContext below.
kCVPixelBufferCGImageCompatibilityKey: true,
kCVPixelBufferCGBitmapContextCompatibilityKey: true,
] as CFDictionary,
&buffer)
guard status == kCVReturnSuccess, let buffer else { return nil }
// The base address must stay locked while the CGContext draws into it.
CVPixelBufferLockBaseAddress(buffer, [])
defer { CVPixelBufferUnlockBaseAddress(buffer, []) }
guard let context = CGContext(
data: CVPixelBufferGetBaseAddress(buffer),
width: width,
height: height,
bitsPerComponent: 8,
bytesPerRow: CVPixelBufferGetBytesPerRow(buffer),
space: CGColorSpaceCreateDeviceRGB(),
bitmapInfo: CGImageAlphaInfo.premultipliedFirst.rawValue
) else {
return nil
}
context.draw(image, in: CGRect(x: 0, y: 0, width: width, height: height))
return buffer
}
}

View File

@@ -4,8 +4,6 @@ import Foundation
import OSLog
enum ControlRequestHandler {
private static let cameraCapture = CameraCaptureService()
@MainActor private static let screenRecorder = ScreenRecordService()
struct NodeListNode: Codable {
var nodeId: String
@@ -135,11 +133,12 @@ enum ControlRequestHandler {
includeAudio: includeAudio,
outPath: outPath)
case let .screenRecord(screenIndex, durationMs, fps, outPath):
case let .screenRecord(screenIndex, durationMs, fps, includeAudio, outPath):
return await self.handleScreenRecord(
screenIndex: screenIndex,
durationMs: durationMs,
fps: fps,
includeAudio: includeAudio,
outPath: outPath)
}
}
@@ -242,50 +241,84 @@ enum ControlRequestHandler {
placement: CanvasPlacement?) async -> Response
{
guard self.canvasEnabled() else { return Response(ok: false, message: "Canvas disabled by user") }
let logger = Logger(subsystem: "com.steipete.clawdis", category: "CanvasControl")
logger.info("canvas show start session=\(session, privacy: .public) path=\(path ?? "", privacy: .public)")
_ = session
do {
logger.info("canvas show awaiting CanvasManager")
let res = try await CanvasManager.shared.showDetailed(
sessionKey: session,
target: path,
placement: placement)
logger
.info(
"canvas show done dir=\(res.directory, privacy: .public) status=\(String(describing: res.status), privacy: .public)")
let payload = try? JSONEncoder().encode(res)
return Response(ok: true, message: res.directory, payload: payload)
if let path, !path.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty {
_ = try await self.invokeLocalNode(
command: ClawdisCanvasCommand.navigate.rawValue,
params: ["url": path],
timeoutMs: 20000)
} else {
_ = try await self.invokeLocalNode(
command: ClawdisCanvasCommand.show.rawValue,
params: nil,
timeoutMs: 20000)
}
if placement != nil {
return Response(ok: true, message: "Canvas placement ignored (node mode)")
}
return Response(ok: true)
} catch {
logger.error("canvas show failed \(error.localizedDescription, privacy: .public)")
return Response(ok: false, message: error.localizedDescription)
}
}
private static func handleCanvasHide(session: String) async -> Response {
await CanvasManager.shared.hide(sessionKey: session)
return Response(ok: true)
_ = session
do {
_ = try await self.invokeLocalNode(
command: ClawdisCanvasCommand.hide.rawValue,
params: nil,
timeoutMs: 10000)
return Response(ok: true)
} catch {
return Response(ok: false, message: error.localizedDescription)
}
}
private static func handleCanvasEval(session: String, javaScript: String) async -> Response {
guard self.canvasEnabled() else { return Response(ok: false, message: "Canvas disabled by user") }
let logger = Logger(subsystem: "com.steipete.clawdis", category: "CanvasControl")
logger.info("canvas eval start session=\(session, privacy: .public) bytes=\(javaScript.utf8.count)")
_ = session
do {
logger.info("canvas eval awaiting CanvasManager.eval")
let result = try await CanvasManager.shared.eval(sessionKey: session, javaScript: javaScript)
logger.info("canvas eval done bytes=\(result.utf8.count)")
return Response(ok: true, payload: Data(result.utf8))
let payload = try await self.invokeLocalNode(
command: ClawdisCanvasCommand.evalJS.rawValue,
params: ["javaScript": javaScript],
timeoutMs: 20000)
if let dict = payload as? [String: Any],
let result = dict["result"] as? String
{
return Response(ok: true, payload: Data(result.utf8))
}
return Response(ok: true)
} catch {
logger.error("canvas eval failed \(error.localizedDescription, privacy: .public)")
return Response(ok: false, message: error.localizedDescription)
}
}
private static func handleCanvasSnapshot(session: String, outPath: String?) async -> Response {
guard self.canvasEnabled() else { return Response(ok: false, message: "Canvas disabled by user") }
_ = session
do {
let path = try await CanvasManager.shared.snapshot(sessionKey: session, outPath: outPath)
return Response(ok: true, message: path)
let payload = try await self.invokeLocalNode(
command: ClawdisCanvasCommand.snapshot.rawValue,
params: [:],
timeoutMs: 20000)
guard let dict = payload as? [String: Any],
let format = dict["format"] as? String,
let base64 = dict["base64"] as? String,
let data = Data(base64Encoded: base64)
else {
return Response(ok: false, message: "invalid canvas snapshot payload")
}
let ext = (format.lowercased() == "jpeg" || format.lowercased() == "jpg") ? "jpg" : "png"
let url: URL = if let outPath, !outPath.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty {
URL(fileURLWithPath: outPath)
} else {
FileManager.default.temporaryDirectory
.appendingPathComponent("clawdis-canvas-snapshot-\(UUID().uuidString).\(ext)")
}
try data.write(to: url, options: [.atomic])
return Response(ok: true, message: url.path)
} catch {
return Response(ok: false, message: error.localizedDescription)
}
@@ -297,112 +330,38 @@ enum ControlRequestHandler {
jsonl: String?) async -> Response
{
guard self.canvasEnabled() else { return Response(ok: false, message: "Canvas disabled by user") }
_ = session
do {
// Ensure the Canvas is visible without forcing a navigation/reload.
_ = try await CanvasManager.shared.show(sessionKey: session, path: nil)
// Wait for the in-page A2UI bridge. If it doesn't appear, force-load the bundled A2UI shell once.
var ready = await Self.waitForCanvasA2UI(session: session, requireBuiltinPath: false, timeoutMs: 2000)
if !ready {
_ = try await CanvasManager.shared.show(sessionKey: session, path: "/__clawdis__/a2ui/")
ready = await Self.waitForCanvasA2UI(session: session, requireBuiltinPath: true, timeoutMs: 5000)
}
guard ready else { return Response(ok: false, message: "A2UI not ready") }
let js: String
switch command {
case .reset:
js = """
(() => {
try {
if (!globalThis.clawdisA2UI) { return JSON.stringify({ ok: false, error: "missing clawdisA2UI" }); }
return JSON.stringify(globalThis.clawdisA2UI.reset());
} catch (e) {
return JSON.stringify({ ok: false, error: String(e?.message ?? e), stack: e?.stack });
}
})()
"""
let payload = try await self.invokeLocalNode(
command: ClawdisCanvasA2UICommand.reset.rawValue,
params: nil,
timeoutMs: 20000)
if let payload {
let data = try JSONSerialization.data(withJSONObject: payload)
return Response(ok: true, payload: data)
}
return Response(ok: true)
case .pushJSONL:
guard let jsonl, !jsonl.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty else {
return Response(ok: false, message: "missing jsonl")
}
let messages: [ClawdisKit.AnyCodable]
do {
messages = try ClawdisCanvasA2UIJSONL.decodeMessagesFromJSONL(jsonl)
} catch {
return Response(ok: false, message: error.localizedDescription)
let payload = try await self.invokeLocalNode(
command: ClawdisCanvasA2UICommand.pushJSONL.rawValue,
params: ["jsonl": jsonl],
timeoutMs: 30000)
if let payload {
let data = try JSONSerialization.data(withJSONObject: payload)
return Response(ok: true, payload: data)
}
let json: String
do {
json = try ClawdisCanvasA2UIJSONL.encodeMessagesJSONArray(messages)
} catch {
return Response(ok: false, message: error.localizedDescription)
}
js = """
(() => {
try {
if (!globalThis.clawdisA2UI) { return JSON.stringify({ ok: false, error: "missing clawdisA2UI" }); }
const messages = \(json);
return JSON.stringify(globalThis.clawdisA2UI.applyMessages(messages));
} catch (e) {
return JSON.stringify({ ok: false, error: String(e?.message ?? e), stack: e?.stack });
}
})()
"""
return Response(ok: true)
}
let result = try await CanvasManager.shared.eval(sessionKey: session, javaScript: js)
let payload = Data(result.utf8)
if let obj = try? JSONSerialization.jsonObject(with: payload, options: []) as? [String: Any],
let ok = obj["ok"] as? Bool
{
let error = obj["error"] as? String
return Response(ok: ok, message: ok ? "" : (error ?? "A2UI error"), payload: payload)
}
return Response(ok: true, payload: payload)
} catch {
return Response(ok: false, message: error.localizedDescription)
}
}
/// Polls the Canvas page (via JS eval) until the in-page A2UI bridge reports
/// ready or `timeoutMs` elapses. When `requireBuiltinPath` is true the page
/// must also be serving from /__clawdis__/a2ui. Returns true once ready,
/// false on timeout. Eval errors are swallowed and polling continues.
private static func waitForCanvasA2UI(session: String, requireBuiltinPath: Bool, timeoutMs: Int) async -> Bool {
let clock = ContinuousClock()
let deadline = clock.now.advanced(by: .milliseconds(timeoutMs))
while clock.now < deadline {
do {
// The probe returns 'ready' only when the document is loaded and the
// clawdisA2UI bridge (with applyMessages) is installed.
let res = try await CanvasManager.shared.eval(
sessionKey: session,
javaScript: """
(() => {
try {
if (document?.readyState !== 'complete') { return ''; }
if (!globalThis.clawdisA2UI) { return ''; }
if (typeof globalThis.clawdisA2UI.applyMessages !== 'function') { return ''; }
if (\(requireBuiltinPath ? "true" : "false")) {
const p = String(location?.pathname ?? '');
if (!p.startsWith('/__clawdis__/a2ui')) { return ''; }
}
return 'ready';
} catch {
return '';
}
})()
""")
if res == "ready" { return true }
} catch {
// Ignore; keep waiting.
}
// Poll roughly every 60ms until the deadline.
try? await Task.sleep(nanoseconds: 60_000_000)
}
return false
}
private static func handleNodeList() async -> Response {
do {
let data = try await GatewayConnection.shared.request(
@@ -509,15 +468,33 @@ enum ControlRequestHandler {
{
guard self.cameraEnabled() else { return Response(ok: false, message: "Camera disabled by user") }
do {
let res = try await self.cameraCapture.snap(facing: facing, maxWidth: maxWidth, quality: quality)
var params: [String: Any] = [:]
if let facing { params["facing"] = facing.rawValue }
if let maxWidth { params["maxWidth"] = maxWidth }
if let quality { params["quality"] = quality }
params["format"] = "jpg"
let payload = try await self.invokeLocalNode(
command: ClawdisCameraCommand.snap.rawValue,
params: params,
timeoutMs: 30000)
guard let dict = payload as? [String: Any],
let format = dict["format"] as? String,
let base64 = dict["base64"] as? String,
let data = Data(base64Encoded: base64)
else {
return Response(ok: false, message: "invalid camera snapshot payload")
}
let ext = (format.lowercased() == "jpeg" || format.lowercased() == "jpg") ? "jpg" : format.lowercased()
let url: URL = if let outPath, !outPath.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty {
URL(fileURLWithPath: outPath)
} else {
FileManager.default.temporaryDirectory
.appendingPathComponent("clawdis-camera-snap-\(UUID().uuidString).jpg")
.appendingPathComponent("clawdis-camera-snap-\(UUID().uuidString).\(ext)")
}
try res.data.write(to: url, options: [.atomic])
try data.write(to: url, options: [.atomic])
return Response(ok: true, message: url.path)
} catch {
return Response(ok: false, message: error.localizedDescription)
@@ -532,12 +509,31 @@ enum ControlRequestHandler {
{
guard self.cameraEnabled() else { return Response(ok: false, message: "Camera disabled by user") }
do {
let res = try await self.cameraCapture.clip(
facing: facing,
durationMs: durationMs,
includeAudio: includeAudio,
outPath: outPath)
return Response(ok: true, message: res.path)
var params: [String: Any] = ["includeAudio": includeAudio, "format": "mp4"]
if let facing { params["facing"] = facing.rawValue }
if let durationMs { params["durationMs"] = durationMs }
let payload = try await self.invokeLocalNode(
command: ClawdisCameraCommand.clip.rawValue,
params: params,
timeoutMs: 90000)
guard let dict = payload as? [String: Any],
let format = dict["format"] as? String,
let base64 = dict["base64"] as? String,
let data = Data(base64Encoded: base64)
else {
return Response(ok: false, message: "invalid camera clip payload")
}
let ext = format.lowercased()
let url: URL = if let outPath, !outPath.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty {
URL(fileURLWithPath: outPath)
} else {
FileManager.default.temporaryDirectory
.appendingPathComponent("clawdis-camera-clip-\(UUID().uuidString).\(ext)")
}
try data.write(to: url, options: [.atomic])
return Response(ok: true, message: url.path)
} catch {
return Response(ok: false, message: error.localizedDescription)
}
@@ -547,23 +543,69 @@ enum ControlRequestHandler {
screenIndex: Int?,
durationMs: Int?,
fps: Double?,
includeAudio: Bool,
outPath: String?) async -> Response
{
let authorized = await PermissionManager
.ensure([.screenRecording], interactive: false)[.screenRecording] ?? false
guard authorized else { return Response(ok: false, message: "screen recording permission missing") }
do {
let path = try await Task { @MainActor in
try await self.screenRecorder.record(
screenIndex: screenIndex,
durationMs: durationMs,
fps: fps,
outPath: outPath)
}.value
return Response(ok: true, message: path)
var params: [String: Any] = ["format": "mp4", "includeAudio": includeAudio]
if let screenIndex { params["screenIndex"] = screenIndex }
if let durationMs { params["durationMs"] = durationMs }
if let fps { params["fps"] = fps }
let payload = try await self.invokeLocalNode(
command: "screen.record",
params: params,
timeoutMs: 120000)
guard let dict = payload as? [String: Any],
let base64 = dict["base64"] as? String,
let data = Data(base64Encoded: base64)
else {
return Response(ok: false, message: "invalid screen record payload")
}
let url: URL = if let outPath, !outPath.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty {
URL(fileURLWithPath: outPath)
} else {
FileManager.default.temporaryDirectory
.appendingPathComponent("clawdis-screen-record-\(UUID().uuidString).mp4")
}
try data.write(to: url, options: [.atomic])
return Response(ok: true, message: url.path)
} catch {
return Response(ok: false, message: error.localizedDescription)
}
}
/// Sends `command` to this machine's own node through the gateway
/// ("node.invoke") and returns the decoded `payload` from the response, if any.
private static func invokeLocalNode(
command: String,
params: [String: Any]?,
timeoutMs: Int) async throws -> Any?
{
    var requestParams: [String: AnyCodable] = [
        "nodeId": AnyCodable(Self.localNodeId()),
        "command": AnyCodable(command),
        // Fresh key per call: every invocation is a distinct request.
        "idempotencyKey": AnyCodable(UUID().uuidString),
    ]
    if let params {
        requestParams["params"] = AnyCodable(params)
    }
    let responseData = try await GatewayConnection.shared.request(
        method: "node.invoke",
        params: requestParams,
        timeoutMs: timeoutMs)
    return try Self.decodeNodeInvokePayload(data: responseData)
}
/// Parses a node.invoke gateway response and extracts its "payload" value.
/// Throws when the response body is not a JSON object.
private static func decodeNodeInvokePayload(data: Data) throws -> Any? {
    guard let envelope = try JSONSerialization.jsonObject(with: data) as? [String: Any] else {
        throw NSError(domain: "Node", code: 30, userInfo: [
            NSLocalizedDescriptionKey: "invalid node invoke response",
        ])
    }
    return envelope["payload"]
}
/// Node id this Mac registers under on the gateway: "mac-" + instance id.
private static func localNodeId() -> String {
"mac-\(InstanceIdentity.instanceId)"
}
}

View File

@@ -9,6 +9,14 @@ actor MacNodeRuntime {
func handleInvoke(_ req: BridgeInvokeRequest) async -> BridgeInvokeResponse {
let command = req.command
if (command.hasPrefix("canvas.") || command.hasPrefix("canvas.a2ui.")) && !Self.canvasEnabled() {
return BridgeInvokeResponse(
id: req.id,
ok: false,
error: ClawdisNodeError(
code: .unavailable,
message: "CANVAS_DISABLED: enable Canvas in Settings"))
}
do {
switch command {
case ClawdisCanvasCommand.show.rawValue:
@@ -141,26 +149,29 @@ actor MacNodeRuntime {
code: .invalidRequest,
message: "INVALID_REQUEST: screen format must be mp4")
}
let path = try await self.screenRecorder.record(
let res = try await self.screenRecorder.record(
screenIndex: params.screenIndex,
durationMs: params.durationMs,
fps: params.fps,
includeAudio: params.includeAudio,
outPath: nil)
defer { try? FileManager.default.removeItem(atPath: path) }
let data = try Data(contentsOf: URL(fileURLWithPath: path))
defer { try? FileManager.default.removeItem(atPath: res.path) }
let data = try Data(contentsOf: URL(fileURLWithPath: res.path))
struct ScreenPayload: Encodable {
var format: String
var base64: String
var durationMs: Int?
var fps: Double?
var screenIndex: Int?
var hasAudio: Bool
}
let payload = try Self.encodePayload(ScreenPayload(
format: "mp4",
base64: data.base64EncodedString(),
durationMs: params.durationMs,
fps: params.fps,
screenIndex: params.screenIndex))
screenIndex: params.screenIndex,
hasAudio: res.hasAudio))
return BridgeInvokeResponse(id: req.id, ok: true, payloadJSON: payload)
default:
@@ -246,6 +257,10 @@ actor MacNodeRuntime {
return json
}
// Canvas defaults to enabled when the preference has never been set.
private nonisolated static func canvasEnabled() -> Bool {
UserDefaults.standard.object(forKey: canvasEnabledKey) as? Bool ?? true
}
// Camera is opt-in: defaults to disabled until the user turns it on.
private nonisolated static func cameraEnabled() -> Bool {
UserDefaults.standard.object(forKey: cameraEnabledKey) as? Bool ?? false
}

View File

@@ -9,4 +9,5 @@ struct MacNodeScreenRecordParams: Codable, Sendable, Equatable {
var durationMs: Int?
var fps: Double?
var format: String?
var includeAudio: Bool?
}

View File

@@ -31,10 +31,12 @@ final class ScreenRecordService {
screenIndex: Int?,
durationMs: Int?,
fps: Double?,
outPath: String?) async throws -> String
includeAudio: Bool?,
outPath: String?) async throws -> (path: String, hasAudio: Bool)
{
let durationMs = Self.clampDurationMs(durationMs)
let fps = Self.clampFps(fps)
let includeAudio = includeAudio ?? false
let outURL: URL = {
if let outPath, !outPath.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty {
@@ -60,15 +62,22 @@ final class ScreenRecordService {
config.queueDepth = 8
config.showsCursor = true
config.minimumFrameInterval = CMTime(value: 1, timescale: CMTimeScale(max(1, Int32(fps.rounded()))))
if includeAudio {
config.capturesAudio = true
}
let recorder = try StreamRecorder(
outputURL: outURL,
width: display.width,
height: display.height,
includeAudio: includeAudio,
logger: self.logger)
let stream = SCStream(filter: filter, configuration: config, delegate: recorder)
try stream.addStreamOutput(recorder, type: .screen, sampleHandlerQueue: recorder.queue)
if includeAudio {
try stream.addStreamOutput(recorder, type: .audio, sampleHandlerQueue: recorder.queue)
}
self.logger.info(
"screen record start idx=\(idx) durationMs=\(durationMs) fps=\(fps) out=\(outURL.path, privacy: .public)")
@@ -85,7 +94,7 @@ final class ScreenRecordService {
}
try await recorder.finish()
return outURL.path
return (path: outURL.path, hasAudio: recorder.hasAudio)
}
private nonisolated static func clampDurationMs(_ ms: Int?) -> Int {
@@ -106,13 +115,15 @@ private final class StreamRecorder: NSObject, SCStreamOutput, SCStreamDelegate,
private let logger: Logger
private let writer: AVAssetWriter
private let input: AVAssetWriterInput
private let audioInput: AVAssetWriterInput?
let hasAudio: Bool
private var started = false
private var sawFrame = false
private var didFinish = false
private var pendingErrorMessage: String?
init(outputURL: URL, width: Int, height: Int, logger: Logger) throws {
init(outputURL: URL, width: Int, height: Int, includeAudio: Bool, logger: Logger) throws {
self.logger = logger
self.writer = try AVAssetWriter(outputURL: outputURL, fileType: .mp4)
@@ -128,6 +139,28 @@ private final class StreamRecorder: NSObject, SCStreamOutput, SCStreamDelegate,
throw ScreenRecordService.ScreenRecordError.writeFailed("Cannot add video input")
}
self.writer.add(self.input)
if includeAudio {
let audioSettings: [String: Any] = [
AVFormatIDKey: kAudioFormatMPEG4AAC,
AVNumberOfChannelsKey: 1,
AVSampleRateKey: 44_100,
AVEncoderBitRateKey: 96_000,
]
let audioInput = AVAssetWriterInput(mediaType: .audio, outputSettings: audioSettings)
audioInput.expectsMediaDataInRealTime = true
if self.writer.canAdd(audioInput) {
self.writer.add(audioInput)
self.audioInput = audioInput
self.hasAudio = true
} else {
self.audioInput = nil
self.hasAudio = false
}
} else {
self.audioInput = nil
self.hasAudio = false
}
super.init()
}
@@ -145,14 +178,20 @@ private final class StreamRecorder: NSObject, SCStreamOutput, SCStreamDelegate,
didOutputSampleBuffer sampleBuffer: CMSampleBuffer,
of type: SCStreamOutputType)
{
guard type == .screen else { return }
guard CMSampleBufferDataIsReady(sampleBuffer) else { return }
// Callback runs on `sampleHandlerQueue` (`self.queue`).
self.handle(sampleBuffer: sampleBuffer)
switch type {
case .screen:
self.handleVideo(sampleBuffer: sampleBuffer)
case .audio:
self.handleAudio(sampleBuffer: sampleBuffer)
@unknown default:
break
}
_ = stream
}
private func handle(sampleBuffer: CMSampleBuffer) {
private func handleVideo(sampleBuffer: CMSampleBuffer) {
if let msg = self.pendingErrorMessage {
self.logger.error("screen record aborting due to prior error: \(msg, privacy: .public)")
return
@@ -175,6 +214,18 @@ private final class StreamRecorder: NSObject, SCStreamOutput, SCStreamDelegate,
}
}
/// Feeds one audio sample into the writer's audio input.
/// No-ops when audio was not configured, when a prior write error was
/// recorded, or when the session has not started / has already finished.
/// Runs on `self.queue` (the stream's sample-handler queue).
private func handleAudio(sampleBuffer: CMSampleBuffer) {
    // Audio input is nil when the caller disabled audio or the writer rejected it.
    guard let audioInput else { return }
    guard self.pendingErrorMessage == nil else {
        if let msg = self.pendingErrorMessage {
            self.logger.error("screen record audio aborting due to prior error: \(msg, privacy: .public)")
        }
        return
    }
    // Drop samples outside the active recording window (before first video
    // frame started the session, or after finish() was requested).
    guard self.started, !self.didFinish else { return }
    guard audioInput.isReadyForMoreMediaData else { return }
    _ = audioInput.append(sampleBuffer)
}
func finish() async throws {
try await withCheckedThrowingContinuation { (cont: CheckedContinuation<Void, Error>) in
self.queue.async {
@@ -193,6 +244,7 @@ private final class StreamRecorder: NSObject, SCStreamOutput, SCStreamDelegate,
self.didFinish = true
self.input.markAsFinished()
self.audioInput?.markAsFinished()
self.writer.finishWriting {
if let err = self.writer.error {
cont.resume(throwing: ScreenRecordService.ScreenRecordError.writeFailed(err.localizedDescription))
@@ -206,4 +258,3 @@ private final class StreamRecorder: NSObject, SCStreamOutput, SCStreamDelegate,
}
}
}

View File

@@ -476,6 +476,7 @@ struct ClawdisCLI {
var durationMs: Int?
var fps: Double?
var outPath: String?
var includeAudio = true
while !args.isEmpty {
let arg = args.removeFirst()
switch arg {
@@ -487,6 +488,8 @@ struct ClawdisCLI {
durationMs = args.popFirst().flatMap(Int.init)
case "--fps":
fps = args.popFirst().flatMap(Double.init)
case "--no-audio":
includeAudio = false
case "--out":
outPath = args.popFirst()
default:
@@ -494,7 +497,12 @@ struct ClawdisCLI {
}
}
return ParsedCLIRequest(
request: .screenRecord(screenIndex: screenIndex, durationMs: durationMs, fps: fps, outPath: outPath),
request: .screenRecord(
screenIndex: screenIndex,
durationMs: durationMs,
fps: fps,
includeAudio: includeAudio,
outPath: outPath),
kind: .mediaPath)
default:
@@ -766,7 +774,7 @@ struct ClawdisCLI {
Screen:
clawdis-mac screen record [--screen <index>]
[--duration <ms|10s|1m>|--duration-ms <ms>] [--fps <n>] [--out <path>]
[--duration <ms|10s|1m>|--duration-ms <ms>] [--fps <n>] [--no-audio] [--out <path>]
Browser (clawd):
clawdis-mac browser status|start|stop|tabs|open|focus|close|screenshot|eval|query|dom|snapshot
@@ -1000,7 +1008,7 @@ struct ClawdisCLI {
case let .cameraClip(_, durationMs, _, _):
let ms = durationMs ?? 3000
return min(180, max(10, TimeInterval(ms) / 1000.0 + 10))
case let .screenRecord(_, durationMs, _, _):
case let .screenRecord(_, durationMs, _, _, _):
let ms = durationMs ?? 10_000
return min(180, max(10, TimeInterval(ms) / 1000.0 + 10))
default:

View File

@@ -132,7 +132,7 @@ public enum Request: Sendable {
case nodeInvoke(nodeId: String, command: String, paramsJSON: String?)
case cameraSnap(facing: CameraFacing?, maxWidth: Int?, quality: Double?, outPath: String?)
case cameraClip(facing: CameraFacing?, durationMs: Int?, includeAudio: Bool, outPath: String?)
case screenRecord(screenIndex: Int?, durationMs: Int?, fps: Double?, outPath: String?)
case screenRecord(screenIndex: Int?, durationMs: Int?, fps: Double?, includeAudio: Bool, outPath: String?)
}
// MARK: - Responses
@@ -289,11 +289,12 @@ extension Request: Codable {
try container.encode(includeAudio, forKey: .includeAudio)
try container.encodeIfPresent(outPath, forKey: .outPath)
case let .screenRecord(screenIndex, durationMs, fps, outPath):
case let .screenRecord(screenIndex, durationMs, fps, includeAudio, outPath):
try container.encode(Kind.screenRecord, forKey: .type)
try container.encodeIfPresent(screenIndex, forKey: .screenIndex)
try container.encodeIfPresent(durationMs, forKey: .durationMs)
try container.encodeIfPresent(fps, forKey: .fps)
try container.encode(includeAudio, forKey: .includeAudio)
try container.encodeIfPresent(outPath, forKey: .outPath)
}
}
@@ -394,8 +395,14 @@ extension Request: Codable {
let screenIndex = try container.decodeIfPresent(Int.self, forKey: .screenIndex)
let durationMs = try container.decodeIfPresent(Int.self, forKey: .durationMs)
let fps = try container.decodeIfPresent(Double.self, forKey: .fps)
let includeAudio = (try? container.decode(Bool.self, forKey: .includeAudio)) ?? true
let outPath = try container.decodeIfPresent(String.self, forKey: .outPath)
self = .screenRecord(screenIndex: screenIndex, durationMs: durationMs, fps: fps, outPath: outPath)
self = .screenRecord(
screenIndex: screenIndex,
durationMs: durationMs,
fps: fps,
includeAudio: includeAudio,
outPath: outPath)
}
}
}

View File

@@ -9,16 +9,19 @@ public struct ClawdisScreenRecordParams: Codable, Sendable, Equatable {
public var durationMs: Int?
public var fps: Double?
public var format: String?
public var includeAudio: Bool?
public init(
screenIndex: Int? = nil,
durationMs: Int? = nil,
fps: Double? = nil,
format: String? = nil)
format: String? = nil,
includeAudio: Bool? = nil)
{
self.screenIndex = screenIndex
self.durationMs = durationMs
self.fps = fps
self.format = format
self.includeAudio = includeAudio
}
}

View File

@@ -81,12 +81,14 @@ Nodes expose `screen.record` (mp4). Example:
```bash
clawdis nodes screen record --node <idOrNameOrIp> --duration 10s --fps 10
clawdis nodes screen record --node <idOrNameOrIp> --duration 10s --fps 10 --no-audio
```
Notes:
- `screen.record` requires the node app to be foregrounded.
- Android will show the system screen-capture prompt before recording.
- Screen recordings are clamped to `<= 60s`.
- `--no-audio` disables audio capture. On iOS/Android this controls the microphone; on macOS audio comes from the system screen-capture stream (ScreenCaptureKit), not the microphone.
## Mac node mode

View File

@@ -782,6 +782,7 @@ export function registerNodesCli(program: Command) {
.option("--screen <index>", "Screen index (0 = primary)", "0")
.option("--duration <ms|10s>", "Clip duration (ms or 10s)", "10000")
.option("--fps <fps>", "Frames per second", "10")
.option("--no-audio", "Disable microphone audio capture")
.option("--out <path>", "Output path")
.option(
"--invoke-timeout <ms>",
@@ -808,6 +809,7 @@ export function registerNodesCli(program: Command) {
: undefined,
fps: Number.isFinite(fps) ? fps : undefined,
format: "mp4",
includeAudio: opts.audio !== false,
},
idempotencyKey: randomIdempotencyKey(),
};
@@ -844,6 +846,7 @@ export function registerNodesCli(program: Command) {
durationMs: parsed.durationMs,
fps: parsed.fps,
screenIndex: parsed.screenIndex,
hasAudio: parsed.hasAudio,
},
},
null,

View File

@@ -13,12 +13,14 @@ describe("nodes screen helpers", () => {
durationMs: 1000,
fps: 12,
screenIndex: 0,
hasAudio: true,
});
expect(payload.format).toBe("mp4");
expect(payload.base64).toBe("Zm9v");
expect(payload.durationMs).toBe(1000);
expect(payload.fps).toBe(12);
expect(payload.screenIndex).toBe(0);
expect(payload.hasAudio).toBe(true);
});
it("rejects invalid screen.record payload", () => {

View File

@@ -10,6 +10,7 @@ export type ScreenRecordPayload = {
durationMs?: number;
fps?: number;
screenIndex?: number;
hasAudio?: boolean;
};
function asRecord(value: unknown): Record<string, unknown> {
@@ -36,6 +37,7 @@ export function parseScreenRecordPayload(value: unknown): ScreenRecordPayload {
fps: typeof obj.fps === "number" ? obj.fps : undefined,
screenIndex:
typeof obj.screenIndex === "number" ? obj.screenIndex : undefined,
hasAudio: typeof obj.hasAudio === "boolean" ? obj.hasAudio : undefined,
};
}