diff --git a/apps/android/app/src/main/java/com/steipete/clawdis/node/MainActivity.kt b/apps/android/app/src/main/java/com/steipete/clawdis/node/MainActivity.kt index 609c08bf8..8be49d36d 100644 --- a/apps/android/app/src/main/java/com/steipete/clawdis/node/MainActivity.kt +++ b/apps/android/app/src/main/java/com/steipete/clawdis/node/MainActivity.kt @@ -40,6 +40,7 @@ class MainActivity : ComponentActivity() { viewModel.camera.attachLifecycleOwner(this) viewModel.camera.attachPermissionRequester(permissionRequester) viewModel.screenRecorder.attachScreenCaptureRequester(screenCaptureRequester) + viewModel.screenRecorder.attachPermissionRequester(permissionRequester) lifecycleScope.launch { repeatOnLifecycle(Lifecycle.State.STARTED) { diff --git a/apps/android/app/src/main/java/com/steipete/clawdis/node/node/ScreenRecordManager.kt b/apps/android/app/src/main/java/com/steipete/clawdis/node/node/ScreenRecordManager.kt index 4de891ff3..049f43659 100644 --- a/apps/android/app/src/main/java/com/steipete/clawdis/node/node/ScreenRecordManager.kt +++ b/apps/android/app/src/main/java/com/steipete/clawdis/node/node/ScreenRecordManager.kt @@ -16,11 +16,16 @@ class ScreenRecordManager(private val context: Context) { data class Payload(val payloadJson: String) @Volatile private var screenCaptureRequester: ScreenCaptureRequester? = null + @Volatile private var permissionRequester: com.steipete.clawdis.node.PermissionRequester? = null fun attachScreenCaptureRequester(requester: ScreenCaptureRequester) { screenCaptureRequester = requester } + fun attachPermissionRequester(requester: com.steipete.clawdis.node.PermissionRequester) { + permissionRequester = requester + } + suspend fun record(paramsJson: String?): Payload = withContext(Dispatchers.Default) { val requester = @@ -33,6 +38,7 @@ class ScreenRecordManager(private val context: Context) { val fps = (parseFps(paramsJson) ?: 10.0).coerceIn(1.0, 60.0) val fpsInt = fps.roundToInt().coerceIn(1, 60) val screenIndex = parseScreenIndex(paramsJson) + val includeAudio = parseIncludeAudio(paramsJson) ?: true val format = parseString(paramsJson, key = "format") if (format != null && format.lowercase() != "mp4") { throw IllegalArgumentException("INVALID_REQUEST: screen format must be mp4") @@ -57,12 +63,23 @@ class ScreenRecordManager(private val context: Context) { val densityDpi = metrics.densityDpi val file = File.createTempFile("clawdis-screen-", ".mp4") + if (includeAudio) ensureMicPermission() + val recorder = MediaRecorder() var virtualDisplay: android.hardware.display.VirtualDisplay? = null try { + if (includeAudio) { + recorder.setAudioSource(MediaRecorder.AudioSource.MIC) + } recorder.setVideoSource(MediaRecorder.VideoSource.SURFACE) recorder.setOutputFormat(MediaRecorder.OutputFormat.MPEG_4) recorder.setVideoEncoder(MediaRecorder.VideoEncoder.H264) + if (includeAudio) { + recorder.setAudioEncoder(MediaRecorder.AudioEncoder.AAC) + recorder.setAudioChannels(1) + recorder.setAudioSamplingRate(44_100) + recorder.setAudioEncodingBitRate(96_000) + } recorder.setVideoSize(width, height) recorder.setVideoFrameRate(fpsInt) recorder.setVideoEncodingBitRate(estimateBitrate(width, height, fpsInt)) @@ -100,10 +117,27 @@ class ScreenRecordManager(private val context: Context) { file.delete() val base64 = Base64.encodeToString(bytes, Base64.NO_WRAP) Payload( - """{"format":"mp4","base64":"$base64","durationMs":$durationMs,"fps":$fpsInt,"screenIndex":0}""", + """{"format":"mp4","base64":"$base64","durationMs":$durationMs,"fps":$fpsInt,"screenIndex":0,"hasAudio":$includeAudio}""", ) } + private suspend fun ensureMicPermission() { + val granted = + androidx.core.content.ContextCompat.checkSelfPermission( + context, + android.Manifest.permission.RECORD_AUDIO, + ) == android.content.pm.PackageManager.PERMISSION_GRANTED + if (granted) return + + val requester = + permissionRequester + ?: throw IllegalStateException("MIC_PERMISSION_REQUIRED: grant Microphone permission") + val results = requester.requestIfMissing(listOf(android.Manifest.permission.RECORD_AUDIO)) + if (results[android.Manifest.permission.RECORD_AUDIO] != true) { + throw IllegalStateException("MIC_PERMISSION_REQUIRED: grant Microphone permission") + } + } + private fun parseDurationMs(paramsJson: String?): Int? = parseNumber(paramsJson, key = "durationMs")?.toIntOrNull() @@ -113,6 +147,21 @@ class ScreenRecordManager(private val context: Context) { private fun parseScreenIndex(paramsJson: String?): Int? = parseNumber(paramsJson, key = "screenIndex")?.toIntOrNull() + private fun parseIncludeAudio(paramsJson: String?): Boolean? { + val raw = paramsJson ?: return null + val key = "\"includeAudio\"" + val idx = raw.indexOf(key) + if (idx < 0) return null + val colon = raw.indexOf(':', idx + key.length) + if (colon < 0) return null + val tail = raw.substring(colon + 1).trimStart() + return when { + tail.startsWith("true") -> true + tail.startsWith("false") -> false + else -> null + } + } + private fun parseNumber(paramsJson: String?, key: String): String? { val raw = paramsJson ?: return null val needle = "\"$key\"" diff --git a/apps/ios/Sources/Model/NodeAppModel.swift b/apps/ios/Sources/Model/NodeAppModel.swift index 1cb380f3b..d22defa0d 100644 --- a/apps/ios/Sources/Model/NodeAppModel.swift +++ b/apps/ios/Sources/Model/NodeAppModel.swift @@ -539,6 +539,7 @@ final class NodeAppModel { screenIndex: params.screenIndex, durationMs: params.durationMs, fps: params.fps, + includeAudio: params.includeAudio, outPath: nil) defer { try? FileManager.default.removeItem(atPath: path) } let data = try Data(contentsOf: URL(fileURLWithPath: path)) @@ -548,13 +549,15 @@ final class NodeAppModel { var durationMs: Int? var fps: Double? var screenIndex: Int? + var hasAudio: Bool } let payload = try Self.encodePayload(Payload( format: "mp4", base64: data.base64EncodedString(), durationMs: params.durationMs, fps: params.fps, - screenIndex: params.screenIndex)) + screenIndex: params.screenIndex, + hasAudio: params.includeAudio ?? true)) return BridgeInvokeResponse(id: req.id, ok: true, payloadJSON: payload) default: diff --git a/apps/ios/Sources/Screen/ScreenRecordService.swift b/apps/ios/Sources/Screen/ScreenRecordService.swift index 54224ec26..0c8706562 100644 --- a/apps/ios/Sources/Screen/ScreenRecordService.swift +++ b/apps/ios/Sources/Screen/ScreenRecordService.swift @@ -1,20 +1,17 @@ import AVFoundation -import UIKit +import ReplayKit @MainActor final class ScreenRecordService { enum ScreenRecordError: LocalizedError { - case noWindow case invalidScreenIndex(Int) case captureFailed(String) case writeFailed(String) var errorDescription: String? { switch self { - case .noWindow: - return "Screen capture unavailable" - case let .invalidScreenIndex(idx): - return "Invalid screen index \(idx)" + case let .invalidScreenIndex(idx): + return "Invalid screen index \(idx)" case let .captureFailed(msg): return msg case let .writeFailed(msg): @@ -27,12 +24,18 @@ final class ScreenRecordService { screenIndex: Int?, durationMs: Int?, fps: Double?, + includeAudio: Bool?, outPath: String?) async throws -> String { let durationMs = Self.clampDurationMs(durationMs) let fps = Self.clampFps(fps) let fpsInt = Int32(fps.rounded()) let fpsValue = Double(fpsInt) + let includeAudio = includeAudio ?? true + + if let idx = screenIndex, idx != 0 { + throw ScreenRecordError.invalidScreenIndex(idx) + } let outURL: URL = { if let outPath, !outPath.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty { @@ -43,83 +46,124 @@ final class ScreenRecordService { }() try? FileManager.default.removeItem(at: outURL) - if let idx = screenIndex, idx != 0 { - throw ScreenRecordError.invalidScreenIndex(idx) + let recorder = RPScreenRecorder.shared() + recorder.isMicrophoneEnabled = includeAudio + + var writer: AVAssetWriter? + var videoInput: AVAssetWriterInput? + var audioInput: AVAssetWriterInput? + var started = false + var sawVideo = false + var lastVideoTime: CMTime? + var handlerError: Error? + let lock = NSLock() + + func setHandlerError(_ error: Error) { + lock.lock() + defer { lock.unlock() } + if handlerError == nil { handlerError = error } } - guard let window = Self.resolveKeyWindow() else { - throw ScreenRecordError.noWindow - } - - let size = window.bounds.size - let scale = window.screen.scale - let widthPx = max(1, Int(size.width * scale)) - let heightPx = max(1, Int(size.height * scale)) - - let writer = try AVAssetWriter(outputURL: outURL, fileType: .mp4) - let settings: [String: Any] = [ - AVVideoCodecKey: AVVideoCodecType.h264, - AVVideoWidthKey: widthPx, - AVVideoHeightKey: heightPx, - ] - let input = AVAssetWriterInput(mediaType: .video, outputSettings: settings) - input.expectsMediaDataInRealTime = false - - let attrs: [String: Any] = [ - kCVPixelBufferPixelFormatTypeKey as String: kCVPixelFormatType_32BGRA, - kCVPixelBufferWidthKey as String: widthPx, - kCVPixelBufferHeightKey as String: heightPx, - kCVPixelBufferCGImageCompatibilityKey as String: true, - kCVPixelBufferCGBitmapContextCompatibilityKey as String: true, - ] - let adaptor = AVAssetWriterInputPixelBufferAdaptor( - assetWriterInput: input, - sourcePixelBufferAttributes: attrs) - - guard writer.canAdd(input) else { - throw ScreenRecordError.writeFailed("Cannot add video input") - } - writer.add(input) - - guard writer.startWriting() else { - throw ScreenRecordError.writeFailed(writer.error?.localizedDescription ?? "Failed to start writer") - } - writer.startSession(atSourceTime: .zero) - - let frameCount = max(1, Int((Double(durationMs) / 1000.0 * fpsValue).rounded(.up))) - let frameDuration = CMTime(value: 1, timescale: fpsInt) - let frameSleepNs = UInt64(1_000_000_000.0 / fpsValue) - - for frame in 0..) in + recorder.startCapture(handler: { sample, type, error in + if let error { + setHandlerError(error) + return } - } - if let frameError { throw frameError } + guard CMSampleBufferDataIsReady(sample) else { return } - if frame < frameCount - 1 { - try await Task.sleep(nanoseconds: frameSleepNs) - } + switch type { + case .video: + let pts = CMSampleBufferGetPresentationTimeStamp(sample) + if let lastVideoTime { + let delta = CMTimeSubtract(pts, lastVideoTime) + if delta.seconds < (1.0 / fpsValue) { return } + } + + if writer == nil { + guard let imageBuffer = CMSampleBufferGetImageBuffer(sample) else { + setHandlerError(ScreenRecordError.captureFailed("Missing image buffer")) + return + } + let width = CVPixelBufferGetWidth(imageBuffer) + let height = CVPixelBufferGetHeight(imageBuffer) + do { + let w = try AVAssetWriter(outputURL: outURL, fileType: .mp4) + let settings: [String: Any] = [ + AVVideoCodecKey: AVVideoCodecType.h264, + AVVideoWidthKey: width, + AVVideoHeightKey: height, + ] + let vInput = AVAssetWriterInput(mediaType: .video, outputSettings: settings) + vInput.expectsMediaDataInRealTime = true + guard w.canAdd(vInput) else { + throw ScreenRecordError.writeFailed("Cannot add video input") + } + w.add(vInput) + + if includeAudio { + let aInput = AVAssetWriterInput(mediaType: .audio, outputSettings: nil) + aInput.expectsMediaDataInRealTime = true + if w.canAdd(aInput) { + w.add(aInput) + audioInput = aInput + } + } + + guard w.startWriting() else { + throw ScreenRecordError.writeFailed(w.error?.localizedDescription ?? "Failed to start writer") + } + w.startSession(atSourceTime: pts) + writer = w + videoInput = vInput + started = true + } catch { + setHandlerError(error) + return + } + } + + guard let vInput = videoInput, started else { return } + if vInput.isReadyForMoreMediaData { + if vInput.append(sample) { + sawVideo = true + lastVideoTime = pts + } else { + if let err = writer?.error { + setHandlerError(ScreenRecordError.writeFailed(err.localizedDescription)) + } + } + } + + case .audioApp, .audioMic: + guard includeAudio, let aInput = audioInput, started else { return } + if aInput.isReadyForMoreMediaData { + _ = aInput.append(sample) + } + + @unknown default: + break + } + }, completionHandler: { error in + if let error { cont.resume(throwing: error) } else { cont.resume() } + }) } - input.markAsFinished() + try await Task.sleep(nanoseconds: UInt64(durationMs) * 1_000_000) + + let stopError = await withCheckedContinuation { cont in + recorder.stopCapture { error in cont.resume(returning: error) } + } + if let stopError { throw stopError } + + if let handlerError { throw handlerError } + guard let writer, let videoInput, sawVideo else { + throw ScreenRecordError.captureFailed("No frames captured") + } + + videoInput.markAsFinished() + audioInput?.markAsFinished() + try await withCheckedThrowingContinuation { (cont: CheckedContinuation) in writer.finishWriting { if let err = writer.error { @@ -146,60 +190,4 @@ final class ScreenRecordService { return min(30, max(1, v)) } - private nonisolated static func resolveKeyWindow() -> UIWindow? { - let scenes = UIApplication.shared.connectedScenes - for scene in scenes { - guard let windowScene = scene as? UIWindowScene else { continue } - if let window = windowScene.windows.first(where: { $0.isKeyWindow }) { - return window - } - if let window = windowScene.windows.first { - return window - } - } - return nil - } - - private nonisolated static func captureImage(window: UIWindow, size: CGSize) -> CGImage? { - let format = UIGraphicsImageRendererFormat() - format.scale = window.screen.scale - let renderer = UIGraphicsImageRenderer(size: size, format: format) - let image = renderer.image { _ in - window.drawHierarchy(in: CGRect(origin: .zero, size: size), afterScreenUpdates: false) - } - return image.cgImage - } - - private nonisolated static func pixelBuffer(from image: CGImage, width: Int, height: Int) -> CVPixelBuffer? { - var buffer: CVPixelBuffer? - let status = CVPixelBufferCreate( - kCFAllocatorDefault, - width, - height, - kCVPixelFormatType_32BGRA, - [ - kCVPixelBufferCGImageCompatibilityKey: true, - kCVPixelBufferCGBitmapContextCompatibilityKey: true, - ] as CFDictionary, - &buffer) - guard status == kCVReturnSuccess, let buffer else { return nil } - - CVPixelBufferLockBaseAddress(buffer, []) - defer { CVPixelBufferUnlockBaseAddress(buffer, []) } - - guard let context = CGContext( - data: CVPixelBufferGetBaseAddress(buffer), - width: width, - height: height, - bitsPerComponent: 8, - bytesPerRow: CVPixelBufferGetBytesPerRow(buffer), - space: CGColorSpaceCreateDeviceRGB(), - bitmapInfo: CGImageAlphaInfo.premultipliedFirst.rawValue - ) else { - return nil - } - - context.draw(image, in: CGRect(x: 0, y: 0, width: width, height: height)) - return buffer - } } diff --git a/apps/macos/Sources/Clawdis/ControlRequestHandler.swift b/apps/macos/Sources/Clawdis/ControlRequestHandler.swift index 4cf9442f0..40c47f3b5 100644 --- a/apps/macos/Sources/Clawdis/ControlRequestHandler.swift +++ b/apps/macos/Sources/Clawdis/ControlRequestHandler.swift @@ -4,8 +4,6 @@ import Foundation import OSLog enum ControlRequestHandler { - private static let cameraCapture = CameraCaptureService() - @MainActor private static let screenRecorder = ScreenRecordService() struct NodeListNode: Codable { var nodeId: String @@ -135,11 +133,12 @@ enum ControlRequestHandler { includeAudio: includeAudio, outPath: outPath) - case let .screenRecord(screenIndex, durationMs, fps, outPath): + case let .screenRecord(screenIndex, durationMs, fps, includeAudio, outPath): return await self.handleScreenRecord( screenIndex: screenIndex, durationMs: durationMs, fps: fps, + includeAudio: includeAudio, outPath: outPath) } } @@ -242,50 +241,84 @@ enum ControlRequestHandler { placement: CanvasPlacement?) async -> Response { guard self.canvasEnabled() else { return Response(ok: false, message: "Canvas disabled by user") } - let logger = Logger(subsystem: "com.steipete.clawdis", category: "CanvasControl") - logger.info("canvas show start session=\(session, privacy: .public) path=\(path ?? "", privacy: .public)") + _ = session do { - logger.info("canvas show awaiting CanvasManager") - let res = try await CanvasManager.shared.showDetailed( - sessionKey: session, - target: path, - placement: placement) - logger - .info( - "canvas show done dir=\(res.directory, privacy: .public) status=\(String(describing: res.status), privacy: .public)") - let payload = try? JSONEncoder().encode(res) - return Response(ok: true, message: res.directory, payload: payload) + if let path, !path.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty { + _ = try await self.invokeLocalNode( + command: ClawdisCanvasCommand.navigate.rawValue, + params: ["url": path], + timeoutMs: 20000) + } else { + _ = try await self.invokeLocalNode( + command: ClawdisCanvasCommand.show.rawValue, + params: nil, + timeoutMs: 20000) + } + if placement != nil { + return Response(ok: true, message: "Canvas placement ignored (node mode)") + } + return Response(ok: true) } catch { - logger.error("canvas show failed \(error.localizedDescription, privacy: .public)") return Response(ok: false, message: error.localizedDescription) } } private static func handleCanvasHide(session: String) async -> Response { - await CanvasManager.shared.hide(sessionKey: session) - return Response(ok: true) + _ = session + do { + _ = try await self.invokeLocalNode( + command: ClawdisCanvasCommand.hide.rawValue, + params: nil, + timeoutMs: 10000) + return Response(ok: true) + } catch { + return Response(ok: false, message: error.localizedDescription) + } } private static func handleCanvasEval(session: String, javaScript: String) async -> Response { guard self.canvasEnabled() else { return Response(ok: false, message: "Canvas disabled by user") } - let logger = Logger(subsystem: "com.steipete.clawdis", category: "CanvasControl") - logger.info("canvas eval start session=\(session, privacy: .public) bytes=\(javaScript.utf8.count)") + _ = session do { - logger.info("canvas eval awaiting CanvasManager.eval") - let result = try await CanvasManager.shared.eval(sessionKey: session, javaScript: javaScript) - logger.info("canvas eval done bytes=\(result.utf8.count)") - return Response(ok: true, payload: Data(result.utf8)) + let payload = try await self.invokeLocalNode( + command: ClawdisCanvasCommand.evalJS.rawValue, + params: ["javaScript": javaScript], + timeoutMs: 20000) + if let dict = payload as? [String: Any], + let result = dict["result"] as? String + { + return Response(ok: true, payload: Data(result.utf8)) + } + return Response(ok: true) } catch { - logger.error("canvas eval failed \(error.localizedDescription, privacy: .public)") return Response(ok: false, message: error.localizedDescription) } } private static func handleCanvasSnapshot(session: String, outPath: String?) async -> Response { guard self.canvasEnabled() else { return Response(ok: false, message: "Canvas disabled by user") } + _ = session do { - let path = try await CanvasManager.shared.snapshot(sessionKey: session, outPath: outPath) - return Response(ok: true, message: path) + let payload = try await self.invokeLocalNode( + command: ClawdisCanvasCommand.snapshot.rawValue, + params: [:], + timeoutMs: 20000) + guard let dict = payload as? [String: Any], + let format = dict["format"] as? String, + let base64 = dict["base64"] as? String, + let data = Data(base64Encoded: base64) + else { + return Response(ok: false, message: "invalid canvas snapshot payload") + } + let ext = (format.lowercased() == "jpeg" || format.lowercased() == "jpg") ? "jpg" : "png" + let url: URL = if let outPath, !outPath.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty { + URL(fileURLWithPath: outPath) + } else { + FileManager.default.temporaryDirectory + .appendingPathComponent("clawdis-canvas-snapshot-\(UUID().uuidString).\(ext)") + } + try data.write(to: url, options: [.atomic]) + return Response(ok: true, message: url.path) } catch { return Response(ok: false, message: error.localizedDescription) } @@ -297,112 +330,38 @@ enum ControlRequestHandler { jsonl: String?) async -> Response { guard self.canvasEnabled() else { return Response(ok: false, message: "Canvas disabled by user") } + _ = session do { - // Ensure the Canvas is visible without forcing a navigation/reload. - _ = try await CanvasManager.shared.show(sessionKey: session, path: nil) - - // Wait for the in-page A2UI bridge. If it doesn't appear, force-load the bundled A2UI shell once. - var ready = await Self.waitForCanvasA2UI(session: session, requireBuiltinPath: false, timeoutMs: 2000) - if !ready { - _ = try await CanvasManager.shared.show(sessionKey: session, path: "/__clawdis__/a2ui/") - ready = await Self.waitForCanvasA2UI(session: session, requireBuiltinPath: true, timeoutMs: 5000) - } - - guard ready else { return Response(ok: false, message: "A2UI not ready") } - - let js: String switch command { case .reset: - js = """ - (() => { - try { - if (!globalThis.clawdisA2UI) { return JSON.stringify({ ok: false, error: "missing clawdisA2UI" }); } - return JSON.stringify(globalThis.clawdisA2UI.reset()); - } catch (e) { - return JSON.stringify({ ok: false, error: String(e?.message ?? e), stack: e?.stack }); - } - })() - """ - + let payload = try await self.invokeLocalNode( + command: ClawdisCanvasA2UICommand.reset.rawValue, + params: nil, + timeoutMs: 20000) + if let payload { + let data = try JSONSerialization.data(withJSONObject: payload) + return Response(ok: true, payload: data) + } + return Response(ok: true) case .pushJSONL: guard let jsonl, !jsonl.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty else { return Response(ok: false, message: "missing jsonl") } - - let messages: [ClawdisKit.AnyCodable] - do { - messages = try ClawdisCanvasA2UIJSONL.decodeMessagesFromJSONL(jsonl) - } catch { - return Response(ok: false, message: error.localizedDescription) + let payload = try await self.invokeLocalNode( + command: ClawdisCanvasA2UICommand.pushJSONL.rawValue, + params: ["jsonl": jsonl], + timeoutMs: 30000) + if let payload { + let data = try JSONSerialization.data(withJSONObject: payload) + return Response(ok: true, payload: data) } - - let json: String - do { - json = try ClawdisCanvasA2UIJSONL.encodeMessagesJSONArray(messages) - } catch { - return Response(ok: false, message: error.localizedDescription) - } - js = """ - (() => { - try { - if (!globalThis.clawdisA2UI) { return JSON.stringify({ ok: false, error: "missing clawdisA2UI" }); } - const messages = \(json); - return JSON.stringify(globalThis.clawdisA2UI.applyMessages(messages)); - } catch (e) { - return JSON.stringify({ ok: false, error: String(e?.message ?? e), stack: e?.stack }); - } - })() - """ + return Response(ok: true) } - - let result = try await CanvasManager.shared.eval(sessionKey: session, javaScript: js) - - let payload = Data(result.utf8) - if let obj = try? JSONSerialization.jsonObject(with: payload, options: []) as? [String: Any], - let ok = obj["ok"] as? Bool - { - let error = obj["error"] as? String - return Response(ok: ok, message: ok ? "" : (error ?? "A2UI error"), payload: payload) - } - - return Response(ok: true, payload: payload) } catch { return Response(ok: false, message: error.localizedDescription) } } - private static func waitForCanvasA2UI(session: String, requireBuiltinPath: Bool, timeoutMs: Int) async -> Bool { - let clock = ContinuousClock() - let deadline = clock.now.advanced(by: .milliseconds(timeoutMs)) - while clock.now < deadline { - do { - let res = try await CanvasManager.shared.eval( - sessionKey: session, - javaScript: """ - (() => { - try { - if (document?.readyState !== 'complete') { return ''; } - if (!globalThis.clawdisA2UI) { return ''; } - if (typeof globalThis.clawdisA2UI.applyMessages !== 'function') { return ''; } - if (\(requireBuiltinPath ? "true" : "false")) { - const p = String(location?.pathname ?? ''); - if (!p.startsWith('/__clawdis__/a2ui')) { return ''; } - } - return 'ready'; - } catch { - return ''; - } - })() - """) - if res == "ready" { return true } - } catch { - // Ignore; keep waiting. - } - try? await Task.sleep(nanoseconds: 60_000_000) - } - return false - } - private static func handleNodeList() async -> Response { do { let data = try await GatewayConnection.shared.request( @@ -509,15 +468,33 @@ enum ControlRequestHandler { { guard self.cameraEnabled() else { return Response(ok: false, message: "Camera disabled by user") } do { - let res = try await self.cameraCapture.snap(facing: facing, maxWidth: maxWidth, quality: quality) + var params: [String: Any] = [:] + if let facing { params["facing"] = facing.rawValue } + if let maxWidth { params["maxWidth"] = maxWidth } + if let quality { params["quality"] = quality } + params["format"] = "jpg" + + let payload = try await self.invokeLocalNode( + command: ClawdisCameraCommand.snap.rawValue, + params: params, + timeoutMs: 30000) + guard let dict = payload as? [String: Any], + let format = dict["format"] as? String, + let base64 = dict["base64"] as? String, + let data = Data(base64Encoded: base64) + else { + return Response(ok: false, message: "invalid camera snapshot payload") + } + + let ext = (format.lowercased() == "jpeg" || format.lowercased() == "jpg") ? "jpg" : format.lowercased() let url: URL = if let outPath, !outPath.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty { URL(fileURLWithPath: outPath) } else { FileManager.default.temporaryDirectory - .appendingPathComponent("clawdis-camera-snap-\(UUID().uuidString).jpg") + .appendingPathComponent("clawdis-camera-snap-\(UUID().uuidString).\(ext)") } - try res.data.write(to: url, options: [.atomic]) + try data.write(to: url, options: [.atomic]) return Response(ok: true, message: url.path) } catch { return Response(ok: false, message: error.localizedDescription) @@ -532,12 +509,31 @@ enum ControlRequestHandler { { guard self.cameraEnabled() else { return Response(ok: false, message: "Camera disabled by user") } do { - let res = try await self.cameraCapture.clip( - facing: facing, - durationMs: durationMs, - includeAudio: includeAudio, - outPath: outPath) - return Response(ok: true, message: res.path) + var params: [String: Any] = ["includeAudio": includeAudio, "format": "mp4"] + if let facing { params["facing"] = facing.rawValue } + if let durationMs { params["durationMs"] = durationMs } + + let payload = try await self.invokeLocalNode( + command: ClawdisCameraCommand.clip.rawValue, + params: params, + timeoutMs: 90000) + guard let dict = payload as? [String: Any], + let format = dict["format"] as? String, + let base64 = dict["base64"] as? String, + let data = Data(base64Encoded: base64) + else { + return Response(ok: false, message: "invalid camera clip payload") + } + + let ext = format.lowercased() + let url: URL = if let outPath, !outPath.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty { + URL(fileURLWithPath: outPath) + } else { + FileManager.default.temporaryDirectory + .appendingPathComponent("clawdis-camera-clip-\(UUID().uuidString).\(ext)") + } + try data.write(to: url, options: [.atomic]) + return Response(ok: true, message: url.path) } catch { return Response(ok: false, message: error.localizedDescription) } @@ -547,23 +543,69 @@ enum ControlRequestHandler { screenIndex: Int?, durationMs: Int?, fps: Double?, + includeAudio: Bool, outPath: String?) async -> Response { - let authorized = await PermissionManager - .ensure([.screenRecording], interactive: false)[.screenRecording] ?? false - guard authorized else { return Response(ok: false, message: "screen recording permission missing") } - do { - let path = try await Task { @MainActor in - try await self.screenRecorder.record( - screenIndex: screenIndex, - durationMs: durationMs, - fps: fps, - outPath: outPath) - }.value - return Response(ok: true, message: path) + var params: [String: Any] = ["format": "mp4", "includeAudio": includeAudio] + if let screenIndex { params["screenIndex"] = screenIndex } + if let durationMs { params["durationMs"] = durationMs } + if let fps { params["fps"] = fps } + + let payload = try await self.invokeLocalNode( + command: "screen.record", + params: params, + timeoutMs: 120000) + guard let dict = payload as? [String: Any], + let base64 = dict["base64"] as? String, + let data = Data(base64Encoded: base64) + else { + return Response(ok: false, message: "invalid screen record payload") + } + let url: URL = if let outPath, !outPath.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty { + URL(fileURLWithPath: outPath) + } else { + FileManager.default.temporaryDirectory + .appendingPathComponent("clawdis-screen-record-\(UUID().uuidString).mp4") + } + try data.write(to: url, options: [.atomic]) + return Response(ok: true, message: url.path) } catch { return Response(ok: false, message: error.localizedDescription) } } + + private static func invokeLocalNode( + command: String, + params: [String: Any]?, + timeoutMs: Int) async throws -> Any? + { + var gatewayParams: [String: AnyCodable] = [ + "nodeId": AnyCodable(Self.localNodeId()), + "command": AnyCodable(command), + "idempotencyKey": AnyCodable(UUID().uuidString), + ] + if let params { + gatewayParams["params"] = AnyCodable(params) + } + let data = try await GatewayConnection.shared.request( + method: "node.invoke", + params: gatewayParams, + timeoutMs: timeoutMs) + return try Self.decodeNodeInvokePayload(data: data) + } + + private static func decodeNodeInvokePayload(data: Data) throws -> Any? { + let obj = try JSONSerialization.jsonObject(with: data) + guard let dict = obj as? [String: Any] else { + throw NSError(domain: "Node", code: 30, userInfo: [ + NSLocalizedDescriptionKey: "invalid node invoke response", + ]) + } + return dict["payload"] + } + + private static func localNodeId() -> String { + "mac-\(InstanceIdentity.instanceId)" + } } diff --git a/apps/macos/Sources/Clawdis/NodeMode/MacNodeRuntime.swift b/apps/macos/Sources/Clawdis/NodeMode/MacNodeRuntime.swift index 9696d34f7..661f8a61c 100644 --- a/apps/macos/Sources/Clawdis/NodeMode/MacNodeRuntime.swift +++ b/apps/macos/Sources/Clawdis/NodeMode/MacNodeRuntime.swift @@ -9,6 +9,14 @@ actor MacNodeRuntime { func handleInvoke(_ req: BridgeInvokeRequest) async -> BridgeInvokeResponse { let command = req.command + if (command.hasPrefix("canvas.") || command.hasPrefix("canvas.a2ui.")) && !Self.canvasEnabled() { + return BridgeInvokeResponse( + id: req.id, + ok: false, + error: ClawdisNodeError( + code: .unavailable, + message: "CANVAS_DISABLED: enable Canvas in Settings")) + } do { switch command { case ClawdisCanvasCommand.show.rawValue: @@ -141,26 +149,29 @@ actor MacNodeRuntime { code: .invalidRequest, message: "INVALID_REQUEST: screen format must be mp4") } - let path = try await self.screenRecorder.record( + let res = try await self.screenRecorder.record( screenIndex: params.screenIndex, durationMs: params.durationMs, fps: params.fps, + includeAudio: params.includeAudio, outPath: nil) - defer { try? FileManager.default.removeItem(atPath: path) } - let data = try Data(contentsOf: URL(fileURLWithPath: path)) + defer { try? FileManager.default.removeItem(atPath: res.path) } + let data = try Data(contentsOf: URL(fileURLWithPath: res.path)) struct ScreenPayload: Encodable { var format: String var base64: String var durationMs: Int? var fps: Double? var screenIndex: Int? + var hasAudio: Bool } let payload = try Self.encodePayload(ScreenPayload( format: "mp4", base64: data.base64EncodedString(), durationMs: params.durationMs, fps: params.fps, - screenIndex: params.screenIndex)) + screenIndex: params.screenIndex, + hasAudio: res.hasAudio)) return BridgeInvokeResponse(id: req.id, ok: true, payloadJSON: payload) default: @@ -246,6 +257,10 @@ actor MacNodeRuntime { return json } + private nonisolated static func canvasEnabled() -> Bool { + UserDefaults.standard.object(forKey: canvasEnabledKey) as? Bool ?? true + } + private nonisolated static func cameraEnabled() -> Bool { UserDefaults.standard.object(forKey: cameraEnabledKey) as? Bool ?? false } diff --git a/apps/macos/Sources/Clawdis/NodeMode/MacNodeScreenCommands.swift b/apps/macos/Sources/Clawdis/NodeMode/MacNodeScreenCommands.swift index 31a210edf..6f849fdf0 100644 --- a/apps/macos/Sources/Clawdis/NodeMode/MacNodeScreenCommands.swift +++ b/apps/macos/Sources/Clawdis/NodeMode/MacNodeScreenCommands.swift @@ -9,4 +9,5 @@ struct MacNodeScreenRecordParams: Codable, Sendable, Equatable { var durationMs: Int? var fps: Double? var format: String? + var includeAudio: Bool? } diff --git a/apps/macos/Sources/Clawdis/ScreenRecordService.swift b/apps/macos/Sources/Clawdis/ScreenRecordService.swift index bede0fbf0..4b26f3aed 100644 --- a/apps/macos/Sources/Clawdis/ScreenRecordService.swift +++ b/apps/macos/Sources/Clawdis/ScreenRecordService.swift @@ -31,10 +31,12 @@ final class ScreenRecordService { screenIndex: Int?, durationMs: Int?, fps: Double?, - outPath: String?) async throws -> String + includeAudio: Bool?, + outPath: String?) async throws -> (path: String, hasAudio: Bool) { let durationMs = Self.clampDurationMs(durationMs) let fps = Self.clampFps(fps) + let includeAudio = includeAudio ?? false let outURL: URL = { if let outPath, !outPath.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty { @@ -60,15 +62,22 @@ final class ScreenRecordService { config.queueDepth = 8 config.showsCursor = true config.minimumFrameInterval = CMTime(value: 1, timescale: CMTimeScale(max(1, Int32(fps.rounded())))) + if includeAudio { + config.capturesAudio = true + } let recorder = try StreamRecorder( outputURL: outURL, width: display.width, height: display.height, + includeAudio: includeAudio, logger: self.logger) let stream = SCStream(filter: filter, configuration: config, delegate: recorder) try stream.addStreamOutput(recorder, type: .screen, sampleHandlerQueue: recorder.queue) + if includeAudio { + try stream.addStreamOutput(recorder, type: .audio, sampleHandlerQueue: recorder.queue) + } self.logger.info( "screen record start idx=\(idx) durationMs=\(durationMs) fps=\(fps) out=\(outURL.path, privacy: .public)") @@ -85,7 +94,7 @@ final class ScreenRecordService { } try await recorder.finish() - return outURL.path + return (path: outURL.path, hasAudio: recorder.hasAudio) } private nonisolated static func clampDurationMs(_ ms: Int?) -> Int { @@ -106,13 +115,15 @@ private final class StreamRecorder: NSObject, SCStreamOutput, SCStreamDelegate, private let logger: Logger private let writer: AVAssetWriter private let input: AVAssetWriterInput + private let audioInput: AVAssetWriterInput? + let hasAudio: Bool private var started = false private var sawFrame = false private var didFinish = false private var pendingErrorMessage: String? - init(outputURL: URL, width: Int, height: Int, logger: Logger) throws { + init(outputURL: URL, width: Int, height: Int, includeAudio: Bool, logger: Logger) throws { self.logger = logger self.writer = try AVAssetWriter(outputURL: outputURL, fileType: .mp4) @@ -128,6 +139,28 @@ private final class StreamRecorder: NSObject, SCStreamOutput, SCStreamDelegate, throw ScreenRecordService.ScreenRecordError.writeFailed("Cannot add video input") } self.writer.add(self.input) + + if includeAudio { + let audioSettings: [String: Any] = [ + AVFormatIDKey: kAudioFormatMPEG4AAC, + AVNumberOfChannelsKey: 1, + AVSampleRateKey: 44_100, + AVEncoderBitRateKey: 96_000, + ] + let audioInput = AVAssetWriterInput(mediaType: .audio, outputSettings: audioSettings) + audioInput.expectsMediaDataInRealTime = true + if self.writer.canAdd(audioInput) { + self.writer.add(audioInput) + self.audioInput = audioInput + self.hasAudio = true + } else { + self.audioInput = nil + self.hasAudio = false + } + } else { + self.audioInput = nil + self.hasAudio = false + } super.init() } @@ -145,14 +178,20 @@ private final class StreamRecorder: NSObject, SCStreamOutput, SCStreamDelegate, didOutputSampleBuffer sampleBuffer: CMSampleBuffer, of type: SCStreamOutputType) { - guard type == .screen else { return } guard CMSampleBufferDataIsReady(sampleBuffer) else { return } // Callback runs on `sampleHandlerQueue` (`self.queue`). - self.handle(sampleBuffer: sampleBuffer) + switch type { + case .screen: + self.handleVideo(sampleBuffer: sampleBuffer) + case .audio: + self.handleAudio(sampleBuffer: sampleBuffer) + @unknown default: + break + } _ = stream } - private func handle(sampleBuffer: CMSampleBuffer) { + private func handleVideo(sampleBuffer: CMSampleBuffer) { if let msg = self.pendingErrorMessage { self.logger.error("screen record aborting due to prior error: \(msg, privacy: .public)") return @@ -175,6 +214,18 @@ private final class StreamRecorder: NSObject, SCStreamOutput, SCStreamDelegate, } } + private func handleAudio(sampleBuffer: CMSampleBuffer) { + guard let audioInput else { return } + if let msg = self.pendingErrorMessage { + self.logger.error("screen record audio aborting due to prior error: \(msg, privacy: .public)") + return + } + if self.didFinish || !self.started { return } + if audioInput.isReadyForMoreMediaData { + _ = audioInput.append(sampleBuffer) + } + } + func finish() async throws { try await withCheckedThrowingContinuation { (cont: CheckedContinuation) in self.queue.async { @@ -193,6 +244,7 @@ private final class StreamRecorder: NSObject, SCStreamOutput, SCStreamDelegate, self.didFinish = true self.input.markAsFinished() + self.audioInput?.markAsFinished() self.writer.finishWriting { if let err = self.writer.error { cont.resume(throwing: ScreenRecordService.ScreenRecordError.writeFailed(err.localizedDescription)) @@ -206,4 +258,3 @@ private final class StreamRecorder: NSObject, SCStreamOutput, SCStreamDelegate, } } } - diff --git a/apps/macos/Sources/ClawdisCLI/ClawdisCLI.swift b/apps/macos/Sources/ClawdisCLI/ClawdisCLI.swift index f6d934338..9086fa56d 100644 --- a/apps/macos/Sources/ClawdisCLI/ClawdisCLI.swift +++ b/apps/macos/Sources/ClawdisCLI/ClawdisCLI.swift @@ -476,6 +476,7 @@ struct ClawdisCLI { var durationMs: Int? var fps: Double? var outPath: String? + var includeAudio = true while !args.isEmpty { let arg = args.removeFirst() switch arg { @@ -487,6 +488,8 @@ struct ClawdisCLI { durationMs = args.popFirst().flatMap(Int.init) case "--fps": fps = args.popFirst().flatMap(Double.init) + case "--no-audio": + includeAudio = false case "--out": outPath = args.popFirst() default: @@ -494,7 +497,12 @@ struct ClawdisCLI { } } return ParsedCLIRequest( - request: .screenRecord(screenIndex: screenIndex, durationMs: durationMs, fps: fps, outPath: outPath), + request: .screenRecord( + screenIndex: screenIndex, + durationMs: durationMs, + fps: fps, + includeAudio: includeAudio, + outPath: outPath), kind: .mediaPath) default: @@ -766,7 +774,7 @@ struct ClawdisCLI { Screen: clawdis-mac screen record [--screen ] - [--duration |--duration-ms ] [--fps ] [--out ] + [--duration |--duration-ms ] [--fps ] [--no-audio] [--out ] Browser (clawd): clawdis-mac browser status|start|stop|tabs|open|focus|close|screenshot|eval|query|dom|snapshot @@ -1000,7 +1008,7 @@ struct ClawdisCLI { case let .cameraClip(_, durationMs, _, _): let ms = durationMs ?? 3000 return min(180, max(10, TimeInterval(ms) / 1000.0 + 10)) - case let .screenRecord(_, durationMs, _, _): + case let .screenRecord(_, durationMs, _, _, _): let ms = durationMs ?? 10_000 return min(180, max(10, TimeInterval(ms) / 1000.0 + 10)) default: diff --git a/apps/macos/Sources/ClawdisIPC/IPC.swift b/apps/macos/Sources/ClawdisIPC/IPC.swift index 7559175b2..ba48b813d 100644 --- a/apps/macos/Sources/ClawdisIPC/IPC.swift +++ b/apps/macos/Sources/ClawdisIPC/IPC.swift @@ -132,7 +132,7 @@ public enum Request: Sendable { case nodeInvoke(nodeId: String, command: String, paramsJSON: String?) case cameraSnap(facing: CameraFacing?, maxWidth: Int?, quality: Double?, outPath: String?) case cameraClip(facing: CameraFacing?, durationMs: Int?, includeAudio: Bool, outPath: String?) - case screenRecord(screenIndex: Int?, durationMs: Int?, fps: Double?, outPath: String?) + case screenRecord(screenIndex: Int?, durationMs: Int?, fps: Double?, includeAudio: Bool, outPath: String?) } // MARK: - Responses @@ -289,11 +289,12 @@ extension Request: Codable { try container.encode(includeAudio, forKey: .includeAudio) try container.encodeIfPresent(outPath, forKey: .outPath) - case let .screenRecord(screenIndex, durationMs, fps, outPath): + case let .screenRecord(screenIndex, durationMs, fps, includeAudio, outPath): try container.encode(Kind.screenRecord, forKey: .type) try container.encodeIfPresent(screenIndex, forKey: .screenIndex) try container.encodeIfPresent(durationMs, forKey: .durationMs) try container.encodeIfPresent(fps, forKey: .fps) + try container.encode(includeAudio, forKey: .includeAudio) try container.encodeIfPresent(outPath, forKey: .outPath) } } @@ -394,8 +395,14 @@ extension Request: Codable { let screenIndex = try container.decodeIfPresent(Int.self, forKey: .screenIndex) let durationMs = try container.decodeIfPresent(Int.self, forKey: .durationMs) let fps = try container.decodeIfPresent(Double.self, forKey: .fps) + let includeAudio = (try? container.decode(Bool.self, forKey: .includeAudio)) ?? true let outPath = try container.decodeIfPresent(String.self, forKey: .outPath) - self = .screenRecord(screenIndex: screenIndex, durationMs: durationMs, fps: fps, outPath: outPath) + self = .screenRecord( + screenIndex: screenIndex, + durationMs: durationMs, + fps: fps, + includeAudio: includeAudio, + outPath: outPath) } } } diff --git a/apps/shared/ClawdisKit/Sources/ClawdisKit/ScreenCommands.swift b/apps/shared/ClawdisKit/Sources/ClawdisKit/ScreenCommands.swift index 5e84446de..cc0a851d5 100644 --- a/apps/shared/ClawdisKit/Sources/ClawdisKit/ScreenCommands.swift +++ b/apps/shared/ClawdisKit/Sources/ClawdisKit/ScreenCommands.swift @@ -9,16 +9,19 @@ public struct ClawdisScreenRecordParams: Codable, Sendable, Equatable { public var durationMs: Int? public var fps: Double? public var format: String? + public var includeAudio: Bool? public init( screenIndex: Int? = nil, durationMs: Int? = nil, fps: Double? = nil, - format: String? = nil) + format: String? = nil, + includeAudio: Bool? = nil) { self.screenIndex = screenIndex self.durationMs = durationMs self.fps = fps self.format = format + self.includeAudio = includeAudio } } diff --git a/docs/nodes.md b/docs/nodes.md index d5383bc10..d754e0d17 100644 --- a/docs/nodes.md +++ b/docs/nodes.md @@ -81,12 +81,14 @@ Nodes expose `screen.record` (mp4). Example: ```bash clawdis nodes screen record --node --duration 10s --fps 10 +clawdis nodes screen record --node --duration 10s --fps 10 --no-audio ``` Notes: - `screen.record` requires the node app to be foregrounded. - Android will show the system screen-capture prompt before recording. - Screen recordings are clamped to `<= 60s`. +- `--no-audio` disables microphone capture (supported on iOS/Android; macOS uses system capture audio). ## Mac node mode diff --git a/src/cli/nodes-cli.ts b/src/cli/nodes-cli.ts index 8a140bc20..e1cfc8e75 100644 --- a/src/cli/nodes-cli.ts +++ b/src/cli/nodes-cli.ts @@ -782,6 +782,7 @@ export function registerNodesCli(program: Command) { .option("--screen ", "Screen index (0 = primary)", "0") .option("--duration ", "Clip duration (ms or 10s)", "10000") .option("--fps ", "Frames per second", "10") + .option("--no-audio", "Disable microphone audio capture") .option("--out ", "Output path") .option( "--invoke-timeout ", @@ -808,6 +809,7 @@ export function registerNodesCli(program: Command) { : undefined, fps: Number.isFinite(fps) ? fps : undefined, format: "mp4", + includeAudio: opts.audio !== false, }, idempotencyKey: randomIdempotencyKey(), }; @@ -844,6 +846,7 @@ export function registerNodesCli(program: Command) { durationMs: parsed.durationMs, fps: parsed.fps, screenIndex: parsed.screenIndex, + hasAudio: parsed.hasAudio, }, }, null, diff --git a/src/cli/nodes-screen.test.ts b/src/cli/nodes-screen.test.ts index 7e13b29d6..06b085981 100644 --- a/src/cli/nodes-screen.test.ts +++ b/src/cli/nodes-screen.test.ts @@ -13,12 +13,14 @@ describe("nodes screen helpers", () => { durationMs: 1000, fps: 12, screenIndex: 0, + hasAudio: true, }); expect(payload.format).toBe("mp4"); expect(payload.base64).toBe("Zm9v"); expect(payload.durationMs).toBe(1000); expect(payload.fps).toBe(12); expect(payload.screenIndex).toBe(0); + expect(payload.hasAudio).toBe(true); }); it("rejects invalid screen.record payload", () => { diff --git a/src/cli/nodes-screen.ts b/src/cli/nodes-screen.ts index 84ac85991..3e2b9913c 100644 --- a/src/cli/nodes-screen.ts +++ b/src/cli/nodes-screen.ts @@ -10,6 +10,7 @@ export type ScreenRecordPayload = { durationMs?: number; fps?: number; screenIndex?: number; + hasAudio?: boolean; }; function asRecord(value: unknown): Record { @@ -36,6 +37,7 @@ export function parseScreenRecordPayload(value: unknown): ScreenRecordPayload { fps: typeof obj.fps === "number" ? obj.fps : undefined, screenIndex: typeof obj.screenIndex === "number" ? obj.screenIndex : undefined, + hasAudio: typeof obj.hasAudio === "boolean" ? obj.hasAudio : undefined, }; }