From 3772a29557b4b662341e0d8a43ad7c209de5c348 Mon Sep 17 00:00:00 2001
From: Peter Steinberger
Date: Fri, 19 Dec 2025 00:29:38 +0100
Subject: [PATCH] macOS: add screen record + safer camera defaults

---
Review notes (free text, ignored when the patch is applied):

* Line structure was rebuilt from a whitespace-collapsed copy of this patch.
  Spans of the form `<...>` had been stripped by extraction and were
  reconstructed: the range/guard in `parseDurationMsArg`, the generic
  arguments of `CheckedContinuation`, and the `<placeholder>` arguments in
  the usage text. Placeholders on *context* lines of the usage hunks are
  best guesses; confirm them against the tree before applying.
* `parseDurationMsArg` converts with `Int(exactly:)` so an oversized value
  reports "invalid duration" instead of trapping.
* `StreamRecorder.handle` only writes frames whose `SCStreamFrameInfo.status`
  is `.complete`, records `append` failures, and marks `sawFrame` only after
  a frame was actually written.
* The blob ids on the `index` lines of ScreenRecordService.swift and
  ClawdisCLI.swift are those of the original commit, not of the amended
  content.

 .../Clawdis/CameraCaptureService.swift        |   2 +-
 .../Clawdis/ControlRequestHandler.swift       |  34 ++-
 .../Sources/Clawdis/GeneralSettings.swift     |   2 +-
 .../Sources/Clawdis/ScreenRecordService.swift | 239 ++++++++++++++++++
 .../macos/Sources/ClawdisCLI/ClawdisCLI.swift |  98 ++++++-
 apps/macos/Sources/ClawdisIPC/IPC.swift       |  18 ++
 6 files changed, 386 insertions(+), 7 deletions(-)
 create mode 100644 apps/macos/Sources/Clawdis/ScreenRecordService.swift

diff --git a/apps/macos/Sources/Clawdis/CameraCaptureService.swift b/apps/macos/Sources/Clawdis/CameraCaptureService.swift
index a7d25d359..72795c2b5 100644
--- a/apps/macos/Sources/Clawdis/CameraCaptureService.swift
+++ b/apps/macos/Sources/Clawdis/CameraCaptureService.swift
@@ -206,7 +206,7 @@ actor CameraCaptureService {
 
     private nonisolated static func clampDurationMs(_ ms: Int?) -> Int {
         let v = ms ?? 3000
-        return min(15000, max(250, v))
+        return min(60_000, max(250, v))
     }
 
     private nonisolated static func exportToMP4(inputURL: URL, outputURL: URL) async throws {
diff --git a/apps/macos/Sources/Clawdis/ControlRequestHandler.swift b/apps/macos/Sources/Clawdis/ControlRequestHandler.swift
index 64333ed70..4cf9442f0 100644
--- a/apps/macos/Sources/Clawdis/ControlRequestHandler.swift
+++ b/apps/macos/Sources/Clawdis/ControlRequestHandler.swift
@@ -5,6 +5,7 @@ import OSLog
 
 enum ControlRequestHandler {
     private static let cameraCapture = CameraCaptureService()
+    @MainActor private static let screenRecorder = ScreenRecordService()
 
     struct NodeListNode: Codable {
         var nodeId: String
@@ -133,6 +134,13 @@ enum ControlRequestHandler {
                 durationMs: durationMs,
                 includeAudio: includeAudio,
                 outPath: outPath)
+
+        case let .screenRecord(screenIndex, durationMs, fps, outPath):
+            return await self.handleScreenRecord(
+                screenIndex: screenIndex,
+                durationMs: durationMs,
+                fps: fps,
+                outPath: outPath)
         }
     }
 
@@ -225,7 +233,7 @@ enum ControlRequestHandler {
     }
 
     private static func cameraEnabled() -> Bool {
-        UserDefaults.standard.object(forKey: cameraEnabledKey) as? Bool ?? true
+        UserDefaults.standard.object(forKey: cameraEnabledKey) as? Bool ?? false
     }
 
     private static func handleCanvasShow(
@@ -534,4 +542,28 @@ enum ControlRequestHandler {
             return Response(ok: false, message: error.localizedDescription)
         }
     }
+
+    private static func handleScreenRecord(
+        screenIndex: Int?,
+        durationMs: Int?,
+        fps: Double?,
+        outPath: String?) async -> Response
+    {
+        let authorized = await PermissionManager
+            .ensure([.screenRecording], interactive: false)[.screenRecording] ?? false
+        guard authorized else { return Response(ok: false, message: "screen recording permission missing") }
+
+        do {
+            let path = try await Task { @MainActor in
+                try await self.screenRecorder.record(
+                    screenIndex: screenIndex,
+                    durationMs: durationMs,
+                    fps: fps,
+                    outPath: outPath)
+            }.value
+            return Response(ok: true, message: path)
+        } catch {
+            return Response(ok: false, message: error.localizedDescription)
+        }
+    }
 }
diff --git a/apps/macos/Sources/Clawdis/GeneralSettings.swift b/apps/macos/Sources/Clawdis/GeneralSettings.swift
index e1e432b5b..390706559 100644
--- a/apps/macos/Sources/Clawdis/GeneralSettings.swift
+++ b/apps/macos/Sources/Clawdis/GeneralSettings.swift
@@ -4,7 +4,7 @@ import SwiftUI
 
 struct GeneralSettings: View {
     @Bindable var state: AppState
-    @AppStorage(cameraEnabledKey) private var cameraEnabled: Bool = true
+    @AppStorage(cameraEnabledKey) private var cameraEnabled: Bool = false
     private let healthStore = HealthStore.shared
     private let gatewayManager = GatewayProcessManager.shared
     // swiftlint:disable:next inclusive_language
diff --git a/apps/macos/Sources/Clawdis/ScreenRecordService.swift b/apps/macos/Sources/Clawdis/ScreenRecordService.swift
new file mode 100644
index 000000000..bede0fbf0
--- /dev/null
+++ b/apps/macos/Sources/Clawdis/ScreenRecordService.swift
@@ -0,0 +1,239 @@
+import AVFoundation
+import Foundation
+import OSLog
+@preconcurrency import ScreenCaptureKit
+
+/// Records a single display to an H.264 `.mp4` file using ScreenCaptureKit.
+@MainActor
+final class ScreenRecordService {
+    enum ScreenRecordError: LocalizedError {
+        case noDisplays
+        case invalidScreenIndex(Int)
+        case noFramesCaptured
+        case writeFailed(String)
+
+        var errorDescription: String? {
+            switch self {
+            case .noDisplays:
+                "No displays available for screen recording"
+            case let .invalidScreenIndex(idx):
+                "Invalid screen index \(idx)"
+            case .noFramesCaptured:
+                "No frames captured"
+            case let .writeFailed(msg):
+                msg
+            }
+        }
+    }
+
+    private let logger = Logger(subsystem: "com.steipete.clawdis", category: "screenRecord")
+
+    /// Captures one display for a fixed duration and returns the path of the written movie.
+    ///
+    /// - Parameters:
+    ///   - screenIndex: Index into the displays sorted by `displayID`; defaults to `0`.
+    ///   - durationMs: Recording length, clamped to 250...60_000 ms; defaults to 10_000.
+    ///   - fps: Target frame rate, clamped to 1...60; defaults to 10.
+    ///   - outPath: Destination file; a temp file is used when `nil` or blank. An existing file is replaced.
+    /// - Throws: `ScreenRecordError`, or any error raised by ScreenCaptureKit/AVFoundation.
+    func record(
+        screenIndex: Int?,
+        durationMs: Int?,
+        fps: Double?,
+        outPath: String?) async throws -> String
+    {
+        let durationMs = Self.clampDurationMs(durationMs)
+        let fps = Self.clampFps(fps)
+
+        let outURL: URL = {
+            if let outPath, !outPath.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty {
+                return URL(fileURLWithPath: outPath)
+            }
+            return FileManager.default.temporaryDirectory
+                .appendingPathComponent("clawdis-screen-record-\(UUID().uuidString).mp4")
+        }()
+        try? FileManager.default.removeItem(at: outURL)
+
+        let content = try await SCShareableContent.current
+        let displays = content.displays.sorted { $0.displayID < $1.displayID }
+        guard !displays.isEmpty else { throw ScreenRecordError.noDisplays }
+
+        let idx = screenIndex ?? 0
+        guard idx >= 0, idx < displays.count else { throw ScreenRecordError.invalidScreenIndex(idx) }
+        let display = displays[idx]
+
+        let filter = SCContentFilter(display: display, excludingWindows: [])
+        let config = SCStreamConfiguration()
+        config.width = display.width
+        config.height = display.height
+        config.queueDepth = 8
+        config.showsCursor = true
+        config.minimumFrameInterval = CMTime(value: 1, timescale: CMTimeScale(max(1, Int32(fps.rounded()))))
+
+        let recorder = try StreamRecorder(
+            outputURL: outURL,
+            width: display.width,
+            height: display.height,
+            logger: self.logger)
+
+        let stream = SCStream(filter: filter, configuration: config, delegate: recorder)
+        try stream.addStreamOutput(recorder, type: .screen, sampleHandlerQueue: recorder.queue)
+
+        self.logger.info(
+            "screen record start idx=\(idx) durationMs=\(durationMs) fps=\(fps) out=\(outURL.path, privacy: .public)")
+
+        var started = false
+        do {
+            try await stream.startCapture()
+            started = true
+            try await Task.sleep(nanoseconds: UInt64(durationMs) * 1_000_000)
+            try await stream.stopCapture()
+        } catch {
+            if started { try? await stream.stopCapture() }
+            throw error
+        }
+
+        try await recorder.finish()
+        return outURL.path
+    }
+
+    private nonisolated static func clampDurationMs(_ ms: Int?) -> Int {
+        let v = ms ?? 10_000
+        return min(60_000, max(250, v))
+    }
+
+    private nonisolated static func clampFps(_ fps: Double?) -> Double {
+        let v = fps ?? 10
+        if !v.isFinite { return 10 }
+        return min(60, max(1, v))
+    }
+}
+
+/// Feeds ScreenCaptureKit sample buffers into an `AVAssetWriter`.
+/// Mutable state is only touched on `queue`, which is also the stream's sample handler queue.
+private final class StreamRecorder: NSObject, SCStreamOutput, SCStreamDelegate, @unchecked Sendable {
+    let queue = DispatchQueue(label: "com.steipete.clawdis.screenRecord.writer")
+
+    private let logger: Logger
+    private let writer: AVAssetWriter
+    private let input: AVAssetWriterInput
+
+    private var started = false
+    private var sawFrame = false
+    private var didFinish = false
+    private var pendingErrorMessage: String?
+
+    init(outputURL: URL, width: Int, height: Int, logger: Logger) throws {
+        self.logger = logger
+        self.writer = try AVAssetWriter(outputURL: outputURL, fileType: .mp4)
+
+        let settings: [String: Any] = [
+            AVVideoCodecKey: AVVideoCodecType.h264,
+            AVVideoWidthKey: width,
+            AVVideoHeightKey: height,
+        ]
+        self.input = AVAssetWriterInput(mediaType: .video, outputSettings: settings)
+        self.input.expectsMediaDataInRealTime = true
+
+        guard self.writer.canAdd(self.input) else {
+            throw ScreenRecordService.ScreenRecordError.writeFailed("Cannot add video input")
+        }
+        self.writer.add(self.input)
+        super.init()
+    }
+
+    func stream(_ stream: SCStream, didStopWithError error: any Error) {
+        self.queue.async {
+            let msg = String(describing: error)
+            self.pendingErrorMessage = msg
+            self.logger.error("screen record stream stopped with error: \(msg, privacy: .public)")
+            _ = stream
+        }
+    }
+
+    func stream(
+        _ stream: SCStream,
+        didOutputSampleBuffer sampleBuffer: CMSampleBuffer,
+        of type: SCStreamOutputType)
+    {
+        guard type == .screen else { return }
+        guard CMSampleBufferDataIsReady(sampleBuffer) else { return }
+        // Callback runs on `sampleHandlerQueue` (`self.queue`).
+        self.handle(sampleBuffer: sampleBuffer)
+        _ = stream
+    }
+
+    /// Starts the writer session on the first complete frame and appends every complete frame after it.
+    private func handle(sampleBuffer: CMSampleBuffer) {
+        if let msg = self.pendingErrorMessage {
+            self.logger.error("screen record aborting due to prior error: \(msg, privacy: .public)")
+            return
+        }
+        if self.didFinish { return }
+        // ScreenCaptureKit also delivers idle/blank buffers that carry no pixel data; only
+        // complete frames are safe to hand to the asset writer.
+        guard Self.isCompleteFrame(sampleBuffer) else { return }
+
+        if !self.started {
+            guard self.writer.startWriting() else {
+                self.pendingErrorMessage = self.writer.error?.localizedDescription ?? "Failed to start writer"
+                return
+            }
+            let pts = CMSampleBufferGetPresentationTimeStamp(sampleBuffer)
+            self.writer.startSession(atSourceTime: pts)
+            self.started = true
+        }
+
+        // Real-time capture: drop the frame rather than block when the input is busy.
+        guard self.input.isReadyForMoreMediaData else { return }
+        guard self.input.append(sampleBuffer) else {
+            self.pendingErrorMessage = self.writer.error?.localizedDescription ?? "Failed to append video frame"
+            return
+        }
+        self.sawFrame = true
+    }
+
+    /// Returns `true` when the buffer's `SCStreamFrameInfo.status` attachment is `.complete`.
+    private static func isCompleteFrame(_ sampleBuffer: CMSampleBuffer) -> Bool {
+        guard let attachments = CMSampleBufferGetSampleAttachmentsArray(
+            sampleBuffer,
+            createIfNecessary: false) as? [[SCStreamFrameInfo: Any]],
+            let rawStatus = attachments.first?[.status] as? Int,
+            let status = SCFrameStatus(rawValue: rawStatus)
+        else { return false }
+        return status == .complete
+    }
+
+    /// Finalizes the movie file. Throws if the stream or writer failed, or if no frame was written.
+    func finish() async throws {
+        try await withCheckedThrowingContinuation { (cont: CheckedContinuation<Void, Error>) in
+            self.queue.async {
+                if let msg = self.pendingErrorMessage {
+                    cont.resume(throwing: ScreenRecordService.ScreenRecordError.writeFailed(msg))
+                    return
+                }
+                guard self.started, self.sawFrame else {
+                    cont.resume(throwing: ScreenRecordService.ScreenRecordError.noFramesCaptured)
+                    return
+                }
+                if self.didFinish {
+                    cont.resume()
+                    return
+                }
+                self.didFinish = true
+
+                self.input.markAsFinished()
+                self.writer.finishWriting {
+                    if let err = self.writer.error {
+                        cont.resume(throwing: ScreenRecordService.ScreenRecordError.writeFailed(err.localizedDescription))
+                    } else if self.writer.status != .completed {
+                        cont.resume(throwing: ScreenRecordService.ScreenRecordError.writeFailed("Failed to finalize video"))
+                    } else {
+                        cont.resume()
+                    }
+                }
+            }
+        }
+    }
+}
+
diff --git a/apps/macos/Sources/ClawdisCLI/ClawdisCLI.swift b/apps/macos/Sources/ClawdisCLI/ClawdisCLI.swift
index 92ce15687..f6d934338 100644
--- a/apps/macos/Sources/ClawdisCLI/ClawdisCLI.swift
+++ b/apps/macos/Sources/ClawdisCLI/ClawdisCLI.swift
@@ -96,11 +96,54 @@ struct ClawdisCLI {
         case "camera":
             return try self.parseCamera(args: &args)
 
+        case "screen":
+            return try self.parseScreen(args: &args)
+
         default:
             throw CLIError.help
         }
     }
 
+    /// Parses a duration such as `1500`, `250ms`, `10s`, `1.5s` or `1m` into whole milliseconds.
+    ///
+    /// A bare number is read as milliseconds. Returns `nil` when the argument is missing or blank;
+    /// throws when the text is malformed or the result does not fit in an `Int`.
+    private static func parseDurationMsArg(_ raw: String?) throws -> Int? {
+        guard let raw else { return nil }
+        let trimmed = raw.trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
+        if trimmed.isEmpty { return nil }
+
+        let invalid = NSError(domain: "ClawdisCLI", code: 3, userInfo: [
+            NSLocalizedDescriptionKey: "invalid duration: \(raw) (expected 1000, 10s, 1m)",
+        ])
+
+        let regex = try NSRegularExpression(pattern: "^(\\d+(?:\\.\\d+)?)(ms|s|m)?$")
+        let range = NSRange(trimmed.startIndex..<trimmed.endIndex, in: trimmed)
+        guard let match = regex.firstMatch(in: trimmed, range: range),
+              let valueRange = Range(match.range(at: 1), in: trimmed),
+              let value = Double(trimmed[valueRange]),
+              value.isFinite, value >= 0
+        else { throw invalid }
+
+        let unit: String = {
+            if let unitRange = Range(match.range(at: 2), in: trimmed) {
+                return String(trimmed[unitRange])
+            }
+            return "ms"
+        }()
+
+        let multiplier: Double = switch unit {
+        case "ms": 1
+        case "s": 1000
+        case "m": 60_000
+        default: 1
+        }
+
+        // `Int(_:)` traps on an out-of-range Double (e.g. a 30-digit number); `Int(exactly:)` returns nil.
+        guard let ms = Int(exactly: (value * multiplier).rounded()), ms >= 0 else { throw invalid }
+        return ms
+    }
+
     private static func parseNotify(args: inout [String]) throws -> ParsedCLIRequest {
         var title: String?
         var body: String?
@@ -392,6 +435,8 @@ struct ClawdisCLI {
                 switch arg {
                 case "--facing":
                     if let val = args.popFirst(), let f = CameraFacing(rawValue: val) { facing = f }
+                case "--duration":
+                    durationMs = try self.parseDurationMsArg(args.popFirst())
                 case "--duration-ms":
                     durationMs = args.popFirst().flatMap(Int.init)
                 case "--no-audio":
@@ -415,6 +460,40 @@ struct ClawdisCLI {
         }
     }
 
+    private static func parseScreen(args: inout [String]) throws -> ParsedCLIRequest {
+        guard let sub = args.popFirst() else { throw CLIError.help }
+        switch sub {
+        case "record":
+            var screenIndex: Int?
+            var durationMs: Int?
+            var fps: Double?
+            var outPath: String?
+            while !args.isEmpty {
+                let arg = args.removeFirst()
+                switch arg {
+                case "--screen":
+                    screenIndex = args.popFirst().flatMap(Int.init)
+                case "--duration":
+                    durationMs = try self.parseDurationMsArg(args.popFirst())
+                case "--duration-ms":
+                    durationMs = args.popFirst().flatMap(Int.init)
+                case "--fps":
+                    fps = args.popFirst().flatMap(Double.init)
+                case "--out":
+                    outPath = args.popFirst()
+                default:
+                    break
+                }
+            }
+            return ParsedCLIRequest(
+                request: .screenRecord(screenIndex: screenIndex, durationMs: durationMs, fps: fps, outPath: outPath),
+                kind: .mediaPath)
+
+        default:
+            throw CLIError.help
+        }
+    }
+
     private static func parseCanvasPlacement(
         args: inout [String],
         session: inout String,
@@ -674,7 +753,12 @@ struct ClawdisCLI {
 
         Camera:
           clawdis-mac camera snap [--facing <front|back>] [--max-width <px>] [--quality <0-1>] [--out <path>]
-          clawdis-mac camera clip [--facing <front|back>] [--duration-ms <ms>] [--no-audio] [--out <path>]
+          clawdis-mac camera clip [--facing <front|back>]
+            [--duration <ms|10s|1m>|--duration-ms <ms>] [--no-audio] [--out <path>]
+
+        Screen:
+          clawdis-mac screen record [--screen <index>]
+            [--duration <ms|10s|1m>|--duration-ms <ms>] [--fps <fps>] [--out <path>]
 
         Browser (clawd):
           clawdis-mac browser status|start|stop|tabs|open|focus|close|screenshot|eval|query|dom|snapshot
@@ -703,7 +787,7 @@ struct ClawdisCLI {
         Output:
           Default output is text. Use --json for machine-readable output.
           In text mode, `browser screenshot` prints MEDIA:<path>.
-          In text mode, `camera snap` and `camera clip` print MEDIA:<path>.
+          In text mode, `camera snap`, `camera clip`, and `screen record` print MEDIA:<path>.
         """
         print(usage)
     }
@@ -904,10 +988,16 @@ struct ClawdisCLI {
         switch request {
         case let .runShell(_, _, _, timeoutSec, _):
             // Allow longer for commands; still cap overall to a sane bound.
-            min(300, max(10, (timeoutSec ?? 10) + 2))
+            return min(300, max(10, (timeoutSec ?? 10) + 2))
+        case let .cameraClip(_, durationMs, _, _):
+            let ms = durationMs ?? 3000
+            return min(180, max(10, TimeInterval(ms) / 1000.0 + 10))
+        case let .screenRecord(_, durationMs, _, _):
+            let ms = durationMs ?? 10_000
+            return min(180, max(10, TimeInterval(ms) / 1000.0 + 10))
         default:
             // Fail-fast so callers (incl. SSH tool calls) don't hang forever.
-            10
+            return 10
         }
     }
 
diff --git a/apps/macos/Sources/ClawdisIPC/IPC.swift b/apps/macos/Sources/ClawdisIPC/IPC.swift
index 364010326..7559175b2 100644
--- a/apps/macos/Sources/ClawdisIPC/IPC.swift
+++ b/apps/macos/Sources/ClawdisIPC/IPC.swift
@@ -132,6 +132,7 @@ public enum Request: Sendable {
     case nodeInvoke(nodeId: String, command: String, paramsJSON: String?)
     case cameraSnap(facing: CameraFacing?, maxWidth: Int?, quality: Double?, outPath: String?)
     case cameraClip(facing: CameraFacing?, durationMs: Int?, includeAudio: Bool, outPath: String?)
+    case screenRecord(screenIndex: Int?, durationMs: Int?, fps: Double?, outPath: String?)
 }
 
 // MARK: - Responses
@@ -162,6 +163,8 @@ extension Request: Codable {
         case path
         case javaScript
         case outPath
+        case screenIndex
+        case fps
         case canvasA2UICommand
         case jsonl
         case facing
@@ -192,6 +195,7 @@ extension Request: Codable {
         case nodeInvoke
         case cameraSnap
         case cameraClip
+        case screenRecord
     }
 
     public func encode(to encoder: Encoder) throws {
@@ -284,6 +288,13 @@ extension Request: Codable {
             try container.encodeIfPresent(durationMs, forKey: .durationMs)
             try container.encode(includeAudio, forKey: .includeAudio)
             try container.encodeIfPresent(outPath, forKey: .outPath)
+
+        case let .screenRecord(screenIndex, durationMs, fps, outPath):
+            try container.encode(Kind.screenRecord, forKey: .type)
+            try container.encodeIfPresent(screenIndex, forKey: .screenIndex)
+            try container.encodeIfPresent(durationMs, forKey: .durationMs)
+            try container.encodeIfPresent(fps, forKey: .fps)
+            try container.encodeIfPresent(outPath, forKey: .outPath)
         }
     }
 
@@ -378,6 +389,13 @@ extension Request: Codable {
             let includeAudio = (try? container.decode(Bool.self, forKey: .includeAudio)) ?? true
             let outPath = try container.decodeIfPresent(String.self, forKey: .outPath)
             self = .cameraClip(facing: facing, durationMs: durationMs, includeAudio: includeAudio, outPath: outPath)
+
+        case .screenRecord:
+            let screenIndex = try container.decodeIfPresent(Int.self, forKey: .screenIndex)
+            let durationMs = try container.decodeIfPresent(Int.self, forKey: .durationMs)
+            let fps = try container.decodeIfPresent(Double.self, forKey: .fps)
+            let outPath = try container.decodeIfPresent(String.self, forKey: .outPath)
+            self = .screenRecord(screenIndex: screenIndex, durationMs: durationMs, fps: fps, outPath: outPath)
         }
     }
 }