diff --git a/apps/macos/Sources/Clawdis/ControlRequestHandler.swift b/apps/macos/Sources/Clawdis/ControlRequestHandler.swift index 489e2b1b0..5dee97bbd 100644 --- a/apps/macos/Sources/Clawdis/ControlRequestHandler.swift +++ b/apps/macos/Sources/Clawdis/ControlRequestHandler.swift @@ -58,20 +58,53 @@ enum ControlRequestHandler { let result = await AgentRPC.shared.status() return Response(ok: result.ok, message: result.error) - case let .screenshot(displayID, windowID, _): - let authorized = await PermissionManager - .ensure([.screenRecording], interactive: false)[.screenRecording] ?? false - guard authorized else { return Response(ok: false, message: "screen recording permission missing") } - if let data = await Screenshotter.capture(displayID: displayID, windowID: windowID) { - return Response(ok: true, payload: data) - } - return Response(ok: false, message: "screenshot failed") - case .uiListScreens: let screens = await MainActor.run { UIScreenService.listScreens() } let payload = try JSONEncoder().encode(screens) return Response(ok: true, payload: payload) + case let .uiScreenshot(screenIndex, windowID): + let authorized = await PermissionManager + .ensure([.screenRecording], interactive: false)[.screenRecording] ?? false + guard authorized else { return Response(ok: false, message: "screen recording permission missing") } + + let resolution: (screenIndex: Int?, displayID: UInt32?) = await Task { @MainActor in + if let screenIndex, + let match = UIScreenService.listScreens().first(where: { $0.index == screenIndex }) + { + return (screenIndex, match.displayID) + } + return (nil, nil) + }.value + + let data = await Task { @MainActor in + await Screenshotter.capture(displayID: resolution.displayID, windowID: windowID) + }.value + + guard let data else { + return Response(ok: false, message: "screenshot failed") + } + + let dir = FileManager.default.temporaryDirectory.appendingPathComponent("clawdis-ui", isDirectory: true) + try? 
FileManager.default.createDirectory(at: dir, withIntermediateDirectories: true) + let outURL = dir.appendingPathComponent("screenshot-\(Int(Date().timeIntervalSince1970 * 1000)).png") + do { + try data.write(to: outURL) + } catch { + return Response(ok: false, message: "failed to write screenshot: \(error.localizedDescription)") + } + + let size = ScreenshotSize.readPNGSize(data: data) + let result = UIScreenshotResult( + path: outURL.path, + width: size?.width ?? 0, + height: size?.height ?? 0, + screenIndex: resolution.screenIndex, + displayID: resolution.displayID, + windowID: windowID) + let payload = try JSONEncoder().encode(result) + return Response(ok: true, payload: payload) + case let .runShell(command, cwd, env, timeoutSec, needsSR): if needsSR { let authorized = await PermissionManager diff --git a/apps/macos/Sources/Clawdis/ScreenshotSize.swift b/apps/macos/Sources/Clawdis/ScreenshotSize.swift new file mode 100644 index 000000000..8ffeba779 --- /dev/null +++ b/apps/macos/Sources/Clawdis/ScreenshotSize.swift @@ -0,0 +1,18 @@ +import Foundation +import ImageIO + +enum ScreenshotSize { + struct Size { + let width: Int + let height: Int + } + + static func readPNGSize(data: Data) -> Size? { + guard let source = CGImageSourceCreateWithData(data as CFData, nil) else { return nil } + guard let props = CGImageSourceCopyPropertiesAtIndex(source, 0, nil) as? [CFString: Any] else { return nil } + guard let width = props[kCGImagePropertyPixelWidth] as? Int else { return nil } + guard let height = props[kCGImagePropertyPixelHeight] as? Int else { return nil } + return Size(width: width, height: height) + } +} + diff --git a/apps/macos/Sources/ClawdisCLI/main.swift b/apps/macos/Sources/ClawdisCLI/main.swift index 4c667ce72..3ce5e132c 100644 --- a/apps/macos/Sources/ClawdisCLI/main.swift +++ b/apps/macos/Sources/ClawdisCLI/main.swift @@ -37,8 +37,8 @@ struct ClawdisCLI { var kind: Kind enum Kind { - case screenshot(outPath: String?) 
case uiScreens + case uiScreenshot case generic } } @@ -95,23 +95,6 @@ struct ClawdisCLI { if caps.isEmpty { caps = Capability.allCases } return ParsedCLIRequest(request: .ensurePermissions(caps, interactive: interactive), kind: .generic) - case "screenshot": - var displayID: UInt32? - var windowID: UInt32? - var outPath: String? - while !args.isEmpty { - let arg = args.removeFirst() - switch arg { - case "--display-id": if let val = args.popFirst(), let num = UInt32(val) { displayID = num } - case "--window-id": if let val = args.popFirst(), let num = UInt32(val) { windowID = num } - case "--out": outPath = args.popFirst() - default: break - } - } - return ParsedCLIRequest( - request: .screenshot(displayID: displayID, windowID: windowID, format: "png"), - kind: .screenshot(outPath: outPath)) - case "ui": guard let sub = args.first else { throw CLIError.help } args = Array(args.dropFirst()) @@ -119,6 +102,18 @@ struct ClawdisCLI { switch sub { case "screens": return ParsedCLIRequest(request: .uiListScreens, kind: .uiScreens) + case "screenshot": + var screenIndex: Int? + var windowID: UInt32? 
+ while !args.isEmpty { + let arg = args.removeFirst() + switch arg { + case "--screen-index": screenIndex = args.popFirst().flatMap(Int.init) + case "--window-id": windowID = args.popFirst().flatMap(UInt32.init) + default: break + } + } + return ParsedCLIRequest(request: .uiScreenshot(screenIndex: screenIndex, windowID: windowID), kind: .uiScreenshot) default: throw CLIError.help } @@ -333,10 +328,6 @@ struct ClawdisCLI { } switch parsed.kind { - case let .screenshot(outPath): - let path = try self.writeScreenshotPayloadToFile(payload: response.payload, outPath: outPath) - FileHandle.standardOutput.write(Data((path + "\n").utf8)) - case .uiScreens: let screens = try self.decodePayload([UIScreenInfo].self, payload: response.payload) if screens.isEmpty { @@ -351,6 +342,10 @@ struct ClawdisCLI { FileHandle.standardOutput.write(Data(line.utf8)) } + case .uiScreenshot: + let result = try self.decodePayload(UIScreenshotResult.self, payload: response.payload) + FileHandle.standardOutput.write(Data((result.path + "\n").utf8)) + case .generic: if let payload = response.payload, let text = String(data: payload, encoding: .utf8), !text.isEmpty { FileHandle.standardOutput.write(payload) @@ -370,14 +365,6 @@ struct ClawdisCLI { ] switch parsed.kind { - case let .screenshot(outPath): - if response.ok { - let path = try self.writeScreenshotPayloadToFile(payload: response.payload, outPath: outPath) - output["result"] = ["path": path] - } else { - output["result"] = NSNull() - } - case .uiScreens: if let payload = response.payload, let obj = try? JSONSerialization.jsonObject(with: payload) { @@ -386,6 +373,14 @@ struct ClawdisCLI { output["result"] = [] } + case .uiScreenshot: + if let payload = response.payload, + let obj = try? JSONSerialization.jsonObject(with: payload) { + output["result"] = obj + } else { + output["result"] = NSNull() + } + case .generic: if let payload = response.payload, !payload.isEmpty { if let obj = try? 
JSONSerialization.jsonObject(with: payload) { @@ -406,21 +401,6 @@ struct ClawdisCLI { return try JSONDecoder().decode(T.self, from: payload) } - private static func writeScreenshotPayloadToFile(payload: Data?, outPath: String?) throws -> String { - guard let payload, !payload.isEmpty else { throw POSIXError(.EINVAL) } - let url: URL - if let outPath, !outPath.isEmpty { - url = URL(fileURLWithPath: outPath).resolvingSymlinksInPath() - } else { - let dir = FileManager.default.temporaryDirectory.appendingPathComponent("clawdis-mac", isDirectory: true) - try? FileManager.default.createDirectory(at: dir, withIntermediateDirectories: true) - let name = "screenshot-\(Int(Date().timeIntervalSince1970 * 1000)).png" - url = dir.appendingPathComponent(name) - } - try payload.write(to: url) - return url.path - } - private static func printHelp() { let usage = """ clawdis-mac — talk to the running Clawdis.app XPC service @@ -431,8 +411,8 @@ struct ClawdisCLI { clawdis-mac ensure-permissions [--cap ] [--interactive] - clawdis-mac screenshot [--display-id ] [--window-id ] [--out ] clawdis-mac ui screens + clawdis-mac ui screenshot [--screen-index ] [--window-id ] clawdis-mac run [--cwd ] [--env KEY=VAL] [--timeout ] [--needs-screen-recording] clawdis-mac status clawdis-mac rpc-status diff --git a/apps/macos/Sources/ClawdisIPC/IPC.swift b/apps/macos/Sources/ClawdisIPC/IPC.swift index 0a7b6c089..526d91fdf 100644 --- a/apps/macos/Sources/ClawdisIPC/IPC.swift +++ b/apps/macos/Sources/ClawdisIPC/IPC.swift @@ -83,6 +83,31 @@ public struct UIScreenInfo: Codable, Sendable { } } +public struct UIScreenshotResult: Codable, Sendable { + public let path: String + public let width: Int + public let height: Int + public let screenIndex: Int? + public let displayID: UInt32? + public let windowID: UInt32? + + public init( + path: String, + width: Int, + height: Int, + screenIndex: Int? = nil, + displayID: UInt32? = nil, + windowID: UInt32? 
= nil) + { + self.path = path + self.width = width + self.height = height + self.screenIndex = screenIndex + self.displayID = displayID + self.windowID = windowID + } +} + public enum Request: Sendable { case notify( title: String, @@ -91,8 +116,8 @@ public enum Request: Sendable { priority: NotificationPriority?, delivery: NotificationDelivery?) case ensurePermissions([Capability], interactive: Bool) - case screenshot(displayID: UInt32?, windowID: UInt32?, format: String) case uiListScreens + case uiScreenshot(screenIndex: Int?, windowID: UInt32?) case runShell( command: [String], cwd: String?, @@ -133,7 +158,7 @@ extension Request: Codable { case type case title, body, sound, priority, delivery case caps, interactive - case displayID, windowID, format + case screenIndex, windowID case command, cwd, env, timeoutSec, needsScreenRecording case message, thinking, session, deliver, to case rpcStatus @@ -149,8 +174,8 @@ extension Request: Codable { private enum Kind: String, Codable { case notify case ensurePermissions - case screenshot case uiListScreens + case uiScreenshot case runShell case status case agent @@ -180,15 +205,14 @@ extension Request: Codable { try container.encode(caps, forKey: .caps) try container.encode(interactive, forKey: .interactive) - case let .screenshot(displayID, windowID, format): - try container.encode(Kind.screenshot, forKey: .type) - try container.encodeIfPresent(displayID, forKey: .displayID) - try container.encodeIfPresent(windowID, forKey: .windowID) - try container.encode(format, forKey: .format) - case .uiListScreens: try container.encode(Kind.uiListScreens, forKey: .type) + case let .uiScreenshot(screenIndex, windowID): + try container.encode(Kind.uiScreenshot, forKey: .type) + try container.encodeIfPresent(screenIndex, forKey: .screenIndex) + try container.encodeIfPresent(windowID, forKey: .windowID) + case let .runShell(command, cwd, env, timeoutSec, needsSR): try container.encode(Kind.runShell, forKey: .type) try 
container.encode(command, forKey: .command) @@ -265,15 +289,14 @@ extension Request: Codable { let interactive = try container.decode(Bool.self, forKey: .interactive) self = .ensurePermissions(caps, interactive: interactive) - case .screenshot: - let displayID = try container.decodeIfPresent(UInt32.self, forKey: .displayID) - let windowID = try container.decodeIfPresent(UInt32.self, forKey: .windowID) - let format = try container.decode(String.self, forKey: .format) - self = .screenshot(displayID: displayID, windowID: windowID, format: format) - case .uiListScreens: self = .uiListScreens + case .uiScreenshot: + let screenIndex = try container.decodeIfPresent(Int.self, forKey: .screenIndex) + let windowID = try container.decodeIfPresent(UInt32.self, forKey: .windowID) + self = .uiScreenshot(screenIndex: screenIndex, windowID: windowID) + case .runShell: let command = try container.decode([String].self, forKey: .command) let cwd = try container.decodeIfPresent(String.self, forKey: .cwd) diff --git a/docs/clawdis-mac.md b/docs/clawdis-mac.md index 802740d80..4cc5be6f9 100644 --- a/docs/clawdis-mac.md +++ b/docs/clawdis-mac.md @@ -36,7 +36,7 @@ enum Capability { notifications, accessibility, screenRecording, appleScript, mi enum Request { notify(title, body, sound?) ensurePermissions([Capability], interactive: Bool) - screenshot(displayID?, windowID?, format="png") + uiScreenshot(screenIndex?, windowID?) 
runShell(command:[String], cwd?, env?, timeoutSec?, needsScreenRecording: Bool) status } @@ -66,8 +66,8 @@ struct Response { ok: Bool; message?: String; payload?: Data } - Subcommands (text by default; `--json` for machine output; non-zero exit on failure): - `notify --title --body [--sound] [--priority passive|active|timeSensitive] [--delivery system|overlay|auto]` - `ensure-permissions --cap accessibility --cap screenRecording [--interactive]` - - `screenshot [--display-id N | --window-id N] [--out path]` - `ui screens` + - `ui screenshot [--screen-index N] [--window-id N]` - `run -- cmd args... [--cwd] [--env KEY=VAL] [--timeout 30] [--needs-screen-recording]` - `status` - Sounds: supply any macOS alert name with `--sound` per notification; omit the flag to use the system default. There is no longer a persisted “default sound” in the app UI. diff --git a/docs/mac/child-process.md b/docs/mac/child-process.md index 38fcb7a31..630abee66 100644 --- a/docs/mac/child-process.md +++ b/docs/mac/child-process.md @@ -23,7 +23,7 @@ Run the Node-based Clawdis/clawdis gateway as a direct child of the LSUIElement ## TCC guardrails (must keep) - Screen Recording, Accessibility, mic, and speech prompts must originate from the Swift app/XPC. The Node child should never call these APIs directly; use the existing XPC/CLI broker (`clawdis-mac`) for: - `ensure-permissions` - - `screenshot` / ScreenCaptureKit work + - `ui screenshot` / ScreenCaptureKit work - mic/speech permission checks - notifications - shell runs that need `needs-screen-recording` diff --git a/docs/mac/peekaboo.md b/docs/mac/peekaboo.md index adb97d562..6bbfe3574 100644 --- a/docs/mac/peekaboo.md +++ b/docs/mac/peekaboo.md @@ -106,14 +106,14 @@ Current state: The visualizer is intentionally display-only (no clickable overlays needed). ## Screenshots (legacy → Peekaboo takeover) -Clawdis currently has a legacy `screenshot` request returning raw PNG bytes in `Response.payload`. 
+Screenshots are taken with `clawdis-mac ui screenshot`, which writes a PNG into the `clawdis-ui` folder of the system temp directory and returns that file path instead of raw image bytes. Migration plan: - Replace capture implementation with PeekabooAutomationKit’s capture service so we share: - per-screen mapping - window/app targeting - visual feedback (flash / watch HUD) when enabled -- Prefer writing images to a file path on the app side and returning the path (text-friendly), with `--json` providing the structured metadata. +- Keep writing images to a file path on the app side and returning the path (text-friendly), with `--json` providing the structured metadata. - No aliases: remove the old `Request.screenshot` and introduce a new `Request.uiScreenshot` (or similar) so the new behavior is explicit and there’s no “legacy mode” to maintain. ## Permissions behavior