feat!(mac): move screenshot to ui

This commit is contained in:
Peter Steinberger
2025-12-13 11:51:51 +00:00
parent cf90bd9c86
commit 36f21c5a4f
7 changed files with 129 additions and 75 deletions

View File

@@ -58,20 +58,53 @@ enum ControlRequestHandler {
let result = await AgentRPC.shared.status() let result = await AgentRPC.shared.status()
return Response(ok: result.ok, message: result.error) return Response(ok: result.ok, message: result.error)
case let .screenshot(displayID, windowID, _):
let authorized = await PermissionManager
.ensure([.screenRecording], interactive: false)[.screenRecording] ?? false
guard authorized else { return Response(ok: false, message: "screen recording permission missing") }
if let data = await Screenshotter.capture(displayID: displayID, windowID: windowID) {
return Response(ok: true, payload: data)
}
return Response(ok: false, message: "screenshot failed")
case .uiListScreens: case .uiListScreens:
let screens = await MainActor.run { UIScreenService.listScreens() } let screens = await MainActor.run { UIScreenService.listScreens() }
let payload = try JSONEncoder().encode(screens) let payload = try JSONEncoder().encode(screens)
return Response(ok: true, payload: payload) return Response(ok: true, payload: payload)
case let .uiScreenshot(screenIndex, windowID):
let authorized = await PermissionManager
.ensure([.screenRecording], interactive: false)[.screenRecording] ?? false
guard authorized else { return Response(ok: false, message: "screen recording permission missing") }
let resolution: (screenIndex: Int?, displayID: UInt32?) = await Task { @MainActor in
if let screenIndex,
let match = UIScreenService.listScreens().first(where: { $0.index == screenIndex })
{
return (screenIndex, match.displayID)
}
return (nil, nil)
}.value
let data = await Task { @MainActor in
await Screenshotter.capture(displayID: resolution.displayID, windowID: windowID)
}.value
guard let data else {
return Response(ok: false, message: "screenshot failed")
}
let dir = FileManager.default.temporaryDirectory.appendingPathComponent("clawdis-ui", isDirectory: true)
try? FileManager.default.createDirectory(at: dir, withIntermediateDirectories: true)
let outURL = dir.appendingPathComponent("screenshot-\(Int(Date().timeIntervalSince1970 * 1000)).png")
do {
try data.write(to: outURL)
} catch {
return Response(ok: false, message: "failed to write screenshot: \(error.localizedDescription)")
}
let size = ScreenshotSize.readPNGSize(data: data)
let result = UIScreenshotResult(
path: outURL.path,
width: size?.width ?? 0,
height: size?.height ?? 0,
screenIndex: resolution.screenIndex,
displayID: resolution.displayID,
windowID: windowID)
let payload = try JSONEncoder().encode(result)
return Response(ok: true, payload: payload)
case let .runShell(command, cwd, env, timeoutSec, needsSR): case let .runShell(command, cwd, env, timeoutSec, needsSR):
if needsSR { if needsSR {
let authorized = await PermissionManager let authorized = await PermissionManager

View File

@@ -0,0 +1,18 @@
import Foundation
import ImageIO
/// Namespace for reading the pixel dimensions of encoded image data.
enum ScreenshotSize {
    /// Pixel width and height of an image.
    struct Size {
        let width: Int
        let height: Int
    }

    /// Reads the pixel dimensions of the first image contained in `data`.
    ///
    /// The dimensions are taken from the ImageIO property dictionary of the
    /// image at index 0. The image source is created without a type hint.
    ///
    /// - Parameter data: Encoded image bytes.
    /// - Returns: The pixel size, or `nil` when the image source cannot be
    ///   created, the properties cannot be read, or either dimension is
    ///   missing or not representable as `Int`.
    static func readPNGSize(data: Data) -> Size? {
        guard
            let imageSource = CGImageSourceCreateWithData(data as CFData, nil),
            let properties = CGImageSourceCopyPropertiesAtIndex(imageSource, 0, nil) as? [CFString: Any],
            let pixelWidth = properties[kCGImagePropertyPixelWidth] as? Int,
            let pixelHeight = properties[kCGImagePropertyPixelHeight] as? Int
        else {
            return nil
        }
        return Size(width: pixelWidth, height: pixelHeight)
    }
}

View File

@@ -37,8 +37,8 @@ struct ClawdisCLI {
var kind: Kind var kind: Kind
enum Kind { enum Kind {
case screenshot(outPath: String?)
case uiScreens case uiScreens
case uiScreenshot
case generic case generic
} }
} }
@@ -95,23 +95,6 @@ struct ClawdisCLI {
if caps.isEmpty { caps = Capability.allCases } if caps.isEmpty { caps = Capability.allCases }
return ParsedCLIRequest(request: .ensurePermissions(caps, interactive: interactive), kind: .generic) return ParsedCLIRequest(request: .ensurePermissions(caps, interactive: interactive), kind: .generic)
case "screenshot":
var displayID: UInt32?
var windowID: UInt32?
var outPath: String?
while !args.isEmpty {
let arg = args.removeFirst()
switch arg {
case "--display-id": if let val = args.popFirst(), let num = UInt32(val) { displayID = num }
case "--window-id": if let val = args.popFirst(), let num = UInt32(val) { windowID = num }
case "--out": outPath = args.popFirst()
default: break
}
}
return ParsedCLIRequest(
request: .screenshot(displayID: displayID, windowID: windowID, format: "png"),
kind: .screenshot(outPath: outPath))
case "ui": case "ui":
guard let sub = args.first else { throw CLIError.help } guard let sub = args.first else { throw CLIError.help }
args = Array(args.dropFirst()) args = Array(args.dropFirst())
@@ -119,6 +102,18 @@ struct ClawdisCLI {
switch sub { switch sub {
case "screens": case "screens":
return ParsedCLIRequest(request: .uiListScreens, kind: .uiScreens) return ParsedCLIRequest(request: .uiListScreens, kind: .uiScreens)
case "screenshot":
var screenIndex: Int?
var windowID: UInt32?
while !args.isEmpty {
let arg = args.removeFirst()
switch arg {
case "--screen-index": screenIndex = args.popFirst().flatMap(Int.init)
case "--window-id": windowID = args.popFirst().flatMap(UInt32.init)
default: break
}
}
return ParsedCLIRequest(request: .uiScreenshot(screenIndex: screenIndex, windowID: windowID), kind: .uiScreenshot)
default: default:
throw CLIError.help throw CLIError.help
} }
@@ -333,10 +328,6 @@ struct ClawdisCLI {
} }
switch parsed.kind { switch parsed.kind {
case let .screenshot(outPath):
let path = try self.writeScreenshotPayloadToFile(payload: response.payload, outPath: outPath)
FileHandle.standardOutput.write(Data((path + "\n").utf8))
case .uiScreens: case .uiScreens:
let screens = try self.decodePayload([UIScreenInfo].self, payload: response.payload) let screens = try self.decodePayload([UIScreenInfo].self, payload: response.payload)
if screens.isEmpty { if screens.isEmpty {
@@ -351,6 +342,10 @@ struct ClawdisCLI {
FileHandle.standardOutput.write(Data(line.utf8)) FileHandle.standardOutput.write(Data(line.utf8))
} }
case .uiScreenshot:
let result = try self.decodePayload(UIScreenshotResult.self, payload: response.payload)
FileHandle.standardOutput.write(Data((result.path + "\n").utf8))
case .generic: case .generic:
if let payload = response.payload, let text = String(data: payload, encoding: .utf8), !text.isEmpty { if let payload = response.payload, let text = String(data: payload, encoding: .utf8), !text.isEmpty {
FileHandle.standardOutput.write(payload) FileHandle.standardOutput.write(payload)
@@ -370,14 +365,6 @@ struct ClawdisCLI {
] ]
switch parsed.kind { switch parsed.kind {
case let .screenshot(outPath):
if response.ok {
let path = try self.writeScreenshotPayloadToFile(payload: response.payload, outPath: outPath)
output["result"] = ["path": path]
} else {
output["result"] = NSNull()
}
case .uiScreens: case .uiScreens:
if let payload = response.payload, if let payload = response.payload,
let obj = try? JSONSerialization.jsonObject(with: payload) { let obj = try? JSONSerialization.jsonObject(with: payload) {
@@ -386,6 +373,14 @@ struct ClawdisCLI {
output["result"] = [] output["result"] = []
} }
case .uiScreenshot:
if let payload = response.payload,
let obj = try? JSONSerialization.jsonObject(with: payload) {
output["result"] = obj
} else {
output["result"] = NSNull()
}
case .generic: case .generic:
if let payload = response.payload, !payload.isEmpty { if let payload = response.payload, !payload.isEmpty {
if let obj = try? JSONSerialization.jsonObject(with: payload) { if let obj = try? JSONSerialization.jsonObject(with: payload) {
@@ -406,21 +401,6 @@ struct ClawdisCLI {
return try JSONDecoder().decode(T.self, from: payload) return try JSONDecoder().decode(T.self, from: payload)
} }
/// Writes screenshot bytes to disk and returns the path of the written file.
///
/// - Parameters:
///   - payload: Image bytes to persist; must be present and non-empty.
///   - outPath: Destination path. When `nil` or empty, a timestamped file
///     inside a `clawdis-mac` folder under the temporary directory is used.
/// - Returns: The file system path the bytes were written to.
/// - Throws: `POSIXError(.EINVAL)` when there is no payload, or whatever
///   `Data.write(to:)` throws when the write fails.
private static func writeScreenshotPayloadToFile(payload: Data?, outPath: String?) throws -> String {
    guard let bytes = payload, !bytes.isEmpty else { throw POSIXError(.EINVAL) }

    let destination: URL
    if let requestedPath = outPath, !requestedPath.isEmpty {
        // Caller chose the location; resolve symlinks in it.
        destination = URL(fileURLWithPath: requestedPath).resolvingSymlinksInPath()
    } else {
        let fileManager = FileManager.default
        let folder = fileManager.temporaryDirectory
            .appendingPathComponent("clawdis-mac", isDirectory: true)
        // Best effort: a failure here surfaces through the write below.
        try? fileManager.createDirectory(at: folder, withIntermediateDirectories: true)
        destination = folder
            .appendingPathComponent("screenshot-\(Int(Date().timeIntervalSince1970 * 1000)).png")
    }

    try bytes.write(to: destination)
    return destination.path
}
private static func printHelp() { private static func printHelp() {
let usage = """ let usage = """
clawdis-mac — talk to the running Clawdis.app XPC service clawdis-mac — talk to the running Clawdis.app XPC service
@@ -431,8 +411,8 @@ struct ClawdisCLI {
clawdis-mac ensure-permissions clawdis-mac ensure-permissions
[--cap <notifications|accessibility|screenRecording|microphone|speechRecognition>] [--cap <notifications|accessibility|screenRecording|microphone|speechRecognition>]
[--interactive] [--interactive]
clawdis-mac screenshot [--display-id <u32>] [--window-id <u32>] [--out <path>]
clawdis-mac ui screens clawdis-mac ui screens
clawdis-mac ui screenshot [--screen-index <n>] [--window-id <u32>]
clawdis-mac run [--cwd <path>] [--env KEY=VAL] [--timeout <sec>] [--needs-screen-recording] <command ...> clawdis-mac run [--cwd <path>] [--env KEY=VAL] [--timeout <sec>] [--needs-screen-recording] <command ...>
clawdis-mac status clawdis-mac status
clawdis-mac rpc-status clawdis-mac rpc-status

View File

@@ -83,6 +83,31 @@ public struct UIScreenInfo: Codable, Sendable {
} }
} }
/// Result of a `uiScreenshot` request: where the image was written plus
/// metadata about what was captured.
public struct UIScreenshotResult: Codable, Sendable {
    /// File system path of the written screenshot.
    public let path: String
    /// Image width in pixels.
    public let width: Int
    /// Image height in pixels.
    public let height: Int
    /// Index of the captured screen, if one was given.
    public let screenIndex: Int?
    /// Display identifier of the captured screen, if one was given.
    public let displayID: UInt32?
    /// Identifier of the captured window, if one was given.
    public let windowID: UInt32?

    /// Creates a result; the three optional identifiers default to `nil`.
    public init(
        path: String,
        width: Int,
        height: Int,
        screenIndex: Int? = nil,
        displayID: UInt32? = nil,
        windowID: UInt32? = nil
    ) {
        // Location and dimensions of the image.
        self.path = path
        self.width = width
        self.height = height

        // Optional capture target metadata.
        self.screenIndex = screenIndex
        self.displayID = displayID
        self.windowID = windowID
    }
}
public enum Request: Sendable { public enum Request: Sendable {
case notify( case notify(
title: String, title: String,
@@ -91,8 +116,8 @@ public enum Request: Sendable {
priority: NotificationPriority?, priority: NotificationPriority?,
delivery: NotificationDelivery?) delivery: NotificationDelivery?)
case ensurePermissions([Capability], interactive: Bool) case ensurePermissions([Capability], interactive: Bool)
case screenshot(displayID: UInt32?, windowID: UInt32?, format: String)
case uiListScreens case uiListScreens
case uiScreenshot(screenIndex: Int?, windowID: UInt32?)
case runShell( case runShell(
command: [String], command: [String],
cwd: String?, cwd: String?,
@@ -133,7 +158,7 @@ extension Request: Codable {
case type case type
case title, body, sound, priority, delivery case title, body, sound, priority, delivery
case caps, interactive case caps, interactive
case displayID, windowID, format case screenIndex, windowID
case command, cwd, env, timeoutSec, needsScreenRecording case command, cwd, env, timeoutSec, needsScreenRecording
case message, thinking, session, deliver, to case message, thinking, session, deliver, to
case rpcStatus case rpcStatus
@@ -149,8 +174,8 @@ extension Request: Codable {
private enum Kind: String, Codable { private enum Kind: String, Codable {
case notify case notify
case ensurePermissions case ensurePermissions
case screenshot
case uiListScreens case uiListScreens
case uiScreenshot
case runShell case runShell
case status case status
case agent case agent
@@ -180,15 +205,14 @@ extension Request: Codable {
try container.encode(caps, forKey: .caps) try container.encode(caps, forKey: .caps)
try container.encode(interactive, forKey: .interactive) try container.encode(interactive, forKey: .interactive)
case let .screenshot(displayID, windowID, format):
try container.encode(Kind.screenshot, forKey: .type)
try container.encodeIfPresent(displayID, forKey: .displayID)
try container.encodeIfPresent(windowID, forKey: .windowID)
try container.encode(format, forKey: .format)
case .uiListScreens: case .uiListScreens:
try container.encode(Kind.uiListScreens, forKey: .type) try container.encode(Kind.uiListScreens, forKey: .type)
case let .uiScreenshot(screenIndex, windowID):
try container.encode(Kind.uiScreenshot, forKey: .type)
try container.encodeIfPresent(screenIndex, forKey: .screenIndex)
try container.encodeIfPresent(windowID, forKey: .windowID)
case let .runShell(command, cwd, env, timeoutSec, needsSR): case let .runShell(command, cwd, env, timeoutSec, needsSR):
try container.encode(Kind.runShell, forKey: .type) try container.encode(Kind.runShell, forKey: .type)
try container.encode(command, forKey: .command) try container.encode(command, forKey: .command)
@@ -265,15 +289,14 @@ extension Request: Codable {
let interactive = try container.decode(Bool.self, forKey: .interactive) let interactive = try container.decode(Bool.self, forKey: .interactive)
self = .ensurePermissions(caps, interactive: interactive) self = .ensurePermissions(caps, interactive: interactive)
case .screenshot:
let displayID = try container.decodeIfPresent(UInt32.self, forKey: .displayID)
let windowID = try container.decodeIfPresent(UInt32.self, forKey: .windowID)
let format = try container.decode(String.self, forKey: .format)
self = .screenshot(displayID: displayID, windowID: windowID, format: format)
case .uiListScreens: case .uiListScreens:
self = .uiListScreens self = .uiListScreens
case .uiScreenshot:
let screenIndex = try container.decodeIfPresent(Int.self, forKey: .screenIndex)
let windowID = try container.decodeIfPresent(UInt32.self, forKey: .windowID)
self = .uiScreenshot(screenIndex: screenIndex, windowID: windowID)
case .runShell: case .runShell:
let command = try container.decode([String].self, forKey: .command) let command = try container.decode([String].self, forKey: .command)
let cwd = try container.decodeIfPresent(String.self, forKey: .cwd) let cwd = try container.decodeIfPresent(String.self, forKey: .cwd)

View File

@@ -36,7 +36,7 @@ enum Capability { notifications, accessibility, screenRecording, appleScript, mi
enum Request { enum Request {
notify(title, body, sound?) notify(title, body, sound?)
ensurePermissions([Capability], interactive: Bool) ensurePermissions([Capability], interactive: Bool)
screenshot(displayID?, windowID?, format="png") uiScreenshot(screenIndex?, windowID?)
runShell(command:[String], cwd?, env?, timeoutSec?, needsScreenRecording: Bool) runShell(command:[String], cwd?, env?, timeoutSec?, needsScreenRecording: Bool)
status status
} }
@@ -66,8 +66,8 @@ struct Response { ok: Bool; message?: String; payload?: Data }
- Subcommands (text by default; `--json` for machine output; non-zero exit on failure): - Subcommands (text by default; `--json` for machine output; non-zero exit on failure):
- `notify --title --body [--sound] [--priority passive|active|timeSensitive] [--delivery system|overlay|auto]` - `notify --title --body [--sound] [--priority passive|active|timeSensitive] [--delivery system|overlay|auto]`
- `ensure-permissions --cap accessibility --cap screenRecording [--interactive]` - `ensure-permissions --cap accessibility --cap screenRecording [--interactive]`
- `screenshot [--display-id N | --window-id N] [--out path]`
- `ui screens` - `ui screens`
- `ui screenshot [--screen-index N] [--window-id N]`
- `run -- cmd args... [--cwd] [--env KEY=VAL] [--timeout 30] [--needs-screen-recording]` - `run -- cmd args... [--cwd] [--env KEY=VAL] [--timeout 30] [--needs-screen-recording]`
- `status` - `status`
- Sounds: supply any macOS alert name with `--sound` per notification; omit the flag to use the system default. There is no longer a persisted “default sound” in the app UI. - Sounds: supply any macOS alert name with `--sound` per notification; omit the flag to use the system default. There is no longer a persisted “default sound” in the app UI.

View File

@@ -23,7 +23,7 @@ Run the Node-based Clawdis/clawdis gateway as a direct child of the LSUIElement
## TCC guardrails (must keep) ## TCC guardrails (must keep)
- Screen Recording, Accessibility, mic, and speech prompts must originate from the Swift app/XPC. The Node child should never call these APIs directly; use the existing XPC/CLI broker (`clawdis-mac`) for: - Screen Recording, Accessibility, mic, and speech prompts must originate from the Swift app/XPC. The Node child should never call these APIs directly; use the existing XPC/CLI broker (`clawdis-mac`) for:
- `ensure-permissions` - `ensure-permissions`
- `screenshot` / ScreenCaptureKit work - `ui screenshot` / ScreenCaptureKit work
- mic/speech permission checks - mic/speech permission checks
- notifications - notifications
- shell runs that need `needs-screen-recording` - shell runs that need `needs-screen-recording`

View File

@@ -106,14 +106,14 @@ Current state:
The visualizer is intentionally display-only (no clickable overlays needed). The visualizer is intentionally display-only (no clickable overlays needed).
## Screenshots (legacy → Peekaboo takeover) ## Screenshots (legacy → Peekaboo takeover)
Clawdis currently has a legacy `screenshot` request returning raw PNG bytes in `Response.payload`. Clawdis uses `clawdis-mac ui screenshot` and returns a file path (default location: temp directory) instead of raw image bytes.
Migration plan: Migration plan:
- Replace capture implementation with PeekabooAutomationKit’s capture service so we share: - Replace capture implementation with PeekabooAutomationKit’s capture service so we share:
- per-screen mapping - per-screen mapping
- window/app targeting - window/app targeting
- visual feedback (flash / watch HUD) when enabled - visual feedback (flash / watch HUD) when enabled
- Prefer writing images to a file path on the app side and returning the path (text-friendly), with `--json` providing the structured metadata. - Keep writing images to a file path on the app side and returning the path (text-friendly), with `--json` providing the structured metadata.
- No aliases: remove the old `Request.screenshot` and introduce a new `Request.uiScreenshot` (or similar) so the new behavior is explicit and there’s no “legacy mode” to maintain. - No aliases: remove the old `Request.screenshot` and introduce a new `Request.uiScreenshot` (or similar) so the new behavior is explicit and there’s no “legacy mode” to maintain.
## Permissions behavior ## Permissions behavior