feat!(mac): move screenshot to ui

This commit is contained in:
Peter Steinberger
2025-12-13 11:51:51 +00:00
parent cf90bd9c86
commit 36f21c5a4f
7 changed files with 129 additions and 75 deletions

View File

@@ -58,20 +58,53 @@ enum ControlRequestHandler {
let result = await AgentRPC.shared.status()
return Response(ok: result.ok, message: result.error)
case let .screenshot(displayID, windowID, _):
let authorized = await PermissionManager
.ensure([.screenRecording], interactive: false)[.screenRecording] ?? false
guard authorized else { return Response(ok: false, message: "screen recording permission missing") }
if let data = await Screenshotter.capture(displayID: displayID, windowID: windowID) {
return Response(ok: true, payload: data)
}
return Response(ok: false, message: "screenshot failed")
case .uiListScreens:
let screens = await MainActor.run { UIScreenService.listScreens() }
let payload = try JSONEncoder().encode(screens)
return Response(ok: true, payload: payload)
case let .uiScreenshot(screenIndex, windowID):
let authorized = await PermissionManager
.ensure([.screenRecording], interactive: false)[.screenRecording] ?? false
guard authorized else { return Response(ok: false, message: "screen recording permission missing") }
let resolution: (screenIndex: Int?, displayID: UInt32?) = await Task { @MainActor in
if let screenIndex,
let match = UIScreenService.listScreens().first(where: { $0.index == screenIndex })
{
return (screenIndex, match.displayID)
}
return (nil, nil)
}.value
let data = await Task { @MainActor in
await Screenshotter.capture(displayID: resolution.displayID, windowID: windowID)
}.value
guard let data else {
return Response(ok: false, message: "screenshot failed")
}
let dir = FileManager.default.temporaryDirectory.appendingPathComponent("clawdis-ui", isDirectory: true)
try? FileManager.default.createDirectory(at: dir, withIntermediateDirectories: true)
let outURL = dir.appendingPathComponent("screenshot-\(Int(Date().timeIntervalSince1970 * 1000)).png")
do {
try data.write(to: outURL)
} catch {
return Response(ok: false, message: "failed to write screenshot: \(error.localizedDescription)")
}
let size = ScreenshotSize.readPNGSize(data: data)
let result = UIScreenshotResult(
path: outURL.path,
width: size?.width ?? 0,
height: size?.height ?? 0,
screenIndex: resolution.screenIndex,
displayID: resolution.displayID,
windowID: windowID)
let payload = try JSONEncoder().encode(result)
return Response(ok: true, payload: payload)
case let .runShell(command, cwd, env, timeoutSec, needsSR):
if needsSR {
let authorized = await PermissionManager

View File

@@ -0,0 +1,18 @@
import Foundation
import ImageIO
/// Namespace for reading the pixel dimensions of encoded screenshot data.
enum ScreenshotSize {
    /// Width and height of an image, in pixels.
    struct Size {
        let width: Int
        let height: Int
    }

    /// Looks up the pixel dimensions of the first image contained in `data`.
    ///
    /// The dimensions come from the ImageIO property dictionary of the image at index 0.
    ///
    /// - Parameter data: Encoded image bytes.
    /// - Returns: The pixel size, or `nil` when no image source can be created, the
    ///   properties are unavailable, or either dimension is missing.
    static func readPNGSize(data: Data) -> Size? {
        guard
            let imageSource = CGImageSourceCreateWithData(data as CFData, nil),
            let properties = CGImageSourceCopyPropertiesAtIndex(imageSource, 0, nil) as? [CFString: Any],
            let pixelWidth = properties[kCGImagePropertyPixelWidth] as? Int,
            let pixelHeight = properties[kCGImagePropertyPixelHeight] as? Int
        else {
            return nil
        }
        return Size(width: pixelWidth, height: pixelHeight)
    }
}

View File

@@ -37,8 +37,8 @@ struct ClawdisCLI {
var kind: Kind
/// Selects how the CLI renders the response for a parsed request
/// (the output code switches over this value).
enum Kind {
/// Response payload is written to a file via `writeScreenshotPayloadToFile`
/// (using `outPath` when given) and the resulting path is reported.
case screenshot(outPath: String?)
/// Response payload is decoded as `[UIScreenInfo]`.
case uiScreens
/// Response payload is decoded as `UIScreenshotResult`; its `path` is printed.
case uiScreenshot
/// No request-specific handling; a non-empty payload is emitted as text or JSON.
case generic
}
}
@@ -95,23 +95,6 @@ struct ClawdisCLI {
if caps.isEmpty { caps = Capability.allCases }
return ParsedCLIRequest(request: .ensurePermissions(caps, interactive: interactive), kind: .generic)
case "screenshot":
var displayID: UInt32?
var windowID: UInt32?
var outPath: String?
while !args.isEmpty {
let arg = args.removeFirst()
switch arg {
case "--display-id": if let val = args.popFirst(), let num = UInt32(val) { displayID = num }
case "--window-id": if let val = args.popFirst(), let num = UInt32(val) { windowID = num }
case "--out": outPath = args.popFirst()
default: break
}
}
return ParsedCLIRequest(
request: .screenshot(displayID: displayID, windowID: windowID, format: "png"),
kind: .screenshot(outPath: outPath))
case "ui":
guard let sub = args.first else { throw CLIError.help }
args = Array(args.dropFirst())
@@ -119,6 +102,18 @@ struct ClawdisCLI {
switch sub {
case "screens":
return ParsedCLIRequest(request: .uiListScreens, kind: .uiScreens)
case "screenshot":
var screenIndex: Int?
var windowID: UInt32?
while !args.isEmpty {
let arg = args.removeFirst()
switch arg {
case "--screen-index": screenIndex = args.popFirst().flatMap(Int.init)
case "--window-id": windowID = args.popFirst().flatMap(UInt32.init)
default: break
}
}
return ParsedCLIRequest(request: .uiScreenshot(screenIndex: screenIndex, windowID: windowID), kind: .uiScreenshot)
default:
throw CLIError.help
}
@@ -333,10 +328,6 @@ struct ClawdisCLI {
}
switch parsed.kind {
case let .screenshot(outPath):
let path = try self.writeScreenshotPayloadToFile(payload: response.payload, outPath: outPath)
FileHandle.standardOutput.write(Data((path + "\n").utf8))
case .uiScreens:
let screens = try self.decodePayload([UIScreenInfo].self, payload: response.payload)
if screens.isEmpty {
@@ -351,6 +342,10 @@ struct ClawdisCLI {
FileHandle.standardOutput.write(Data(line.utf8))
}
case .uiScreenshot:
let result = try self.decodePayload(UIScreenshotResult.self, payload: response.payload)
FileHandle.standardOutput.write(Data((result.path + "\n").utf8))
case .generic:
if let payload = response.payload, let text = String(data: payload, encoding: .utf8), !text.isEmpty {
FileHandle.standardOutput.write(payload)
@@ -370,14 +365,6 @@ struct ClawdisCLI {
]
switch parsed.kind {
case let .screenshot(outPath):
if response.ok {
let path = try self.writeScreenshotPayloadToFile(payload: response.payload, outPath: outPath)
output["result"] = ["path": path]
} else {
output["result"] = NSNull()
}
case .uiScreens:
if let payload = response.payload,
let obj = try? JSONSerialization.jsonObject(with: payload) {
@@ -386,6 +373,14 @@ struct ClawdisCLI {
output["result"] = []
}
case .uiScreenshot:
if let payload = response.payload,
let obj = try? JSONSerialization.jsonObject(with: payload) {
output["result"] = obj
} else {
output["result"] = NSNull()
}
case .generic:
if let payload = response.payload, !payload.isEmpty {
if let obj = try? JSONSerialization.jsonObject(with: payload) {
@@ -406,21 +401,6 @@ struct ClawdisCLI {
return try JSONDecoder().decode(T.self, from: payload)
}
/// Persists a screenshot payload to disk and returns the path it was written to.
///
/// - Parameters:
///   - payload: Encoded image bytes; `nil` or empty data is rejected.
///   - outPath: Destination path. When `nil` or empty, a millisecond-timestamped
///     `screenshot-*.png` inside the `clawdis-mac` folder of the temporary directory is used.
/// - Returns: The filesystem path of the written file.
/// - Throws: `POSIXError(.EINVAL)` when there is nothing to write, or whatever
///   `Data.write(to:)` throws.
private static func writeScreenshotPayloadToFile(payload: Data?, outPath: String?) throws -> String {
    guard let imageData = payload, !imageData.isEmpty else { throw POSIXError(.EINVAL) }

    let destination: URL
    switch outPath {
    case let explicitPath? where !explicitPath.isEmpty:
        destination = URL(fileURLWithPath: explicitPath).resolvingSymlinksInPath()
    default:
        let folder = FileManager.default.temporaryDirectory
            .appendingPathComponent("clawdis-mac", isDirectory: true)
        // Best effort: if the folder cannot be created, the write below reports the failure.
        try? FileManager.default.createDirectory(at: folder, withIntermediateDirectories: true)
        let millis = Int(Date().timeIntervalSince1970 * 1000)
        destination = folder.appendingPathComponent("screenshot-\(millis).png")
    }

    try imageData.write(to: destination)
    return destination.path
}
private static func printHelp() {
let usage = """
clawdis-mac — talk to the running Clawdis.app XPC service
@@ -431,8 +411,8 @@ struct ClawdisCLI {
clawdis-mac ensure-permissions
[--cap <notifications|accessibility|screenRecording|microphone|speechRecognition>]
[--interactive]
clawdis-mac screenshot [--display-id <u32>] [--window-id <u32>] [--out <path>]
clawdis-mac ui screens
clawdis-mac ui screenshot [--screen-index <n>] [--window-id <u32>]
clawdis-mac run [--cwd <path>] [--env KEY=VAL] [--timeout <sec>] [--needs-screen-recording] <command ...>
clawdis-mac status
clawdis-mac rpc-status

View File

@@ -83,6 +83,31 @@ public struct UIScreenInfo: Codable, Sendable {
}
}
/// Describes a screenshot that has been saved to disk, as reported back to the caller.
public struct UIScreenshotResult: Codable, Sendable {
    /// Filesystem path of the written image file.
    public let path: String
    /// Image width in pixels.
    public let width: Int
    /// Image height in pixels.
    public let height: Int
    /// Index of the captured screen, when one was resolved.
    public let screenIndex: Int?
    /// Display identifier of the captured screen, when one was resolved.
    public let displayID: UInt32?
    /// Identifier of the captured window, when one was requested.
    public let windowID: UInt32?

    /// Creates a result value.
    ///
    /// - Parameters:
    ///   - path: Filesystem path of the written image file.
    ///   - width: Image width in pixels.
    ///   - height: Image height in pixels.
    ///   - screenIndex: Index of the captured screen, if any.
    ///   - displayID: Display identifier of the captured screen, if any.
    ///   - windowID: Identifier of the captured window, if any.
    public init(
        path: String,
        width: Int,
        height: Int,
        screenIndex: Int? = nil,
        displayID: UInt32? = nil,
        windowID: UInt32? = nil
    ) {
        // Mandatory description of the written file.
        (self.path, self.width, self.height) = (path, width, height)
        // Optional capture-target metadata.
        (self.screenIndex, self.displayID, self.windowID) = (screenIndex, displayID, windowID)
    }
}
public enum Request: Sendable {
case notify(
title: String,
@@ -91,8 +116,8 @@ public enum Request: Sendable {
priority: NotificationPriority?,
delivery: NotificationDelivery?)
case ensurePermissions([Capability], interactive: Bool)
case screenshot(displayID: UInt32?, windowID: UInt32?, format: String)
case uiListScreens
case uiScreenshot(screenIndex: Int?, windowID: UInt32?)
case runShell(
command: [String],
cwd: String?,
@@ -133,7 +158,7 @@ extension Request: Codable {
case type
case title, body, sound, priority, delivery
case caps, interactive
case displayID, windowID, format
case screenIndex, windowID
case command, cwd, env, timeoutSec, needsScreenRecording
case message, thinking, session, deliver, to
case rpcStatus
@@ -149,8 +174,8 @@ extension Request: Codable {
private enum Kind: String, Codable {
case notify
case ensurePermissions
case screenshot
case uiListScreens
case uiScreenshot
case runShell
case status
case agent
@@ -180,15 +205,14 @@ extension Request: Codable {
try container.encode(caps, forKey: .caps)
try container.encode(interactive, forKey: .interactive)
case let .screenshot(displayID, windowID, format):
try container.encode(Kind.screenshot, forKey: .type)
try container.encodeIfPresent(displayID, forKey: .displayID)
try container.encodeIfPresent(windowID, forKey: .windowID)
try container.encode(format, forKey: .format)
case .uiListScreens:
try container.encode(Kind.uiListScreens, forKey: .type)
case let .uiScreenshot(screenIndex, windowID):
try container.encode(Kind.uiScreenshot, forKey: .type)
try container.encodeIfPresent(screenIndex, forKey: .screenIndex)
try container.encodeIfPresent(windowID, forKey: .windowID)
case let .runShell(command, cwd, env, timeoutSec, needsSR):
try container.encode(Kind.runShell, forKey: .type)
try container.encode(command, forKey: .command)
@@ -265,15 +289,14 @@ extension Request: Codable {
let interactive = try container.decode(Bool.self, forKey: .interactive)
self = .ensurePermissions(caps, interactive: interactive)
case .screenshot:
let displayID = try container.decodeIfPresent(UInt32.self, forKey: .displayID)
let windowID = try container.decodeIfPresent(UInt32.self, forKey: .windowID)
let format = try container.decode(String.self, forKey: .format)
self = .screenshot(displayID: displayID, windowID: windowID, format: format)
case .uiListScreens:
self = .uiListScreens
case .uiScreenshot:
let screenIndex = try container.decodeIfPresent(Int.self, forKey: .screenIndex)
let windowID = try container.decodeIfPresent(UInt32.self, forKey: .windowID)
self = .uiScreenshot(screenIndex: screenIndex, windowID: windowID)
case .runShell:
let command = try container.decode([String].self, forKey: .command)
let cwd = try container.decodeIfPresent(String.self, forKey: .cwd)