feat(mac): host PeekabooBridge for ui

This commit is contained in:
Peter Steinberger
2025-12-13 16:55:41 +00:00
parent fd566bda14
commit c17440f5b4
21 changed files with 1197 additions and 422 deletions

View File

@@ -155,6 +155,15 @@ final class AppState: ObservableObject {
didSet { self.ifNotPreview { UserDefaults.standard.set(self.canvasEnabled, forKey: canvasEnabledKey) } }
}
/// Whether the in-app PeekabooBridge host should be running.
///
/// On every change, inside `ifNotPreview`, the new value is written to `UserDefaults` under
/// `peekabooBridgeEnabledKey` and forwarded to `PeekabooBridgeHostCoordinator.shared.setEnabled(_:)`.
@Published var peekabooBridgeEnabled: Bool {
didSet {
self.ifNotPreview {
UserDefaults.standard.set(self.peekabooBridgeEnabled, forKey: peekabooBridgeEnabledKey)
// `setEnabled` is async; launch it in its own Task because `didSet` is synchronous.
Task { await PeekabooBridgeHostCoordinator.shared.setEnabled(self.peekabooBridgeEnabled) }
}
}
}
@Published var attachExistingGatewayOnly: Bool {
didSet {
self.ifNotPreview {
@@ -231,6 +240,8 @@ final class AppState: ObservableObject {
let storedPort = UserDefaults.standard.integer(forKey: webChatPortKey)
self.webChatPort = storedPort > 0 ? storedPort : 18788
self.canvasEnabled = UserDefaults.standard.object(forKey: canvasEnabledKey) as? Bool ?? true
self.peekabooBridgeEnabled = UserDefaults.standard
.object(forKey: peekabooBridgeEnabledKey) as? Bool ?? true
self.attachExistingGatewayOnly = UserDefaults.standard.bool(forKey: attachExistingGatewayOnlyKey)
if !self.isPreview {

View File

@@ -24,6 +24,7 @@ let webChatEnabledKey = "clawdis.webChatEnabled"
let webChatSwiftUIEnabledKey = "clawdis.webChatSwiftUIEnabled"
let webChatPortKey = "clawdis.webChatPort"
let canvasEnabledKey = "clawdis.canvasEnabled"
// UserDefaults key backing `AppState.peekabooBridgeEnabled`.
let peekabooBridgeEnabledKey = "clawdis.peekabooBridgeEnabled"
let deepLinkAgentEnabledKey = "clawdis.deepLinkAgentEnabled"
let deepLinkKeyKey = "clawdis.deepLinkKey"
let modelCatalogPathKey = "clawdis.modelCatalogPath"

View File

@@ -58,53 +58,6 @@ enum ControlRequestHandler {
let result = await AgentRPC.shared.status()
return Response(ok: result.ok, message: result.error)
case .uiListScreens:
let screens = await MainActor.run { UIScreenService.listScreens() }
let payload = try JSONEncoder().encode(screens)
return Response(ok: true, payload: payload)
case let .uiScreenshot(screenIndex, windowID):
let authorized = await PermissionManager
.ensure([.screenRecording], interactive: false)[.screenRecording] ?? false
guard authorized else { return Response(ok: false, message: "screen recording permission missing") }
let resolution: (screenIndex: Int?, displayID: UInt32?) = await Task { @MainActor in
if let screenIndex,
let match = UIScreenService.listScreens().first(where: { $0.index == screenIndex })
{
return (screenIndex, match.displayID)
}
return (nil, nil)
}.value
let data = await Task { @MainActor in
await Screenshotter.capture(displayID: resolution.displayID, windowID: windowID)
}.value
guard let data else {
return Response(ok: false, message: "screenshot failed")
}
let dir = FileManager.default.temporaryDirectory.appendingPathComponent("clawdis-ui", isDirectory: true)
try? FileManager.default.createDirectory(at: dir, withIntermediateDirectories: true)
let outURL = dir.appendingPathComponent("screenshot-\(Int(Date().timeIntervalSince1970 * 1000)).png")
do {
try data.write(to: outURL)
} catch {
return Response(ok: false, message: "failed to write screenshot: \(error.localizedDescription)")
}
let size = ScreenshotSize.readPNGSize(data: data)
let result = UIScreenshotResult(
path: outURL.path,
width: size?.width ?? 0,
height: size?.height ?? 0,
screenIndex: resolution.screenIndex,
displayID: resolution.displayID,
windowID: windowID)
let payload = try JSONEncoder().encode(result)
return Response(ok: true, payload: payload)
case let .runShell(command, cwd, env, timeoutSec, needsSR):
if needsSR {
let authorized = await PermissionManager

View File

@@ -57,6 +57,11 @@ struct GeneralSettings: View {
subtitle: "Allow the agent to show and control the Canvas panel.",
binding: self.$state.canvasEnabled)
SettingsToggleRow(
title: "Enable Peekaboo Bridge",
subtitle: "Allow signed tools to drive UI automation via `clawdis-mac ui …`.",
binding: self.$state.peekabooBridgeEnabled)
SettingsToggleRow(
title: "Enable debug tools",
subtitle: "Show the Debug tab with development utilities.",

View File

@@ -183,6 +183,7 @@ final class AppDelegate: NSObject, NSApplicationDelegate {
Task { await HealthStore.shared.refresh(onDemand: true) }
Task { await PortGuardian.shared.sweep(mode: AppStateStore.shared.connectionMode) }
Task { await self.socketServer.start() }
Task { await PeekabooBridgeHostCoordinator.shared.setEnabled(AppStateStore.shared.peekabooBridgeEnabled) }
self.scheduleFirstRunOnboardingIfNeeded()
// Developer/testing helper: auto-open WebChat when launched with --webchat
@@ -202,6 +203,7 @@ final class AppDelegate: NSObject, NSApplicationDelegate {
Task { await AgentRPC.shared.shutdown() }
Task { await GatewayConnection.shared.shutdown() }
Task { await self.socketServer.stop() }
Task { await PeekabooBridgeHostCoordinator.shared.stop() }
}
@MainActor

View File

@@ -0,0 +1,254 @@
import Foundation
import os
import PeekabooAutomationKit
import PeekabooBridge
import PeekabooFoundation
import PeekabooVisualizer
/// Owns the lifetime of the PeekabooBridge host that this app exposes on
/// `PeekabooBridgeConstants.clawdisSocketPath`.
///
/// Start and stop both suspend (`await host.start()` / `await host.stop()`), and a `@MainActor`
/// type is reentrant across suspension points. Without ordering, a "start" issued while a "stop"
/// is still suspended would see `host != nil`, do nothing, and the bridge would end up stopped
/// even though the latest request was to enable it. All transitions are therefore chained so
/// they run strictly one after another in the order they were requested.
@MainActor
final class PeekabooBridgeHostCoordinator {
    static let shared = PeekabooBridgeHostCoordinator()

    private let logger = Logger(subsystem: "com.steipete.clawdis", category: "PeekabooBridge")
    private var host: PeekabooBridgeHost?
    private var services: ClawdisPeekabooBridgeServices?

    /// Most recently enqueued start/stop transition; new transitions wait for it first.
    private var pendingTransition: Task<Void, Never>?

    /// Starts (`true`) or stops (`false`) the bridge host and returns once that transition
    /// has completed. Requests are applied in call order.
    func setEnabled(_ enabled: Bool) async {
        let previous = self.pendingTransition
        let transition = Task { @MainActor in
            // Let any earlier transition finish before touching `host`.
            _ = await previous?.value
            if enabled {
                await self.startIfNeeded()
            } else {
                await self.stopIfRunning()
            }
        }
        self.pendingTransition = transition
        await transition.value
    }

    /// Stops the bridge host if it is running; goes through the same ordered queue as `setEnabled`.
    func stop() async {
        await self.setEnabled(false)
    }

    /// Tears down the current host, if any. Only called from the serialized transition chain.
    private func stopIfRunning() async {
        guard let host = self.host else { return }
        await host.stop()
        self.host = nil
        self.services = nil
        self.logger.info("PeekabooBridge host stopped")
    }

    /// Creates and starts a host when none exists. Only called from the serialized transition chain.
    private func startIfNeeded() async {
        guard self.host == nil else { return }

        let allowlistedTeamIDs: Set<String> = ["Y5PE65HELJ"]
        let allowlistedBundles: Set<String> = []

        let services = ClawdisPeekabooBridgeServices()
        let server = PeekabooBridgeServer(
            services: services,
            hostKind: .gui,
            allowlistedTeams: allowlistedTeamIDs,
            allowlistedBundles: allowlistedBundles)
        let host = PeekabooBridgeHost(
            socketPath: PeekabooBridgeConstants.clawdisSocketPath,
            server: server,
            allowedTeamIDs: allowlistedTeamIDs,
            requestTimeoutSec: 10)

        self.services = services
        self.host = host

        await host.start()
        self.logger.info("PeekabooBridge host started at \(PeekabooBridgeConstants.clawdisSocketPath, privacy: .public)")
    }
}
/// Service bundle handed to `PeekabooBridgeServer`: every service is built once in `init`
/// and shares the same logging service, feedback client, snapshot manager and application service.
@MainActor
private final class ClawdisPeekabooBridgeServices: PeekabooBridgeServiceProviding {
    let permissions: PermissionsService
    let screenCapture: any ScreenCaptureServiceProtocol
    let automation: any UIAutomationServiceProtocol
    let windows: any WindowManagementServiceProtocol
    let applications: any ApplicationServiceProtocol
    let menu: any MenuServiceProtocol
    let dock: any DockServiceProtocol
    let dialogs: any DialogServiceProtocol
    let snapshots: any SnapshotManagerProtocol

    init() {
        // Shared collaborators.
        let log = LoggingService(subsystem: "com.steipete.clawdis.peekaboo")
        let feedback = PeekabooVisualizerFeedbackClient(client: .shared)
        let snapshotStore = InMemorySnapshotManager(
            options: .init(
                snapshotValidityWindow: 600,
                maxSnapshots: 50,
                deleteArtifactsOnCleanup: false))
        let appService = ApplicationService(feedbackClient: feedback)

        // Screen capture is wrapped so each successful capture also triggers visual feedback.
        let rawCapture = ScreenCaptureService(loggingService: log)
        let capture = FeedbackScreenCaptureService(base: rawCapture, feedbackClient: feedback)

        self.permissions = PermissionsService()
        self.snapshots = snapshotStore
        self.applications = appService
        self.screenCapture = capture
        self.automation = UIAutomationService(
            snapshotManager: snapshotStore,
            loggingService: log,
            searchPolicy: .balanced,
            feedbackClient: feedback)
        self.windows = WindowManagementService(applicationService: appService, feedbackClient: feedback)
        self.menu = MenuService(applicationService: appService, feedbackClient: feedback)
        self.dock = DockService(feedbackClient: feedback)
        self.dialogs = DialogService(feedbackClient: feedback)
    }
}
/// Adapts a `VisualizationClient` to the `AutomationFeedbackClient` protocol.
/// Every method forwards to the wrapped client; only window operations and space-switch
/// directions need their enum values translated.
@MainActor
private final class PeekabooVisualizerFeedbackClient: AutomationFeedbackClient {
    private let client: VisualizationClient

    init(client: VisualizationClient) {
        self.client = client
    }

    func connect() {
        self.client.connect()
    }

    // MARK: Input feedback

    func showClickFeedback(at point: CGPoint, type: ClickType) async -> Bool {
        await self.client.showClickFeedback(at: point, type: type)
    }

    func showTypingFeedback(keys: [String], duration: TimeInterval, cadence: TypingCadence) async -> Bool {
        await self.client.showTypingFeedback(keys: keys, duration: duration, cadence: cadence)
    }

    func showScrollFeedback(at point: CGPoint, direction: ScrollDirection, amount: Int) async -> Bool {
        await self.client.showScrollFeedback(at: point, direction: direction, amount: amount)
    }

    func showHotkeyDisplay(keys: [String], duration: TimeInterval) async -> Bool {
        await self.client.showHotkeyDisplay(keys: keys, duration: duration)
    }

    func showSwipeGesture(from: CGPoint, to: CGPoint, duration: TimeInterval) async -> Bool {
        await self.client.showSwipeGesture(from: from, to: to, duration: duration)
    }

    func showMouseMovement(from: CGPoint, to: CGPoint, duration: TimeInterval) async -> Bool {
        await self.client.showMouseMovement(from: from, to: to, duration: duration)
    }

    // MARK: Window, dialog and menu feedback

    func showWindowOperation(_ kind: WindowOperationKind, windowRect: CGRect, duration: TimeInterval) async -> Bool {
        // Translate the automation-side kind into the visualizer's own operation enum.
        let operation: WindowOperation
        switch kind {
        case .close: operation = .close
        case .minimize: operation = .minimize
        case .maximize: operation = .maximize
        case .move: operation = .move
        case .resize: operation = .resize
        case .setBounds: operation = .setBounds
        case .focus: operation = .focus
        }
        return await self.client.showWindowOperation(operation, windowRect: windowRect, duration: duration)
    }

    func showDialogInteraction(
        element: DialogElementType,
        elementRect: CGRect,
        action: DialogActionType) async -> Bool
    {
        await self.client.showDialogInteraction(element: element, elementRect: elementRect, action: action)
    }

    func showMenuNavigation(menuPath: [String]) async -> Bool {
        await self.client.showMenuNavigation(menuPath: menuPath)
    }

    func showSpaceSwitch(from: Int, to: Int, direction: SpaceSwitchDirection) async -> Bool {
        // Anything that is not `.left` is shown as a switch to the right.
        let visualDirection: SpaceDirection = if direction == .left { .left } else { .right }
        return await self.client.showSpaceSwitch(from: from, to: to, direction: visualDirection)
    }

    // MARK: App lifecycle and capture feedback

    func showAppLaunch(appName: String, iconPath: String?) async -> Bool {
        await self.client.showAppLaunch(appName: appName, iconPath: iconPath)
    }

    func showAppQuit(appName: String, iconPath: String?) async -> Bool {
        await self.client.showAppQuit(appName: appName, iconPath: iconPath)
    }

    func showScreenshotFlash(in rect: CGRect) async -> Bool {
        await self.client.showScreenshotFlash(in: rect)
    }

    func showWatchCapture(in rect: CGRect) async -> Bool {
        await self.client.showWatchCapture(in: rect)
    }
}
/// Decorator around another `ScreenCaptureServiceProtocol`: each capture is delegated to `base`
/// and, once it has succeeded, visual feedback is requested for the captured rectangle.
/// A throwing capture propagates its error and shows no feedback.
@MainActor
private final class FeedbackScreenCaptureService: ScreenCaptureServiceProtocol {
    private let base: any ScreenCaptureServiceProtocol
    private let feedbackClient: any AutomationFeedbackClient

    init(base: any ScreenCaptureServiceProtocol, feedbackClient: any AutomationFeedbackClient) {
        self.base = base
        self.feedbackClient = feedbackClient
    }

    /// Captures a display; feedback uses the display bounds from the capture metadata.
    func captureScreen(
        displayIndex: Int?,
        visualizerMode: CaptureVisualizerMode,
        scale: CaptureScalePreference) async throws -> CaptureResult
    {
        let captured = try await self.base.captureScreen(
            displayIndex: displayIndex,
            visualizerMode: visualizerMode,
            scale: scale)
        let area = captured.metadata.displayInfo?.bounds
        await self.showCaptureFeedback(mode: visualizerMode, rect: area)
        return captured
    }

    /// Captures an application window; feedback uses the window bounds from the capture metadata.
    func captureWindow(
        appIdentifier: String,
        windowIndex: Int?,
        visualizerMode: CaptureVisualizerMode,
        scale: CaptureScalePreference) async throws -> CaptureResult
    {
        let captured = try await self.base.captureWindow(
            appIdentifier: appIdentifier,
            windowIndex: windowIndex,
            visualizerMode: visualizerMode,
            scale: scale)
        let area = captured.metadata.windowInfo?.bounds
        await self.showCaptureFeedback(mode: visualizerMode, rect: area)
        return captured
    }

    /// Captures the frontmost window; feedback uses the window bounds from the capture metadata.
    func captureFrontmost(
        visualizerMode: CaptureVisualizerMode,
        scale: CaptureScalePreference) async throws -> CaptureResult
    {
        let captured = try await self.base.captureFrontmost(visualizerMode: visualizerMode, scale: scale)
        let area = captured.metadata.windowInfo?.bounds
        await self.showCaptureFeedback(mode: visualizerMode, rect: area)
        return captured
    }

    /// Captures an explicit rectangle; feedback uses that same rectangle.
    func captureArea(
        _ rect: CGRect,
        visualizerMode: CaptureVisualizerMode,
        scale: CaptureScalePreference) async throws -> CaptureResult
    {
        let captured = try await self.base.captureArea(rect, visualizerMode: visualizerMode, scale: scale)
        await self.showCaptureFeedback(mode: visualizerMode, rect: rect)
        return captured
    }

    func hasScreenRecordingPermission() async -> Bool {
        await self.base.hasScreenRecordingPermission()
    }

    /// Requests the feedback matching `mode` for `rect`; does nothing when no rectangle is known.
    /// The feedback client's Bool result is intentionally ignored.
    private func showCaptureFeedback(mode: CaptureVisualizerMode, rect: CGRect?) async {
        if let rect {
            switch mode {
            case .screenshotFlash:
                _ = await self.feedbackClient.showScreenshotFlash(in: rect)
            case .watchCapture:
                _ = await self.feedbackClient.showWatchCapture(in: rect)
            }
        }
    }
}

View File

@@ -1,80 +0,0 @@
import AppKit
import CoreGraphics
import Foundation
@preconcurrency import ScreenCaptureKit
import VideoToolbox
/// One-shot screenshot helper built on ScreenCaptureKit.
enum Screenshotter {
    /// Captures a single frame and returns it as PNG data.
    ///
    /// - Parameters:
    ///   - displayID: Display to capture; when `nil` the first shareable display is used.
    ///   - windowID: When it matches a shareable window, that window is captured instead of the display.
    /// - Returns: PNG data, or `nil` when shareable content is unavailable, no target can be resolved,
    ///   the stream output cannot be registered, or the stream fails to start.
    @MainActor
    static func capture(displayID: UInt32?, windowID: UInt32?) async -> Data? {
        guard let content = try? await SCShareableContent.current else { return nil }

        let targetDisplay: SCDisplay? = if let displayID {
            content.displays.first(where: { $0.displayID == displayID })
        } else {
            content.displays.first
        }

        let filter: SCContentFilter
        if let windowID, let win = content.windows.first(where: { $0.windowID == windowID }) {
            filter = SCContentFilter(desktopIndependentWindow: win)
        } else if let display = targetDisplay {
            filter = SCContentFilter(display: display, excludingWindows: [])
        } else {
            return nil
        }

        let config = SCStreamConfiguration()
        if let display = targetDisplay {
            config.width = display.width
            config.height = display.height
        }
        config.scalesToFit = true
        config.colorSpaceName = CGColorSpace.displayP3

        let stream = SCStream(filter: filter, configuration: config, delegate: nil)
        let grabber = FrameGrabber()
        do {
            // Registering the output must succeed: without it no frame is ever delivered and
            // waiting on the grabber would suspend forever, so a failure here aborts the capture.
            try stream.addStreamOutput(
                grabber,
                type: .screen,
                sampleHandlerQueue: DispatchQueue(label: "com.steipete.clawdis.sshot"))
            try await stream.startCapture()
        } catch {
            return nil
        }

        let data = await grabber.awaitPNG()
        // Best effort: the frame is already in hand, so a failure to stop is not reported.
        try? await stream.stopCapture()
        return data
    }
}
/// Stream output that converts the first usable screen frame to PNG and hands it to `awaitPNG()`.
///
/// Frames arrive on the stream's sample-handler queue while `awaitPNG()` is called from the
/// capturing task, so all mutable state is guarded by `lock`. A frame that arrives before anyone
/// is waiting is buffered instead of being dropped; otherwise the waiter would never be resumed.
final class FrameGrabber: NSObject, SCStreamOutput {
    private let lock = NSLock()
    private var continuation: CheckedContinuation<Data?, Never>?
    /// PNG produced before `awaitPNG()` installed its continuation.
    private var pendingPNG: Data?
    private var delivered = false

    /// Suspends until the first frame has been encoded and returns its PNG data.
    /// Returns immediately when a frame was already encoded.
    func awaitPNG() async -> Data? {
        await withCheckedContinuation { cont in
            if let ready = self.register(cont) {
                cont.resume(returning: ready)
            }
        }
    }

    nonisolated func stream(
        _ stream: SCStream,
        didOutputSampleBuffer sampleBuffer: CMSampleBuffer,
        of outputType: SCStreamOutputType)
    {
        guard outputType == .screen else { return }
        // Skip the encoding work once a frame has been handed out.
        guard !self.hasDelivered() else { return }
        guard let imageBuffer = sampleBuffer.imageBuffer else { return }
        var cgImage: CGImage?
        let result = VTCreateCGImageFromCVPixelBuffer(imageBuffer, options: nil, imageOut: &cgImage)
        guard result == noErr, let cgImage else { return }
        let rep = NSBitmapImageRep(cgImage: cgImage)
        guard let data = rep.representation(using: .png, properties: [:]) else { return }
        // Resume outside the lock; `claimDelivery` returns the waiter at most once.
        self.claimDelivery(of: data)?.resume(returning: data)
    }

    /// Installs `cont` as the waiter, or returns a buffered PNG that should be delivered right away.
    private func register(_ cont: CheckedContinuation<Data?, Never>) -> Data? {
        self.lock.lock()
        defer { self.lock.unlock() }
        if let png = self.pendingPNG {
            self.pendingPNG = nil
            return png
        }
        self.continuation = cont
        return nil
    }

    private func hasDelivered() -> Bool {
        self.lock.lock()
        defer { self.lock.unlock() }
        return self.delivered
    }

    /// Marks the frame as delivered. Returns the waiting continuation when there is one;
    /// otherwise buffers `data` for a later `awaitPNG()` call. Returns `nil` for every frame after the first.
    private func claimDelivery(of data: Data) -> CheckedContinuation<Data?, Never>? {
        self.lock.lock()
        defer { self.lock.unlock() }
        guard !self.delivered else { return nil }
        self.delivered = true
        guard let waiter = self.continuation else {
            self.pendingPNG = data
            return nil
        }
        self.continuation = nil
        return waiter
    }
}

View File

@@ -1,44 +0,0 @@
import AppKit
import ClawdisIPC
import CoreGraphics
/// Enumerates the attached screens for the UI IPC surface.
enum UIScreenService {
    /// Returns one `UIScreenInfo` per entry of `NSScreen.screens`, indexed in that order.
    ///
    /// `isPrimary` is true for the screen equal to `NSScreen.main`.
    /// NOTE(review): AppKit documents `NSScreen.main` as the screen with the key window, not
    /// necessarily the menu-bar screen — confirm this is the intended meaning of "primary".
    static func listScreens() -> [UIScreenInfo] {
        let attached = NSScreen.screens
        let focused = NSScreen.main

        var infos: [UIScreenInfo] = []
        infos.reserveCapacity(attached.count)
        for (position, display) in attached.enumerated() {
            let info = UIScreenInfo(
                index: position,
                name: display.peekabooName,
                frame: display.frame,
                visibleFrame: display.visibleFrame,
                isPrimary: display == focused,
                scaleFactor: display.backingScaleFactor,
                displayID: display.displayID)
            infos.append(info)
        }
        return infos
    }
}
private extension NSScreen {
    /// The value stored under `NSScreenNumber` in the device description, or 0 when it is absent.
    var displayID: UInt32 {
        let key = NSDeviceDescriptionKey("NSScreenNumber")
        guard let number = self.deviceDescription[key] as? NSNumber else { return 0 }
        return number.uint32Value
    }

    /// Match Peekaboo's `ScreenService` naming (built-in vs. resolution fallback).
    var peekabooName: String {
        let id = self.displayID
        if id == 0 {
            return "Display"
        }
        if CGDisplayIsBuiltin(id) != 0 {
            return "Built-in Display"
        }
        // No display mode available: fall back to a generic external name.
        guard let mode = CGDisplayCopyDisplayMode(id) else {
            return "External Display"
        }
        return "\(mode.pixelWidth)×\(mode.pixelHeight) Display"
    }
}

View File

@@ -15,6 +15,11 @@ struct ClawdisCLI {
exit(code)
}
if args.first == "ui" {
let code = try await UICLI.run(args: Array(args.dropFirst()), jsonOutput: jsonOutput)
exit(code)
}
let parsed = try parseCommandLine(args: args)
let response = try await send(request: parsed.request)
@@ -42,8 +47,6 @@ struct ClawdisCLI {
var kind: Kind
enum Kind {
case uiScreens
case uiScreenshot
case generic
}
}
@@ -100,29 +103,6 @@ struct ClawdisCLI {
if caps.isEmpty { caps = Capability.allCases }
return ParsedCLIRequest(request: .ensurePermissions(caps, interactive: interactive), kind: .generic)
case "ui":
guard let sub = args.first else { throw CLIError.help }
args = Array(args.dropFirst())
switch sub {
case "screens":
return ParsedCLIRequest(request: .uiListScreens, kind: .uiScreens)
case "screenshot":
var screenIndex: Int?
var windowID: UInt32?
while !args.isEmpty {
let arg = args.removeFirst()
switch arg {
case "--screen-index": screenIndex = args.popFirst().flatMap(Int.init)
case "--window-id": windowID = args.popFirst().flatMap(UInt32.init)
default: break
}
}
return ParsedCLIRequest(request: .uiScreenshot(screenIndex: screenIndex, windowID: windowID), kind: .uiScreenshot)
default:
throw CLIError.help
}
case "run":
var cwd: String?
var env: [String: String] = [:]
@@ -333,24 +313,6 @@ struct ClawdisCLI {
}
switch parsed.kind {
case .uiScreens:
let screens = try self.decodePayload([UIScreenInfo].self, payload: response.payload)
if screens.isEmpty {
FileHandle.standardOutput.write(Data("No screens\n".utf8))
return
}
for s in screens {
let primary = s.isPrimary ? " (primary)" : ""
let size = "\(Int(s.frame.width))×\(Int(s.frame.height))"
let scale = String(format: "%.1f", Double(s.scaleFactor))
let line = "Display \(s.index + 1)\(primary): \(s.name) \(size) @\(scale)x (id \(s.displayID))\n"
FileHandle.standardOutput.write(Data(line.utf8))
}
case .uiScreenshot:
let result = try self.decodePayload(UIScreenshotResult.self, payload: response.payload)
FileHandle.standardOutput.write(Data((result.path + "\n").utf8))
case .generic:
if let payload = response.payload, let text = String(data: payload, encoding: .utf8), !text.isEmpty {
FileHandle.standardOutput.write(payload)
@@ -370,22 +332,6 @@ struct ClawdisCLI {
]
switch parsed.kind {
case .uiScreens:
if let payload = response.payload,
let obj = try? JSONSerialization.jsonObject(with: payload) {
output["result"] = obj
} else {
output["result"] = []
}
case .uiScreenshot:
if let payload = response.payload,
let obj = try? JSONSerialization.jsonObject(with: payload) {
output["result"] = obj
} else {
output["result"] = NSNull()
}
case .generic:
if let payload = response.payload, !payload.isEmpty {
if let obj = try? JSONSerialization.jsonObject(with: payload) {
@@ -424,8 +370,12 @@ struct ClawdisCLI {
[--interactive]
UI:
clawdis-mac ui screens
clawdis-mac ui screenshot [--screen-index <n>] [--window-id <u32>]
clawdis-mac ui screenshot [...]
clawdis-mac ui see [...]
clawdis-mac ui click ...
clawdis-mac ui type ...
clawdis-mac ui wait ...
clawdis-mac ui --help
Shell:
clawdis-mac run [--cwd <path>] [--env KEY=VAL] [--timeout <sec>]

View File

@@ -0,0 +1,589 @@
import Foundation
import Darwin
import PeekabooAutomationKit
import PeekabooBridge
import PeekabooFoundation
enum UICLI {
/// Entry point for `clawdis-mac ui …`.
///
/// - Parameters:
///   - args: Arguments following `ui`; the first one selects the subcommand.
///   - jsonOutput: Passed through to the subcommand to select JSON or plain-text output.
/// - Returns: Process exit code. 0 for help and successful commands, 1 for an unknown subcommand;
///   individual subcommands may return their own codes.
/// - Throws: Errors from connecting to a bridge host or from the subcommand itself.
static func run(args: [String], jsonOutput: Bool) async throws -> Int32 {
    var args = args
    guard let sub = args.first else {
        self.printHelp()
        return 0
    }
    args.removeFirst()

    if sub == "--help" || sub == "-h" || sub == "help" {
        self.printHelp()
        return 0
    }

    // Reject unknown subcommands before connecting, so a typo prints usage even when no
    // bridge host is running instead of surfacing a connection error.
    let knownSubcommands: Set<String> = [
        "permissions", "frontmost", "apps", "windows", "screenshot", "see", "click", "type", "wait",
    ]
    guard knownSubcommands.contains(sub) else {
        self.printHelp()
        return 1
    }

    let context = try await self.resolveContext()

    switch sub {
    case "permissions":
        return try await self.runPermissions(args: args, jsonOutput: jsonOutput, context: context)
    case "frontmost":
        return try await self.runFrontmost(args: args, jsonOutput: jsonOutput, context: context)
    case "apps":
        return try await self.runApps(args: args, jsonOutput: jsonOutput, context: context)
    case "windows":
        return try await self.runWindows(args: args, jsonOutput: jsonOutput, context: context)
    case "screenshot":
        return try await self.runScreenshot(args: args, jsonOutput: jsonOutput, context: context)
    case "see":
        return try await self.runSee(args: args, jsonOutput: jsonOutput, context: context)
    case "click":
        return try await self.runClick(args: args, jsonOutput: jsonOutput, context: context)
    case "type":
        return try await self.runType(args: args, jsonOutput: jsonOutput, context: context)
    case "wait":
        return try await self.runWait(args: args, jsonOutput: jsonOutput, context: context)
    default:
        // Unreachable while `knownSubcommands` matches the cases above; kept for exhaustiveness.
        self.printHelp()
        return 1
    }
}
// MARK: - Context
/// Connection state shared by all `ui` subcommands, produced by `resolveContext()`.
private struct Context {
/// Bridge client whose handshake succeeded.
let client: PeekabooBridgeClient
/// Human-readable "<host kind> via <socket path>" string, echoed as `host` in JSON output.
let hostDescription: String
}
/// Connects to the first PeekabooBridge host that completes a handshake.
///
/// When `PEEKABOO_BRIDGE_SOCKET` is set and non-empty only that socket is tried; otherwise the
/// Peekaboo socket is tried first and the Clawdis socket second.
///
/// - Throws: The host's error envelope when a host rejects the client as unauthorized (no further
///   sockets are tried), or an `NSError` in domain `clawdis.ui` (code 1) when no host is reachable.
private static func resolveContext() async throws -> Context {
    let socketPaths: [String]
    if let override = ProcessInfo.processInfo.environment["PEEKABOO_BRIDGE_SOCKET"], !override.isEmpty {
        socketPaths = [override]
    } else {
        socketPaths = [
            PeekabooBridgeConstants.peekabooSocketPath,
            PeekabooBridgeConstants.clawdisSocketPath,
        ]
    }

    let identity = PeekabooBridgeClientIdentity(
        bundleIdentifier: Bundle.main.bundleIdentifier,
        teamIdentifier: nil,
        processIdentifier: getpid(),
        hostname: Host.current().name)

    for socketPath in socketPaths {
        let client = PeekabooBridgeClient(socketPath: socketPath, requestTimeoutSec: 10)
        do {
            let handshake = try await client.handshake(client: identity, requestedHost: nil)
            let description = "\(handshake.hostKind.rawValue) via \(socketPath)"
            return Context(client: client, hostDescription: description)
        } catch let envelope as PeekabooBridgeErrorEnvelope where envelope.code == .unauthorizedClient {
            // An explicit rejection is reported as-is rather than masked by a fallback.
            throw envelope
        } catch {
            // Any other failure (including other envelope codes): try the next socket.
            continue
        }
    }

    throw NSError(domain: "clawdis.ui", code: 1, userInfo: [
        NSLocalizedDescriptionKey: "No PeekabooBridge host reachable (run Peekaboo.app or Clawdis.app).",
    ])
}
// MARK: - Commands
/// `ui permissions [status]`: prints the host's permission status.
///
/// - Returns: 0 on success or when help was requested, 1 for an unrecognized sub-argument.
private static func runPermissions(args: [String], jsonOutput: Bool, context: Context) async throws -> Int32 {
    switch args.first ?? "status" {
    case "status":
        break
    case "--help", "-h", "help":
        // Help requests print usage and succeed, like every other `ui` subcommand,
        // instead of falling through to a status query.
        self.printHelp()
        return 0
    default:
        self.printHelp()
        return 1
    }

    let status = try await context.client.permissionsStatus()
    if jsonOutput {
        try self.writeJSON([
            "ok": true,
            "host": context.hostDescription,
            "result": try self.toJSONObject(status),
        ])
    } else {
        FileHandle.standardOutput.write(Data((self.formatPermissions(status) + "\n").utf8))
    }
    return 0
}
/// `ui frontmost`: reports the frontmost application and, when there is one, its focused window.
/// Arguments are ignored. Always returns 0 unless a client call throws.
private static func runFrontmost(args _: [String], jsonOutput: Bool, context: Context) async throws -> Int32 {
    let frontApp = try await context.client.getFrontmostApplication()
    let focused = try await context.client.getFocusedWindow()

    guard jsonOutput else {
        // Plain text: "<bundle> (pid N)" followed by an optional "window <id>: <title>" line.
        let out = FileHandle.standardOutput
        let bundle = frontApp.bundleIdentifier ?? "<unknown>"
        out.write(Data(("\(bundle) (pid \(frontApp.processIdentifier))" + "\n").utf8))
        if let focused {
            out.write(Data(("window \(focused.windowID): \(focused.title)\n").utf8))
        }
        return 0
    }

    // JSON: a missing window is encoded as null.
    let windowObject: Any
    if let focused {
        windowObject = try self.toJSONObject(focused)
    } else {
        windowObject = NSNull()
    }
    try self.writeJSON([
        "ok": true,
        "host": context.hostDescription,
        "app": try self.toJSONObject(frontApp),
        "window": windowObject,
    ])
    return 0
}
/// `ui apps`: lists applications known to the host, as JSON or as "<bundle>\t<name>" lines.
/// Arguments are ignored. Always returns 0 unless a client call throws.
private static func runApps(args _: [String], jsonOutput: Bool, context: Context) async throws -> Int32 {
    let running = try await context.client.listApplications()

    guard jsonOutput else {
        let out = FileHandle.standardOutput
        for entry in running {
            let bundle = entry.bundleIdentifier ?? "<unknown>"
            out.write(Data(("\(bundle)\t\(entry.name)\n").utf8))
        }
        return 0
    }

    try self.writeJSON([
        "ok": true,
        "host": context.hostDescription,
        "result": try self.toJSONObject(running),
    ])
    return 0
}
/// `ui windows [--bundle-id <id>]`: lists windows of the given application, or of the frontmost
/// application when no non-empty bundle id is supplied. Unknown arguments are ignored.
private static func runWindows(args: [String], jsonOutput: Bool, context: Context) async throws -> Int32 {
    var remaining = args
    var bundleId: String?

    while !remaining.isEmpty {
        let flag = remaining.removeFirst()
        if flag == "--bundle-id" {
            bundleId = remaining.popFirst()
        } else if flag == "--help" || flag == "-h" || flag == "help" {
            self.printHelp()
            return 0
        }
    }

    let target: WindowTarget
    if let bundleId, !bundleId.isEmpty {
        target = .application(bundleId)
    } else {
        target = .frontmost
    }

    let listed = try await context.client.listWindows(target: target)

    guard jsonOutput else {
        // Plain text: one "<windowID>\t<title>" line per window.
        let out = FileHandle.standardOutput
        for entry in listed {
            out.write(Data(("\(entry.windowID)\t\(entry.title)\n").utf8))
        }
        return 0
    }

    try self.writeJSON([
        "ok": true,
        "host": context.hostDescription,
        "result": try self.toJSONObject(listed),
    ])
    return 0
}
/// `ui screenshot`: captures an image through the host, writes it to a temp PNG and reports the path.
///
/// Target selection: a non-empty `--bundle-id` captures that app's window (optionally
/// `--window-index`), otherwise a parsed `--screen-index` captures that display, otherwise the
/// frontmost window is captured. `--watch` switches the visualizer mode; `--scale` accepts
/// `native`, `1x`, `logical` or `logical1x` (case-insensitive). Unknown arguments are ignored.
private static func runScreenshot(args: [String], jsonOutput: Bool, context: Context) async throws -> Int32 {
    var remaining = args
    var displayIndex: Int?
    var bundleId: String?
    var windowIndex: Int?
    var mode: CaptureVisualizerMode = .screenshotFlash
    var scale: CaptureScalePreference = .logical1x

    while !remaining.isEmpty {
        switch remaining.removeFirst() {
        case "--screen-index":
            displayIndex = remaining.popFirst().flatMap(Int.init)
        case "--bundle-id":
            bundleId = remaining.popFirst()
        case "--window-index":
            windowIndex = remaining.popFirst().flatMap(Int.init)
        case "--watch":
            mode = .watchCapture
        case "--scale":
            // Unrecognized or missing values leave the current scale untouched.
            if let requested = remaining.popFirst()?.lowercased() {
                switch requested {
                case "native":
                    scale = .native
                case "1x", "logical", "logical1x":
                    scale = .logical1x
                default:
                    break
                }
            }
        case "--help", "-h", "help":
            self.printHelp()
            return 0
        default:
            break
        }
    }

    let shot: CaptureResult
    if let bundleId, !bundleId.isEmpty {
        shot = try await context.client.captureWindow(
            appIdentifier: bundleId,
            windowIndex: windowIndex,
            visualizerMode: mode,
            scale: scale)
    } else if displayIndex != nil {
        shot = try await context.client.captureScreen(
            displayIndex: displayIndex,
            visualizerMode: mode,
            scale: scale)
    } else {
        shot = try await context.client.captureFrontmost(visualizerMode: mode, scale: scale)
    }

    let path = try self.writeTempPNG(shot.imageData)

    guard jsonOutput else {
        FileHandle.standardOutput.write(Data((path + "\n").utf8))
        return 0
    }

    try self.writeJSON([
        "ok": true,
        "host": context.hostDescription,
        "path": path,
        "metadata": try self.toJSONObject(shot.metadata),
        "warning": shot.warning ?? "",
    ])
    return 0
}
/// `ui see`: captures a window, stores the screenshot in a snapshot, runs element detection on it
/// and stores the detection result in the same snapshot.
///
/// The snapshot is the explicit `--snapshot-id` when given; otherwise the most recent snapshot for
/// the resolved bundle id when one can be fetched; otherwise a newly created one. Without
/// `--bundle-id` the frontmost window is captured and the bundle id is taken from the capture metadata.
/// Unknown arguments are ignored.
private static func runSee(args: [String], jsonOutput: Bool, context: Context) async throws -> Int32 {
    var remaining = args
    var bundleId: String?
    var windowIndex: Int?
    var snapshotId: String?

    while !remaining.isEmpty {
        switch remaining.removeFirst() {
        case "--bundle-id":
            bundleId = remaining.popFirst()
        case "--window-index":
            windowIndex = remaining.popFirst().flatMap(Int.init)
        case "--snapshot-id":
            snapshotId = remaining.popFirst()
        case "--help", "-h", "help":
            self.printHelp()
            return 0
        default:
            break
        }
    }

    let client = context.client

    // 1. Capture.
    let shot: CaptureResult
    if let bundleId, !bundleId.isEmpty {
        shot = try await client.captureWindow(
            appIdentifier: bundleId,
            windowIndex: windowIndex,
            visualizerMode: .screenshotFlash,
            scale: .logical1x)
    } else {
        shot = try await client.captureFrontmost(visualizerMode: .screenshotFlash, scale: .logical1x)
        bundleId = shot.metadata.applicationInfo?.bundleIdentifier
    }

    // 2. Pick the snapshot to attach the capture to.
    let resolvedSnapshotId: String
    if let snapshotId, !snapshotId.isEmpty {
        resolvedSnapshotId = snapshotId
    } else if let bundleId, !bundleId.isEmpty,
              let existing = try? await client.getMostRecentSnapshot(applicationBundleId: bundleId)
    {
        resolvedSnapshotId = existing
    } else {
        resolvedSnapshotId = try await client.createSnapshot()
    }

    // 3. Persist the screenshot, then detect elements and persist the detection.
    let appInfo = shot.metadata.applicationInfo
    let windowInfo = shot.metadata.windowInfo
    let screenshotPath = try self.writeTempPNG(shot.imageData)

    try await client.storeScreenshot(
        snapshotId: resolvedSnapshotId,
        screenshotPath: screenshotPath,
        applicationBundleId: bundleId,
        applicationProcessId: appInfo?.processIdentifier,
        applicationName: appInfo?.name,
        windowTitle: windowInfo?.title,
        windowBounds: windowInfo?.bounds)

    let windowContext = WindowContext(
        applicationName: appInfo?.name,
        windowTitle: windowInfo?.title,
        windowBounds: windowInfo?.bounds)

    let detection = try await client.detectElements(
        in: shot.imageData,
        snapshotId: resolvedSnapshotId,
        windowContext: windowContext)
    try await client.storeDetectionResult(snapshotId: resolvedSnapshotId, result: detection)

    // 4. Report.
    guard jsonOutput else {
        let out = FileHandle.standardOutput
        out.write(Data((screenshotPath + "\n").utf8))
        for element in detection.elements.all {
            let frame = element.bounds
            let origin = "\(Int(frame.origin.x)),\(Int(frame.origin.y))"
            let size = "\(Int(frame.size.width))x\(Int(frame.size.height))"
            // Labels are flattened to a single line so each element stays on one row.
            let label = (element.label ?? element.value ?? "").replacingOccurrences(of: "\n", with: " ")
            let line = "\(element.id)\t\(element.type)\t\(origin) \(size)\t\(label)\n"
            out.write(Data(line.utf8))
        }
        return 0
    }

    try self.writeJSON([
        "ok": true,
        "host": context.hostDescription,
        "snapshotId": resolvedSnapshotId,
        "screenshotPath": screenshotPath,
        "result": try self.toJSONObject(detection),
    ])
    return 0
}
/// `ui click --on <elementId>`: clicks an element from a snapshot.
///
/// `--double` / `--right` select the click type (the last one given wins). The snapshot comes from
/// `--snapshot-id`, or is resolved implicitly from `--bundle-id` / the frontmost application.
/// Unknown arguments are ignored. Plain-text mode prints nothing on success.
///
/// - Throws: `NSError` in domain `clawdis.ui` (code 2) when `--on` is missing or empty.
private static func runClick(args: [String], jsonOutput: Bool, context: Context) async throws -> Int32 {
    var remaining = args
    var bundleId: String?
    var snapshotId: String?
    var elementId: String?
    var clickType: ClickType = .single

    while !remaining.isEmpty {
        switch remaining.removeFirst() {
        case "--bundle-id":
            bundleId = remaining.popFirst()
        case "--snapshot-id":
            snapshotId = remaining.popFirst()
        case "--on":
            elementId = remaining.popFirst()
        case "--double":
            clickType = .double
        case "--right":
            clickType = .right
        case "--help", "-h", "help":
            self.printHelp()
            return 0
        default:
            break
        }
    }

    guard let elementId, !elementId.isEmpty else {
        throw NSError(domain: "clawdis.ui", code: 2, userInfo: [
            NSLocalizedDescriptionKey: "Missing --on <elementId> (run `clawdis-mac ui see` first).",
        ])
    }

    let effectiveSnapshotId = try await self.resolveImplicitSnapshotId(
        snapshotId: snapshotId,
        bundleId: bundleId,
        client: context.client)

    try await context.client.click(
        target: .elementId(elementId),
        clickType: clickType,
        snapshotId: effectiveSnapshotId)

    if jsonOutput {
        try self.writeJSON([
            "ok": true,
            "host": context.hostDescription,
        ])
    }
    return 0
}
/// `ui type`: types text, optionally into a specific element.
///
/// Text is collected from every `--text <value>` and from any bare argument, joined with single
/// spaces and trimmed of surrounding whitespace. `--into` names the target element, `--clear`
/// clears existing content first, and `--delay-ms` sets the typing delay (default 20; an
/// unparsable value keeps the current delay). Plain-text mode prints nothing on success.
///
/// - Throws: `NSError` in domain `clawdis.ui` (code 3) when the resulting text is empty.
private static func runType(args: [String], jsonOutput: Bool, context: Context) async throws -> Int32 {
    var remaining = args
    var bundleId: String?
    var snapshotId: String?
    var into: String?
    var clearExisting = false
    var delayMs = 20
    var pieces: [String] = []

    while !remaining.isEmpty {
        let token = remaining.removeFirst()
        switch token {
        case "--bundle-id":
            bundleId = remaining.popFirst()
        case "--snapshot-id":
            snapshotId = remaining.popFirst()
        case "--into":
            into = remaining.popFirst()
        case "--clear":
            clearExisting = true
        case "--delay-ms":
            if let parsed = remaining.popFirst().flatMap(Int.init) {
                delayMs = parsed
            }
        case "--text":
            if let value = remaining.popFirst() {
                pieces.append(value)
            }
        case "--help", "-h", "help":
            self.printHelp()
            return 0
        default:
            // Bare words are treated as part of the text to type.
            pieces.append(token)
        }
    }

    let text = pieces
        .joined(separator: " ")
        .trimmingCharacters(in: .whitespacesAndNewlines)
    if text.isEmpty {
        throw NSError(domain: "clawdis.ui", code: 3, userInfo: [
            NSLocalizedDescriptionKey: "Missing text (use --text <value>).",
        ])
    }

    let effectiveSnapshotId = try await self.resolveImplicitSnapshotId(
        snapshotId: snapshotId,
        bundleId: bundleId,
        client: context.client)

    try await context.client.type(
        text: text,
        target: into,
        clearExisting: clearExisting,
        typingDelay: delayMs,
        snapshotId: effectiveSnapshotId)

    if jsonOutput {
        try self.writeJSON([
            "ok": true,
            "host": context.hostDescription,
        ])
    }
    return 0
}
/// `ui wait --on <elementId> [--timeout <sec>]`: waits for an element to appear.
///
/// The timeout defaults to 10 seconds; an unparsable value keeps the current timeout.
/// Unknown arguments are ignored.
///
/// - Returns: 0 when the element was found, 1 otherwise.
/// - Throws: `NSError` in domain `clawdis.ui` (code 4) when `--on` is missing or empty.
private static func runWait(args: [String], jsonOutput: Bool, context: Context) async throws -> Int32 {
    var remaining = args
    var bundleId: String?
    var snapshotId: String?
    var elementId: String?
    var timeoutSec: Double = 10

    while !remaining.isEmpty {
        switch remaining.removeFirst() {
        case "--bundle-id":
            bundleId = remaining.popFirst()
        case "--snapshot-id":
            snapshotId = remaining.popFirst()
        case "--on":
            elementId = remaining.popFirst()
        case "--timeout":
            if let parsed = remaining.popFirst().flatMap(Double.init) {
                timeoutSec = parsed
            }
        case "--help", "-h", "help":
            self.printHelp()
            return 0
        default:
            break
        }
    }

    guard let elementId, !elementId.isEmpty else {
        throw NSError(domain: "clawdis.ui", code: 4, userInfo: [
            NSLocalizedDescriptionKey: "Missing --on <elementId>.",
        ])
    }

    let effectiveSnapshotId = try await self.resolveImplicitSnapshotId(
        snapshotId: snapshotId,
        bundleId: bundleId,
        client: context.client)

    let outcome = try await context.client.waitForElement(
        target: .elementId(elementId),
        timeout: timeoutSec,
        snapshotId: effectiveSnapshotId)

    if jsonOutput {
        try self.writeJSON([
            "ok": true,
            "host": context.hostDescription,
            "result": try self.toJSONObject(outcome),
        ])
    } else {
        let verdict = outcome.found ? "found\n" : "not found\n"
        FileHandle.standardOutput.write(Data(verdict.utf8))
    }

    // The exit code mirrors the result so shell scripts can branch on it.
    if outcome.found {
        return 0
    }
    return 1
}
/// Picks the snapshot id a UI action should operate on.
///
/// Resolution order:
/// 1. A non-empty explicit `snapshotId` is returned unchanged.
/// 2. Otherwise the most recent snapshot for `bundleId` is requested from the client.
/// 3. When `bundleId` is nil or empty, the frontmost application's bundle identifier is used.
///
/// - Parameters:
///   - snapshotId: Snapshot id supplied by the caller, if any.
///   - bundleId: Bundle identifier supplied by the caller, if any.
///   - client: Bridge client used to query the frontmost application and recent snapshots.
/// - Returns: The snapshot id to use.
/// - Throws: An `NSError` in domain `clawdis.ui` with code 5 when no bundle id can be
///   determined, or code 6 when the snapshot lookup fails. The code 6 error carries the
///   original failure under `NSUnderlyingErrorKey`. Errors from the frontmost-application
///   query are propagated unchanged.
private static func resolveImplicitSnapshotId(
    snapshotId: String?,
    bundleId: String?,
    client: PeekabooBridgeClient) async throws -> String
{
    if let snapshotId, !snapshotId.isEmpty { return snapshotId }

    let resolvedBundle: String? = if let bundleId, !bundleId.isEmpty {
        bundleId
    } else {
        try await client.getFrontmostApplication().bundleIdentifier
    }
    guard let resolvedBundle, !resolvedBundle.isEmpty else {
        throw NSError(domain: "clawdis.ui", code: 5, userInfo: [
            NSLocalizedDescriptionKey: "Could not determine bundle id for implicit snapshot.",
        ])
    }

    do {
        return try await client.getMostRecentSnapshot(applicationBundleId: resolvedBundle)
    } catch {
        // Keep the user-facing hint, but attach the real cause so that transport or
        // bridge failures are not indistinguishable from a genuinely missing snapshot.
        throw NSError(domain: "clawdis.ui", code: 6, userInfo: [
            NSLocalizedDescriptionKey: "No recent snapshot for \(resolvedBundle). Run `clawdis-mac ui see --bundle-id \(resolvedBundle)` first.",
            NSUnderlyingErrorKey: error as NSError,
        ])
    }
}
// MARK: - IO helpers
/// Writes `data` to a timestamped `clawdis-ui-<stamp>.png` file in the temporary directory.
///
/// The stamp is the current date in ISO 8601 form with fractional seconds, with every `:`
/// replaced by `-`.
///
/// - Parameter data: Bytes to write; they are stored as given.
/// - Returns: The filesystem path of the written file.
/// - Throws: Any error raised by the atomic write.
private static func writeTempPNG(_ data: Data) throws -> String {
    let stampFormatter = ISO8601DateFormatter()
    stampFormatter.formatOptions = [.withInternetDateTime, .withFractionalSeconds]

    let rawStamp = stampFormatter.string(from: Date())
    let fileStamp = rawStamp.replacingOccurrences(of: ":", with: "-")

    let destination = FileManager.default.temporaryDirectory
        .appendingPathComponent("clawdis-ui-\(fileStamp).png")
    try data.write(to: destination, options: [.atomic])
    return destination.path
}
/// Renders a permissions status as one space-separated line of `name=ok` / `name=missing`
/// entries, in the order screen recording, accessibility, AppleScript.
///
/// - Parameter status: The permission flags to describe.
/// - Returns: A single-line summary such as `screen-recording=ok accessibility=missing applescript=ok`.
private static func formatPermissions(_ status: PermissionsStatus) -> String {
    let entries = [
        status.screenRecording ? "screen-recording=ok" : "screen-recording=missing",
        status.accessibility ? "accessibility=ok" : "accessibility=missing",
        status.appleScript ? "applescript=ok" : "applescript=missing",
    ]
    return entries.joined(separator: " ")
}
/// Converts an `Encodable` value into a Foundation JSON object by encoding it to JSON data
/// and parsing that data back with `JSONSerialization`.
///
/// Dates are encoded using the ISO 8601 strategy.
///
/// - Parameter value: The value to convert.
/// - Returns: The parsed JSON object.
/// - Throws: Encoding errors from `JSONEncoder` or parsing errors from `JSONSerialization`.
private static func toJSONObject<T: Encodable>(_ value: T) throws -> Any {
    let jsonEncoder = JSONEncoder()
    jsonEncoder.dateEncodingStrategy = .iso8601
    let encoded = try jsonEncoder.encode(value)
    let parsed = try JSONSerialization.jsonObject(with: encoded)
    return parsed
}
/// Writes `obj` to standard output as pretty-printed JSON followed by a single newline.
///
/// - Parameter obj: The JSON-compatible dictionary to emit.
/// - Throws: Any error raised by `JSONSerialization` while serializing `obj`.
private static func writeJSON(_ obj: [String: Any]) throws {
    let serialized = try JSONSerialization.data(withJSONObject: obj, options: .prettyPrinted)
    let output = FileHandle.standardOutput
    output.write(serialized)
    // Terminate the document with a line feed so shell prompts and pipes stay tidy.
    output.write(Data("\n".utf8))
}
/// Writes the usage text for the `ui` subcommand to standard error, followed by a newline.
private static func printHelp() {
    // The Notes section uses the possessive "app's"; the previous text read "apps bridge".
    let usage = """
    clawdis-mac ui — UI automation via PeekabooBridge
    Usage:
    clawdis-mac [--json] ui <command> ...
    Commands:
    permissions status
    frontmost
    apps
    windows [--bundle-id <id>]
    screenshot [--screen-index <n>] [--bundle-id <id>] [--window-index <n>] [--watch] [--scale native|1x]
    see [--bundle-id <id>] [--window-index <n>] [--snapshot-id <id>]
    click --on <elementId> [--bundle-id <id>] [--snapshot-id <id>] [--double|--right]
    type --text <value> [--into <elementId>] [--bundle-id <id>] [--snapshot-id <id>] [--clear] [--delay-ms <n>]
    wait --on <elementId> [--bundle-id <id>] [--snapshot-id <id>] [--timeout <sec>]
    Notes:
    - Prefers Peekaboo.app's bridge, then Clawdis.app's bridge.
    - Default timeout is 10 seconds per action.
    """
    FileHandle.standardError.write(Data((usage + "\n").utf8))
}
}

View File

@@ -50,64 +50,6 @@ public struct CanvasPlacement: Codable, Sendable {
}
}
// MARK: - UI (Peekaboo-aligned types)
/// Display info aligned with Peekaboo's `ScreenService.ScreenInfo`:
/// - `index` is the 0-based position in `NSScreen.screens` at runtime.
/// - `frame`/`visibleFrame` are AppKit screen rectangles (bottom-left origin).
///
/// All properties are immutable and are stored exactly as passed to the initializer.
public struct UIScreenInfo: Codable, Sendable {
/// 0-based position of the screen in `NSScreen.screens`.
public let index: Int
/// Name of the screen. NOTE(review): presumably a human-readable display name — confirm at the producer.
public let name: String
/// Full screen rectangle in AppKit coordinates (bottom-left origin).
public let frame: CGRect
/// Visible screen rectangle in AppKit coordinates (bottom-left origin).
public let visibleFrame: CGRect
/// Whether this is the primary screen. NOTE(review): how "primary" is determined is not shown here.
public let isPrimary: Bool
/// Scale factor of the screen. NOTE(review): presumably the backing scale factor — confirm.
public let scaleFactor: CGFloat
/// Numeric display identifier. NOTE(review): presumably a `CGDirectDisplayID` — verify against the producer.
public let displayID: UInt32
/// Creates a screen description; every argument is stored unchanged in the property of the same name.
public init(
index: Int,
name: String,
frame: CGRect,
visibleFrame: CGRect,
isPrimary: Bool,
scaleFactor: CGFloat,
displayID: UInt32)
{
self.index = index
self.name = name
self.frame = frame
self.visibleFrame = visibleFrame
self.isPrimary = isPrimary
self.scaleFactor = scaleFactor
self.displayID = displayID
}
}
/// Result payload describing a UI screenshot: a path, dimensions, and optional identifiers
/// for the screen, display, or window involved.
///
/// All properties are immutable and are stored exactly as passed to the initializer.
public struct UIScreenshotResult: Codable, Sendable {
/// Path of the screenshot. NOTE(review): presumably the filesystem path of the written image — confirm.
public let path: String
/// Width of the screenshot. NOTE(review): unit (pixels vs. points) is not shown here — confirm.
public let width: Int
/// Height of the screenshot. NOTE(review): unit (pixels vs. points) is not shown here — confirm.
public let height: Int
/// Screen index associated with the screenshot, if any.
public let screenIndex: Int?
/// Display identifier associated with the screenshot, if any.
public let displayID: UInt32?
/// Window identifier associated with the screenshot, if any.
public let windowID: UInt32?
/// Creates a screenshot result; `screenIndex`, `displayID` and `windowID` default to `nil`.
public init(
path: String,
width: Int,
height: Int,
screenIndex: Int? = nil,
displayID: UInt32? = nil,
windowID: UInt32? = nil)
{
self.path = path
self.width = width
self.height = height
self.screenIndex = screenIndex
self.displayID = displayID
self.windowID = windowID
}
}
public enum Request: Sendable {
case notify(
title: String,
@@ -116,8 +58,6 @@ public enum Request: Sendable {
priority: NotificationPriority?,
delivery: NotificationDelivery?)
case ensurePermissions([Capability], interactive: Bool)
case uiListScreens
case uiScreenshot(screenIndex: Int?, windowID: UInt32?)
case runShell(
command: [String],
cwd: String?,
@@ -158,7 +98,6 @@ extension Request: Codable {
case type
case title, body, sound, priority, delivery
case caps, interactive
case screenIndex, windowID
case command, cwd, env, timeoutSec, needsScreenRecording
case message, thinking, session, deliver, to
case rpcStatus
@@ -174,8 +113,6 @@ extension Request: Codable {
private enum Kind: String, Codable {
case notify
case ensurePermissions
case uiListScreens
case uiScreenshot
case runShell
case status
case agent
@@ -205,14 +142,6 @@ extension Request: Codable {
try container.encode(caps, forKey: .caps)
try container.encode(interactive, forKey: .interactive)
case .uiListScreens:
try container.encode(Kind.uiListScreens, forKey: .type)
case let .uiScreenshot(screenIndex, windowID):
try container.encode(Kind.uiScreenshot, forKey: .type)
try container.encodeIfPresent(screenIndex, forKey: .screenIndex)
try container.encodeIfPresent(windowID, forKey: .windowID)
case let .runShell(command, cwd, env, timeoutSec, needsSR):
try container.encode(Kind.runShell, forKey: .type)
try container.encode(command, forKey: .command)
@@ -289,14 +218,6 @@ extension Request: Codable {
let interactive = try container.decode(Bool.self, forKey: .interactive)
self = .ensurePermissions(caps, interactive: interactive)
case .uiListScreens:
self = .uiListScreens
case .uiScreenshot:
let screenIndex = try container.decodeIfPresent(Int.self, forKey: .screenIndex)
let windowID = try container.decodeIfPresent(UInt32.self, forKey: .windowID)
self = .uiScreenshot(screenIndex: screenIndex, windowID: windowID)
case .runShell:
let command = try container.decode([String].self, forKey: .command)
let cwd = try container.decodeIfPresent(String.self, forKey: .cwd)