From c17440f5b4874882f8c459abb0c0064e1846c652 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sat, 13 Dec 2025 16:55:41 +0000 Subject: [PATCH] feat(mac): host PeekabooBridge for ui --- apps/macos/Package.resolved | 101 ++- apps/macos/Package.swift | 8 + apps/macos/Sources/Clawdis/AppState.swift | 11 + apps/macos/Sources/Clawdis/Constants.swift | 1 + .../Clawdis/ControlRequestHandler.swift | 47 -- .../Sources/Clawdis/GeneralSettings.swift | 5 + apps/macos/Sources/Clawdis/MenuBar.swift | 2 + .../PeekabooBridgeHostCoordinator.swift | 254 ++++++++ .../macos/Sources/Clawdis/Screenshotter.swift | 80 --- .../Sources/Clawdis/UIScreenService.swift | 44 -- .../macos/Sources/ClawdisCLI/ClawdisCLI.swift | 72 +-- apps/macos/Sources/ClawdisCLI/UICLI.swift | 589 ++++++++++++++++++ apps/macos/Sources/ClawdisIPC/IPC.swift | 79 --- docs/clawdis-mac.md | 56 +- docs/mac/canvas.md | 2 +- docs/mac/child-process.md | 17 +- docs/mac/icon.md | 2 +- docs/mac/peekaboo.md | 174 ++++-- docs/mac/xpc.md | 24 +- src/cli/gateway.sigterm.test.ts | 49 +- src/web/auto-reply.test.ts | 2 +- 21 files changed, 1197 insertions(+), 422 deletions(-) create mode 100644 apps/macos/Sources/Clawdis/PeekabooBridgeHostCoordinator.swift delete mode 100644 apps/macos/Sources/Clawdis/Screenshotter.swift delete mode 100644 apps/macos/Sources/Clawdis/UIScreenService.swift create mode 100644 apps/macos/Sources/ClawdisCLI/UICLI.swift diff --git a/apps/macos/Package.resolved b/apps/macos/Package.resolved index 9d633d7c8..bb3ce2bfc 100644 --- a/apps/macos/Package.resolved +++ b/apps/macos/Package.resolved @@ -1,6 +1,15 @@ { - "originHash" : "ee7127ff91914397f9991e22a0b06ab0bca0d83582adeed6011198c49167631b", + "originHash" : "5de6834e5cb92c45c61a2e6792b780ac231c5741def70f1efa9ec857fa12f8cb", "pins" : [ + { + "identity" : "eventsource", + "kind" : "remoteSourceControl", + "location" : "https://github.com/mattt/eventsource.git", + "state" : { + "revision" : "ca2a9d90cbe49e09b92f4b6ebd922c03ebea51d0", + "version" : 
"1.3.0" + } + }, { "identity" : "menubarextraaccess", "kind" : "remoteSourceControl", @@ -19,6 +28,96 @@ "version" : "2.8.1" } }, + { + "identity" : "swift-algorithms", + "kind" : "remoteSourceControl", + "location" : "https://github.com/apple/swift-algorithms", + "state" : { + "revision" : "87e50f483c54e6efd60e885f7f5aa946cee68023", + "version" : "1.2.1" + } + }, + { + "identity" : "swift-asn1", + "kind" : "remoteSourceControl", + "location" : "https://github.com/apple/swift-asn1.git", + "state" : { + "revision" : "810496cf121e525d660cd0ea89a758740476b85f", + "version" : "1.5.1" + } + }, + { + "identity" : "swift-async-algorithms", + "kind" : "remoteSourceControl", + "location" : "https://github.com/apple/swift-async-algorithms", + "state" : { + "revision" : "6c050d5ef8e1aa6342528460db614e9770d7f804", + "version" : "1.1.1" + } + }, + { + "identity" : "swift-collections", + "kind" : "remoteSourceControl", + "location" : "https://github.com/apple/swift-collections", + "state" : { + "branch" : "main", + "revision" : "8e5e4a8f3617283b556064574651fc0869943c9a" + } + }, + { + "identity" : "swift-configuration", + "kind" : "remoteSourceControl", + "location" : "https://github.com/apple/swift-configuration", + "state" : { + "branch" : "main", + "revision" : "3528deb75256d7dcbb0d71fa75077caae0a8c749" + } + }, + { + "identity" : "swift-crypto", + "kind" : "remoteSourceControl", + "location" : "https://github.com/apple/swift-crypto.git", + "state" : { + "revision" : "6f70fa9eab24c1fd982af18c281c4525d05e3095", + "version" : "4.2.0" + } + }, + { + "identity" : "swift-log", + "kind" : "remoteSourceControl", + "location" : "https://github.com/apple/swift-log.git", + "state" : { + "revision" : "bc386b95f2a16ccd0150a8235e7c69eab2b866ca", + "version" : "1.8.0" + } + }, + { + "identity" : "swift-numerics", + "kind" : "remoteSourceControl", + "location" : "https://github.com/apple/swift-numerics.git", + "state" : { + "revision" : "0c0290ff6b24942dadb83a929ffaaa1481df04a2", + 
"version" : "1.1.1" + } + }, + { + "identity" : "swift-sdk", + "kind" : "remoteSourceControl", + "location" : "https://github.com/modelcontextprotocol/swift-sdk.git", + "state" : { + "revision" : "c0407a0b52677cb395d824cac2879b963075ba8c", + "version" : "0.10.2" + } + }, + { + "identity" : "swift-service-lifecycle", + "kind" : "remoteSourceControl", + "location" : "https://github.com/swift-server/swift-service-lifecycle", + "state" : { + "revision" : "1de37290c0ab3c5a96028e0f02911b672fd42348", + "version" : "2.9.1" + } + }, { "identity" : "swift-subprocess", "kind" : "remoteSourceControl", diff --git a/apps/macos/Package.swift b/apps/macos/Package.swift index 997948874..e772e0db5 100644 --- a/apps/macos/Package.swift +++ b/apps/macos/Package.swift @@ -18,6 +18,9 @@ let package = Package( .package(url: "https://github.com/swiftlang/swift-subprocess.git", from: "0.1.0"), .package(url: "https://github.com/sparkle-project/Sparkle", from: "2.8.1"), .package(path: "../shared/ClawdisKit"), + .package(path: "../../Peekaboo/Core/PeekabooCore"), + .package(path: "../../Peekaboo/Core/PeekabooAutomationKit"), + .package(path: "../../Peekaboo/Core/PeekabooVisualizer"), ], targets: [ .target( @@ -42,6 +45,9 @@ let package = Package( .product(name: "MenuBarExtraAccess", package: "MenuBarExtraAccess"), .product(name: "Subprocess", package: "swift-subprocess"), .product(name: "Sparkle", package: "Sparkle"), + .product(name: "PeekabooBridge", package: "PeekabooCore"), + .product(name: "PeekabooAutomationKit", package: "PeekabooAutomationKit"), + .product(name: "PeekabooVisualizer", package: "PeekabooVisualizer"), ], resources: [ .copy("Resources/Clawdis.icns"), @@ -55,6 +61,8 @@ let package = Package( dependencies: [ "ClawdisIPC", "ClawdisProtocol", + .product(name: "PeekabooBridge", package: "PeekabooCore"), + .product(name: "PeekabooAutomationKit", package: "PeekabooAutomationKit"), ], swiftSettings: [ .enableUpcomingFeature("StrictConcurrency"), diff --git 
a/apps/macos/Sources/Clawdis/AppState.swift b/apps/macos/Sources/Clawdis/AppState.swift index 75fd5ac15..d3d47c988 100644 --- a/apps/macos/Sources/Clawdis/AppState.swift +++ b/apps/macos/Sources/Clawdis/AppState.swift @@ -155,6 +155,15 @@ final class AppState: ObservableObject { didSet { self.ifNotPreview { UserDefaults.standard.set(self.canvasEnabled, forKey: canvasEnabledKey) } } } + @Published var peekabooBridgeEnabled: Bool { + didSet { + self.ifNotPreview { + UserDefaults.standard.set(self.peekabooBridgeEnabled, forKey: peekabooBridgeEnabledKey) + Task { await PeekabooBridgeHostCoordinator.shared.setEnabled(self.peekabooBridgeEnabled) } + } + } + } + @Published var attachExistingGatewayOnly: Bool { didSet { self.ifNotPreview { @@ -231,6 +240,8 @@ final class AppState: ObservableObject { let storedPort = UserDefaults.standard.integer(forKey: webChatPortKey) self.webChatPort = storedPort > 0 ? storedPort : 18788 self.canvasEnabled = UserDefaults.standard.object(forKey: canvasEnabledKey) as? Bool ?? true + self.peekabooBridgeEnabled = UserDefaults.standard + .object(forKey: peekabooBridgeEnabledKey) as? Bool ?? 
true self.attachExistingGatewayOnly = UserDefaults.standard.bool(forKey: attachExistingGatewayOnlyKey) if !self.isPreview { diff --git a/apps/macos/Sources/Clawdis/Constants.swift b/apps/macos/Sources/Clawdis/Constants.swift index e509bf371..c4538365c 100644 --- a/apps/macos/Sources/Clawdis/Constants.swift +++ b/apps/macos/Sources/Clawdis/Constants.swift @@ -24,6 +24,7 @@ let webChatEnabledKey = "clawdis.webChatEnabled" let webChatSwiftUIEnabledKey = "clawdis.webChatSwiftUIEnabled" let webChatPortKey = "clawdis.webChatPort" let canvasEnabledKey = "clawdis.canvasEnabled" +let peekabooBridgeEnabledKey = "clawdis.peekabooBridgeEnabled" let deepLinkAgentEnabledKey = "clawdis.deepLinkAgentEnabled" let deepLinkKeyKey = "clawdis.deepLinkKey" let modelCatalogPathKey = "clawdis.modelCatalogPath" diff --git a/apps/macos/Sources/Clawdis/ControlRequestHandler.swift b/apps/macos/Sources/Clawdis/ControlRequestHandler.swift index 5dee97bbd..fc2a49e1c 100644 --- a/apps/macos/Sources/Clawdis/ControlRequestHandler.swift +++ b/apps/macos/Sources/Clawdis/ControlRequestHandler.swift @@ -58,53 +58,6 @@ enum ControlRequestHandler { let result = await AgentRPC.shared.status() return Response(ok: result.ok, message: result.error) - case .uiListScreens: - let screens = await MainActor.run { UIScreenService.listScreens() } - let payload = try JSONEncoder().encode(screens) - return Response(ok: true, payload: payload) - - case let .uiScreenshot(screenIndex, windowID): - let authorized = await PermissionManager - .ensure([.screenRecording], interactive: false)[.screenRecording] ?? false - guard authorized else { return Response(ok: false, message: "screen recording permission missing") } - - let resolution: (screenIndex: Int?, displayID: UInt32?) 
= await Task { @MainActor in - if let screenIndex, - let match = UIScreenService.listScreens().first(where: { $0.index == screenIndex }) - { - return (screenIndex, match.displayID) - } - return (nil, nil) - }.value - - let data = await Task { @MainActor in - await Screenshotter.capture(displayID: resolution.displayID, windowID: windowID) - }.value - - guard let data else { - return Response(ok: false, message: "screenshot failed") - } - - let dir = FileManager.default.temporaryDirectory.appendingPathComponent("clawdis-ui", isDirectory: true) - try? FileManager.default.createDirectory(at: dir, withIntermediateDirectories: true) - let outURL = dir.appendingPathComponent("screenshot-\(Int(Date().timeIntervalSince1970 * 1000)).png") - do { - try data.write(to: outURL) - } catch { - return Response(ok: false, message: "failed to write screenshot: \(error.localizedDescription)") - } - - let size = ScreenshotSize.readPNGSize(data: data) - let result = UIScreenshotResult( - path: outURL.path, - width: size?.width ?? 0, - height: size?.height ?? 
0, - screenIndex: resolution.screenIndex, - displayID: resolution.displayID, - windowID: windowID) - let payload = try JSONEncoder().encode(result) - return Response(ok: true, payload: payload) - case let .runShell(command, cwd, env, timeoutSec, needsSR): if needsSR { let authorized = await PermissionManager diff --git a/apps/macos/Sources/Clawdis/GeneralSettings.swift b/apps/macos/Sources/Clawdis/GeneralSettings.swift index 4a21e7435..82c3dc617 100644 --- a/apps/macos/Sources/Clawdis/GeneralSettings.swift +++ b/apps/macos/Sources/Clawdis/GeneralSettings.swift @@ -57,6 +57,11 @@ struct GeneralSettings: View { subtitle: "Allow the agent to show and control the Canvas panel.", binding: self.$state.canvasEnabled) + SettingsToggleRow( + title: "Enable Peekaboo Bridge", + subtitle: "Allow signed tools to drive UI automation via `clawdis-mac ui …`.", + binding: self.$state.peekabooBridgeEnabled) + SettingsToggleRow( title: "Enable debug tools", subtitle: "Show the Debug tab with development utilities.", diff --git a/apps/macos/Sources/Clawdis/MenuBar.swift b/apps/macos/Sources/Clawdis/MenuBar.swift index 86c45c365..c21df7b9e 100644 --- a/apps/macos/Sources/Clawdis/MenuBar.swift +++ b/apps/macos/Sources/Clawdis/MenuBar.swift @@ -183,6 +183,7 @@ final class AppDelegate: NSObject, NSApplicationDelegate { Task { await HealthStore.shared.refresh(onDemand: true) } Task { await PortGuardian.shared.sweep(mode: AppStateStore.shared.connectionMode) } Task { await self.socketServer.start() } + Task { await PeekabooBridgeHostCoordinator.shared.setEnabled(AppStateStore.shared.peekabooBridgeEnabled) } self.scheduleFirstRunOnboardingIfNeeded() // Developer/testing helper: auto-open WebChat when launched with --webchat @@ -202,6 +203,7 @@ final class AppDelegate: NSObject, NSApplicationDelegate { Task { await AgentRPC.shared.shutdown() } Task { await GatewayConnection.shared.shutdown() } Task { await self.socketServer.stop() } + Task { await PeekabooBridgeHostCoordinator.shared.stop() } 
} @MainActor diff --git a/apps/macos/Sources/Clawdis/PeekabooBridgeHostCoordinator.swift b/apps/macos/Sources/Clawdis/PeekabooBridgeHostCoordinator.swift new file mode 100644 index 000000000..35286ed25 --- /dev/null +++ b/apps/macos/Sources/Clawdis/PeekabooBridgeHostCoordinator.swift @@ -0,0 +1,254 @@ +import Foundation +import os +import PeekabooAutomationKit +import PeekabooBridge +import PeekabooFoundation +import PeekabooVisualizer + +@MainActor +final class PeekabooBridgeHostCoordinator { + static let shared = PeekabooBridgeHostCoordinator() + + private let logger = Logger(subsystem: "com.steipete.clawdis", category: "PeekabooBridge") + + private var host: PeekabooBridgeHost? + private var services: ClawdisPeekabooBridgeServices? + + func setEnabled(_ enabled: Bool) async { + if enabled { + await self.startIfNeeded() + } else { + await self.stop() + } + } + + func stop() async { + guard let host else { return } + await host.stop() + self.host = nil + self.services = nil + self.logger.info("PeekabooBridge host stopped") + } + + private func startIfNeeded() async { + guard self.host == nil else { return } + + let allowlistedTeamIDs: Set = ["Y5PE65HELJ"] + let allowlistedBundles: Set = [] + + let services = ClawdisPeekabooBridgeServices() + let server = PeekabooBridgeServer( + services: services, + hostKind: .gui, + allowlistedTeams: allowlistedTeamIDs, + allowlistedBundles: allowlistedBundles) + + let host = PeekabooBridgeHost( + socketPath: PeekabooBridgeConstants.clawdisSocketPath, + server: server, + allowedTeamIDs: allowlistedTeamIDs, + requestTimeoutSec: 10) + + self.services = services + self.host = host + + await host.start() + self.logger.info("PeekabooBridge host started at \(PeekabooBridgeConstants.clawdisSocketPath, privacy: .public)") + } +} + +@MainActor +private final class ClawdisPeekabooBridgeServices: PeekabooBridgeServiceProviding { + let permissions: PermissionsService + let screenCapture: any ScreenCaptureServiceProtocol + let automation: any 
UIAutomationServiceProtocol + let windows: any WindowManagementServiceProtocol + let applications: any ApplicationServiceProtocol + let menu: any MenuServiceProtocol + let dock: any DockServiceProtocol + let dialogs: any DialogServiceProtocol + let snapshots: any SnapshotManagerProtocol + + init() { + let logging = LoggingService(subsystem: "com.steipete.clawdis.peekaboo") + let visualizer = PeekabooVisualizerFeedbackClient(client: .shared) + + let snapshots = InMemorySnapshotManager(options: .init( + snapshotValidityWindow: 600, + maxSnapshots: 50, + deleteArtifactsOnCleanup: false)) + let applications = ApplicationService(feedbackClient: visualizer) + + let captureBase = ScreenCaptureService(loggingService: logging) + let screenCapture = FeedbackScreenCaptureService(base: captureBase, feedbackClient: visualizer) + + self.permissions = PermissionsService() + self.snapshots = snapshots + self.applications = applications + self.screenCapture = screenCapture + self.automation = UIAutomationService( + snapshotManager: snapshots, + loggingService: logging, + searchPolicy: .balanced, + feedbackClient: visualizer) + self.windows = WindowManagementService(applicationService: applications, feedbackClient: visualizer) + self.menu = MenuService(applicationService: applications, feedbackClient: visualizer) + self.dock = DockService(feedbackClient: visualizer) + self.dialogs = DialogService(feedbackClient: visualizer) + } +} + +@MainActor +private final class PeekabooVisualizerFeedbackClient: AutomationFeedbackClient { + private let client: VisualizationClient + + init(client: VisualizationClient) { + self.client = client + } + + func connect() { + self.client.connect() + } + + func showClickFeedback(at point: CGPoint, type: ClickType) async -> Bool { + await self.client.showClickFeedback(at: point, type: type) + } + + func showTypingFeedback(keys: [String], duration: TimeInterval, cadence: TypingCadence) async -> Bool { + await self.client.showTypingFeedback(keys: keys, 
duration: duration, cadence: cadence) + } + + func showScrollFeedback(at point: CGPoint, direction: ScrollDirection, amount: Int) async -> Bool { + await self.client.showScrollFeedback(at: point, direction: direction, amount: amount) + } + + func showHotkeyDisplay(keys: [String], duration: TimeInterval) async -> Bool { + await self.client.showHotkeyDisplay(keys: keys, duration: duration) + } + + func showSwipeGesture(from: CGPoint, to: CGPoint, duration: TimeInterval) async -> Bool { + await self.client.showSwipeGesture(from: from, to: to, duration: duration) + } + + func showMouseMovement(from: CGPoint, to: CGPoint, duration: TimeInterval) async -> Bool { + await self.client.showMouseMovement(from: from, to: to, duration: duration) + } + + func showWindowOperation(_ kind: WindowOperationKind, windowRect: CGRect, duration: TimeInterval) async -> Bool { + let mapped: WindowOperation = switch kind { + case .close: .close + case .minimize: .minimize + case .maximize: .maximize + case .move: .move + case .resize: .resize + case .setBounds: .setBounds + case .focus: .focus + } + return await self.client.showWindowOperation(mapped, windowRect: windowRect, duration: duration) + } + + func showDialogInteraction( + element: DialogElementType, + elementRect: CGRect, + action: DialogActionType) async -> Bool + { + await self.client.showDialogInteraction(element: element, elementRect: elementRect, action: action) + } + + func showMenuNavigation(menuPath: [String]) async -> Bool { + await self.client.showMenuNavigation(menuPath: menuPath) + } + + func showSpaceSwitch(from: Int, to: Int, direction: SpaceSwitchDirection) async -> Bool { + let mapped: SpaceDirection = direction == .left ? .left : .right + return await self.client.showSpaceSwitch(from: from, to: to, direction: mapped) + } + + func showAppLaunch(appName: String, iconPath: String?) 
async -> Bool { + await self.client.showAppLaunch(appName: appName, iconPath: iconPath) + } + + func showAppQuit(appName: String, iconPath: String?) async -> Bool { + await self.client.showAppQuit(appName: appName, iconPath: iconPath) + } + + func showScreenshotFlash(in rect: CGRect) async -> Bool { + await self.client.showScreenshotFlash(in: rect) + } + + func showWatchCapture(in rect: CGRect) async -> Bool { + await self.client.showWatchCapture(in: rect) + } +} + +@MainActor +private final class FeedbackScreenCaptureService: ScreenCaptureServiceProtocol { + private let base: any ScreenCaptureServiceProtocol + private let feedbackClient: any AutomationFeedbackClient + + init(base: any ScreenCaptureServiceProtocol, feedbackClient: any AutomationFeedbackClient) { + self.base = base + self.feedbackClient = feedbackClient + } + + func captureScreen( + displayIndex: Int?, + visualizerMode: CaptureVisualizerMode, + scale: CaptureScalePreference) async throws -> CaptureResult + { + let result = try await self.base.captureScreen( + displayIndex: displayIndex, + visualizerMode: visualizerMode, + scale: scale) + await self.showCaptureFeedback(mode: visualizerMode, rect: result.metadata.displayInfo?.bounds) + return result + } + + func captureWindow( + appIdentifier: String, + windowIndex: Int?, + visualizerMode: CaptureVisualizerMode, + scale: CaptureScalePreference) async throws -> CaptureResult + { + let result = try await self.base.captureWindow( + appIdentifier: appIdentifier, + windowIndex: windowIndex, + visualizerMode: visualizerMode, + scale: scale) + await self.showCaptureFeedback(mode: visualizerMode, rect: result.metadata.windowInfo?.bounds) + return result + } + + func captureFrontmost( + visualizerMode: CaptureVisualizerMode, + scale: CaptureScalePreference) async throws -> CaptureResult + { + let result = try await self.base.captureFrontmost(visualizerMode: visualizerMode, scale: scale) + await self.showCaptureFeedback(mode: visualizerMode, rect: 
result.metadata.windowInfo?.bounds) + return result + } + + func captureArea( + _ rect: CGRect, + visualizerMode: CaptureVisualizerMode, + scale: CaptureScalePreference) async throws -> CaptureResult + { + let result = try await self.base.captureArea(rect, visualizerMode: visualizerMode, scale: scale) + await self.showCaptureFeedback(mode: visualizerMode, rect: rect) + return result + } + + func hasScreenRecordingPermission() async -> Bool { + await self.base.hasScreenRecordingPermission() + } + + private func showCaptureFeedback(mode: CaptureVisualizerMode, rect: CGRect?) async { + guard let rect else { return } + switch mode { + case .screenshotFlash: + _ = await self.feedbackClient.showScreenshotFlash(in: rect) + case .watchCapture: + _ = await self.feedbackClient.showWatchCapture(in: rect) + } + } +} diff --git a/apps/macos/Sources/Clawdis/Screenshotter.swift b/apps/macos/Sources/Clawdis/Screenshotter.swift deleted file mode 100644 index cc1c6902e..000000000 --- a/apps/macos/Sources/Clawdis/Screenshotter.swift +++ /dev/null @@ -1,80 +0,0 @@ -import AppKit -import CoreGraphics -import Foundation -@preconcurrency import ScreenCaptureKit -import VideoToolbox - -enum Screenshotter { - @MainActor - static func capture(displayID: UInt32?, windowID: UInt32?) async -> Data? { - guard let content = try? await SCShareableContent.current else { return nil } - - let targetDisplay: SCDisplay? 
= if let displayID { - content.displays.first(where: { $0.displayID == displayID }) - } else { - content.displays.first - } - - let filter: SCContentFilter - if let windowID, let win = content.windows.first(where: { $0.windowID == windowID }) { - filter = SCContentFilter(desktopIndependentWindow: win) - } else if let display = targetDisplay { - filter = SCContentFilter(display: display, excludingWindows: []) - } else { - return nil - } - - let config = SCStreamConfiguration() - if let display = targetDisplay { - config.width = display.width - config.height = display.height - } - config.scalesToFit = true - config.colorSpaceName = CGColorSpace.displayP3 - - let stream = SCStream(filter: filter, configuration: config, delegate: nil) - let grabber = FrameGrabber() - try? stream.addStreamOutput( - grabber, - type: .screen, - sampleHandlerQueue: DispatchQueue(label: "com.steipete.clawdis.sshot")) - do { - try await stream.startCapture() - let data = await grabber.awaitPNG() - try? await stream.stopCapture() - return data - } catch { - return nil - } - } -} - -final class FrameGrabber: NSObject, SCStreamOutput { - private var continuation: CheckedContinuation? - private var delivered = false - - func awaitPNG() async -> Data? { - await withCheckedContinuation { cont in - self.continuation = cont - } - } - - nonisolated func stream( - _ stream: SCStream, - didOutputSampleBuffer sampleBuffer: CMSampleBuffer, - of outputType: SCStreamOutputType) - { - guard outputType == .screen else { return } - if self.delivered { return } - guard let imageBuffer = sampleBuffer.imageBuffer else { return } - var cgImage: CGImage? 
- let result = VTCreateCGImageFromCVPixelBuffer(imageBuffer, options: nil, imageOut: &cgImage) - guard result == noErr, let cgImage else { return } - let rep = NSBitmapImageRep(cgImage: cgImage) - guard let data = rep.representation(using: .png, properties: [:]) else { return } - - self.delivered = true - self.continuation?.resume(returning: data) - self.continuation = nil - } -} diff --git a/apps/macos/Sources/Clawdis/UIScreenService.swift b/apps/macos/Sources/Clawdis/UIScreenService.swift deleted file mode 100644 index 7442b8bb9..000000000 --- a/apps/macos/Sources/Clawdis/UIScreenService.swift +++ /dev/null @@ -1,44 +0,0 @@ -import AppKit -import ClawdisIPC -import CoreGraphics - -enum UIScreenService { - static func listScreens() -> [UIScreenInfo] { - let screens = NSScreen.screens - let mainScreen = NSScreen.main - - return screens.enumerated().map { index, screen in - UIScreenInfo( - index: index, - name: screen.peekabooName, - frame: screen.frame, - visibleFrame: screen.visibleFrame, - isPrimary: screen == mainScreen, - scaleFactor: screen.backingScaleFactor, - displayID: screen.displayID) - } - } -} - -private extension NSScreen { - var displayID: UInt32 { - if let num = self.deviceDescription[NSDeviceDescriptionKey("NSScreenNumber")] as? NSNumber { - return num.uint32Value - } - return 0 - } - - /// Match Peekaboo's `ScreenService` naming (built-in vs. resolution fallback). 
- var peekabooName: String { - let id = self.displayID - guard id != 0 else { return "Display" } - if CGDisplayIsBuiltin(id) != 0 { return "Built-in Display" } - - if let mode = CGDisplayCopyDisplayMode(id) { - return "\(mode.pixelWidth)×\(mode.pixelHeight) Display" - } - - return "External Display" - } -} - diff --git a/apps/macos/Sources/ClawdisCLI/ClawdisCLI.swift b/apps/macos/Sources/ClawdisCLI/ClawdisCLI.swift index 96dc70f5f..14ba977fa 100644 --- a/apps/macos/Sources/ClawdisCLI/ClawdisCLI.swift +++ b/apps/macos/Sources/ClawdisCLI/ClawdisCLI.swift @@ -15,6 +15,11 @@ struct ClawdisCLI { exit(code) } + if args.first == "ui" { + let code = try await UICLI.run(args: Array(args.dropFirst()), jsonOutput: jsonOutput) + exit(code) + } + let parsed = try parseCommandLine(args: args) let response = try await send(request: parsed.request) @@ -42,8 +47,6 @@ struct ClawdisCLI { var kind: Kind enum Kind { - case uiScreens - case uiScreenshot case generic } } @@ -100,29 +103,6 @@ struct ClawdisCLI { if caps.isEmpty { caps = Capability.allCases } return ParsedCLIRequest(request: .ensurePermissions(caps, interactive: interactive), kind: .generic) - case "ui": - guard let sub = args.first else { throw CLIError.help } - args = Array(args.dropFirst()) - - switch sub { - case "screens": - return ParsedCLIRequest(request: .uiListScreens, kind: .uiScreens) - case "screenshot": - var screenIndex: Int? - var windowID: UInt32? - while !args.isEmpty { - let arg = args.removeFirst() - switch arg { - case "--screen-index": screenIndex = args.popFirst().flatMap(Int.init) - case "--window-id": windowID = args.popFirst().flatMap(UInt32.init) - default: break - } - } - return ParsedCLIRequest(request: .uiScreenshot(screenIndex: screenIndex, windowID: windowID), kind: .uiScreenshot) - default: - throw CLIError.help - } - case "run": var cwd: String? 
var env: [String: String] = [:] @@ -333,24 +313,6 @@ struct ClawdisCLI { } switch parsed.kind { - case .uiScreens: - let screens = try self.decodePayload([UIScreenInfo].self, payload: response.payload) - if screens.isEmpty { - FileHandle.standardOutput.write(Data("No screens\n".utf8)) - return - } - for s in screens { - let primary = s.isPrimary ? " (primary)" : "" - let size = "\(Int(s.frame.width))×\(Int(s.frame.height))" - let scale = String(format: "%.1f", Double(s.scaleFactor)) - let line = "Display \(s.index + 1)\(primary): \(s.name) \(size) @\(scale)x (id \(s.displayID))\n" - FileHandle.standardOutput.write(Data(line.utf8)) - } - - case .uiScreenshot: - let result = try self.decodePayload(UIScreenshotResult.self, payload: response.payload) - FileHandle.standardOutput.write(Data((result.path + "\n").utf8)) - case .generic: if let payload = response.payload, let text = String(data: payload, encoding: .utf8), !text.isEmpty { FileHandle.standardOutput.write(payload) @@ -370,22 +332,6 @@ struct ClawdisCLI { ] switch parsed.kind { - case .uiScreens: - if let payload = response.payload, - let obj = try? JSONSerialization.jsonObject(with: payload) { - output["result"] = obj - } else { - output["result"] = [] - } - - case .uiScreenshot: - if let payload = response.payload, - let obj = try? JSONSerialization.jsonObject(with: payload) { - output["result"] = obj - } else { - output["result"] = NSNull() - } - case .generic: if let payload = response.payload, !payload.isEmpty { if let obj = try? JSONSerialization.jsonObject(with: payload) { @@ -424,8 +370,12 @@ struct ClawdisCLI { [--interactive] UI: - clawdis-mac ui screens - clawdis-mac ui screenshot [--screen-index ] [--window-id ] + clawdis-mac ui screenshot [...] + clawdis-mac ui see [...] + clawdis-mac ui click ... + clawdis-mac ui type ... + clawdis-mac ui wait ... 
+ clawdis-mac ui --help Shell: clawdis-mac run [--cwd ] [--env KEY=VAL] [--timeout ] diff --git a/apps/macos/Sources/ClawdisCLI/UICLI.swift b/apps/macos/Sources/ClawdisCLI/UICLI.swift new file mode 100644 index 000000000..0c987d0c3 --- /dev/null +++ b/apps/macos/Sources/ClawdisCLI/UICLI.swift @@ -0,0 +1,589 @@ +import Foundation +import Darwin +import PeekabooAutomationKit +import PeekabooBridge +import PeekabooFoundation + +enum UICLI { + static func run(args: [String], jsonOutput: Bool) async throws -> Int32 { + var args = args + guard let sub = args.first else { + self.printHelp() + return 0 + } + args.removeFirst() + + if sub == "--help" || sub == "-h" || sub == "help" { + self.printHelp() + return 0 + } + + let context = try await self.resolveContext() + + switch sub { + case "permissions": + return try await self.runPermissions(args: args, jsonOutput: jsonOutput, context: context) + case "frontmost": + return try await self.runFrontmost(args: args, jsonOutput: jsonOutput, context: context) + case "apps": + return try await self.runApps(args: args, jsonOutput: jsonOutput, context: context) + case "windows": + return try await self.runWindows(args: args, jsonOutput: jsonOutput, context: context) + case "screenshot": + return try await self.runScreenshot(args: args, jsonOutput: jsonOutput, context: context) + case "see": + return try await self.runSee(args: args, jsonOutput: jsonOutput, context: context) + case "click": + return try await self.runClick(args: args, jsonOutput: jsonOutput, context: context) + case "type": + return try await self.runType(args: args, jsonOutput: jsonOutput, context: context) + case "wait": + return try await self.runWait(args: args, jsonOutput: jsonOutput, context: context) + default: + self.printHelp() + return 1 + } + } + + // MARK: - Context + + private struct Context { + let client: PeekabooBridgeClient + let hostDescription: String + } + + private static func resolveContext() async throws -> Context { + let explicitSocket = 
ProcessInfo.processInfo.environment["PEEKABOO_BRIDGE_SOCKET"] + let candidates: [String] = if let explicitSocket, !explicitSocket.isEmpty { + [explicitSocket] + } else { + [ + PeekabooBridgeConstants.peekabooSocketPath, + PeekabooBridgeConstants.clawdisSocketPath, + ] + } + + let identity = PeekabooBridgeClientIdentity( + bundleIdentifier: Bundle.main.bundleIdentifier, + teamIdentifier: nil, + processIdentifier: getpid(), + hostname: Host.current().name) + + for socketPath in candidates { + let client = PeekabooBridgeClient(socketPath: socketPath, requestTimeoutSec: 10) + do { + let handshake = try await client.handshake(client: identity, requestedHost: nil) + return Context( + client: client, + hostDescription: "\(handshake.hostKind.rawValue) via \(socketPath)") + } catch let envelope as PeekabooBridgeErrorEnvelope { + if envelope.code == .unauthorizedClient { + throw envelope + } + } catch { + continue + } + } + + throw NSError(domain: "clawdis.ui", code: 1, userInfo: [ + NSLocalizedDescriptionKey: "No PeekabooBridge host reachable (run Peekaboo.app or Clawdis.app).", + ]) + } + + // MARK: - Commands + + private static func runPermissions(args: [String], jsonOutput: Bool, context: Context) async throws -> Int32 { + let sub = args.first ?? 
"status" + if sub != "status" && sub != "--help" && sub != "-h" && sub != "help" { + self.printHelp() + return 1 + } + let status = try await context.client.permissionsStatus() + if jsonOutput { + try self.writeJSON([ + "ok": true, + "host": context.hostDescription, + "result": try self.toJSONObject(status), + ]) + } else { + FileHandle.standardOutput.write(Data((self.formatPermissions(status) + "\n").utf8)) + } + return 0 + } + + private static func runFrontmost(args _: [String], jsonOutput: Bool, context: Context) async throws -> Int32 { + let app = try await context.client.getFrontmostApplication() + let window = try await context.client.getFocusedWindow() + if jsonOutput { + let windowObject: Any = if let window { + try self.toJSONObject(window) + } else { + NSNull() + } + try self.writeJSON([ + "ok": true, + "host": context.hostDescription, + "app": try self.toJSONObject(app), + "window": windowObject, + ]) + } else { + let bundle = app.bundleIdentifier ?? "" + let line = "\(bundle) (pid \(app.processIdentifier))" + FileHandle.standardOutput.write(Data((line + "\n").utf8)) + if let window { + FileHandle.standardOutput.write(Data(("window \(window.windowID): \(window.title)\n").utf8)) + } + } + return 0 + } + + private static func runApps(args _: [String], jsonOutput: Bool, context: Context) async throws -> Int32 { + let apps = try await context.client.listApplications() + if jsonOutput { + try self.writeJSON([ + "ok": true, + "host": context.hostDescription, + "result": try self.toJSONObject(apps), + ]) + } else { + for app in apps { + let bundle = app.bundleIdentifier ?? "" + FileHandle.standardOutput.write(Data(("\(bundle)\t\(app.name)\n").utf8)) + } + } + return 0 + } + + private static func runWindows(args: [String], jsonOutput: Bool, context: Context) async throws -> Int32 { + var args = args + var bundleId: String? 
+ while !args.isEmpty { + switch args.removeFirst() { + case "--bundle-id": + bundleId = args.popFirst() + case "--help", "-h", "help": + self.printHelp() + return 0 + default: + break + } + } + + let target: WindowTarget = if let bundleId, !bundleId.isEmpty { .application(bundleId) } else { .frontmost } + let windows = try await context.client.listWindows(target: target) + + if jsonOutput { + try self.writeJSON([ + "ok": true, + "host": context.hostDescription, + "result": try self.toJSONObject(windows), + ]) + } else { + for window in windows { + FileHandle.standardOutput.write(Data(("\(window.windowID)\t\(window.title)\n").utf8)) + } + } + return 0 + } + + private static func runScreenshot(args: [String], jsonOutput: Bool, context: Context) async throws -> Int32 { + var args = args + var displayIndex: Int? + var bundleId: String? + var windowIndex: Int? + var mode: CaptureVisualizerMode = .screenshotFlash + var scale: CaptureScalePreference = .logical1x + + while !args.isEmpty { + let arg = args.removeFirst() + switch arg { + case "--screen-index": + displayIndex = args.popFirst().flatMap(Int.init) + case "--bundle-id": + bundleId = args.popFirst() + case "--window-index": + windowIndex = args.popFirst().flatMap(Int.init) + case "--watch": + mode = .watchCapture + case "--scale": + let raw = args.popFirst()?.lowercased() + if raw == "native" { scale = .native } + if raw == "1x" || raw == "logical" || raw == "logical1x" { scale = .logical1x } + case "--help", "-h", "help": + self.printHelp() + return 0 + default: + break + } + } + + let capture: CaptureResult + if let bundleId, !bundleId.isEmpty { + capture = try await context.client.captureWindow( + appIdentifier: bundleId, + windowIndex: windowIndex, + visualizerMode: mode, + scale: scale) + } else if displayIndex != nil { + capture = try await context.client.captureScreen( + displayIndex: displayIndex, + visualizerMode: mode, + scale: scale) + } else { + capture = try await 
context.client.captureFrontmost(visualizerMode: mode, scale: scale) + } + + let path = try self.writeTempPNG(capture.imageData) + + if jsonOutput { + try self.writeJSON([ + "ok": true, + "host": context.hostDescription, + "path": path, + "metadata": try self.toJSONObject(capture.metadata), + "warning": capture.warning ?? "", + ]) + } else { + FileHandle.standardOutput.write(Data((path + "\n").utf8)) + } + return 0 + } + + private static func runSee(args: [String], jsonOutput: Bool, context: Context) async throws -> Int32 { + var args = args + var bundleId: String? + var windowIndex: Int? + var snapshotId: String? + + while !args.isEmpty { + let arg = args.removeFirst() + switch arg { + case "--bundle-id": + bundleId = args.popFirst() + case "--window-index": + windowIndex = args.popFirst().flatMap(Int.init) + case "--snapshot-id": + snapshotId = args.popFirst() + case "--help", "-h", "help": + self.printHelp() + return 0 + default: + break + } + } + + let capture: CaptureResult + if let bundleId, !bundleId.isEmpty { + capture = try await context.client.captureWindow( + appIdentifier: bundleId, + windowIndex: windowIndex, + visualizerMode: .screenshotFlash, + scale: .logical1x) + } else { + capture = try await context.client.captureFrontmost(visualizerMode: .screenshotFlash, scale: .logical1x) + bundleId = capture.metadata.applicationInfo?.bundleIdentifier + } + + let resolvedSnapshotId: String = if let snapshotId, !snapshotId.isEmpty { + snapshotId + } else if let bundleId, !bundleId.isEmpty, let existing = try? 
await context.client + .getMostRecentSnapshot(applicationBundleId: bundleId) { + existing + } else { + try await context.client.createSnapshot() + } + + let screenshotPath = try self.writeTempPNG(capture.imageData) + + try await context.client.storeScreenshot( + snapshotId: resolvedSnapshotId, + screenshotPath: screenshotPath, + applicationBundleId: bundleId, + applicationProcessId: capture.metadata.applicationInfo?.processIdentifier, + applicationName: capture.metadata.applicationInfo?.name, + windowTitle: capture.metadata.windowInfo?.title, + windowBounds: capture.metadata.windowInfo?.bounds) + + let windowContext = WindowContext( + applicationName: capture.metadata.applicationInfo?.name, + windowTitle: capture.metadata.windowInfo?.title, + windowBounds: capture.metadata.windowInfo?.bounds) + + let detection = try await context.client.detectElements( + in: capture.imageData, + snapshotId: resolvedSnapshotId, + windowContext: windowContext) + try await context.client.storeDetectionResult(snapshotId: resolvedSnapshotId, result: detection) + + if jsonOutput { + try self.writeJSON([ + "ok": true, + "host": context.hostDescription, + "snapshotId": resolvedSnapshotId, + "screenshotPath": screenshotPath, + "result": try self.toJSONObject(detection), + ]) + } else { + FileHandle.standardOutput.write(Data((screenshotPath + "\n").utf8)) + for el in detection.elements.all { + let b = el.bounds + let label = (el.label ?? el.value ?? "").replacingOccurrences(of: "\n", with: " ") + let line = + "\(el.id)\t\(el.type)\t\(Int(b.origin.x)),\(Int(b.origin.y)) \(Int(b.size.width))x\(Int(b.size.height))\t\(label)\n" + FileHandle.standardOutput.write(Data(line.utf8)) + } + } + return 0 + } + + private static func runClick(args: [String], jsonOutput: Bool, context: Context) async throws -> Int32 { + var args = args + var bundleId: String? + var snapshotId: String? + var on: String? 
+ var clickType: ClickType = .single + + while !args.isEmpty { + let arg = args.removeFirst() + switch arg { + case "--bundle-id": + bundleId = args.popFirst() + case "--snapshot-id": + snapshotId = args.popFirst() + case "--on": + on = args.popFirst() + case "--double": + clickType = .double + case "--right": + clickType = .right + case "--help", "-h", "help": + self.printHelp() + return 0 + default: + break + } + } + + guard let on, !on.isEmpty else { + throw NSError(domain: "clawdis.ui", code: 2, userInfo: [ + NSLocalizedDescriptionKey: "Missing --on (run `clawdis-mac ui see` first).", + ]) + } + + let effectiveSnapshotId = try await self.resolveImplicitSnapshotId( + snapshotId: snapshotId, + bundleId: bundleId, + client: context.client) + + try await context.client.click(target: .elementId(on), clickType: clickType, snapshotId: effectiveSnapshotId) + + if jsonOutput { + try self.writeJSON([ + "ok": true, + "host": context.hostDescription, + ]) + } + return 0 + } + + private static func runType(args: [String], jsonOutput: Bool, context: Context) async throws -> Int32 { + var args = args + var bundleId: String? + var snapshotId: String? + var into: String? + var clearExisting = false + var delayMs = 20 + var textParts: [String] = [] + + while !args.isEmpty { + let arg = args.removeFirst() + switch arg { + case "--bundle-id": + bundleId = args.popFirst() + case "--snapshot-id": + snapshotId = args.popFirst() + case "--into": + into = args.popFirst() + case "--clear": + clearExisting = true + case "--delay-ms": + delayMs = args.popFirst().flatMap(Int.init) ?? 
delayMs + case "--text": + if let next = args.popFirst() { + textParts.append(next) + } + case "--help", "-h", "help": + self.printHelp() + return 0 + default: + textParts.append(arg) + } + } + + let text = textParts.joined(separator: " ").trimmingCharacters(in: .whitespacesAndNewlines) + guard !text.isEmpty else { + throw NSError(domain: "clawdis.ui", code: 3, userInfo: [ + NSLocalizedDescriptionKey: "Missing text (use --text ).", + ]) + } + + let effectiveSnapshotId = try await self.resolveImplicitSnapshotId( + snapshotId: snapshotId, + bundleId: bundleId, + client: context.client) + + try await context.client.type( + text: text, + target: into, + clearExisting: clearExisting, + typingDelay: delayMs, + snapshotId: effectiveSnapshotId) + + if jsonOutput { + try self.writeJSON([ + "ok": true, + "host": context.hostDescription, + ]) + } + return 0 + } + + private static func runWait(args: [String], jsonOutput: Bool, context: Context) async throws -> Int32 { + var args = args + var bundleId: String? + var snapshotId: String? + var on: String? + var timeoutSec: Double = 10 + + while !args.isEmpty { + let arg = args.removeFirst() + switch arg { + case "--bundle-id": + bundleId = args.popFirst() + case "--snapshot-id": + snapshotId = args.popFirst() + case "--on": + on = args.popFirst() + case "--timeout": + timeoutSec = args.popFirst().flatMap(Double.init) ?? 
timeoutSec + case "--help", "-h", "help": + self.printHelp() + return 0 + default: + break + } + } + + guard let on, !on.isEmpty else { + throw NSError(domain: "clawdis.ui", code: 4, userInfo: [ + NSLocalizedDescriptionKey: "Missing --on .", + ]) + } + + let effectiveSnapshotId = try await self.resolveImplicitSnapshotId( + snapshotId: snapshotId, + bundleId: bundleId, + client: context.client) + + let result = try await context.client.waitForElement( + target: .elementId(on), + timeout: timeoutSec, + snapshotId: effectiveSnapshotId) + + if jsonOutput { + try self.writeJSON([ + "ok": true, + "host": context.hostDescription, + "result": try self.toJSONObject(result), + ]) + } else { + FileHandle.standardOutput.write(Data((result.found ? "found\n" : "not found\n").utf8)) + } + return result.found ? 0 : 1 + } + + private static func resolveImplicitSnapshotId( + snapshotId: String?, + bundleId: String?, + client: PeekabooBridgeClient) async throws -> String + { + if let snapshotId, !snapshotId.isEmpty { return snapshotId } + + let resolvedBundle: String? = if let bundleId, !bundleId.isEmpty { + bundleId + } else { + try await client.getFrontmostApplication().bundleIdentifier + } + + guard let resolvedBundle, !resolvedBundle.isEmpty else { + throw NSError(domain: "clawdis.ui", code: 5, userInfo: [ + NSLocalizedDescriptionKey: "Could not determine bundle id for implicit snapshot.", + ]) + } + + do { + return try await client.getMostRecentSnapshot(applicationBundleId: resolvedBundle) + } catch { + throw NSError(domain: "clawdis.ui", code: 6, userInfo: [ + NSLocalizedDescriptionKey: "No recent snapshot for \(resolvedBundle). 
Run `clawdis-mac ui see --bundle-id \(resolvedBundle)` first.", + ]) + } + } + + // MARK: - IO helpers + + private static func writeTempPNG(_ data: Data) throws -> String { + let dir = FileManager.default.temporaryDirectory + let formatter = ISO8601DateFormatter() + formatter.formatOptions = [.withInternetDateTime, .withFractionalSeconds] + let stamp = formatter.string(from: Date()).replacingOccurrences(of: ":", with: "-") + let url = dir.appendingPathComponent("clawdis-ui-\(stamp).png") + try data.write(to: url, options: [.atomic]) + return url.path + } + + private static func formatPermissions(_ status: PermissionsStatus) -> String { + let sr = status.screenRecording ? "screen-recording=ok" : "screen-recording=missing" + let ax = status.accessibility ? "accessibility=ok" : "accessibility=missing" + let ascr = status.appleScript ? "applescript=ok" : "applescript=missing" + return "\(sr) \(ax) \(ascr)" + } + + private static func toJSONObject(_ value: T) throws -> Any { + let encoder = JSONEncoder() + encoder.dateEncodingStrategy = .iso8601 + let data = try encoder.encode(value) + return try JSONSerialization.jsonObject(with: data) + } + + private static func writeJSON(_ obj: [String: Any]) throws { + let data = try JSONSerialization.data(withJSONObject: obj, options: [.prettyPrinted]) + FileHandle.standardOutput.write(data) + FileHandle.standardOutput.write(Data([0x0A])) + } + + private static func printHelp() { + let usage = """ + clawdis-mac ui — UI automation via PeekabooBridge + + Usage: + clawdis-mac [--json] ui ... 
+ + Commands: + permissions status + frontmost + apps + windows [--bundle-id ] + screenshot [--screen-index ] [--bundle-id ] [--window-index ] [--watch] [--scale native|1x] + see [--bundle-id ] [--window-index ] [--snapshot-id ] + click --on [--bundle-id ] [--snapshot-id ] [--double|--right] + type --text [--into ] [--bundle-id ] [--snapshot-id ] [--clear] [--delay-ms ] + wait --on [--bundle-id ] [--snapshot-id ] [--timeout ] + + Notes: + - Prefers Peekaboo.app’s bridge, then Clawdis.app’s bridge. + - Default timeout is 10 seconds per action. + """ + FileHandle.standardError.write(Data((usage + "\n").utf8)) + } +} diff --git a/apps/macos/Sources/ClawdisIPC/IPC.swift b/apps/macos/Sources/ClawdisIPC/IPC.swift index 526d91fdf..feb062bdc 100644 --- a/apps/macos/Sources/ClawdisIPC/IPC.swift +++ b/apps/macos/Sources/ClawdisIPC/IPC.swift @@ -50,64 +50,6 @@ public struct CanvasPlacement: Codable, Sendable { } } -// MARK: - UI (Peekaboo-aligned types) - -/// Display info aligned with Peekaboo's `ScreenService.ScreenInfo`: -/// - `index` is the 0-based position in `NSScreen.screens` at runtime. -/// - `frame`/`visibleFrame` are AppKit screen rectangles (bottom-left origin). -public struct UIScreenInfo: Codable, Sendable { - public let index: Int - public let name: String - public let frame: CGRect - public let visibleFrame: CGRect - public let isPrimary: Bool - public let scaleFactor: CGFloat - public let displayID: UInt32 - - public init( - index: Int, - name: String, - frame: CGRect, - visibleFrame: CGRect, - isPrimary: Bool, - scaleFactor: CGFloat, - displayID: UInt32) - { - self.index = index - self.name = name - self.frame = frame - self.visibleFrame = visibleFrame - self.isPrimary = isPrimary - self.scaleFactor = scaleFactor - self.displayID = displayID - } -} - -public struct UIScreenshotResult: Codable, Sendable { - public let path: String - public let width: Int - public let height: Int - public let screenIndex: Int? - public let displayID: UInt32? 
- public let windowID: UInt32? - - public init( - path: String, - width: Int, - height: Int, - screenIndex: Int? = nil, - displayID: UInt32? = nil, - windowID: UInt32? = nil) - { - self.path = path - self.width = width - self.height = height - self.screenIndex = screenIndex - self.displayID = displayID - self.windowID = windowID - } -} - public enum Request: Sendable { case notify( title: String, @@ -116,8 +58,6 @@ public enum Request: Sendable { priority: NotificationPriority?, delivery: NotificationDelivery?) case ensurePermissions([Capability], interactive: Bool) - case uiListScreens - case uiScreenshot(screenIndex: Int?, windowID: UInt32?) case runShell( command: [String], cwd: String?, @@ -158,7 +98,6 @@ extension Request: Codable { case type case title, body, sound, priority, delivery case caps, interactive - case screenIndex, windowID case command, cwd, env, timeoutSec, needsScreenRecording case message, thinking, session, deliver, to case rpcStatus @@ -174,8 +113,6 @@ extension Request: Codable { private enum Kind: String, Codable { case notify case ensurePermissions - case uiListScreens - case uiScreenshot case runShell case status case agent @@ -205,14 +142,6 @@ extension Request: Codable { try container.encode(caps, forKey: .caps) try container.encode(interactive, forKey: .interactive) - case .uiListScreens: - try container.encode(Kind.uiListScreens, forKey: .type) - - case let .uiScreenshot(screenIndex, windowID): - try container.encode(Kind.uiScreenshot, forKey: .type) - try container.encodeIfPresent(screenIndex, forKey: .screenIndex) - try container.encodeIfPresent(windowID, forKey: .windowID) - case let .runShell(command, cwd, env, timeoutSec, needsSR): try container.encode(Kind.runShell, forKey: .type) try container.encode(command, forKey: .command) @@ -289,14 +218,6 @@ extension Request: Codable { let interactive = try container.decode(Bool.self, forKey: .interactive) self = .ensurePermissions(caps, interactive: interactive) - case .uiListScreens: 
- self = .uiListScreens - - case .uiScreenshot: - let screenIndex = try container.decodeIfPresent(Int.self, forKey: .screenIndex) - let windowID = try container.decodeIfPresent(UInt32.self, forKey: .windowID) - self = .uiScreenshot(screenIndex: screenIndex, windowID: windowID) - case .runShell: let command = try container.decode([String].self, forKey: .command) let cwd = try container.decodeIfPresent(String.self, forKey: .cwd) diff --git a/docs/clawdis-mac.md b/docs/clawdis-mac.md index 4cc5be6f9..538938e0d 100644 --- a/docs/clawdis-mac.md +++ b/docs/clawdis-mac.md @@ -1,10 +1,10 @@ --- -summary: "Spec for the Clawdis macOS companion menu bar app and XPC broker" +summary: "Spec for the Clawdis macOS companion menu bar app and local broker (control socket + PeekabooBridge)" read_when: - Implementing macOS app features - - Touching XPC/CLI bridging + - Touching broker/CLI bridging --- -# Clawdis macOS Companion (menu bar + XPC broker) +# Clawdis macOS Companion (menu bar + local broker) Author: steipete · Status: draft spec · Date: 2025-12-05 @@ -12,21 +12,24 @@ Author: steipete · Status: draft spec · Date: 2025-12-05 - Single macOS menu-bar app named **Clawdis** that: - Shows native notifications for Clawdis/clawdis events. - Owns TCC prompts (Notifications, Accessibility, Screen Recording, Automation/AppleScript, Microphone, Speech Recognition). - - Brokers privileged actions (screen capture, shell with elevated UI context) via XPC. + - Brokers privileged actions via local IPC: + - Clawdis control socket (app-specific actions like notify/run) + - PeekabooBridge socket (`bridge.sock`) for UI automation (see `docs/mac/peekaboo.md`) - Provides a tiny CLI (`clawdis-mac`) that talks to the app; Node/TS shells out to it. - Replace the separate notifier helper pattern (Oracle) with a built-in notifier. - Offer a first-run experience similar to VibeTunnel’s onboarding (permissions + CLI install). 
## High-level design -- SwiftPM package in `apps/macos/` (macOS 15+, Swift 6): - - Dependency: `https://github.com/ChimeHQ/AsyncXPCConnection` (>=0.6.0). - - Targets: - - `ClawdisIPC` (shared Codable types + helpers). - - `Clawdis` (LSUIElement MenuBarExtra app; embeds XPC listener and notifier). - - `ClawdisCLI` (client that forms requests, talks XPC, prints JSON for scripts). -- Bundle ID: `com.steipete.clawdis`; XPC service name: `com.steipete.clawdis.xpc`. +- SwiftPM package in `apps/macos/` (macOS 15+, Swift 6). +- Targets: + - `ClawdisIPC` (shared Codable types + helpers for app-specific commands). + - `Clawdis` (LSUIElement MenuBarExtra app; hosts control socket + optional PeekabooBridgeHost). + - `ClawdisCLI` (`clawdis-mac`; prints text by default, `--json` for scripts). +- Bundle ID: `com.steipete.clawdis`. - The CLI lives in the app bundle `Contents/Helpers/clawdis-mac`; dev symlink `bin/clawdis-mac` points there. -- Node/TS layer calls the CLI; no direct XPC from Node. +- Node/TS layer calls the CLI; no direct privileged API calls from Node. + +Note: `docs/mac/xpc.md` describes an aspirational long-term Mach/XPC architecture. The current direction for UI automation is PeekabooBridge (socket-based). ## IPC contract (ClawdisIPC) - Codable enums; small payloads (<1 MB enforced in listener): @@ -36,13 +39,15 @@ enum Capability { notifications, accessibility, screenRecording, appleScript, mi enum Request { notify(title, body, sound?) ensurePermissions([Capability], interactive: Bool) - uiScreenshot(screenIndex?, windowID?) runShell(command:[String], cwd?, env?, timeoutSec?, needsScreenRecording: Bool) status } struct Response { ok: Bool; message?: String; payload?: Data } ``` -- Listener rejects oversize/unknown cases and validates the caller by code signature TeamID (with a `DEBUG`-only same-UID escape hatch controlled by `CLAWDIS_ALLOW_UNSIGNED_SOCKET_CLIENTS=1`). 
+- The control-socket server rejects oversize/unknown cases and validates the caller by code signature TeamID (with a `DEBUG`-only same-UID escape hatch controlled by `CLAWDIS_ALLOW_UNSIGNED_SOCKET_CLIENTS=1`). + +UI automation is not part of `ClawdisIPC.Request`: +- `clawdis-mac ui …` speaks **PeekabooBridge** (see `docs/mac/peekaboo.md`). ## App UX (Clawdis) - MenuBarExtra icon only (LSUIElement; no Dock). @@ -52,28 +57,37 @@ struct Response { ok: Bool; message?: String; payload?: Data } - Permissions: live status + “Request” buttons for Notifications/Accessibility/Screen Recording; links to System Settings. - Debug (when enabled): PID/log links, restart/reveal app shortcuts, manual test notification. - About: version, links, license. -- Pause behavior: matches Trimmy’s “Auto Trim” toggle. When paused, XPC listener returns `ok=false, message="clawdis paused"` for actions that would touch TCC (notify/run/screenshot). State is persisted (UserDefaults) and surfaced in menu and status view. +- Pause behavior: matches Trimmy’s “Auto Trim” toggle. When paused, the broker returns `ok=false, message="clawdis paused"` for actions that would touch TCC. State is persisted (UserDefaults) and surfaced in menu and status view. - Onboarding (VibeTunnel-inspired): Welcome → What it does → Install CLI (shows `ln -s .../clawdis-mac /usr/local/bin`) → Permissions checklist with live status → Test notification → Done. Re-show when `welcomeVersion` bumps or CLI/app version mismatch. ## Built-in services - NotificationManager: UNUserNotificationCenter primary; AppleScript `display notification` fallback; respects the `--sound` value on each request. - PermissionManager: checks/requests Notifications, Accessibility (AX), Screen Recording (capture probe); publishes changes for UI. -- ScreenCaptureManager: window/display PNG capture; gated on permission. +- UI automation + capture: provided by **PeekabooBridgeHost** when enabled (see `docs/mac/peekaboo.md`). 
- ShellExecutor: executes `Process` with timeout; rejects when `needsScreenRecording` and permission missing; returns stdout/stderr in payload. -- XPCListener actor: routes Request → managers; logs via OSLog. +- ControlSocketServer actor: routes Request → managers; logs via OSLog. ## CLI (`clawdis-mac`) - Subcommands (text by default; `--json` for machine output; non-zero exit on failure): - `notify --title --body [--sound] [--priority passive|active|timeSensitive] [--delivery system|overlay|auto]` - `ensure-permissions --cap accessibility --cap screenRecording [--interactive]` - - `ui screens` - - `ui screenshot [--screen-index N] [--window-id N]` + - `ui permissions status` + - `ui frontmost` + - `ui apps` + - `ui windows [--bundle-id ]` + - `ui screenshot [--screen-index ] [--bundle-id ] [--window-index ] [--watch] [--scale native|1x]` + - `ui see [--bundle-id ] [--window-index ] [--snapshot-id ]` + - `ui click --on [--bundle-id ] [--snapshot-id ] [--double|--right]` + - `ui type --text [--into ] [--bundle-id ] [--snapshot-id ] [--clear] [--delay-ms ]` + - `ui wait --on [--bundle-id ] [--snapshot-id ] [--timeout ]` - `run -- cmd args... [--cwd] [--env KEY=VAL] [--timeout 30] [--needs-screen-recording]` - `status` - Sounds: supply any macOS alert name with `--sound` per notification; omit the flag to use the system default. There is no longer a persisted “default sound” in the app UI. - Priority: `timeSensitive` is best-effort and falls back to `active` unless the app is signed with the Time Sensitive Notifications entitlement. - Delivery: `overlay` and `auto` show an in-app toast panel (bypasses Notification Center/Focus). -- Internals: builds a `ClawdisIPC.Request`, sends it to the running app over the local control socket, and prints text by default (or JSON with `--json`). +- Internals: + - For app-specific commands (`notify`, `ensure-permissions`, `run`, `status`): build `ClawdisIPC.Request`, send over the control socket. 
+ - For UI automation (`ui …`): connect to PeekabooBridge hosts (Peekaboo.app → Clawdis.app) and send one JSON request per command (see `docs/mac/peekaboo.md`). ## Integration with clawdis/Clawdis (Node/TS) - Add helper module that shells to `clawdis-mac`: @@ -135,6 +149,6 @@ Notes: ## Open questions / decisions - Where to place the dev symlink `bin/clawdis-mac` (repo root vs. `apps/macos/bin`)? -- Should `runShell` support streaming stdout/stderr (XPC with AsyncSequence) or just buffered? (Start buffered; streaming later.) +- Should `runShell` support streaming stdout/stderr (IPC with AsyncSequence) or just buffered? (Start buffered; streaming later.) - Icon: reuse Clawdis lobster or new mac-specific glyph? - Sparkle updates: bundled via Sparkle; release builds point at `https://raw.githubusercontent.com/steipete/clawdis/main/appcast.xml` and enable auto-checks, while debug builds leave the feed blank and disable checks. diff --git a/docs/mac/canvas.md b/docs/mac/canvas.md index e9c558a89..c7b396b87 100644 --- a/docs/mac/canvas.md +++ b/docs/mac/canvas.md @@ -69,7 +69,7 @@ Implementation notes: ## Agent API surface (proposed) -Expose Canvas via the existing `clawdis-mac` → XPC → app routing so the agent can: +Expose Canvas via the existing `clawdis-mac` → control socket → app routing so the agent can: - Show/hide the panel. - Navigate to a path (relative to the session root). - Evaluate JavaScript and optionally return results. diff --git a/docs/mac/child-process.md b/docs/mac/child-process.md index 630abee66..c1af8a2a5 100644 --- a/docs/mac/child-process.md +++ b/docs/mac/child-process.md @@ -8,7 +8,7 @@ read_when: Date: 2025-12-06 · Status: draft · Owner: steipete ## Goal -Run the Node-based Clawdis/clawdis gateway as a direct child of the LSUIElement app (instead of a launchd agent) while keeping all TCC-sensitive work inside the Swift app/XPC and wiring the existing “Clawdis Active” toggle to start/stop the child. 
+Run the Node-based Clawdis/clawdis gateway as a direct child of the LSUIElement app (instead of a launchd agent) while keeping all TCC-sensitive work inside the Swift app/broker layer and wiring the existing “Clawdis Active” toggle to start/stop the child. ## When to prefer the child-process mode - You want gateway lifetime strictly coupled to the menu-bar app (dies when the app quits) and controlled by the “Clawdis Active” toggle without touching launchd. @@ -18,12 +18,13 @@ Run the Node-based Clawdis/clawdis gateway as a direct child of the LSUIElement ## Tradeoffs vs. launchd - **Pros:** tighter coupling to UI state; simpler surface (no plist install/bootout); easier to stream stdout/stderr; fewer moving parts for beta users. - **Cons:** no built-in KeepAlive/login auto-start; app crash kills gateway; you must build your own restart/backoff; Activity Monitor will show both processes under the app; still need correct TCC handling (see below). -- **TCC:** behaviorally, child processes often inherit the parent app’s “responsible process” for TCC, but this is *not a contract*. Continue to route all protected actions through the Swift app/XPC so prompts stay tied to the signed app bundle. +- **TCC:** behaviorally, child processes often inherit the parent app’s “responsible process” for TCC, but this is *not a contract*. Continue to route all protected actions through the Swift app/broker so prompts stay tied to the signed app bundle. ## TCC guardrails (must keep) -- Screen Recording, Accessibility, mic, and speech prompts must originate from the Swift app/XPC. The Node child should never call these APIs directly; use the existing XPC/CLI broker (`clawdis-mac`) for: +- Screen Recording, Accessibility, mic, and speech prompts must originate from the signed Swift app/broker. 
The Node child should never call these APIs directly; use the CLI broker (`clawdis-mac`) for: - `ensure-permissions` - - `ui screenshot` / ScreenCaptureKit work + - `ui screenshot` (via PeekabooBridge host) + - other `ui …` automation (see/click/type/scroll/wait) when implemented - mic/speech permission checks - notifications - shell runs that need `needs-screen-recording` @@ -48,7 +49,7 @@ Run the Node-based Clawdis/clawdis gateway as a direct child of the LSUIElement ## Packaging and signing - Bundle the gateway payload (dist + production node_modules) under `Contents/Resources/Gateway/`; rely on host Node ≥22 instead of embedding a runtime. - Codesign native addons and dylibs inside the bundle; no nested runtime binary to sign now. -- Host runtime should not call TCC APIs directly; keep privileged work inside the app/XPC. +- Host runtime should not call TCC APIs directly; keep privileged work inside the app/broker. ## Logging and observability - Stream child stdout/stderr to `/tmp/clawdis-gateway.log`; surface the last N lines in the Debug tab. @@ -58,14 +59,14 @@ Run the Node-based Clawdis/clawdis gateway as a direct child of the LSUIElement ## Failure/edge cases - App crash/quit kills the gateway. Decide if that is acceptable for the deployment tier; otherwise, stick with launchd for production and keep child-process for dev/experiments. - If the gateway exits repeatedly, back off (e.g., 1s/2s/5s/10s) and give up after N attempts with a menu warning. -- Respect the existing pause semantics: when paused, the XPC should return `ok=false, "clawdis paused"`; the gateway should avoid calling privileged routes while paused. +- Respect the existing pause semantics: when paused, the broker should return `ok=false, "clawdis paused"`; the gateway should avoid calling privileged routes while paused. ## Open questions / follow-ups - Do we need dual-mode (launchd for prod, child for dev)? If yes, gate via a setting or build flag. 
- Embedding a runtime is off the table for now; we rely on host Node for size/simplicity. Revisit only if host PATH drift becomes painful. -- Do we want a tiny signed helper for rare TCC actions that cannot be brokered via XPC? +- Do we want a tiny signed helper for rare TCC actions that cannot be brokered via the Swift app/broker? ## Decision snapshot (current recommendation) -- Keep all TCC surfaces in the Swift app/XPC. +- Keep all TCC surfaces in the Swift app/broker (control socket + PeekabooBridgeHost). - Implement `GatewayProcessManager` with Swift Subprocess to start/stop the gateway on the “Clawdis Active” toggle. - Maintain the launchd path as a fallback for uptime/login persistence until child-mode proves stable. diff --git a/docs/mac/icon.md b/docs/mac/icon.md index 4e2acadc1..3e574183e 100644 --- a/docs/mac/icon.md +++ b/docs/mac/icon.md @@ -22,5 +22,5 @@ Shapes & sizes - Scurry uses leg wiggle up to ~1.0 with a small horizontal jiggle; it’s additive to any existing idle wiggle. Behavioral notes -- No external CLI/XPC toggle for ears/working; keep it internal to the app’s own signals to avoid accidental flapping. +- No external CLI/broker toggle for ears/working; keep it internal to the app’s own signals to avoid accidental flapping. - Keep TTLs short (<10s) so the icon returns to baseline quickly if a job hangs. 
diff --git a/docs/mac/peekaboo.md b/docs/mac/peekaboo.md index 6bbfe3574..4313d0e56 100644 --- a/docs/mac/peekaboo.md +++ b/docs/mac/peekaboo.md @@ -1,44 +1,80 @@ --- -summary: "Plan for integrating Peekaboo automation + visualizer into Clawdis macOS app (via clawdis-mac)" +summary: "Plan for integrating Peekaboo automation into Clawdis via PeekabooBridge (socket-based TCC broker)" read_when: - Adding UI automation commands - Integrating Peekaboo as a submodule - Changing clawdis-mac IPC/output formats --- -# Peekaboo in Clawdis (macOS UI automation + visualizer) +# Peekaboo Bridge in Clawdis (macOS UI automation broker) -## Goal -Reuse Peekaboo’s mac automation “core” inside **Clawdis.app** so we piggyback on Clawdis’ existing TCC grants (Screen Recording, Accessibility, etc.). The CLI (`clawdis-mac`) stays a thin synchronous trigger surface for **single actions** (no batches), returning errors cleanly. +## TL;DR +- **Peekaboo removed its XPC helper** and now exposes privileged automation via a **UNIX domain socket bridge** (`PeekabooBridge` / `PeekabooBridgeHost`, socket name `bridge.sock`). +- Clawdis integrates by **hosting the same bridge** inside **Clawdis.app** (optional, user-toggleable), and by making `clawdis-mac ui …` act as a **bridge client**. +- For **visualizations**, we keep them in **Peekaboo.app** (best UX); Clawdis stays a thin broker host. No visualizer toggle in Clawdis. Non-goals: -- No AI/agent runtime parts from Peekaboo (no Tachikoma/MCP/Commander entrypoints). -- No auto-onboarding or System Settings deep-linking from the automation layer (Clawdis onboarding already handles that). +- No auto-launching Peekaboo.app. +- No onboarding deep links from the automation endpoint (Clawdis onboarding already handles permissions). +- No AI provider/agent runtime dependencies in Clawdis (avoid pulling Tachikoma/MCP into the Clawdis app/CLI). -## Where code lives -- **Clawdis.app (macOS)**: owns all automation + visualization + TCC prompts. 
-- **`clawdis-mac` CLI**: sends one request, waits, prints result, exits non-zero on failure. -- **Gateway/Node/TS**: shells out to `clawdis-mac` when it needs TCC-backed actions. +## Big refactor (Dec 2025): XPC → Bridge +Peekaboo’s privileged execution moved from “CLI → XPC helper” to “CLI → socket bridge host”. For Clawdis this is a win: +- It matches the existing “local socket + codesign checks” approach. +- It lets us piggyback on **either** Peekaboo.app’s permissions **or** Clawdis.app’s permissions (whichever is running). +- It avoids “two apps with two TCC bubbles” unless needed. -Transport: existing UNIX domain socket (`controlSocketPath`) already used by `clawdis-mac`. +Reference (Peekaboo submodule): `docs/bridge-host.md`. -## Dependencies (submodule strategy) -Integrate Peekaboo via git submodule (nested submodules OK). +## Architecture +### Processes +- **Bridge hosts** (provide TCC-backed automation): + - **Peekaboo.app** (preferred; also provides visualizations + controls) + - **Clawdis.app** (secondary; “thin host” only) +- **Bridge clients** (trigger single actions): + - `clawdis-mac ui …` + - Node/Gateway shells out to `clawdis-mac` -Consume only: -- `PeekabooAutomationKit` (AX automation, element detection, capture helpers; no Tachikoma/MCP). -- `AXorcist` (input driving / AX helpers). -- `PeekabooVisualizer` (overlay visualizations). +### Host discovery (client-side) +Order is deliberate: +1. Peekaboo.app host (full UX) +2. Clawdis.app host (piggyback on Clawdis permissions) -Important nuance: -- `PeekabooAutomationKit` is a standalone SwiftPM package and does **not** require Tachikoma/MCP/Commander. -- `PeekabooVisualizer` ships as a product inside `PeekabooCore/Package.swift`. That package declares other dependencies (including a path dependency to Tachikoma). SwiftPM will still need those paths to exist during dependency resolution even if we don’t build those targets. 
- - If this becomes annoying for Clawdis, the follow-up is to extract `PeekabooVisualizer` into its own standalone Swift package that depends only on `PeekabooFoundation`/`PeekabooProtocols`/`PeekabooExternalDependencies`. +Socket paths (convention; exact paths must match Peekaboo): +- Peekaboo: `~/Library/Application Support/Peekaboo/bridge.sock` +- Clawdis: `~/Library/Application Support/clawdis/bridge.sock` + +No auto-launch: if a host isn’t reachable, the command fails with a clear error (start Peekaboo.app or Clawdis.app). + +Override (debugging): set `PEEKABOO_BRIDGE_SOCKET=/path/to/bridge.sock`. + +### Protocol shape +- **Single request per connection**: connect → write one JSON request → half-close → read one JSON response → close. +- **Timeout**: 10 seconds end-to-end per action (client enforced; host should also enforce per-operation). +- **Errors**: human-readable string by default; structured envelope in `--json`. + +## Dependency strategy (submodule) +Integrate Peekaboo via git submodule (nested submodules are OK). + +Path in Clawdis repo: +- `./Peekaboo` (Swabble-style; keep stable so SwiftPM path deps don’t churn). + +What Clawdis should use: +- **Client side**: `PeekabooBridge` (socket client + protocol models). +- **Host side (Clawdis.app)**: `PeekabooBridgeHost` + the minimal Peekaboo services needed to implement operations. + +What Clawdis should *not* embed: +- **Visualizer UI**: keep it in Peekaboo.app for now (toggle + controls live there). +- **XPC**: don’t reintroduce helper targets; use the bridge. ## IPC / CLI surface ### Namespacing Add new automation commands behind a `ui` prefix: - `clawdis-mac ui …` for UI automation + visualization-related actions. -- Keep existing top-level commands (`notify`, `run`, `canvas …`, etc.) for compatibility, but do a clean cutover for screenshots: remove the legacy top-level `screenshot` command and ship only `clawdis-mac ui screenshot`. 
+- Keep existing top-level commands (`notify`, `run`, `canvas …`, etc.) for compatibility. + +Screenshot cutover: +- Remove legacy screenshot endpoints/commands. +- Ship only `clawdis-mac ui screenshot` (no aliases). ### Output format Change `clawdis-mac` to default to human text output: @@ -50,14 +86,14 @@ This applies globally, not only `ui` commands. Note (current state as of 2025-12-13): `clawdis-mac` prints text by default; use `--json` for structured output. ### Timeouts -Default timeout for UI actions: **10 seconds** end-to-end (CLI already defaults to 10s). -- CLI: keep the fail-fast default at 10s (unless a command explicitly requests longer). -- Server: only has a ~5s read/decode timeout today; UI operations must also enforce their own per-action timeout so “wait for element” can fail deterministically. +Default timeout for UI actions: **10 seconds** end-to-end. ## Coordinate model (multi-display) Requirement: coordinates are **per screen**, not global. -Proposed API shape: +Standardize for the CLI (agent-friendly): **top-left origin per screen**. + +Proposed request shape: - Requests accept `screenIndex` + `{x, y}` in that screen’s local coordinate space. - Clawdis.app converts to global CG coordinates using `NSScreen.screens[screenIndex].frame.origin`. - Responses should echo both: @@ -68,53 +104,48 @@ Proposed API shape: Ordering: use `NSScreen.screens` ordering consistently (documented in the CLI help + JSON schema). 
## Targeting (per app/window) -Expose window/app targeting in the IPC surface (based on Peekaboo’s existing `WindowTarget` model): +Expose window/app targeting in the UI surface (align with Peekaboo targeting): - frontmost - by app name / bundle id - by window title substring - by (app, index) -- by window id + +Current `clawdis-mac ui …` support: +- `--bundle-id <id>` for app targeting +- `--window-index <n>` (0-based) for disambiguating within an app when capturing (see/screenshot) All “see/click/type/scroll/wait” requests should accept a target (default: frontmost). ## “See” + click packs (Playwright-style) -Peekaboo already has the core ingredients: -- element detection yielding stable IDs (e.g., `B1`, `T3`) -- bounds + labels/values -- snapshot IDs to allow follow-up actions without re-scanning +Behavior stays aligned with Peekaboo: +- `ui see` returns element IDs (e.g. `B1`, `T3`) with bounds/labels. +- Follow-up actions reference those IDs without re-scanning. -Clawdis’s `ui see` should: +`clawdis-mac ui see` should: - capture (optionally targeted) window/screen -- return a **snapshot id** -- return a list of elements with `{id, type, label/value?, bounds}` -- optionally return screenshot path/bytes (pref: path) +- return a screenshot **file path** (default: temp directory) +- return a list of elements (text or JSON) Snapshot lifecycle requirement: -- Clawdis runs long-lived in memory, so “snapshot state” should be **in-memory by default** (no disk-backed JSON concept). -- Peekaboo already supports this via an `InMemorySnapshotManager` (keep disk-backed snapshots as an optional debug mode later). +- Host apps are long-lived, so snapshot state should be **in-memory by default**. +- Snapshot scoping: “implicit snapshot” is **per target bundle id** (reuse last snapshot for that app when snapshot id is omitted). 
+ +Practical flow (agent-friendly): +- `clawdis-mac ui frontmost` returns the focused app (bundle id) + focused window (title/id) so follow-up calls can pass `--bundle-id …`. +- `clawdis-mac ui see --bundle-id X` updates the implicit snapshot for `X`. +- `clawdis-mac ui click --bundle-id X --on B1` reuses the most recent snapshot for `X` when `--snapshot-id` is omitted. ## Visualizer integration -Visualizer must be user-toggleable via a Clawdis setting. - -Implementation sketch: -- Add a Clawdis UserDefaults-backed setting (e.g. `clawdis.ui.visualizerEnabled`). -- Implement Peekaboo’s `VisualizerSettingsProviding` in Clawdis (`visualizerEnabled`, animation speed, and per-effect toggles). -- Create a Clawdis-specific `AutomationFeedbackClient` that forwards PeekabooAutomationKit feedback events into a shared `VisualizerCoordinator`. - -Current state: -- `PeekabooVisualizer` already includes the visualization implementation (SwiftUI overlay views + coordinator). -The visualizer is intentionally display-only (no clickable overlays needed). +Keep visualizations in **Peekaboo.app** for now. +- Clawdis hosts the bridge, but does not render overlays. +- Any “visualizer enabled/disabled” setting is controlled in Peekaboo.app. ## Screenshots (legacy → Peekaboo takeover) Clawdis uses `clawdis-mac ui screenshot` and returns a file path (default location: temp directory) instead of raw image bytes. Migration plan: -- Replace capture implementation with PeekabooAutomationKit’s capture service so we share: - - per-screen mapping - - window/app targeting - - visual feedback (flash / watch HUD) when enabled -- Keep writing images to a file path on the app side and returning the path (text-friendly), with `--json` providing the structured metadata. -- No aliases: remove the old `Request.screenshot` and introduce a new `Request.uiScreenshot` (or similar) so the new behavior is explicit and there’s no “legacy mode” to maintain. 
+- Bridge host performs capture and returns a temp file path. +- No legacy aliases; make the old screenshot surface disappear cleanly. ## Permissions behavior If required permissions are missing: @@ -122,17 +153,32 @@ If required permissions are missing: - do not try to open System Settings from the automation endpoint ## Security (socket auth) -Clawdis’ socket is protected by: +Both hosts must enforce: - filesystem perms on the socket path (owner read/write only) -- server-side caller check: - - requires the caller’s code signature TeamID to be `Y5PE65HELJ` - - in `DEBUG` builds only, an explicit escape hatch allows same-UID clients when `CLAWDIS_ALLOW_UNSIGNED_SOCKET_CLIENTS=1` is set (development convenience) +- server-side caller validation: + - require the caller’s code signature TeamID to be `Y5PE65HELJ` + - optional bundle-id allowlist for tighter scoping -This ensures “any local process” can’t drive the privileged surface just because it runs under the same macOS user. +Debug-only escape hatch (development convenience): +- “allow same-UID callers” means: *skip codesign checks for clients running under the same Unix user*. +- This must be **opt-in**, **DEBUG-only**, and guarded by an env var (Peekaboo uses `PEEKABOO_ALLOW_UNSIGNED_SOCKET_CLIENTS=1`). + +## Current `clawdis-mac ui` commands (Dec 2025) +All commands default to text output. Add `--json` right after `clawdis-mac` for a structured envelope. 
+ +- `clawdis-mac ui permissions status` +- `clawdis-mac ui frontmost` +- `clawdis-mac ui apps` +- `clawdis-mac ui windows [--bundle-id <id>]` +- `clawdis-mac ui screenshot [--screen-index <n>] [--bundle-id <id>] [--window-index <n>] [--watch] [--scale native|1x]` +- `clawdis-mac ui see [--bundle-id <id>] [--window-index <n>] [--snapshot-id <id>]` +- `clawdis-mac ui click --on <elementId> [--bundle-id <id>] [--snapshot-id <id>] [--double|--right]` +- `clawdis-mac ui type --text <value> [--into <elementId>] [--bundle-id <id>] [--snapshot-id <id>] [--clear] [--delay-ms <n>]` +- `clawdis-mac ui wait --on <elementId> [--bundle-id <id>] [--snapshot-id <id>] [--timeout <sec>]` ## Next integration steps (after this doc) -1. Add Peekaboo as a git submodule (and required nested submodules). -2. Wire SwiftPM deps in `apps/macos/Package.swift` to import `PeekabooAutomationKit` + `PeekabooVisualizer`. -3. Extend `ClawdisIPC.Request` with `ui.*` commands (`see/click/type/scroll/wait/screenshot/windows/screens`). -4. Implement handlers in Clawdis.app and route through PeekabooAutomationKit services. -5. Update `clawdis-mac` output defaults (text + `--json`), and adjust any internal call sites that relied on JSON-by-default. +1. Add Peekaboo as a git submodule (nested submodules OK). +2. Add a small `clawdis-mac ui …` surface that speaks PeekabooBridge (text by default, `--json` for structured). +3. Host `PeekabooBridgeHost` inside Clawdis.app behind a single setting (“Enable Peekaboo Bridge”, default on). +4. Implement the minimum operation set needed for agents (see/click/type/scroll/wait/screenshot, plus list apps/windows/screens). +5. Keep all protocol decisions aligned with Peekaboo (coordinate system, element IDs, snapshot scoping, error envelopes). 
diff --git a/docs/mac/xpc.md b/docs/mac/xpc.md index 8b9361c84..0d7313d16 100644 --- a/docs/mac/xpc.md +++ b/docs/mac/xpc.md @@ -1,19 +1,29 @@ --- -summary: "macOS XPC architecture for Clawdis app, CLI helper, and gateway bridge" +summary: "macOS IPC architecture for Clawdis app, CLI helper, and gateway bridge (control socket + XPC + PeekabooBridge)" read_when: - - Editing XPC contracts or menu bar app IPC + - Editing IPC contracts or menu bar app IPC --- -# Clawdis macOS XPC architecture (Dec 2025) +# Clawdis macOS IPC architecture (Dec 2025) -Note: the current implementation primarily uses a local UNIX-domain control socket (`controlSocketPath`) between `clawdis-mac` and the app. This doc describes the intended long-term XPC/Mach-service architecture and the security constraints; update it as the implementation converges. +Note: the current implementation primarily uses a local UNIX-domain control socket (`controlSocketPath`) between `clawdis-mac` and the app. This doc captures the intended long-term Mach/XPC direction and the security constraints, and also documents the separate PeekabooBridge socket used for UI automation. ## Goals - Single GUI app instance that owns all TCC-facing work (notifications, screen recording, mic, speech, AppleScript). -- A small surface for automation: the `clawdis-mac` CLI and the Node gateway talk to the app via a local XPC channel. +- A small surface for automation: the `clawdis-mac` CLI and the Node gateway talk to the app via local IPC. - Predictable permissions: always the same signed bundle ID, launched by launchd, so TCC grants stick. - Limit who can connect: only signed clients from our team (with an explicit DEBUG-only escape hatch for development). ## How it works +### Control socket (current) +- `clawdis-mac` talks to the app via a local UNIX socket (`controlSocketPath`) for app-specific requests (notify, status, ensure-permissions, run, etc.). 
+ +### PeekabooBridge (UI automation) +- UI automation uses a separate UNIX socket named `bridge.sock` and the PeekabooBridge JSON protocol. +- Host preference order (client-side): Peekaboo.app → Clawdis.app → local execution. +- Security: bridge hosts require TeamID `Y5PE65HELJ`; DEBUG-only same-UID escape hatch is guarded by `PEEKABOO_ALLOW_UNSIGNED_SOCKET_CLIENTS=1` (Peekaboo convention). +- See: `docs/mac/peekaboo.md` for the Clawdis plan and naming. + +### Mach/XPC (future direction) - The app registers a Mach service named `com.steipete.clawdis.xpc` via a user LaunchAgent at `~/Library/LaunchAgents/com.steipete.clawdis.plist`. - The launch agent runs `dist/Clawdis.app/Contents/MacOS/Clawdis` with `RunAtLoad=true`, `KeepAlive=false`, and a `MachServices` entry for the XPC name. - The app hosts the XPC listener (`NSXPCListener(machServiceName:)`) and exports `ClawdisXPCService`. @@ -35,6 +45,8 @@ Note: the current implementation primarily uses a local UNIX-domain control sock - RunAtLoad without KeepAlive means the app starts once; if it crashes it stays down (no unwanted respawn), but CLI calls will re-spawn via launchd. ## Hardening notes -- Prefer requiring a TeamID match for all privileged surfaces. The codebase currently has a `DEBUG`-only same-UID escape hatch gated behind `CLAWDIS_ALLOW_UNSIGNED_SOCKET_CLIENTS=1` for local development. +- Prefer requiring a TeamID match for all privileged surfaces. + - Clawdis control socket: `CLAWDIS_ALLOW_UNSIGNED_SOCKET_CLIENTS=1` (DEBUG-only) may allow same-UID callers for local development. + - PeekabooBridge: `PEEKABOO_ALLOW_UNSIGNED_SOCKET_CLIENTS=1` (DEBUG-only) may allow same-UID callers for local development. - All communication remains local-only; no network sockets are exposed. - TCC prompts originate only from the GUI app bundle; run scripts/package-mac-app.sh so the signed bundle ID stays stable. 
diff --git a/src/cli/gateway.sigterm.test.ts b/src/cli/gateway.sigterm.test.ts index 57deddcbd..cd89b691d 100644 --- a/src/cli/gateway.sigterm.test.ts +++ b/src/cli/gateway.sigterm.test.ts @@ -2,18 +2,49 @@ import { spawn } from "node:child_process"; import net from "node:net"; import { afterEach, describe, expect, it } from "vitest"; -const waitForText = async ( - chunks: string[], - pattern: RegExp, +const waitForPortOpen = async ( + proc: ReturnType, + chunksOut: string[], + chunksErr: string[], + port: number, timeoutMs: number, ) => { const startedAt = Date.now(); while (Date.now() - startedAt < timeoutMs) { - const joined = chunks.join(""); - if (pattern.test(joined)) return; + if (proc.exitCode !== null) { + const stdout = chunksOut.join(""); + const stderr = chunksErr.join(""); + throw new Error( + `gateway exited before listening (code=${String(proc.exitCode)} signal=${String(proc.signalCode)})\n` + + `--- stdout ---\n${stdout}\n--- stderr ---\n${stderr}`, + ); + } + + try { + await new Promise((resolve, reject) => { + const socket = net.connect({ host: "127.0.0.1", port }); + socket.once("connect", () => { + socket.destroy(); + resolve(); + }); + socket.once("error", (err) => { + socket.destroy(); + reject(err); + }); + }); + return; + } catch { + // keep polling + } + await new Promise((resolve) => setTimeout(resolve, 10)); } - throw new Error(`timeout waiting for ${String(pattern)}`); + const stdout = chunksOut.join(""); + const stderr = chunksErr.join(""); + throw new Error( + `timeout waiting for gateway to listen on port ${port}\n` + + `--- stdout ---\n${stdout}\n--- stderr ---\n${stderr}`, + ); }; const getFreePort = async () => { @@ -67,9 +98,11 @@ describe("gateway SIGTERM", () => { child.stdout?.on("data", (d) => out.push(String(d))); child.stderr?.on("data", (d) => err.push(String(d))); - await waitForText( + await waitForPortOpen( + proc, out, - new RegExp(`gateway listening on ws://127\\.0\\.0\\.1:${port}\\b`), + err, + port, 20_000, ); diff 
--git a/src/web/auto-reply.test.ts b/src/web/auto-reply.test.ts index baf50f6c8..cbd104e43 100644 --- a/src/web/auto-reply.test.ts +++ b/src/web/auto-reply.test.ts @@ -1015,7 +1015,7 @@ describe("web auto-reply", () => { it( "compresses common formats to jpeg under the cap", - { timeout: 15_000 }, + { timeout: 45_000 }, async () => { const formats = [ {