feat: add browser snapshot modes

This commit is contained in:
Peter Steinberger
2026-01-15 03:50:48 +00:00
parent 4e48d0a431
commit a6e780b2f6
18 changed files with 430 additions and 141 deletions

View File

@@ -34,8 +34,7 @@ extension AttributedString {
var ranges: [Range<AttributedString.Index>] = []
for wordRange in wordRanges {
if let lastRange = ranges.last,
self[lastRange].characters.count + self[wordRange].characters.count <= maxLength
{
self[lastRange].characters.count + self[wordRange].characters.count <= maxLength {
ranges[ranges.count - 1] = lastRange.lowerBound..<wordRange.upperBound
} else {
ranges.append(wordRange)

View File

@@ -13,8 +13,7 @@ public actor TranscriptsStore {
try? FileManager.default.createDirectory(at: dir, withIntermediateDirectories: true)
fileURL = dir.appendingPathComponent("transcripts.log")
if let data = try? Data(contentsOf: fileURL),
let text = String(data: data, encoding: .utf8)
{
let text = String(data: data, encoding: .utf8) {
entries = text.split(separator: "\n").map(String.init).suffix(limit)
}
}

View File

@@ -24,8 +24,7 @@ public struct WakeWordGateConfig: Sendable, Equatable {
public init(
triggers: [String],
minPostTriggerGap: TimeInterval = 0.45,
minCommandLength: Int = 1)
{
minCommandLength: Int = 1) {
self.triggers = triggers
self.minPostTriggerGap = minPostTriggerGap
self.minCommandLength = minCommandLength
@@ -57,6 +56,12 @@ public enum WakeWordGate {
let tokens: [String]
}
private struct MatchCandidate {
let index: Int
let triggerEnd: TimeInterval
let gap: TimeInterval
}
public static func match(
transcript: String,
segments: [WakeWordSegment],
@@ -68,7 +73,7 @@ public enum WakeWordGate {
let tokens = normalizeSegments(segments)
guard !tokens.isEmpty else { return nil }
var best: (index: Int, triggerEnd: TimeInterval, gap: TimeInterval)?
var best: MatchCandidate?
for trigger in triggerTokens {
let count = trigger.tokens.count
@@ -84,7 +89,7 @@ public enum WakeWordGate {
if let best, i <= best.index { continue }
best = (i, triggerEnd, gap)
best = MatchCandidate(index: i, triggerEnd: triggerEnd, gap: gap)
}
}

View File

@@ -24,7 +24,7 @@ enum CLIRegistry {
subcommands: [
descriptor(for: ServiceInstall.self),
descriptor(for: ServiceUninstall.self),
descriptor(for: ServiceStatus.self),
descriptor(for: ServiceStatus.self)
])
let doctorDesc = descriptor(for: DoctorCommand.self)
let setupDesc = descriptor(for: SetupCommand.self)
@@ -54,7 +54,7 @@ enum CLIRegistry {
startDesc,
stopDesc,
restartDesc,
statusDesc,
statusDesc
])
return [root]
}

View File

@@ -25,7 +25,7 @@ private enum LaunchdHelper {
"Label": label,
"ProgramArguments": [executable, "serve"],
"RunAtLoad": true,
"KeepAlive": true,
"KeepAlive": true
]
let data = try PropertyListSerialization.data(fromPropertyList: plist, format: .xml, options: 0)
try data.write(to: plistURL)

View File

@@ -25,78 +25,123 @@ private func dispatch(invocation: CommandInvocation) async throws {
switch first {
case "swabble":
guard path.count >= 2 else { throw CommanderProgramError.missingSubcommand(command: "swabble") }
let sub = path[1]
switch sub {
case "serve":
var cmd = ServeCommand(parsed: parsed)
try await cmd.run()
case "transcribe":
var cmd = TranscribeCommand(parsed: parsed)
try await cmd.run()
case "test-hook":
var cmd = TestHookCommand(parsed: parsed)
try await cmd.run()
case "mic":
guard path.count >= 3 else { throw CommanderProgramError.missingSubcommand(command: "mic") }
let micSub = path[2]
if micSub == "list" {
var cmd = MicList(parsed: parsed)
try await cmd.run()
} else if micSub == "set" {
var cmd = MicSet(parsed: parsed)
try await cmd.run()
} else {
throw CommanderProgramError.unknownSubcommand(command: "mic", name: micSub)
}
case "service":
guard path.count >= 3 else { throw CommanderProgramError.missingSubcommand(command: "service") }
let svcSub = path[2]
switch svcSub {
case "install":
var cmd = ServiceInstall()
try await cmd.run()
case "uninstall":
var cmd = ServiceUninstall()
try await cmd.run()
case "status":
var cmd = ServiceStatus()
try await cmd.run()
default:
throw CommanderProgramError.unknownSubcommand(command: "service", name: svcSub)
}
case "doctor":
var cmd = DoctorCommand(parsed: parsed)
try await cmd.run()
case "setup":
var cmd = SetupCommand(parsed: parsed)
try await cmd.run()
case "health":
var cmd = HealthCommand(parsed: parsed)
try await cmd.run()
case "tail-log":
var cmd = TailLogCommand(parsed: parsed)
try await cmd.run()
case "start":
var cmd = StartCommand()
try await cmd.run()
case "stop":
var cmd = StopCommand()
try await cmd.run()
case "restart":
var cmd = RestartCommand()
try await cmd.run()
case "status":
var cmd = StatusCommand()
try await cmd.run()
default:
throw CommanderProgramError.unknownSubcommand(command: "swabble", name: sub)
}
try await dispatchSwabble(parsed: parsed, path: path)
default:
throw CommanderProgramError.unknownCommand(first)
}
}
@available(macOS 26.0, *)
@MainActor
private func dispatchSwabble(parsed: ParsedValues, path: [String]) async throws {
let sub = try subcommand(path, index: 1, command: "swabble")
switch sub {
case "mic":
try await dispatchMic(parsed: parsed, path: path)
case "service":
try await dispatchService(path: path)
default:
let handlers = swabbleHandlers(parsed: parsed)
guard let handler = handlers[sub] else {
throw CommanderProgramError.unknownSubcommand(command: "swabble", name: sub)
}
try await handler()
}
}
@available(macOS 26.0, *)
@MainActor
private func swabbleHandlers(parsed: ParsedValues) -> [String: () async throws -> Void] {
[
"serve": {
var cmd = ServeCommand(parsed: parsed)
try await cmd.run()
},
"transcribe": {
var cmd = TranscribeCommand(parsed: parsed)
try await cmd.run()
},
"test-hook": {
var cmd = TestHookCommand(parsed: parsed)
try await cmd.run()
},
"doctor": {
var cmd = DoctorCommand(parsed: parsed)
try await cmd.run()
},
"setup": {
var cmd = SetupCommand(parsed: parsed)
try await cmd.run()
},
"health": {
var cmd = HealthCommand(parsed: parsed)
try await cmd.run()
},
"tail-log": {
var cmd = TailLogCommand(parsed: parsed)
try await cmd.run()
},
"start": {
var cmd = StartCommand()
try await cmd.run()
},
"stop": {
var cmd = StopCommand()
try await cmd.run()
},
"restart": {
var cmd = RestartCommand()
try await cmd.run()
},
"status": {
var cmd = StatusCommand()
try await cmd.run()
}
]
}
@available(macOS 26.0, *)
@MainActor
private func dispatchMic(parsed: ParsedValues, path: [String]) async throws {
let micSub = try subcommand(path, index: 2, command: "mic")
switch micSub {
case "list":
var cmd = MicList(parsed: parsed)
try await cmd.run()
case "set":
var cmd = MicSet(parsed: parsed)
try await cmd.run()
default:
throw CommanderProgramError.unknownSubcommand(command: "mic", name: micSub)
}
}
@available(macOS 26.0, *)
@MainActor
private func dispatchService(path: [String]) async throws {
let svcSub = try subcommand(path, index: 2, command: "service")
switch svcSub {
case "install":
var cmd = ServiceInstall()
try await cmd.run()
case "uninstall":
var cmd = ServiceUninstall()
try await cmd.run()
case "status":
var cmd = ServiceStatus()
try await cmd.run()
default:
throw CommanderProgramError.unknownSubcommand(command: "service", name: svcSub)
}
}
private func subcommand(_ path: [String], index: Int, command: String) throws -> String {
guard path.count > index else {
throw CommanderProgramError.missingSubcommand(command: command)
}
return path[index]
}
if #available(macOS 26.0, *) {
let exitCode = await runCLI()
exit(exitCode)

View File

@@ -4,22 +4,22 @@ import Testing
@Suite(.serialized)
@MainActor
struct ConnectionsSettingsSmokeTests {
@Test func connectionsSettingsBuildsBodyWithSnapshot() {
let store = ConnectionsStore(isPreview: true)
store.snapshot = ChannelsStatusSnapshot(
ts: 1_700_000_000_000,
channelOrder: ["whatsapp", "telegram", "signal", "imessage"],
channelLabels: [
"whatsapp": "WhatsApp",
"telegram": "Telegram",
"signal": "Signal",
"imessage": "iMessage",
],
channels: [
"whatsapp": AnyCodable([
"configured": true,
"linked": true,
struct ConnectionsSettingsSmokeTests {
@Test func connectionsSettingsBuildsBodyWithSnapshot() {
let store = ConnectionsStore(isPreview: true)
store.snapshot = ChannelsStatusSnapshot(
ts: 1_700_000_000_000,
channelOrder: ["whatsapp", "telegram", "signal", "imessage"],
channelLabels: [
"whatsapp": "WhatsApp",
"telegram": "Telegram",
"signal": "Signal",
"imessage": "iMessage",
],
channels: [
"whatsapp": AnyCodable([
"configured": true,
"linked": true,
"authAgeMs": 86_400_000,
"self": ["e164": "+15551234567"],
"running": true,
@@ -70,13 +70,13 @@ import Testing
"lastError": "not configured",
"probe": ["ok": false, "error": "imsg not found (imsg)"],
"lastProbeAt": 1_700_000_050_000,
]),
],
channelAccounts: [:],
channelDefaultAccountId: [
"whatsapp": "default",
"telegram": "default",
"signal": "default",
]),
],
channelAccounts: [:],
channelDefaultAccountId: [
"whatsapp": "default",
"telegram": "default",
"signal": "default",
"imessage": "default",
])
@@ -93,23 +93,23 @@ import Testing
let view = ConnectionsSettings(store: store)
_ = view.body
}
}
@Test func connectionsSettingsBuildsBodyWithoutSnapshot() {
let store = ConnectionsStore(isPreview: true)
store.snapshot = ChannelsStatusSnapshot(
ts: 1_700_000_000_000,
channelOrder: ["whatsapp", "telegram", "signal", "imessage"],
channelLabels: [
"whatsapp": "WhatsApp",
"telegram": "Telegram",
"signal": "Signal",
"imessage": "iMessage",
],
channels: [
"whatsapp": AnyCodable([
"configured": false,
"linked": false,
@Test func connectionsSettingsBuildsBodyWithoutSnapshot() {
let store = ConnectionsStore(isPreview: true)
store.snapshot = ChannelsStatusSnapshot(
ts: 1_700_000_000_000,
channelOrder: ["whatsapp", "telegram", "signal", "imessage"],
channelLabels: [
"whatsapp": "WhatsApp",
"telegram": "Telegram",
"signal": "Signal",
"imessage": "iMessage",
],
channels: [
"whatsapp": AnyCodable([
"configured": false,
"linked": false,
"running": false,
"connected": false,
"reconnectAttempts": 0,
@@ -146,13 +146,13 @@ import Testing
"cliPath": "imsg",
"probe": ["ok": false, "error": "imsg not found (imsg)"],
"lastProbeAt": 1_700_000_200_000,
]),
],
channelAccounts: [:],
channelDefaultAccountId: [
"whatsapp": "default",
"telegram": "default",
"signal": "default",
]),
],
channelAccounts: [:],
channelDefaultAccountId: [
"whatsapp": "default",
"telegram": "default",
"signal": "default",
"imessage": "default",
])

View File

@@ -2,9 +2,9 @@ import Foundation
import Testing
@testable import Clawdbot
@Suite struct HealthDecodeTests {
private let sampleJSON: String = // minimal but complete payload
"""
@Suite struct HealthDecodeTests {
private let sampleJSON: String = // minimal but complete payload
"""
{"ts":1733622000,"durationMs":420,"channels":{"whatsapp":{"linked":true,"authAgeMs":120000},"telegram":{"configured":true,"probe":{"ok":true,"elapsedMs":800}}},"channelOrder":["whatsapp","telegram"],"heartbeatSeconds":60,"sessions":{"path":"/tmp/sessions.json","count":1,"recent":[{"key":"abc","updatedAt":1733621900,"age":120000}]}}
"""

View File

@@ -204,6 +204,8 @@ Inspection:
- `clawdbot browser snapshot`
- `clawdbot browser snapshot --format aria --limit 200`
- `clawdbot browser snapshot --interactive --compact --depth 6`
- `clawdbot browser snapshot --efficient`
- `clawdbot browser snapshot --labels`
- `clawdbot browser snapshot --selector "#main" --interactive`
- `clawdbot browser snapshot --frame "iframe#main" --interactive`
- `clawdbot browser console --level error`
@@ -260,9 +262,11 @@ Notes:
- `snapshot`:
- `--format ai` (default when Playwright is installed): returns an AI snapshot with numeric refs (`aria-ref="<n>"`).
- `--format aria`: returns the accessibility tree (no refs; inspection only).
- `--efficient` (or `--mode efficient`): compact role snapshot preset (interactive + compact + depth + lower maxChars).
- Role snapshot options (`--interactive`, `--compact`, `--depth`, `--selector`) force a role-based snapshot with refs like `ref=e12`.
- `--frame "<iframe selector>"` scopes role snapshots to an iframe (pairs with role refs like `e12`).
- `--interactive` outputs a flat, easy-to-pick list of interactive elements (best for driving actions).
- `--labels` adds a viewport-only screenshot with overlayed ref labels (prints `MEDIA:<path>`).
- `click`/`type`/etc require a `ref` from `snapshot` (either numeric `12` or role ref `e12`).
CSS selectors are intentionally not supported for actions.
@@ -279,6 +283,7 @@ Clawdbot supports two “snapshot” styles:
- Output: a role-based list/tree with `[ref=e12]` (and optional `[nth=1]`).
- Actions: `clawdbot browser click e12`, `clawdbot browser highlight e12`.
- Internally, the ref is resolved via `getByRole(...)` (plus `nth()` for duplicates).
- Add `--labels` to include a viewport screenshot with overlayed `e12` labels.
Ref behavior:
- Refs are **not stable across navigations**; if something fails, re-run `snapshot` and use a fresh ref.

View File

@@ -37,6 +37,7 @@ const BROWSER_TOOL_ACTIONS = [
const BROWSER_TARGETS = ["sandbox", "host", "custom"] as const;
const BROWSER_SNAPSHOT_FORMATS = ["aria", "ai"] as const;
const BROWSER_SNAPSHOT_MODES = ["efficient"] as const;
const BROWSER_IMAGE_TYPES = ["png", "jpeg"] as const;
@@ -87,12 +88,14 @@ export const BrowserToolSchema = Type.Object({
targetId: Type.Optional(Type.String()),
limit: Type.Optional(Type.Number()),
maxChars: Type.Optional(Type.Number()),
mode: optionalStringEnum(BROWSER_SNAPSHOT_MODES),
format: optionalStringEnum(BROWSER_SNAPSHOT_FORMATS),
interactive: Type.Optional(Type.Boolean()),
compact: Type.Optional(Type.Boolean()),
depth: Type.Optional(Type.Number()),
selector: Type.Optional(Type.String()),
frame: Type.Optional(Type.String()),
labels: Type.Optional(Type.Boolean()),
fullPage: Type.Optional(Type.Boolean()),
ref: Type.Optional(Type.String()),
element: Type.Optional(Type.String()),

View File

@@ -182,6 +182,8 @@ export function createBrowserTool(opts?: {
params.format === "ai" || params.format === "aria"
? (params.format as "ai" | "aria")
: "ai";
const mode = params.mode === "efficient" ? "efficient" : undefined;
const labels = typeof params.labels === "boolean" ? params.labels : undefined;
const hasMaxChars = Object.hasOwn(params, "maxChars");
const targetId = typeof params.targetId === "string" ? params.targetId.trim() : undefined;
const limit =
@@ -195,7 +197,13 @@ export function createBrowserTool(opts?: {
? Math.floor(params.maxChars)
: undefined;
const resolvedMaxChars =
format === "ai" ? (hasMaxChars ? maxChars : DEFAULT_AI_SNAPSHOT_MAX_CHARS) : undefined;
format === "ai"
? hasMaxChars
? maxChars
: mode === "efficient"
? undefined
: DEFAULT_AI_SNAPSHOT_MAX_CHARS
: undefined;
const interactive =
typeof params.interactive === "boolean" ? params.interactive : undefined;
const compact = typeof params.compact === "boolean" ? params.compact : undefined;
@@ -215,9 +223,19 @@ export function createBrowserTool(opts?: {
depth,
selector,
frame,
labels,
mode,
profile,
});
if (snapshot.format === "ai") {
if (labels && snapshot.imagePath) {
return await imageResultFromFile({
label: "browser:snapshot",
path: snapshot.imagePath,
extraText: snapshot.snapshot,
details: snapshot,
});
}
return {
content: [{ type: "text", text: snapshot.snapshot }],
details: snapshot,

View File

@@ -79,6 +79,11 @@ export type SnapshotResult =
refs: number;
interactive: number;
};
labels?: boolean;
labelsCount?: number;
labelsSkipped?: number;
imagePath?: string;
imageType?: "png" | "jpeg";
};
export function resolveBrowserControlUrl(overrideUrl?: string) {
@@ -264,6 +269,8 @@ export async function browserSnapshot(
depth?: number;
selector?: string;
frame?: string;
labels?: boolean;
mode?: "efficient";
profile?: string;
},
): Promise<SnapshotResult> {
@@ -280,6 +287,8 @@ export async function browserSnapshot(
q.set("depth", String(opts.depth));
if (opts.selector?.trim()) q.set("selector", opts.selector.trim());
if (opts.frame?.trim()) q.set("frame", opts.frame.trim());
if (opts.labels === true) q.set("labels", "1");
if (opts.mode) q.set("mode", opts.mode);
if (opts.profile) q.set("profile", opts.profile);
return await fetchBrowserJson<SnapshotResult>(`${baseUrl}/snapshot?${q.toString()}`, {
timeoutMs: 20000,

View File

@@ -3,3 +3,5 @@ export const DEFAULT_CLAWD_BROWSER_CONTROL_URL = "http://127.0.0.1:18791";
export const DEFAULT_CLAWD_BROWSER_COLOR = "#FF4500";
export const DEFAULT_CLAWD_BROWSER_PROFILE_NAME = "clawd";
export const DEFAULT_AI_SNAPSHOT_MAX_CHARS = 80_000;
export const DEFAULT_AI_SNAPSHOT_EFFICIENT_MAX_CHARS = 10_000;
export const DEFAULT_AI_SNAPSHOT_EFFICIENT_DEPTH = 6;

View File

@@ -42,6 +42,7 @@ export {
setTimezoneViaPlaywright,
snapshotAiViaPlaywright,
snapshotRoleViaPlaywright,
screenshotWithLabelsViaPlaywright,
storageClearViaPlaywright,
storageGetViaPlaywright,
storageSetViaPlaywright,

View File

@@ -347,6 +347,132 @@ export async function takeScreenshotViaPlaywright(opts: {
return { buffer };
}
export async function screenshotWithLabelsViaPlaywright(opts: {
cdpUrl: string;
targetId?: string;
refs: Record<string, { role: string; name?: string; nth?: number }>;
maxLabels?: number;
type?: "png" | "jpeg";
}): Promise<{ buffer: Buffer; labels: number; skipped: number }> {
const page = await getPageForTargetId(opts);
ensurePageState(page);
const type = opts.type ?? "png";
const maxLabels =
typeof opts.maxLabels === "number" && Number.isFinite(opts.maxLabels)
? Math.max(1, Math.floor(opts.maxLabels))
: 150;
const viewport = await page.evaluate(() => ({
scrollX: window.scrollX || 0,
scrollY: window.scrollY || 0,
width: window.innerWidth || 0,
height: window.innerHeight || 0,
}));
const refs = Object.keys(opts.refs ?? {});
const boxes: Array<{ ref: string; x: number; y: number; w: number; h: number }> = [];
let skipped = 0;
for (const ref of refs) {
if (boxes.length >= maxLabels) {
skipped += 1;
continue;
}
try {
const box = await refLocator(page, ref).boundingBox();
if (!box) {
skipped += 1;
continue;
}
const x0 = box.x;
const y0 = box.y;
const x1 = box.x + box.width;
const y1 = box.y + box.height;
const vx0 = viewport.scrollX;
const vy0 = viewport.scrollY;
const vx1 = viewport.scrollX + viewport.width;
const vy1 = viewport.scrollY + viewport.height;
if (x1 < vx0 || x0 > vx1 || y1 < vy0 || y0 > vy1) {
skipped += 1;
continue;
}
boxes.push({
ref,
x: x0 - viewport.scrollX,
y: y0 - viewport.scrollY,
w: Math.max(1, box.width),
h: Math.max(1, box.height),
});
} catch {
skipped += 1;
}
}
try {
if (boxes.length > 0) {
await page.evaluate((labels) => {
const existing = document.querySelectorAll("[data-clawdbot-labels]");
existing.forEach((el) => el.remove());
const root = document.createElement("div");
root.setAttribute("data-clawdbot-labels", "1");
root.style.position = "fixed";
root.style.left = "0";
root.style.top = "0";
root.style.zIndex = "2147483647";
root.style.pointerEvents = "none";
root.style.fontFamily =
'"SF Mono","SFMono-Regular",Menlo,Monaco,Consolas,"Liberation Mono","Courier New",monospace';
const clamp = (value: number, min: number, max: number) =>
Math.min(max, Math.max(min, value));
for (const label of labels) {
const box = document.createElement("div");
box.setAttribute("data-clawdbot-labels", "1");
box.style.position = "absolute";
box.style.left = `${label.x}px`;
box.style.top = `${label.y}px`;
box.style.width = `${label.w}px`;
box.style.height = `${label.h}px`;
box.style.border = "2px solid #ffb020";
box.style.boxSizing = "border-box";
const tag = document.createElement("div");
tag.setAttribute("data-clawdbot-labels", "1");
tag.textContent = label.ref;
tag.style.position = "absolute";
tag.style.left = `${label.x}px`;
tag.style.top = `${clamp(label.y - 18, 0, 20000)}px`;
tag.style.background = "#ffb020";
tag.style.color = "#1a1a1a";
tag.style.fontSize = "12px";
tag.style.lineHeight = "14px";
tag.style.padding = "1px 4px";
tag.style.borderRadius = "3px";
tag.style.boxShadow = "0 1px 2px rgba(0,0,0,0.35)";
tag.style.whiteSpace = "nowrap";
root.appendChild(box);
root.appendChild(tag);
}
document.documentElement.appendChild(root);
}, boxes);
}
const buffer = await page.screenshot({ type });
return { buffer, labels: boxes.length, skipped };
} finally {
await page
.evaluate(() => {
const existing = document.querySelectorAll("[data-clawdbot-labels]");
existing.forEach((el) => el.remove());
})
.catch(() => {});
}
}
export async function setInputFilesViaPlaywright(opts: {
cdpUrl: string;
targetId?: string;

View File

@@ -4,7 +4,11 @@ import type express from "express";
import { ensureMediaDir, saveMediaBuffer } from "../../media/store.js";
import { captureScreenshot, snapshotAria } from "../cdp.js";
import { DEFAULT_AI_SNAPSHOT_MAX_CHARS } from "../constants.js";
import {
DEFAULT_AI_SNAPSHOT_EFFICIENT_DEPTH,
DEFAULT_AI_SNAPSHOT_EFFICIENT_MAX_CHARS,
DEFAULT_AI_SNAPSHOT_MAX_CHARS,
} from "../constants.js";
import {
DEFAULT_BROWSER_SCREENSHOT_MAX_BYTES,
DEFAULT_BROWSER_SCREENSHOT_MAX_SIDE,
@@ -138,14 +142,12 @@ export function registerBrowserAgentSnapshotRoutes(app: express.Express, ctx: Br
const profileCtx = resolveProfileContext(req, res, ctx);
if (!profileCtx) return;
const targetId = typeof req.query.targetId === "string" ? req.query.targetId.trim() : "";
const mode = req.query.mode === "efficient" ? "efficient" : undefined;
const labels = toBoolean(req.query.labels) ?? undefined;
const explicitFormat =
req.query.format === "aria" ? "aria" : req.query.format === "ai" ? "ai" : undefined;
const format =
req.query.format === "aria"
? "aria"
: req.query.format === "ai"
? "ai"
: (await getPwAiModule())
? "ai"
: "aria";
explicitFormat ?? (mode ? "ai" : (await getPwAiModule()) ? "ai" : "aria");
const limitRaw = typeof req.query.limit === "string" ? Number(req.query.limit) : undefined;
const hasMaxChars = Object.hasOwn(req.query, "maxChars");
const maxCharsRaw =
@@ -156,19 +158,34 @@ export function registerBrowserAgentSnapshotRoutes(app: express.Express, ctx: Br
? Math.floor(maxCharsRaw)
: undefined;
const resolvedMaxChars =
format === "ai" ? (hasMaxChars ? maxChars : DEFAULT_AI_SNAPSHOT_MAX_CHARS) : undefined;
const interactive = toBoolean(req.query.interactive);
const compact = toBoolean(req.query.compact);
const depth = toNumber(req.query.depth);
format === "ai"
? hasMaxChars
? maxChars
: mode === "efficient"
? DEFAULT_AI_SNAPSHOT_EFFICIENT_MAX_CHARS
: DEFAULT_AI_SNAPSHOT_MAX_CHARS
: undefined;
const interactiveRaw = toBoolean(req.query.interactive);
const compactRaw = toBoolean(req.query.compact);
const depthRaw = toNumber(req.query.depth);
const interactive = interactiveRaw ?? (mode === "efficient" ? true : undefined);
const compact = compactRaw ?? (mode === "efficient" ? true : undefined);
const depth =
depthRaw ?? (mode === "efficient" ? DEFAULT_AI_SNAPSHOT_EFFICIENT_DEPTH : undefined);
const selector = toStringOrEmpty(req.query.selector);
const frameSelector = toStringOrEmpty(req.query.frame);
try {
const tab = await profileCtx.ensureTabAvailable(targetId || undefined);
if ((labels || mode === "efficient") && format === "aria") {
return jsonError(res, 400, "labels/mode=efficient require format=ai");
}
if (format === "ai") {
const pw = await requirePwAi(res, "ai snapshot");
if (!pw) return;
const wantsRoleSnapshot =
labels === true ||
mode === "efficient" ||
interactive === true ||
compact === true ||
depth !== undefined ||
@@ -210,6 +227,39 @@ export function registerBrowserAgentSnapshotRoutes(app: express.Express, ctx: Br
}
throw err;
});
if (labels) {
const labeled = await pw.screenshotWithLabelsViaPlaywright({
cdpUrl: profileCtx.profile.cdpUrl,
targetId: tab.targetId,
refs: "refs" in snap ? snap.refs : {},
type: "png",
});
const normalized = await normalizeBrowserScreenshot(labeled.buffer, {
maxSide: DEFAULT_BROWSER_SCREENSHOT_MAX_SIDE,
maxBytes: DEFAULT_BROWSER_SCREENSHOT_MAX_BYTES,
});
await ensureMediaDir();
const saved = await saveMediaBuffer(
normalized.buffer,
normalized.contentType ?? "image/png",
"browser",
DEFAULT_BROWSER_SCREENSHOT_MAX_BYTES,
);
const imageType = normalized.contentType?.includes("jpeg") ? "jpeg" : "png";
return res.json({
ok: true,
format,
targetId: tab.targetId,
url: tab.url,
labels: true,
labelsCount: labeled.labels,
labelsSkipped: labeled.skipped,
imagePath: path.resolve(saved.path),
imageType,
...snap,
});
}
return res.json({
ok: true,
format,

View File

@@ -11,6 +11,8 @@ export const browserCoreExamples = [
"clawdbot browser screenshot --ref 12",
"clawdbot browser snapshot",
"clawdbot browser snapshot --format aria --limit 200",
"clawdbot browser snapshot --efficient",
"clawdbot browser snapshot --labels",
];
export const browserActionExamples = [

View File

@@ -48,17 +48,22 @@ export function registerBrowserInspectCommands(
.option("--format <aria|ai>", "Snapshot format (default: ai)", "ai")
.option("--target-id <id>", "CDP target id (or unique prefix)")
.option("--limit <n>", "Max nodes (default: 500/800)", (v: string) => Number(v))
.option("--mode <efficient>", "Snapshot preset (efficient)")
.option("--efficient", "Use the efficient snapshot preset", false)
.option("--interactive", "Role snapshot: interactive elements only", false)
.option("--compact", "Role snapshot: compact output", false)
.option("--depth <n>", "Role snapshot: max depth", (v: string) => Number(v))
.option("--selector <sel>", "Role snapshot: scope to CSS selector")
.option("--frame <sel>", "Role snapshot: scope to an iframe selector")
.option("--labels", "Include viewport label overlay screenshot", false)
.option("--out <path>", "Write snapshot to a file")
.action(async (opts, cmd) => {
const parent = parentOpts(cmd);
const baseUrl = resolveBrowserControlUrl(parent?.url);
const profile = parent?.browserProfile;
const format = opts.format === "aria" ? "aria" : "ai";
const mode =
opts.efficient === true || opts.mode === "efficient" ? "efficient" : undefined;
try {
const result = await browserSnapshot(baseUrl, {
format,
@@ -69,6 +74,8 @@ export function registerBrowserInspectCommands(
depth: Number.isFinite(opts.depth) ? opts.depth : undefined,
selector: opts.selector?.trim() || undefined,
frame: opts.frame?.trim() || undefined,
labels: Boolean(opts.labels) || undefined,
mode,
profile,
});
@@ -81,9 +88,24 @@ export function registerBrowserInspectCommands(
await fs.writeFile(opts.out, payload, "utf8");
}
if (parent?.json) {
defaultRuntime.log(JSON.stringify({ ok: true, out: opts.out }, null, 2));
defaultRuntime.log(
JSON.stringify(
{
ok: true,
out: opts.out,
...(result.format === "ai" && result.imagePath
? { imagePath: result.imagePath }
: {}),
},
null,
2,
),
);
} else {
defaultRuntime.log(opts.out);
if (result.format === "ai" && result.imagePath) {
defaultRuntime.log(`MEDIA:${result.imagePath}`);
}
}
return;
}
@@ -95,6 +117,9 @@ export function registerBrowserInspectCommands(
if (result.format === "ai") {
defaultRuntime.log(result.snapshot);
if (result.imagePath) {
defaultRuntime.log(`MEDIA:${result.imagePath}`);
}
return;
}