feat: add browser snapshot modes

This commit is contained in:
Peter Steinberger
2026-01-15 03:50:48 +00:00
parent 4e48d0a431
commit a6e780b2f6
18 changed files with 430 additions and 141 deletions

View File

@@ -37,6 +37,7 @@ const BROWSER_TOOL_ACTIONS = [
// Browser target identifiers. NOTE(review): the consumer of this list is outside this excerpt.
const BROWSER_TARGETS = ["sandbox", "host", "custom"] as const;
// Allowed values for the optional `format` field of BrowserToolSchema.
const BROWSER_SNAPSHOT_FORMATS = ["aria", "ai"] as const;
// Allowed values for the optional `mode` field of BrowserToolSchema (snapshot presets).
const BROWSER_SNAPSHOT_MODES = ["efficient"] as const;
// Image encodings. NOTE(review): presumably for screenshot output; usage is not visible here — confirm.
const BROWSER_IMAGE_TYPES = ["png", "jpeg"] as const;
@@ -87,12 +88,14 @@ export const BrowserToolSchema = Type.Object({
targetId: Type.Optional(Type.String()),
limit: Type.Optional(Type.Number()),
maxChars: Type.Optional(Type.Number()),
mode: optionalStringEnum(BROWSER_SNAPSHOT_MODES),
format: optionalStringEnum(BROWSER_SNAPSHOT_FORMATS),
interactive: Type.Optional(Type.Boolean()),
compact: Type.Optional(Type.Boolean()),
depth: Type.Optional(Type.Number()),
selector: Type.Optional(Type.String()),
frame: Type.Optional(Type.String()),
labels: Type.Optional(Type.Boolean()),
fullPage: Type.Optional(Type.Boolean()),
ref: Type.Optional(Type.String()),
element: Type.Optional(Type.String()),

View File

@@ -182,6 +182,8 @@ export function createBrowserTool(opts?: {
params.format === "ai" || params.format === "aria"
? (params.format as "ai" | "aria")
: "ai";
const mode = params.mode === "efficient" ? "efficient" : undefined;
const labels = typeof params.labels === "boolean" ? params.labels : undefined;
const hasMaxChars = Object.hasOwn(params, "maxChars");
const targetId = typeof params.targetId === "string" ? params.targetId.trim() : undefined;
const limit =
@@ -195,7 +197,13 @@ export function createBrowserTool(opts?: {
? Math.floor(params.maxChars)
: undefined;
const resolvedMaxChars =
format === "ai" ? (hasMaxChars ? maxChars : DEFAULT_AI_SNAPSHOT_MAX_CHARS) : undefined;
format === "ai"
? hasMaxChars
? maxChars
: mode === "efficient"
? undefined
: DEFAULT_AI_SNAPSHOT_MAX_CHARS
: undefined;
const interactive =
typeof params.interactive === "boolean" ? params.interactive : undefined;
const compact = typeof params.compact === "boolean" ? params.compact : undefined;
@@ -215,9 +223,19 @@ export function createBrowserTool(opts?: {
depth,
selector,
frame,
labels,
mode,
profile,
});
if (snapshot.format === "ai") {
if (labels && snapshot.imagePath) {
return await imageResultFromFile({
label: "browser:snapshot",
path: snapshot.imagePath,
extraText: snapshot.snapshot,
details: snapshot,
});
}
return {
content: [{ type: "text", text: snapshot.snapshot }],
details: snapshot,

View File

@@ -79,6 +79,11 @@ export type SnapshotResult =
refs: number;
interactive: number;
};
labels?: boolean;
labelsCount?: number;
labelsSkipped?: number;
imagePath?: string;
imageType?: "png" | "jpeg";
};
export function resolveBrowserControlUrl(overrideUrl?: string) {
@@ -264,6 +269,8 @@ export async function browserSnapshot(
depth?: number;
selector?: string;
frame?: string;
labels?: boolean;
mode?: "efficient";
profile?: string;
},
): Promise<SnapshotResult> {
@@ -280,6 +287,8 @@ export async function browserSnapshot(
q.set("depth", String(opts.depth));
if (opts.selector?.trim()) q.set("selector", opts.selector.trim());
if (opts.frame?.trim()) q.set("frame", opts.frame.trim());
if (opts.labels === true) q.set("labels", "1");
if (opts.mode) q.set("mode", opts.mode);
if (opts.profile) q.set("profile", opts.profile);
return await fetchBrowserJson<SnapshotResult>(`${baseUrl}/snapshot?${q.toString()}`, {
timeoutMs: 20000,

View File

@@ -3,3 +3,5 @@ export const DEFAULT_CLAWD_BROWSER_CONTROL_URL = "http://127.0.0.1:18791";
export const DEFAULT_CLAWD_BROWSER_COLOR = "#FF4500";
export const DEFAULT_CLAWD_BROWSER_PROFILE_NAME = "clawd";
/** `maxChars` applied to "ai" snapshots when the caller does not pass `maxChars`. */
export const DEFAULT_AI_SNAPSHOT_MAX_CHARS = 80_000;
/** `maxChars` applied by the snapshot route to "ai" snapshots in "efficient" mode when `maxChars` is not passed. */
export const DEFAULT_AI_SNAPSHOT_EFFICIENT_MAX_CHARS = 10_000;
/** `depth` applied by the snapshot route in "efficient" mode when the request carries no `depth`. */
export const DEFAULT_AI_SNAPSHOT_EFFICIENT_DEPTH = 6;

View File

@@ -42,6 +42,7 @@ export {
setTimezoneViaPlaywright,
snapshotAiViaPlaywright,
snapshotRoleViaPlaywright,
screenshotWithLabelsViaPlaywright,
storageClearViaPlaywright,
storageGetViaPlaywright,
storageSetViaPlaywright,

View File

@@ -347,6 +347,132 @@ export async function takeScreenshotViaPlaywright(opts: {
return { buffer };
}
/**
 * Takes a viewport screenshot with an outlined, labelled box drawn over every
 * ref that is currently on screen.
 *
 * A temporary overlay (elements tagged `data-clawdbot-labels`) is injected into
 * the page, the screenshot is taken, and the overlay is removed again even if
 * the screenshot fails.
 *
 * @param opts.cdpUrl    Forwarded to `getPageForTargetId` to locate the page.
 * @param opts.targetId  Forwarded to `getPageForTargetId` to locate the page.
 * @param opts.refs      Map keyed by ref id; only the keys are used here.
 * @param opts.maxLabels Upper bound on drawn labels (floored, minimum 1,
 *                       default 150). Refs beyond the bound count as skipped.
 * @param opts.type      Screenshot encoding, default "png".
 * @returns The screenshot buffer, the number of labels drawn, and the number of
 *          refs skipped (no bounding box, off screen, lookup failure, or over
 *          the `maxLabels` bound).
 */
export async function screenshotWithLabelsViaPlaywright(opts: {
  cdpUrl: string;
  targetId?: string;
  refs: Record<string, { role: string; name?: string; nth?: number }>;
  maxLabels?: number;
  type?: "png" | "jpeg";
}): Promise<{ buffer: Buffer; labels: number; skipped: number }> {
  const page = await getPageForTargetId(opts);
  ensurePageState(page);
  const type = opts.type ?? "png";
  const maxLabels =
    typeof opts.maxLabels === "number" && Number.isFinite(opts.maxLabels)
      ? Math.max(1, Math.floor(opts.maxLabels))
      : 150;

  // Only the viewport size is needed: Playwright's boundingBox() is already
  // relative to the main-frame viewport (like getBoundingClientRect), so the
  // scroll offset must NOT be applied to it.
  const viewport = await page.evaluate(() => ({
    width: window.innerWidth || 0,
    height: window.innerHeight || 0,
  }));

  const refs = Object.keys(opts.refs ?? {});
  const boxes: Array<{ ref: string; x: number; y: number; w: number; h: number }> = [];
  let skipped = 0;

  for (const ref of refs) {
    if (boxes.length >= maxLabels) {
      skipped += 1;
      continue;
    }
    try {
      const box = await refLocator(page, ref).boundingBox();
      if (!box) {
        // boundingBox() yields null when the element is not visible.
        skipped += 1;
        continue;
      }
      const x0 = box.x;
      const y0 = box.y;
      const x1 = box.x + box.width;
      const y1 = box.y + box.height;
      // Skip anything entirely outside the visible viewport rectangle.
      if (x1 < 0 || x0 > viewport.width || y1 < 0 || y0 > viewport.height) {
        skipped += 1;
        continue;
      }
      boxes.push({
        ref,
        x: x0,
        y: y0,
        w: Math.max(1, box.width),
        h: Math.max(1, box.height),
      });
    } catch {
      // Best effort: a ref that cannot be resolved is counted, not fatal.
      skipped += 1;
    }
  }

  try {
    if (boxes.length > 0) {
      await page.evaluate((labels) => {
        // Drop any overlay left behind by an earlier, interrupted run.
        const existing = document.querySelectorAll("[data-clawdbot-labels]");
        existing.forEach((el) => el.remove());

        // Fixed-position root at the viewport origin, so the absolutely
        // positioned children below use viewport coordinates.
        const root = document.createElement("div");
        root.setAttribute("data-clawdbot-labels", "1");
        root.style.position = "fixed";
        root.style.left = "0";
        root.style.top = "0";
        root.style.zIndex = "2147483647";
        root.style.pointerEvents = "none";
        root.style.fontFamily =
          '"SF Mono","SFMono-Regular",Menlo,Monaco,Consolas,"Liberation Mono","Courier New",monospace';

        const clamp = (value: number, min: number, max: number) =>
          Math.min(max, Math.max(min, value));

        for (const label of labels) {
          const box = document.createElement("div");
          box.setAttribute("data-clawdbot-labels", "1");
          box.style.position = "absolute";
          box.style.left = `${label.x}px`;
          box.style.top = `${label.y}px`;
          box.style.width = `${label.w}px`;
          box.style.height = `${label.h}px`;
          box.style.border = "2px solid #ffb020";
          box.style.boxSizing = "border-box";

          const tag = document.createElement("div");
          tag.setAttribute("data-clawdbot-labels", "1");
          tag.textContent = label.ref;
          tag.style.position = "absolute";
          tag.style.left = `${label.x}px`;
          // Place the tag just above the box, but never above the top edge.
          tag.style.top = `${clamp(label.y - 18, 0, 20000)}px`;
          tag.style.background = "#ffb020";
          tag.style.color = "#1a1a1a";
          tag.style.fontSize = "12px";
          tag.style.lineHeight = "14px";
          tag.style.padding = "1px 4px";
          tag.style.borderRadius = "3px";
          tag.style.boxShadow = "0 1px 2px rgba(0,0,0,0.35)";
          tag.style.whiteSpace = "nowrap";

          root.appendChild(box);
          root.appendChild(tag);
        }
        document.documentElement.appendChild(root);
      }, boxes);
    }
    const buffer = await page.screenshot({ type });
    return { buffer, labels: boxes.length, skipped };
  } finally {
    // Always remove the overlay; cleanup failure (e.g. the page navigated or
    // closed) is deliberately ignored so it cannot mask the real result/error.
    await page
      .evaluate(() => {
        const existing = document.querySelectorAll("[data-clawdbot-labels]");
        existing.forEach((el) => el.remove());
      })
      .catch(() => {});
  }
}
export async function setInputFilesViaPlaywright(opts: {
cdpUrl: string;
targetId?: string;

View File

@@ -4,7 +4,11 @@ import type express from "express";
import { ensureMediaDir, saveMediaBuffer } from "../../media/store.js";
import { captureScreenshot, snapshotAria } from "../cdp.js";
import { DEFAULT_AI_SNAPSHOT_MAX_CHARS } from "../constants.js";
import {
DEFAULT_AI_SNAPSHOT_EFFICIENT_DEPTH,
DEFAULT_AI_SNAPSHOT_EFFICIENT_MAX_CHARS,
DEFAULT_AI_SNAPSHOT_MAX_CHARS,
} from "../constants.js";
import {
DEFAULT_BROWSER_SCREENSHOT_MAX_BYTES,
DEFAULT_BROWSER_SCREENSHOT_MAX_SIDE,
@@ -138,14 +142,12 @@ export function registerBrowserAgentSnapshotRoutes(app: express.Express, ctx: Br
const profileCtx = resolveProfileContext(req, res, ctx);
if (!profileCtx) return;
const targetId = typeof req.query.targetId === "string" ? req.query.targetId.trim() : "";
const mode = req.query.mode === "efficient" ? "efficient" : undefined;
const labels = toBoolean(req.query.labels) ?? undefined;
const explicitFormat =
req.query.format === "aria" ? "aria" : req.query.format === "ai" ? "ai" : undefined;
const format =
req.query.format === "aria"
? "aria"
: req.query.format === "ai"
? "ai"
: (await getPwAiModule())
? "ai"
: "aria";
explicitFormat ?? (mode ? "ai" : (await getPwAiModule()) ? "ai" : "aria");
const limitRaw = typeof req.query.limit === "string" ? Number(req.query.limit) : undefined;
const hasMaxChars = Object.hasOwn(req.query, "maxChars");
const maxCharsRaw =
@@ -156,19 +158,34 @@ export function registerBrowserAgentSnapshotRoutes(app: express.Express, ctx: Br
? Math.floor(maxCharsRaw)
: undefined;
const resolvedMaxChars =
format === "ai" ? (hasMaxChars ? maxChars : DEFAULT_AI_SNAPSHOT_MAX_CHARS) : undefined;
const interactive = toBoolean(req.query.interactive);
const compact = toBoolean(req.query.compact);
const depth = toNumber(req.query.depth);
format === "ai"
? hasMaxChars
? maxChars
: mode === "efficient"
? DEFAULT_AI_SNAPSHOT_EFFICIENT_MAX_CHARS
: DEFAULT_AI_SNAPSHOT_MAX_CHARS
: undefined;
const interactiveRaw = toBoolean(req.query.interactive);
const compactRaw = toBoolean(req.query.compact);
const depthRaw = toNumber(req.query.depth);
const interactive = interactiveRaw ?? (mode === "efficient" ? true : undefined);
const compact = compactRaw ?? (mode === "efficient" ? true : undefined);
const depth =
depthRaw ?? (mode === "efficient" ? DEFAULT_AI_SNAPSHOT_EFFICIENT_DEPTH : undefined);
const selector = toStringOrEmpty(req.query.selector);
const frameSelector = toStringOrEmpty(req.query.frame);
try {
const tab = await profileCtx.ensureTabAvailable(targetId || undefined);
if ((labels || mode === "efficient") && format === "aria") {
return jsonError(res, 400, "labels/mode=efficient require format=ai");
}
if (format === "ai") {
const pw = await requirePwAi(res, "ai snapshot");
if (!pw) return;
const wantsRoleSnapshot =
labels === true ||
mode === "efficient" ||
interactive === true ||
compact === true ||
depth !== undefined ||
@@ -210,6 +227,39 @@ export function registerBrowserAgentSnapshotRoutes(app: express.Express, ctx: Br
}
throw err;
});
if (labels) {
const labeled = await pw.screenshotWithLabelsViaPlaywright({
cdpUrl: profileCtx.profile.cdpUrl,
targetId: tab.targetId,
refs: "refs" in snap ? snap.refs : {},
type: "png",
});
const normalized = await normalizeBrowserScreenshot(labeled.buffer, {
maxSide: DEFAULT_BROWSER_SCREENSHOT_MAX_SIDE,
maxBytes: DEFAULT_BROWSER_SCREENSHOT_MAX_BYTES,
});
await ensureMediaDir();
const saved = await saveMediaBuffer(
normalized.buffer,
normalized.contentType ?? "image/png",
"browser",
DEFAULT_BROWSER_SCREENSHOT_MAX_BYTES,
);
const imageType = normalized.contentType?.includes("jpeg") ? "jpeg" : "png";
return res.json({
ok: true,
format,
targetId: tab.targetId,
url: tab.url,
labels: true,
labelsCount: labeled.labels,
labelsSkipped: labeled.skipped,
imagePath: path.resolve(saved.path),
imageType,
...snap,
});
}
return res.json({
ok: true,
format,

View File

@@ -11,6 +11,8 @@ export const browserCoreExamples = [
"clawdbot browser screenshot --ref 12",
"clawdbot browser snapshot",
"clawdbot browser snapshot --format aria --limit 200",
"clawdbot browser snapshot --efficient",
"clawdbot browser snapshot --labels",
];
export const browserActionExamples = [

View File

@@ -48,17 +48,22 @@ export function registerBrowserInspectCommands(
.option("--format <aria|ai>", "Snapshot format (default: ai)", "ai")
.option("--target-id <id>", "CDP target id (or unique prefix)")
.option("--limit <n>", "Max nodes (default: 500/800)", (v: string) => Number(v))
.option("--mode <efficient>", "Snapshot preset (efficient)")
.option("--efficient", "Use the efficient snapshot preset", false)
.option("--interactive", "Role snapshot: interactive elements only", false)
.option("--compact", "Role snapshot: compact output", false)
.option("--depth <n>", "Role snapshot: max depth", (v: string) => Number(v))
.option("--selector <sel>", "Role snapshot: scope to CSS selector")
.option("--frame <sel>", "Role snapshot: scope to an iframe selector")
.option("--labels", "Include viewport label overlay screenshot", false)
.option("--out <path>", "Write snapshot to a file")
.action(async (opts, cmd) => {
const parent = parentOpts(cmd);
const baseUrl = resolveBrowserControlUrl(parent?.url);
const profile = parent?.browserProfile;
const format = opts.format === "aria" ? "aria" : "ai";
const mode =
opts.efficient === true || opts.mode === "efficient" ? "efficient" : undefined;
try {
const result = await browserSnapshot(baseUrl, {
format,
@@ -69,6 +74,8 @@ export function registerBrowserInspectCommands(
depth: Number.isFinite(opts.depth) ? opts.depth : undefined,
selector: opts.selector?.trim() || undefined,
frame: opts.frame?.trim() || undefined,
labels: Boolean(opts.labels) || undefined,
mode,
profile,
});
@@ -81,9 +88,24 @@ export function registerBrowserInspectCommands(
await fs.writeFile(opts.out, payload, "utf8");
}
if (parent?.json) {
defaultRuntime.log(JSON.stringify({ ok: true, out: opts.out }, null, 2));
defaultRuntime.log(
JSON.stringify(
{
ok: true,
out: opts.out,
...(result.format === "ai" && result.imagePath
? { imagePath: result.imagePath }
: {}),
},
null,
2,
),
);
} else {
defaultRuntime.log(opts.out);
if (result.format === "ai" && result.imagePath) {
defaultRuntime.log(`MEDIA:${result.imagePath}`);
}
}
return;
}
@@ -95,6 +117,9 @@ export function registerBrowserInspectCommands(
if (result.format === "ai") {
defaultRuntime.log(result.snapshot);
if (result.imagePath) {
defaultRuntime.log(`MEDIA:${result.imagePath}`);
}
return;
}