feat(browser): add snapshot refs=aria mode

This commit is contained in:
Peter Steinberger
2026-01-15 10:16:33 +00:00
parent 0facc63019
commit 4f1a4ab072
13 changed files with 210 additions and 1 deletions

View File

@@ -19,6 +19,7 @@
- Browser: ship a built-in `chrome` profile for extension relay and start the relay automatically when running locally.
- Browser: default `browser.defaultProfile` to `chrome` (existing Chrome takeover mode).
- Browser: add `clawdbot browser extension install/path` and copy extension path to clipboard.
- Browser: add `snapshot refs=aria` (Playwright aria-ref ids) for self-resolving refs across `snapshot` → `act`.
- Control UI: show raw any-map entries in config views; move Docs link into the left nav.
#### Plugins

View File

@@ -39,6 +39,7 @@ const BROWSER_TARGETS = ["sandbox", "host", "custom"] as const;
// Values accepted by the browser tool's `format` parameter.
const BROWSER_SNAPSHOT_FORMATS = ["aria", "ai"] as const;
// Values accepted by the browser tool's `mode` parameter.
const BROWSER_SNAPSHOT_MODES = ["efficient"] as const;
// Values accepted by the browser tool's `refs` parameter: "role" (the default, role+name-based
// refs) or "aria" (Playwright aria-ref ids).
const BROWSER_SNAPSHOT_REFS = ["role", "aria"] as const;
// Image encodings. NOTE(review): presumably the allowed screenshot types — usage is not visible here.
const BROWSER_IMAGE_TYPES = ["png", "jpeg"] as const;
@@ -91,6 +92,7 @@ export const BrowserToolSchema = Type.Object({
maxChars: Type.Optional(Type.Number()),
mode: optionalStringEnum(BROWSER_SNAPSHOT_MODES),
format: optionalStringEnum(BROWSER_SNAPSHOT_FORMATS),
refs: optionalStringEnum(BROWSER_SNAPSHOT_REFS),
interactive: Type.Optional(Type.Boolean()),
compact: Type.Optional(Type.Boolean()),
depth: Type.Optional(Type.Number()),

View File

@@ -121,6 +121,19 @@ describe("browser tool snapshot maxChars", () => {
expect(browserClientMocks.browserProfiles).toHaveBeenCalledWith("http://127.0.0.1:18791");
});
// The tool layer must forward the caller's `refs` choice to the browser client unchanged.
it("passes refs mode through to browser snapshot", async () => {
  const browserTool = createBrowserTool();

  await browserTool.execute?.(null, { action: "snapshot", format: "ai", refs: "aria" });

  const forwardedOptions = expect.objectContaining({
    format: "ai",
    refs: "aria",
  });
  expect(browserClientMocks.browserSnapshot).toHaveBeenCalledWith(
    "http://127.0.0.1:18791",
    forwardedOptions,
  );
});
});
describe("browser tool snapshot labels", () => {

View File

@@ -128,6 +128,7 @@ export function createBrowserTool(opts?: {
'Profiles: use profile="chrome" for Chrome extension relay takeover (your existing Chrome tabs). Use profile="clawd" for the isolated clawd-managed browser.',
"Chrome extension relay needs an attached tab: user must click the Clawdbot Browser Relay toolbar icon on the tab (badge ON). If no tab is connected, ask them to attach it.",
"When using refs from snapshot (e.g. e12), keep the same tab: prefer passing targetId from the snapshot response into subsequent actions (act/click/type/etc).",
'For stable, self-resolving refs across calls, use snapshot with refs="aria" (Playwright aria-ref ids). Default refs="role" are role+name-based.',
"Use snapshot+act for UI automation. Avoid act:wait by default; use only in exceptional cases when no reliable UI state exists.",
`target selects browser location (sandbox|host|custom). Default: ${targetDefault}.`,
"controlUrl implies target=custom (remote control server).",
@@ -190,6 +191,7 @@ export function createBrowserTool(opts?: {
: "ai";
const mode = params.mode === "efficient" ? "efficient" : undefined;
const labels = typeof params.labels === "boolean" ? params.labels : undefined;
const refs = params.refs === "aria" || params.refs === "role" ? params.refs : undefined;
const hasMaxChars = Object.hasOwn(params, "maxChars");
const targetId = typeof params.targetId === "string" ? params.targetId.trim() : undefined;
const limit =
@@ -224,6 +226,7 @@ export function createBrowserTool(opts?: {
targetId,
limit,
...(typeof resolvedMaxChars === "number" ? { maxChars: resolvedMaxChars } : {}),
refs,
interactive,
compact,
depth,

View File

@@ -118,6 +118,36 @@ describe("browser client", () => {
expect(parsed.searchParams.get("mode")).toBe("efficient");
});
// Verifies that the client serialises `refs` into the snapshot request's query string.
it("adds refs=aria to snapshots when requested", async () => {
  const requestedUrls: string[] = [];

  // Fake fetch: records every requested URL and answers with a minimal successful snapshot body.
  const fakeFetch = vi.fn(async (url: string) => {
    requestedUrls.push(url);
    const response = {
      ok: true,
      json: async () => ({
        ok: true,
        format: "ai",
        targetId: "t1",
        url: "https://x",
        snapshot: "ok",
      }),
    };
    return response as unknown as Response;
  });
  vi.stubGlobal("fetch", fakeFetch);

  await browserSnapshot("http://127.0.0.1:18791", {
    format: "ai",
    refs: "aria",
  });

  const snapshotUrl = requestedUrls.find((candidate) => candidate.includes("/snapshot?"));
  expect(snapshotUrl).toBeTruthy();

  const { searchParams } = new URL(snapshotUrl as string);
  expect(searchParams.get("refs")).toBe("aria");
});
it("uses the expected endpoints + methods for common calls", async () => {
const calls: Array<{ url: string; init?: RequestInit }> = [];

View File

@@ -270,6 +270,7 @@ export async function browserSnapshot(
targetId?: string;
limit?: number;
maxChars?: number;
refs?: "role" | "aria";
interactive?: boolean;
compact?: boolean;
depth?: number;
@@ -287,6 +288,7 @@ export async function browserSnapshot(
if (typeof opts.maxChars === "number" && Number.isFinite(opts.maxChars)) {
q.set("maxChars", String(opts.maxChars));
}
if (opts.refs === "aria" || opts.refs === "role") q.set("refs", opts.refs);
if (typeof opts.interactive === "boolean") q.set("interactive", String(opts.interactive));
if (typeof opts.compact === "boolean") q.set("compact", String(opts.compact));
if (typeof opts.depth === "number" && Number.isFinite(opts.depth))

View File

@@ -1,6 +1,7 @@
import { describe, expect, it } from "vitest";
import {
buildRoleSnapshotFromAiSnapshot,
buildRoleSnapshotFromAriaSnapshot,
getRoleSnapshotStats,
parseRoleRef,
@@ -67,4 +68,24 @@ describe("pw-role-snapshot", () => {
expect(parseRoleRef("12")).toBeNull();
expect(parseRoleRef("")).toBeNull();
});
// Interactive mode should keep only interactive nodes and key them by Playwright's own ref ids.
it("preserves Playwright aria-ref ids in ai snapshots", () => {
  const aiLines = [
    '- navigation [ref=e1]:',
    ' - link "Home" [ref=e5]',
    ' - heading "Title" [ref=e6]',
    ' - button "Save" [ref=e7] [cursor=pointer]:',
    " - paragraph: hello",
  ];

  const { snapshot, refs } = buildRoleSnapshotFromAiSnapshot(aiLines.join("\n"), {
    interactive: true,
  });

  // Interactive nodes survive with their original ref ids...
  for (const kept of ['[ref=e5]', '- link "Home"', '- button "Save"']) {
    expect(snapshot).toContain(kept);
  }
  // ...while structural/content nodes are filtered out.
  for (const dropped of ["navigation", "heading"]) {
    expect(snapshot).not.toContain(dropped);
  }

  expect(Object.keys(refs).sort()).toEqual(["e5", "e7"]);
  expect(refs.e5).toMatchObject({ role: "link", name: "Home" });
  expect(refs.e7).toMatchObject({ role: "button", name: "Save" });
});
});

View File

@@ -293,3 +293,75 @@ export function buildRoleSnapshotFromAriaSnapshot(
refs,
};
}
/**
 * Extract the first `[ref=eN]` id from the trailing part of an AI snapshot line.
 *
 * @param suffix - Text following the role (and optional quoted name) on a snapshot line.
 * @returns The ref id as written in the input (e.g. "e13"), or `null` when no ref is present.
 */
function parseAiSnapshotRef(suffix: string): string | null {
  const found = /\[ref=(e\d+)\]/i.exec(suffix);
  if (found === null) return null;
  return found[1];
}
/**
 * Matches one node line of an AI snapshot: list marker, role, optional quoted name, and the
 * remaining suffix (attributes such as `[ref=e5]`, a trailing `:` and any inline text).
 */
const AI_SNAPSHOT_NODE_LINE = /^(\s*-\s*)(\w+)(?:\s+"([^"]*)")?(.*)$/;

/**
 * Parse a single AI snapshot line into its role, name, suffix and ref id.
 *
 * The role is captured with `\w+`, so it can never start with "/"; lines such as `- /url: ...`
 * fail the pattern as a whole and are reported as `null` like any other non-node line.
 *
 * @returns The parsed parts, or `null` when the line is not a node line.
 */
function parseAiSnapshotNodeLine(line: string): {
  roleRaw: string;
  role: string;
  name: string | undefined;
  suffix: string;
  ref: string | null;
} | null {
  const match = AI_SNAPSHOT_NODE_LINE.exec(line);
  if (!match) return null;
  const roleRaw = match[2] ?? "";
  // An empty quoted name ("") is treated the same as a missing name.
  const name = match[3] || undefined;
  const suffix = match[4] ?? "";
  return { roleRaw, role: roleRaw.toLowerCase(), name, suffix, ref: parseAiSnapshotRef(suffix) };
}

/**
 * Build a role snapshot from Playwright's AI snapshot output while preserving Playwright's own
 * aria-ref ids (e.g. ref=e13). This makes the refs self-resolving across calls.
 *
 * Behaviour by option:
 * - `maxDepth`: lines whose `getIndentLevel` exceeds it are skipped in both modes.
 * - `interactive`: emits a flat list (`- role "name" suffix`, original indentation dropped)
 *   containing only nodes whose role is in `INTERACTIVE_ROLES` and that carry a ref; lines that
 *   are not node lines are dropped. An empty result becomes "(no interactive elements)".
 * - otherwise: lines are kept verbatim (including lines that are not node lines) and every ref
 *   found is recorded. With `compact`, unnamed nodes whose role is in `STRUCTURAL_ROLES` are
 *   dropped and the result is passed through `compactTree`. An empty result becomes "(empty)".
 *
 * @param aiSnapshot - Raw AI snapshot text; nullish input is treated as an empty string.
 * @param options - Filtering options (`interactive`, `compact`, `maxDepth`).
 * @returns The rendered snapshot text and a map from ref id to `{ role, name? }`.
 */
export function buildRoleSnapshotFromAiSnapshot(
  aiSnapshot: string,
  options: RoleSnapshotOptions = {},
): { snapshot: string; refs: RoleRefMap } {
  const lines = String(aiSnapshot ?? "").split("\n");
  const refs: RoleRefMap = {};
  const out: string[] = [];

  if (options.interactive) {
    for (const line of lines) {
      const depth = getIndentLevel(line);
      if (options.maxDepth !== undefined && depth > options.maxDepth) continue;

      const node = parseAiSnapshotNodeLine(line);
      if (!node) continue;
      if (!INTERACTIVE_ROLES.has(node.role)) continue;
      // Without a ref the element cannot be addressed later, so it is not listed.
      if (!node.ref) continue;

      refs[node.ref] = { role: node.role, ...(node.name ? { name: node.name } : {}) };
      out.push(`- ${node.roleRaw}${node.name ? ` "${node.name}"` : ""}${node.suffix}`);
    }

    return {
      snapshot: out.join("\n") || "(no interactive elements)",
      refs,
    };
  }

  for (const line of lines) {
    const depth = getIndentLevel(line);
    if (options.maxDepth !== undefined && depth > options.maxDepth) continue;

    const node = parseAiSnapshotNodeLine(line);
    if (node) {
      // Compact mode drops unnamed structural wrappers before recording any ref they carry.
      if (options.compact && !node.name && STRUCTURAL_ROLES.has(node.role)) continue;
      if (node.ref) {
        refs[node.ref] = { role: node.role, ...(node.name ? { name: node.name } : {}) };
      }
    }
    out.push(line);
  }

  const tree = out.join("\n") || "(empty)";
  return {
    snapshot: options.compact ? compactTree(tree) : tree,
    refs,
  };
}

View File

@@ -62,6 +62,16 @@ describe("pw-session refLocator", () => {
expect(mocks.getByRole).toHaveBeenCalled();
});
// With the page state in "aria" mode, refs must resolve via Playwright's `aria-ref=` selector.
it("uses aria-ref locators when refs mode is aria", () => {
  const fake = fakePage();
  const pageState = ensurePageState(fake.page);
  pageState.roleRefsMode = "aria";

  refLocator(fake.page, "e1");

  expect(fake.mocks.locator).toHaveBeenCalledWith("aria-ref=e1");
});
});
describe("pw-session role refs cache", () => {

View File

@@ -64,9 +64,11 @@ type PageState = {
armIdDownload: number;
/**
* Role-based refs from the last role snapshot (e.g. e1/e2).
* These refs are NOT Playwright's `aria-ref` values.
* Mode "role" refs are generated from ariaSnapshot and resolved via getByRole.
* Mode "aria" refs are Playwright aria-ref ids and resolved via `aria-ref=...`.
*/
roleRefs?: Record<string, { role: string; name?: string; nth?: number }>;
roleRefsMode?: "role" | "aria";
roleRefsFrameSelector?: string;
};
@@ -74,6 +76,7 @@ type RoleRefs = NonNullable<PageState["roleRefs"]>;
/**
 * Role-ref state remembered per target so it can be copied back onto a page's state later.
 * NOTE(review): presumably the value type of `roleRefsByTarget` — its declaration is not visible here.
 */
type RoleRefsCacheEntry = {
// Ref id -> element description captured by the last snapshot.
refs: RoleRefs;
// Frame selector the snapshot was scoped to, when one was supplied.
frameSelector?: string;
// How the refs are resolved: "role" or "aria"; omitted when the caller did not pass a mode.
mode?: NonNullable<PageState["roleRefsMode"]>;
};
type ContextState = {
@@ -110,12 +113,14 @@ export function rememberRoleRefsForTarget(opts: {
targetId: string;
refs: RoleRefs;
frameSelector?: string;
mode?: NonNullable<PageState["roleRefsMode"]>;
}): void {
const targetId = opts.targetId.trim();
if (!targetId) return;
roleRefsByTarget.set(roleRefsKey(opts.cdpUrl, targetId), {
refs: opts.refs,
...(opts.frameSelector ? { frameSelector: opts.frameSelector } : {}),
...(opts.mode ? { mode: opts.mode } : {}),
});
while (roleRefsByTarget.size > MAX_ROLE_REFS_CACHE) {
const first = roleRefsByTarget.keys().next();
@@ -137,6 +142,7 @@ export function restoreRoleRefsForTarget(opts: {
if (state.roleRefs) return;
state.roleRefs = cached.refs;
state.roleRefsFrameSelector = cached.frameSelector;
state.roleRefsMode = cached.mode;
}
export function ensurePageState(page: Page): PageState {
@@ -339,6 +345,12 @@ export function refLocator(page: Page, ref: string) {
if (/^e\d+$/.test(normalized)) {
const state = pageStates.get(page);
if (state?.roleRefsMode === "aria") {
const scope = state.roleRefsFrameSelector
? page.frameLocator(state.roleRefsFrameSelector)
: page;
return scope.locator(`aria-ref=${normalized}`);
}
const info = state?.roleRefs?.[normalized];
if (!info) {
throw new Error(

View File

@@ -265,6 +265,7 @@ export async function scrollIntoViewViaPlaywright(opts: {
}): Promise<void> {
const page = await getPageForTargetId(opts);
ensurePageState(page);
restoreRoleRefsForTarget({ cdpUrl: opts.cdpUrl, targetId: opts.targetId, page });
const timeout = normalizeTimeoutMs(opts.timeoutMs, 20_000);
const ref = requireRef(opts.ref);
@@ -340,6 +341,7 @@ export async function takeScreenshotViaPlaywright(opts: {
}): Promise<{ buffer: Buffer }> {
const page = await getPageForTargetId(opts);
ensurePageState(page);
restoreRoleRefsForTarget({ cdpUrl: opts.cdpUrl, targetId: opts.targetId, page });
const type = opts.type ?? "png";
if (opts.ref) {
if (opts.fullPage) throw new Error("fullPage is not supported for element screenshots");
@@ -369,6 +371,7 @@ export async function screenshotWithLabelsViaPlaywright(opts: {
}): Promise<{ buffer: Buffer; labels: number; skipped: number }> {
const page = await getPageForTargetId(opts);
ensurePageState(page);
restoreRoleRefsForTarget({ cdpUrl: opts.cdpUrl, targetId: opts.targetId, page });
const type = opts.type ?? "png";
const maxLabels =
typeof opts.maxLabels === "number" && Number.isFinite(opts.maxLabels)
@@ -495,6 +498,7 @@ export async function setInputFilesViaPlaywright(opts: {
}): Promise<void> {
const page = await getPageForTargetId(opts);
ensurePageState(page);
restoreRoleRefsForTarget({ cdpUrl: opts.cdpUrl, targetId: opts.targetId, page });
if (!opts.paths.length) throw new Error("paths are required");
const inputRef = typeof opts.inputRef === "string" ? opts.inputRef.trim() : "";
const element = typeof opts.element === "string" ? opts.element.trim() : "";

View File

@@ -2,6 +2,7 @@ import type { Page } from "playwright-core";
import { type AriaSnapshotNode, formatAriaSnapshot, type RawAXNode } from "./cdp.js";
import {
buildRoleSnapshotFromAiSnapshot,
buildRoleSnapshotFromAriaSnapshot,
getRoleSnapshotStats,
type RoleSnapshotOptions,
@@ -76,6 +77,7 @@ export async function snapshotRoleViaPlaywright(opts: {
targetId?: string;
selector?: string;
frameSelector?: string;
refsMode?: "role" | "aria";
options?: RoleSnapshotOptions;
}): Promise<{
snapshot: string;
@@ -88,6 +90,37 @@ export async function snapshotRoleViaPlaywright(opts: {
});
const state = ensurePageState(page);
if (opts.refsMode === "aria") {
if (opts.selector?.trim() || opts.frameSelector?.trim()) {
throw new Error("refs=aria does not support selector/frame snapshots yet.");
}
const maybe = page as unknown as WithSnapshotForAI;
if (!maybe._snapshotForAI) {
throw new Error("refs=aria requires Playwright _snapshotForAI support.");
}
const result = await maybe._snapshotForAI({
timeout: 5000,
track: "response",
});
const built = buildRoleSnapshotFromAiSnapshot(String(result?.full ?? ""), opts.options);
state.roleRefs = built.refs;
state.roleRefsFrameSelector = undefined;
state.roleRefsMode = "aria";
if (opts.targetId) {
rememberRoleRefsForTarget({
cdpUrl: opts.cdpUrl,
targetId: opts.targetId,
refs: built.refs,
mode: "aria",
});
}
return {
snapshot: built.snapshot,
refs: built.refs,
stats: getRoleSnapshotStats(built.snapshot, built.refs),
};
}
const frameSelector = opts.frameSelector?.trim() || "";
const selector = opts.selector?.trim() || "";
const locator = frameSelector
@@ -102,12 +135,14 @@ export async function snapshotRoleViaPlaywright(opts: {
const built = buildRoleSnapshotFromAriaSnapshot(String(ariaSnapshot ?? ""), opts.options);
state.roleRefs = built.refs;
state.roleRefsFrameSelector = frameSelector || undefined;
state.roleRefsMode = "role";
if (opts.targetId) {
rememberRoleRefsForTarget({
cdpUrl: opts.cdpUrl,
targetId: opts.targetId,
refs: built.refs,
frameSelector: frameSelector || undefined,
mode: "role",
});
}
return {

View File

@@ -169,6 +169,8 @@ export function registerBrowserAgentSnapshotRoutes(app: express.Express, ctx: Br
const interactiveRaw = toBoolean(req.query.interactive);
const compactRaw = toBoolean(req.query.compact);
const depthRaw = toNumber(req.query.depth);
const refsModeRaw = toStringOrEmpty(req.query.refs).trim();
const refsMode = refsModeRaw === "aria" ? "aria" : refsModeRaw === "role" ? "role" : undefined;
const interactive = interactiveRaw ?? (mode === "efficient" ? true : undefined);
const compact = compactRaw ?? (mode === "efficient" ? true : undefined);
const depth =
@@ -199,6 +201,7 @@ export function registerBrowserAgentSnapshotRoutes(app: express.Express, ctx: Br
targetId: tab.targetId,
selector: selector.trim() || undefined,
frameSelector: frameSelector.trim() || undefined,
refsMode,
options: {
interactive: interactive ?? undefined,
compact: compact ?? undefined,
@@ -219,6 +222,7 @@ export function registerBrowserAgentSnapshotRoutes(app: express.Express, ctx: Br
targetId: tab.targetId,
selector: selector.trim() || undefined,
frameSelector: frameSelector.trim() || undefined,
refsMode,
options: {
interactive: interactive ?? undefined,
compact: compact ?? undefined,