feat(browser): add snapshot refs=aria mode

This commit is contained in:
Peter Steinberger
2026-01-15 10:16:33 +00:00
parent 0facc63019
commit 4f1a4ab072
13 changed files with 210 additions and 1 deletions

View File

@@ -19,6 +19,7 @@
- Browser: ship a built-in `chrome` profile for extension relay and start the relay automatically when running locally. - Browser: ship a built-in `chrome` profile for extension relay and start the relay automatically when running locally.
- Browser: default `browser.defaultProfile` to `chrome` (existing Chrome takeover mode). - Browser: default `browser.defaultProfile` to `chrome` (existing Chrome takeover mode).
- Browser: add `clawdbot browser extension install/path` and copy extension path to clipboard. - Browser: add `clawdbot browser extension install/path` and copy extension path to clipboard.
- Browser: add `snapshot refs=aria` (Playwright aria-ref ids) for self-resolving refs across `snapshot` → `act`.
- Control UI: show raw any-map entries in config views; move Docs link into the left nav. - Control UI: show raw any-map entries in config views; move Docs link into the left nav.
#### Plugins #### Plugins

View File

@@ -39,6 +39,7 @@ const BROWSER_TARGETS = ["sandbox", "host", "custom"] as const;
const BROWSER_SNAPSHOT_FORMATS = ["aria", "ai"] as const; const BROWSER_SNAPSHOT_FORMATS = ["aria", "ai"] as const;
const BROWSER_SNAPSHOT_MODES = ["efficient"] as const; const BROWSER_SNAPSHOT_MODES = ["efficient"] as const;
const BROWSER_SNAPSHOT_REFS = ["role", "aria"] as const;
const BROWSER_IMAGE_TYPES = ["png", "jpeg"] as const; const BROWSER_IMAGE_TYPES = ["png", "jpeg"] as const;
@@ -91,6 +92,7 @@ export const BrowserToolSchema = Type.Object({
maxChars: Type.Optional(Type.Number()), maxChars: Type.Optional(Type.Number()),
mode: optionalStringEnum(BROWSER_SNAPSHOT_MODES), mode: optionalStringEnum(BROWSER_SNAPSHOT_MODES),
format: optionalStringEnum(BROWSER_SNAPSHOT_FORMATS), format: optionalStringEnum(BROWSER_SNAPSHOT_FORMATS),
refs: optionalStringEnum(BROWSER_SNAPSHOT_REFS),
interactive: Type.Optional(Type.Boolean()), interactive: Type.Optional(Type.Boolean()),
compact: Type.Optional(Type.Boolean()), compact: Type.Optional(Type.Boolean()),
depth: Type.Optional(Type.Number()), depth: Type.Optional(Type.Number()),

View File

@@ -121,6 +121,19 @@ describe("browser tool snapshot maxChars", () => {
expect(browserClientMocks.browserProfiles).toHaveBeenCalledWith("http://127.0.0.1:18791"); expect(browserClientMocks.browserProfiles).toHaveBeenCalledWith("http://127.0.0.1:18791");
}); });
// Verifies that a `refs` value supplied to the tool's snapshot action is
// forwarded in the options object handed to the browserSnapshot client mock.
it("passes refs mode through to browser snapshot", async () => {
const tool = createBrowserTool();
await tool.execute?.(null, { action: "snapshot", format: "ai", refs: "aria" });
// objectContaining: only format and refs are pinned; other option fields are not checked.
expect(browserClientMocks.browserSnapshot).toHaveBeenCalledWith(
"http://127.0.0.1:18791",
expect.objectContaining({
format: "ai",
refs: "aria",
}),
);
});
}); });
describe("browser tool snapshot labels", () => { describe("browser tool snapshot labels", () => {

View File

@@ -128,6 +128,7 @@ export function createBrowserTool(opts?: {
'Profiles: use profile="chrome" for Chrome extension relay takeover (your existing Chrome tabs). Use profile="clawd" for the isolated clawd-managed browser.', 'Profiles: use profile="chrome" for Chrome extension relay takeover (your existing Chrome tabs). Use profile="clawd" for the isolated clawd-managed browser.',
"Chrome extension relay needs an attached tab: user must click the Clawdbot Browser Relay toolbar icon on the tab (badge ON). If no tab is connected, ask them to attach it.", "Chrome extension relay needs an attached tab: user must click the Clawdbot Browser Relay toolbar icon on the tab (badge ON). If no tab is connected, ask them to attach it.",
"When using refs from snapshot (e.g. e12), keep the same tab: prefer passing targetId from the snapshot response into subsequent actions (act/click/type/etc).", "When using refs from snapshot (e.g. e12), keep the same tab: prefer passing targetId from the snapshot response into subsequent actions (act/click/type/etc).",
'For stable, self-resolving refs across calls, use snapshot with refs="aria" (Playwright aria-ref ids). Default refs="role" are role+name-based.',
"Use snapshot+act for UI automation. Avoid act:wait by default; use only in exceptional cases when no reliable UI state exists.", "Use snapshot+act for UI automation. Avoid act:wait by default; use only in exceptional cases when no reliable UI state exists.",
`target selects browser location (sandbox|host|custom). Default: ${targetDefault}.`, `target selects browser location (sandbox|host|custom). Default: ${targetDefault}.`,
"controlUrl implies target=custom (remote control server).", "controlUrl implies target=custom (remote control server).",
@@ -190,6 +191,7 @@ export function createBrowserTool(opts?: {
: "ai"; : "ai";
const mode = params.mode === "efficient" ? "efficient" : undefined; const mode = params.mode === "efficient" ? "efficient" : undefined;
const labels = typeof params.labels === "boolean" ? params.labels : undefined; const labels = typeof params.labels === "boolean" ? params.labels : undefined;
const refs = params.refs === "aria" || params.refs === "role" ? params.refs : undefined;
const hasMaxChars = Object.hasOwn(params, "maxChars"); const hasMaxChars = Object.hasOwn(params, "maxChars");
const targetId = typeof params.targetId === "string" ? params.targetId.trim() : undefined; const targetId = typeof params.targetId === "string" ? params.targetId.trim() : undefined;
const limit = const limit =
@@ -224,6 +226,7 @@ export function createBrowserTool(opts?: {
targetId, targetId,
limit, limit,
...(typeof resolvedMaxChars === "number" ? { maxChars: resolvedMaxChars } : {}), ...(typeof resolvedMaxChars === "number" ? { maxChars: resolvedMaxChars } : {}),
refs,
interactive, interactive,
compact, compact,
depth, depth,

View File

@@ -118,6 +118,36 @@ describe("browser client", () => {
expect(parsed.searchParams.get("mode")).toBe("efficient"); expect(parsed.searchParams.get("mode")).toBe("efficient");
}); });
// Verifies that browserSnapshot encodes `refs: "aria"` as a `refs=aria` query
// parameter on the snapshot request URL.
it("adds refs=aria to snapshots when requested", async () => {
const calls: string[] = [];
// Replace global fetch with a stub that records each requested URL and
// answers with a minimal successful snapshot payload.
vi.stubGlobal(
"fetch",
vi.fn(async (url: string) => {
calls.push(url);
return {
ok: true,
json: async () => ({
ok: true,
format: "ai",
targetId: "t1",
url: "https://x",
snapshot: "ok",
}),
} as unknown as Response;
}),
);
await browserSnapshot("http://127.0.0.1:18791", {
format: "ai",
refs: "aria",
});
// Pick the recorded request that targets the snapshot endpoint with a query string.
const snapshotCall = calls.find((url) => url.includes("/snapshot?"));
expect(snapshotCall).toBeTruthy();
const parsed = new URL(snapshotCall as string);
expect(parsed.searchParams.get("refs")).toBe("aria");
});
it("uses the expected endpoints + methods for common calls", async () => { it("uses the expected endpoints + methods for common calls", async () => {
const calls: Array<{ url: string; init?: RequestInit }> = []; const calls: Array<{ url: string; init?: RequestInit }> = [];

View File

@@ -270,6 +270,7 @@ export async function browserSnapshot(
targetId?: string; targetId?: string;
limit?: number; limit?: number;
maxChars?: number; maxChars?: number;
refs?: "role" | "aria";
interactive?: boolean; interactive?: boolean;
compact?: boolean; compact?: boolean;
depth?: number; depth?: number;
@@ -287,6 +288,7 @@ export async function browserSnapshot(
if (typeof opts.maxChars === "number" && Number.isFinite(opts.maxChars)) { if (typeof opts.maxChars === "number" && Number.isFinite(opts.maxChars)) {
q.set("maxChars", String(opts.maxChars)); q.set("maxChars", String(opts.maxChars));
} }
if (opts.refs === "aria" || opts.refs === "role") q.set("refs", opts.refs);
if (typeof opts.interactive === "boolean") q.set("interactive", String(opts.interactive)); if (typeof opts.interactive === "boolean") q.set("interactive", String(opts.interactive));
if (typeof opts.compact === "boolean") q.set("compact", String(opts.compact)); if (typeof opts.compact === "boolean") q.set("compact", String(opts.compact));
if (typeof opts.depth === "number" && Number.isFinite(opts.depth)) if (typeof opts.depth === "number" && Number.isFinite(opts.depth))

View File

@@ -1,6 +1,7 @@
import { describe, expect, it } from "vitest"; import { describe, expect, it } from "vitest";
import { import {
buildRoleSnapshotFromAiSnapshot,
buildRoleSnapshotFromAriaSnapshot, buildRoleSnapshotFromAriaSnapshot,
getRoleSnapshotStats, getRoleSnapshotStats,
parseRoleRef, parseRoleRef,
@@ -67,4 +68,24 @@ describe("pw-role-snapshot", () => {
expect(parseRoleRef("12")).toBeNull(); expect(parseRoleRef("12")).toBeNull();
expect(parseRoleRef("")).toBeNull(); expect(parseRoleRef("")).toBeNull();
}); });
// Verifies interactive-mode output of buildRoleSnapshotFromAiSnapshot: the
// link and button lines are kept with the ref ids from the input, while the
// navigation and heading lines are absent from the result.
it("preserves Playwright aria-ref ids in ai snapshots", () => {
const ai = [
'- navigation [ref=e1]:',
' - link "Home" [ref=e5]',
' - heading "Title" [ref=e6]',
' - button "Save" [ref=e7] [cursor=pointer]:',
" - paragraph: hello",
].join("\n");
const res = buildRoleSnapshotFromAiSnapshot(ai, { interactive: true });
expect(res.snapshot).toContain('[ref=e5]');
expect(res.snapshot).toContain('- link "Home"');
expect(res.snapshot).toContain('- button "Save"');
expect(res.snapshot).not.toContain("navigation");
expect(res.snapshot).not.toContain("heading");
// The refs map is keyed by the ids found in the input (e5, e7), not renumbered.
expect(Object.keys(res.refs).sort()).toEqual(["e5", "e7"]);
expect(res.refs.e5).toMatchObject({ role: "link", name: "Home" });
expect(res.refs.e7).toMatchObject({ role: "button", name: "Save" });
});
}); });

View File

@@ -293,3 +293,75 @@ export function buildRoleSnapshotFromAriaSnapshot(
refs, refs,
}; };
} }
/**
 * Pulls the first `[ref=e<digits>]` token out of the trailing part of a
 * snapshot line.
 *
 * Matching is case-insensitive and the captured id is returned exactly as it
 * was written in the input.
 *
 * @param suffix Text following the role (and optional quoted name) on a line.
 * @returns The captured id (for example `e7`), or `null` when no token is present.
 */
function parseAiSnapshotRef(suffix: string): string | null {
  const found = /\[ref=(e\d+)\]/i.exec(suffix);
  if (found === null) {
    return null;
  }
  return found[1];
}
/**
 * Build a role snapshot from Playwright's AI snapshot output while preserving
 * Playwright's own aria-ref ids (e.g. ref=e13). This makes the refs
 * self-resolving across calls.
 *
 * Each input line of the form `- <role> ["<name>"] <suffix>` is parsed; a ref
 * id is taken from the suffix via `parseAiSnapshotRef`.
 *
 * Two output shapes, selected by `options.interactive`:
 * - interactive: a flat list containing only lines whose role is in
 *   `INTERACTIVE_ROLES` and that carry a ref id. Lines are re-emitted without
 *   their original indentation. An empty result becomes
 *   `"(no interactive elements)"`.
 * - otherwise: the original lines are kept as written (including lines that
 *   do not parse), minus unnamed `STRUCTURAL_ROLES` lines when
 *   `options.compact` is set. With `compact`, the joined tree is additionally
 *   passed through `compactTree`. An empty result becomes `"(empty)"`.
 *
 * In both shapes, lines whose `getIndentLevel` exceeds `options.maxDepth`
 * (when provided) are dropped before any other processing.
 *
 * @param aiSnapshot Text of the AI snapshot; nullish input is treated as "".
 * @param options Filtering options (`interactive`, `compact`, `maxDepth`).
 * @returns The rendered snapshot text and a map from ref id to `{ role, name? }`,
 *   where `role` is lower-cased and `name` is present only when non-empty.
 */
export function buildRoleSnapshotFromAiSnapshot(
  aiSnapshot: string,
  options: RoleSnapshotOptions = {},
): { snapshot: string; refs: RoleRefMap } {
  // Groups: 1 = "- " prefix with indent, 2 = role, 3 = optional quoted name, 4 = rest of line.
  // The role group is `\w+`, so lines whose first token starts with a non-word
  // character (e.g. a `- /url: …` property line, if present) never match and
  // take the "unparsed line" path below.
  const linePattern = /^(\s*-\s*)(\w+)(?:\s+"([^"]*)")?(.*)$/;
  const interactiveOnly = Boolean(options.interactive);
  const refs: RoleRefMap = {};
  const out: string[] = [];

  for (const line of String(aiSnapshot ?? "").split("\n")) {
    const depth = getIndentLevel(line);
    if (options.maxDepth !== undefined && depth > options.maxDepth) continue;

    const match = line.match(linePattern);
    if (!match) {
      // Unparsed lines are kept verbatim in the full tree, dropped from the interactive list.
      if (!interactiveOnly) out.push(line);
      continue;
    }

    const [, , roleRaw, name, suffix] = match;
    const role = roleRaw.toLowerCase();
    const ref = parseAiSnapshotRef(suffix);

    if (interactiveOnly) {
      // Only actionable elements that Playwright assigned a ref to are listed.
      if (!INTERACTIVE_ROLES.has(role) || !ref) continue;
      refs[ref] = { role, ...(name ? { name } : {}) };
      out.push(`- ${roleRaw}${name ? ` "${name}"` : ""}${suffix}`);
      continue;
    }

    // Compact mode drops unnamed structural containers from the tree.
    if (options.compact && STRUCTURAL_ROLES.has(role) && !name) continue;
    if (ref) refs[ref] = { role, ...(name ? { name } : {}) };
    out.push(line);
  }

  if (interactiveOnly) {
    return {
      snapshot: out.join("\n") || "(no interactive elements)",
      refs,
    };
  }

  const tree = out.join("\n") || "(empty)";
  return {
    snapshot: options.compact ? compactTree(tree) : tree,
    refs,
  };
}

View File

@@ -62,6 +62,16 @@ describe("pw-session refLocator", () => {
expect(mocks.getByRole).toHaveBeenCalled(); expect(mocks.getByRole).toHaveBeenCalled();
}); });
// Verifies that when the page state's roleRefsMode is "aria", refLocator
// resolves a ref through page.locator with an `aria-ref=<ref>` selector.
it("uses aria-ref locators when refs mode is aria", () => {
const { page, mocks } = fakePage();
const state = ensurePageState(page);
state.roleRefsMode = "aria";
refLocator(page, "e1");
expect(mocks.locator).toHaveBeenCalledWith("aria-ref=e1");
});
}); });
describe("pw-session role refs cache", () => { describe("pw-session role refs cache", () => {

View File

@@ -64,9 +64,11 @@ type PageState = {
armIdDownload: number; armIdDownload: number;
/** /**
* Role-based refs from the last role snapshot (e.g. e1/e2). * Role-based refs from the last role snapshot (e.g. e1/e2).
* These refs are NOT Playwright's `aria-ref` values. * Mode "role" refs are generated from ariaSnapshot and resolved via getByRole.
* Mode "aria" refs are Playwright aria-ref ids and resolved via `aria-ref=...`.
*/ */
roleRefs?: Record<string, { role: string; name?: string; nth?: number }>; roleRefs?: Record<string, { role: string; name?: string; nth?: number }>;
roleRefsMode?: "role" | "aria";
roleRefsFrameSelector?: string; roleRefsFrameSelector?: string;
}; };
@@ -74,6 +76,7 @@ type RoleRefs = NonNullable<PageState["roleRefs"]>;
type RoleRefsCacheEntry = { type RoleRefsCacheEntry = {
refs: RoleRefs; refs: RoleRefs;
frameSelector?: string; frameSelector?: string;
mode?: NonNullable<PageState["roleRefsMode"]>;
}; };
type ContextState = { type ContextState = {
@@ -110,12 +113,14 @@ export function rememberRoleRefsForTarget(opts: {
targetId: string; targetId: string;
refs: RoleRefs; refs: RoleRefs;
frameSelector?: string; frameSelector?: string;
mode?: NonNullable<PageState["roleRefsMode"]>;
}): void { }): void {
const targetId = opts.targetId.trim(); const targetId = opts.targetId.trim();
if (!targetId) return; if (!targetId) return;
roleRefsByTarget.set(roleRefsKey(opts.cdpUrl, targetId), { roleRefsByTarget.set(roleRefsKey(opts.cdpUrl, targetId), {
refs: opts.refs, refs: opts.refs,
...(opts.frameSelector ? { frameSelector: opts.frameSelector } : {}), ...(opts.frameSelector ? { frameSelector: opts.frameSelector } : {}),
...(opts.mode ? { mode: opts.mode } : {}),
}); });
while (roleRefsByTarget.size > MAX_ROLE_REFS_CACHE) { while (roleRefsByTarget.size > MAX_ROLE_REFS_CACHE) {
const first = roleRefsByTarget.keys().next(); const first = roleRefsByTarget.keys().next();
@@ -137,6 +142,7 @@ export function restoreRoleRefsForTarget(opts: {
if (state.roleRefs) return; if (state.roleRefs) return;
state.roleRefs = cached.refs; state.roleRefs = cached.refs;
state.roleRefsFrameSelector = cached.frameSelector; state.roleRefsFrameSelector = cached.frameSelector;
state.roleRefsMode = cached.mode;
} }
export function ensurePageState(page: Page): PageState { export function ensurePageState(page: Page): PageState {
@@ -339,6 +345,12 @@ export function refLocator(page: Page, ref: string) {
if (/^e\d+$/.test(normalized)) { if (/^e\d+$/.test(normalized)) {
const state = pageStates.get(page); const state = pageStates.get(page);
if (state?.roleRefsMode === "aria") {
const scope = state.roleRefsFrameSelector
? page.frameLocator(state.roleRefsFrameSelector)
: page;
return scope.locator(`aria-ref=${normalized}`);
}
const info = state?.roleRefs?.[normalized]; const info = state?.roleRefs?.[normalized];
if (!info) { if (!info) {
throw new Error( throw new Error(

View File

@@ -265,6 +265,7 @@ export async function scrollIntoViewViaPlaywright(opts: {
}): Promise<void> { }): Promise<void> {
const page = await getPageForTargetId(opts); const page = await getPageForTargetId(opts);
ensurePageState(page); ensurePageState(page);
restoreRoleRefsForTarget({ cdpUrl: opts.cdpUrl, targetId: opts.targetId, page });
const timeout = normalizeTimeoutMs(opts.timeoutMs, 20_000); const timeout = normalizeTimeoutMs(opts.timeoutMs, 20_000);
const ref = requireRef(opts.ref); const ref = requireRef(opts.ref);
@@ -340,6 +341,7 @@ export async function takeScreenshotViaPlaywright(opts: {
}): Promise<{ buffer: Buffer }> { }): Promise<{ buffer: Buffer }> {
const page = await getPageForTargetId(opts); const page = await getPageForTargetId(opts);
ensurePageState(page); ensurePageState(page);
restoreRoleRefsForTarget({ cdpUrl: opts.cdpUrl, targetId: opts.targetId, page });
const type = opts.type ?? "png"; const type = opts.type ?? "png";
if (opts.ref) { if (opts.ref) {
if (opts.fullPage) throw new Error("fullPage is not supported for element screenshots"); if (opts.fullPage) throw new Error("fullPage is not supported for element screenshots");
@@ -369,6 +371,7 @@ export async function screenshotWithLabelsViaPlaywright(opts: {
}): Promise<{ buffer: Buffer; labels: number; skipped: number }> { }): Promise<{ buffer: Buffer; labels: number; skipped: number }> {
const page = await getPageForTargetId(opts); const page = await getPageForTargetId(opts);
ensurePageState(page); ensurePageState(page);
restoreRoleRefsForTarget({ cdpUrl: opts.cdpUrl, targetId: opts.targetId, page });
const type = opts.type ?? "png"; const type = opts.type ?? "png";
const maxLabels = const maxLabels =
typeof opts.maxLabels === "number" && Number.isFinite(opts.maxLabels) typeof opts.maxLabels === "number" && Number.isFinite(opts.maxLabels)
@@ -495,6 +498,7 @@ export async function setInputFilesViaPlaywright(opts: {
}): Promise<void> { }): Promise<void> {
const page = await getPageForTargetId(opts); const page = await getPageForTargetId(opts);
ensurePageState(page); ensurePageState(page);
restoreRoleRefsForTarget({ cdpUrl: opts.cdpUrl, targetId: opts.targetId, page });
if (!opts.paths.length) throw new Error("paths are required"); if (!opts.paths.length) throw new Error("paths are required");
const inputRef = typeof opts.inputRef === "string" ? opts.inputRef.trim() : ""; const inputRef = typeof opts.inputRef === "string" ? opts.inputRef.trim() : "";
const element = typeof opts.element === "string" ? opts.element.trim() : ""; const element = typeof opts.element === "string" ? opts.element.trim() : "";

View File

@@ -2,6 +2,7 @@ import type { Page } from "playwright-core";
import { type AriaSnapshotNode, formatAriaSnapshot, type RawAXNode } from "./cdp.js"; import { type AriaSnapshotNode, formatAriaSnapshot, type RawAXNode } from "./cdp.js";
import { import {
buildRoleSnapshotFromAiSnapshot,
buildRoleSnapshotFromAriaSnapshot, buildRoleSnapshotFromAriaSnapshot,
getRoleSnapshotStats, getRoleSnapshotStats,
type RoleSnapshotOptions, type RoleSnapshotOptions,
@@ -76,6 +77,7 @@ export async function snapshotRoleViaPlaywright(opts: {
targetId?: string; targetId?: string;
selector?: string; selector?: string;
frameSelector?: string; frameSelector?: string;
refsMode?: "role" | "aria";
options?: RoleSnapshotOptions; options?: RoleSnapshotOptions;
}): Promise<{ }): Promise<{
snapshot: string; snapshot: string;
@@ -88,6 +90,37 @@ export async function snapshotRoleViaPlaywright(opts: {
}); });
const state = ensurePageState(page); const state = ensurePageState(page);
if (opts.refsMode === "aria") {
if (opts.selector?.trim() || opts.frameSelector?.trim()) {
throw new Error("refs=aria does not support selector/frame snapshots yet.");
}
const maybe = page as unknown as WithSnapshotForAI;
if (!maybe._snapshotForAI) {
throw new Error("refs=aria requires Playwright _snapshotForAI support.");
}
const result = await maybe._snapshotForAI({
timeout: 5000,
track: "response",
});
const built = buildRoleSnapshotFromAiSnapshot(String(result?.full ?? ""), opts.options);
state.roleRefs = built.refs;
state.roleRefsFrameSelector = undefined;
state.roleRefsMode = "aria";
if (opts.targetId) {
rememberRoleRefsForTarget({
cdpUrl: opts.cdpUrl,
targetId: opts.targetId,
refs: built.refs,
mode: "aria",
});
}
return {
snapshot: built.snapshot,
refs: built.refs,
stats: getRoleSnapshotStats(built.snapshot, built.refs),
};
}
const frameSelector = opts.frameSelector?.trim() || ""; const frameSelector = opts.frameSelector?.trim() || "";
const selector = opts.selector?.trim() || ""; const selector = opts.selector?.trim() || "";
const locator = frameSelector const locator = frameSelector
@@ -102,12 +135,14 @@ export async function snapshotRoleViaPlaywright(opts: {
const built = buildRoleSnapshotFromAriaSnapshot(String(ariaSnapshot ?? ""), opts.options); const built = buildRoleSnapshotFromAriaSnapshot(String(ariaSnapshot ?? ""), opts.options);
state.roleRefs = built.refs; state.roleRefs = built.refs;
state.roleRefsFrameSelector = frameSelector || undefined; state.roleRefsFrameSelector = frameSelector || undefined;
state.roleRefsMode = "role";
if (opts.targetId) { if (opts.targetId) {
rememberRoleRefsForTarget({ rememberRoleRefsForTarget({
cdpUrl: opts.cdpUrl, cdpUrl: opts.cdpUrl,
targetId: opts.targetId, targetId: opts.targetId,
refs: built.refs, refs: built.refs,
frameSelector: frameSelector || undefined, frameSelector: frameSelector || undefined,
mode: "role",
}); });
} }
return { return {

View File

@@ -169,6 +169,8 @@ export function registerBrowserAgentSnapshotRoutes(app: express.Express, ctx: Br
const interactiveRaw = toBoolean(req.query.interactive); const interactiveRaw = toBoolean(req.query.interactive);
const compactRaw = toBoolean(req.query.compact); const compactRaw = toBoolean(req.query.compact);
const depthRaw = toNumber(req.query.depth); const depthRaw = toNumber(req.query.depth);
const refsModeRaw = toStringOrEmpty(req.query.refs).trim();
const refsMode = refsModeRaw === "aria" ? "aria" : refsModeRaw === "role" ? "role" : undefined;
const interactive = interactiveRaw ?? (mode === "efficient" ? true : undefined); const interactive = interactiveRaw ?? (mode === "efficient" ? true : undefined);
const compact = compactRaw ?? (mode === "efficient" ? true : undefined); const compact = compactRaw ?? (mode === "efficient" ? true : undefined);
const depth = const depth =
@@ -199,6 +201,7 @@ export function registerBrowserAgentSnapshotRoutes(app: express.Express, ctx: Br
targetId: tab.targetId, targetId: tab.targetId,
selector: selector.trim() || undefined, selector: selector.trim() || undefined,
frameSelector: frameSelector.trim() || undefined, frameSelector: frameSelector.trim() || undefined,
refsMode,
options: { options: {
interactive: interactive ?? undefined, interactive: interactive ?? undefined,
compact: compact ?? undefined, compact: compact ?? undefined,
@@ -219,6 +222,7 @@ export function registerBrowserAgentSnapshotRoutes(app: express.Express, ctx: Br
targetId: tab.targetId, targetId: tab.targetId,
selector: selector.trim() || undefined, selector: selector.trim() || undefined,
frameSelector: frameSelector.trim() || undefined, frameSelector: frameSelector.trim() || undefined,
refsMode,
options: { options: {
interactive: interactive ?? undefined, interactive: interactive ?? undefined,
compact: compact ?? undefined, compact: compact ?? undefined,