diff --git a/CHANGELOG.md b/CHANGELOG.md index 5d996c4dd..4d7076d17 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,7 @@ - Browser: ship a built-in `chrome` profile for extension relay and start the relay automatically when running locally. - Browser: default `browser.defaultProfile` to `chrome` (existing Chrome takeover mode). - Browser: add `clawdbot browser extension install/path` and copy extension path to clipboard. +- Browser: add `snapshot refs=aria` (Playwright aria-ref ids) for self-resolving refs across `snapshot` → `act`. - Control UI: show raw any-map entries in config views; move Docs link into the left nav. #### Plugins diff --git a/src/agents/tools/browser-tool.schema.ts b/src/agents/tools/browser-tool.schema.ts index e012ce221..5acad1688 100644 --- a/src/agents/tools/browser-tool.schema.ts +++ b/src/agents/tools/browser-tool.schema.ts @@ -39,6 +39,7 @@ const BROWSER_TARGETS = ["sandbox", "host", "custom"] as const; const BROWSER_SNAPSHOT_FORMATS = ["aria", "ai"] as const; const BROWSER_SNAPSHOT_MODES = ["efficient"] as const; +const BROWSER_SNAPSHOT_REFS = ["role", "aria"] as const; const BROWSER_IMAGE_TYPES = ["png", "jpeg"] as const; @@ -91,6 +92,7 @@ export const BrowserToolSchema = Type.Object({ maxChars: Type.Optional(Type.Number()), mode: optionalStringEnum(BROWSER_SNAPSHOT_MODES), format: optionalStringEnum(BROWSER_SNAPSHOT_FORMATS), + refs: optionalStringEnum(BROWSER_SNAPSHOT_REFS), interactive: Type.Optional(Type.Boolean()), compact: Type.Optional(Type.Boolean()), depth: Type.Optional(Type.Number()), diff --git a/src/agents/tools/browser-tool.test.ts b/src/agents/tools/browser-tool.test.ts index a1e695fde..97bb3b026 100644 --- a/src/agents/tools/browser-tool.test.ts +++ b/src/agents/tools/browser-tool.test.ts @@ -121,6 +121,19 @@ describe("browser tool snapshot maxChars", () => { expect(browserClientMocks.browserProfiles).toHaveBeenCalledWith("http://127.0.0.1:18791"); }); + + it("passes refs mode through to browser snapshot", async () => { + const tool = createBrowserTool(); + await tool.execute?.(null, { action: "snapshot", format: "ai", refs: "aria" }); + + expect(browserClientMocks.browserSnapshot).toHaveBeenCalledWith( + "http://127.0.0.1:18791", + expect.objectContaining({ + format: "ai", + refs: "aria", + }), + ); + }); }); describe("browser tool snapshot labels", () => { diff --git a/src/agents/tools/browser-tool.ts b/src/agents/tools/browser-tool.ts index 0c5ad095f..0a1c34125 100644 --- a/src/agents/tools/browser-tool.ts +++ b/src/agents/tools/browser-tool.ts @@ -128,6 +128,7 @@ export function createBrowserTool(opts?: { 'Profiles: use profile="chrome" for Chrome extension relay takeover (your existing Chrome tabs). Use profile="clawd" for the isolated clawd-managed browser.', "Chrome extension relay needs an attached tab: user must click the Clawdbot Browser Relay toolbar icon on the tab (badge ON). If no tab is connected, ask them to attach it.", "When using refs from snapshot (e.g. e12), keep the same tab: prefer passing targetId from the snapshot response into subsequent actions (act/click/type/etc).", + 'For stable, self-resolving refs across calls, use snapshot with refs="aria" (Playwright aria-ref ids). Default refs="role" are role+name-based.', "Use snapshot+act for UI automation. Avoid act:wait by default; use only in exceptional cases when no reliable UI state exists.", `target selects browser location (sandbox|host|custom). Default: ${targetDefault}.`, "controlUrl implies target=custom (remote control server).", @@ -190,6 +191,7 @@ export function createBrowserTool(opts?: { : "ai"; const mode = params.mode === "efficient" ? "efficient" : undefined; const labels = typeof params.labels === "boolean" ? params.labels : undefined; + const refs = params.refs === "aria" || params.refs === "role" ? params.refs : undefined; const hasMaxChars = Object.hasOwn(params, "maxChars"); const targetId = typeof params.targetId === "string" ? params.targetId.trim() : undefined; const limit = @@ -224,6 +226,7 @@ export function createBrowserTool(opts?: { targetId, limit, ...(typeof resolvedMaxChars === "number" ? { maxChars: resolvedMaxChars } : {}), + refs, interactive, compact, depth, diff --git a/src/browser/client.test.ts b/src/browser/client.test.ts index 54876820d..7721828f8 100644 --- a/src/browser/client.test.ts +++ b/src/browser/client.test.ts @@ -118,6 +118,36 @@ describe("browser client", () => { expect(parsed.searchParams.get("mode")).toBe("efficient"); }); + it("adds refs=aria to snapshots when requested", async () => { + const calls: string[] = []; + vi.stubGlobal( + "fetch", + vi.fn(async (url: string) => { + calls.push(url); + return { + ok: true, + json: async () => ({ + ok: true, + format: "ai", + targetId: "t1", + url: "https://x", + snapshot: "ok", + }), + } as unknown as Response; + }), + ); + + await browserSnapshot("http://127.0.0.1:18791", { + format: "ai", + refs: "aria", + }); + + const snapshotCall = calls.find((url) => url.includes("/snapshot?")); + expect(snapshotCall).toBeTruthy(); + const parsed = new URL(snapshotCall as string); + expect(parsed.searchParams.get("refs")).toBe("aria"); + }); + it("uses the expected endpoints + methods for common calls", async () => { const calls: Array<{ url: string; init?: RequestInit }> = []; diff --git a/src/browser/client.ts b/src/browser/client.ts index 958303a25..2505f8dc9 100644 --- a/src/browser/client.ts +++ b/src/browser/client.ts @@ -270,6 +270,7 @@ export async function browserSnapshot( targetId?: string; limit?: number; maxChars?: number; + refs?: "role" | "aria"; interactive?: boolean; compact?: boolean; depth?: number; @@ -287,6 +288,7 @@ export async function browserSnapshot( if (typeof opts.maxChars === "number" && Number.isFinite(opts.maxChars)) { q.set("maxChars", String(opts.maxChars)); } + if (opts.refs === "aria" || opts.refs === "role") q.set("refs", opts.refs); if (typeof opts.interactive === "boolean") q.set("interactive", String(opts.interactive)); if (typeof opts.compact === "boolean") q.set("compact", String(opts.compact)); if (typeof opts.depth === "number" && Number.isFinite(opts.depth)) diff --git a/src/browser/pw-role-snapshot.test.ts b/src/browser/pw-role-snapshot.test.ts index abf49c5b9..3ba9ccfe7 100644 --- a/src/browser/pw-role-snapshot.test.ts +++ b/src/browser/pw-role-snapshot.test.ts @@ -1,6 +1,7 @@ import { describe, expect, it } from "vitest"; import { + buildRoleSnapshotFromAiSnapshot, buildRoleSnapshotFromAriaSnapshot, getRoleSnapshotStats, parseRoleRef, @@ -67,4 +68,24 @@ describe("pw-role-snapshot", () => { expect(parseRoleRef("12")).toBeNull(); expect(parseRoleRef("")).toBeNull(); }); + + it("preserves Playwright aria-ref ids in ai snapshots", () => { + const ai = [ + '- navigation [ref=e1]:', + ' - link "Home" [ref=e5]', + ' - heading "Title" [ref=e6]', + ' - button "Save" [ref=e7] [cursor=pointer]:', + " - paragraph: hello", + ].join("\n"); + + const res = buildRoleSnapshotFromAiSnapshot(ai, { interactive: true }); + expect(res.snapshot).toContain('[ref=e5]'); + expect(res.snapshot).toContain('- link "Home"'); + expect(res.snapshot).toContain('- button "Save"'); + expect(res.snapshot).not.toContain("navigation"); + expect(res.snapshot).not.toContain("heading"); + expect(Object.keys(res.refs).sort()).toEqual(["e5", "e7"]); + expect(res.refs.e5).toMatchObject({ role: "link", name: "Home" }); + expect(res.refs.e7).toMatchObject({ role: "button", name: "Save" }); + }); }); diff --git a/src/browser/pw-role-snapshot.ts b/src/browser/pw-role-snapshot.ts index 091373ab5..0f9a800bb 100644 --- a/src/browser/pw-role-snapshot.ts +++ b/src/browser/pw-role-snapshot.ts @@ -293,3 +293,75 @@ export function buildRoleSnapshotFromAriaSnapshot( refs, }; } + +function parseAiSnapshotRef(suffix: string): string | null { + const match = suffix.match(/\[ref=(e\d+)\]/i); + return match ? match[1] : null; +} + +/** + * Build a role snapshot from Playwright's AI snapshot output while preserving Playwright's own + * aria-ref ids (e.g. ref=e13). This makes the refs self-resolving across calls. + */ +export function buildRoleSnapshotFromAiSnapshot( + aiSnapshot: string, + options: RoleSnapshotOptions = {}, +): { snapshot: string; refs: RoleRefMap } { + const lines = String(aiSnapshot ?? "").split("\n"); + const refs: RoleRefMap = {}; + + if (options.interactive) { + const out: string[] = []; + for (const line of lines) { + const depth = getIndentLevel(line); + if (options.maxDepth !== undefined && depth > options.maxDepth) continue; + const match = line.match(/^(\s*-\s*)(\w+)(?:\s+"([^"]*)")?(.*)$/); + if (!match) continue; + const [, , roleRaw, name, suffix] = match; + if (roleRaw.startsWith("/")) continue; + const role = roleRaw.toLowerCase(); + if (!INTERACTIVE_ROLES.has(role)) continue; + const ref = parseAiSnapshotRef(suffix); + if (!ref) continue; + refs[ref] = { role, ...(name ? { name } : {}) }; + out.push(`- ${roleRaw}${name ? ` "${name}"` : ""}${suffix}`); + } + return { + snapshot: out.join("\n") || "(no interactive elements)", + refs, + }; + } + + const out: string[] = []; + for (const line of lines) { + const depth = getIndentLevel(line); + if (options.maxDepth !== undefined && depth > options.maxDepth) continue; + + const match = line.match(/^(\s*-\s*)(\w+)(?:\s+"([^"]*)")?(.*)$/); + if (!match) { + out.push(line); + continue; + } + const [, , roleRaw, name, suffix] = match; + if (roleRaw.startsWith("/")) { + out.push(line); + continue; + } + + const role = roleRaw.toLowerCase(); + const isStructural = STRUCTURAL_ROLES.has(role); + + if (options.compact && isStructural && !name) continue; + + const ref = parseAiSnapshotRef(suffix); + if (ref) refs[ref] = { role, ...(name ? { name } : {}) }; + + out.push(line); + } + + const tree = out.join("\n") || "(empty)"; + return { + snapshot: options.compact ? compactTree(tree) : tree, + refs, + }; +} diff --git a/src/browser/pw-session.test.ts b/src/browser/pw-session.test.ts index ec5c75135..1832120a5 100644 --- a/src/browser/pw-session.test.ts +++ b/src/browser/pw-session.test.ts @@ -62,6 +62,16 @@ describe("pw-session refLocator", () => { expect(mocks.getByRole).toHaveBeenCalled(); }); + + it("uses aria-ref locators when refs mode is aria", () => { + const { page, mocks } = fakePage(); + const state = ensurePageState(page); + state.roleRefsMode = "aria"; + + refLocator(page, "e1"); + + expect(mocks.locator).toHaveBeenCalledWith("aria-ref=e1"); + }); }); describe("pw-session role refs cache", () => { diff --git a/src/browser/pw-session.ts b/src/browser/pw-session.ts index 7997fe151..ef8142323 100644 --- a/src/browser/pw-session.ts +++ b/src/browser/pw-session.ts @@ -64,9 +64,11 @@ type PageState = { armIdDownload: number; /** * Role-based refs from the last role snapshot (e.g. e1/e2). - * These refs are NOT Playwright's `aria-ref` values. + * Mode "role" refs are generated from ariaSnapshot and resolved via getByRole. + * Mode "aria" refs are Playwright aria-ref ids and resolved via `aria-ref=...`. */ roleRefs?: Record; + roleRefsMode?: "role" | "aria"; roleRefsFrameSelector?: string; }; @@ -74,6 +76,7 @@ type RoleRefs = NonNullable; type RoleRefsCacheEntry = { refs: RoleRefs; frameSelector?: string; + mode?: NonNullable; }; type ContextState = { @@ -110,12 +113,14 @@ export function rememberRoleRefsForTarget(opts: { targetId: string; refs: RoleRefs; frameSelector?: string; + mode?: NonNullable; }): void { const targetId = opts.targetId.trim(); if (!targetId) return; roleRefsByTarget.set(roleRefsKey(opts.cdpUrl, targetId), { refs: opts.refs, ...(opts.frameSelector ? { frameSelector: opts.frameSelector } : {}), + ...(opts.mode ? { mode: opts.mode } : {}), }); while (roleRefsByTarget.size > MAX_ROLE_REFS_CACHE) { const first = roleRefsByTarget.keys().next(); @@ -137,6 +142,7 @@ export function restoreRoleRefsForTarget(opts: { if (state.roleRefs) return; state.roleRefs = cached.refs; state.roleRefsFrameSelector = cached.frameSelector; + state.roleRefsMode = cached.mode; } export function ensurePageState(page: Page): PageState { @@ -339,6 +345,12 @@ export function refLocator(page: Page, ref: string) { if (/^e\d+$/.test(normalized)) { const state = pageStates.get(page); + if (state?.roleRefsMode === "aria") { + const scope = state.roleRefsFrameSelector + ? page.frameLocator(state.roleRefsFrameSelector) + : page; + return scope.locator(`aria-ref=${normalized}`); + } const info = state?.roleRefs?.[normalized]; if (!info) { throw new Error( diff --git a/src/browser/pw-tools-core.interactions.ts b/src/browser/pw-tools-core.interactions.ts index 0f652be34..ea1ac9514 100644 --- a/src/browser/pw-tools-core.interactions.ts +++ b/src/browser/pw-tools-core.interactions.ts @@ -265,6 +265,7 @@ export async function scrollIntoViewViaPlaywright(opts: { }): Promise { const page = await getPageForTargetId(opts); ensurePageState(page); + restoreRoleRefsForTarget({ cdpUrl: opts.cdpUrl, targetId: opts.targetId, page }); const timeout = normalizeTimeoutMs(opts.timeoutMs, 20_000); const ref = requireRef(opts.ref); @@ -340,6 +341,7 @@ export async function takeScreenshotViaPlaywright(opts: { }): Promise<{ buffer: Buffer }> { const page = await getPageForTargetId(opts); ensurePageState(page); + restoreRoleRefsForTarget({ cdpUrl: opts.cdpUrl, targetId: opts.targetId, page }); const type = opts.type ?? "png"; if (opts.ref) { if (opts.fullPage) throw new Error("fullPage is not supported for element screenshots"); @@ -369,6 +371,7 @@ export async function screenshotWithLabelsViaPlaywright(opts: { }): Promise<{ buffer: Buffer; labels: number; skipped: number }> { const page = await getPageForTargetId(opts); ensurePageState(page); + restoreRoleRefsForTarget({ cdpUrl: opts.cdpUrl, targetId: opts.targetId, page }); const type = opts.type ?? "png"; const maxLabels = typeof opts.maxLabels === "number" && Number.isFinite(opts.maxLabels) @@ -495,6 +498,7 @@ export async function setInputFilesViaPlaywright(opts: { }): Promise { const page = await getPageForTargetId(opts); ensurePageState(page); + restoreRoleRefsForTarget({ cdpUrl: opts.cdpUrl, targetId: opts.targetId, page }); if (!opts.paths.length) throw new Error("paths are required"); const inputRef = typeof opts.inputRef === "string" ? opts.inputRef.trim() : ""; const element = typeof opts.element === "string" ? opts.element.trim() : ""; diff --git a/src/browser/pw-tools-core.snapshot.ts b/src/browser/pw-tools-core.snapshot.ts index 92ab149aa..5cfa77e91 100644 --- a/src/browser/pw-tools-core.snapshot.ts +++ b/src/browser/pw-tools-core.snapshot.ts @@ -2,6 +2,7 @@ import type { Page } from "playwright-core"; import { type AriaSnapshotNode, formatAriaSnapshot, type RawAXNode } from "./cdp.js"; import { + buildRoleSnapshotFromAiSnapshot, buildRoleSnapshotFromAriaSnapshot, getRoleSnapshotStats, type RoleSnapshotOptions, @@ -76,6 +77,7 @@ export async function snapshotRoleViaPlaywright(opts: { targetId?: string; selector?: string; frameSelector?: string; + refsMode?: "role" | "aria"; options?: RoleSnapshotOptions; }): Promise<{ snapshot: string; @@ -88,6 +90,37 @@ export async function snapshotRoleViaPlaywright(opts: { }); const state = ensurePageState(page); + if (opts.refsMode === "aria") { + if (opts.selector?.trim() || opts.frameSelector?.trim()) { + throw new Error("refs=aria does not support selector/frame snapshots yet."); + } + const maybe = page as unknown as WithSnapshotForAI; + if (!maybe._snapshotForAI) { + throw new Error("refs=aria requires Playwright _snapshotForAI support."); + } + const result = await maybe._snapshotForAI({ + timeout: 5000, + track: "response", + }); + const built = buildRoleSnapshotFromAiSnapshot(String(result?.full ?? ""), opts.options); + state.roleRefs = built.refs; + state.roleRefsFrameSelector = undefined; + state.roleRefsMode = "aria"; + if (opts.targetId) { + rememberRoleRefsForTarget({ + cdpUrl: opts.cdpUrl, + targetId: opts.targetId, + refs: built.refs, + mode: "aria", + }); + } + return { + snapshot: built.snapshot, + refs: built.refs, + stats: getRoleSnapshotStats(built.snapshot, built.refs), + }; + } + const frameSelector = opts.frameSelector?.trim() || ""; const selector = opts.selector?.trim() || ""; const locator = frameSelector @@ -102,12 +135,14 @@ export async function snapshotRoleViaPlaywright(opts: { const built = buildRoleSnapshotFromAriaSnapshot(String(ariaSnapshot ?? ""), opts.options); state.roleRefs = built.refs; state.roleRefsFrameSelector = frameSelector || undefined; + state.roleRefsMode = "role"; if (opts.targetId) { rememberRoleRefsForTarget({ cdpUrl: opts.cdpUrl, targetId: opts.targetId, refs: built.refs, frameSelector: frameSelector || undefined, + mode: "role", }); } return { diff --git a/src/browser/routes/agent.snapshot.ts b/src/browser/routes/agent.snapshot.ts index 66a39b2a1..fdeb7f69e 100644 --- a/src/browser/routes/agent.snapshot.ts +++ b/src/browser/routes/agent.snapshot.ts @@ -169,6 +169,8 @@ export function registerBrowserAgentSnapshotRoutes(app: express.Express, ctx: Br const interactiveRaw = toBoolean(req.query.interactive); const compactRaw = toBoolean(req.query.compact); const depthRaw = toNumber(req.query.depth); + const refsModeRaw = toStringOrEmpty(req.query.refs).trim(); + const refsMode = refsModeRaw === "aria" ? "aria" : refsModeRaw === "role" ? "role" : undefined; const interactive = interactiveRaw ?? (mode === "efficient" ? true : undefined); const compact = compactRaw ?? (mode === "efficient" ? true : undefined); const depth = @@ -199,6 +201,7 @@ export function registerBrowserAgentSnapshotRoutes(app: express.Express, ctx: Br targetId: tab.targetId, selector: selector.trim() || undefined, frameSelector: frameSelector.trim() || undefined, + refsMode, options: { interactive: interactive ?? undefined, compact: compact ?? undefined, @@ -219,6 +222,7 @@ export function registerBrowserAgentSnapshotRoutes(app: express.Express, ctx: Br targetId: tab.targetId, selector: selector.trim() || undefined, frameSelector: frameSelector.trim() || undefined, + refsMode, options: { interactive: interactive ?? undefined, compact: compact ?? undefined,