diff --git a/src/agents/tools/browser-tool.ts b/src/agents/tools/browser-tool.ts index 5ec8e84ac..ffea30a85 100644 --- a/src/agents/tools/browser-tool.ts +++ b/src/agents/tools/browser-tool.ts @@ -121,6 +121,10 @@ const BrowserToolSchema = Type.Object({ limit: Type.Optional(Type.Number()), maxChars: Type.Optional(Type.Number()), format: Type.Optional(Type.Union([Type.Literal("aria"), Type.Literal("ai")])), + interactive: Type.Optional(Type.Boolean()), + compact: Type.Optional(Type.Boolean()), + depth: Type.Optional(Type.Number()), + selector: Type.Optional(Type.String()), fullPage: Type.Optional(Type.Boolean()), ref: Type.Optional(Type.String()), element: Type.Optional(Type.String()), @@ -336,11 +340,30 @@ export function createBrowserTool(opts?: { format === "ai" ? (maxChars ?? DEFAULT_AI_SNAPSHOT_MAX_CHARS) : undefined; + const interactive = + typeof params.interactive === "boolean" + ? params.interactive + : undefined; + const compact = + typeof params.compact === "boolean" ? params.compact : undefined; + const depth = + typeof params.depth === "number" && Number.isFinite(params.depth) + ? params.depth + : undefined; + const selector = + typeof params.selector === "string" + ? params.selector.trim() + : undefined; const snapshot = await browserSnapshot(baseUrl, { format, targetId, limit, ...(resolvedMaxChars ? { maxChars: resolvedMaxChars } : {}), + ...(resolvedMaxChars ? 
// Unit tests for buildRoleSnapshotFromAriaSnapshot. Fixtures mimic Playwright
// aria-snapshot text, where nesting is expressed as two spaces per level.
describe("pw-role-snapshot", () => {
  it("adds refs for interactive elements", () => {
    const aria = [
      '- heading "Example" [level=1]',
      "- paragraph: hello",
      '- button "Submit"',
      "  - generic",
      '- link "Learn more"',
    ].join("\n");

    const res = buildRoleSnapshotFromAriaSnapshot(aria, { interactive: true });
    expect(res.snapshot).toContain("[ref=e1]");
    expect(res.snapshot).toContain("[ref=e2]");
    expect(res.snapshot).toContain('- button "Submit" [ref=e1]');
    expect(res.snapshot).toContain('- link "Learn more" [ref=e2]');
    expect(Object.keys(res.refs)).toEqual(["e1", "e2"]);
    expect(res.refs.e1).toMatchObject({ role: "button", name: "Submit" });
    expect(res.refs.e2).toMatchObject({ role: "link", name: "Learn more" });
  });

  it("uses nth only when duplicates exist", () => {
    const aria = ['- button "OK"', '- button "OK"', '- button "Cancel"'].join(
      "\n",
    );
    const res = buildRoleSnapshotFromAriaSnapshot(aria);
    expect(res.snapshot).toContain("[ref=e1]");
    expect(res.snapshot).toContain("[ref=e2] [nth=1]");
    expect(res.refs.e1?.nth).toBe(0);
    expect(res.refs.e2?.nth).toBe(1);
    // "Cancel" is unique, so its nth index is stripped from the ref map.
    expect(res.refs.e3?.nth).toBeUndefined();
  });

  it("respects maxDepth", () => {
    // Depth is derived from leading whitespace (two spaces per level), so the
    // fixture must be indented for the button to sit at depth 2.
    const aria = [
      '- region "Main"',
      "  - group",
      '    - button "Deep"',
    ].join("\n");
    const res = buildRoleSnapshotFromAriaSnapshot(aria, { maxDepth: 1 });
    expect(res.snapshot).toContain('- region "Main"');
    expect(res.snapshot).toContain("  - group");
    expect(res.snapshot).not.toContain("button");
  });
});
/**
 * Returns the nesting depth of a snapshot line.
 *
 * Depth is the number of leading whitespace characters divided by two,
 * rounded down, so an odd extra character does not start a new level.
 *
 * @param line - A single line of snapshot text.
 * @returns The zero-based depth (0 when the line has no leading whitespace).
 */
function getIndentLevel(line: string): number {
  const leading = /^(\s*)/.exec(line);
  if (!leading) return 0;
  const width = leading[1].length;
  return Math.floor(width / 2);
}
/**
 * Prunes a role-snapshot tree down to the lines that matter.
 *
 * A line is kept when any of the following holds:
 * - it carries a `[ref=` marker;
 * - it contains a colon that is not its last non-whitespace character
 *   (i.e. inline text such as `- paragraph: hello`);
 * - some following line that is indented deeper than it (before the
 *   indentation returns to this level or shallower) carries a `[ref=` marker.
 *
 * @param tree - Newline-separated snapshot text.
 * @returns The pruned text, or the placeholder `"(empty)"` when no line
 *   survives, so callers never receive a blank snapshot.
 */
function compactTree(tree: string) {
  const lines = tree.split("\n");
  const kept: string[] = [];

  for (let i = 0; i < lines.length; i += 1) {
    const line = lines[i] ?? "";

    // Lines that already carry a ref are always preserved.
    if (line.includes("[ref=")) {
      kept.push(line);
      continue;
    }
    // A colon followed by text is inline content; a trailing colon only
    // introduces children and is handled by the descendant scan below.
    if (line.includes(":") && !line.trimEnd().endsWith(":")) {
      kept.push(line);
      continue;
    }

    // Keep a container only if something beneath it has a ref.
    const indent = getIndentLevel(line);
    let hasRefDescendant = false;
    for (let j = i + 1; j < lines.length; j += 1) {
      const candidate = lines[j] ?? "";
      if (getIndentLevel(candidate) <= indent) break;
      if (candidate.includes("[ref=")) {
        hasRefDescendant = true;
        break;
      }
    }
    if (hasRefDescendant) kept.push(line);
  }

  // Previously an all-pruned tree (including an incoming "(empty)"
  // placeholder, which has no ref/colon/children) collapsed to "".
  return kept.join("\n") || "(empty)";
}
/**
 * Extracts a role-ref id (`e` followed by digits) from user input.
 *
 * Surrounding whitespace is ignored, and a single leading `@` or `ref=`
 * prefix is stripped before validation.
 *
 * @param raw - Text such as `e12`, `@e12` or `ref=e12`.
 * @returns The bare id (e.g. `e12`), or `null` when the input is blank or
 *   does not match the `e<digits>` shape.
 */
export function parseRoleRef(raw: string): string | null {
  let candidate = raw.trim();
  if (candidate.length === 0) return null;

  if (candidate.startsWith("@")) {
    candidate = candidate.slice(1);
  } else if (candidate.startsWith("ref=")) {
    candidate = candidate.slice(4);
  }

  if (/^e\d+$/.test(candidate)) return candidate;
  return null;
}
/**
 * Resolves a snapshot ref to a Playwright locator.
 *
 * A leading `@` or `ref=` prefix is stripped first. If the remaining id has
 * the `e<digits>` shape and the page's state holds a role ref under that id,
 * the element is located via `getByRole` (exact name match when a name was
 * recorded, `.nth()` when an index was recorded). In every other case the id
 * is handed to Playwright's `aria-ref=` selector engine.
 *
 * The fallback matters because ids produced by Playwright's own AI snapshot
 * use the same `e<digits>` shape as role refs; throwing whenever the role-ref
 * table lacks an entry would make those refs unusable.
 *
 * NOTE(review): role refs recorded by an earlier role snapshot still take
 * precedence over an `aria-ref` with the same id — presumably the snapshot
 * path should clear stale role refs; confirm against the AI snapshot code.
 *
 * @param page - Page the ref belongs to.
 * @param ref - Ref id, optionally prefixed with `@` or `ref=`.
 * @returns A locator for the referenced element.
 */
export function refLocator(page: Page, ref: string) {
  const normalized = ref.startsWith("@")
    ? ref.slice(1)
    : ref.startsWith("ref=")
      ? ref.slice(4)
      : ref;

  if (/^e\d+$/.test(normalized)) {
    const info = pageStates.get(page)?.roleRefs?.[normalized];
    if (info) {
      const locator = info.name
        ? page.getByRole(info.role as never, { name: info.name, exact: true })
        : page.getByRole(info.role as never);
      return info.nth !== undefined ? locator.nth(info.nth) : locator;
    }
    // No role ref recorded under this id: treat it as a Playwright aria-ref.
  }

  return page.locator(`aria-ref=${normalized}`);
}
/**
 * Takes an ARIA snapshot of the target page and converts it to a role
 * snapshot with `e<N>` refs.
 *
 * The snapshot is scoped to `opts.selector` when it is non-blank after
 * trimming, otherwise to `:root`. The ref table produced by the conversion
 * is stored on the page state as `roleRefs`, replacing any previous table.
 *
 * NOTE(review): `nth` indexes are counted within the snapshotted subtree,
 * while `refLocator` applies them to a page-wide `getByRole` query — with a
 * narrowing selector these may disagree; confirm.
 *
 * @param opts.cdpUrl - CDP endpoint used to find the page.
 * @param opts.targetId - Optional target id of the page.
 * @param opts.selector - Optional selector restricting the snapshot scope.
 * @param opts.options - Filtering options forwarded to the snapshot builder.
 * @returns The role snapshot text.
 */
export async function snapshotRoleViaPlaywright(opts: {
  cdpUrl: string;
  targetId?: string;
  selector?: string;
  options?: RoleSnapshotOptions;
}): Promise<{ snapshot: string }> {
  const { cdpUrl, targetId, selector, options } = opts;
  const page = await getPageForTargetId({ cdpUrl, targetId });
  const pageState = ensurePageState(page);

  // A blank or missing selector means "snapshot the whole document".
  const scopeSelector = selector?.trim() || ":root";
  const rawTree = await page.locator(scopeSelector).ariaSnapshot();

  const { snapshot, refs } = buildRoleSnapshotFromAriaSnapshot(
    String(rawTree ?? ""),
    options,
  );
  pageState.roleRefs = refs;
  return { snapshot };
}
"").trim(); + const startRef = requireRef(opts.startRef); + const endRef = requireRef(opts.endRef); if (!startRef || !endRef) throw new Error("startRef and endRef are required"); const page = await getPageForTargetId(opts); ensurePageState(page); @@ -128,8 +159,7 @@ export async function selectOptionViaPlaywright(opts: { values: string[]; timeoutMs?: number; }): Promise { - const ref = String(opts.ref ?? "").trim(); - if (!ref) throw new Error("ref is required"); + const ref = requireRef(opts.ref); if (!opts.values?.length) throw new Error("values are required"); const page = await getPageForTargetId(opts); ensurePageState(page); diff --git a/src/browser/routes/agent.ts b/src/browser/routes/agent.ts index d5b8a675e..b99fad919 100644 --- a/src/browser/routes/agent.ts +++ b/src/browser/routes/agent.ts @@ -573,17 +573,55 @@ export function registerBrowserAgentRoutes( maxCharsRaw > 0 ? Math.floor(maxCharsRaw) : undefined; + const interactive = toBoolean(req.query.interactive); + const compact = toBoolean(req.query.compact); + const depth = toNumber(req.query.depth); + const selector = toStringOrEmpty(req.query.selector); try { const tab = await profileCtx.ensureTabAvailable(targetId || undefined); if (format === "ai") { const pw = await requirePwAi(res, "ai snapshot"); if (!pw) return; - const snap = await pw.snapshotAiViaPlaywright({ - cdpUrl: profileCtx.profile.cdpUrl, - targetId: tab.targetId, - ...(maxChars ? { maxChars } : {}), - }); + const wantsRoleSnapshot = + interactive === true || + compact === true || + depth !== undefined || + Boolean(selector.trim()); + + const snap = wantsRoleSnapshot + ? await pw.snapshotRoleViaPlaywright({ + cdpUrl: profileCtx.profile.cdpUrl, + targetId: tab.targetId, + selector: selector.trim() || undefined, + options: { + interactive: interactive ?? undefined, + compact: compact ?? undefined, + maxDepth: depth ?? 
undefined, + }, + }) + : await pw + .snapshotAiViaPlaywright({ + cdpUrl: profileCtx.profile.cdpUrl, + targetId: tab.targetId, + ...(maxChars ? { maxChars } : {}), + }) + .catch(async (err) => { + // Public-API fallback when Playwright's private _snapshotForAI is missing. + if (String(err).toLowerCase().includes("_snapshotforai")) { + return await pw.snapshotRoleViaPlaywright({ + cdpUrl: profileCtx.profile.cdpUrl, + targetId: tab.targetId, + selector: selector.trim() || undefined, + options: { + interactive: interactive ?? undefined, + compact: compact ?? undefined, + maxDepth: depth ?? undefined, + }, + }); + } + throw err; + }); return res.json({ ok: true, format, diff --git a/src/cli/browser-cli-actions-input.ts b/src/cli/browser-cli-actions-input.ts index ad19b6648..4ad4b22ff 100644 --- a/src/cli/browser-cli-actions-input.ts +++ b/src/cli/browser-cli-actions-input.ts @@ -122,7 +122,7 @@ export function registerBrowserActionInputCommands( browser .command("click") .description("Click an element by ref from snapshot") - .argument("", "Ref id from ai snapshot") + .argument("", "Ref id from snapshot") .option("--target-id ", "CDP target id (or unique prefix)") .option("--double", "Double click", false) .option("--button ", "Mouse button to use") @@ -171,7 +171,7 @@ export function registerBrowserActionInputCommands( browser .command("type") .description("Type into an element by ref from snapshot") - .argument("", "Ref id from ai snapshot") + .argument("", "Ref id from snapshot") .argument("", "Text to type") .option("--submit", "Press Enter after typing", false) .option("--slowly", "Type slowly (human-like)", false) @@ -243,7 +243,7 @@ export function registerBrowserActionInputCommands( browser .command("hover") .description("Hover an element by ai ref") - .argument("", "Ref id from ai snapshot") + .argument("", "Ref id from snapshot") .option("--target-id ", "CDP target id (or unique prefix)") .action(async (ref: string, opts, cmd) => { const parent = 
parentOpts(cmd); @@ -305,7 +305,7 @@ export function registerBrowserActionInputCommands( browser .command("select") .description("Select option(s) in a select element") - .argument("", "Ref id from ai snapshot") + .argument("", "Ref id from snapshot") .argument("", "Option values to select") .option("--target-id ", "CDP target id (or unique prefix)") .action(async (ref: string, values: string[], opts, cmd) => { @@ -338,7 +338,7 @@ export function registerBrowserActionInputCommands( .command("upload") .description("Arm file upload for the next file chooser") .argument("", "File paths to upload") - .option("--ref ", "Ref id from ai snapshot to click after arming") + .option("--ref ", "Ref id from snapshot to click after arming") .option("--input-ref ", "Ref id for to set directly") .option("--element ", "CSS selector for ") .option("--target-id ", "CDP target id (or unique prefix)") @@ -490,7 +490,7 @@ export function registerBrowserActionInputCommands( .command("evaluate") .description("Evaluate a function against the page or a ref") .option("--fn ", "Function source, e.g. 
(el) => el.textContent") - .option("--ref ", "ARIA ref from ai snapshot") + .option("--ref ", "Ref from snapshot") .option("--target-id ", "CDP target id (or unique prefix)") .action(async (opts, cmd) => { const parent = parentOpts(cmd); diff --git a/src/cli/browser-cli-inspect.ts b/src/cli/browser-cli-inspect.ts index 0bc528bd4..dd110b225 100644 --- a/src/cli/browser-cli-inspect.ts +++ b/src/cli/browser-cli-inspect.ts @@ -55,6 +55,10 @@ export function registerBrowserInspectCommands( .option("--limit ", "Max nodes (default: 500/800)", (v: string) => Number(v), ) + .option("--interactive", "Role snapshot: interactive elements only", false) + .option("--compact", "Role snapshot: compact output", false) + .option("--depth ", "Role snapshot: max depth", (v: string) => Number(v)) + .option("--selector ", "Role snapshot: scope to CSS selector") .option("--out ", "Write snapshot to a file") .action(async (opts, cmd) => { const parent = parentOpts(cmd); @@ -66,6 +70,10 @@ export function registerBrowserInspectCommands( format, targetId: opts.targetId?.trim() || undefined, limit: Number.isFinite(opts.limit) ? opts.limit : undefined, + interactive: Boolean(opts.interactive) || undefined, + compact: Boolean(opts.compact) || undefined, + depth: Number.isFinite(opts.depth) ? opts.depth : undefined, + selector: opts.selector?.trim() || undefined, profile, });