feat: role snapshot refs for browser

This commit is contained in:
Peter Steinberger
2026-01-12 08:36:20 +00:00
parent 6a7b812513
commit fadad6e061
10 changed files with 480 additions and 19 deletions

View File

@@ -121,6 +121,10 @@ const BrowserToolSchema = Type.Object({
limit: Type.Optional(Type.Number()),
maxChars: Type.Optional(Type.Number()),
format: Type.Optional(Type.Union([Type.Literal("aria"), Type.Literal("ai")])),
interactive: Type.Optional(Type.Boolean()),
compact: Type.Optional(Type.Boolean()),
depth: Type.Optional(Type.Number()),
selector: Type.Optional(Type.String()),
fullPage: Type.Optional(Type.Boolean()),
ref: Type.Optional(Type.String()),
element: Type.Optional(Type.String()),
@@ -336,11 +340,30 @@ export function createBrowserTool(opts?: {
format === "ai"
? (maxChars ?? DEFAULT_AI_SNAPSHOT_MAX_CHARS)
: undefined;
const interactive =
typeof params.interactive === "boolean"
? params.interactive
: undefined;
const compact =
typeof params.compact === "boolean" ? params.compact : undefined;
const depth =
typeof params.depth === "number" && Number.isFinite(params.depth)
? params.depth
: undefined;
const selector =
typeof params.selector === "string"
? params.selector.trim()
: undefined;
// Forward the snapshot options exactly once each; browserSnapshot only
// serializes the ones that are actually set.
const snapshot = await browserSnapshot(baseUrl, {
  format,
  targetId,
  limit,
  ...(resolvedMaxChars ? { maxChars: resolvedMaxChars } : {}),
  interactive,
  compact,
  depth,
  selector,
  profile,
});
if (snapshot.format === "ai") {

View File

@@ -250,6 +250,10 @@ export async function browserSnapshot(
targetId?: string;
limit?: number;
maxChars?: number;
interactive?: boolean;
compact?: boolean;
depth?: number;
selector?: string;
profile?: string;
},
): Promise<SnapshotResult> {
@@ -260,6 +264,12 @@ export async function browserSnapshot(
if (typeof opts.maxChars === "number" && Number.isFinite(opts.maxChars)) {
q.set("maxChars", String(opts.maxChars));
}
if (typeof opts.interactive === "boolean")
q.set("interactive", String(opts.interactive));
if (typeof opts.compact === "boolean") q.set("compact", String(opts.compact));
if (typeof opts.depth === "number" && Number.isFinite(opts.depth))
q.set("depth", String(opts.depth));
if (opts.selector?.trim()) q.set("selector", opts.selector.trim());
if (opts.profile) q.set("profile", opts.profile);
return await fetchBrowserJson<SnapshotResult>(
`${baseUrl}/snapshot?${q.toString()}`,

View File

@@ -24,6 +24,7 @@ export {
selectOptionViaPlaywright,
setInputFilesViaPlaywright,
snapshotAiViaPlaywright,
snapshotRoleViaPlaywright,
takeScreenshotViaPlaywright,
typeViaPlaywright,
waitForViaPlaywright,

View File

@@ -0,0 +1,45 @@
import { describe, expect, it } from "vitest";
import { buildRoleSnapshotFromAriaSnapshot } from "./pw-role-snapshot.js";
// Fixtures below use two spaces per nesting level: depth is derived from the
// leading whitespace width divided by two, so a single space would keep a
// "nested" line at depth 0.
describe("pw-role-snapshot", () => {
  it("adds refs for interactive elements", () => {
    const aria = [
      '- heading "Example" [level=1]',
      "- paragraph: hello",
      '- button "Submit"',
      "  - generic",
      '- link "Learn more"',
    ].join("\n");
    const res = buildRoleSnapshotFromAriaSnapshot(aria, { interactive: true });
    expect(res.snapshot).toContain("[ref=e1]");
    expect(res.snapshot).toContain("[ref=e2]");
    expect(res.snapshot).toContain('- button "Submit" [ref=e1]');
    expect(res.snapshot).toContain('- link "Learn more" [ref=e2]');
    expect(Object.keys(res.refs)).toEqual(["e1", "e2"]);
    expect(res.refs.e1).toMatchObject({ role: "button", name: "Submit" });
    expect(res.refs.e2).toMatchObject({ role: "link", name: "Learn more" });
  });

  it("uses nth only when duplicates exist", () => {
    const aria = ['- button "OK"', '- button "OK"', '- button "Cancel"'].join(
      "\n",
    );
    const res = buildRoleSnapshotFromAriaSnapshot(aria);
    expect(res.snapshot).toContain("[ref=e1]");
    expect(res.snapshot).toContain("[ref=e2] [nth=1]");
    expect(res.refs.e1?.nth).toBe(0);
    expect(res.refs.e2?.nth).toBe(1);
    expect(res.refs.e3?.nth).toBeUndefined();
  });

  it("respects maxDepth", () => {
    // region = depth 0, group = depth 1, button = depth 2 (cut by maxDepth: 1).
    const aria = [
      '- region "Main"',
      "  - group",
      '    - button "Deep"',
    ].join("\n");
    const res = buildRoleSnapshotFromAriaSnapshot(aria, { maxDepth: 1 });
    expect(res.snapshot).toContain('- region "Main"');
    expect(res.snapshot).toContain("  - group");
    expect(res.snapshot).not.toContain("button");
  });
});

View File

@@ -0,0 +1,281 @@
/**
 * What a snapshot ref (e.g. `e1`) stands for: the role, the optional quoted
 * name, and the optional duplicate index recorded when the ref was assigned.
 */
export type RoleRef = {
/** Role token from the snapshot line, lowercased. */
role: string;
/** Quoted name from the snapshot line; absent when the line had none. */
name?: string;
/** Index used only when role+name duplicates exist. */
nth?: number;
};
/** Lookup from ref id (`e1`, `e2`, ...) to the role/name/nth it was assigned for. */
export type RoleRefMap = Record<string, RoleRef>;
/** Filters applied while converting an aria snapshot into a role snapshot. */
export type RoleSnapshotOptions = {
/** Only include interactive elements (buttons, links, inputs, etc.). */
interactive?: boolean;
/** Maximum depth to include (0 = root only). */
maxDepth?: number;
/** Remove unnamed structural elements and empty branches. */
compact?: boolean;
};
/**
 * Roles that always receive a ref, and the only roles kept when the
 * `interactive` option is set.
 */
const INTERACTIVE_ROLES = new Set([
"button",
"link",
"textbox",
"checkbox",
"radio",
"combobox",
"listbox",
"menuitem",
"menuitemcheckbox",
"menuitemradio",
"option",
"searchbox",
"slider",
"spinbutton",
"switch",
"tab",
"treeitem",
]);
/** Roles that receive a ref only when the snapshot line carries a quoted name. */
const CONTENT_ROLES = new Set([
"heading",
"cell",
"gridcell",
"columnheader",
"rowheader",
"listitem",
"article",
"region",
"main",
"navigation",
]);
/** Container-like roles; unnamed lines with these roles are dropped when `compact` is set. */
const STRUCTURAL_ROLES = new Set([
"generic",
"group",
"list",
"table",
"row",
"rowgroup",
"grid",
"treegrid",
"menu",
"menubar",
"toolbar",
"tablist",
"tree",
"directory",
"document",
"application",
"presentation",
"none",
]);
/**
 * Returns the nesting depth of a snapshot line: the number of leading
 * whitespace characters divided by two, rounded down.
 */
function getIndentLevel(line: string): number {
  const leading = /^\s*/.exec(line);
  const width = leading === null ? 0 : leading[0].length;
  return Math.floor(width / 2);
}
/**
 * Bookkeeping for role+name collisions while refs are being assigned.
 */
type RoleNameTracker = {
/** How many times each role+name key has been handed an index so far. */
counts: Map<string, number>;
/** Ref ids recorded per role+name key, in the order they were tracked. */
refsByKey: Map<string, string[]>;
/** Builds the `role:name` key (name defaults to the empty string). */
getKey: (role: string, name?: string) => string;
/** Returns the next 0-based occurrence index for the key and advances the count. */
getNextIndex: (role: string, name?: string) => number;
/** Records that `ref` was assigned to the given role+name. */
trackRef: (role: string, name: string | undefined, ref: string) => void;
/** Keys that have more than one tracked ref. */
getDuplicateKeys: () => Set<string>;
};
/**
 * Creates a fresh tracker with empty occurrence counts and ref lists.
 *
 * The returned object exposes both maps directly, plus helpers that key them
 * by `role:name` (a missing name is treated as the empty string).
 */
function createRoleNameTracker(): RoleNameTracker {
  const counts = new Map<string, number>();
  const refsByKey = new Map<string, string[]>();

  return {
    counts,
    refsByKey,

    getKey(role: string, name?: string) {
      return [role, name ?? ""].join(":");
    },

    getNextIndex(role: string, name?: string) {
      const bucket = this.getKey(role, name);
      const seenSoFar = counts.get(bucket) ?? 0;
      counts.set(bucket, seenSoFar + 1);
      // The index handed out is the count *before* this occurrence (0-based).
      return seenSoFar;
    },

    trackRef(role: string, name: string | undefined, ref: string) {
      const bucket = this.getKey(role, name);
      const assigned = refsByKey.get(bucket);
      if (assigned) {
        assigned.push(ref);
      } else {
        refsByKey.set(bucket, [ref]);
      }
    },

    getDuplicateKeys() {
      const duplicated = new Set<string>();
      refsByKey.forEach((assigned, bucket) => {
        if (assigned.length >= 2) duplicated.add(bucket);
      });
      return duplicated;
    },
  };
}
/**
 * Strips the `nth` property from every ref whose role+name key is not
 * reported as a duplicate by the tracker. Mutates the entries of `refs` in place.
 */
function removeNthFromNonDuplicates(
  refs: RoleRefMap,
  tracker: RoleNameTracker,
): void {
  const duplicateKeys = tracker.getDuplicateKeys();
  Object.values(refs).forEach((entry) => {
    const isDuplicated = duplicateKeys.has(
      tracker.getKey(entry.role, entry.name),
    );
    if (isDuplicated) return;
    delete entry.nth;
  });
}
/**
 * Prunes a rendered role tree. A line survives when it
 *  - carries a `[ref=` marker, or
 *  - contains a ":" that is not the last non-whitespace character, or
 *  - has a more deeply indented line below it, before the next line at the
 *    same or shallower depth, that carries a `[ref=` marker.
 * Every other line is dropped.
 */
function compactTree(tree: string) {
  const rows = tree.split("\n");

  const kept = rows.filter((row, index) => {
    if (row.includes("[ref=")) return true;
    // A colon followed by more text is kept; a bare trailing colon is not enough.
    if (row.includes(":") && !row.trimEnd().endsWith(":")) return true;

    const ownLevel = getIndentLevel(row);
    for (let next = index + 1; next < rows.length; next += 1) {
      const candidate = rows[next];
      // Reaching a sibling or ancestor ends this row's subtree.
      if (getIndentLevel(candidate) <= ownLevel) return false;
      if (candidate.includes("[ref=")) return true;
    }
    return false;
  });

  return kept.join("\n");
}
/**
 * Transforms a single aria-snapshot line for the non-interactive tree output.
 *
 * @param line - Raw snapshot line.
 * @param refs - Ref map; receives an entry when this line is assigned a ref.
 * @param options - Depth / interactive / compact filters.
 * @param tracker - Role+name occurrence tracker shared across the snapshot.
 * @param nextRef - Generator for the next ref id.
 * @returns The line to emit (possibly annotated with `[ref=…]` / `[nth=…]`),
 *   or `null` when the line is filtered out.
 */
function processLine(
  line: string,
  refs: RoleRefMap,
  options: RoleSnapshotOptions,
  tracker: RoleNameTracker,
  nextRef: () => string,
): string | null {
  const { interactive, compact, maxDepth } = options;
  if (maxDepth !== undefined && getIndentLevel(line) > maxDepth) return null;

  // Lines that cannot be parsed pass through untouched unless only
  // interactive elements were requested.
  const passthrough = interactive ? null : line;

  const parts = /^(\s*-\s*)(\w+)(?:\s+"([^"]*)")?(.*)$/.exec(line);
  if (parts === null) return passthrough;

  const [, prefix, roleRaw, name, suffix] = parts;
  if (roleRaw.startsWith("/")) return passthrough;

  const role = roleRaw.toLowerCase();
  const isInteractive = INTERACTIVE_ROLES.has(role);

  if (interactive && !isInteractive) return null;
  if (compact && !name && STRUCTURAL_ROLES.has(role)) return null;

  // Interactive roles always get a ref; content roles only when named.
  const needsRef = isInteractive || (Boolean(name) && CONTENT_ROLES.has(role));
  if (!needsRef) return line;

  const ref = nextRef();
  const nth = tracker.getNextIndex(role, name);
  tracker.trackRef(role, name, ref);
  refs[ref] = { role, name, nth };

  const pieces = [`${prefix}${roleRaw}`];
  if (name) pieces.push(` "${name}"`);
  pieces.push(` [ref=${ref}]`);
  // The first occurrence (nth === 0) is left unannotated in the text output.
  if (nth > 0) pieces.push(` [nth=${nth}]`);
  if (suffix) pieces.push(suffix);
  return pieces.join("");
}
/**
 * Normalizes user-supplied ref text to a bare role ref id.
 *
 * Accepts `e12`, `@e12` and `ref=e12` (surrounding whitespace ignored).
 *
 * @returns The bare id (`e` followed by digits), or `null` when the input is
 *   blank or does not have that shape.
 */
export function parseRoleRef(raw: string): string | null {
  const trimmed = raw.trim();
  if (trimmed === "") return null;

  let candidate = trimmed;
  if (trimmed.startsWith("@")) {
    candidate = trimmed.slice(1);
  } else if (trimmed.startsWith("ref=")) {
    candidate = trimmed.slice(4);
  }

  if (!/^e\d+$/.test(candidate)) return null;
  return candidate;
}
/**
 * Converts an aria snapshot (one `- role "name" …` entry per line) into a
 * role snapshot whose referenceable lines are annotated with `[ref=eN]`.
 *
 * Two output shapes exist:
 *  - `options.interactive`: a flat list containing only interactive roles.
 *  - otherwise: the original tree, with refs added to interactive roles and
 *    to named content roles, optionally pruned by `options.compact`.
 *
 * `options.maxDepth` drops lines nested deeper than the given level in both
 * shapes. `nth` is kept in the returned refs only for role+name combinations
 * that occur more than once.
 *
 * @param ariaSnapshot - Newline-separated aria snapshot text.
 * @param options - Filters; defaults to no filtering.
 * @returns The rendered snapshot text and the ref lookup table. The text is
 *   never empty: a placeholder is returned when nothing remains.
 */
export function buildRoleSnapshotFromAriaSnapshot(
  ariaSnapshot: string,
  options: RoleSnapshotOptions = {},
): { snapshot: string; refs: RoleRefMap } {
  const lines = ariaSnapshot.split("\n");
  const refs: RoleRefMap = {};
  const tracker = createRoleNameTracker();

  let counter = 0;
  const nextRef = () => {
    counter += 1;
    return `e${counter}`;
  };

  if (options.interactive) {
    const result: string[] = [];
    for (const line of lines) {
      const depth = getIndentLevel(line);
      if (options.maxDepth !== undefined && depth > options.maxDepth) continue;

      // The role token must be a plain word; anything else (for example a
      // token starting with "/") fails this match and is skipped.
      const match = line.match(/^(\s*-\s*)(\w+)(?:\s+"([^"]*)")?(.*)$/);
      if (!match) continue;
      const [, , roleRaw, name, suffix] = match;

      const role = roleRaw.toLowerCase();
      if (!INTERACTIVE_ROLES.has(role)) continue;

      const ref = nextRef();
      const nth = tracker.getNextIndex(role, name);
      tracker.trackRef(role, name, ref);
      refs[ref] = { role, name, nth };

      // Interactive output is flattened: original indentation is discarded.
      let enhanced = `- ${roleRaw}`;
      if (name) enhanced += ` "${name}"`;
      enhanced += ` [ref=${ref}]`;
      if (nth > 0) enhanced += ` [nth=${nth}]`;
      // Only bracketed attributes from the remainder are carried over.
      if (suffix.includes("[")) enhanced += suffix;
      result.push(enhanced);
    }

    removeNthFromNonDuplicates(refs, tracker);
    return {
      snapshot: result.join("\n") || "(no interactive elements)",
      refs,
    };
  }

  const result: string[] = [];
  for (const line of lines) {
    const processed = processLine(line, refs, options, tracker, nextRef);
    if (processed !== null) result.push(processed);
  }
  removeNthFromNonDuplicates(refs, tracker);

  // Compact first, then apply the placeholder: compaction removes any line
  // without a ref, inline text, or ref-bearing descendants, so a placeholder
  // inserted beforehand would itself be pruned and the caller would get "".
  const tree = result.join("\n");
  const snapshot = options.compact ? compactTree(tree) : tree;
  return {
    snapshot: snapshot || "(empty)",
    refs,
  };
}

View File

@@ -39,6 +39,11 @@ type PageState = {
console: BrowserConsoleMessage[];
armIdUpload: number;
armIdDialog: number;
/**
* Role-based refs from the last role snapshot (e.g. e1/e2).
* These refs are NOT Playwright's `aria-ref` values.
*/
roleRefs?: Record<string, { role: string; name?: string; nth?: number }>;
};
const pageStates = new WeakMap<Page, PageState>();
@@ -189,7 +194,27 @@ export async function getPageForTargetId(opts: {
}
/**
 * Resolves a snapshot ref to a Playwright locator.
 *
 * A leading `@` or `ref=` is stripped first. When the ref has the `e<digits>`
 * shape and this page has refs stored from a role snapshot, the ref is
 * resolved through `getByRole` using the recorded role, exact name and
 * (for duplicates) `nth` index. Every other ref is resolved with Playwright's
 * `aria-ref=` selector engine.
 *
 * @throws Error when the page has stored role refs but none matches `ref`.
 */
export function refLocator(page: Page, ref: string) {
  const normalized = ref.startsWith("@")
    ? ref.slice(1)
    : ref.startsWith("ref=")
      ? ref.slice(4)
      : ref;

  if (/^e\d+$/.test(normalized)) {
    // NOTE(review): refs produced by other snapshot formats can share the
    // e<digits> shape, so only treat the ref as a role ref when a role
    // snapshot has actually been stored for this page — confirm whether the
    // stored refs should also be cleared when a non-role snapshot is taken.
    const roleRefs = pageStates.get(page)?.roleRefs;
    if (roleRefs) {
      const info = roleRefs[normalized];
      if (!info) {
        throw new Error(
          `Unknown ref "${normalized}". Run a new snapshot and use a ref from that snapshot.`,
        );
      }
      const locator = info.name
        ? page.getByRole(info.role as never, { name: info.name, exact: true })
        : page.getByRole(info.role as never);
      // nth is only present when several elements share the same role+name.
      return info.nth !== undefined ? locator.nth(info.nth) : locator;
    }
  }

  return page.locator(`aria-ref=${normalized}`);
}
export async function closePlaywrightBrowserConnection(): Promise<void> {

View File

@@ -1,4 +1,9 @@
import type { BrowserFormField } from "./client-actions-core.js";
import {
buildRoleSnapshotFromAriaSnapshot,
parseRoleRef,
type RoleSnapshotOptions,
} from "./pw-role-snapshot.js";
import {
type BrowserConsoleMessage,
ensurePageState,
@@ -11,7 +16,9 @@ let nextUploadArmId = 0;
let nextDialogArmId = 0;
/**
 * Validates and normalizes a ref argument.
 *
 * Non-string input is treated as empty. Role-style refs (`e1`, `@e1`,
 * `ref=e1`) are reduced to their bare id via `parseRoleRef`; any other value
 * is returned trimmed, with a single leading `@` removed.
 *
 * @throws Error when no ref remains after normalization.
 */
function requireRef(value: unknown): string {
  const raw = typeof value === "string" ? value.trim() : "";
  const roleRef = raw ? parseRoleRef(raw) : null;
  const ref = roleRef ?? (raw.startsWith("@") ? raw.slice(1) : raw);
  if (!ref) throw new Error("ref is required");
  return ref;
}
@@ -55,6 +62,31 @@ export async function snapshotAiViaPlaywright(opts: {
return { snapshot };
}
/**
 * Takes a role snapshot of a page (or of the element matched by `selector`)
 * and stores the resulting refs on the page state so later ref-based actions
 * can resolve them.
 *
 * @param opts.cdpUrl - CDP endpoint used to find the page.
 * @param opts.targetId - Optional target to snapshot.
 * @param opts.selector - Optional selector scoping the snapshot; blank means the document root.
 * @param opts.options - Role snapshot filters.
 * @returns The rendered snapshot text (the refs are kept on the page state, not returned).
 */
export async function snapshotRoleViaPlaywright(opts: {
  cdpUrl: string;
  targetId?: string;
  selector?: string;
  options?: RoleSnapshotOptions;
}): Promise<{ snapshot: string }> {
  const { cdpUrl, targetId } = opts;
  const page = await getPageForTargetId({ cdpUrl, targetId });
  const state = ensurePageState(page);

  const scope = opts.selector?.trim();
  const locator = page.locator(scope ? scope : ":root");
  const rawAria = await locator.ariaSnapshot();

  const { snapshot, refs } = buildRoleSnapshotFromAriaSnapshot(
    String(rawAria ?? ""),
    opts.options,
  );
  // Replace (not merge) the stored refs: ids restart at e1 on every snapshot.
  state.roleRefs = refs;
  return { snapshot };
}
export async function clickViaPlaywright(opts: {
cdpUrl: string;
targetId?: string;
@@ -95,8 +127,7 @@ export async function hoverViaPlaywright(opts: {
ref: string;
timeoutMs?: number;
}): Promise<void> {
const ref = String(opts.ref ?? "").trim();
if (!ref) throw new Error("ref is required");
const ref = requireRef(opts.ref);
const page = await getPageForTargetId(opts);
ensurePageState(page);
await refLocator(page, ref).hover({
@@ -111,8 +142,8 @@ export async function dragViaPlaywright(opts: {
endRef: string;
timeoutMs?: number;
}): Promise<void> {
const startRef = String(opts.startRef ?? "").trim();
const endRef = String(opts.endRef ?? "").trim();
const startRef = requireRef(opts.startRef);
const endRef = requireRef(opts.endRef);
if (!startRef || !endRef) throw new Error("startRef and endRef are required");
const page = await getPageForTargetId(opts);
ensurePageState(page);
@@ -128,8 +159,7 @@ export async function selectOptionViaPlaywright(opts: {
values: string[];
timeoutMs?: number;
}): Promise<void> {
const ref = String(opts.ref ?? "").trim();
if (!ref) throw new Error("ref is required");
const ref = requireRef(opts.ref);
if (!opts.values?.length) throw new Error("values are required");
const page = await getPageForTargetId(opts);
ensurePageState(page);

View File

@@ -573,17 +573,55 @@ export function registerBrowserAgentRoutes(
maxCharsRaw > 0
? Math.floor(maxCharsRaw)
: undefined;
const interactive = toBoolean(req.query.interactive);
const compact = toBoolean(req.query.compact);
const depth = toNumber(req.query.depth);
const selector = toStringOrEmpty(req.query.selector);
try {
const tab = await profileCtx.ensureTabAvailable(targetId || undefined);
if (format === "ai") {
const pw = await requirePwAi(res, "ai snapshot");
if (!pw) return;
const snap = await pw.snapshotAiViaPlaywright({
cdpUrl: profileCtx.profile.cdpUrl,
targetId: tab.targetId,
...(maxChars ? { maxChars } : {}),
});
const wantsRoleSnapshot =
interactive === true ||
compact === true ||
depth !== undefined ||
Boolean(selector.trim());
const snap = wantsRoleSnapshot
? await pw.snapshotRoleViaPlaywright({
cdpUrl: profileCtx.profile.cdpUrl,
targetId: tab.targetId,
selector: selector.trim() || undefined,
options: {
interactive: interactive ?? undefined,
compact: compact ?? undefined,
maxDepth: depth ?? undefined,
},
})
: await pw
.snapshotAiViaPlaywright({
cdpUrl: profileCtx.profile.cdpUrl,
targetId: tab.targetId,
...(maxChars ? { maxChars } : {}),
})
.catch(async (err) => {
// Public-API fallback when Playwright's private _snapshotForAI is missing.
if (String(err).toLowerCase().includes("_snapshotforai")) {
return await pw.snapshotRoleViaPlaywright({
cdpUrl: profileCtx.profile.cdpUrl,
targetId: tab.targetId,
selector: selector.trim() || undefined,
options: {
interactive: interactive ?? undefined,
compact: compact ?? undefined,
maxDepth: depth ?? undefined,
},
});
}
throw err;
});
return res.json({
ok: true,
format,

View File

@@ -122,7 +122,7 @@ export function registerBrowserActionInputCommands(
browser
.command("click")
.description("Click an element by ref from snapshot")
.argument("<ref>", "Ref id from ai snapshot")
.argument("<ref>", "Ref id from snapshot")
.option("--target-id <id>", "CDP target id (or unique prefix)")
.option("--double", "Double click", false)
.option("--button <left|right|middle>", "Mouse button to use")
@@ -171,7 +171,7 @@ export function registerBrowserActionInputCommands(
browser
.command("type")
.description("Type into an element by ref from snapshot")
.argument("<ref>", "Ref id from ai snapshot")
.argument("<ref>", "Ref id from snapshot")
.argument("<text>", "Text to type")
.option("--submit", "Press Enter after typing", false)
.option("--slowly", "Type slowly (human-like)", false)
@@ -243,7 +243,7 @@ export function registerBrowserActionInputCommands(
browser
.command("hover")
.description("Hover an element by ai ref")
.argument("<ref>", "Ref id from ai snapshot")
.argument("<ref>", "Ref id from snapshot")
.option("--target-id <id>", "CDP target id (or unique prefix)")
.action(async (ref: string, opts, cmd) => {
const parent = parentOpts(cmd);
@@ -305,7 +305,7 @@ export function registerBrowserActionInputCommands(
browser
.command("select")
.description("Select option(s) in a select element")
.argument("<ref>", "Ref id from ai snapshot")
.argument("<ref>", "Ref id from snapshot")
.argument("<values...>", "Option values to select")
.option("--target-id <id>", "CDP target id (or unique prefix)")
.action(async (ref: string, values: string[], opts, cmd) => {
@@ -338,7 +338,7 @@ export function registerBrowserActionInputCommands(
.command("upload")
.description("Arm file upload for the next file chooser")
.argument("<paths...>", "File paths to upload")
.option("--ref <ref>", "Ref id from ai snapshot to click after arming")
.option("--ref <ref>", "Ref id from snapshot to click after arming")
.option("--input-ref <ref>", "Ref id for <input type=file> to set directly")
.option("--element <selector>", "CSS selector for <input type=file>")
.option("--target-id <id>", "CDP target id (or unique prefix)")
@@ -490,7 +490,7 @@ export function registerBrowserActionInputCommands(
.command("evaluate")
.description("Evaluate a function against the page or a ref")
.option("--fn <code>", "Function source, e.g. (el) => el.textContent")
.option("--ref <id>", "ARIA ref from ai snapshot")
.option("--ref <id>", "Ref from snapshot")
.option("--target-id <id>", "CDP target id (or unique prefix)")
.action(async (opts, cmd) => {
const parent = parentOpts(cmd);

View File

@@ -55,6 +55,10 @@ export function registerBrowserInspectCommands(
.option("--limit <n>", "Max nodes (default: 500/800)", (v: string) =>
Number(v),
)
.option("--interactive", "Role snapshot: interactive elements only", false)
.option("--compact", "Role snapshot: compact output", false)
.option("--depth <n>", "Role snapshot: max depth", (v: string) => Number(v))
.option("--selector <sel>", "Role snapshot: scope to CSS selector")
.option("--out <path>", "Write snapshot to a file")
.action(async (opts, cmd) => {
const parent = parentOpts(cmd);
@@ -66,6 +70,10 @@ export function registerBrowserInspectCommands(
format,
targetId: opts.targetId?.trim() || undefined,
limit: Number.isFinite(opts.limit) ? opts.limit : undefined,
interactive: Boolean(opts.interactive) || undefined,
compact: Boolean(opts.compact) || undefined,
depth: Number.isFinite(opts.depth) ? opts.depth : undefined,
selector: opts.selector?.trim() || undefined,
profile,
});