refactor: make browser actions ref-only

This commit is contained in:
Peter Steinberger
2025-12-26 19:02:19 +00:00
parent f50f18f65a
commit 1236c4dafb
15 changed files with 131 additions and 183 deletions

View File

@@ -341,40 +341,74 @@ async function resolveNodeId(
);
}
// Flat schema for a browser "act" request: a single object shared by every
// action kind. Only `kind` is required; every other field is optional here,
// so this shape does not express which fields a given kind actually needs.
const BrowserActSchema = Type.Object({
// Discriminator naming the action to perform.
kind: Type.Union([
Type.Literal("click"),
Type.Literal("type"),
Type.Literal("press"),
Type.Literal("hover"),
Type.Literal("drag"),
Type.Literal("select"),
Type.Literal("fill"),
Type.Literal("resize"),
Type.Literal("wait"),
Type.Literal("evaluate"),
Type.Literal("close"),
]),
// Element targeting: either a ref or a selector string may be supplied.
ref: Type.Optional(Type.String()),
selector: Type.Optional(Type.String()),
targetId: Type.Optional(Type.String()),
// Click options.
doubleClick: Type.Optional(Type.Boolean()),
button: Type.Optional(Type.String()),
modifiers: Type.Optional(Type.Array(Type.String())),
// Typing options.
text: Type.Optional(Type.String()),
submit: Type.Optional(Type.Boolean()),
slowly: Type.Optional(Type.Boolean()),
// Key name for the "press" kind.
key: Type.Optional(Type.String()),
// Source and destination refs for the "drag" kind.
startRef: Type.Optional(Type.String()),
endRef: Type.Optional(Type.String()),
values: Type.Optional(Type.Array(Type.String())),
// Form fields are accepted as free-form string-keyed records.
fields: Type.Optional(Type.Array(Type.Record(Type.String(), Type.Unknown()))),
width: Type.Optional(Type.Number()),
height: Type.Optional(Type.Number()),
timeMs: Type.Optional(Type.Number()),
textGone: Type.Optional(Type.String()),
// Function source as a string for the "evaluate" kind.
fn: Type.Optional(Type.String()),
});
// Schema for a browser "act" request, modelled as a union with one object
// variant per action `kind`. Each variant lists the fields that kind requires
// (plain types) and the ones it merely accepts (Type.Optional). No variant
// declares a `selector` field; element-targeting kinds take `ref` instead.
// `targetId` is optional on every variant.
const BrowserActSchema = Type.Union([
// click: requires `ref`; click options are optional.
Type.Object({
kind: Type.Literal("click"),
ref: Type.String(),
targetId: Type.Optional(Type.String()),
doubleClick: Type.Optional(Type.Boolean()),
button: Type.Optional(Type.String()),
modifiers: Type.Optional(Type.Array(Type.String())),
}),
// type: requires both `ref` and `text`.
Type.Object({
kind: Type.Literal("type"),
ref: Type.String(),
text: Type.String(),
targetId: Type.Optional(Type.String()),
submit: Type.Optional(Type.Boolean()),
slowly: Type.Optional(Type.Boolean()),
}),
// press: requires `key`; takes no element ref.
Type.Object({
kind: Type.Literal("press"),
key: Type.String(),
targetId: Type.Optional(Type.String()),
}),
// hover: requires `ref`.
Type.Object({
kind: Type.Literal("hover"),
ref: Type.String(),
targetId: Type.Optional(Type.String()),
}),
// drag: requires both `startRef` and `endRef`.
Type.Object({
kind: Type.Literal("drag"),
startRef: Type.String(),
endRef: Type.String(),
targetId: Type.Optional(Type.String()),
}),
// select: requires `ref` and the list of `values`.
Type.Object({
kind: Type.Literal("select"),
ref: Type.String(),
values: Type.Array(Type.String()),
targetId: Type.Optional(Type.String()),
}),
// fill: requires `fields`, an array of free-form string-keyed records.
Type.Object({
kind: Type.Literal("fill"),
fields: Type.Array(Type.Record(Type.String(), Type.Unknown())),
targetId: Type.Optional(Type.String()),
}),
// resize: requires both `width` and `height`.
Type.Object({
kind: Type.Literal("resize"),
width: Type.Number(),
height: Type.Number(),
targetId: Type.Optional(Type.String()),
}),
// wait: every condition (`timeMs`, `text`, `textGone`) is optional, so the
// schema itself does not force the caller to supply one.
Type.Object({
kind: Type.Literal("wait"),
timeMs: Type.Optional(Type.Number()),
text: Type.Optional(Type.String()),
textGone: Type.Optional(Type.String()),
targetId: Type.Optional(Type.String()),
}),
// evaluate: requires `fn` (a string); `ref` is optional.
Type.Object({
kind: Type.Literal("evaluate"),
fn: Type.String(),
ref: Type.Optional(Type.String()),
targetId: Type.Optional(Type.String()),
}),
// close: no fields beyond the optional `targetId`.
Type.Object({
kind: Type.Literal("close"),
targetId: Type.Optional(Type.String()),
}),
]);
const BrowserToolSchema = Type.Union([
Type.Object({
@@ -514,7 +548,7 @@ function createBrowserTool(): AnyAgentTool {
const format =
params.format === "ai" || params.format === "aria"
? (params.format as "ai" | "aria")
: "aria";
: "ai";
const targetId =
typeof params.targetId === "string"
? params.targetId.trim()

View File

@@ -14,8 +14,7 @@ export type BrowserFormField = {
export type BrowserActRequest =
| {
kind: "click";
ref?: string;
selector?: string;
ref: string;
targetId?: string;
doubleClick?: boolean;
button?: string;
@@ -23,8 +22,7 @@ export type BrowserActRequest =
}
| {
kind: "type";
ref?: string;
selector?: string;
ref: string;
text: string;
targetId?: string;
submit?: boolean;

View File

@@ -111,45 +111,6 @@ describe("pw-ai", () => {
expect(p1.click).toHaveBeenCalledTimes(1);
});
// Checks that clickViaPlaywright, given a `selector`, looks the element up
// through page.locator with that selector and clicks exactly once.
it("clicks a css selector when provided", async () => {
const { chromium } = await import("playwright-core");
const p1 = createPage({ targetId: "T1" });
const browser = createBrowser([p1.page]);
// Make the mocked CDP connection resolve to the fake browser built above.
(
chromium.connectOverCDP as unknown as ReturnType<typeof vi.fn>
).mockResolvedValue(browser);
const mod = await importModule();
await mod.clickViaPlaywright({
cdpPort: 18792,
targetId: "T1",
selector: "button.save",
});
expect(p1.locator).toHaveBeenCalledWith("button.save");
expect(p1.click).toHaveBeenCalledTimes(1);
});
// Checks that typeViaPlaywright, given a `selector`, looks the element up
// through page.locator with that selector and fills it exactly once.
it("types via css selector when provided", async () => {
const { chromium } = await import("playwright-core");
const p1 = createPage({ targetId: "T1" });
const browser = createBrowser([p1.page]);
// Make the mocked CDP connection resolve to the fake browser built above.
(
chromium.connectOverCDP as unknown as ReturnType<typeof vi.fn>
).mockResolvedValue(browser);
const mod = await importModule();
await mod.typeViaPlaywright({
cdpPort: 18792,
targetId: "T1",
selector: "input[name=q]",
text: "hello",
});
expect(p1.locator).toHaveBeenCalledWith("input[name=q]");
expect(p1.fill).toHaveBeenCalledTimes(1);
});
it("fails with a clear error when _snapshotForAI is missing", async () => {
const { chromium } = await import("playwright-core");
const p1 = createPage({ targetId: "T1", hasSnapshotForAI: false });

View File

@@ -10,18 +10,10 @@ import {
let nextUploadArmId = 0;
let nextDialogArmId = 0;
type LocatorPage = Parameters<typeof refLocator>[0];
function resolveLocator(
page: LocatorPage,
opts: { ref?: string; selector?: string },
) {
const selector =
typeof opts.selector === "string" ? opts.selector.trim() : "";
if (selector) return page.locator(selector);
const ref = typeof opts.ref === "string" ? opts.ref.trim() : "";
if (ref) return refLocator(page, ref);
throw new Error("ref or selector is required");
/**
 * Validates an element ref and returns it with surrounding whitespace removed.
 *
 * @param value - Candidate ref; anything that is not a string is rejected.
 * @returns The trimmed ref.
 * @throws Error with message "ref is required" when `value` is not a string
 *   or is empty/whitespace-only after trimming.
 */
function requireRef(value: unknown): string {
  if (typeof value === "string") {
    const trimmed = value.trim();
    if (trimmed.length > 0) {
      return trimmed;
    }
  }
  throw new Error("ref is required");
}
export async function snapshotAiViaPlaywright(opts: {
@@ -55,8 +47,7 @@ export async function snapshotAiViaPlaywright(opts: {
export async function clickViaPlaywright(opts: {
cdpPort: number;
targetId?: string;
ref?: string;
selector?: string;
ref: string;
doubleClick?: boolean;
button?: "left" | "right" | "middle";
modifiers?: Array<"Alt" | "Control" | "ControlOrMeta" | "Meta" | "Shift">;
@@ -67,10 +58,7 @@ export async function clickViaPlaywright(opts: {
targetId: opts.targetId,
});
ensurePageState(page);
const locator = resolveLocator(page, {
ref: opts.ref,
selector: opts.selector,
});
const locator = refLocator(page, requireRef(opts.ref));
const timeout = Math.max(
500,
Math.min(60_000, Math.floor(opts.timeoutMs ?? 8000)),
@@ -157,8 +145,7 @@ export async function pressKeyViaPlaywright(opts: {
export async function typeViaPlaywright(opts: {
cdpPort: number;
targetId?: string;
ref?: string;
selector?: string;
ref: string;
text: string;
submit?: boolean;
slowly?: boolean;
@@ -167,10 +154,7 @@ export async function typeViaPlaywright(opts: {
const text = String(opts.text ?? "");
const page = await getPageForTargetId(opts);
ensurePageState(page);
const locator = resolveLocator(page, {
ref: opts.ref,
selector: opts.selector,
});
const locator = refLocator(page, requireRef(opts.ref));
const timeout = Math.max(500, Math.min(60_000, opts.timeoutMs ?? 8000));
if (opts.slowly) {
await locator.click({ timeout });

View File

@@ -35,6 +35,16 @@ type ActKind =
type ClickButton = "left" | "right" | "middle";
type ClickModifier = "Alt" | "Control" | "ControlOrMeta" | "Meta" | "Shift";
/**
 * Multi-line error text sent with a 400 response when an act request body
 * contains a `selector` key. It points the caller at the ref-based workflow.
 *
 * The first line deliberately contains the unquoted phrase
 * "selector is not supported": the server test matches the response error
 * against /selector is not supported/i, which quotes around the key name
 * would break.
 */
const SELECTOR_UNSUPPORTED_MESSAGE = [
  "Error: selector is not supported. Use 'ref' from snapshot instead.",
  "",
  "Example workflow:",
  "1. snapshot action to get page state with refs",
  '2. act with ref: "e123" to interact with element',
  "",
  "This is more reliable for modern SPAs.",
].join("\n");
function readBody(req: express.Request): Record<string, unknown> {
const body = req.body as Record<string, unknown> | undefined;
if (!body || typeof body !== "object" || Array.isArray(body)) return {};
@@ -113,6 +123,9 @@ export function registerBrowserAgentRoutes(
const body = readBody(req);
const kind = toStringOrEmpty(body.kind) as ActKind;
const targetId = toStringOrEmpty(body.targetId) || undefined;
if (Object.prototype.hasOwnProperty.call(body, "selector")) {
return jsonError(res, 400, SELECTOR_UNSUPPORTED_MESSAGE);
}
if (
kind !== "click" &&
@@ -139,9 +152,7 @@ export function registerBrowserAgentRoutes(
switch (kind) {
case "click": {
const ref = toStringOrEmpty(body.ref);
const selector = toStringOrEmpty(body.selector);
if (!ref && !selector)
return jsonError(res, 400, "ref or selector is required");
if (!ref) return jsonError(res, 400, "ref is required");
const doubleClick = toBoolean(body.doubleClick) ?? false;
const buttonRaw = toStringOrEmpty(body.button) || "";
const button = buttonRaw ? parseClickButton(buttonRaw) : undefined;
@@ -171,10 +182,9 @@ export function registerBrowserAgentRoutes(
const clickRequest: Parameters<typeof pw.clickViaPlaywright>[0] = {
cdpPort,
targetId: tab.targetId,
ref,
doubleClick,
};
if (ref) clickRequest.ref = ref;
if (selector) clickRequest.selector = selector;
if (button) clickRequest.button = button;
if (modifiers) clickRequest.modifiers = modifiers;
await pw.clickViaPlaywright(clickRequest);
@@ -182,9 +192,7 @@ export function registerBrowserAgentRoutes(
}
case "type": {
const ref = toStringOrEmpty(body.ref);
const selector = toStringOrEmpty(body.selector);
if (!ref && !selector)
return jsonError(res, 400, "ref or selector is required");
if (!ref) return jsonError(res, 400, "ref is required");
if (typeof body.text !== "string")
return jsonError(res, 400, "text is required");
const text = body.text;
@@ -193,12 +201,11 @@ export function registerBrowserAgentRoutes(
const typeRequest: Parameters<typeof pw.typeViaPlaywright>[0] = {
cdpPort,
targetId: tab.targetId,
ref,
text,
submit,
slowly,
};
if (ref) typeRequest.ref = ref;
if (selector) typeRequest.selector = selector;
await pw.typeViaPlaywright(typeRequest);
return res.json({ ok: true, targetId: tab.targetId });
}

View File

@@ -327,21 +327,17 @@ describe("browser control server", () => {
modifiers: ["Shift"],
});
const clickSelector = (await realFetch(`${base}/act`, {
const clickSelector = await realFetch(`${base}/act`, {
method: "POST",
headers: { "Content-Type": "application/json" },
body: JSON.stringify({
kind: "click",
selector: "button.save",
}),
}).then((r) => r.json())) as { ok: boolean };
expect(clickSelector.ok).toBe(true);
expect(pwMocks.clickViaPlaywright).toHaveBeenNthCalledWith(2, {
cdpPort: testPort + 1,
targetId: "abcd1234",
selector: "button.save",
doubleClick: false,
});
expect(clickSelector.status).toBe(400);
const clickSelectorBody = (await clickSelector.json()) as { error?: string };
expect(clickSelectorBody.error).toMatch(/selector is not supported/i);
const type = (await realFetch(`${base}/act`, {
method: "POST",
@@ -358,26 +354,6 @@ describe("browser control server", () => {
slowly: false,
});
const typeSelector = (await realFetch(`${base}/act`, {
method: "POST",
headers: { "Content-Type": "application/json" },
body: JSON.stringify({
kind: "type",
selector: "input[name=q]",
text: "hello",
submit: true,
}),
}).then((r) => r.json())) as { ok: boolean };
expect(typeSelector.ok).toBe(true);
expect(pwMocks.typeViaPlaywright).toHaveBeenNthCalledWith(2, {
cdpPort: testPort + 1,
targetId: "abcd1234",
selector: "input[name=q]",
text: "hello",
submit: true,
slowly: false,
});
const press = (await realFetch(`${base}/act`, {
method: "POST",
headers: { "Content-Type": "application/json" },

View File

@@ -114,9 +114,8 @@ export function registerBrowserActionInputCommands(
browser
.command("click")
.description("Click an element by ai ref or CSS selector")
.argument("[ref]", "Ref id from ai snapshot")
.option("--selector <css>", "CSS selector (instead of ref)")
.description("Click an element by ref from snapshot")
.argument("<ref>", "Ref id from ai snapshot")
.option("--target-id <id>", "CDP target id (or unique prefix)")
.option("--double", "Double click", false)
.option("--button <left|right|middle>", "Mouse button to use")
@@ -124,11 +123,9 @@ export function registerBrowserActionInputCommands(
.action(async (ref: string | undefined, opts, cmd) => {
const parent = parentOpts(cmd);
const baseUrl = resolveBrowserControlUrl(parent?.url);
const selector =
typeof opts.selector === "string" ? opts.selector.trim() : "";
const refValue = typeof ref === "string" ? ref.trim() : "";
if (!selector && !refValue) {
defaultRuntime.error(danger("ref or --selector is required"));
if (!refValue) {
defaultRuntime.error(danger("ref is required"));
defaultRuntime.exit(1);
return;
}
@@ -141,8 +138,7 @@ export function registerBrowserActionInputCommands(
try {
const result = await browserAct(baseUrl, {
kind: "click",
ref: refValue || undefined,
selector: selector || undefined,
ref: refValue,
targetId: opts.targetId?.trim() || undefined,
doubleClick: Boolean(opts.double),
button: opts.button?.trim() || undefined,
@@ -153,11 +149,7 @@ export function registerBrowserActionInputCommands(
return;
}
const suffix = result.url ? ` on ${result.url}` : "";
if (selector) {
defaultRuntime.log(`clicked ${selector}${suffix}`);
} else {
defaultRuntime.log(`clicked ref ${refValue}${suffix}`);
}
defaultRuntime.log(`clicked ref ${refValue}${suffix}`);
} catch (err) {
defaultRuntime.error(danger(String(err)));
defaultRuntime.exit(1);
@@ -166,29 +158,25 @@ export function registerBrowserActionInputCommands(
browser
.command("type")
.description("Type into an element by ai ref or CSS selector")
.argument("[ref]", "Ref id from ai snapshot")
.description("Type into an element by ref from snapshot")
.argument("<ref>", "Ref id from ai snapshot")
.argument("<text>", "Text to type")
.option("--selector <css>", "CSS selector (instead of ref)")
.option("--submit", "Press Enter after typing", false)
.option("--slowly", "Type slowly (human-like)", false)
.option("--target-id <id>", "CDP target id (or unique prefix)")
.action(async (ref: string | undefined, text: string, opts, cmd) => {
const parent = parentOpts(cmd);
const baseUrl = resolveBrowserControlUrl(parent?.url);
const selector =
typeof opts.selector === "string" ? opts.selector.trim() : "";
const refValue = typeof ref === "string" ? ref.trim() : "";
if (!selector && !refValue) {
defaultRuntime.error(danger("ref or --selector is required"));
if (!refValue) {
defaultRuntime.error(danger("ref is required"));
defaultRuntime.exit(1);
return;
}
try {
const result = await browserAct(baseUrl, {
kind: "type",
ref: refValue || undefined,
selector: selector || undefined,
ref: refValue,
text,
submit: Boolean(opts.submit),
slowly: Boolean(opts.slowly),
@@ -198,11 +186,7 @@ export function registerBrowserActionInputCommands(
defaultRuntime.log(JSON.stringify(result, null, 2));
return;
}
if (selector) {
defaultRuntime.log(`typed into ${selector}`);
} else {
defaultRuntime.log(`typed into ref ${refValue}`);
}
defaultRuntime.log(`typed into ref ${refValue}`);
} catch (err) {
defaultRuntime.error(danger(String(err)));
defaultRuntime.exit(1);

View File

@@ -9,17 +9,15 @@ export const browserCoreExamples = [
"clawdis browser screenshot",
"clawdis browser screenshot --full-page",
"clawdis browser screenshot --ref 12",
"clawdis browser snapshot",
"clawdis browser snapshot --format aria --limit 200",
"clawdis browser snapshot --format ai",
];
export const browserActionExamples = [
"clawdis browser navigate https://example.com",
"clawdis browser resize 1280 720",
"clawdis browser click 12 --double",
"clawdis browser click --selector 'button.save'",
'clawdis browser type 23 "hello" --submit',
'clawdis browser type --selector "input[name=q]" "hello"',
"clawdis browser press Enter",
"clawdis browser hover 44",
"clawdis browser drag 10 11",

View File

@@ -45,8 +45,8 @@ export function registerBrowserInspectCommands(
browser
.command("snapshot")
.description("Capture an AI-friendly snapshot (aria or ai)")
.option("--format <aria|ai>", "Snapshot format (default: aria)", "aria")
.description("Capture a snapshot (default: ai; aria is the accessibility tree)")
.option("--format <aria|ai>", "Snapshot format (default: ai)", "ai")
.option("--target-id <id>", "CDP target id (or unique prefix)")
.option("--limit <n>", "Max nodes (default: 500/800)", (v: string) =>
Number(v),
@@ -55,7 +55,7 @@ export function registerBrowserInspectCommands(
.action(async (opts, cmd) => {
const parent = parentOpts(cmd);
const baseUrl = resolveBrowserControlUrl(parent?.url);
const format = opts.format === "ai" ? "ai" : "aria";
const format = opts.format === "aria" ? "aria" : "ai";
try {
const result = await browserSnapshot(baseUrl, {
format,