feat: add selector-based browser actions

This commit is contained in:
Peter Steinberger
2025-12-24 19:52:28 +00:00
parent 523d9ec3c2
commit 27025b71db
11 changed files with 173 additions and 36 deletions

View File

@@ -112,4 +112,4 @@ git commit -m "Add Clawd workspace"
- Canvas UI runs full-screen with native overlays. Avoid placing critical controls in the top-left/top-right/bottom edges; add explicit gutters in the layout and don't rely on safe-area insets. - Canvas UI runs full-screen with native overlays. Avoid placing critical controls in the top-left/top-right/bottom edges; add explicit gutters in the layout and don't rely on safe-area insets.
- For browser-driven verification, use `clawdis browser` (tabs/status/screenshot) with the clawd-managed Chrome profile. - For browser-driven verification, use `clawdis browser` (tabs/status/screenshot) with the clawd-managed Chrome profile.
- For DOM inspection, use `clawdis browser eval|query|dom|snapshot` (and `--json`/`--out` when you need machine output). - For DOM inspection, use `clawdis browser eval|query|dom|snapshot` (and `--json`/`--out` when you need machine output).
- For interactions, use `clawdis browser click|type|hover|drag|select|upload|press|wait|navigate|back|evaluate|run`. - For interactions, use `clawdis browser click|type|hover|drag|select|upload|press|wait|navigate|back|evaluate|run` (click/type accept `--selector`).

View File

@@ -174,7 +174,9 @@ Actions:
- `clawdis browser navigate https://example.com` - `clawdis browser navigate https://example.com`
- `clawdis browser resize 1280 720` - `clawdis browser resize 1280 720`
- `clawdis browser click 12 --double` - `clawdis browser click 12 --double`
- `clawdis browser click --selector 'button.save'`
- `clawdis browser type 23 "hello" --submit` - `clawdis browser type 23 "hello" --submit`
- `clawdis browser type --selector "input[name=q]" "hello"`
- `clawdis browser press Enter` - `clawdis browser press Enter`
- `clawdis browser hover 44` - `clawdis browser hover 44`
- `clawdis browser drag 10 11` - `clawdis browser drag 10 11`
@@ -191,6 +193,7 @@ Notes:
- `upload` and `dialog` are **arming** calls; run them before the click/press that triggers the chooser/dialog. - `upload` and `dialog` are **arming** calls; run them before the click/press that triggers the chooser/dialog.
- The arm default timeout is **2 minutes** (clamped to max 2 minutes); pass `timeoutMs` if you need shorter. - The arm default timeout is **2 minutes** (clamped to max 2 minutes); pass `timeoutMs` if you need shorter.
- `snapshot --format ai` returns AI snapshot markup used for ref-based actions. - `snapshot --format ai` returns AI snapshot markup used for ref-based actions.
- `click`/`type` accept `--selector` to target CSS selectors instead of AI refs.
## Security & privacy notes ## Security & privacy notes

View File

@@ -26,6 +26,7 @@ Core actions:
Notes: Notes:
- Requires `browser.enabled=true` in `~/.clawdis/clawdis.json`. - Requires `browser.enabled=true` in `~/.clawdis/clawdis.json`.
- Uses `browser.controlUrl` unless `controlUrl` is passed explicitly. - Uses `browser.controlUrl` unless `controlUrl` is passed explicitly.
- `act` supports CSS selectors for `click`/`type` via `selector` (use `ref` for AI snapshot targets).
### `clawdis_canvas` ### `clawdis_canvas`
Drive the node Canvas (present, eval, snapshot, A2UI). Drive the node Canvas (present, eval, snapshot, A2UI).

View File

@@ -356,6 +356,7 @@ const BrowserActSchema = Type.Object({
Type.Literal("close"), Type.Literal("close"),
]), ]),
ref: Type.Optional(Type.String()), ref: Type.Optional(Type.String()),
selector: Type.Optional(Type.String()),
targetId: Type.Optional(Type.String()), targetId: Type.Optional(Type.String()),
doubleClick: Type.Optional(Type.Boolean()), doubleClick: Type.Optional(Type.Boolean()),
button: Type.Optional(Type.String()), button: Type.Optional(Type.String()),

View File

@@ -14,7 +14,8 @@ export type BrowserFormField = {
export type BrowserActRequest = export type BrowserActRequest =
| { | {
kind: "click"; kind: "click";
ref: string; ref?: string;
selector?: string;
targetId?: string; targetId?: string;
doubleClick?: boolean; doubleClick?: boolean;
button?: string; button?: string;
@@ -22,7 +23,8 @@ export type BrowserActRequest =
} }
| { | {
kind: "type"; kind: "type";
ref: string; ref?: string;
selector?: string;
text: string; text: string;
targetId?: string; targetId?: string;
submit?: boolean; submit?: boolean;

View File

@@ -29,7 +29,8 @@ function createPage(opts: {
const click = vi.fn().mockResolvedValue(undefined); const click = vi.fn().mockResolvedValue(undefined);
const dblclick = vi.fn().mockResolvedValue(undefined); const dblclick = vi.fn().mockResolvedValue(undefined);
const locator = vi.fn().mockReturnValue({ click, dblclick }); const fill = vi.fn().mockResolvedValue(undefined);
const locator = vi.fn().mockReturnValue({ click, dblclick, fill });
const page = { const page = {
context: () => context, context: () => context,
@@ -44,7 +45,7 @@ function createPage(opts: {
}), }),
}; };
return { page, session, locator, click }; return { page, session, locator, click, fill };
} }
function createBrowser(pages: unknown[]) { function createBrowser(pages: unknown[]) {
@@ -110,6 +111,45 @@ describe("pw-ai", () => {
expect(p1.click).toHaveBeenCalledTimes(1); expect(p1.click).toHaveBeenCalledTimes(1);
}); });
it("clicks a css selector when provided", async () => {
  // Arrange: make the (mocked) CDP connect call resolve to a browser stub
  // built from a single page whose target id is "T1".
  const playwright = await import("playwright-core");
  const target = createPage({ targetId: "T1" });
  const fakeBrowser = createBrowser([target.page]);
  const connectMock = playwright.chromium
    .connectOverCDP as unknown as ReturnType<typeof vi.fn>;
  connectMock.mockResolvedValue(fakeBrowser);

  // Act: click using only a CSS selector (no `ref`).
  const pwAi = await importModule();
  const request = {
    cdpPort: 18792,
    targetId: "T1",
    selector: "button.save",
  };
  await pwAi.clickViaPlaywright(request);

  // Assert: the selector was handed to the locator spy unchanged and the
  // click spy fired exactly once.
  expect(target.locator).toHaveBeenCalledWith("button.save");
  expect(target.click).toHaveBeenCalledTimes(1);
});
it("types via css selector when provided", async () => {
  // Arrange: make the (mocked) CDP connect call resolve to a browser stub
  // built from a single page whose target id is "T1".
  const playwright = await import("playwright-core");
  const target = createPage({ targetId: "T1" });
  const fakeBrowser = createBrowser([target.page]);
  const connectMock = playwright.chromium
    .connectOverCDP as unknown as ReturnType<typeof vi.fn>;
  connectMock.mockResolvedValue(fakeBrowser);

  // Act: type text using only a CSS selector (no `ref`).
  const pwAi = await importModule();
  const request = {
    cdpPort: 18792,
    targetId: "T1",
    selector: "input[name=q]",
    text: "hello",
  };
  await pwAi.typeViaPlaywright(request);

  // Assert: the selector was handed to the locator spy unchanged and the
  // fill spy fired exactly once.
  expect(target.locator).toHaveBeenCalledWith("input[name=q]");
  expect(target.fill).toHaveBeenCalledTimes(1);
});
it("fails with a clear error when _snapshotForAI is missing", async () => { it("fails with a clear error when _snapshotForAI is missing", async () => {
const { chromium } = await import("playwright-core"); const { chromium } = await import("playwright-core");
const p1 = createPage({ targetId: "T1", hasSnapshotForAI: false }); const p1 = createPage({ targetId: "T1", hasSnapshotForAI: false });

View File

@@ -10,6 +10,19 @@ import {
let nextUploadArmId = 0; let nextUploadArmId = 0;
let nextDialogArmId = 0; let nextDialogArmId = 0;
/** Page type accepted by `refLocator`, reused so both lookup paths share it. */
type LocatorPage = Parameters<typeof refLocator>[0];

/**
 * Chooses the locator an action should operate on.
 *
 * A non-blank `selector` takes precedence and is passed to `page.locator`
 * after trimming. Otherwise a non-blank `ref` is trimmed and resolved through
 * `refLocator`. Non-string values are treated the same as blank ones.
 *
 * @param page - Page to build the locator on.
 * @param opts - Target description; at least one of `selector` / `ref` must
 *   contain non-whitespace text.
 * @returns The locator produced by `page.locator` or by `refLocator`.
 * @throws Error("ref or selector is required") when both fields are blank.
 */
function resolveLocator(
  page: LocatorPage,
  opts: { ref?: string; selector?: string },
) {
  // Normalises an optional field to a trimmed string ("" when absent).
  const clean = (value: string | undefined): string =>
    typeof value === "string" ? value.trim() : "";

  const css = clean(opts.selector);
  if (css !== "") {
    return page.locator(css);
  }

  // `ref` is only consulted once the selector is known to be unusable.
  const snapshotRef = clean(opts.ref);
  if (snapshotRef === "") {
    throw new Error("ref or selector is required");
  }
  return refLocator(page, snapshotRef);
}
export async function snapshotAiViaPlaywright(opts: { export async function snapshotAiViaPlaywright(opts: {
cdpPort: number; cdpPort: number;
targetId?: string; targetId?: string;
@@ -41,21 +54,22 @@ export async function snapshotAiViaPlaywright(opts: {
export async function clickViaPlaywright(opts: { export async function clickViaPlaywright(opts: {
cdpPort: number; cdpPort: number;
targetId?: string; targetId?: string;
ref: string; ref?: string;
selector?: string;
doubleClick?: boolean; doubleClick?: boolean;
button?: "left" | "right" | "middle"; button?: "left" | "right" | "middle";
modifiers?: Array<"Alt" | "Control" | "ControlOrMeta" | "Meta" | "Shift">; modifiers?: Array<"Alt" | "Control" | "ControlOrMeta" | "Meta" | "Shift">;
timeoutMs?: number; timeoutMs?: number;
}): Promise<void> { }): Promise<void> {
const ref = String(opts.ref ?? "").trim();
if (!ref) throw new Error("ref is required");
const page = await getPageForTargetId({ const page = await getPageForTargetId({
cdpPort: opts.cdpPort, cdpPort: opts.cdpPort,
targetId: opts.targetId, targetId: opts.targetId,
}); });
ensurePageState(page); ensurePageState(page);
const locator = refLocator(page, ref); const locator = resolveLocator(page, {
ref: opts.ref,
selector: opts.selector,
});
const timeout = Math.max( const timeout = Math.max(
500, 500,
Math.min(60_000, Math.floor(opts.timeoutMs ?? 8000)), Math.min(60_000, Math.floor(opts.timeoutMs ?? 8000)),
@@ -142,18 +156,20 @@ export async function pressKeyViaPlaywright(opts: {
export async function typeViaPlaywright(opts: { export async function typeViaPlaywright(opts: {
cdpPort: number; cdpPort: number;
targetId?: string; targetId?: string;
ref: string; ref?: string;
selector?: string;
text: string; text: string;
submit?: boolean; submit?: boolean;
slowly?: boolean; slowly?: boolean;
timeoutMs?: number; timeoutMs?: number;
}): Promise<void> { }): Promise<void> {
const ref = String(opts.ref ?? "").trim();
if (!ref) throw new Error("ref is required");
const text = String(opts.text ?? ""); const text = String(opts.text ?? "");
const page = await getPageForTargetId(opts); const page = await getPageForTargetId(opts);
ensurePageState(page); ensurePageState(page);
const locator = refLocator(page, ref); const locator = resolveLocator(page, {
ref: opts.ref,
selector: opts.selector,
});
const timeout = Math.max(500, Math.min(60_000, opts.timeoutMs ?? 8000)); const timeout = Math.max(500, Math.min(60_000, opts.timeoutMs ?? 8000));
if (opts.slowly) { if (opts.slowly) {
await locator.click({ timeout }); await locator.click({ timeout });

View File

@@ -139,7 +139,9 @@ export function registerBrowserAgentRoutes(
switch (kind) { switch (kind) {
case "click": { case "click": {
const ref = toStringOrEmpty(body.ref); const ref = toStringOrEmpty(body.ref);
if (!ref) return jsonError(res, 400, "ref is required"); const selector = toStringOrEmpty(body.selector);
if (!ref && !selector)
return jsonError(res, 400, "ref or selector is required");
const doubleClick = toBoolean(body.doubleClick) ?? false; const doubleClick = toBoolean(body.doubleClick) ?? false;
const buttonRaw = toStringOrEmpty(body.button) || ""; const buttonRaw = toStringOrEmpty(body.button) || "";
const button = buttonRaw ? parseClickButton(buttonRaw) : undefined; const button = buttonRaw ? parseClickButton(buttonRaw) : undefined;
@@ -166,32 +168,38 @@ export function registerBrowserAgentRoutes(
const modifiers = modifiersRaw.length const modifiers = modifiersRaw.length
? (modifiersRaw as ClickModifier[]) ? (modifiersRaw as ClickModifier[])
: undefined; : undefined;
await pw.clickViaPlaywright({ const clickRequest: Parameters<typeof pw.clickViaPlaywright>[0] = {
cdpPort, cdpPort,
targetId: tab.targetId, targetId: tab.targetId,
ref,
doubleClick, doubleClick,
button, };
modifiers, if (ref) clickRequest.ref = ref;
}); if (selector) clickRequest.selector = selector;
if (button) clickRequest.button = button;
if (modifiers) clickRequest.modifiers = modifiers;
await pw.clickViaPlaywright(clickRequest);
return res.json({ ok: true, targetId: tab.targetId, url: tab.url }); return res.json({ ok: true, targetId: tab.targetId, url: tab.url });
} }
case "type": { case "type": {
const ref = toStringOrEmpty(body.ref); const ref = toStringOrEmpty(body.ref);
if (!ref) return jsonError(res, 400, "ref is required"); const selector = toStringOrEmpty(body.selector);
if (!ref && !selector)
return jsonError(res, 400, "ref or selector is required");
if (typeof body.text !== "string") if (typeof body.text !== "string")
return jsonError(res, 400, "text is required"); return jsonError(res, 400, "text is required");
const text = body.text; const text = body.text;
const submit = toBoolean(body.submit) ?? false; const submit = toBoolean(body.submit) ?? false;
const slowly = toBoolean(body.slowly) ?? false; const slowly = toBoolean(body.slowly) ?? false;
await pw.typeViaPlaywright({ const typeRequest: Parameters<typeof pw.typeViaPlaywright>[0] = {
cdpPort, cdpPort,
targetId: tab.targetId, targetId: tab.targetId,
ref,
text, text,
submit, submit,
slowly, slowly,
}); };
if (ref) typeRequest.ref = ref;
if (selector) typeRequest.selector = selector;
await pw.typeViaPlaywright(typeRequest);
return res.json({ ok: true, targetId: tab.targetId }); return res.json({ ok: true, targetId: tab.targetId });
} }
case "press": { case "press": {

View File

@@ -318,7 +318,7 @@ describe("browser control server", () => {
}), }),
}).then((r) => r.json())) as { ok: boolean }; }).then((r) => r.json())) as { ok: boolean };
expect(click.ok).toBe(true); expect(click.ok).toBe(true);
expect(pwMocks.clickViaPlaywright).toHaveBeenCalledWith({ expect(pwMocks.clickViaPlaywright).toHaveBeenNthCalledWith(1, {
cdpPort: testPort + 1, cdpPort: testPort + 1,
targetId: "abcd1234", targetId: "abcd1234",
ref: "1", ref: "1",
@@ -327,13 +327,29 @@ describe("browser control server", () => {
modifiers: ["Shift"], modifiers: ["Shift"],
}); });
const clickSelector = (await realFetch(`${base}/act`, {
method: "POST",
headers: { "Content-Type": "application/json" },
body: JSON.stringify({
kind: "click",
selector: "button.save",
}),
}).then((r) => r.json())) as { ok: boolean };
expect(clickSelector.ok).toBe(true);
expect(pwMocks.clickViaPlaywright).toHaveBeenNthCalledWith(2, {
cdpPort: testPort + 1,
targetId: "abcd1234",
selector: "button.save",
doubleClick: false,
});
const type = (await realFetch(`${base}/act`, { const type = (await realFetch(`${base}/act`, {
method: "POST", method: "POST",
headers: { "Content-Type": "application/json" }, headers: { "Content-Type": "application/json" },
body: JSON.stringify({ kind: "type", ref: "1", text: "" }), body: JSON.stringify({ kind: "type", ref: "1", text: "" }),
}).then((r) => r.json())) as { ok: boolean }; }).then((r) => r.json())) as { ok: boolean };
expect(type.ok).toBe(true); expect(type.ok).toBe(true);
expect(pwMocks.typeViaPlaywright).toHaveBeenCalledWith({ expect(pwMocks.typeViaPlaywright).toHaveBeenNthCalledWith(1, {
cdpPort: testPort + 1, cdpPort: testPort + 1,
targetId: "abcd1234", targetId: "abcd1234",
ref: "1", ref: "1",
@@ -342,6 +358,26 @@ describe("browser control server", () => {
slowly: false, slowly: false,
}); });
const typeSelector = (await realFetch(`${base}/act`, {
method: "POST",
headers: { "Content-Type": "application/json" },
body: JSON.stringify({
kind: "type",
selector: "input[name=q]",
text: "hello",
submit: true,
}),
}).then((r) => r.json())) as { ok: boolean };
expect(typeSelector.ok).toBe(true);
expect(pwMocks.typeViaPlaywright).toHaveBeenNthCalledWith(2, {
cdpPort: testPort + 1,
targetId: "abcd1234",
selector: "input[name=q]",
text: "hello",
submit: true,
slowly: false,
});
const press = (await realFetch(`${base}/act`, { const press = (await realFetch(`${base}/act`, {
method: "POST", method: "POST",
headers: { "Content-Type": "application/json" }, headers: { "Content-Type": "application/json" },

View File

@@ -114,15 +114,24 @@ export function registerBrowserActionInputCommands(
browser browser
.command("click") .command("click")
.description("Click an element by ref from an ai snapshot (e.g. 76)") .description("Click an element by ai ref or CSS selector")
.argument("<ref>", "Ref id from ai snapshot") .argument("[ref]", "Ref id from ai snapshot")
.option("--selector <css>", "CSS selector (instead of ref)")
.option("--target-id <id>", "CDP target id (or unique prefix)") .option("--target-id <id>", "CDP target id (or unique prefix)")
.option("--double", "Double click", false) .option("--double", "Double click", false)
.option("--button <left|right|middle>", "Mouse button to use") .option("--button <left|right|middle>", "Mouse button to use")
.option("--modifiers <list>", "Comma-separated modifiers (Shift,Alt,Meta)") .option("--modifiers <list>", "Comma-separated modifiers (Shift,Alt,Meta)")
.action(async (ref: string, opts, cmd) => { .action(async (ref: string | undefined, opts, cmd) => {
const parent = parentOpts(cmd); const parent = parentOpts(cmd);
const baseUrl = resolveBrowserControlUrl(parent?.url); const baseUrl = resolveBrowserControlUrl(parent?.url);
const selector =
typeof opts.selector === "string" ? opts.selector.trim() : "";
const refValue = typeof ref === "string" ? ref.trim() : "";
if (!selector && !refValue) {
defaultRuntime.error(danger("ref or --selector is required"));
defaultRuntime.exit(1);
return;
}
const modifiers = opts.modifiers const modifiers = opts.modifiers
? String(opts.modifiers) ? String(opts.modifiers)
.split(",") .split(",")
@@ -132,7 +141,8 @@ export function registerBrowserActionInputCommands(
try { try {
const result = await browserAct(baseUrl, { const result = await browserAct(baseUrl, {
kind: "click", kind: "click",
ref, ref: refValue || undefined,
selector: selector || undefined,
targetId: opts.targetId?.trim() || undefined, targetId: opts.targetId?.trim() || undefined,
doubleClick: Boolean(opts.double), doubleClick: Boolean(opts.double),
button: opts.button?.trim() || undefined, button: opts.button?.trim() || undefined,
@@ -143,7 +153,11 @@ export function registerBrowserActionInputCommands(
return; return;
} }
const suffix = result.url ? ` on ${result.url}` : ""; const suffix = result.url ? ` on ${result.url}` : "";
defaultRuntime.log(`clicked ref ${ref}${suffix}`); if (selector) {
defaultRuntime.log(`clicked ${selector}${suffix}`);
} else {
defaultRuntime.log(`clicked ref ${refValue}${suffix}`);
}
} catch (err) { } catch (err) {
defaultRuntime.error(danger(String(err))); defaultRuntime.error(danger(String(err)));
defaultRuntime.exit(1); defaultRuntime.exit(1);
@@ -152,19 +166,29 @@ export function registerBrowserActionInputCommands(
browser browser
.command("type") .command("type")
.description("Type into an element by ai ref") .description("Type into an element by ai ref or CSS selector")
.argument("<ref>", "Ref id from ai snapshot") .argument("[ref]", "Ref id from ai snapshot")
.argument("<text>", "Text to type") .argument("<text>", "Text to type")
.option("--selector <css>", "CSS selector (instead of ref)")
.option("--submit", "Press Enter after typing", false) .option("--submit", "Press Enter after typing", false)
.option("--slowly", "Type slowly (human-like)", false) .option("--slowly", "Type slowly (human-like)", false)
.option("--target-id <id>", "CDP target id (or unique prefix)") .option("--target-id <id>", "CDP target id (or unique prefix)")
.action(async (ref: string, text: string, opts, cmd) => { .action(async (ref: string | undefined, text: string, opts, cmd) => {
const parent = parentOpts(cmd); const parent = parentOpts(cmd);
const baseUrl = resolveBrowserControlUrl(parent?.url); const baseUrl = resolveBrowserControlUrl(parent?.url);
const selector =
typeof opts.selector === "string" ? opts.selector.trim() : "";
const refValue = typeof ref === "string" ? ref.trim() : "";
if (!selector && !refValue) {
defaultRuntime.error(danger("ref or --selector is required"));
defaultRuntime.exit(1);
return;
}
try { try {
const result = await browserAct(baseUrl, { const result = await browserAct(baseUrl, {
kind: "type", kind: "type",
ref, ref: refValue || undefined,
selector: selector || undefined,
text, text,
submit: Boolean(opts.submit), submit: Boolean(opts.submit),
slowly: Boolean(opts.slowly), slowly: Boolean(opts.slowly),
@@ -174,7 +198,11 @@ export function registerBrowserActionInputCommands(
defaultRuntime.log(JSON.stringify(result, null, 2)); defaultRuntime.log(JSON.stringify(result, null, 2));
return; return;
} }
defaultRuntime.log(`typed into ref ${ref}`); if (selector) {
defaultRuntime.log(`typed into ${selector}`);
} else {
defaultRuntime.log(`typed into ref ${refValue}`);
}
} catch (err) { } catch (err) {
defaultRuntime.error(danger(String(err))); defaultRuntime.error(danger(String(err)));
defaultRuntime.exit(1); defaultRuntime.exit(1);

View File

@@ -17,7 +17,9 @@ export const browserActionExamples = [
"clawdis browser navigate https://example.com", "clawdis browser navigate https://example.com",
"clawdis browser resize 1280 720", "clawdis browser resize 1280 720",
"clawdis browser click 12 --double", "clawdis browser click 12 --double",
"clawdis browser click --selector 'button.save'",
'clawdis browser type 23 "hello" --submit', 'clawdis browser type 23 "hello" --submit',
'clawdis browser type --selector "input[name=q]" "hello"',
"clawdis browser press Enter", "clawdis browser press Enter",
"clawdis browser hover 44", "clawdis browser hover 44",
"clawdis browser drag 10 11", "clawdis browser drag 10 11",