From ba22890205c9213f4cc703837b7284d8ea06d7c4 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sat, 13 Dec 2025 18:48:55 +0000 Subject: [PATCH] feat(browser): add ai snapshot refs + click --- docs/agents.md | 2 + src/browser/client.ts | 30 +++++++- src/browser/pw-ai.test.ts | 143 +++++++++++++++++++++++++++++++++++ src/browser/pw-ai.ts | 153 ++++++++++++++++++++++++++++++++++++++ src/browser/server.ts | 51 ++++++++++++- src/cli/program.ts | 62 +++++++++++++-- 6 files changed, 431 insertions(+), 10 deletions(-) create mode 100644 src/browser/pw-ai.test.ts create mode 100644 src/browser/pw-ai.ts diff --git a/docs/agents.md b/docs/agents.md index 1090d2cf6..1a0c4f67f 100644 --- a/docs/agents.md +++ b/docs/agents.md @@ -90,6 +90,8 @@ RPC mode emits structured tool lifecycle events (start/result) and assistant out If you enable the clawd-managed browser (default on), the agent can use: - `clawdis browser status` / `tabs` / `open <url>` / `screenshot [targetId]` +- `clawdis browser snapshot --format ai` (returns an AI snapshot with `[ref=…]` ids) +- `clawdis browser click <ref>` (click by ref from an AI snapshot) This uses a dedicated Chrome/Chromium profile (lobster-orange by default) so it doesn’t interfere with your daily browser. 
diff --git a/src/browser/client.ts b/src/browser/client.ts index 369496886..670fb2b5a 100644 --- a/src/browser/client.ts +++ b/src/browser/client.ts @@ -103,6 +103,13 @@ export type SnapshotResult = type?: string; value?: string; }>; + } + | { + ok: true; + format: "ai"; + targetId: string; + url: string; + snapshot: string; }; function unwrapCause(err: unknown): unknown { @@ -310,7 +317,7 @@ export async function browserDom( export async function browserSnapshot( baseUrl: string, opts: { - format: "aria" | "domSnapshot"; + format: "aria" | "domSnapshot" | "ai"; targetId?: string; limit?: number; }, @@ -326,3 +333,24 @@ export async function browserSnapshot( }, ); } + +/** POSTs `{ ref, targetId }` as JSON to `${baseUrl}/click` (passing `timeoutMs: 20000` to fetchJson) and returns the server's `{ ok, targetId, url }` reply. */ export async function browserClickRef( + baseUrl: string, + opts: { + ref: string; + targetId?: string; + }, +): Promise<{ ok: true; targetId: string; url: string }> { + return await fetchJson<{ ok: true; targetId: string; url: string }>( + `${baseUrl}/click`, + { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ + ref: opts.ref, + targetId: opts.targetId, + }), + timeoutMs: 20000, + }, + ); +} diff --git a/src/browser/pw-ai.test.ts b/src/browser/pw-ai.test.ts new file mode 100644 index 000000000..787a8a66f --- /dev/null +++ b/src/browser/pw-ai.test.ts @@ -0,0 +1,143 @@ +import { afterEach, describe, expect, it, vi } from "vitest"; + +vi.mock("playwright-core", () => ({ + chromium: { + connectOverCDP: vi.fn(), + }, +})); + +type FakeSession = { + send: ReturnType<typeof vi.fn>; + detach: ReturnType<typeof vi.fn>; +}; + +/** Builds a fake page whose CDP session reports `opts.targetId`; `_snapshotForAI` is omitted when `hasSnapshotForAI` is false. */ function createPage(opts: { + targetId: string; + snapshotFull?: string; + hasSnapshotForAI?: boolean; +}) { + const session: FakeSession = { + send: vi.fn().mockResolvedValue({ + targetInfo: { targetId: opts.targetId }, + }), + detach: vi.fn().mockResolvedValue(undefined), + }; + + const context = { + newCDPSession: vi.fn().mockResolvedValue(session), + }; + + const click = vi.fn().mockResolvedValue(undefined); + const locator = vi.fn().mockReturnValue({ click }); + + 
const page = { + context: () => context, + locator, + ...(opts.hasSnapshotForAI === false + ? {} + : { + _snapshotForAI: vi + .fn() + .mockResolvedValue({ full: opts.snapshotFull ?? "SNAP" }), + }), + }; + + return { page, session, locator, click }; +} + +/** Builds a fake browser exposing a single context that holds `pages`. */ function createBrowser(pages: unknown[]) { + const ctx = { + pages: () => pages, + }; + return { + contexts: () => [ctx], + on: vi.fn(), + close: vi.fn().mockResolvedValue(undefined), + }; +} + +async function importModule() { + return await import("./pw-ai.js"); +} + +afterEach(async () => { + const mod = await importModule(); + await mod.closePlaywrightBrowserConnection(); + vi.clearAllMocks(); +}); + +describe("pw-ai", () => { + it("captures an ai snapshot via Playwright for a specific target", async () => { + const { chromium } = await import("playwright-core"); + const p1 = createPage({ targetId: "T1", snapshotFull: "ONE" }); + const p2 = createPage({ targetId: "T2", snapshotFull: "TWO" }); + const browser = createBrowser([p1.page, p2.page]); + + ( + chromium.connectOverCDP as unknown as ReturnType<typeof vi.fn> + ).mockResolvedValue(browser); + + const mod = await importModule(); + const res = await mod.snapshotAiViaPlaywright({ + cdpPort: 18792, + targetId: "T2", + }); + + expect(res.snapshot).toBe("TWO"); + expect(p1.session.detach).toHaveBeenCalledTimes(1); + expect(p2.session.detach).toHaveBeenCalledTimes(1); + }); + + it("clicks a ref using aria-ref locator", async () => { + const { chromium } = await import("playwright-core"); + const p1 = createPage({ targetId: "T1" }); + const browser = createBrowser([p1.page]); + ( + chromium.connectOverCDP as unknown as ReturnType<typeof vi.fn> + ).mockResolvedValue(browser); + + const mod = await importModule(); + await mod.clickRefViaPlaywright({ + cdpPort: 18792, + targetId: "T1", + ref: "76", + }); + + expect(p1.locator).toHaveBeenCalledWith("aria-ref=76"); + expect(p1.click).toHaveBeenCalledTimes(1); + }); + + it("fails with a clear error when _snapshotForAI is missing", async () => { + 
const { chromium } = await import("playwright-core"); + const p1 = createPage({ targetId: "T1", hasSnapshotForAI: false }); + const browser = createBrowser([p1.page]); + ( + chromium.connectOverCDP as unknown as ReturnType<typeof vi.fn> + ).mockResolvedValue(browser); + + const mod = await importModule(); + await expect( + mod.snapshotAiViaPlaywright({ cdpPort: 18792, targetId: "T1" }), + ).rejects.toThrow(/_snapshotForAI/i); + }); + + it("reuses the CDP connection for repeated calls", async () => { + const { chromium } = await import("playwright-core"); + const p1 = createPage({ targetId: "T1", snapshotFull: "ONE" }); + const browser = createBrowser([p1.page]); + const connect = chromium.connectOverCDP as unknown as ReturnType< + typeof vi.fn + >; + connect.mockResolvedValue(browser); + + const mod = await importModule(); + await mod.snapshotAiViaPlaywright({ cdpPort: 18792, targetId: "T1" }); + await mod.clickRefViaPlaywright({ + cdpPort: 18792, + targetId: "T1", + ref: "1", + }); + + expect(connect).toHaveBeenCalledTimes(1); + }); +}); diff --git a/src/browser/pw-ai.ts b/src/browser/pw-ai.ts new file mode 100644 index 000000000..57d8168dd --- /dev/null +++ b/src/browser/pw-ai.ts @@ -0,0 +1,153 @@ +import type { Browser, Page } from "playwright-core"; +import { chromium } from "playwright-core"; + +type SnapshotForAIResult = { full: string; incremental?: string }; +type SnapshotForAIOptions = { timeout?: number; track?: string }; + +type WithSnapshotForAI = { + _snapshotForAI?: ( + options?: SnapshotForAIOptions, + ) => Promise<SnapshotForAIResult>; +}; + +type TargetInfoResponse = { + targetInfo?: { + targetId?: string; + }; +}; + +type ConnectedBrowser = { + browser: Browser; + endpoint: string; +}; + +let cached: ConnectedBrowser | null = null; +let connecting: Promise<ConnectedBrowser> | null = null; + +function endpointForCdpPort(cdpPort: number) { + return `http://127.0.0.1:${cdpPort}`; +} + +/** Returns the cached CDP connection when its endpoint matches; otherwise awaits the in-flight connect or starts a new one (5000 ms connect timeout) and caches it. The cache entry is cleared when that browser emits "disconnected". NOTE(review): an in-flight connect is returned without comparing its endpoint, and a cached browser for a different endpoint is replaced without being closed. */ async function connectBrowser(endpoint: string): Promise<ConnectedBrowser> { + if (cached?.endpoint === endpoint) return cached; + if 
(connecting) return await connecting; + + connecting = chromium + .connectOverCDP(endpoint, { timeout: 5000 }) + .then((browser) => { + const connected: ConnectedBrowser = { browser, endpoint }; + cached = connected; + browser.on("disconnected", () => { + if (cached?.browser === browser) cached = null; + }); + return connected; + }) + .finally(() => { + connecting = null; + }); + + return await connecting; +} + +/** Flattens the pages of every browser context into one list. */ async function getAllPages(browser: Browser): Promise<Page[]> { + const contexts = browser.contexts(); + const pages = contexts.flatMap((c) => c.pages()); + return pages; +} + +/** Reads the page's CDP target id via `Target.getTargetInfo` on a temporary CDP session; returns null when the id is empty. The session is detached in all cases and detach errors are ignored. */ async function pageTargetId(page: Page): Promise<string | null> { + const session = await page.context().newCDPSession(page); + try { + const info = (await session.send( + "Target.getTargetInfo", + )) as TargetInfoResponse; + const targetId = String(info?.targetInfo?.targetId ?? "").trim(); + return targetId || null; + } finally { + await session.detach().catch(() => {}); + } +} + +/** Checks pages one at a time and returns the first whose target id equals `targetId` exactly, or null; a failed id lookup counts as no match for that page. */ async function findPageByTargetId( + browser: Browser, + targetId: string, +): Promise<Page | null> { + const pages = await getAllPages(browser); + for (const page of pages) { + const tid = await pageTargetId(page).catch(() => null); + if (tid && tid === targetId) return page; + } + return null; +} + +/** Connects to the browser at 127.0.0.1:`cdpPort` and returns the page for `targetId`, or the first page when `targetId` is omitted. Throws when the browser has no pages or no page matches. */ async function getPageForTargetId(opts: { + cdpPort: number; + targetId?: string; +}): Promise<Page> { + const endpoint = endpointForCdpPort(opts.cdpPort); + const { browser } = await connectBrowser(endpoint); + const pages = await getAllPages(browser); + if (!pages.length) + throw new Error("No pages available in the connected browser."); + const first = pages[0]; + if (!opts.targetId) return first; + const found = await findPageByTargetId(browser, opts.targetId); + if (!found) throw new Error("tab not found"); + return found; +} + +/** Captures an AI snapshot through the page's private `_snapshotForAI` method and returns its `full` text as `snapshot`. `timeoutMs` defaults to 5000 and is clamped to 500..60000. Throws when `_snapshotForAI` is not present on the page. */ export async function snapshotAiViaPlaywright(opts: { + cdpPort: number; + targetId?: string; + timeoutMs?: number; +}): Promise<{ snapshot: string }> { + const page = await getPageForTargetId({ + cdpPort: opts.cdpPort, 
+ targetId: opts.targetId, + }); + + const maybe = page as unknown as WithSnapshotForAI; + if (!maybe._snapshotForAI) { + throw new Error( + "Playwright _snapshotForAI is not available. Upgrade playwright-core.", + ); + } + + const result = await maybe._snapshotForAI({ + timeout: Math.max( + 500, + Math.min(60_000, Math.floor(opts.timeoutMs ?? 5000)), + ), + track: "response", + }); + return { snapshot: String(result?.full ?? "") }; +} + +/** Clicks the element located by `aria-ref=<ref>` on the selected page. `timeoutMs` defaults to 8000 and is clamped to 500..60000. Throws "ref is required" when `ref` is blank after trimming. */ export async function clickRefViaPlaywright(opts: { + cdpPort: number; + targetId?: string; + ref: string; + timeoutMs?: number; +}): Promise<void> { + const ref = String(opts.ref ?? "").trim(); + if (!ref) throw new Error("ref is required"); + + const page = await getPageForTargetId({ + cdpPort: opts.cdpPort, + targetId: opts.targetId, + }); + + await page.locator(`aria-ref=${ref}`).click({ + timeout: Math.max( + 500, + Math.min(60_000, Math.floor(opts.timeoutMs ?? 8000)), + ), + }); +} + +/** Clears the cached connection and closes its browser, ignoring close errors; does nothing when no connection is cached. */ export async function closePlaywrightBrowserConnection(): Promise<void> { + const cur = cached; + cached = null; + if (!cur) return; + await cur.browser.close().catch(() => {}); +} diff --git a/src/browser/server.ts b/src/browser/server.ts index 7b57718d8..2158c4eca 100644 --- a/src/browser/server.ts +++ b/src/browser/server.ts @@ -26,6 +26,11 @@ import { resolveBrowserConfig, shouldStartLocalBrowserServer, } from "./config.js"; +import { + clickRefViaPlaywright, + closePlaywrightBrowserConnection, + snapshotAiViaPlaywright, +} from "./pw-ai.js"; import { DEFAULT_BROWSER_SCREENSHOT_MAX_BYTES, DEFAULT_BROWSER_SCREENSHOT_MAX_SIDE, @@ -522,13 +527,32 @@ export async function startBrowserControlServerFromConfig( if (!state) return jsonError(res, 503, "browser server not started"); const targetId = typeof req.query.targetId === "string" ? req.query.targetId.trim() : ""; - const format = req.query.format === "domSnapshot" ? "domSnapshot" : "aria"; + const format = + req.query.format === "domSnapshot" + ? "domSnapshot" + : req.query.format === "ai" + ? 
"ai" + : "aria"; const limit = typeof req.query.limit === "string" ? Number(req.query.limit) : undefined; try { const tab = await ensureTabAvailable(runtime, targetId || undefined); + /* The ai format goes through the Playwright CDP connection; `limit` is not passed to it. */ if (format === "ai") { + const snap = await snapshotAiViaPlaywright({ + cdpPort: state.cdpPort, + targetId: tab.targetId, + }); + return res.json({ + ok: true, + format, + targetId: tab.targetId, + url: tab.url, + ...snap, + }); + } + if (format === "aria") { const snap = await snapshotAria({ wsUrl: tab.wsUrl ?? "", @@ -561,6 +585,30 @@ export async function startBrowserControlServerFromConfig( } }); + /* POST /click: clicks the element for `ref` on the resolved tab. Replies 503 when the server state is unset, 400 when `ref` is blank, a mapped tab error when mapTabError recognises the failure, and 500 otherwise. */ app.post("/click", async (req, res) => { + if (!state) return jsonError(res, 503, "browser server not started"); + const ref = String((req.body as { ref?: unknown })?.ref ?? "").trim(); + const targetId = String( + (req.body as { targetId?: unknown })?.targetId ?? "", + ).trim(); + + if (!ref) return jsonError(res, 400, "ref is required"); + + try { + const tab = await ensureTabAvailable(runtime, targetId || undefined); + await clickRefViaPlaywright({ + cdpPort: state.cdpPort, + targetId: tab.targetId, + ref, + }); + res.json({ ok: true, targetId: tab.targetId, url: tab.url }); + } catch (err) { + const mapped = mapTabError(err); + if (mapped) return jsonError(res, mapped.status, mapped.message); + jsonError(res, 500, String(err)); + } + }); + const port = resolved.controlPort; const server = await new Promise((resolve, reject) => { const s = app.listen(port, "127.0.0.1", () => resolve(s)); @@ -596,6 +644,7 @@ export async function stopBrowserControlServer( const current = state; state = null; try { + await closePlaywrightBrowserConnection(); if (current.running) { await stopClawdChrome(current.running).catch((err) => logWarn(`clawd browser stop failed: ${String(err)}`, runtime), diff --git a/src/cli/program.ts b/src/cli/program.ts index ddf0993a4..bb942d564 100644 --- a/src/cli/program.ts +++ b/src/cli/program.ts @@ -1,6 +1,7 @@ import chalk from "chalk"; import { Command } from 
"commander"; import { + browserClickRef, browserCloseTab, browserDom, browserEval, @@ -441,6 +442,8 @@ Examples: clawdis browser query "a" --limit 5 clawdis browser dom --format text --max-chars 5000 clawdis browser snapshot --format aria --limit 200 + clawdis browser snapshot --format ai + clawdis browser click 76 `, ) .action(() => { @@ -803,9 +806,9 @@ Examples: browser .command("snapshot") - .description("Capture an AI-friendly snapshot (aria or domSnapshot)") + .description("Capture an AI-friendly snapshot (aria, domSnapshot, or ai)") .option( - "--format <aria|domSnapshot>", + "--format <aria|domSnapshot|ai>", "Snapshot format (default: aria)", "aria", ) @@ -813,11 +816,16 @@ Examples: .option("--limit ", "Max nodes (default: 500/800)", (v: string) => Number(v), ) - .option("--out <path>", "Write JSON snapshot to a file") + .option("--out <path>", "Write snapshot to a file") .action(async (opts, cmd) => { const parent = parentOpts(cmd); const baseUrl = resolveBrowserControlUrl(parent?.url); - const format = opts.format === "domSnapshot" ? "domSnapshot" : "aria"; + const format = + opts.format === "domSnapshot" + ? "domSnapshot" + : opts.format === "ai" + ? "ai" + : "aria"; try { const result = await browserSnapshot(baseUrl, { format, @@ -825,10 +833,14 @@ Examples: limit: Number.isFinite(opts.limit) ? 
opts.limit : undefined, }); - const payload = JSON.stringify(result, null, 2); if (opts.out) { const fs = await import("node:fs/promises"); - await fs.writeFile(opts.out, payload, "utf8"); + /* ai snapshots are written as raw text; other formats are written as pretty-printed JSON. */ if (result.format === "ai") { + await fs.writeFile(opts.out, result.snapshot, "utf8"); + } else { + const payload = JSON.stringify(result, null, 2); + await fs.writeFile(opts.out, payload, "utf8"); + } if (parent?.json) { defaultRuntime.log( JSON.stringify({ ok: true, out: opts.out }, null, 2), @@ -839,8 +851,18 @@ Examples: return; } - if (parent?.json || format === "domSnapshot") { - defaultRuntime.log(payload); + if (parent?.json) { + defaultRuntime.log(JSON.stringify(result, null, 2)); + return; + } + + if (result.format === "ai") { + defaultRuntime.log(result.snapshot); + return; + } + + if (result.format === "domSnapshot") { + defaultRuntime.log(JSON.stringify(result, null, 2)); return; } @@ -862,5 +884,29 @@ Examples: } }); + /* `browser click <ref>`: sends the ref to the control server and logs the clicked tab's URL, or the raw JSON reply when the parent `json` option is set. */ browser + .command("click") + .description("Click an element by ref from an ai snapshot (e.g. 76)") + .argument("<ref>", "Ref id from ai snapshot") + .option("--target-id <id>", "CDP target id (or unique prefix)") + .action(async (ref: string, opts, cmd) => { + const parent = parentOpts(cmd); + const baseUrl = resolveBrowserControlUrl(parent?.url); + try { + const result = await browserClickRef(baseUrl, { + ref, + targetId: opts.targetId?.trim() || undefined, + }); + if (parent?.json) { + defaultRuntime.log(JSON.stringify(result, null, 2)); + return; + } + defaultRuntime.log(`clicked ref ${ref} on ${result.url}`); + } catch (err) { + defaultRuntime.error(danger(String(err))); + defaultRuntime.exit(1); + } + }); + return program; }