feat(browser): add ai snapshot refs + click

This commit is contained in:
Peter Steinberger
2025-12-13 18:48:55 +00:00
parent a59cfa7670
commit ba22890205
6 changed files with 431 additions and 10 deletions

View File

@@ -103,6 +103,13 @@ export type SnapshotResult =
type?: string;
value?: string;
}>;
}
| {
ok: true;
format: "ai";
targetId: string;
url: string;
snapshot: string;
};
function unwrapCause(err: unknown): unknown {
@@ -310,7 +317,7 @@ export async function browserDom(
export async function browserSnapshot(
baseUrl: string,
opts: {
format: "aria" | "domSnapshot";
format: "aria" | "domSnapshot" | "ai";
targetId?: string;
limit?: number;
},
@@ -326,3 +333,24 @@ export async function browserSnapshot(
},
);
}
/**
 * Ask the browser control server to click the element identified by `ref`.
 *
 * Sends a JSON POST to `<baseUrl>/click` carrying `ref` and the optional
 * `targetId`, using a 20 second request timeout.
 *
 * @param baseUrl Base URL of the control server (no trailing `/click`).
 * @param opts.ref Element reference to click.
 * @param opts.targetId Optional tab/target to act on; omitted from the JSON
 *   body when undefined.
 * @returns The parsed response as produced by `fetchJson`.
 */
export async function browserClickRef(
  baseUrl: string,
  opts: {
    ref: string;
    targetId?: string;
  },
): Promise<{ ok: true; targetId: string; url: string }> {
  const { ref, targetId } = opts;
  const payload = JSON.stringify({ ref, targetId });
  const response = await fetchJson<{
    ok: true;
    targetId: string;
    url: string;
  }>(baseUrl + "/click", {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: payload,
    timeoutMs: 20000,
  });
  return response;
}

143
src/browser/pw-ai.test.ts Normal file
View File

@@ -0,0 +1,143 @@
import { afterEach, describe, expect, it, vi } from "vitest";
// Swap playwright-core for a stub that exposes only a mockable
// chromium.connectOverCDP; each test decides what it resolves to.
vi.mock("playwright-core", () => {
  const connectOverCDP = vi.fn();
  return { chromium: { connectOverCDP } };
});
/** Shape of the stubbed CDP session handed out by the fake page context. */
type FakeSession = {
// Mock standing in for session.send.
send: ReturnType<typeof vi.fn>;
// Mock standing in for session.detach; tests assert on its call count.
detach: ReturnType<typeof vi.fn>;
};
/**
 * Build a fake page together with the mocks that back it.
 *
 * - `session.send` resolves to `{ targetInfo: { targetId } }`.
 * - `page.locator(...)` returns an object whose `click` is a resolved mock.
 * - `page._snapshotForAI` resolves to `{ full: snapshotFull ?? "SNAP" }`, and
 *   is left off the page entirely when `hasSnapshotForAI` is exactly `false`.
 *
 * @returns The fake page plus the session, locator and click mocks so tests
 *   can assert on them.
 */
function createPage(opts: {
  targetId: string;
  snapshotFull?: string;
  hasSnapshotForAI?: boolean;
}) {
  const { targetId, snapshotFull, hasSnapshotForAI } = opts;

  const session: FakeSession = {
    send: vi.fn().mockResolvedValue({ targetInfo: { targetId } }),
    detach: vi.fn().mockResolvedValue(undefined),
  };
  const context = { newCDPSession: vi.fn().mockResolvedValue(session) };

  const click = vi.fn().mockResolvedValue(undefined);
  const locator = vi.fn().mockReturnValue({ click });

  // Only an explicit `false` removes the private snapshot method.
  const snapshotCapability =
    hasSnapshotForAI === false
      ? {}
      : {
          _snapshotForAI: vi
            .fn()
            .mockResolvedValue({ full: snapshotFull ?? "SNAP" }),
        };

  const page = {
    context: () => context,
    locator,
    ...snapshotCapability,
  };
  return { page, session, locator, click };
}
/**
 * Build a fake browser with a single context that exposes `pages`.
 * `on` is a bare mock and `close` is a mock resolving to `undefined`.
 */
function createBrowser(pages: unknown[]) {
  const onlyContext = { pages: () => pages };
  const on = vi.fn();
  const close = vi.fn().mockResolvedValue(undefined);
  return {
    contexts: () => [onlyContext],
    on,
    close,
  };
}
/** Dynamically import the module under test. */
async function importModule() {
  const moduleUnderTest = await import("./pw-ai.js");
  return moduleUnderTest;
}
// After every test: drop the module's cached browser connection, then reset
// recorded calls on all mocks.
afterEach(async () => {
  const { closePlaywrightBrowserConnection } = await importModule();
  await closePlaywrightBrowserConnection();
  vi.clearAllMocks();
});
describe("pw-ai", () => {
  type ViMock = ReturnType<typeof vi.fn>;

  // Make the mocked chromium.connectOverCDP resolve to a fake browser that
  // exposes `pages`; the mock is returned so tests can inspect its calls.
  async function connectResolvesTo(pages: unknown[]): Promise<ViMock> {
    const playwright = await import("playwright-core");
    const connect = playwright.chromium.connectOverCDP as unknown as ViMock;
    connect.mockResolvedValue(createBrowser(pages));
    return connect;
  }

  it("captures an ai snapshot via Playwright for a specific target", async () => {
    const first = createPage({ targetId: "T1", snapshotFull: "ONE" });
    const second = createPage({ targetId: "T2", snapshotFull: "TWO" });
    await connectResolvesTo([first.page, second.page]);

    const { snapshotAiViaPlaywright } = await importModule();
    const result = await snapshotAiViaPlaywright({
      cdpPort: 18792,
      targetId: "T2",
    });

    expect(result.snapshot).toBe("TWO");
    // Each page is probed for its target id, so each session detaches once.
    expect(first.session.detach).toHaveBeenCalledTimes(1);
    expect(second.session.detach).toHaveBeenCalledTimes(1);
  });

  it("clicks a ref using aria-ref locator", async () => {
    const only = createPage({ targetId: "T1" });
    await connectResolvesTo([only.page]);

    const { clickRefViaPlaywright } = await importModule();
    await clickRefViaPlaywright({
      cdpPort: 18792,
      targetId: "T1",
      ref: "76",
    });

    expect(only.locator).toHaveBeenCalledWith("aria-ref=76");
    expect(only.click).toHaveBeenCalledTimes(1);
  });

  it("fails with a clear error when _snapshotForAI is missing", async () => {
    const only = createPage({ targetId: "T1", hasSnapshotForAI: false });
    await connectResolvesTo([only.page]);

    const { snapshotAiViaPlaywright } = await importModule();
    const attempt = snapshotAiViaPlaywright({
      cdpPort: 18792,
      targetId: "T1",
    });

    await expect(attempt).rejects.toThrow(/_snapshotForAI/i);
  });

  it("reuses the CDP connection for repeated calls", async () => {
    const only = createPage({ targetId: "T1", snapshotFull: "ONE" });
    const connect = await connectResolvesTo([only.page]);

    const { snapshotAiViaPlaywright, clickRefViaPlaywright } =
      await importModule();
    await snapshotAiViaPlaywright({ cdpPort: 18792, targetId: "T1" });
    await clickRefViaPlaywright({
      cdpPort: 18792,
      targetId: "T1",
      ref: "1",
    });

    expect(connect).toHaveBeenCalledTimes(1);
  });
});

153
src/browser/pw-ai.ts Normal file
View File

@@ -0,0 +1,153 @@
import type { Browser, Page } from "playwright-core";
import { chromium } from "playwright-core";
// Result shape expected from the page's `_snapshotForAI`; this module only
// reads `full`.
type SnapshotForAIResult = { full: string; incremental?: string };
// Options object passed to `_snapshotForAI`.
type SnapshotForAIOptions = { timeout?: number; track?: string };
// View of a Page that may or may not expose `_snapshotForAI`; the property is
// optional so callers must check for it before use.
type WithSnapshotForAI = {
_snapshotForAI?: (
options?: SnapshotForAIOptions,
) => Promise<SnapshotForAIResult>;
};
// Fields this module reads from the reply to the CDP `Target.getTargetInfo`
// command; everything is optional, so readers must tolerate missing data.
type TargetInfoResponse = {
targetInfo?: {
targetId?: string;
};
};
// A connected browser paired with the endpoint string used to reach it.
type ConnectedBrowser = {
browser: Browser;
endpoint: string;
};
// Most recently established connection, or null when none is held.
let cached: ConnectedBrowser | null = null;
// Connection attempt currently in flight, or null when none is pending.
let connecting: Promise<ConnectedBrowser> | null = null;
/** Build the loopback HTTP endpoint URL for a given CDP port. */
function endpointForCdpPort(cdpPort: number) {
  const loopbackOrigin = "http://127.0.0.1:";
  return loopbackOrigin + String(cdpPort);
}
// Endpoint the in-flight attempt in `connecting` is dialing, or null when idle.
let connectingEndpoint: string | null = null;

/**
 * Return a CDP connection for `endpoint`, reusing existing work when possible.
 *
 * Resolution order:
 * 1. the cached connection, if it was made to the same endpoint;
 * 2. the in-flight attempt, if it is dialing the same endpoint;
 * 3. a fresh `chromium.connectOverCDP` call with a 5 second timeout.
 *
 * An in-flight attempt for a *different* endpoint is never joined, so the
 * caller always gets a browser for the endpoint it asked for.
 *
 * NOTE(review): when a new endpoint replaces the cached one, the superseded
 * connection is not closed here (it may still be in use by another caller).
 *
 * @param endpoint CDP endpoint URL to connect to.
 * @returns The connected browser paired with its endpoint.
 * @throws Whatever `chromium.connectOverCDP` rejects with.
 */
async function connectBrowser(endpoint: string): Promise<ConnectedBrowser> {
  if (cached?.endpoint === endpoint) return cached;
  // Only piggyback on a pending attempt that targets the same endpoint.
  if (connecting && connectingEndpoint === endpoint) return await connecting;

  const attempt: Promise<ConnectedBrowser> = chromium
    .connectOverCDP(endpoint, { timeout: 5000 })
    .then((browser) => {
      const connected: ConnectedBrowser = { browser, endpoint };
      cached = connected;
      // Drop the cache entry once this exact browser goes away.
      browser.on("disconnected", () => {
        if (cached?.browser === browser) cached = null;
      });
      return connected;
    })
    .finally(() => {
      // A newer attempt may have replaced this one; only clear our own slot.
      if (connecting === attempt) {
        connecting = null;
        connectingEndpoint = null;
      }
    });

  connecting = attempt;
  connectingEndpoint = endpoint;
  return await attempt;
}
/** Collect the pages of every context of `browser` into one flat list. */
async function getAllPages(browser: Browser): Promise<Page[]> {
  const collected: Page[] = [];
  for (const context of browser.contexts()) {
    collected.push(...context.pages());
  }
  return collected;
}
/**
 * Look up the CDP target id of `page`.
 *
 * Opens a CDP session on the page, sends `Target.getTargetInfo`, and always
 * detaches the session afterwards (detach failures are ignored).
 *
 * @returns The trimmed target id, or `null` when the reply has none.
 */
async function pageTargetId(page: Page): Promise<string | null> {
  const cdp = await page.context().newCDPSession(page);
  try {
    const reply = (await cdp.send(
      "Target.getTargetInfo",
    )) as TargetInfoResponse;
    const rawId = reply?.targetInfo?.targetId ?? "";
    const trimmedId = String(rawId).trim();
    if (trimmedId) return trimmedId;
    return null;
  } finally {
    await cdp.detach().catch(() => {});
  }
}
/**
 * Find the page whose CDP target id equals `targetId`.
 *
 * Pages are probed one at a time in listing order; a page whose id lookup
 * fails is treated as a non-match. Probing stops at the first match.
 *
 * @returns The matching page, or `null` when no page matches.
 */
async function findPageByTargetId(
  browser: Browser,
  targetId: string,
): Promise<Page | null> {
  const candidates = await getAllPages(browser);
  for (const candidate of candidates) {
    let candidateId: string | null;
    try {
      candidateId = await pageTargetId(candidate);
    } catch {
      candidateId = null;
    }
    if (candidateId && candidateId === targetId) {
      return candidate;
    }
  }
  return null;
}
/**
 * Resolve the page to operate on for a CDP port and optional target id.
 *
 * With no `targetId`, the first page listed by the browser is returned.
 *
 * @throws Error("No pages available in the connected browser.") when the
 *   browser lists no pages.
 * @throws Error("tab not found") when `targetId` is given but matches no page.
 */
async function getPageForTargetId(opts: {
  cdpPort: number;
  targetId?: string;
}): Promise<Page> {
  const { browser } = await connectBrowser(endpointForCdpPort(opts.cdpPort));

  const available = await getAllPages(browser);
  if (available.length === 0) {
    throw new Error("No pages available in the connected browser.");
  }

  const wantedId = opts.targetId;
  if (!wantedId) return available[0];

  const match = await findPageByTargetId(browser, wantedId);
  if (match) return match;
  throw new Error("tab not found");
}
/**
 * Capture an AI-oriented snapshot of a page through Playwright's private
 * `_snapshotForAI` page method.
 *
 * @param opts.cdpPort CDP port of the browser to connect to.
 * @param opts.targetId Optional target id selecting the page; when omitted the
 *   first available page is used.
 * @param opts.timeoutMs Snapshot timeout in milliseconds. Defaults to 5000,
 *   is floored to an integer and clamped to the range 500..60000. A
 *   non-finite value such as `NaN` falls back to the default instead of being
 *   forwarded to Playwright.
 * @returns `{ snapshot }` holding the `full` text of the result (empty string
 *   when the result has none).
 * @throws Error when the page does not expose a callable `_snapshotForAI`.
 */
export async function snapshotAiViaPlaywright(opts: {
  cdpPort: number;
  targetId?: string;
  timeoutMs?: number;
}): Promise<{ snapshot: string }> {
  const page = await getPageForTargetId({
    cdpPort: opts.cdpPort,
    targetId: opts.targetId,
  });
  // `_snapshotForAI` is not part of the public Page typings, hence the cast.
  const maybe = page as unknown as WithSnapshotForAI;
  if (typeof maybe._snapshotForAI !== "function") {
    throw new Error(
      "Playwright _snapshotForAI is not available. Upgrade playwright-core.",
    );
  }
  const defaultTimeoutMs = 5000;
  const requestedMs = opts.timeoutMs ?? defaultTimeoutMs;
  // NaN would survive Math.min/Math.max untouched, so substitute the default.
  const usableMs = Number.isFinite(requestedMs)
    ? requestedMs
    : defaultTimeoutMs;
  const timeout = Math.max(500, Math.min(60_000, Math.floor(usableMs)));
  const result = await maybe._snapshotForAI({
    timeout,
    track: "response",
  });
  return { snapshot: String(result?.full ?? "") };
}
/**
 * Click the element addressed by `ref` using an `aria-ref=<ref>` locator.
 *
 * @param opts.cdpPort CDP port of the browser to connect to.
 * @param opts.targetId Optional target id selecting the page; when omitted the
 *   first available page is used.
 * @param opts.ref Element reference; surrounding whitespace is trimmed.
 * @param opts.timeoutMs Click timeout in milliseconds. Defaults to 8000, is
 *   floored to an integer and clamped to the range 500..60000. A non-finite
 *   value such as `NaN` falls back to the default instead of being forwarded
 *   to Playwright.
 * @throws Error("ref is required") when `ref` is empty after trimming.
 */
export async function clickRefViaPlaywright(opts: {
  cdpPort: number;
  targetId?: string;
  ref: string;
  timeoutMs?: number;
}): Promise<void> {
  const ref = String(opts.ref ?? "").trim();
  if (!ref) throw new Error("ref is required");
  const page = await getPageForTargetId({
    cdpPort: opts.cdpPort,
    targetId: opts.targetId,
  });
  const defaultTimeoutMs = 8000;
  const requestedMs = opts.timeoutMs ?? defaultTimeoutMs;
  // NaN would survive Math.min/Math.max untouched, so substitute the default.
  const usableMs = Number.isFinite(requestedMs)
    ? requestedMs
    : defaultTimeoutMs;
  const timeout = Math.max(500, Math.min(60_000, Math.floor(usableMs)));
  await page.locator(`aria-ref=${ref}`).click({ timeout });
}
/**
 * Forget the cached browser connection and close it.
 *
 * The cache is cleared before closing, and any error from `close()` is
 * ignored. Does nothing when no connection is cached.
 */
export async function closePlaywrightBrowserConnection(): Promise<void> {
  if (!cached) {
    cached = null;
    return;
  }
  const { browser } = cached;
  cached = null;
  await browser.close().catch(() => {});
}

View File

@@ -26,6 +26,11 @@ import {
resolveBrowserConfig,
shouldStartLocalBrowserServer,
} from "./config.js";
import {
clickRefViaPlaywright,
closePlaywrightBrowserConnection,
snapshotAiViaPlaywright,
} from "./pw-ai.js";
import {
DEFAULT_BROWSER_SCREENSHOT_MAX_BYTES,
DEFAULT_BROWSER_SCREENSHOT_MAX_SIDE,
@@ -522,13 +527,32 @@ export async function startBrowserControlServerFromConfig(
if (!state) return jsonError(res, 503, "browser server not started");
const targetId =
typeof req.query.targetId === "string" ? req.query.targetId.trim() : "";
const format = req.query.format === "domSnapshot" ? "domSnapshot" : "aria";
const format =
req.query.format === "domSnapshot"
? "domSnapshot"
: req.query.format === "ai"
? "ai"
: "aria";
const limit =
typeof req.query.limit === "string" ? Number(req.query.limit) : undefined;
try {
const tab = await ensureTabAvailable(runtime, targetId || undefined);
if (format === "ai") {
const snap = await snapshotAiViaPlaywright({
cdpPort: state.cdpPort,
targetId: tab.targetId,
});
return res.json({
ok: true,
format,
targetId: tab.targetId,
url: tab.url,
...snap,
});
}
if (format === "aria") {
const snap = await snapshotAria({
wsUrl: tab.wsUrl ?? "",
@@ -561,6 +585,30 @@ export async function startBrowserControlServerFromConfig(
}
});
// POST /click — click the element named by body.ref in the tab named by
// body.targetId (optional). Replies 503 when the server state is unset,
// 400 when ref is empty, a mapped tab error when one applies, else 500.
app.post("/click", async (req, res) => {
  if (!state) return jsonError(res, 503, "browser server not started");

  const body = req.body as
    | { ref?: unknown; targetId?: unknown }
    | undefined;
  const ref = String(body?.ref ?? "").trim();
  const targetId = String(body?.targetId ?? "").trim();
  if (!ref) return jsonError(res, 400, "ref is required");

  try {
    const tab = await ensureTabAvailable(runtime, targetId || undefined);
    const { targetId: resolvedTargetId, url } = tab;
    await clickRefViaPlaywright({
      cdpPort: state.cdpPort,
      targetId: resolvedTargetId,
      ref,
    });
    res.json({ ok: true, targetId: resolvedTargetId, url });
  } catch (err) {
    const tabError = mapTabError(err);
    if (tabError) {
      return jsonError(res, tabError.status, tabError.message);
    }
    jsonError(res, 500, String(err));
  }
});
const port = resolved.controlPort;
const server = await new Promise<Server>((resolve, reject) => {
const s = app.listen(port, "127.0.0.1", () => resolve(s));
@@ -596,6 +644,7 @@ export async function stopBrowserControlServer(
const current = state;
state = null;
try {
await closePlaywrightBrowserConnection();
if (current.running) {
await stopClawdChrome(current.running).catch((err) =>
logWarn(`clawd browser stop failed: ${String(err)}`, runtime),