diff --git a/CHANGELOG.md b/CHANGELOG.md index 753a7a42d..09a8b8093 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,7 +19,7 @@ ### Fixes - Docs/agent tools: clarify that browser `wait` should be avoided by default and used only in exceptional cases. -- Browser tools: `upload` can auto-click a ref after arming and now emits input/change events after `setFiles` so sites like X pick up attachments. +- Browser tools: `upload` supports auto-click refs, direct `inputRef`/`element` file inputs, and emits input/change after `setFiles` so JS-heavy sites pick up attachments. - macOS: Voice Wake now fully tears down the Speech pipeline when disabled (cancel pending restarts, drop stale callbacks) to avoid high CPU in the background. - macOS menu: add a Talk Mode action alongside the Open Dashboard/Chat/Canvas entries. - macOS Debug: hide “Restart Gateway” when the app won’t start a local gateway (remote mode / attach-only). diff --git a/docs/browser.md b/docs/browser.md index 662aa2c01..df49984c3 100644 --- a/docs/browser.md +++ b/docs/browser.md @@ -191,6 +191,7 @@ Actions: Notes: - `upload` and `dialog` are **arming** calls; run them before the click/press that triggers the chooser/dialog. - `upload` can take a `ref` to auto-click after arming (useful for single-step file uploads). +- `upload` can also take `inputRef` (aria ref) or `element` (CSS selector) to set `<input type="file">` directly without waiting for a file chooser. - The arm default timeout is **2 minutes** (clamped to max 2 minutes); pass `timeoutMs` if you need shorter. - `snapshot` defaults to `ai`; `aria` returns an accessibility tree for debugging. - `click`/`type` require `ref` from `snapshot --format ai`; use `evaluate` for rare CSS selector one-offs. diff --git a/docs/tools.md b/docs/tools.md index 32d41bc01..bd5a6a489 100644 --- a/docs/tools.md +++ b/docs/tools.md @@ -53,6 +53,7 @@ Notes: - `act` requires `ref` from `snapshot --format ai`; use `evaluate` for rare CSS selector needs.
- Avoid `act` → `wait` by default; use it only in exceptional cases (no reliable UI state to wait on). - `upload` can optionally pass a `ref` to auto-click after arming. +- `upload` also supports `inputRef` (aria ref) or `element` (CSS selector) to set `<input type="file">` directly. ### `clawdis_canvas` Drive the node Canvas (present, eval, snapshot, A2UI). diff --git a/src/agents/clawdis-tools.ts b/src/agents/clawdis-tools.ts index 6007fadb0..0a273008e 100644 --- a/src/agents/clawdis-tools.ts +++ b/src/agents/clawdis-tools.ts @@ -484,6 +484,8 @@ const BrowserToolSchema = Type.Union([ controlUrl: Type.Optional(Type.String()), paths: Type.Array(Type.String()), ref: Type.Optional(Type.String()), + inputRef: Type.Optional(Type.String()), + element: Type.Optional(Type.String()), targetId: Type.Optional(Type.String()), timeoutMs: Type.Optional(Type.Number()), }), @@ -627,6 +629,8 @@ function createBrowserTool(): AnyAgentTool { : []; if (paths.length === 0) throw new Error("paths required"); const ref = readStringParam(params, "ref"); + const inputRef = readStringParam(params, "inputRef"); + const element = readStringParam(params, "element"); const targetId = typeof params.targetId === "string" ?
params.targetId.trim() @@ -640,6 +644,8 @@ function createBrowserTool(): AnyAgentTool { await browserArmFileChooser(baseUrl, { paths, ref, + inputRef, + element, targetId, timeoutMs, }), diff --git a/src/browser/client-actions-core.ts b/src/browser/client-actions-core.ts index 92a3c1940..5c3fd5a66 100644 --- a/src/browser/client-actions-core.ts +++ b/src/browser/client-actions-core.ts @@ -94,6 +94,8 @@ export async function browserArmFileChooser( opts: { paths: string[]; ref?: string; + inputRef?: string; + element?: string; targetId?: string; timeoutMs?: number; }, @@ -106,6 +108,8 @@ export async function browserArmFileChooser( body: JSON.stringify({ paths: opts.paths, ref: opts.ref, + inputRef: opts.inputRef, + element: opts.element, targetId: opts.targetId, timeoutMs: opts.timeoutMs, }), diff --git a/src/browser/pw-ai.ts b/src/browser/pw-ai.ts index b984b9e5e..fe14e2ce9 100644 --- a/src/browser/pw-ai.ts +++ b/src/browser/pw-ai.ts @@ -22,6 +22,7 @@ export { pressKeyViaPlaywright, resizeViewportViaPlaywright, selectOptionViaPlaywright, + setInputFilesViaPlaywright, snapshotAiViaPlaywright, takeScreenshotViaPlaywright, typeViaPlaywright, diff --git a/src/browser/pw-tools-core.ts b/src/browser/pw-tools-core.ts index 4f1d0bc27..99f1507eb 100644 --- a/src/browser/pw-tools-core.ts +++ b/src/browser/pw-tools-core.ts @@ -303,6 +303,55 @@ export async function armFileUploadViaPlaywright(opts: { }); }
+/**
+ * Sets files directly on an `<input type="file">` element, without waiting
+ * for a file chooser to open.
+ *
+ * Exactly one of `opts.inputRef` (resolved via `refLocator`) or
+ * `opts.element` (CSS selector; the first match is used) must be non-blank.
+ * After `setInputFiles`, bubbling `input` and `change` events are dispatched
+ * on the element as a best-effort nudge; failures there are ignored.
+ *
+ * @throws Error when `paths` is empty, or when `inputRef`/`element` are both
+ *   provided or both missing.
+ */
+export async function setInputFilesViaPlaywright(opts: {
+  cdpPort: number;
+  targetId?: string;
+  inputRef?: string;
+  element?: string;
+  paths: string[];
+}): Promise<void> {
+  // Validate arguments before resolving the page so bad input fails fast.
+  if (!opts.paths.length) throw new Error("paths are required");
+  const inputRef =
+    typeof opts.inputRef === "string" ? opts.inputRef.trim() : "";
+  const element = typeof opts.element === "string" ? opts.element.trim() : "";
+  if (inputRef && element) {
+    throw new Error("inputRef and element are mutually exclusive");
+  }
+  if (!inputRef && !element) {
+    throw new Error("inputRef or element is required");
+  }
+
+  const page = await getPageForTargetId(opts);
+  ensurePageState(page);
+  const locator = inputRef
+    ? refLocator(page, inputRef)
+    : page.locator(element).first();
+
+  await locator.setInputFiles(opts.paths);
+  try {
+    // Evaluate through the locator so no ElementHandle is left undisposed.
+    await locator.evaluate((el) => {
+      el.dispatchEvent(new Event("input", { bubbles: true }));
+      el.dispatchEvent(new Event("change", { bubbles: true }));
+    });
+  } catch {
+    // Best-effort for sites that don't react to setInputFiles alone.
+  }
+}
+
 export async function armDialogViaPlaywright(opts: { cdpPort: number; targetId?: string; diff --git a/src/browser/routes/agent.ts b/src/browser/routes/agent.ts index b3c4c8c93..41d32ddc1 100644 --- a/src/browser/routes/agent.ts +++ b/src/browser/routes/agent.ts @@ -339,6 +339,8 @@ export function registerBrowserAgentRoutes( const body = readBody(req); const targetId = toStringOrEmpty(body.targetId) || undefined; const ref = toStringOrEmpty(body.ref) || undefined; + const inputRef = toStringOrEmpty(body.inputRef) || undefined; + const element = toStringOrEmpty(body.element) || undefined; const paths = toStringArray(body.paths) ?? []; const timeoutMs = toNumber(body.timeoutMs); if (!paths.length) return jsonError(res, 400, "paths are required"); @@ -346,18 +348,35 @@ export function registerBrowserAgentRoutes( const tab = await ctx.ensureTabAvailable(targetId); const pw = await requirePwAi(res, "file chooser hook"); if (!pw) return; - await pw.armFileUploadViaPlaywright({ - cdpPort: ctx.state().cdpPort, - targetId: tab.targetId, - paths, - timeoutMs: timeoutMs ??
undefined, - }); - if (ref) { - await pw.clickViaPlaywright({ + if (inputRef || element) { + if (ref) { + return jsonError( + res, + 400, + "ref cannot be combined with inputRef/element", + ); + } + await pw.setInputFilesViaPlaywright({ cdpPort: ctx.state().cdpPort, targetId: tab.targetId, - ref, + inputRef, + element, + paths, }); + } else { + await pw.armFileUploadViaPlaywright({ + cdpPort: ctx.state().cdpPort, + targetId: tab.targetId, + paths, + timeoutMs: timeoutMs ?? undefined, + }); + if (ref) { + await pw.clickViaPlaywright({ + cdpPort: ctx.state().cdpPort, + targetId: tab.targetId, + ref, + }); + } } res.json({ ok: true }); } catch (err) { diff --git a/src/browser/server.test.ts b/src/browser/server.test.ts index 51f83fe9b..0e4a98392 100644 --- a/src/browser/server.test.ts +++ b/src/browser/server.test.ts @@ -33,6 +33,7 @@ const pwMocks = vi.hoisted(() => ({ pressKeyViaPlaywright: vi.fn(async () => {}), resizeViewportViaPlaywright: vi.fn(async () => {}), selectOptionViaPlaywright: vi.fn(async () => {}), + setInputFilesViaPlaywright: vi.fn(async () => {}), snapshotAiViaPlaywright: vi.fn(async () => ({ snapshot: "ok" })), takeScreenshotViaPlaywright: vi.fn(async () => ({ buffer: Buffer.from("png"), @@ -493,6 +494,37 @@ describe("browser control server", () => { ref: "e12", }); + const uploadWithInputRef = await realFetch(`${base}/hooks/file-chooser`, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ paths: ["/tmp/c.txt"], inputRef: "e99" }), + }).then((r) => r.json()); + expect(uploadWithInputRef).toMatchObject({ ok: true }); + expect(pwMocks.setInputFilesViaPlaywright).toHaveBeenCalledWith({ + cdpPort: testPort + 1, + targetId: "abcd1234", + inputRef: "e99", + element: undefined, + paths: ["/tmp/c.txt"], + }); + + const uploadWithElement = await realFetch(`${base}/hooks/file-chooser`, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ + paths: ["/tmp/d.txt"], + 
element: "input[type=file]", + }), + }).then((r) => r.json()); + expect(uploadWithElement).toMatchObject({ ok: true }); + expect(pwMocks.setInputFilesViaPlaywright).toHaveBeenCalledWith({ + cdpPort: testPort + 1, + targetId: "abcd1234", + inputRef: undefined, + element: "input[type=file]", + paths: ["/tmp/d.txt"], + }); + const dialog = await realFetch(`${base}/hooks/dialog`, { method: "POST", headers: { "Content-Type": "application/json" }, diff --git a/src/cli/browser-cli-actions-input.ts b/src/cli/browser-cli-actions-input.ts index 5f803ae57..54d8b9590 100644 --- a/src/cli/browser-cli-actions-input.ts +++ b/src/cli/browser-cli-actions-input.ts @@ -302,6 +302,8 @@ export function registerBrowserActionInputCommands( .description("Arm file upload for the next file chooser") .argument("", "File paths to upload") .option("--ref ", "Ref id from ai snapshot to click after arming") + .option("--input-ref ", "Ref id for to set directly") + .option("--element ", "CSS selector for ") .option("--target-id ", "CDP target id (or unique prefix)") .option( "--timeout-ms ", @@ -315,6 +317,8 @@ export function registerBrowserActionInputCommands( const result = await browserArmFileChooser(baseUrl, { paths, ref: opts.ref?.trim() || undefined, + inputRef: opts.inputRef?.trim() || undefined, + element: opts.element?.trim() || undefined, targetId: opts.targetId?.trim() || undefined, timeoutMs: Number.isFinite(opts.timeoutMs) ? opts.timeoutMs