diff --git a/CHANGELOG.md b/CHANGELOG.md index 8de6b8fbc..07a5b175f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,7 @@ - Gateway/Control UI: make `chat.send` non-blocking, wire Stop to `chat.abort`, and treat `/stop` as an out-of-band abort. (#653) - Gateway/Control UI: allow `chat.abort` without `runId` (abort active runs), suppress post-abort chat streaming, and prune stuck chat runs. (#653) - Gateway/Control UI: sniff image attachments for chat.send, drop non-images, and log mismatches. (#670) — thanks @cristip73. +- Gateway/Agent: accept image attachments on `agent` (multimodal message) and add live gateway image probe (`CLAWDBOT_LIVE_GATEWAY_IMAGE_PROBE=1`). - CLI: `clawdbot sessions` now includes `elev:*` + `usage:*` flags in the table output. - CLI/Pairing: accept positional provider for `pairing list|approve` (npm-run compatible); update docs/bot hints. - Branding: normalize user-facing “ClawdBot”/“CLAWDBOT” → “Clawdbot” (CLI, status, docs). diff --git a/docs/testing.md b/docs/testing.md index 6a2c7f528..51df16c54 100644 --- a/docs/testing.md +++ b/docs/testing.md @@ -129,6 +129,8 @@ Live tests are split into two layers so we can isolate failures: - Optional tool-calling stress: - `CLAWDBOT_LIVE_GATEWAY_TOOL_PROBE=1` enables an extra “bash writes file → read reads it back → echo nonce” check. - This is specifically meant to catch tool-calling compatibility issues across providers (formatting, history replay, tool_result pairing, etc.). +- Optional image send smoke: + - `CLAWDBOT_LIVE_GATEWAY_IMAGE_PROBE=1` sends a real image attachment through the gateway agent pipeline (multimodal message) and asserts the model can read back a per-run code from the image. 
### Recommended live recipes @@ -143,6 +145,37 @@ Narrow, explicit allowlists are fastest and least flaky: - Tool calling across several providers (bash + read probe): - `LIVE=1 CLAWDBOT_LIVE_GATEWAY=1 CLAWDBOT_LIVE_GATEWAY_ALL_MODELS=1 CLAWDBOT_LIVE_GATEWAY_TOOL_PROBE=1 CLAWDBOT_LIVE_GATEWAY_MODELS="openai/gpt-5.2,anthropic/claude-opus-4-5,google/gemini-flash-latest,zai/glm-4.7,minimax/minimax-m2.1" pnpm test:live src/gateway/gateway-models.profiles.live.test.ts` +## Live: model matrix (what we cover) + +There is no fixed “CI model list” (live is opt-in), but these are the **recommended** models to cover regularly on a dev machine with keys. + +### Baseline: tool calling (Read + optional Bash) + +Pick at least one per provider family: +- OpenAI: `openai/gpt-5.2` (or `openai/gpt-5-mini`) +- Anthropic: `anthropic/claude-opus-4-5` (or `anthropic/claude-sonnet-4-5`) +- Google: `google/gemini-flash-latest` (or `google/gemini-2.5-pro`) +- Z.AI (GLM): `zai/glm-4.7` +- MiniMax: `minimax/minimax-m2.1` + +Optional additional coverage (nice to have): +- xAI: `xai/grok-4` (or latest available) +- Mistral: `mistral/`… (pick one “tools” capable model you have enabled) +- Cerebras: `cerebras/`… (if you have access) +- LM Studio: `lmstudio/`… (local; tool calling depends on API mode) + +### Vision: image send (attachment → multimodal message) + +Run with `CLAWDBOT_LIVE_GATEWAY_IMAGE_PROBE=1` and include at least one image-capable model in `CLAWDBOT_LIVE_GATEWAY_MODELS` (Claude/Gemini/OpenAI vision-capable variants, etc.). + +### Aggregators / alternate gateways + +If you have keys enabled, we also support testing via: +- OpenRouter: `openrouter/...` (hundreds of models; use `clawdbot models scan` to find tool+image capable candidates) +- OpenCode Zen: `opencode-zen/...` (requires `OPENCODE_ZEN_API_KEY`) + +Tip: don’t try to hardcode “all models” in docs. The authoritative list is whatever `discoverModels(...)` returns on your machine + whatever keys are available. 
+ ## Credentials (never commit) Live tests discover credentials the same way the CLI does. Practical implications: diff --git a/src/gateway/gateway-models.profiles.live.test.ts b/src/gateway/gateway-models.profiles.live.test.ts index 1e7229b5e..4c5fb6960 100644 --- a/src/gateway/gateway-models.profiles.live.test.ts +++ b/src/gateway/gateway-models.profiles.live.test.ts @@ -1,4 +1,4 @@ -import { randomUUID } from "node:crypto"; +import { randomBytes, randomUUID } from "node:crypto"; import fs from "node:fs/promises"; import { createServer } from "node:net"; import os from "node:os"; @@ -16,6 +16,7 @@ import { ensureClawdbotModelsJson } from "../agents/models-config.js"; import { loadConfig } from "../config/config.js"; import { resolveUserPath } from "../utils.js"; import { GatewayClient } from "./client.js"; +import { renderCatNoncePngBase64 } from "./live-image-probe.js"; import { startGatewayServer } from "./server.js"; const LIVE = process.env.LIVE === "1" || process.env.CLAWDBOT_LIVE_TEST === "1"; @@ -24,6 +25,8 @@ const ALL_MODELS = process.env.CLAWDBOT_LIVE_GATEWAY_ALL_MODELS === "1" || process.env.CLAWDBOT_LIVE_GATEWAY_MODELS === "all"; const EXTRA_TOOL_PROBES = process.env.CLAWDBOT_LIVE_GATEWAY_TOOL_PROBE === "1"; +const EXTRA_IMAGE_PROBES = + process.env.CLAWDBOT_LIVE_GATEWAY_IMAGE_PROBE === "1"; const describeLive = LIVE && GATEWAY_LIVE ? 
describe : describe.skip; @@ -60,6 +63,43 @@ function isMeaningful(text: string): boolean { return true; } +/** Builds a random probe code of len characters (default 10) drawn from the alphabet 2345689ABCEF, using one randomBytes byte per character reduced modulo the alphabet length. NOTE(review): 256 is not a multiple of 12, so the first four alphabet characters are marginally more likely; presumably acceptable for a test nonce - confirm. */ function randomImageProbeCode(len = 10): string { + const alphabet = "2345689ABCEF"; + const bytes = randomBytes(len); + let out = ""; + for (let i = 0; i < len; i += 1) { + out += alphabet[bytes[i] % alphabet.length]; + } + return out; +} + +/** Returns the Levenshtein edit distance between a and b, with unit cost for delete, insert and substitute, comparing UTF-16 code units via charCodeAt. Only two rows of the DP table are kept: prev is the last completed row, curr is the row being filled, and the two are swapped after each character of a. */ function editDistance(a: string, b: string): number { + if (a === b) return 0; + const aLen = a.length; + const bLen = b.length; + if (aLen === 0) return bLen; + if (bLen === 0) return aLen; + + let prev = Array.from({ length: bLen + 1 }, (_v, idx) => idx); + let curr = Array.from({ length: bLen + 1 }, () => 0); + + for (let i = 1; i <= aLen; i += 1) { + curr[0] = i; + const aCh = a.charCodeAt(i - 1); + for (let j = 1; j <= bLen; j += 1) { + const cost = aCh === b.charCodeAt(j - 1) ? 0 : 1; + curr[j] = Math.min( + prev[j] + 1, // delete + curr[j - 1] + 1, // insert + prev[j - 1] + cost, // substitute + ); + } + [prev, curr] = [curr, prev]; + } + + return prev[bLen] ?? Number.POSITIVE_INFINITY; +} + async function getFreePort(): Promise { return await new Promise((resolve, reject) => { const srv = createServer(); @@ -204,6 +244,14 @@ describeLive("gateway live (dev agent, profile keys)", () => { } expect(candidates.length).toBeGreaterThan(0); + const imageCandidates = EXTRA_IMAGE_PROBES + ? candidates.filter((m) => m.input?.includes("image")) + : []; + if (EXTRA_IMAGE_PROBES && imageCandidates.length === 0) { + throw new Error( + "image probe enabled but no selected models advertise image support; set CLAWDBOT_LIVE_GATEWAY_MODELS to include an image-capable model", + ); + } // Build a temp config that allows all selected models, so session overrides stick. 
const lmstudioProvider = cfg.models?.providers?.lmstudio; @@ -365,6 +413,53 @@ describeLive("gateway live (dev agent, profile keys)", () => { await fs.rm(toolWritePath, { force: true }); } + /* Optional image probe: runs only when EXTRA_IMAGE_PROBES is set and the model lists image among its inputs. It renders a PNG containing CAT plus a fresh random code, sends it as an attachment on an agent request, and requires the reply to mention cat and to contain the code. */ if (EXTRA_IMAGE_PROBES && model.input?.includes("image")) { + const imageCode = randomImageProbeCode(10); + const imageBase64 = renderCatNoncePngBase64(imageCode); + const runIdImage = randomUUID(); + + const imageProbe = await client.request( + "agent", + { + sessionKey, + idempotencyKey: `idem-${runIdImage}-image`, + message: + "Look at the attached image. Reply with exactly two tokens separated by a single space: " + + "(1) the animal shown or written in the image, lowercase; " + + "(2) the code printed in the image, uppercase. No extra text.", + attachments: [ + { + mimeType: "image/png", + fileName: `probe-${runIdImage}.png`, + content: imageBase64, + }, + ], + deliver: false, + }, + { expectFinal: true }, + ); + if (imageProbe?.status !== "ok") { + throw new Error( + `image probe failed: status=${String(imageProbe?.status)}`, + ); + } + const imageText = extractPayloadText(imageProbe?.result); + if (!/\bcat\b/i.test(imageText)) { + throw new Error(`image probe missing 'cat': ${imageText}`); + } + /* NOTE(review): this block-local candidates holds the code-like tokens (6 to 20 upper-case letters or digits) found in the reply; the same identifier names the selected model list earlier in this test, so a distinct name would read better. */ const candidates = + imageText.toUpperCase().match(/[A-Z0-9]{6,20}/g) ?? []; + /* Best edit distance over tokens whose length is within 2 of the code length; stays at positive infinity when no token qualifies. */ const bestDistance = candidates.reduce((best, cand) => { + if (Math.abs(cand.length - imageCode.length) > 2) return best; + return Math.min(best, editDistance(cand, imageCode)); + }, Number.POSITIVE_INFINITY); + /* Tolerates one misread character: passes only when the closest token is within edit distance 1 of the code. */ if (!(bestDistance <= 1)) { + throw new Error( + `image probe missing code (${imageCode}): ${imageText}`, + ); + } + } + // Regression: tool-call-only turn followed by a user message (OpenAI responses bug class). 
// Patch residue preserved from the collapsed diff (tail of the previous hunk, then this file's header):
//   if ( (model.provider === "openai" &&
//   diff --git a/src/gateway/live-image-probe.ts b/src/gateway/live-image-probe.ts
//   new file mode 100644 index 000000000..490bd4daf --- /dev/null +++ b/src/gateway/live-image-probe.ts
import { deflateSync } from "node:zlib";

/**
 * Lookup table for the reflected CRC-32 polynomial 0xEDB88320, which is the
 * checksum PNG requires on every chunk.
 */
const CRC_TABLE = (() => {
  const table = new Uint32Array(256);
  for (let i = 0; i < 256; i += 1) {
    let c = i;
    for (let k = 0; k < 8; k += 1) {
      c = c & 1 ? 0xedb88320 ^ (c >>> 1) : c >>> 1;
    }
    table[i] = c >>> 0;
  }
  return table;
})();

/**
 * Computes the CRC-32 of `buf`.
 *
 * @returns The checksum as an unsigned 32-bit integer.
 */
function crc32(buf: Uint8Array): number {
  let crc = 0xffffffff;
  for (let i = 0; i < buf.length; i += 1) {
    const byte = buf[i] ?? 0;
    crc = (CRC_TABLE[(crc ^ byte) & 0xff] ?? 0) ^ (crc >>> 8);
  }
  return (crc ^ 0xffffffff) >>> 0;
}

/**
 * Serializes one PNG chunk: big-endian data length, 4-byte ASCII type, data,
 * then the CRC-32 computed over type + data.
 */
function pngChunk(type: string, data: Buffer): Buffer {
  const typeBuf = Buffer.from(type, "ascii");
  const len = Buffer.alloc(4);
  len.writeUInt32BE(data.length, 0);
  const crcBuf = Buffer.alloc(4);
  crcBuf.writeUInt32BE(crc32(Buffer.concat([typeBuf, data])), 0);
  return Buffer.concat([len, typeBuf, data, crcBuf]);
}

/**
 * Encodes tightly packed 8-bit RGBA pixels as a non-interlaced PNG with a
 * single IDAT chunk.
 *
 * @param buffer Pixel data, exactly `width * height * 4` bytes, rows top to bottom.
 * @throws RangeError when the dimensions are not positive integers or the
 *   buffer length does not match them (previously this produced a corrupt image).
 */
function encodePngRgba(buffer: Buffer, width: number, height: number): Buffer {
  if (
    !Number.isInteger(width) ||
    !Number.isInteger(height) ||
    width <= 0 ||
    height <= 0
  ) {
    throw new RangeError(`invalid PNG dimensions: ${width}x${height}`);
  }
  const stride = width * 4;
  if (buffer.length !== stride * height) {
    throw new RangeError(
      `RGBA buffer has ${buffer.length} bytes; expected ${stride * height} for ${width}x${height}`,
    );
  }

  // Each scanline is prefixed with one filter-type byte.
  const raw = Buffer.alloc((stride + 1) * height);
  for (let row = 0; row < height; row += 1) {
    const rawOffset = row * (stride + 1);
    raw[rawOffset] = 0; // filter: none
    buffer.copy(raw, rawOffset + 1, row * stride, row * stride + stride);
  }
  const compressed = deflateSync(raw);

  const signature = Buffer.from([
    0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a,
  ]);
  const ihdr = Buffer.alloc(13);
  ihdr.writeUInt32BE(width, 0);
  ihdr.writeUInt32BE(height, 4);
  ihdr[8] = 8; // bit depth
  ihdr[9] = 6; // color type RGBA
  ihdr[10] = 0; // compression
  ihdr[11] = 0; // filter
  ihdr[12] = 0; // interlace

  return Buffer.concat([
    signature,
    pngChunk("IHDR", ihdr),
    pngChunk("IDAT", compressed),
    pngChunk("IEND", Buffer.alloc(0)),
  ]);
}

/**
 * Writes one RGBA pixel into `buf`; coordinates outside the image are ignored.
 * The vertical bound is enforced through the buffer-length check.
 */
function fillPixel(
  buf: Buffer,
  x: number,
  y: number,
  width: number,
  r: number,
  g: number,
  b: number,
  a = 255,
): void {
  if (x < 0 || y < 0) return;
  if (x >= width) return;
  const idx = (y * width + x) * 4;
  if (idx < 0 || idx + 3 >= buf.length) return;
  buf[idx] = r;
  buf[idx + 1] = g;
  buf[idx + 2] = b;
  buf[idx + 3] = a;
}

/**
 * 5x7 bitmap font: seven rows per glyph, five bits per row, with bit 4 as the
 * leftmost column. Covers the digits, A-F and T.
 */
const GLYPH_ROWS_5X7: Record<string, readonly number[]> = {
  "0": [0b01110, 0b10001, 0b10011, 0b10101, 0b11001, 0b10001, 0b01110],
  "1": [0b00100, 0b01100, 0b00100, 0b00100, 0b00100, 0b00100, 0b01110],
  "2": [0b01110, 0b10001, 0b00001, 0b00010, 0b00100, 0b01000, 0b11111],
  "3": [0b11110, 0b00001, 0b00001, 0b01110, 0b00001, 0b00001, 0b11110],
  "4": [0b00010, 0b00110, 0b01010, 0b10010, 0b11111, 0b00010, 0b00010],
  "5": [0b11111, 0b10000, 0b11110, 0b00001, 0b00001, 0b10001, 0b01110],
  "6": [0b00110, 0b01000, 0b10000, 0b11110, 0b10001, 0b10001, 0b01110],
  "7": [0b11111, 0b00001, 0b00010, 0b00100, 0b01000, 0b01000, 0b01000],
  "8": [0b01110, 0b10001, 0b10001, 0b01110, 0b10001, 0b10001, 0b01110],
  "9": [0b01110, 0b10001, 0b10001, 0b01111, 0b00001, 0b00010, 0b01100],

  A: [0b01110, 0b10001, 0b10001, 0b11111, 0b10001, 0b10001, 0b10001],
  B: [0b11110, 0b10001, 0b10001, 0b11110, 0b10001, 0b10001, 0b11110],
  C: [0b01110, 0b10001, 0b10000, 0b10000, 0b10000, 0b10001, 0b01110],
  D: [0b11110, 0b10001, 0b10001, 0b10001, 0b10001, 0b10001, 0b11110],
  E: [0b11111, 0b10000, 0b10000, 0b11110, 0b10000, 0b10000, 0b11111],
  F: [0b11111, 0b10000, 0b10000, 0b11110, 0b10000, 0b10000, 0b10000],
  T: [0b11111, 0b00100, 0b00100, 0b00100, 0b00100, 0b00100, 0b00100],
};

/**
 * Draws one glyph with its top-left corner at (`x`, `y`); every font pixel
 * becomes a `scale` x `scale` square. Characters without a glyph draw nothing.
 */
function drawGlyph5x7(params: {
  buf: Buffer;
  width: number;
  x: number;
  y: number;
  char: string;
  scale: number;
  color: { r: number; g: number; b: number; a?: number };
}): void {
  const rows = GLYPH_ROWS_5X7[params.char];
  if (!rows) return;
  const { r, g, b } = params.color;
  const alpha = params.color.a ?? 255;
  for (let row = 0; row < 7; row += 1) {
    const bits = rows[row] ?? 0;
    for (let col = 0; col < 5; col += 1) {
      if ((bits & (1 << (4 - col))) === 0) continue;
      for (let dy = 0; dy < params.scale; dy += 1) {
        for (let dx = 0; dx < params.scale; dx += 1) {
          fillPixel(
            params.buf,
            params.x + col * params.scale + dx,
            params.y + row * params.scale + dy,
            params.width,
            r,
            g,
            b,
            alpha,
          );
        }
      }
    }
  }
}

/**
 * Draws `text` (upper-cased) left to right starting at (`x`, `y`). Each
 * character advances the cursor by six font pixels (5px glyph + 1px space),
 * including characters that have no glyph, which leave a blank cell.
 */
function drawText(params: {
  buf: Buffer;
  width: number;
  x: number;
  y: number;
  text: string;
  scale: number;
  color: { r: number; g: number; b: number; a?: number };
}): void {
  let cursorX = params.x;
  for (const ch of params.text.toUpperCase()) {
    drawGlyph5x7({
      buf: params.buf,
      width: params.width,
      x: cursorX,
      y: params.y,
      char: ch,
      scale: params.scale,
      color: params.color,
    });
    cursorX += 6 * params.scale;
  }
}

/**
 * Width in pixels that `drawText` covers for `text`: six font pixels per
 * character minus the trailing 1px space. Characters are counted as code
 * points to match the iteration in `drawText`, and empty text measures 0
 * (it previously measured `-scale`).
 */
function measureTextWidthPx(text: string, scale: number): number {
  const glyphCount = [...text].length;
  if (glyphCount === 0) return 0;
  return glyphCount * 6 * scale - scale; // 5px glyph + 1px space
}

/**
 * Renders a white PNG with the word CAT on the first line and `nonce`
 * (upper-cased) on the second, both centered in black, and returns it as
 * base64.
 *
 * Only the characters in the built-in font (0-9, A-F, T) are drawn; any other
 * character leaves a blank cell.
 *
 * @param nonce Code to print under the word CAT.
 * @returns Base64-encoded PNG bytes.
 */
export function renderCatNoncePngBase64(nonce: string): string {
  const top = "CAT";
  const bottom = nonce.toUpperCase();

  const scale = 12;
  const pad = 18;
  const gap = 18;
  const lineHeight = 7 * scale;

  const topWidth = measureTextWidthPx(top, scale);
  const bottomWidth = measureTextWidthPx(bottom, scale);
  const width = Math.max(topWidth, bottomWidth) + pad * 2;
  const height = pad * 2 + lineHeight + gap + lineHeight;

  // Filling every byte with 255 yields an opaque white canvas.
  const buf = Buffer.alloc(width * height * 4, 255);
  const black = { r: 0, g: 0, b: 0 };

  drawText({
    buf,
    width,
    x: Math.floor((width - topWidth) / 2),
    y: pad,
    text: top,
    scale,
    color: black,
  });

  drawText({
    buf,
    width,
    x: Math.floor((width - bottomWidth) / 2),
    y: pad + lineHeight + gap,
    text: bottom,
    scale,
    color: black,
  });

  return encodePngRgba(buf, width, height).toString("base64");
}
// Patch residue: the next file header of the collapsed diff begins here ("diff --git") and continues on the following line.
a/src/gateway/protocol/schema.ts b/src/gateway/protocol/schema.ts index acceefe46..784370791 100644 --- a/src/gateway/protocol/schema.ts +++ b/src/gateway/protocol/schema.ts @@ -225,6 +225,7 @@ export const AgentParamsSchema = Type.Object( sessionKey: Type.Optional(Type.String()), thinking: Type.Optional(Type.String()), deliver: Type.Optional(Type.Boolean()), + /* Attachment items are accepted as unknown by the schema; their individual fields are checked at runtime by the agent handler. */ attachments: Type.Optional(Type.Array(Type.Unknown())), provider: Type.Optional(Type.String()), timeout: Type.Optional(Type.Integer({ minimum: 0 })), lane: Type.Optional(Type.String()), diff --git a/src/gateway/server-methods/agent.ts b/src/gateway/server-methods/agent.ts index 9ee67c9fe..3184539be 100644 --- a/src/gateway/server-methods/agent.ts +++ b/src/gateway/server-methods/agent.ts @@ -23,6 +23,7 @@ import { isWhatsAppGroupJid, normalizeWhatsAppTarget, } from "../../whatsapp/normalize.js"; +import { parseMessageWithAttachments } from "../chat-attachments.js"; import { type AgentWaitParams, ErrorCodes, @@ -57,6 +58,12 @@ export const agentHandlers: GatewayRequestHandlers = { sessionKey?: string; thinking?: string; deliver?: boolean; + attachments?: Array<{ + type?: string; + mimeType?: string; + fileName?: string; + content?: unknown; + }>; provider?: string; lane?: string; extraSystemPrompt?: string; @@ -73,7 +80,45 @@ export const agentHandlers: GatewayRequestHandlers = { }); return; } - const message = request.message.trim(); + /* Normalize each attachment: type, mimeType and fileName are kept only when they are strings; string content is passed through, ArrayBuffer views are converted to base64, anything else becomes undefined. Entries left without content (including an empty string) are dropped, and a missing attachments list yields an empty array. */ const normalizedAttachments = + request.attachments + ?.map((a) => ({ + type: typeof a?.type === "string" ? a.type : undefined, + mimeType: typeof a?.mimeType === "string" ? a.mimeType : undefined, + fileName: typeof a?.fileName === "string" ? a.fileName : undefined, + content: + typeof a?.content === "string" + ? a.content + : ArrayBuffer.isView(a?.content) + ? Buffer.from( + a.content.buffer, + a.content.byteOffset, + a.content.byteLength, + ).toString("base64") + : undefined, + })) + .filter((a) => a.content) ?? 
[]; + + /* With at least one usable attachment, parseMessageWithAttachments is called with maxBytes 5_000_000 and the gateway logger; its returned message (trimmed) and images replace the defaults. If it throws, the request is answered with INVALID_REQUEST carrying String(err) and the handler returns without running the agent. */ let message = request.message.trim(); + let images: Array<{ type: "image"; data: string; mimeType: string }> = []; + if (normalizedAttachments.length > 0) { + try { + const parsed = await parseMessageWithAttachments( + message, + normalizedAttachments, + { maxBytes: 5_000_000, log: context.logGateway }, + ); + message = parsed.message.trim(); + images = parsed.images; + } catch (err) { + respond( + false, + undefined, + errorShape(ErrorCodes.INVALID_REQUEST, String(err)), + ); + return; + } + } const rawProvider = typeof request.provider === "string" ? request.provider.trim() : ""; if (rawProvider) { @@ -275,6 +320,7 @@ export const agentHandlers: GatewayRequestHandlers = { void agentCommand( { message, + /* parsed images forwarded to agentCommand; an empty array when no attachment was supplied */ images, to: sanitizedTo, sessionId: resolvedSessionId, sessionKey: requestedSessionKey, diff --git a/src/gateway/server.agent.test.ts b/src/gateway/server.agent.test.ts index b1807cc50..682f76969 100644 --- a/src/gateway/server.agent.test.ts +++ b/src/gateway/server.agent.test.ts @@ -21,6 +21,9 @@ import { installGatewayTestHooks(); +/** Base64 PNG fixture sent as the attachment content in the image forwarding test. */ const BASE_IMAGE_PNG = + "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/x8AAwMCAO+X3mIAAAAASUVORK5CYII="; + function expectProviders(call: Record, provider: string) { expect(call.provider).toBe(provider); expect(call.messageProvider).toBe(provider); @@ -111,6 +114,58 @@ describe("gateway server agent", () => { await server.close(); }); + /* Sends an agent request with a single PNG attachment and checks that the last agentCommand call (via vi.mocked) received the unchanged message, the webchat provider, and exactly one image entry whose type, mimeType and data match the attachment. */ test("agent forwards image attachments as images[]", async () => { + const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-gw-")); + testState.sessionStorePath = path.join(dir, "sessions.json"); + await fs.writeFile( + testState.sessionStorePath, + JSON.stringify( + { + main: { + sessionId: "sess-main-images", + updatedAt: Date.now(), + }, + }, + null, + 2, + ), + "utf-8", + ); + + const { server, ws } = await startServerWithClient(); + await connectOk(ws); + + const res = await rpcReq(ws, "agent", { + message: "what is in the image?", + sessionKey: "main", 
+ attachments: [ + { + mimeType: "image/png", + fileName: "tiny.png", + content: BASE_IMAGE_PNG, + }, + ], + idempotencyKey: "idem-agent-attachments", + }); + expect(res.ok).toBe(true); + + const spy = vi.mocked(agentCommand); + const call = spy.mock.calls.at(-1)?.[0] as Record; + expect(call.sessionKey).toBe("main"); + expectProviders(call, "webchat"); + expect(call.message).toBe("what is in the image?"); + + const images = call.images as Array>; + expect(Array.isArray(images)).toBe(true); + expect(images.length).toBe(1); + expect(images[0]?.type).toBe("image"); + expect(images[0]?.mimeType).toBe("image/png"); + expect(images[0]?.data).toBe(BASE_IMAGE_PNG); + + ws.close(); + await server.close(); + }); + test("agent falls back to whatsapp when delivery requested and no last provider exists", async () => { testState.allowFrom = ["+1555"]; const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-gw-"));