feat(gateway): add agent image attachments + live probe

This commit is contained in:
Peter Steinberger
2026-01-10 20:34:34 +00:00
parent b9b1bc2726
commit 9790b39d80
7 changed files with 439 additions and 2 deletions

View File

@@ -27,6 +27,7 @@
- Gateway/Control UI: make `chat.send` non-blocking, wire Stop to `chat.abort`, and treat `/stop` as an out-of-band abort. (#653)
- Gateway/Control UI: allow `chat.abort` without `runId` (abort active runs), suppress post-abort chat streaming, and prune stuck chat runs. (#653)
- Gateway/Control UI: sniff image attachments for chat.send, drop non-images, and log mismatches. (#670) — thanks @cristip73.
- Gateway/Agent: accept image attachments on `agent` (multimodal message) and add live gateway image probe (`CLAWDBOT_LIVE_GATEWAY_IMAGE_PROBE=1`).
- CLI: `clawdbot sessions` now includes `elev:*` + `usage:*` flags in the table output.
- CLI/Pairing: accept positional provider for `pairing list|approve` (npm-run compatible); update docs/bot hints.
- Branding: normalize user-facing “ClawdBot”/“CLAWDBOT” → “Clawdbot” (CLI, status, docs).

View File

@@ -129,6 +129,8 @@ Live tests are split into two layers so we can isolate failures:
- Optional tool-calling stress:
- `CLAWDBOT_LIVE_GATEWAY_TOOL_PROBE=1` enables an extra “bash writes file → read reads it back → echo nonce” check.
- This is specifically meant to catch tool-calling compatibility issues across providers (formatting, history replay, tool_result pairing, etc.).
- Optional image send smoke:
- `CLAWDBOT_LIVE_GATEWAY_IMAGE_PROBE=1` sends a real image attachment through the gateway agent pipeline (multimodal message) and asserts the model can read back a per-run code from the image.
### Recommended live recipes
@@ -143,6 +145,37 @@ Narrow, explicit allowlists are fastest and least flaky:
- Tool calling across several providers (bash + read probe):
- `LIVE=1 CLAWDBOT_LIVE_GATEWAY=1 CLAWDBOT_LIVE_GATEWAY_ALL_MODELS=1 CLAWDBOT_LIVE_GATEWAY_TOOL_PROBE=1 CLAWDBOT_LIVE_GATEWAY_MODELS="openai/gpt-5.2,anthropic/claude-opus-4-5,google/gemini-flash-latest,zai/glm-4.7,minimax/minimax-m2.1" pnpm test:live src/gateway/gateway-models.profiles.live.test.ts`
## Live: model matrix (what we cover)
There is no fixed “CI model list” (live is opt-in), but these are the **recommended** models to cover regularly on a dev machine with keys.
### Baseline: tool calling (Read + optional Bash)
Pick at least one per provider family:
- OpenAI: `openai/gpt-5.2` (or `openai/gpt-5-mini`)
- Anthropic: `anthropic/claude-opus-4-5` (or `anthropic/claude-sonnet-4-5`)
- Google: `google/gemini-flash-latest` (or `google/gemini-2.5-pro`)
- Z.AI (GLM): `zai/glm-4.7`
- MiniMax: `minimax/minimax-m2.1`
Optional additional coverage (nice to have):
- xAI: `xai/grok-4` (or latest available)
- Mistral: `mistral/`… (pick one “tools” capable model you have enabled)
- Cerebras: `cerebras/`… (if you have access)
- LM Studio: `lmstudio/`… (local; tool calling depends on API mode)
### Vision: image send (attachment → multimodal message)
Run with `CLAWDBOT_LIVE_GATEWAY_IMAGE_PROBE=1` and include at least one image-capable model in `CLAWDBOT_LIVE_GATEWAY_MODELS` (Claude/Gemini/OpenAI vision-capable variants, etc.).
### Aggregators / alternate gateways
If you have keys enabled, we also support testing via:
- OpenRouter: `openrouter/...` (hundreds of models; use `clawdbot models scan` to find tool+image capable candidates)
- OpenCode Zen: `opencode-zen/...` (requires `OPENCODE_ZEN_API_KEY`)
Tip: don’t try to hardcode “all models” in docs. The authoritative list is whatever `discoverModels(...)` returns on your machine + whatever keys are available.
## Credentials (never commit)
Live tests discover credentials the same way the CLI does. Practical implications:

View File

@@ -1,4 +1,4 @@
import { randomUUID } from "node:crypto";
import { randomBytes, randomUUID } from "node:crypto";
import fs from "node:fs/promises";
import { createServer } from "node:net";
import os from "node:os";
@@ -16,6 +16,7 @@ import { ensureClawdbotModelsJson } from "../agents/models-config.js";
import { loadConfig } from "../config/config.js";
import { resolveUserPath } from "../utils.js";
import { GatewayClient } from "./client.js";
import { renderCatNoncePngBase64 } from "./live-image-probe.js";
import { startGatewayServer } from "./server.js";
const LIVE = process.env.LIVE === "1" || process.env.CLAWDBOT_LIVE_TEST === "1";
@@ -24,6 +25,8 @@ const ALL_MODELS =
process.env.CLAWDBOT_LIVE_GATEWAY_ALL_MODELS === "1" ||
process.env.CLAWDBOT_LIVE_GATEWAY_MODELS === "all";
const EXTRA_TOOL_PROBES = process.env.CLAWDBOT_LIVE_GATEWAY_TOOL_PROBE === "1";
// Opt-in flag: true only when CLAWDBOT_LIVE_GATEWAY_IMAGE_PROBE is exactly "1".
// Gates the extra image-attachment probe in this live test.
const EXTRA_IMAGE_PROBES =
process.env.CLAWDBOT_LIVE_GATEWAY_IMAGE_PROBE === "1";
const describeLive = LIVE && GATEWAY_LIVE ? describe : describe.skip;
@@ -60,6 +63,43 @@ function isMeaningful(text: string): boolean {
return true;
}
/**
 * Builds a random probe code of `len` characters.
 *
 * Every character is chosen from a fixed 12-symbol alphabet by reducing one
 * byte from `randomBytes` modulo the alphabet length. Because 256 is not a
 * multiple of 12, the first four symbols are marginally more likely.
 *
 * @param len Number of characters to generate (defaults to 10).
 * @returns The generated code.
 */
function randomImageProbeCode(len = 10): string {
  const alphabet = "2345689ABCEF";
  const entropy = randomBytes(len);
  const symbols = Array.from(
    entropy,
    (byte) => alphabet[byte % alphabet.length],
  );
  return symbols.join("");
}
/**
 * Computes the Levenshtein distance between `a` and `b`, i.e. the minimum
 * number of single-unit insertions, deletions, and substitutions needed to
 * turn one string into the other. Comparison is done per UTF-16 code unit.
 *
 * @param a First string.
 * @param b Second string.
 * @returns The edit distance (0 when the strings are equal).
 */
function editDistance(a: string, b: string): number {
  if (a === b) return 0;
  if (a.length === 0) return b.length;
  if (b.length === 0) return a.length;

  // row[j] = distance between the already-processed prefix of `a` and b[0..j).
  const row: number[] = [];
  for (let j = 0; j <= b.length; j += 1) {
    row.push(j);
  }

  for (let i = 0; i < a.length; i += 1) {
    const aCode = a.charCodeAt(i);
    // `diagonal` carries the previous row's value at column j - 1.
    let diagonal = row[0];
    row[0] = i + 1;
    for (let j = 1; j <= b.length; j += 1) {
      const above = row[j];
      const substitution = diagonal + (aCode === b.charCodeAt(j - 1) ? 0 : 1);
      const deletion = above + 1;
      const insertion = row[j - 1] + 1;
      row[j] = Math.min(deletion, insertion, substitution);
      diagonal = above;
    }
  }

  return row[b.length] ?? Number.POSITIVE_INFINITY;
}
async function getFreePort(): Promise<number> {
return await new Promise((resolve, reject) => {
const srv = createServer();
@@ -204,6 +244,14 @@ describeLive("gateway live (dev agent, profile keys)", () => {
}
expect(candidates.length).toBeGreaterThan(0);
// Models that advertise image input; only computed when the image probe is on.
const imageCandidates = !EXTRA_IMAGE_PROBES
  ? []
  : candidates.filter((candidate) => candidate.input?.includes("image"));
// Fail fast instead of silently skipping the probe for every selected model.
if (imageCandidates.length === 0 && EXTRA_IMAGE_PROBES) {
  throw new Error(
    "image probe enabled but no selected models advertise image support; set CLAWDBOT_LIVE_GATEWAY_MODELS to include an image-capable model",
  );
}
// Build a temp config that allows all selected models, so session overrides stick.
const lmstudioProvider = cfg.models?.providers?.lmstudio;
@@ -365,6 +413,53 @@ describeLive("gateway live (dev agent, profile keys)", () => {
await fs.rm(toolWritePath, { force: true });
}
// Optional image probe: send a generated PNG carrying a per-run code through the
// `agent` method and require the reply to mention "cat" and read the code back.
if (EXTRA_IMAGE_PROBES && model.input?.includes("image")) {
  const expectedCode = randomImageProbeCode(10);
  const pngBase64 = renderCatNoncePngBase64(expectedCode);
  const probeRunId = randomUUID();
  const probePrompt =
    "Look at the attached image. Reply with exactly two tokens separated by a single space: " +
    "(1) the animal shown or written in the image, lowercase; " +
    "(2) the code printed in the image, uppercase. No extra text.";

  const probeReply = await client.request<AgentFinalPayload>(
    "agent",
    {
      sessionKey,
      idempotencyKey: `idem-${probeRunId}-image`,
      message: probePrompt,
      attachments: [
        {
          mimeType: "image/png",
          fileName: `probe-${probeRunId}.png`,
          content: pngBase64,
        },
      ],
      deliver: false,
    },
    { expectFinal: true },
  );
  if (probeReply?.status !== "ok") {
    throw new Error(
      `image probe failed: status=${String(probeReply?.status)}`,
    );
  }

  const replyText = extractPayloadText(probeReply?.result);
  const mentionsCat = /\bcat\b/i.test(replyText);
  if (!mentionsCat) {
    throw new Error(`image probe missing 'cat': ${replyText}`);
  }

  // Accept the code with at most one edit, so a single misread glyph does not
  // fail the probe. Tokens whose length differs by more than 2 are ignored.
  const codeLikeTokens =
    replyText.toUpperCase().match(/[A-Z0-9]{6,20}/g) ?? [];
  let closestDistance = Number.POSITIVE_INFINITY;
  for (const token of codeLikeTokens) {
    if (Math.abs(token.length - expectedCode.length) > 2) continue;
    closestDistance = Math.min(
      closestDistance,
      editDistance(token, expectedCode),
    );
  }
  if (!(closestDistance <= 1)) {
    throw new Error(
      `image probe missing code (${expectedCode}): ${replyText}`,
    );
  }
}
// Regression: tool-call-only turn followed by a user message (OpenAI responses bug class).
if (
(model.provider === "openai" &&

View File

@@ -0,0 +1,206 @@
import { deflateSync } from "node:zlib";
/**
 * 256-entry CRC-32 lookup table built with the reflected polynomial
 * 0xEDB88320: entry `n` is the result of running byte `n` through eight
 * shift/xor rounds.
 */
const CRC_TABLE = Uint32Array.from({ length: 256 }, (_unused, byteValue) => {
  let remainder = byteValue;
  for (let round = 0; round < 8; round += 1) {
    const shifted = remainder >>> 1;
    remainder = (remainder & 1) !== 0 ? 0xedb88320 ^ shifted : shifted;
  }
  return remainder >>> 0;
});
/**
 * Computes the CRC-32 of `buf` using the precomputed `CRC_TABLE`.
 *
 * @param buf Bytes to checksum.
 * @returns The checksum as an unsigned 32-bit integer.
 */
function crc32(buf: Buffer) {
  let state = 0xffffffff;
  for (const byte of buf) {
    state = (state >>> 8) ^ CRC_TABLE[(state ^ byte) & 0xff];
  }
  // Final inversion; `>>> 0` converts the signed xor result to unsigned.
  return (state ^ 0xffffffff) >>> 0;
}
/**
 * Serializes one PNG chunk: 4-byte big-endian data length, the ASCII chunk
 * type, the data itself, and a 4-byte big-endian CRC-32 computed over
 * type + data.
 *
 * @param type Chunk type (e.g. "IHDR"), encoded as ASCII.
 * @param data Chunk payload.
 * @returns The complete chunk bytes.
 */
function pngChunk(type: string, data: Buffer) {
  const tag = Buffer.from(type, "ascii");
  const crcOffset = 4 + tag.length + data.length;
  const chunk = Buffer.alloc(crcOffset + 4);
  chunk.writeUInt32BE(data.length, 0);
  chunk.set(tag, 4);
  chunk.set(data, 4 + tag.length);
  // The CRC covers everything after the length field: type followed by data.
  chunk.writeUInt32BE(crc32(chunk.subarray(4, crcOffset)), crcOffset);
  return chunk;
}
/**
 * Encodes raw RGBA pixels as a PNG (8-bit depth, color type 6,
 * non-interlaced) made of a single IHDR, IDAT, and IEND chunk.
 *
 * @param buffer Pixel data, 4 bytes per pixel, rows stored back to back.
 * @param width Image width in pixels.
 * @param height Image height in pixels.
 * @returns The PNG file bytes.
 */
function encodePngRgba(buffer: Buffer, width: number, height: number) {
  const rowBytes = width * 4;
  const scanlineBytes = rowBytes + 1;

  // Each scanline is a filter-type byte (0 = none) followed by the row pixels.
  const scanlines = Buffer.alloc(scanlineBytes * height);
  for (let y = 0; y < height; y += 1) {
    const dstStart = y * scanlineBytes;
    const srcStart = y * rowBytes;
    scanlines[dstStart] = 0;
    buffer.copy(scanlines, dstStart + 1, srcStart, srcStart + rowBytes);
  }
  const idat = deflateSync(scanlines);

  const header = Buffer.alloc(13);
  header.writeUInt32BE(width, 0);
  header.writeUInt32BE(height, 4);
  // bit depth 8, color type 6 (RGBA), compression 0, filter 0, interlace 0
  header.set([8, 6, 0, 0, 0], 8);

  const pngSignature = Buffer.from([
    0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a,
  ]);
  return Buffer.concat([
    pngSignature,
    pngChunk("IHDR", header),
    pngChunk("IDAT", idat),
    pngChunk("IEND", Buffer.alloc(0)),
  ]);
}
/**
 * Writes one RGBA pixel into `buf` at (`x`, `y`), where `buf` stores
 * 4 bytes per pixel in rows of `width` pixels. Coordinates that are negative,
 * past the right edge, or beyond the end of the buffer are ignored.
 *
 * @param buf Destination pixel buffer (mutated in place).
 * @param x Column of the pixel.
 * @param y Row of the pixel.
 * @param width Row width in pixels.
 * @param r Red channel.
 * @param g Green channel.
 * @param b Blue channel.
 * @param a Alpha channel (defaults to 255).
 */
function fillPixel(
  buf: Buffer,
  x: number,
  y: number,
  width: number,
  r: number,
  g: number,
  b: number,
  a = 255,
) {
  if (x < 0 || y < 0 || x >= width) return;
  const offset = 4 * (y * width + x);
  // There is no height parameter, so rows past the bottom are caught here.
  if (offset < 0 || offset + 3 >= buf.length) return;
  const channels = [r, g, b, a];
  for (let c = 0; c < channels.length; c += 1) {
    buf[offset + c] = channels[c];
  }
}
// 5x7 bitmap font used to draw text into the probe image.
// Each glyph is seven row bitmasks (first entry = first row drawn); within a
// row, bit 4 (0b10000) is column 0 and bit 0 is column 4.
// Only digits 0-9 and the letters A-F and T are defined.
const GLYPH_ROWS_5X7: Record<string, number[]> = {
"0": [0b01110, 0b10001, 0b10011, 0b10101, 0b11001, 0b10001, 0b01110],
"1": [0b00100, 0b01100, 0b00100, 0b00100, 0b00100, 0b00100, 0b01110],
"2": [0b01110, 0b10001, 0b00001, 0b00010, 0b00100, 0b01000, 0b11111],
"3": [0b11110, 0b00001, 0b00001, 0b01110, 0b00001, 0b00001, 0b11110],
"4": [0b00010, 0b00110, 0b01010, 0b10010, 0b11111, 0b00010, 0b00010],
"5": [0b11111, 0b10000, 0b11110, 0b00001, 0b00001, 0b10001, 0b01110],
"6": [0b00110, 0b01000, 0b10000, 0b11110, 0b10001, 0b10001, 0b01110],
"7": [0b11111, 0b00001, 0b00010, 0b00100, 0b01000, 0b01000, 0b01000],
"8": [0b01110, 0b10001, 0b10001, 0b01110, 0b10001, 0b10001, 0b01110],
"9": [0b01110, 0b10001, 0b10001, 0b01111, 0b00001, 0b00010, 0b01100],
A: [0b01110, 0b10001, 0b10001, 0b11111, 0b10001, 0b10001, 0b10001],
B: [0b11110, 0b10001, 0b10001, 0b11110, 0b10001, 0b10001, 0b11110],
C: [0b01110, 0b10001, 0b10000, 0b10000, 0b10000, 0b10001, 0b01110],
D: [0b11110, 0b10001, 0b10001, 0b10001, 0b10001, 0b10001, 0b11110],
E: [0b11111, 0b10000, 0b10000, 0b11110, 0b10000, 0b10000, 0b11111],
F: [0b11111, 0b10000, 0b10000, 0b11110, 0b10000, 0b10000, 0b10000],
T: [0b11111, 0b00100, 0b00100, 0b00100, 0b00100, 0b00100, 0b00100],
};
/**
 * Draws a single 5x7 glyph into the pixel buffer, enlarging every font pixel
 * to a `scale` x `scale` square. Characters without an entry in
 * `GLYPH_ROWS_5X7` are skipped without drawing anything.
 *
 * @param params.buf Destination RGBA buffer (mutated in place).
 * @param params.width Row width of the buffer in pixels.
 * @param params.x Left edge of the glyph in pixels.
 * @param params.y Top edge of the glyph in pixels.
 * @param params.char Character to draw.
 * @param params.scale Size multiplier applied to each font pixel.
 * @param params.color Fill color; alpha defaults to 255.
 */
function drawGlyph5x7(params: {
  buf: Buffer;
  width: number;
  x: number;
  y: number;
  char: string;
  scale: number;
  color: { r: number; g: number; b: number; a?: number };
}) {
  const { buf, width, scale, color } = params;
  const glyph = GLYPH_ROWS_5X7[params.char];
  if (!glyph) return;
  const alpha = color.a ?? 255;

  for (let glyphRow = 0; glyphRow < 7; glyphRow += 1) {
    const mask = glyph[glyphRow] ?? 0;
    for (let glyphCol = 0; glyphCol < 5; glyphCol += 1) {
      // Bit 4 is the leftmost column, so shift the probe bit right per column.
      if ((mask & (0b10000 >> glyphCol)) === 0) continue;
      const cellLeft = params.x + glyphCol * scale;
      const cellTop = params.y + glyphRow * scale;
      for (let dy = 0; dy < scale; dy += 1) {
        for (let dx = 0; dx < scale; dx += 1) {
          fillPixel(
            buf,
            cellLeft + dx,
            cellTop + dy,
            width,
            color.r,
            color.g,
            color.b,
            alpha,
          );
        }
      }
    }
  }
}
/**
 * Draws `text` (uppercased) left to right starting at (`x`, `y`), advancing
 * six scaled font pixels per character: five for the glyph and one for
 * spacing. Characters with no glyph still advance the cursor.
 *
 * @param params.buf Destination RGBA buffer (mutated in place).
 * @param params.width Row width of the buffer in pixels.
 * @param params.x Left edge of the first character in pixels.
 * @param params.y Top edge of the text in pixels.
 * @param params.text Text to draw.
 * @param params.scale Size multiplier applied to each font pixel.
 * @param params.color Fill color; alpha defaults to 255.
 */
function drawText(params: {
  buf: Buffer;
  width: number;
  x: number;
  y: number;
  text: string;
  scale: number;
  color: { r: number; g: number; b: number; a?: number };
}) {
  const advance = 6 * params.scale;
  let penX = params.x;
  Array.from(params.text.toUpperCase()).forEach((raw) => {
    const char = raw in GLYPH_ROWS_5X7 ? raw : raw.toUpperCase();
    drawGlyph5x7({
      buf: params.buf,
      width: params.width,
      x: penX,
      y: params.y,
      char,
      scale: params.scale,
      color: params.color,
    });
    penX += advance;
  });
}
/**
 * Returns the pixel width of `text` when drawn with the 5x7 font at `scale`.
 *
 * Each character occupies 6 scaled pixels (5 for the glyph, 1 for spacing);
 * the spacing after the final character is not counted.
 *
 * @param text Text to measure (one cell per UTF-16 code unit).
 * @param scale Size multiplier applied to each font pixel.
 * @returns The width in pixels; 0 for empty text.
 */
function measureTextWidthPx(text: string, scale: number) {
  const glyphCount = text.length;
  // With no glyphs there is no trailing gap to remove; without this guard the
  // formula below would yield a negative width (-scale).
  if (glyphCount === 0) return 0;
  return glyphCount * 6 * scale - scale; // 5px glyph + 1px space
}
/**
 * Renders a two-line image (black text on an opaque white background) and
 * returns it as a base64-encoded PNG. The first line is the word "CAT", the
 * second is `nonce` uppercased; both lines are centered horizontally.
 *
 * @param nonce Code to print on the second line.
 * @returns Base64 string of the PNG bytes.
 */
export function renderCatNoncePngBase64(nonce: string): string {
  const scale = 12;
  const pad = 18;
  const gap = 18;
  const lineHeight = 7 * scale;

  const lines = ["CAT", nonce.toUpperCase()];
  const lineWidths = lines.map((line) => measureTextWidthPx(line, scale));

  const width = Math.max(...lineWidths) + pad * 2;
  const height = pad * 2 + lineHeight + gap + lineHeight;

  // Filling every byte with 255 yields opaque white pixels.
  const pixels = Buffer.alloc(width * height * 4, 255);
  const ink = { r: 0, g: 0, b: 0 };

  lines.forEach((line, lineIndex) => {
    drawText({
      buf: pixels,
      width,
      x: Math.floor((width - lineWidths[lineIndex]) / 2),
      y: pad + lineIndex * (lineHeight + gap),
      text: line,
      scale,
      color: ink,
    });
  });

  return encodePngRgba(pixels, width, height).toString("base64");
}

View File

@@ -225,6 +225,7 @@ export const AgentParamsSchema = Type.Object(
sessionKey: Type.Optional(Type.String()),
thinking: Type.Optional(Type.String()),
deliver: Type.Optional(Type.Boolean()),
attachments: Type.Optional(Type.Array(Type.Unknown())),
provider: Type.Optional(Type.String()),
timeout: Type.Optional(Type.Integer({ minimum: 0 })),
lane: Type.Optional(Type.String()),

View File

@@ -23,6 +23,7 @@ import {
isWhatsAppGroupJid,
normalizeWhatsAppTarget,
} from "../../whatsapp/normalize.js";
import { parseMessageWithAttachments } from "../chat-attachments.js";
import {
type AgentWaitParams,
ErrorCodes,
@@ -57,6 +58,12 @@ export const agentHandlers: GatewayRequestHandlers = {
sessionKey?: string;
thinking?: string;
deliver?: boolean;
attachments?: Array<{
type?: string;
mimeType?: string;
fileName?: string;
content?: unknown;
}>;
provider?: string;
lane?: string;
extraSystemPrompt?: string;
@@ -73,7 +80,45 @@ export const agentHandlers: GatewayRequestHandlers = {
});
return;
}
const message = request.message.trim();
// Normalize incoming attachments: keep metadata fields only when they are
// strings, and coerce content to a string. String content passes through
// unchanged; ArrayBuffer views (typed arrays / DataView) are base64-encoded.
// Entries whose content ends up missing or empty are dropped.
const normalizedAttachments =
request.attachments
?.map((a) => ({
type: typeof a?.type === "string" ? a.type : undefined,
mimeType: typeof a?.mimeType === "string" ? a.mimeType : undefined,
fileName: typeof a?.fileName === "string" ? a.fileName : undefined,
content:
typeof a?.content === "string"
? a.content
: ArrayBuffer.isView(a?.content)
? Buffer.from(
a.content.buffer,
a.content.byteOffset,
a.content.byteLength,
).toString("base64")
: undefined,
}))
.filter((a) => a.content) ?? [];
// `message` and `images` are reassigned below when attachments are parsed.
let message = request.message.trim();
let images: Array<{ type: "image"; data: string; mimeType: string }> = [];
if (normalizedAttachments.length > 0) {
try {
// NOTE(review): maxBytes is presumably a size cap enforced by
// parseMessageWithAttachments — confirm in chat-attachments.
const parsed = await parseMessageWithAttachments(
message,
normalizedAttachments,
{ maxBytes: 5_000_000, log: context.logGateway },
);
message = parsed.message.trim();
images = parsed.images;
} catch (err) {
// Any parsing failure is reported to the caller as INVALID_REQUEST and
// the handler stops here.
respond(
false,
undefined,
errorShape(ErrorCodes.INVALID_REQUEST, String(err)),
);
return;
}
}
const rawProvider =
typeof request.provider === "string" ? request.provider.trim() : "";
if (rawProvider) {
@@ -275,6 +320,7 @@ export const agentHandlers: GatewayRequestHandlers = {
void agentCommand(
{
message,
images,
to: sanitizedTo,
sessionId: resolvedSessionId,
sessionKey: requestedSessionKey,

View File

@@ -21,6 +21,9 @@ import {
installGatewayTestHooks();
// Base64-encoded 1x1 PNG, used as a minimal image attachment payload in tests.
const BASE_IMAGE_PNG =
"iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/x8AAwMCAO+X3mIAAAAASUVORK5CYII=";
function expectProviders(call: Record<string, unknown>, provider: string) {
expect(call.provider).toBe(provider);
expect(call.messageProvider).toBe(provider);
@@ -111,6 +114,58 @@ describe("gateway server agent", () => {
await server.close();
});
// Verifies that an `agent` request carrying a PNG attachment results in
// agentCommand being called with the image in `images[]` and the original text
// left unchanged in `message`.
test("agent forwards image attachments as images[]", async () => {
// Seed a temporary session store that contains a "main" session entry.
const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-gw-"));
testState.sessionStorePath = path.join(dir, "sessions.json");
await fs.writeFile(
testState.sessionStorePath,
JSON.stringify(
{
main: {
sessionId: "sess-main-images",
updatedAt: Date.now(),
},
},
null,
2,
),
"utf-8",
);
const { server, ws } = await startServerWithClient();
await connectOk(ws);
// Send the request with a single base64 PNG attachment.
const res = await rpcReq(ws, "agent", {
message: "what is in the image?",
sessionKey: "main",
attachments: [
{
mimeType: "image/png",
fileName: "tiny.png",
content: BASE_IMAGE_PNG,
},
],
idempotencyKey: "idem-agent-attachments",
});
expect(res.ok).toBe(true);
// Inspect the first argument of the most recent agentCommand call.
const spy = vi.mocked(agentCommand);
const call = spy.mock.calls.at(-1)?.[0] as Record<string, unknown>;
expect(call.sessionKey).toBe("main");
expectProviders(call, "webchat");
expect(call.message).toBe("what is in the image?");
// Exactly one image is expected, with the base64 data passed through as-is.
const images = call.images as Array<Record<string, unknown>>;
expect(Array.isArray(images)).toBe(true);
expect(images.length).toBe(1);
expect(images[0]?.type).toBe("image");
expect(images[0]?.mimeType).toBe("image/png");
expect(images[0]?.data).toBe(BASE_IMAGE_PNG);
ws.close();
await server.close();
});
test("agent falls back to whatsapp when delivery requested and no last provider exists", async () => {
testState.allowFrom = ["+1555"];
const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-gw-"));