diff --git a/CHANGELOG.md b/CHANGELOG.md index 25a1392ac..5054c3542 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,7 @@ - Onboarding/Gateway: persist non-interactive gateway token auth in config; add WS wizard + gateway tool-calling regression coverage. - Gateway/Control UI: make `chat.send` non-blocking, wire Stop to `chat.abort`, and treat `/stop` as an out-of-band abort. (#653) - Gateway/Control UI: allow `chat.abort` without `runId` (abort active runs), suppress post-abort chat streaming, and prune stuck chat runs. (#653) +- Gateway/Control UI: sniff image attachments for chat.send, drop non-images, and log mismatches. (#670) — thanks @cristip73. - CLI: `clawdbot sessions` now includes `elev:*` + `usage:*` flags in the table output. - CLI/Pairing: accept positional provider for `pairing list|approve` (npm-run compatible); update docs/bot hints. - Branding: normalize user-facing “ClawdBot”/“CLAWDBOT” → “Clawdbot” (CLI, status, docs). diff --git a/src/agents/pi-embedded-runner.ts b/src/agents/pi-embedded-runner.ts index 48cd2cf06..55c9219b2 100644 --- a/src/agents/pi-embedded-runner.ts +++ b/src/agents/pi-embedded-runner.ts @@ -8,7 +8,12 @@ import type { AgentTool, ThinkingLevel, } from "@mariozechner/pi-agent-core"; -import type { Api, AssistantMessage, Model } from "@mariozechner/pi-ai"; +import type { + Api, + AssistantMessage, + ImageContent, + Model, +} from "@mariozechner/pi-ai"; import { createAgentSession, discoverAuthStorage, @@ -1009,6 +1014,8 @@ export async function runEmbeddedPiAgent(params: { config?: ClawdbotConfig; skillsSnapshot?: SkillSnapshot; prompt: string; + /** Optional image attachments for multimodal messages. 
*/ + images?: ImageContent[]; provider?: string; model?: string; authProfileId?: string; @@ -1434,7 +1441,9 @@ export async function runEmbeddedPiAgent(params: { `embedded run prompt start: runId=${params.runId} sessionId=${params.sessionId}`, ); try { - await session.prompt(params.prompt); + await session.prompt(params.prompt, { + images: params.images, + }); } catch (err) { promptError = err; } finally { diff --git a/src/commands/agent.ts b/src/commands/agent.ts index 485324828..cf7eb6ab7 100644 --- a/src/commands/agent.ts +++ b/src/commands/agent.ts @@ -66,8 +66,17 @@ import { } from "../utils/message-provider.js"; import { normalizeE164 } from "../utils.js"; +/** Image content block for Claude API multimodal messages. */ +type ImageContent = { + type: "image"; + data: string; + mimeType: string; +}; + type AgentCommandOpts = { message: string; + /** Optional image attachments for multimodal messages. */ + images?: ImageContent[]; to?: string; sessionId?: string; sessionKey?: string; @@ -450,6 +459,7 @@ export async function agentCommand( config: cfg, skillsSnapshot, prompt: body, + images: opts.images, provider: providerOverride, model: modelOverride, authProfileId: sessionEntry?.authProfileOverride, diff --git a/src/gateway/chat-attachments.test.ts b/src/gateway/chat-attachments.test.ts index e07116636..2cc47fb48 100644 --- a/src/gateway/chat-attachments.test.ts +++ b/src/gateway/chat-attachments.test.ts @@ -3,6 +3,7 @@ import { describe, expect, it } from "vitest"; import { buildMessageWithAttachments, type ChatAttachment, + parseMessageWithAttachments, } from "./chat-attachments.js"; const PNG_1x1 = @@ -56,3 +57,65 @@ describe("buildMessageWithAttachments", () => { ).toThrow(/exceeds size limit/i); }); }); + +describe("parseMessageWithAttachments", () => { + it("sniffs mime when missing", async () => { + const logs: string[] = []; + const parsed = await parseMessageWithAttachments( + "see this", + [ + { + type: "image", + fileName: "dot.png", + content: 
PNG_1x1, + }, + ], + { log: { warn: (message) => logs.push(message) } }, + ); + expect(parsed.message).toBe("see this"); + expect(parsed.images).toHaveLength(1); + expect(parsed.images[0]?.mimeType).toBe("image/png"); + expect(parsed.images[0]?.data).toBe(PNG_1x1); + expect(logs).toHaveLength(0); + }); + + it("drops non-image payloads and logs", async () => { + const logs: string[] = []; + const pdf = Buffer.from("%PDF-1.4\n").toString("base64"); + const parsed = await parseMessageWithAttachments( + "x", + [ + { + type: "file", + mimeType: "image/png", + fileName: "not-image.pdf", + content: pdf, + }, + ], + { log: { warn: (message) => logs.push(message) } }, + ); + expect(parsed.images).toHaveLength(0); + expect(logs).toHaveLength(1); + expect(logs[0]).toMatch(/non-image/i); + }); + + it("prefers sniffed mime type and logs mismatch", async () => { + const logs: string[] = []; + const parsed = await parseMessageWithAttachments( + "x", + [ + { + type: "image", + mimeType: "image/jpeg", + fileName: "dot.png", + content: PNG_1x1, + }, + ], + { log: { warn: (message) => logs.push(message) } }, + ); + expect(parsed.images).toHaveLength(1); + expect(parsed.images[0]?.mimeType).toBe("image/png"); + expect(logs).toHaveLength(1); + expect(logs[0]).toMatch(/mime mismatch/i); + }); +}); diff --git a/src/gateway/chat-attachments.ts b/src/gateway/chat-attachments.ts index 523da182d..e24fc4e2c 100644 --- a/src/gateway/chat-attachments.ts +++ b/src/gateway/chat-attachments.ts @@ -1,3 +1,5 @@ +import { detectMime } from "../media/mime.js"; + export type ChatAttachment = { type?: string; mimeType?: string; @@ -5,6 +7,133 @@ export type ChatAttachment = { content?: unknown; }; +export type ChatImageContent = { + type: "image"; + data: string; + mimeType: string; +}; + +export type ParsedMessageWithImages = { + message: string; + images: ChatImageContent[]; +}; + +type AttachmentLog = { + warn: (message: string) => void; +}; + +function normalizeMime(mime?: string): string | 
undefined {
+  if (!mime) return undefined;
+  // Keep only the media type: drop parameters after ";" and normalize case.
+  const cleaned = mime.split(";")[0]?.trim().toLowerCase();
+  return cleaned || undefined;
+}
+
+/**
+ * Best-effort MIME sniffing from the start of a base64 payload.
+ *
+ * Only the first 256 base64 characters at most are decoded; the slice is
+ * rounded down to a multiple of 4 characters before decoding, and the decoded
+ * bytes are handed to `detectMime`.
+ *
+ * @param base64 - Base64 text (surrounding whitespace is trimmed).
+ * @returns Whatever `detectMime` reports for the decoded head, or `undefined`
+ *   when the input is blank, shorter than 8 usable characters, or sniffing throws.
+ */
+async function sniffMimeFromBase64(
+  base64: string,
+): Promise<string | undefined> {
+  const trimmed = base64.trim();
+  if (!trimmed) return undefined;
+
+  const take = Math.min(256, trimmed.length);
+  const sliceLen = take - (take % 4);
+  if (sliceLen < 8) return undefined;
+
+  try {
+    const head = Buffer.from(trimmed.slice(0, sliceLen), "base64");
+    return await detectMime({ buffer: head });
+  } catch {
+    // Sniffing is advisory: a failure here means "unknown", not an error.
+    return undefined;
+  }
+}
+
+/** True when `mime` is a string that starts with "image/". */
+function isImageMime(mime?: string): boolean {
+  return typeof mime === "string" && mime.startsWith("image/");
+}
+
+/**
+ * Parse attachments and extract images as structured content blocks.
+ * Returns the message text and an array of image content blocks
+ * compatible with Claude API's image format.
+ *
+ * The MIME type sniffed from the payload wins over the declared one: payloads
+ * sniffed as non-images are dropped, payloads that cannot be sniffed are kept
+ * only when the declared type is `image/*`, and a declared/sniffed mismatch is
+ * logged. Dropped attachments are reported through `opts.log` and do not throw.
+ *
+ * @param message - Message text; returned unchanged.
+ * @param attachments - Attachments whose `content` is base64 or a base64 data URL.
+ * @param opts - `maxBytes` caps the decoded size of each attachment (default
+ *   5,000,000); `log` receives warnings for dropped or mismatched attachments.
+ * @returns The message and the accepted images, in attachment order.
+ * @throws Error when an attachment's content is not a string, is not valid
+ *   base64, decodes to zero bytes, or exceeds `maxBytes`.
+ */
+export async function parseMessageWithAttachments(
+  message: string,
+  attachments: ChatAttachment[] | undefined,
+  opts?: { maxBytes?: number; log?: AttachmentLog },
+): Promise<ParsedMessageWithImages> {
+  const maxBytes = opts?.maxBytes ?? 5_000_000; // 5 MB
+  const log = opts?.log;
+  if (!attachments || attachments.length === 0) {
+    return { message, images: [] };
+  }
+
+  const images: ChatImageContent[] = [];
+
+  for (const [idx, att] of attachments.entries()) {
+    if (!att) continue;
+    const mime = att.mimeType ?? "";
+    const content = att.content;
+    const label = att.fileName || att.type || `attachment-${idx + 1}`;
+
+    if (typeof content !== "string") {
+      throw new Error(`attachment ${label}: content must be base64 string`);
+    }
+
+    let sizeBytes = 0;
+    let b64 = content.trim();
+    // Strip data URL prefix if present (e.g., "data:image/jpeg;base64,...")
+    const dataUrlMatch = /^data:[^;]+;base64,(.*)$/.exec(b64);
+    if (dataUrlMatch) {
+      b64 = dataUrlMatch[1];
+    }
+    // Basic base64 sanity: length multiple of 4 and charset check.
+    if (b64.length % 4 !== 0 || /[^A-Za-z0-9+/=]/.test(b64)) {
+      throw new Error(`attachment ${label}: invalid base64 content`);
+    }
+    try {
+      sizeBytes = Buffer.from(b64, "base64").byteLength;
+    } catch {
+      throw new Error(`attachment ${label}: invalid base64 content`);
+    }
+    // Report empty payloads on their own; "exceeds size limit (0 > N bytes)"
+    // would point the caller at the wrong problem.
+    if (sizeBytes <= 0) {
+      throw new Error(`attachment ${label}: empty content`);
+    }
+    if (sizeBytes > maxBytes) {
+      throw new Error(
+        `attachment ${label}: exceeds size limit (${sizeBytes} > ${maxBytes} bytes)`,
+      );
+    }
+
+    const providedMime = normalizeMime(mime);
+    const sniffedMime = normalizeMime(await sniffMimeFromBase64(b64));
+    // Sniffed content type is authoritative when available.
+    if (sniffedMime && !isImageMime(sniffedMime)) {
+      log?.warn(
+        `attachment ${label}: detected non-image (${sniffedMime}), dropping`,
+      );
+      continue;
+    }
+    // Nothing sniffed: fall back to the declared type, but only for images.
+    if (!sniffedMime && !isImageMime(providedMime)) {
+      log?.warn(
+        `attachment ${label}: unable to detect image mime type, dropping`,
+      );
+      continue;
+    }
+    if (sniffedMime && providedMime && sniffedMime !== providedMime) {
+      log?.warn(
+        `attachment ${label}: mime mismatch (${providedMime} -> ${sniffedMime}), using sniffed`,
+      );
+    }
+
+    images.push({
+      type: "image",
+      data: b64,
+      mimeType: sniffedMime ?? providedMime ?? mime,
+    });
+  }
+
+  return { message, images };
+}
+
+/**
+ * @deprecated Use parseMessageWithAttachments instead.
+ * This function converts images to markdown data URLs which Claude API cannot process as images.
+ */ export function buildMessageWithAttachments( message: string, attachments: ChatAttachment[] | undefined, diff --git a/src/gateway/server-bridge.ts b/src/gateway/server-bridge.ts index 1de965f9a..87ac5a955 100644 --- a/src/gateway/server-bridge.ts +++ b/src/gateway/server-bridge.ts @@ -43,7 +43,10 @@ import { isChatStopCommandText, resolveChatRunExpiresAtMs, } from "./chat-abort.js"; -import { buildMessageWithAttachments } from "./chat-attachments.js"; +import { + type ChatImageContent, + parseMessageWithAttachments, +} from "./chat-attachments.js"; import { ErrorCodes, errorShape, @@ -793,32 +796,37 @@ export function createBridgeHandlers(ctx: BridgeHandlersContext) { }; const stopCommand = isChatStopCommandText(p.message); const normalizedAttachments = - p.attachments?.map((a) => ({ - type: typeof a?.type === "string" ? a.type : undefined, - mimeType: - typeof a?.mimeType === "string" ? a.mimeType : undefined, - fileName: - typeof a?.fileName === "string" ? a.fileName : undefined, - content: - typeof a?.content === "string" - ? a.content - : ArrayBuffer.isView(a?.content) - ? Buffer.from( - a.content.buffer, - a.content.byteOffset, - a.content.byteLength, - ).toString("base64") - : undefined, - })) ?? []; + p.attachments + ?.map((a) => ({ + type: typeof a?.type === "string" ? a.type : undefined, + mimeType: + typeof a?.mimeType === "string" ? a.mimeType : undefined, + fileName: + typeof a?.fileName === "string" ? a.fileName : undefined, + content: + typeof a?.content === "string" + ? a.content + : ArrayBuffer.isView(a?.content) + ? Buffer.from( + a.content.buffer, + a.content.byteOffset, + a.content.byteLength, + ).toString("base64") + : undefined, + })) + .filter((a) => a.content) ?? 
[]; - let messageWithAttachments = p.message; + let parsedMessage = p.message; + let parsedImages: ChatImageContent[] = []; if (normalizedAttachments.length > 0) { try { - messageWithAttachments = buildMessageWithAttachments( + const parsed = await parseMessageWithAttachments( p.message, normalizedAttachments, - { maxBytes: 5_000_000 }, + { maxBytes: 5_000_000, log: ctx.logBridge }, ); + parsedMessage = parsed.message; + parsedImages = parsed.images; } catch (err) { return { ok: false, @@ -922,7 +930,8 @@ export function createBridgeHandlers(ctx: BridgeHandlersContext) { }; void agentCommand( { - message: messageWithAttachments, + message: parsedMessage, + images: parsedImages.length > 0 ? parsedImages : undefined, sessionId, sessionKey: p.sessionKey, runId: clientRunId, diff --git a/src/gateway/server-methods/chat.ts b/src/gateway/server-methods/chat.ts index e2c5db37b..c3afb65d4 100644 --- a/src/gateway/server-methods/chat.ts +++ b/src/gateway/server-methods/chat.ts @@ -13,7 +13,10 @@ import { isChatStopCommandText, resolveChatRunExpiresAtMs, } from "../chat-abort.js"; -import { buildMessageWithAttachments } from "../chat-attachments.js"; +import { + type ChatImageContent, + parseMessageWithAttachments, +} from "../chat-attachments.js"; import { ErrorCodes, errorShape, @@ -181,29 +184,34 @@ export const chatHandlers: GatewayRequestHandlers = { }; const stopCommand = isChatStopCommandText(p.message); const normalizedAttachments = - p.attachments?.map((a) => ({ - type: typeof a?.type === "string" ? a.type : undefined, - mimeType: typeof a?.mimeType === "string" ? a.mimeType : undefined, - fileName: typeof a?.fileName === "string" ? a.fileName : undefined, - content: - typeof a?.content === "string" - ? a.content - : ArrayBuffer.isView(a?.content) - ? Buffer.from( - a.content.buffer, - a.content.byteOffset, - a.content.byteLength, - ).toString("base64") - : undefined, - })) ?? 
[]; - let messageWithAttachments = p.message; + p.attachments + ?.map((a) => ({ + type: typeof a?.type === "string" ? a.type : undefined, + mimeType: typeof a?.mimeType === "string" ? a.mimeType : undefined, + fileName: typeof a?.fileName === "string" ? a.fileName : undefined, + content: + typeof a?.content === "string" + ? a.content + : ArrayBuffer.isView(a?.content) + ? Buffer.from( + a.content.buffer, + a.content.byteOffset, + a.content.byteLength, + ).toString("base64") + : undefined, + })) + .filter((a) => a.content) ?? []; + let parsedMessage = p.message; + let parsedImages: ChatImageContent[] = []; if (normalizedAttachments.length > 0) { try { - messageWithAttachments = buildMessageWithAttachments( + const parsed = await parseMessageWithAttachments( p.message, normalizedAttachments, - { maxBytes: 5_000_000 }, + { maxBytes: 5_000_000, log: context.logGateway }, ); + parsedMessage = parsed.message; + parsedImages = parsed.images; } catch (err) { respond( false, @@ -312,7 +320,8 @@ export const chatHandlers: GatewayRequestHandlers = { void agentCommand( { - message: messageWithAttachments, + message: parsedMessage, + images: parsedImages.length > 0 ? 
parsedImages : undefined, sessionId, sessionKey: p.sessionKey, runId: clientRunId, diff --git a/src/gateway/server-methods/types.ts b/src/gateway/server-methods/types.ts index 79545fe80..613faa32a 100644 --- a/src/gateway/server-methods/types.ts +++ b/src/gateway/server-methods/types.ts @@ -32,6 +32,7 @@ export type GatewayRequestContext = { getHealthCache: () => HealthSummary | null; refreshHealthSnapshot: (opts?: { probe?: boolean }) => Promise; logHealth: { error: (message: string) => void }; + logGateway: { warn: (message: string) => void }; incrementPresenceVersion: () => number; getHealthVersion: () => number; broadcast: ( diff --git a/src/gateway/server.ts b/src/gateway/server.ts index a56f07605..2f77446bc 100644 --- a/src/gateway/server.ts +++ b/src/gateway/server.ts @@ -1674,6 +1674,7 @@ export async function startGatewayServer( getHealthCache: () => healthCache, refreshHealthSnapshot, logHealth, + logGateway: log, incrementPresenceVersion: () => { presenceVersion += 1; return presenceVersion;