From c4e76eb6355fced2b1715d39c55fdf01b5715a4e Mon Sep 17 00:00:00 2001 From: cristip73 Date: Sat, 10 Jan 2026 19:17:32 +0200 Subject: [PATCH 1/2] fix: enable image attachments in chat messages for Claude API Images were previously converted to markdown data URLs which Claude API treats as plain text, not as actual images. Changes: - Add parseMessageWithAttachments() that returns {message, images[]} - Pass images through the stack to session.prompt() as content blocks - Filter null/empty attachments before parsing - Strip data URL prefix if client sends it This enables iOS and other clients to send images that Claude can actually see. Co-Authored-By: Claude Opus 4.5 --- src/agents/pi-embedded-runner.ts | 13 ++++- src/commands/agent.ts | 10 ++++ src/gateway/chat-attachments.ts | 77 ++++++++++++++++++++++++++++++ src/gateway/server-bridge.ts | 51 ++++++++++++-------- src/gateway/server-methods/chat.ts | 47 ++++++++++-------- 5 files changed, 156 insertions(+), 42 deletions(-) diff --git a/src/agents/pi-embedded-runner.ts b/src/agents/pi-embedded-runner.ts index 48cd2cf06..55c9219b2 100644 --- a/src/agents/pi-embedded-runner.ts +++ b/src/agents/pi-embedded-runner.ts @@ -8,7 +8,12 @@ import type { AgentTool, ThinkingLevel, } from "@mariozechner/pi-agent-core"; -import type { Api, AssistantMessage, Model } from "@mariozechner/pi-ai"; +import type { + Api, + AssistantMessage, + ImageContent, + Model, +} from "@mariozechner/pi-ai"; import { createAgentSession, discoverAuthStorage, @@ -1009,6 +1014,8 @@ export async function runEmbeddedPiAgent(params: { config?: ClawdbotConfig; skillsSnapshot?: SkillSnapshot; prompt: string; + /** Optional image attachments for multimodal messages. 
*/ + images?: ImageContent[]; provider?: string; model?: string; authProfileId?: string; @@ -1434,7 +1441,9 @@ export async function runEmbeddedPiAgent(params: { `embedded run prompt start: runId=${params.runId} sessionId=${params.sessionId}`, ); try { - await session.prompt(params.prompt); + await session.prompt(params.prompt, { + images: params.images, + }); } catch (err) { promptError = err; } finally { diff --git a/src/commands/agent.ts b/src/commands/agent.ts index 485324828..cf7eb6ab7 100644 --- a/src/commands/agent.ts +++ b/src/commands/agent.ts @@ -66,8 +66,17 @@ import { } from "../utils/message-provider.js"; import { normalizeE164 } from "../utils.js"; +/** Image content block for Claude API multimodal messages. */ +type ImageContent = { + type: "image"; + data: string; + mimeType: string; +}; + type AgentCommandOpts = { message: string; + /** Optional image attachments for multimodal messages. */ + images?: ImageContent[]; to?: string; sessionId?: string; sessionKey?: string; @@ -450,6 +459,7 @@ export async function agentCommand( config: cfg, skillsSnapshot, prompt: body, + images: opts.images, provider: providerOverride, model: modelOverride, authProfileId: sessionEntry?.authProfileOverride, diff --git a/src/gateway/chat-attachments.ts b/src/gateway/chat-attachments.ts index 523da182d..50082dc93 100644 --- a/src/gateway/chat-attachments.ts +++ b/src/gateway/chat-attachments.ts @@ -5,6 +5,83 @@ export type ChatAttachment = { content?: unknown; }; +export type ChatImageContent = { + type: "image"; + data: string; + mimeType: string; +}; + +export type ParsedMessageWithImages = { + message: string; + images: ChatImageContent[]; +}; + +/** + * Parse attachments and extract images as structured content blocks. + * Returns the message text and an array of image content blocks + * compatible with Claude API's image format. 
+ */ +export function parseMessageWithAttachments( + message: string, + attachments: ChatAttachment[] | undefined, + opts?: { maxBytes?: number }, +): ParsedMessageWithImages { + const maxBytes = opts?.maxBytes ?? 5_000_000; // 5 MB + if (!attachments || attachments.length === 0) { + return { message, images: [] }; + } + + const images: ChatImageContent[] = []; + + for (const [idx, att] of attachments.entries()) { + if (!att) continue; + const mime = att.mimeType ?? ""; + const content = att.content; + const label = att.fileName || att.type || `attachment-${idx + 1}`; + + if (typeof content !== "string") { + throw new Error(`attachment ${label}: content must be base64 string`); + } + if (!mime.startsWith("image/")) { + throw new Error(`attachment ${label}: only image/* supported`); + } + + let sizeBytes = 0; + let b64 = content.trim(); + // Strip data URL prefix if present (e.g., "data:image/jpeg;base64,...") + const dataUrlMatch = /^data:[^;]+;base64,(.*)$/.exec(b64); + if (dataUrlMatch) { + b64 = dataUrlMatch[1]; + } + // Basic base64 sanity: length multiple of 4 and charset check. + if (b64.length % 4 !== 0 || /[^A-Za-z0-9+/=]/.test(b64)) { + throw new Error(`attachment ${label}: invalid base64 content`); + } + try { + sizeBytes = Buffer.from(b64, "base64").byteLength; + } catch { + throw new Error(`attachment ${label}: invalid base64 content`); + } + if (sizeBytes <= 0 || sizeBytes > maxBytes) { + throw new Error( + `attachment ${label}: exceeds size limit (${sizeBytes} > ${maxBytes} bytes)`, + ); + } + + images.push({ + type: "image", + data: b64, + mimeType: mime, + }); + } + + return { message, images }; +} + +/** + * @deprecated Use parseMessageWithAttachments instead. + * This function converts images to markdown data URLs which Claude API cannot process as images. 
+ */ export function buildMessageWithAttachments( message: string, attachments: ChatAttachment[] | undefined, diff --git a/src/gateway/server-bridge.ts b/src/gateway/server-bridge.ts index 1de965f9a..bd24912b8 100644 --- a/src/gateway/server-bridge.ts +++ b/src/gateway/server-bridge.ts @@ -43,7 +43,10 @@ import { isChatStopCommandText, resolveChatRunExpiresAtMs, } from "./chat-abort.js"; -import { buildMessageWithAttachments } from "./chat-attachments.js"; +import { + type ChatImageContent, + parseMessageWithAttachments, +} from "./chat-attachments.js"; import { ErrorCodes, errorShape, @@ -793,32 +796,37 @@ export function createBridgeHandlers(ctx: BridgeHandlersContext) { }; const stopCommand = isChatStopCommandText(p.message); const normalizedAttachments = - p.attachments?.map((a) => ({ - type: typeof a?.type === "string" ? a.type : undefined, - mimeType: - typeof a?.mimeType === "string" ? a.mimeType : undefined, - fileName: - typeof a?.fileName === "string" ? a.fileName : undefined, - content: - typeof a?.content === "string" - ? a.content - : ArrayBuffer.isView(a?.content) - ? Buffer.from( - a.content.buffer, - a.content.byteOffset, - a.content.byteLength, - ).toString("base64") - : undefined, - })) ?? []; + p.attachments + ?.map((a) => ({ + type: typeof a?.type === "string" ? a.type : undefined, + mimeType: + typeof a?.mimeType === "string" ? a.mimeType : undefined, + fileName: + typeof a?.fileName === "string" ? a.fileName : undefined, + content: + typeof a?.content === "string" + ? a.content + : ArrayBuffer.isView(a?.content) + ? Buffer.from( + a.content.buffer, + a.content.byteOffset, + a.content.byteLength, + ).toString("base64") + : undefined, + })) + .filter((a) => a.content && a.mimeType) ?? 
[]; - let messageWithAttachments = p.message; + let parsedMessage = p.message; + let parsedImages: ChatImageContent[] = []; if (normalizedAttachments.length > 0) { try { - messageWithAttachments = buildMessageWithAttachments( + const parsed = parseMessageWithAttachments( p.message, normalizedAttachments, { maxBytes: 5_000_000 }, ); + parsedMessage = parsed.message; + parsedImages = parsed.images; } catch (err) { return { ok: false, @@ -922,7 +930,8 @@ export function createBridgeHandlers(ctx: BridgeHandlersContext) { }; void agentCommand( { - message: messageWithAttachments, + message: parsedMessage, + images: parsedImages.length > 0 ? parsedImages : undefined, sessionId, sessionKey: p.sessionKey, runId: clientRunId, diff --git a/src/gateway/server-methods/chat.ts b/src/gateway/server-methods/chat.ts index e2c5db37b..eacabe0c6 100644 --- a/src/gateway/server-methods/chat.ts +++ b/src/gateway/server-methods/chat.ts @@ -13,7 +13,10 @@ import { isChatStopCommandText, resolveChatRunExpiresAtMs, } from "../chat-abort.js"; -import { buildMessageWithAttachments } from "../chat-attachments.js"; +import { + type ChatImageContent, + parseMessageWithAttachments, +} from "../chat-attachments.js"; import { ErrorCodes, errorShape, @@ -181,29 +184,34 @@ export const chatHandlers: GatewayRequestHandlers = { }; const stopCommand = isChatStopCommandText(p.message); const normalizedAttachments = - p.attachments?.map((a) => ({ - type: typeof a?.type === "string" ? a.type : undefined, - mimeType: typeof a?.mimeType === "string" ? a.mimeType : undefined, - fileName: typeof a?.fileName === "string" ? a.fileName : undefined, - content: - typeof a?.content === "string" - ? a.content - : ArrayBuffer.isView(a?.content) - ? Buffer.from( - a.content.buffer, - a.content.byteOffset, - a.content.byteLength, - ).toString("base64") - : undefined, - })) ?? []; - let messageWithAttachments = p.message; + p.attachments + ?.map((a) => ({ + type: typeof a?.type === "string" ? 
a.type : undefined, + mimeType: typeof a?.mimeType === "string" ? a.mimeType : undefined, + fileName: typeof a?.fileName === "string" ? a.fileName : undefined, + content: + typeof a?.content === "string" + ? a.content + : ArrayBuffer.isView(a?.content) + ? Buffer.from( + a.content.buffer, + a.content.byteOffset, + a.content.byteLength, + ).toString("base64") + : undefined, + })) + .filter((a) => a.content && a.mimeType) ?? []; + let parsedMessage = p.message; + let parsedImages: ChatImageContent[] = []; if (normalizedAttachments.length > 0) { try { - messageWithAttachments = buildMessageWithAttachments( + const parsed = parseMessageWithAttachments( p.message, normalizedAttachments, { maxBytes: 5_000_000 }, ); + parsedMessage = parsed.message; + parsedImages = parsed.images; } catch (err) { respond( false, @@ -312,7 +320,8 @@ export const chatHandlers: GatewayRequestHandlers = { void agentCommand( { - message: messageWithAttachments, + message: parsedMessage, + images: parsedImages.length > 0 ? parsedImages : undefined, sessionId, sessionKey: p.sessionKey, runId: clientRunId, From 193ebba657f9bd059a67942849863c11e67075df Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sat, 10 Jan 2026 20:06:33 +0100 Subject: [PATCH 2/2] fix: sniff chat attachment mime (#670) (thanks @cristip73) --- CHANGELOG.md | 1 + src/gateway/chat-attachments.test.ts | 63 ++++++++++++++++++++++++++ src/gateway/chat-attachments.ts | 66 +++++++++++++++++++++++++--- src/gateway/server-bridge.ts | 6 +-- src/gateway/server-methods/chat.ts | 6 +-- src/gateway/server-methods/types.ts | 1 + src/gateway/server.ts | 1 + 7 files changed, 131 insertions(+), 13 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 25a1392ac..5054c3542 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,7 @@ - Onboarding/Gateway: persist non-interactive gateway token auth in config; add WS wizard + gateway tool-calling regression coverage. 
- Gateway/Control UI: make `chat.send` non-blocking, wire Stop to `chat.abort`, and treat `/stop` as an out-of-band abort. (#653) - Gateway/Control UI: allow `chat.abort` without `runId` (abort active runs), suppress post-abort chat streaming, and prune stuck chat runs. (#653) +- Gateway/Control UI: sniff image attachments for chat.send, drop non-images, and log mismatches. (#670) — thanks @cristip73. - CLI: `clawdbot sessions` now includes `elev:*` + `usage:*` flags in the table output. - CLI/Pairing: accept positional provider for `pairing list|approve` (npm-run compatible); update docs/bot hints. - Branding: normalize user-facing “ClawdBot”/“CLAWDBOT” → “Clawdbot” (CLI, status, docs). diff --git a/src/gateway/chat-attachments.test.ts b/src/gateway/chat-attachments.test.ts index e07116636..2cc47fb48 100644 --- a/src/gateway/chat-attachments.test.ts +++ b/src/gateway/chat-attachments.test.ts @@ -3,6 +3,7 @@ import { describe, expect, it } from "vitest"; import { buildMessageWithAttachments, type ChatAttachment, + parseMessageWithAttachments, } from "./chat-attachments.js"; const PNG_1x1 = @@ -56,3 +57,65 @@ describe("buildMessageWithAttachments", () => { ).toThrow(/exceeds size limit/i); }); }); + +describe("parseMessageWithAttachments", () => { + it("sniffs mime when missing", async () => { + const logs: string[] = []; + const parsed = await parseMessageWithAttachments( + "see this", + [ + { + type: "image", + fileName: "dot.png", + content: PNG_1x1, + }, + ], + { log: { warn: (message) => logs.push(message) } }, + ); + expect(parsed.message).toBe("see this"); + expect(parsed.images).toHaveLength(1); + expect(parsed.images[0]?.mimeType).toBe("image/png"); + expect(parsed.images[0]?.data).toBe(PNG_1x1); + expect(logs).toHaveLength(0); + }); + + it("drops non-image payloads and logs", async () => { + const logs: string[] = []; + const pdf = Buffer.from("%PDF-1.4\n").toString("base64"); + const parsed = await parseMessageWithAttachments( + "x", + [ + { + type: 
"file", + mimeType: "image/png", + fileName: "not-image.pdf", + content: pdf, + }, + ], + { log: { warn: (message) => logs.push(message) } }, + ); + expect(parsed.images).toHaveLength(0); + expect(logs).toHaveLength(1); + expect(logs[0]).toMatch(/non-image/i); + }); + + it("prefers sniffed mime type and logs mismatch", async () => { + const logs: string[] = []; + const parsed = await parseMessageWithAttachments( + "x", + [ + { + type: "image", + mimeType: "image/jpeg", + fileName: "dot.png", + content: PNG_1x1, + }, + ], + { log: { warn: (message) => logs.push(message) } }, + ); + expect(parsed.images).toHaveLength(1); + expect(parsed.images[0]?.mimeType).toBe("image/png"); + expect(logs).toHaveLength(1); + expect(logs[0]).toMatch(/mime mismatch/i); + }); +}); diff --git a/src/gateway/chat-attachments.ts b/src/gateway/chat-attachments.ts index 50082dc93..e24fc4e2c 100644 --- a/src/gateway/chat-attachments.ts +++ b/src/gateway/chat-attachments.ts @@ -1,3 +1,5 @@ +import { detectMime } from "../media/mime.js"; + export type ChatAttachment = { type?: string; mimeType?: string; @@ -16,17 +18,50 @@ export type ParsedMessageWithImages = { images: ChatImageContent[]; }; +type AttachmentLog = { + warn: (message: string) => void; +}; + +function normalizeMime(mime?: string): string | undefined { + if (!mime) return undefined; + const cleaned = mime.split(";")[0]?.trim().toLowerCase(); + return cleaned || undefined; +} + +async function sniffMimeFromBase64( + base64: string, +): Promise<string | undefined> { + const trimmed = base64.trim(); + if (!trimmed) return undefined; + + const take = Math.min(256, trimmed.length); + const sliceLen = take - (take % 4); + if (sliceLen < 8) return undefined; + + try { + const head = Buffer.from(trimmed.slice(0, sliceLen), "base64"); + return await detectMime({ buffer: head }); + } catch { + return undefined; + } +} + +function isImageMime(mime?: string): boolean { + return typeof mime === "string" && mime.startsWith("image/"); +} + /** * Parse attachments 
and extract images as structured content blocks. * Returns the message text and an array of image content blocks * compatible with Claude API's image format. */ -export function parseMessageWithAttachments( +export async function parseMessageWithAttachments( message: string, attachments: ChatAttachment[] | undefined, - opts?: { maxBytes?: number }, -): ParsedMessageWithImages { + opts?: { maxBytes?: number; log?: AttachmentLog }, +): Promise<ParsedMessageWithImages> { const maxBytes = opts?.maxBytes ?? 5_000_000; // 5 MB + const log = opts?.log; if (!attachments || attachments.length === 0) { return { message, images: [] }; } @@ -42,9 +77,6 @@ export function parseMessageWithAttachments( if (typeof content !== "string") { throw new Error(`attachment ${label}: content must be base64 string`); } - if (!mime.startsWith("image/")) { - throw new Error(`attachment ${label}: only image/* supported`); - } let sizeBytes = 0; let b64 = content.trim(); @@ -68,10 +100,30 @@ export function parseMessageWithAttachments( ); } + const providedMime = normalizeMime(mime); + const sniffedMime = normalizeMime(await sniffMimeFromBase64(b64)); + if (sniffedMime && !isImageMime(sniffedMime)) { + log?.warn( + `attachment ${label}: detected non-image (${sniffedMime}), dropping`, + ); + continue; + } + if (!sniffedMime && !isImageMime(providedMime)) { + log?.warn( + `attachment ${label}: unable to detect image mime type, dropping`, + ); + continue; + } + if (sniffedMime && providedMime && sniffedMime !== providedMime) { + log?.warn( + `attachment ${label}: mime mismatch (${providedMime} -> ${sniffedMime}), using sniffed`, + ); + } + images.push({ type: "image", data: b64, - mimeType: mime, + mimeType: sniffedMime ?? providedMime ?? 
mime, }); } diff --git a/src/gateway/server-bridge.ts b/src/gateway/server-bridge.ts index bd24912b8..87ac5a955 100644 --- a/src/gateway/server-bridge.ts +++ b/src/gateway/server-bridge.ts @@ -814,16 +814,16 @@ export function createBridgeHandlers(ctx: BridgeHandlersContext) { ).toString("base64") : undefined, })) - .filter((a) => a.content && a.mimeType) ?? []; + .filter((a) => a.content) ?? []; let parsedMessage = p.message; let parsedImages: ChatImageContent[] = []; if (normalizedAttachments.length > 0) { try { - const parsed = parseMessageWithAttachments( + const parsed = await parseMessageWithAttachments( p.message, normalizedAttachments, - { maxBytes: 5_000_000 }, + { maxBytes: 5_000_000, log: ctx.logBridge }, ); parsedMessage = parsed.message; parsedImages = parsed.images; diff --git a/src/gateway/server-methods/chat.ts b/src/gateway/server-methods/chat.ts index eacabe0c6..c3afb65d4 100644 --- a/src/gateway/server-methods/chat.ts +++ b/src/gateway/server-methods/chat.ts @@ -200,15 +200,15 @@ export const chatHandlers: GatewayRequestHandlers = { ).toString("base64") : undefined, })) - .filter((a) => a.content && a.mimeType) ?? []; + .filter((a) => a.content) ?? 
[]; let parsedMessage = p.message; let parsedImages: ChatImageContent[] = []; if (normalizedAttachments.length > 0) { try { - const parsed = parseMessageWithAttachments( + const parsed = await parseMessageWithAttachments( p.message, normalizedAttachments, - { maxBytes: 5_000_000 }, + { maxBytes: 5_000_000, log: context.logGateway }, ); parsedMessage = parsed.message; parsedImages = parsed.images; diff --git a/src/gateway/server-methods/types.ts b/src/gateway/server-methods/types.ts index 79545fe80..613faa32a 100644 --- a/src/gateway/server-methods/types.ts +++ b/src/gateway/server-methods/types.ts @@ -32,6 +32,7 @@ export type GatewayRequestContext = { getHealthCache: () => HealthSummary | null; refreshHealthSnapshot: (opts?: { probe?: boolean }) => Promise; logHealth: { error: (message: string) => void }; + logGateway: { warn: (message: string) => void }; incrementPresenceVersion: () => number; getHealthVersion: () => number; broadcast: ( diff --git a/src/gateway/server.ts b/src/gateway/server.ts index a56f07605..2f77446bc 100644 --- a/src/gateway/server.ts +++ b/src/gateway/server.ts @@ -1674,6 +1674,7 @@ export async function startGatewayServer( getHealthCache: () => healthCache, refreshHealthSnapshot, logHealth, + logGateway: log, incrementPresenceVersion: () => { presenceVersion += 1; return presenceVersion;