From c4e76eb6355fced2b1715d39c55fdf01b5715a4e Mon Sep 17 00:00:00 2001 From: cristip73 Date: Sat, 10 Jan 2026 19:17:32 +0200 Subject: [PATCH] fix: enable image attachments in chat messages for Claude API Images were previously converted to markdown data URLs which Claude API treats as plain text, not as actual images. Changes: - Add parseMessageWithAttachments() that returns {message, images[]} - Pass images through the stack to session.prompt() as content blocks - Filter null/empty attachments before parsing - Strip data URL prefix if client sends it This enables iOS and other clients to send images that Claude can actually see. Co-Authored-By: Claude Opus 4.5 --- src/agents/pi-embedded-runner.ts | 13 ++++- src/commands/agent.ts | 10 ++++ src/gateway/chat-attachments.ts | 77 ++++++++++++++++++++++++++++++ src/gateway/server-bridge.ts | 51 ++++++++++++-------- src/gateway/server-methods/chat.ts | 47 ++++++++++-------- 5 files changed, 156 insertions(+), 42 deletions(-) diff --git a/src/agents/pi-embedded-runner.ts b/src/agents/pi-embedded-runner.ts index 48cd2cf06..55c9219b2 100644 --- a/src/agents/pi-embedded-runner.ts +++ b/src/agents/pi-embedded-runner.ts @@ -8,7 +8,12 @@ import type { AgentTool, ThinkingLevel, } from "@mariozechner/pi-agent-core"; -import type { Api, AssistantMessage, Model } from "@mariozechner/pi-ai"; +import type { + Api, + AssistantMessage, + ImageContent, + Model, +} from "@mariozechner/pi-ai"; import { createAgentSession, discoverAuthStorage, @@ -1009,6 +1014,8 @@ export async function runEmbeddedPiAgent(params: { config?: ClawdbotConfig; skillsSnapshot?: SkillSnapshot; prompt: string; + /** Optional image attachments for multimodal messages. */ + images?: ImageContent[]; provider?: string; model?: string; authProfileId?: string; @@ -1434,7 +1441,9 @@ export async function runEmbeddedPiAgent(params: { `embedded run prompt start: runId=${params.runId} sessionId=${params.sessionId}`, ); try { - await session.prompt(params.prompt); + await session.prompt(params.prompt, { + images: params.images, + }); } catch (err) { promptError = err; } finally { diff --git a/src/commands/agent.ts b/src/commands/agent.ts index 485324828..cf7eb6ab7 100644 --- a/src/commands/agent.ts +++ b/src/commands/agent.ts @@ -66,8 +66,17 @@ import { } from "../utils/message-provider.js"; import { normalizeE164 } from "../utils.js"; +/** Image content block for Claude API multimodal messages. */ +type ImageContent = { + type: "image"; + data: string; + mimeType: string; +}; + type AgentCommandOpts = { message: string; + /** Optional image attachments for multimodal messages. */ + images?: ImageContent[]; to?: string; sessionId?: string; sessionKey?: string; @@ -450,6 +459,7 @@ export async function agentCommand( config: cfg, skillsSnapshot, prompt: body, + images: opts.images, provider: providerOverride, model: modelOverride, authProfileId: sessionEntry?.authProfileOverride, diff --git a/src/gateway/chat-attachments.ts b/src/gateway/chat-attachments.ts index 523da182d..50082dc93 100644 --- a/src/gateway/chat-attachments.ts +++ b/src/gateway/chat-attachments.ts @@ -5,6 +5,83 @@ export type ChatAttachment = { content?: unknown; }; +export type ChatImageContent = { + type: "image"; + data: string; + mimeType: string; +}; + +export type ParsedMessageWithImages = { + message: string; + images: ChatImageContent[]; +}; + +/** + * Parse attachments and extract images as structured content blocks. + * Returns the message text and an array of image content blocks + * compatible with Claude API's image format. + */ +export function parseMessageWithAttachments( + message: string, + attachments: ChatAttachment[] | undefined, + opts?: { maxBytes?: number }, +): ParsedMessageWithImages { + const maxBytes = opts?.maxBytes ?? 5_000_000; // 5 MB + if (!attachments || attachments.length === 0) { + return { message, images: [] }; + } + + const images: ChatImageContent[] = []; + + for (const [idx, att] of attachments.entries()) { + if (!att) continue; + const mime = att.mimeType ?? ""; + const content = att.content; + const label = att.fileName || att.type || `attachment-${idx + 1}`; + + if (typeof content !== "string") { + throw new Error(`attachment ${label}: content must be base64 string`); + } + if (!mime.startsWith("image/")) { + throw new Error(`attachment ${label}: only image/* supported`); + } + + let sizeBytes = 0; + let b64 = content.trim(); + // Strip data URL prefix if present (e.g., "data:image/jpeg;base64,...") + const dataUrlMatch = /^data:[^;]+;base64,(.*)$/.exec(b64); + if (dataUrlMatch) { + b64 = dataUrlMatch[1]; + } + // Basic base64 sanity: length multiple of 4 and charset check. + if (b64.length % 4 !== 0 || /[^A-Za-z0-9+/=]/.test(b64)) { + throw new Error(`attachment ${label}: invalid base64 content`); + } + try { + sizeBytes = Buffer.from(b64, "base64").byteLength; + } catch { + throw new Error(`attachment ${label}: invalid base64 content`); + } + if (sizeBytes <= 0 || sizeBytes > maxBytes) { + throw new Error( + `attachment ${label}: exceeds size limit (${sizeBytes} > ${maxBytes} bytes)`, + ); + } + + images.push({ + type: "image", + data: b64, + mimeType: mime, + }); + } + + return { message, images }; +} + +/** + * @deprecated Use parseMessageWithAttachments instead. + * This function converts images to markdown data URLs which Claude API cannot process as images. + */ export function buildMessageWithAttachments( message: string, attachments: ChatAttachment[] | undefined, diff --git a/src/gateway/server-bridge.ts b/src/gateway/server-bridge.ts index 1de965f9a..bd24912b8 100644 --- a/src/gateway/server-bridge.ts +++ b/src/gateway/server-bridge.ts @@ -43,7 +43,10 @@ import { isChatStopCommandText, resolveChatRunExpiresAtMs, } from "./chat-abort.js"; -import { buildMessageWithAttachments } from "./chat-attachments.js"; +import { + type ChatImageContent, + parseMessageWithAttachments, +} from "./chat-attachments.js"; import { ErrorCodes, errorShape, @@ -793,32 +796,37 @@ export function createBridgeHandlers(ctx: BridgeHandlersContext) { }; const stopCommand = isChatStopCommandText(p.message); const normalizedAttachments = - p.attachments?.map((a) => ({ - type: typeof a?.type === "string" ? a.type : undefined, - mimeType: - typeof a?.mimeType === "string" ? a.mimeType : undefined, - fileName: - typeof a?.fileName === "string" ? a.fileName : undefined, - content: - typeof a?.content === "string" - ? a.content - : ArrayBuffer.isView(a?.content) - ? Buffer.from( - a.content.buffer, - a.content.byteOffset, - a.content.byteLength, - ).toString("base64") - : undefined, - })) ?? []; + p.attachments + ?.map((a) => ({ + type: typeof a?.type === "string" ? a.type : undefined, + mimeType: + typeof a?.mimeType === "string" ? a.mimeType : undefined, + fileName: + typeof a?.fileName === "string" ? a.fileName : undefined, + content: + typeof a?.content === "string" + ? a.content + : ArrayBuffer.isView(a?.content) + ? Buffer.from( + a.content.buffer, + a.content.byteOffset, + a.content.byteLength, + ).toString("base64") + : undefined, + })) + .filter((a) => a.content && a.mimeType) ?? []; - let messageWithAttachments = p.message; + let parsedMessage = p.message; + let parsedImages: ChatImageContent[] = []; if (normalizedAttachments.length > 0) { try { - messageWithAttachments = buildMessageWithAttachments( + const parsed = parseMessageWithAttachments( p.message, normalizedAttachments, { maxBytes: 5_000_000 }, ); + parsedMessage = parsed.message; + parsedImages = parsed.images; } catch (err) { return { ok: false, @@ -922,7 +930,8 @@ export function createBridgeHandlers(ctx: BridgeHandlersContext) { }; void agentCommand( { - message: messageWithAttachments, + message: parsedMessage, + images: parsedImages.length > 0 ? parsedImages : undefined, sessionId, sessionKey: p.sessionKey, runId: clientRunId, diff --git a/src/gateway/server-methods/chat.ts b/src/gateway/server-methods/chat.ts index e2c5db37b..eacabe0c6 100644 --- a/src/gateway/server-methods/chat.ts +++ b/src/gateway/server-methods/chat.ts @@ -13,7 +13,10 @@ import { isChatStopCommandText, resolveChatRunExpiresAtMs, } from "../chat-abort.js"; -import { buildMessageWithAttachments } from "../chat-attachments.js"; +import { + type ChatImageContent, + parseMessageWithAttachments, +} from "../chat-attachments.js"; import { ErrorCodes, errorShape, @@ -181,29 +184,34 @@ export const chatHandlers: GatewayRequestHandlers = { }; const stopCommand = isChatStopCommandText(p.message); const normalizedAttachments = - p.attachments?.map((a) => ({ - type: typeof a?.type === "string" ? a.type : undefined, - mimeType: typeof a?.mimeType === "string" ? a.mimeType : undefined, - fileName: typeof a?.fileName === "string" ? a.fileName : undefined, - content: - typeof a?.content === "string" - ? a.content - : ArrayBuffer.isView(a?.content) - ? Buffer.from( - a.content.buffer, - a.content.byteOffset, - a.content.byteLength, - ).toString("base64") - : undefined, - })) ?? []; - let messageWithAttachments = p.message; + p.attachments + ?.map((a) => ({ + type: typeof a?.type === "string" ? a.type : undefined, + mimeType: typeof a?.mimeType === "string" ? a.mimeType : undefined, + fileName: typeof a?.fileName === "string" ? a.fileName : undefined, + content: + typeof a?.content === "string" + ? a.content + : ArrayBuffer.isView(a?.content) + ? Buffer.from( + a.content.buffer, + a.content.byteOffset, + a.content.byteLength, + ).toString("base64") + : undefined, + })) + .filter((a) => a.content && a.mimeType) ?? []; + let parsedMessage = p.message; + let parsedImages: ChatImageContent[] = []; if (normalizedAttachments.length > 0) { try { - messageWithAttachments = buildMessageWithAttachments( + const parsed = parseMessageWithAttachments( p.message, normalizedAttachments, { maxBytes: 5_000_000 }, ); + parsedMessage = parsed.message; + parsedImages = parsed.images; } catch (err) { respond( false, @@ -312,7 +320,8 @@ export const chatHandlers: GatewayRequestHandlers = { void agentCommand( { - message: messageWithAttachments, + message: parsedMessage, + images: parsedImages.length > 0 ? parsedImages : undefined, sessionId, sessionKey: p.sessionKey, runId: clientRunId,