diff --git a/src/agents/clawdbot-tools.ts b/src/agents/clawdbot-tools.ts
index 5ae4891dc..98717cd3b 100644
--- a/src/agents/clawdbot-tools.ts
+++ b/src/agents/clawdbot-tools.ts
@@ -41,12 +41,15 @@ export function createClawdbotTools(options?: {
   replyToMode?: "off" | "first" | "all";
   /** Mutable ref to track if a reply was sent (for "first" mode). */
   hasRepliedRef?: { value: boolean };
+  /** If true, the model has native vision capability */
+  modelHasVision?: boolean;
 }): AnyAgentTool[] {
   const imageTool = options?.agentDir?.trim()
     ? createImageTool({
         config: options?.config,
         agentDir: options.agentDir,
         sandboxRoot: options?.sandboxRoot,
+        modelHasVision: options?.modelHasVision,
       })
     : null;
   const webSearchTool = createWebSearchTool({
diff --git a/src/agents/pi-embedded-runner/run/attempt.ts b/src/agents/pi-embedded-runner/run/attempt.ts
index 407a25246..94a80dcb5 100644
--- a/src/agents/pi-embedded-runner/run/attempt.ts
+++ b/src/agents/pi-embedded-runner/run/attempt.ts
@@ -2,7 +2,7 @@ import fs from "node:fs/promises";
 import os from "node:os";
 
 import type { AgentMessage } from "@mariozechner/pi-agent-core";
-import type { AssistantMessage } from "@mariozechner/pi-ai";
+import type { AssistantMessage, ImageContent } from "@mariozechner/pi-ai";
 import { streamSimple } from "@mariozechner/pi-ai";
 import { createAgentSession, SessionManager, SettingsManager } from "@mariozechner/pi-coding-agent";
@@ -69,7 +69,9 @@ import { resolveSandboxRuntimeStatus } from "../../sandbox/runtime-status.js";
 import { isTimeoutError } from "../../failover-error.js";
 import { getGlobalHookRunner } from "../../../plugins/hook-runner-global.js";
+import { MAX_IMAGE_BYTES } from "../../../media/constants.js";
 import type { EmbeddedRunAttemptParams, EmbeddedRunAttemptResult } from "./types.js";
+import { detectAndLoadPromptImages } from "./images.js";
 
 export async function runEmbeddedAttempt(
   params: EmbeddedRunAttemptParams,
@@ -133,6 +135,9 @@ export async function runEmbeddedAttempt(
 
   const agentDir = params.agentDir ?? resolveClawdbotAgentDir();
 
+  // Check if the model supports native image input
+  const modelHasVision = params.model.input?.includes("image") ?? false;
+
   const toolsRaw = createClawdbotCodingTools({
     exec: {
       ...params.execOverrides,
@@ -153,6 +158,7 @@ export async function runEmbeddedAttempt(
     currentThreadTs: params.currentThreadTs,
     replyToMode: params.replyToMode,
     hasRepliedRef: params.hasRepliedRef,
+    modelHasVision,
   });
   const tools = sanitizeToolsForGoogle({ tools: toolsRaw, provider: params.provider });
   logToolSchemasForGoogle({ tools, provider: params.provider });
@@ -530,7 +536,60 @@ export async function runEmbeddedAttempt(
   }
 
   try {
-    await abortable(activeSession.prompt(effectivePrompt, { images: params.images }));
+    // Detect and load images referenced in the prompt for vision-capable models.
+    // This eliminates the need for an explicit "view" tool call by injecting
+    // images directly into the prompt when the model supports it.
+    // Also scans conversation history to enable follow-up questions about earlier images.
+    const imageResult = await detectAndLoadPromptImages({
+      prompt: effectivePrompt,
+      workspaceDir: effectiveWorkspace,
+      model: params.model,
+      existingImages: params.images,
+      historyMessages: activeSession.messages,
+      maxBytes: MAX_IMAGE_BYTES,
+      // Enforce sandbox path restrictions when sandbox is enabled
+      sandboxRoot: sandbox?.enabled ? sandbox.workspaceDir : undefined,
+    });
+
+    // Inject history images into their original message positions.
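+    // (mutates activeSession.messages in place before the new prompt is sent).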
+    // This ensures the model sees images in context (e.g., "compare to the first image").
+    if (imageResult.historyImagesByIndex.size > 0) {
+      for (const [msgIndex, images] of imageResult.historyImagesByIndex) {
+        // Bounds check: ensure index is valid before accessing
+        if (msgIndex < 0 || msgIndex >= activeSession.messages.length) continue;
+        const msg = activeSession.messages[msgIndex];
+        if (msg && msg.role === "user") {
+          // Convert string content to array format if needed
+          if (typeof msg.content === "string") {
+            msg.content = [{ type: "text", text: msg.content }];
+          }
+          if (Array.isArray(msg.content)) {
+            // Check for existing image content to avoid duplicates across turns
+            const existingImageData = new Set(
+              msg.content
+                .filter((c): c is ImageContent =>
+                  c != null && typeof c === "object" && c.type === "image" && typeof c.data === "string",
+                )
+                .map((c) => c.data),
+            );
+            for (const img of images) {
+              // Only add if this image isn't already in the message
+              if (!existingImageData.has(img.data)) {
+                msg.content.push(img);
+              }
+            }
+          }
+        }
+      }
+    }
+
+    // Only pass images option if there are actually images to pass
+    // This avoids potential issues with models that don't expect the images parameter
+    if (imageResult.images.length > 0) {
+      await abortable(activeSession.prompt(effectivePrompt, { images: imageResult.images }));
+    } else {
+      await abortable(activeSession.prompt(effectivePrompt));
+    }
   } catch (err) {
     promptError = err;
   } finally {
diff --git a/src/agents/pi-embedded-runner/run/images.test.ts b/src/agents/pi-embedded-runner/run/images.test.ts
new file mode 100644
index 000000000..918f2027c
--- /dev/null
+++ b/src/agents/pi-embedded-runner/run/images.test.ts
@@ -0,0 +1,218 @@
+import { describe, expect, it } from "vitest";
+
+import { detectImageReferences, modelSupportsImages } from "./images.js";
+
+describe("detectImageReferences", () => {
+  it("detects absolute file paths with common extensions", () => {
+    const prompt = "Check this image /path/to/screenshot.png and tell me what you see";
+    const refs = detectImageReferences(prompt);
+
+    expect(refs).toHaveLength(1);
+    expect(refs[0]).toEqual({
+      raw: "/path/to/screenshot.png",
+      type: "path",
+      resolved: "/path/to/screenshot.png",
+    });
+  });
+
+  it("detects relative paths starting with ./", () => {
+    const prompt = "Look at ./images/photo.jpg";
+    const refs = detectImageReferences(prompt);
+
+    expect(refs).toHaveLength(1);
+    expect(refs[0]?.raw).toBe("./images/photo.jpg");
+    expect(refs[0]?.type).toBe("path");
+  });
+
+  it("detects relative paths starting with ../", () => {
+    const prompt = "The file is at ../screenshots/test.jpeg";
+    const refs = detectImageReferences(prompt);
+
+    expect(refs).toHaveLength(1);
+    expect(refs[0]?.raw).toBe("../screenshots/test.jpeg");
+    expect(refs[0]?.type).toBe("path");
+  });
+
+  it("detects home directory paths starting with ~/", () => {
+    const prompt = "My photo is at ~/Pictures/vacation.png";
+    const refs = detectImageReferences(prompt);
+
+    expect(refs).toHaveLength(1);
+    expect(refs[0]?.raw).toBe("~/Pictures/vacation.png");
+    expect(refs[0]?.type).toBe("path");
+    // Resolved path should expand ~
+    expect(refs[0]?.resolved).not.toContain("~");
+  });
+
+  it("detects HTTP URLs with image extensions", () => {
+    const prompt = "Check this URL: https://mysite.com/images/logo.png";
+    const refs = detectImageReferences(prompt);
+
+    expect(refs).toHaveLength(1);
+    expect(refs[0]).toEqual({
+      raw: "https://mysite.com/images/logo.png",
+      type: "url",
+      resolved: "https://mysite.com/images/logo.png",
"https://mysite.com/images/logo.png", + }); + }); + + it("detects HTTPS URLs with query parameters", () => { + const prompt = "Image from https://cdn.mysite.com/img.jpg?size=large&v=2"; + const refs = detectImageReferences(prompt); + + expect(refs).toHaveLength(1); + expect(refs[0]?.type).toBe("url"); + expect(refs[0]?.raw).toContain("https://cdn.mysite.com/img.jpg"); + }); + + it("detects multiple image references in a prompt", () => { + const prompt = ` + Compare these two images: + 1. /home/user/photo1.png + 2. https://mysite.com/photo2.jpg + `; + const refs = detectImageReferences(prompt); + + expect(refs).toHaveLength(2); + expect(refs.some((r) => r.type === "path")).toBe(true); + expect(refs.some((r) => r.type === "url")).toBe(true); + }); + + it("handles various image extensions", () => { + const extensions = ["png", "jpg", "jpeg", "gif", "webp", "bmp", "tiff", "heic"]; + for (const ext of extensions) { + const prompt = `Image: /test/image.${ext}`; + const refs = detectImageReferences(prompt); + expect(refs.length).toBeGreaterThanOrEqual(1); + expect(refs[0]?.raw).toContain(`.${ext}`); + } + }); + + it("deduplicates repeated image references", () => { + const prompt = "Look at /path/image.png and also /path/image.png again"; + const refs = detectImageReferences(prompt); + + expect(refs).toHaveLength(1); + }); + + it("returns empty array when no images found", () => { + const prompt = "Just some text without any image references"; + const refs = detectImageReferences(prompt); + + expect(refs).toHaveLength(0); + }); + + it("ignores non-image file extensions", () => { + const prompt = "Check /path/to/document.pdf and /code/file.ts"; + const refs = detectImageReferences(prompt); + + expect(refs).toHaveLength(0); + }); + + it("handles paths inside quotes (without spaces)", () => { + const prompt = 'The file is at "/path/to/image.png"'; + const refs = detectImageReferences(prompt); + + expect(refs).toHaveLength(1); + expect(refs[0]?.raw).toBe("/path/to/image.png"); + }); + + it("handles paths in parentheses", () => { + const prompt = "See the image (./screenshot.png) for details"; + const refs = detectImageReferences(prompt); + + expect(refs).toHaveLength(1); + expect(refs[0]?.raw).toBe("./screenshot.png"); + }); + + it("detects [Image: source: ...] format from messaging systems", () => { + const prompt = `What does this image show? +[Image: source: /Users/tyleryust/Library/Messages/Attachments/IMG_0043.jpeg]`; + const refs = detectImageReferences(prompt); + + expect(refs).toHaveLength(1); + expect(refs[0]?.raw).toBe("/Users/tyleryust/Library/Messages/Attachments/IMG_0043.jpeg"); + expect(refs[0]?.type).toBe("path"); + }); + + it("handles complex message attachment paths", () => { + const prompt = `[Image: source: /Users/tyleryust/Library/Messages/Attachments/23/03/AA4726EA-DB27-4269-BA56-1436936CC134/5E3E286A-F585-4E5E-9043-5BC2AFAFD81BIMG_0043.jpeg]`; + const refs = detectImageReferences(prompt); + + expect(refs).toHaveLength(1); + expect(refs[0]?.resolved).toContain("IMG_0043.jpeg"); + }); + + it("detects multiple images in [media attached: ...] 
format", () => { + // Multi-file format uses separate brackets on separate lines + const prompt = `[media attached: 2 files] +[media attached 1/2: /Users/tyleryust/.clawdbot/media/IMG_6430.jpeg (image/jpeg)] +[media attached 2/2: /Users/tyleryust/.clawdbot/media/IMG_6431.jpeg (image/jpeg)] +what about these images?`; + const refs = detectImageReferences(prompt); + + expect(refs).toHaveLength(2); + expect(refs[0]?.resolved).toContain("IMG_6430.jpeg"); + expect(refs[1]?.resolved).toContain("IMG_6431.jpeg"); + }); + + it("does not double-count path and url in same bracket", () => { + // Single file with URL (| separates path from url, not multiple files) + const prompt = `[media attached: /cache/IMG_6430.jpeg (image/jpeg) | /cache/IMG_6430.jpeg]`; + const refs = detectImageReferences(prompt); + + expect(refs).toHaveLength(1); + expect(refs[0]?.resolved).toContain("IMG_6430.jpeg"); + }); + + it("skips example.com URLs as they are documentation examples", () => { + const prompt = `To send an image: MEDIA:https://example.com/image.jpg +Here is my actual image: /path/to/real.png`; + const refs = detectImageReferences(prompt); + + expect(refs).toHaveLength(1); + expect(refs[0]?.raw).toBe("/path/to/real.png"); + }); + + it("handles single file format with URL (no index)", () => { + const prompt = `[media attached: /cache/photo.jpeg (image/jpeg) | https://example.com/url] +what is this?`; + const refs = detectImageReferences(prompt); + + expect(refs).toHaveLength(1); + expect(refs[0]?.resolved).toContain("photo.jpeg"); + }); + + it("handles paths with spaces in filename", () => { + // URL after | is https, not a local path, so only the local path should be detected + const prompt = `[media attached: /Users/test/.clawdbot/media/ChatGPT Image Apr 21, 2025.png (image/png) | https://example.com/same.png] +what is this?`; + const refs = detectImageReferences(prompt); + + // Only 1 ref - the local path (example.com URLs are skipped) + expect(refs).toHaveLength(1); + expect(refs[0]?.resolved).toContain("ChatGPT Image Apr 21, 2025.png"); + }); +}); + +describe("modelSupportsImages", () => { + it("returns true when model input includes image", () => { + const model = { input: ["text", "image"] }; + expect(modelSupportsImages(model)).toBe(true); + }); + + it("returns false when model input does not include image", () => { + const model = { input: ["text"] }; + expect(modelSupportsImages(model)).toBe(false); + }); + + it("returns false when model input is undefined", () => { + const model = {}; + expect(modelSupportsImages(model)).toBe(false); + }); + + it("returns false when model input is empty", () => { + const model = { input: [] }; + expect(modelSupportsImages(model)).toBe(false); + }); +}); diff --git a/src/agents/pi-embedded-runner/run/images.ts b/src/agents/pi-embedded-runner/run/images.ts new file mode 100644 index 000000000..f7de8b734 --- /dev/null +++ b/src/agents/pi-embedded-runner/run/images.ts @@ -0,0 +1,379 @@ +import fs from "node:fs/promises"; +import path from "node:path"; + +import type { ImageContent } from "@mariozechner/pi-ai"; + +import { assertSandboxPath } from "../../sandbox-paths.js"; +import { extractTextFromMessage } from "../../../tui/tui-formatters.js"; +import { loadWebMedia } from "../../../web/media.js"; +import { resolveUserPath } from "../../../utils.js"; +import { log } from "../logger.js"; + +/** + * Common image file extensions for detection. 
+ */
+const IMAGE_EXTENSIONS = new Set([
+  ".png",
+  ".jpg",
+  ".jpeg",
+  ".gif",
+  ".webp",
+  ".bmp",
+  ".tiff",
+  ".tif",
+  ".heic",
+  ".heif",
+]);
+
+/**
+ * Result of detecting an image reference in text.
+ */
+export interface DetectedImageRef {
+  /** The raw matched string from the prompt */
+  raw: string;
+  /** The type of reference (path or url) */
+  type: "path" | "url";
+  /** The resolved/normalized path or URL */
+  resolved: string;
+  /** Index of the message this ref was found in (for history images) */
+  messageIndex?: number;
+}
+
+/**
+ * Checks if a file extension indicates an image file.
+ */
+function isImageExtension(filePath: string): boolean {
+  const ext = path.extname(filePath).toLowerCase();
+  return IMAGE_EXTENSIONS.has(ext);
+}
+
+/**
+ * Detects image references in a user prompt.
+ *
+ * Patterns detected:
+ *   - Absolute paths: /path/to/image.png
+ *   - Relative paths: ./image.png, ../images/photo.jpg
+ *   - Home paths: ~/Pictures/screenshot.png
+ *   - HTTP(S) URLs: https://example.com/image.png
+ *   - Message attachments: [Image: source: /path/to/image.jpg]
+ *
+ * @param prompt The user prompt text to scan
+ * @returns Array of detected image references
+ */
+export function detectImageReferences(prompt: string): DetectedImageRef[] {
+  const refs: DetectedImageRef[] = [];
+  const seen = new Set<string>();
+
+  // Helper to add a path ref
+  const addPathRef = (raw: string) => {
+    const trimmed = raw.trim();
+    if (!trimmed || seen.has(trimmed.toLowerCase())) return;
+    if (!isImageExtension(trimmed)) return;
+    seen.add(trimmed.toLowerCase());
+    const resolved = trimmed.startsWith("~") ? resolveUserPath(trimmed) : trimmed;
+    refs.push({ raw: trimmed, type: "path", resolved });
+  };
+
+  // Pattern for [media attached: path (type) | url] or [media attached N/M: path (type) | url] format
+  // Each bracket = ONE file. The | separates path from URL, not multiple files.
+  // Multi-file format uses separate brackets on separate lines.
+  const mediaAttachedPattern = /\[media attached(?:\s+\d+\/\d+)?:\s*([^\]]+)\]/gi;
+  let match: RegExpExecArray | null;
+  while ((match = mediaAttachedPattern.exec(prompt)) !== null) {
+    const content = match[1];
+
+    // Skip "[media attached: N files]" header lines
+    if (/^\d+\s+files?$/i.test(content.trim())) {
+      continue;
+    }
+
+    // Extract path before the (mime/type) or | delimiter
+    // Format is: path (type) | url  OR just: path (type)
+    // Path may contain spaces (e.g., "ChatGPT Image Apr 21.png")
+    // Use non-greedy .+? to stop at first image extension
+    const pathMatch = content.match(
+      /^\s*(.+?\.(?:png|jpe?g|gif|webp|bmp|tiff?|heic|heif))\s*(?:\(|$|\|)/i,
+    );
+    if (pathMatch?.[1]) {
+      addPathRef(pathMatch[1].trim());
+    }
+  }
+
+  // Pattern for [Image: source: /path/...] format from messaging systems
+  const messageImagePattern = /\[Image:\s*source:\s*([^\]]+\.(?:png|jpe?g|gif|webp|bmp|tiff?|heic|heif))\]/gi;
+  while ((match = messageImagePattern.exec(prompt)) !== null) {
+    const raw = match[1]?.trim();
+    if (raw) addPathRef(raw);
+  }
+
+  // Pattern for HTTP(S) URLs ending in image extensions
+  // Skip example.com URLs as they're often just documentation examples
+  const urlPattern =
+    /https?:\/\/[^\s<>"'`\]]+\.(?:png|jpe?g|gif|webp|bmp|tiff?|heic|heif)(?:\?[^\s<>"'`\]]*)?/gi;
+  while ((match = urlPattern.exec(prompt)) !== null) {
+    const raw = match[0];
+    // Skip example.com URLs - they're documentation, not real images
+    if (raw.includes("example.com")) continue;
+    if (seen.has(raw.toLowerCase())) continue;
+    seen.add(raw.toLowerCase());
+    refs.push({ raw, type: "url", resolved: raw });
+  }
+
+  // Pattern for file paths (absolute, relative, or home)
+  // Matches:
+  //   - /absolute/path/to/file.ext (including paths with special chars like Messages/Attachments)
+  //   - ./relative/path.ext
+  //   - ../parent/path.ext
+  //   - ~/home/path.ext
+  const pathPattern = /(?:^|\s|["'`(])((\.\.?\/|[~/])[^\s"'`()[\]]*\.(?:png|jpe?g|gif|webp|bmp|tiff?|heic|heif))/gi;
+  while ((match = pathPattern.exec(prompt)) !== null) {
+    const raw = match[1] || match[0];
+    addPathRef(raw);
+  }
+
+  return refs;
+}
+
+/**
+ * Loads an image from a file path or URL and returns it as ImageContent.
+ *
+ * @param ref The detected image reference
+ * @param workspaceDir The current workspace directory for resolving relative paths
+ * @param options Optional settings for sandbox and size limits
+ * @returns The loaded image content, or null if loading failed
+ */
+export async function loadImageFromRef(
+  ref: DetectedImageRef,
+  workspaceDir: string,
+  options?: {
+    maxBytes?: number;
+    /** If set, enforce that file paths are within this sandbox root */
+    sandboxRoot?: string;
+  },
+): Promise<ImageContent | null> {
+  try {
+    let targetPath = ref.resolved;
+
+    // When sandbox is enabled, block remote URL loading to maintain network boundary
+    if (ref.type === "url" && options?.sandboxRoot) {
+      log.debug(`Native image: rejecting remote URL in sandboxed session: ${ref.resolved}`);
+      return null;
+    }
+
+    // For file paths, resolve relative to the appropriate root:
+    //   - When sandbox is enabled, resolve relative to sandboxRoot for security
+    //   - Otherwise, resolve relative to workspaceDir
+    if (ref.type === "path" && !path.isAbsolute(targetPath)) {
+      const resolveRoot = options?.sandboxRoot ?? workspaceDir;
+      targetPath = path.resolve(resolveRoot, targetPath);
+    }
+
+    // Enforce sandbox restrictions if sandboxRoot is set
+    if (ref.type === "path" && options?.sandboxRoot) {
+      try {
+        const validated = await assertSandboxPath({
+          filePath: targetPath,
+          cwd: options.sandboxRoot,
+          root: options.sandboxRoot,
+        });
+        targetPath = validated.resolved;
+      } catch (err) {
+        // Log the actual error for debugging (sandbox violation or other path error)
+        log.debug(`Native image: sandbox validation failed for ${ref.resolved}: ${err instanceof Error ? err.message : String(err)}`);
+        return null;
+      }
+    }
+
+    // Check file exists for local paths
+    if (ref.type === "path") {
+      try {
+        await fs.stat(targetPath);
+      } catch {
+        log.debug(`Native image: file not found: ${targetPath}`);
+        return null;
+      }
+    }
+
+    // loadWebMedia handles both file paths and HTTP(S) URLs
+    const media = await loadWebMedia(targetPath, options?.maxBytes);
+
+    if (media.kind !== "image") {
+      log.debug(`Native image: not an image file: ${targetPath} (got ${media.kind})`);
+      return null;
+    }
+
+    // EXIF orientation is already normalized by loadWebMedia -> resizeToJpeg
+    const mimeType = media.contentType ?? "image/png";
+    const data = media.buffer.toString("base64");
+
+    return { type: "image", data, mimeType };
+  } catch (err) {
+    // Log the actual error for debugging (size limits, network failures, etc.)
+    log.debug(`Native image: failed to load ${ref.resolved}: ${err instanceof Error ? err.message : String(err)}`);
+    return null;
+  }
+}
+
+/**
+ * Checks if a model supports image input based on its input capabilities.
+ *
+ * @param model The model object with input capability array
+ * @returns True if the model supports image input
+ */
+export function modelSupportsImages(model: { input?: string[] }): boolean {
+  return model.input?.includes("image") ?? false;
+}
+
+/**
+ * Extracts image references from conversation history messages.
+ * Scans user messages for image paths/URLs that can be loaded.
+ * Each ref includes the messageIndex so images can be injected at their original location.
+ *
+ * Note: Global deduplication is intentional - if the same image appears in multiple
+ * messages, we only inject it at the FIRST occurrence. This is sufficient because:
+ *   1. The model sees all message content including the image
+ *   2. Later references to "the image" or "that picture" will work since it's in context
+ *   3. Injecting duplicates would waste tokens and potentially hit size limits
+ */
+function detectImagesFromHistory(messages: unknown[]): DetectedImageRef[] {
+  const allRefs: DetectedImageRef[] = [];
+  const seen = new Set<string>();
+
+  for (let i = 0; i < messages.length; i++) {
+    const msg = messages[i];
+    if (!msg || typeof msg !== "object") continue;
+    const message = msg as { role?: string };
+    // Only scan user messages for image references
+    if (message.role !== "user") continue;
+
+    const text = extractTextFromMessage(msg);
+    if (!text) continue;
+
+    const refs = detectImageReferences(text);
+    for (const ref of refs) {
+      const key = ref.resolved.toLowerCase();
+      if (seen.has(key)) continue;
+      seen.add(key);
+      allRefs.push({ ...ref, messageIndex: i });
+    }
+  }
+
+  return allRefs;
+}
+
+/**
+ * Detects and loads images referenced in a prompt for models with vision capability.
+ *
+ * This function scans the prompt for image references (file paths and URLs),
+ * loads them, and returns them as ImageContent array ready to be passed to
+ * the model's prompt method.
+ *
+ * Also scans conversation history for images from previous turns and returns
+ * them mapped by message index so they can be injected at their original location.
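+ * When the model lacks vision capability, detection is skipped and any existingImages
+ * are returned unchanged.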
+ *
+ * @param params Configuration for image detection and loading
+ * @returns Object with loaded images for current prompt and history images by message index
+ */
+export async function detectAndLoadPromptImages(params: {
+  prompt: string;
+  workspaceDir: string;
+  model: { input?: string[] };
+  existingImages?: ImageContent[];
+  historyMessages?: unknown[];
+  maxBytes?: number;
+  /** If set, enforce that file paths are within this sandbox root */
+  sandboxRoot?: string;
+}): Promise<{
+  /** Images for the current prompt (existingImages + detected in current prompt) */
+  images: ImageContent[];
+  /** Images from history messages, keyed by message index */
+  historyImagesByIndex: Map<number, ImageContent[]>;
+  detectedRefs: DetectedImageRef[];
+  loadedCount: number;
+  skippedCount: number;
+}> {
+  // If model doesn't support images, return empty results
+  if (!modelSupportsImages(params.model)) {
+    return {
+      images: params.existingImages ?? [],
+      historyImagesByIndex: new Map(),
+      detectedRefs: [],
+      loadedCount: 0,
+      skippedCount: 0,
+    };
+  }
+
+  // Detect images from current prompt
+  const promptRefs = detectImageReferences(params.prompt);
+
+  // Detect images from conversation history (with message indices)
+  const historyRefs = params.historyMessages
+    ? detectImagesFromHistory(params.historyMessages)
+    : [];
+
+  // Deduplicate: if an image is in the current prompt, don't also load it from history.
+  // Current prompt images are passed via the `images` parameter to prompt(), while history
+  // images are injected into their original message positions. We don't want the same
+  // image loaded and sent twice (wasting tokens and potentially causing confusion).
+  const seenPaths = new Set(promptRefs.map((r) => r.resolved.toLowerCase()));
+  const uniqueHistoryRefs = historyRefs.filter(
+    (r) => !seenPaths.has(r.resolved.toLowerCase()),
+  );
+
+  const allRefs = [...promptRefs, ...uniqueHistoryRefs];
+
+  if (allRefs.length === 0) {
+    return {
+      images: params.existingImages ?? [],
+      historyImagesByIndex: new Map(),
+      detectedRefs: [],
+      loadedCount: 0,
+      skippedCount: 0,
+    };
+  }
+
+  log.debug(
+    `Native image: detected ${allRefs.length} image refs (${promptRefs.length} in prompt, ${uniqueHistoryRefs.length} in history)`,
+  );
+
+  // Load images for current prompt
+  const promptImages: ImageContent[] = [...(params.existingImages ?? [])];
+  // Load images for history, grouped by message index
+  const historyImagesByIndex = new Map<number, ImageContent[]>();
+
+  let loadedCount = 0;
+  let skippedCount = 0;
+
+  for (const ref of allRefs) {
+    const image = await loadImageFromRef(ref, params.workspaceDir, {
+      maxBytes: params.maxBytes,
+      sandboxRoot: params.sandboxRoot,
+    });
+    if (image) {
+      if (ref.messageIndex !== undefined) {
+        // History image - add to the appropriate message index
+        const existing = historyImagesByIndex.get(ref.messageIndex);
+        if (existing) {
+          existing.push(image);
+        } else {
+          historyImagesByIndex.set(ref.messageIndex, [image]);
+        }
+      } else {
+        // Current prompt image
+        promptImages.push(image);
+      }
+      loadedCount++;
+      log.debug(`Native image: loaded ${ref.type} ${ref.resolved}`);
+    } else {
+      skippedCount++;
+    }
+  }
+
+  return {
+    images: promptImages,
+    historyImagesByIndex,
+    detectedRefs: allRefs,
+    loadedCount,
+    skippedCount,
+  };
+}
diff --git a/src/agents/pi-tools.ts b/src/agents/pi-tools.ts
index 33c4c27ad..d7d0a0c11 100644
--- a/src/agents/pi-tools.ts
+++ b/src/agents/pi-tools.ts
@@ -127,6 +127,8 @@ export function createClawdbotCodingTools(options?: {
   replyToMode?: "off" | "first" | "all";
   /** Mutable ref to track if a reply was sent (for "first" mode). */
   hasRepliedRef?: { value: boolean };
+  /** If true, the model has native vision capability */
+  modelHasVision?: boolean;
 }): AnyAgentTool[] {
   const execToolName = "exec";
   const sandbox = options?.sandbox?.enabled ? options.sandbox : undefined;
@@ -280,6 +282,7 @@ export function createClawdbotCodingTools(options?: {
       currentThreadTs: options?.currentThreadTs,
       replyToMode: options?.replyToMode,
       hasRepliedRef: options?.hasRepliedRef,
+      modelHasVision: options?.modelHasVision,
     }),
   ];
   const pluginGroups = buildPluginToolGroups({
diff --git a/src/agents/tools/image-tool.test.ts b/src/agents/tools/image-tool.test.ts
index 255e20183..9dcd94004 100644
--- a/src/agents/tools/image-tool.test.ts
+++ b/src/agents/tools/image-tool.test.ts
@@ -102,7 +102,11 @@ describe("image tool implicit imageModel config", () => {
     });
   });
 
-  it("disables image tool when primary model already supports images", async () => {
+  it("keeps image tool available when primary model supports images (for explicit requests)", async () => {
+    // When the primary model supports images, we still keep the tool available
+    // because images are auto-injected into prompts. The tool description is
+    // adjusted via modelHasVision to discourage redundant usage.
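+    // Stub a provider key so the implicit imageModel config can resolve.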
+    vi.stubEnv("OPENAI_API_KEY", "test-key");
     const agentDir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-image-"));
     const cfg: ClawdbotConfig = {
       agents: {
@@ -119,8 +123,13 @@ describe("image tool implicit imageModel config", () => {
         },
       },
     };
-    expect(resolveImageModelConfigForTool({ cfg, agentDir })).toBeNull();
-    expect(createImageTool({ config: cfg, agentDir })).toBeNull();
+    // Tool should still be available for explicit image analysis requests
+    expect(resolveImageModelConfigForTool({ cfg, agentDir })).toEqual({
+      primary: "openai/gpt-5-mini",
+    });
+    const tool = createImageTool({ config: cfg, agentDir, modelHasVision: true });
+    expect(tool).not.toBeNull();
+    expect(tool?.description).toContain("Only use this tool when the image was NOT already provided");
   });
 
   it("sandboxes image paths like the read tool", async () => {
diff --git a/src/agents/tools/image-tool.ts b/src/agents/tools/image-tool.ts
index 52627e547..119423353 100644
--- a/src/agents/tools/image-tool.ts
+++ b/src/agents/tools/image-tool.ts
@@ -145,11 +145,10 @@ export function resolveImageModelConfigForTool(params: {
   cfg?: ClawdbotConfig;
   agentDir: string;
 }): ImageModelConfig | null {
-  const primarySupportsImages = resolvePrimaryModelSupportsImages({
-    cfg: params.cfg,
-    agentDir: params.agentDir,
-  });
-  if (primarySupportsImages === true) return null;
+  // Note: We intentionally do NOT gate based on primarySupportsImages here.
+  // Even when the primary model supports images, we keep the tool available
+  // because images are auto-injected into prompts (see attempt.ts detectAndLoadPromptImages).
+  // The tool description is adjusted via modelHasVision to discourage redundant usage.
   const explicit = coerceImageModelConfig(params.cfg);
   if (explicit.primary?.trim() || (explicit.fallbacks?.length ?? 0) > 0) {
     return explicit;
   }
@@ -368,6 +367,8 @@ export function createImageTool(options?: {
   config?: ClawdbotConfig;
   agentDir?: string;
   sandboxRoot?: string;
+  /** If true, the model has native vision capability and images in the prompt are auto-injected */
+  modelHasVision?: boolean;
 }): AnyAgentTool | null {
   const agentDir = options?.agentDir?.trim();
   if (!agentDir) {
@@ -382,11 +383,17 @@ export function createImageTool(options?: {
     agentDir,
   });
   if (!imageModelConfig) return null;
+
+  // If model has native vision, images in the prompt are auto-injected
+  // so this tool is only needed when image wasn't provided in the prompt
+  const description = options?.modelHasVision
+    ? "Analyze an image with a vision model. Only use this tool when the image was NOT already provided in the user's message. Images mentioned in the prompt are automatically visible to you."
+    : "Analyze an image with the configured image model (agents.defaults.imageModel). Provide a prompt and image path or URL.";
+
   return {
     label: "Image",
     name: "image",
-    description:
-      "Analyze an image with the configured image model (agents.defaults.imageModel). Provide a prompt and image path or URL.",
+    description,
     parameters: Type.Object({
       prompt: Type.Optional(Type.String()),
       image: Type.String(),
diff --git a/src/imessage/monitor/monitor-provider.ts b/src/imessage/monitor/monitor-provider.ts
index 4a388d8cb..e94c53a3b 100644
--- a/src/imessage/monitor/monitor-provider.ts
+++ b/src/imessage/monitor/monitor-provider.ts
@@ -307,9 +307,16 @@ export async function monitorIMessageProvider(opts: MonitorIMessageOpts = {}): P
       const mentionRegexes = buildMentionRegexes(cfg, route.agentId);
       const messageText = (message.text ?? "").trim();
"").trim(); const attachments = includeAttachments ? (message.attachments ?? []) : []; - const firstAttachment = attachments?.find((entry) => entry?.original_path && !entry?.missing); + // Filter to valid attachments with paths + const validAttachments = attachments.filter( + (entry) => entry?.original_path && !entry?.missing, + ); + const firstAttachment = validAttachments[0]; const mediaPath = firstAttachment?.original_path ?? undefined; const mediaType = firstAttachment?.mime_type ?? undefined; + // Build arrays for all attachments (for multi-image support) + const mediaPaths = validAttachments.map((a) => a.original_path).filter(Boolean) as string[]; + const mediaTypes = validAttachments.map((a) => a.mime_type ?? undefined); const kind = mediaKindFromMime(mediaType ?? undefined); const placeholder = kind ? `` : attachments?.length ? "" : ""; const bodyText = messageText || placeholder; @@ -445,6 +452,9 @@ export async function monitorIMessageProvider(opts: MonitorIMessageOpts = {}): P MediaPath: mediaPath, MediaType: mediaType, MediaUrl: mediaPath, + MediaPaths: mediaPaths.length > 0 ? mediaPaths : undefined, + MediaTypes: mediaTypes.length > 0 ? mediaTypes : undefined, + MediaUrls: mediaPaths.length > 0 ? mediaPaths : undefined, MediaRemoteHost: remoteHost, WasMentioned: effectiveWasMentioned, CommandAuthorized: commandAuthorized, diff --git a/src/media/image-ops.ts b/src/media/image-ops.ts index 904b87a6d..e97e14b1c 100644 --- a/src/media/image-ops.ts +++ b/src/media/image-ops.ts @@ -28,6 +28,98 @@ async function loadSharp(): Promise<(buffer: Buffer) => ReturnType> { return (buffer) => sharp(buffer, { failOnError: false }); } +/** + * Reads EXIF orientation from JPEG buffer. + * Returns orientation value 1-8, or null if not found/not JPEG. + * + * EXIF orientation values: + * 1 = Normal, 2 = Flip H, 3 = Rotate 180, 4 = Flip V, + * 5 = Rotate 270 CW + Flip H, 6 = Rotate 90 CW, 7 = Rotate 90 CW + Flip H, 8 = Rotate 270 CW + */ +function readJpegExifOrientation(buffer: Buffer): number | null { + // Check JPEG magic bytes + if (buffer.length < 2 || buffer[0] !== 0xff || buffer[1] !== 0xd8) { + return null; + } + + let offset = 2; + while (offset < buffer.length - 4) { + // Look for marker + if (buffer[offset] !== 0xff) { + offset++; + continue; + } + + const marker = buffer[offset + 1]; + // Skip padding FF bytes + if (marker === 0xff) { + offset++; + continue; + } + + // APP1 marker (EXIF) + if (marker === 0xe1) { + const segmentLength = buffer.readUInt16BE(offset + 2); + const exifStart = offset + 4; + + // Check for "Exif\0\0" header + if ( + buffer.length > exifStart + 6 && + buffer.toString("ascii", exifStart, exifStart + 4) === "Exif" && + buffer[exifStart + 4] === 0 && + buffer[exifStart + 5] === 0 + ) { + const tiffStart = exifStart + 6; + if (buffer.length < tiffStart + 8) return null; + + // Check byte order (II = little-endian, MM = big-endian) + const byteOrder = buffer.toString("ascii", tiffStart, tiffStart + 2); + const isLittleEndian = byteOrder === "II"; + + const readU16 = (pos: number) => + isLittleEndian ? buffer.readUInt16LE(pos) : buffer.readUInt16BE(pos); + const readU32 = (pos: number) => + isLittleEndian ? 
+
+        // Read IFD0 offset
+        const ifd0Offset = readU32(tiffStart + 4);
+        const ifd0Start = tiffStart + ifd0Offset;
+        if (buffer.length < ifd0Start + 2) return null;
+
+        const numEntries = readU16(ifd0Start);
+        for (let i = 0; i < numEntries; i++) {
+          const entryOffset = ifd0Start + 2 + i * 12;
+          if (buffer.length < entryOffset + 12) break;
+
+          const tag = readU16(entryOffset);
+          // Orientation tag = 0x0112
+          if (tag === 0x0112) {
+            const value = readU16(entryOffset + 8);
+            return value >= 1 && value <= 8 ? value : null;
+          }
+        }
+      }
+      return null;
+    }
+
+    // Skip other segments
+    if (marker >= 0xe0 && marker <= 0xef) {
+      const segmentLength = buffer.readUInt16BE(offset + 2);
+      offset += 2 + segmentLength;
+      continue;
+    }
+
+    // SOF, SOS, or other marker - stop searching
+    if (marker === 0xc0 || marker === 0xda) {
+      break;
+    }
+
+    offset++;
+  }
+
+  return null;
+}
+
 async function withTempDir<T>(fn: (dir: string) => Promise<T>): Promise<T> {
   const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-img-"));
   try {
@@ -108,6 +200,81 @@ export async function getImageMetadata(buffer: Buffer): Promise<{ width: number; height: number } | null> {
+async function sipsApplyOrientation(buffer: Buffer, orientation: number): Promise<Buffer> {
+  // Map EXIF orientation to sips operations
+  // sips -r rotates clockwise, -f flips (horizontal/vertical)
+  const ops: string[] = [];
+  switch (orientation) {
+    case 2: // Flip horizontal
+      ops.push("-f", "horizontal");
+      break;
+    case 3: // Rotate 180
+      ops.push("-r", "180");
+      break;
+    case 4: // Flip vertical
+      ops.push("-f", "vertical");
+      break;
+    case 5: // Rotate 270 CW + flip horizontal
+      ops.push("-r", "270", "-f", "horizontal");
+      break;
+    case 6: // Rotate 90 CW
+      ops.push("-r", "90");
+      break;
+    case 7: // Rotate 90 CW + flip horizontal
+      ops.push("-r", "90", "-f", "horizontal");
+      break;
+    case 8: // Rotate 270 CW
+      ops.push("-r", "270");
+      break;
+    default:
+      // Orientation 1 or unknown - no change needed
+      return buffer;
+  }
+
+  return await withTempDir(async (dir) => {
+    const input = path.join(dir, "in.jpg");
+    const output = path.join(dir, "out.jpg");
+    await fs.writeFile(input, buffer);
+    await runExec(
+      "/usr/bin/sips",
+      [...ops, input, "--out", output],
+      { timeoutMs: 20_000, maxBuffer: 1024 * 1024 },
+    );
+    return await fs.readFile(output);
+  });
+}
+
+/**
+ * Normalizes EXIF orientation in an image buffer.
+ * Returns the buffer with correct pixel orientation (rotated if needed).
+ * Falls back to original buffer if normalization fails.
+ */
+export async function normalizeExifOrientation(buffer: Buffer): Promise<Buffer> {
+  if (prefersSips()) {
+    try {
+      const orientation = readJpegExifOrientation(buffer);
+      if (!orientation || orientation === 1) {
+        return buffer; // No rotation needed
+      }
+      return await sipsApplyOrientation(buffer, orientation);
+    } catch {
+      return buffer;
+    }
+  }
+
+  try {
+    const sharp = await loadSharp();
+    // .rotate() with no args auto-rotates based on EXIF orientation
+    return await sharp(buffer).rotate().toBuffer();
+  } catch {
+    // Sharp not available or failed - return original buffer
+    return buffer;
+  }
+}
+
 export async function resizeToJpeg(params: {
   buffer: Buffer;
   maxSide: number;
@@ -115,14 +282,17 @@ export async function resizeToJpeg(params: {
   withoutEnlargement?: boolean;
 }): Promise<Buffer> {
   if (prefersSips()) {
+    // Normalize EXIF orientation BEFORE resizing (sips resize doesn't auto-rotate)
+    const normalized = await normalizeExifOrientationSips(params.buffer);
+
+    // Avoid enlarging by checking dimensions first (sips has no withoutEnlargement flag).
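+    // If the image already fits within maxSide, re-encode at its current size instead of scaling up.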
     if (params.withoutEnlargement !== false) {
-      const meta = await getImageMetadata(params.buffer);
+      const meta = await getImageMetadata(normalized);
       if (meta) {
         const maxDim = Math.max(meta.width, meta.height);
         if (maxDim > 0 && maxDim <= params.maxSide) {
           return await sipsResizeToJpeg({
-            buffer: params.buffer,
+            buffer: normalized,
             maxSide: maxDim,
             quality: params.quality,
           });
@@ -130,14 +300,16 @@
       }
     }
     return await sipsResizeToJpeg({
-      buffer: params.buffer,
+      buffer: normalized,
      maxSide: params.maxSide,
      quality: params.quality,
    });
  }

  const sharp = await loadSharp();
+  // Use .rotate() BEFORE .resize() to auto-rotate based on EXIF orientation
   return await sharp(params.buffer)
+    .rotate() // Auto-rotate based on EXIF before resizing
     .resize({
       width: params.maxSide,
       height: params.maxSide,
@@ -147,3 +319,19 @@
     .jpeg({ quality: params.quality, mozjpeg: true })
     .toBuffer();
 }
+
+/**
+ * Internal sips-only EXIF normalization (no sharp fallback).
+ * Used by resizeToJpeg to normalize before sips resize.
+ */
+async function normalizeExifOrientationSips(buffer: Buffer): Promise<Buffer> {
+  try {
+    const orientation = readJpegExifOrientation(buffer);
+    if (!orientation || orientation === 1) {
+      return buffer;
+    }
+    return await sipsApplyOrientation(buffer, orientation);
+  } catch {
+    return buffer;
+  }
+}