import fs from "node:fs/promises";
import path from "node:path";
import { fileURLToPath } from "node:url";
import type { ImageContent } from "@mariozechner/pi-ai";
import { assertSandboxPath } from "../../sandbox-paths.js";
import { sanitizeImageBlocks } from "../../tool-images.js";
import { extractTextFromMessage } from "../../../tui/tui-formatters.js";
import { loadWebMedia } from "../../../web/media.js";
import { resolveUserPath } from "../../../utils.js";
import { log } from "../logger.js";

/**
 * Common image file extensions for detection.
 */
const IMAGE_EXTENSIONS = new Set([
	".png",
	".jpg",
	".jpeg",
	".gif",
	".webp",
	".bmp",
	".tiff",
	".tif",
	".heic",
	".heif",
]);

/**
 * Result of detecting an image reference in text.
 */
export interface DetectedImageRef {
	/** The raw matched string from the prompt */
	raw: string;
	/** The type of reference (path or url) */
	type: "path" | "url";
	/** The resolved/normalized path or URL */
	resolved: string;
	/** Index of the message this ref was found in (for history images) */
	messageIndex?: number;
}

/**
 * Checks if a file extension indicates an image file.
 */
function isImageExtension(filePath: string): boolean {
	const ext = path.extname(filePath).toLowerCase();
	return IMAGE_EXTENSIONS.has(ext);
}

/**
 * Runs sanitizeImageBlocks over a batch of images and logs a warning when any are dropped.
 */
async function sanitizeImagesWithLog(
	images: ImageContent[],
	label: string,
): Promise<ImageContent[]> {
	const { images: sanitized, dropped } = await sanitizeImageBlocks(images, label);
	if (dropped > 0) {
		log.warn(`Native image: dropped ${dropped} image(s) after sanitization (${label}).`);
	}
	return sanitized;
}

/**
 * Detects image references in a user prompt.
 *
 * Patterns detected:
 * - Absolute paths: /path/to/image.png
 * - Relative paths: ./image.png, ../images/photo.jpg
 * - Home paths: ~/Pictures/screenshot.png
 * - file:// URLs: file:///path/to/image.png
 * - Message attachments: [Image: source: /path/to/image.jpg]
 *
 * @param prompt The user prompt text to scan
 * @returns Array of detected image references
 */
export function detectImageReferences(prompt: string): DetectedImageRef[] {
	const refs: DetectedImageRef[] = [];
	const seen = new Set<string>();

	// Helper to add a path ref
	const addPathRef = (raw: string) => {
		const trimmed = raw.trim();
		if (!trimmed || seen.has(trimmed.toLowerCase())) return;
		if (trimmed.startsWith("http://") || trimmed.startsWith("https://")) return;
		if (!isImageExtension(trimmed)) return;
		seen.add(trimmed.toLowerCase());
		const resolved = trimmed.startsWith("~") ? resolveUserPath(trimmed) : trimmed;
		refs.push({ raw: trimmed, type: "path", resolved });
	};

	// Pattern for [media attached: path (type) | url] or [media attached N/M: path (type) | url] format
	// Each bracket = ONE file. The | separates path from URL, not multiple files.
	// Multi-file format uses separate brackets on separate lines.
	const mediaAttachedPattern = /\[media attached(?:\s+\d+\/\d+)?:\s*([^\]]+)\]/gi;
	let match: RegExpExecArray | null;
	while ((match = mediaAttachedPattern.exec(prompt)) !== null) {
		const content = match[1];
		// Skip "[media attached: N files]" header lines
		if (/^\d+\s+files?$/i.test(content.trim())) {
			continue;
		}
		// Extract path before the (mime/type) or | delimiter
		// Format is: path (type) | url OR just: path (type)
		// Path may contain spaces (e.g., "ChatGPT Image Apr 21.png")
		// Use non-greedy .+? to stop at first image extension
		const pathMatch = content.match(
			/^\s*(.+?\.(?:png|jpe?g|gif|webp|bmp|tiff?|heic|heif))\s*(?:\(|$|\|)/i,
		);
		if (pathMatch?.[1]) {
			addPathRef(pathMatch[1].trim());
		}
	}
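
	// Illustrative example (hypothetical prompt content): a line such as
	//   "[media attached: ChatGPT Image Apr 21.png (image/png)]"
	// yields one ref { raw: "ChatGPT Image Apr 21.png", type: "path" }, while a header line
	// such as "[media attached: 2 files]" is skipped by the N-files check above.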

	// Pattern for [Image: source: /path/...] format from messaging systems
	const messageImagePattern =
		/\[Image:\s*source:\s*([^\]]+\.(?:png|jpe?g|gif|webp|bmp|tiff?|heic|heif))\]/gi;
	while ((match = messageImagePattern.exec(prompt)) !== null) {
		const raw = match[1]?.trim();
		if (raw) addPathRef(raw);
	}

	// Remote HTTP(S) URLs are intentionally ignored. Native image injection is local-only.

	// Pattern for file:// URLs - treat as paths since loadWebMedia handles them
	const fileUrlPattern = /file:\/\/[^\s<>"'`\]]+\.(?:png|jpe?g|gif|webp|bmp|tiff?|heic|heif)/gi;
	while ((match = fileUrlPattern.exec(prompt)) !== null) {
		const raw = match[0];
		if (seen.has(raw.toLowerCase())) continue;
		seen.add(raw.toLowerCase());
		// Use fileURLToPath for proper handling (e.g., file://localhost/path)
		try {
			const resolved = fileURLToPath(raw);
			refs.push({ raw, type: "path", resolved });
		} catch {
			// Skip malformed file:// URLs
		}
	}

	// Pattern for file paths (absolute, relative, or home)
	// Matches:
	// - /absolute/path/to/file.ext (including paths with special chars like Messages/Attachments)
	// - ./relative/path.ext
	// - ../parent/path.ext
	// - ~/home/path.ext
	const pathPattern =
		/(?:^|\s|["'`(])((\.\.?\/|[~/])[^\s"'`()[\]]*\.(?:png|jpe?g|gif|webp|bmp|tiff?|heic|heif))/gi;
	while ((match = pathPattern.exec(prompt)) !== null) {
		// Use capture group 1 (the path without delimiter prefix); skip if undefined
		if (match[1]) addPathRef(match[1]);
	}

	return refs;
}

/**
 * Loads an image from a file path or URL and returns it as ImageContent.
 *
 * @param ref The detected image reference
 * @param workspaceDir The current workspace directory for resolving relative paths
 * @param options Optional settings for sandbox and size limits
 * @returns The loaded image content, or null if loading failed
 */
export async function loadImageFromRef(
	ref: DetectedImageRef,
	workspaceDir: string,
	options?: {
		maxBytes?: number;
		/** If set, enforce that file paths are within this sandbox root */
		sandboxRoot?: string;
	},
): Promise<ImageContent | null> {
	try {
		let targetPath = ref.resolved;

		// Remote URL loading is disabled (local-only).
		if (ref.type === "url") {
			log.debug(`Native image: rejecting remote URL (local-only): ${ref.resolved}`);
			return null;
		}

		// For file paths, resolve relative to the appropriate root:
		// - When sandbox is enabled, resolve relative to sandboxRoot for security
		// - Otherwise, resolve relative to workspaceDir
		// Note: ref.resolved may already be absolute (e.g., after ~ expansion in detectImageReferences),
		// in which case we skip relative resolution.
		if (ref.type === "path" && !path.isAbsolute(targetPath)) {
			const resolveRoot = options?.sandboxRoot ?? workspaceDir;
			targetPath = path.resolve(resolveRoot, targetPath);
		}
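
		// Worked example (hypothetical values): with sandboxRoot "/srv/sandbox" and a relative
		// ref "./shots/a.png", targetPath becomes "/srv/sandbox/shots/a.png" here; an already
		// absolute ref such as "/tmp/a.png" skips this step but must still pass the sandbox
		// check below when a sandbox root is configured.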

		// Enforce sandbox restrictions if sandboxRoot is set
		if (ref.type === "path" && options?.sandboxRoot) {
			try {
				const validated = await assertSandboxPath({
					filePath: targetPath,
					cwd: options.sandboxRoot,
					root: options.sandboxRoot,
				});
				targetPath = validated.resolved;
			} catch (err) {
				// Log the actual error for debugging (sandbox violation or other path error)
				log.debug(
					`Native image: sandbox validation failed for ${ref.resolved}: ${err instanceof Error ? err.message : String(err)}`,
				);
				return null;
			}
		}

		// Check file exists for local paths
		if (ref.type === "path") {
			try {
				await fs.stat(targetPath);
			} catch {
				log.debug(`Native image: file not found: ${targetPath}`);
				return null;
			}
		}

		// loadWebMedia handles local file paths (including file:// URLs)
		const media = await loadWebMedia(targetPath, options?.maxBytes);
		if (media.kind !== "image") {
			log.debug(`Native image: not an image file: ${targetPath} (got ${media.kind})`);
			return null;
		}

		// EXIF orientation is already normalized by loadWebMedia -> resizeToJpeg
		// Default to JPEG since optimization converts images to JPEG format
		const mimeType = media.contentType ?? "image/jpeg";
		const data = media.buffer.toString("base64");

		return { type: "image", data, mimeType };
	} catch (err) {
		// Log the actual error for debugging (size limits, network failures, etc.)
		log.debug(
			`Native image: failed to load ${ref.resolved}: ${err instanceof Error ? err.message : String(err)}`,
		);
		return null;
	}
}

/**
 * Checks if a model supports image input based on its input capabilities.
 *
 * @param model The model object with input capability array
 * @returns True if the model supports image input
 */
export function modelSupportsImages(model: { input?: string[] }): boolean {
	return model.input?.includes("image") ?? false;
}

/**
 * Extracts image references from conversation history messages.
 * Scans user messages for image paths/URLs that can be loaded.
 * Each ref includes the messageIndex so images can be injected at their original location.
 *
 * Note: Global deduplication is intentional - if the same image appears in multiple
 * messages, we only inject it at the FIRST occurrence. This is sufficient because:
 * 1. The model sees all message content including the image
 * 2. Later references to "the image" or "that picture" will work since it's in context
 * 3. Injecting duplicates would waste tokens and potentially hit size limits
 */
function detectImagesFromHistory(messages: unknown[]): DetectedImageRef[] {
	const allRefs: DetectedImageRef[] = [];
	const seen = new Set<string>();

	const messageHasImageContent = (msg: unknown): boolean => {
		if (!msg || typeof msg !== "object") return false;
		const content = (msg as { content?: unknown }).content;
		if (!Array.isArray(content)) return false;
		return content.some(
			(part) => part != null && typeof part === "object" && (part as { type?: string }).type === "image",
		);
	};

	for (let i = 0; i < messages.length; i++) {
		const msg = messages[i];
		if (!msg || typeof msg !== "object") continue;
		const message = msg as { role?: string };
		// Only scan user messages for image references
		if (message.role !== "user") continue;
		// Skip if message already has image content (prevents reloading each turn)
		if (messageHasImageContent(msg)) continue;

		const text = extractTextFromMessage(msg);
		if (!text) continue;

		const refs = detectImageReferences(text);
		for (const ref of refs) {
			const key = ref.resolved.toLowerCase();
			if (seen.has(key)) continue;
			seen.add(key);
			allRefs.push({ ...ref, messageIndex: i });
		}
	}

	return allRefs;
}
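
// Illustrative sketch (hypothetical message shape, not a real transcript): given a history like
//   [{ role: "user", content: "see ~/Pictures/screenshot.png" }, { role: "assistant", ... }]
// and assuming extractTextFromMessage returns that text, detectImagesFromHistory yields one
// ref with messageIndex 0 whose "~" has been expanded via resolveUserPath, so the loaded
// image can later be injected back into message 0.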

/**
 * Detects and loads images referenced in a prompt for models with vision capability.
 *
 * This function scans the prompt for image references (file paths and URLs),
 * loads them, and returns them as an ImageContent array ready to be passed to
 * the model's prompt method.
 *
 * Also scans conversation history for images from previous turns and returns
 * them mapped by message index so they can be injected at their original location.
 *
 * @param params Configuration for image detection and loading
 * @returns Object with loaded images for current prompt and history images by message index
 */
export async function detectAndLoadPromptImages(params: {
	prompt: string;
	workspaceDir: string;
	model: { input?: string[] };
	existingImages?: ImageContent[];
	historyMessages?: unknown[];
	maxBytes?: number;
	/** If set, enforce that file paths are within this sandbox root */
	sandboxRoot?: string;
}): Promise<{
	/** Images for the current prompt (existingImages + detected in current prompt) */
	images: ImageContent[];
	/** Images from history messages, keyed by message index */
	historyImagesByIndex: Map<number, ImageContent[]>;
	detectedRefs: DetectedImageRef[];
	loadedCount: number;
	skippedCount: number;
}> {
	// If model doesn't support images, return empty results
	if (!modelSupportsImages(params.model)) {
		return {
			images: [],
			historyImagesByIndex: new Map(),
			detectedRefs: [],
			loadedCount: 0,
			skippedCount: 0,
		};
	}

	// Detect images from current prompt
	const promptRefs = detectImageReferences(params.prompt);

	// Detect images from conversation history (with message indices)
	const historyRefs = params.historyMessages ? detectImagesFromHistory(params.historyMessages) : [];

	// Deduplicate: if an image is in the current prompt, don't also load it from history.
	// Current prompt images are passed via the `images` parameter to prompt(), while history
	// images are injected into their original message positions. We don't want the same
	// image loaded and sent twice (wasting tokens and potentially causing confusion).
	const seenPaths = new Set(promptRefs.map((r) => r.resolved.toLowerCase()));
	const uniqueHistoryRefs = historyRefs.filter((r) => !seenPaths.has(r.resolved.toLowerCase()));

	const allRefs = [...promptRefs, ...uniqueHistoryRefs];

	if (allRefs.length === 0) {
		return {
			images: params.existingImages ?? [],
			historyImagesByIndex: new Map(),
			detectedRefs: [],
			loadedCount: 0,
			skippedCount: 0,
		};
	}

	log.debug(
		`Native image: detected ${allRefs.length} image refs (${promptRefs.length} in prompt, ${uniqueHistoryRefs.length} in history)`,
	);

	// Load images for current prompt
	const promptImages: ImageContent[] = [...(params.existingImages ?? [])];
	// Load images for history, grouped by message index
	const historyImagesByIndex = new Map<number, ImageContent[]>();
	let loadedCount = 0;
	let skippedCount = 0;

	for (const ref of allRefs) {
		const image = await loadImageFromRef(ref, params.workspaceDir, {
			maxBytes: params.maxBytes,
			sandboxRoot: params.sandboxRoot,
		});
		if (image) {
			if (ref.messageIndex !== undefined) {
				// History image - add to the appropriate message index
				const existing = historyImagesByIndex.get(ref.messageIndex);
				if (existing) {
					existing.push(image);
				} else {
					historyImagesByIndex.set(ref.messageIndex, [image]);
				}
			} else {
				// Current prompt image
				promptImages.push(image);
			}
			loadedCount++;
			log.debug(`Native image: loaded ${ref.type} ${ref.resolved}`);
		} else {
			skippedCount++;
		}
	}

	const sanitizedPromptImages = await sanitizeImagesWithLog(promptImages, "prompt:images");
	const sanitizedHistoryImagesByIndex = new Map<number, ImageContent[]>();
	for (const [index, images] of historyImagesByIndex) {
		const sanitized = await sanitizeImagesWithLog(images, `history:images:${index}`);
		if (sanitized.length > 0) {
			sanitizedHistoryImagesByIndex.set(index, sanitized);
		}
	}

	return {
		images: sanitizedPromptImages,
		historyImagesByIndex: sanitizedHistoryImagesByIndex,
		detectedRefs: allRefs,
		loadedCount,
		skippedCount,
	};
}
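
// Minimal usage sketch (hypothetical call site; `agentModel` and `history` are assumed to exist
// in the caller, and the exact wiring may differ):
//
//   const { images, historyImagesByIndex, skippedCount } = await detectAndLoadPromptImages({
//     prompt: "Compare ./before.png with ./after.png",
//     workspaceDir: process.cwd(),
//     model: agentModel, // must list "image" in its `input` capabilities, else nothing is loaded
//     historyMessages: history,
//   });
//
// `images` is passed alongside the current prompt, while `historyImagesByIndex` maps message
// indices to images that should be re-injected at their original positions.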