feat: native image injection for vision-capable models

- Auto-detect and load images referenced in user prompts - Inject history images at their original message positions - Fix EXIF orientation - rotate before resizing in resizeToJpeg - Sandbox security: validate paths, block remote URLs when sandbox enabled - Prevent duplicate history image injection across turns - Handle string-based user message content (convert to array) - Add bounds check for message index in history processing - Fix regex to properly match relative paths (./ ../) - Add multi-image support for iMessage attachments - Pass MAX_IMAGE_BYTES limit to image loading Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-17 03:10:10 -08:00
parent f7123ec30a
commit 8d74578ceb
9 changed files with 892 additions and 16 deletions
--- a/src/agents/pi-embedded-runner/run/images.ts
+++ b/src/agents/pi-embedded-runner/run/images.ts
@@ -0,0 +1,379 @@
+import fs from "node:fs/promises";
+import path from "node:path";
+
+import type { ImageContent } from "@mariozechner/pi-ai";
+
+import { assertSandboxPath } from "../../sandbox-paths.js";
+import { extractTextFromMessage } from "../../../tui/tui-formatters.js";
+import { loadWebMedia } from "../../../web/media.js";
+import { resolveUserPath } from "../../../utils.js";
+import { log } from "../logger.js";
+
+/**
+ * Common image file extensions for detection.
+ */
+const IMAGE_EXTENSIONS = new Set([
+  ".png",
+  ".jpg",
+  ".jpeg",
+  ".gif",
+  ".webp",
+  ".bmp",
+  ".tiff",
+  ".tif",
+  ".heic",
+  ".heif",
+]);
+
+/**
+ * Result of detecting an image reference in text.
+ */
+export interface DetectedImageRef {
+  /** The raw matched string from the prompt */
+  raw: string;
+  /** The type of reference (path or url) */
+  type: "path" | "url";
+  /** The resolved/normalized path or URL */
+  resolved: string;
+  /** Index of the message this ref was found in (for history images) */
+  messageIndex?: number;
+}
+
+/**
+ * Checks if a file extension indicates an image file.
+ */
+function isImageExtension(filePath: string): boolean {
+  const ext = path.extname(filePath).toLowerCase();
+  return IMAGE_EXTENSIONS.has(ext);
+}
+
+/**
+ * Detects image references in a user prompt.
+ *
+ * Patterns detected:
+ * - Absolute paths: /path/to/image.png
+ * - Relative paths: ./image.png, ../images/photo.jpg
+ * - Home paths: ~/Pictures/screenshot.png
+ * - HTTP(S) URLs: https://example.com/image.png
+ * - Message attachments: [Image: source: /path/to/image.jpg]
+ *
+ * @param prompt The user prompt text to scan
+ * @returns Array of detected image references
+ */
+export function detectImageReferences(prompt: string): DetectedImageRef[] {
+  const refs: DetectedImageRef[] = [];
+  const seen = new Set<string>();
+
+  // Helper to add a path ref
+  const addPathRef = (raw: string) => {
+    const trimmed = raw.trim();
+    if (!trimmed || seen.has(trimmed.toLowerCase())) return;
+    if (!isImageExtension(trimmed)) return;
+    seen.add(trimmed.toLowerCase());
+    const resolved = trimmed.startsWith("~") ? resolveUserPath(trimmed) : trimmed;
+    refs.push({ raw: trimmed, type: "path", resolved });
+  };
+
+  // Pattern for [media attached: path (type) | url] or [media attached N/M: path (type) | url] format
+  // Each bracket = ONE file. The | separates path from URL, not multiple files.
+  // Multi-file format uses separate brackets on separate lines.
+  const mediaAttachedPattern = /\[media attached(?:\s+\d+\/\d+)?:\s*([^\]]+)\]/gi;
+  let match: RegExpExecArray | null;
+  while ((match = mediaAttachedPattern.exec(prompt)) !== null) {
+    const content = match[1];
+
+    // Skip "[media attached: N files]" header lines
+    if (/^\d+\s+files?$/i.test(content.trim())) {
+      continue;
+    }
+
+    // Extract path before the (mime/type) or | delimiter
+    // Format is: path (type) | url  OR  just: path (type)
+    // Path may contain spaces (e.g., "ChatGPT Image Apr 21.png")
+    // Use non-greedy .+? to stop at first image extension
+    const pathMatch = content.match(
+      /^\s*(.+?\.(?:png|jpe?g|gif|webp|bmp|tiff?|heic|heif))\s*(?:\(|$|\|)/i,
+    );
+    if (pathMatch?.[1]) {
+      addPathRef(pathMatch[1].trim());
+    }
+  }
+
+  // Pattern for [Image: source: /path/...] format from messaging systems
+  const messageImagePattern = /\[Image:\s*source:\s*([^\]]+\.(?:png|jpe?g|gif|webp|bmp|tiff?|heic|heif))\]/gi;
+  while ((match = messageImagePattern.exec(prompt)) !== null) {
+    const raw = match[1]?.trim();
+    if (raw) addPathRef(raw);
+  }
+
+  // Pattern for HTTP(S) URLs ending in image extensions
+  // Skip example.com URLs as they're often just documentation examples
+  const urlPattern =
+    /https?:\/\/[^\s<>"'`\]]+\.(?:png|jpe?g|gif|webp|bmp|tiff?|heic|heif)(?:\?[^\s<>"'`\]]*)?/gi;
+  while ((match = urlPattern.exec(prompt)) !== null) {
+    const raw = match[0];
+    // Skip example.com URLs - they're documentation, not real images
+    if (raw.includes("example.com")) continue;
+    if (seen.has(raw.toLowerCase())) continue;
+    seen.add(raw.toLowerCase());
+    refs.push({ raw, type: "url", resolved: raw });
+  }
+
+  // Pattern for file paths (absolute, relative, or home)
+  // Matches:
+  // - /absolute/path/to/file.ext (including paths with special chars like Messages/Attachments)
+  // - ./relative/path.ext
+  // - ../parent/path.ext
+  // - ~/home/path.ext
+  const pathPattern = /(?:^|\s|["'`(])((\.\.?\/|[~/])[^\s"'`()[\]]*\.(?:png|jpe?g|gif|webp|bmp|tiff?|heic|heif))/gi;
+  while ((match = pathPattern.exec(prompt)) !== null) {
+    const raw = match[1] || match[0];
+    addPathRef(raw);
+  }
+
+  return refs;
+}
+
+/**
+ * Loads an image from a file path or URL and returns it as ImageContent.
+ *
+ * @param ref The detected image reference
+ * @param workspaceDir The current workspace directory for resolving relative paths
+ * @param options Optional settings for sandbox and size limits
+ * @returns The loaded image content, or null if loading failed
+ */
+export async function loadImageFromRef(
+  ref: DetectedImageRef,
+  workspaceDir: string,
+  options?: {
+    maxBytes?: number;
+    /** If set, enforce that file paths are within this sandbox root */
+    sandboxRoot?: string;
+  },
+): Promise<ImageContent | null> {
+  try {
+    let targetPath = ref.resolved;
+
+    // When sandbox is enabled, block remote URL loading to maintain network boundary
+    if (ref.type === "url" && options?.sandboxRoot) {
+      log.debug(`Native image: rejecting remote URL in sandboxed session: ${ref.resolved}`);
+      return null;
+    }
+
+    // For file paths, resolve relative to the appropriate root:
+    // - When sandbox is enabled, resolve relative to sandboxRoot for security
+    // - Otherwise, resolve relative to workspaceDir
+    if (ref.type === "path" && !path.isAbsolute(targetPath)) {
+      const resolveRoot = options?.sandboxRoot ?? workspaceDir;
+      targetPath = path.resolve(resolveRoot, targetPath);
+    }
+
+    // Enforce sandbox restrictions if sandboxRoot is set
+    if (ref.type === "path" && options?.sandboxRoot) {
+      try {
+        const validated = await assertSandboxPath({
+          filePath: targetPath,
+          cwd: options.sandboxRoot,
+          root: options.sandboxRoot,
+        });
+        targetPath = validated.resolved;
+      } catch (err) {
+        // Log the actual error for debugging (sandbox violation or other path error)
+        log.debug(`Native image: sandbox validation failed for ${ref.resolved}: ${err instanceof Error ? err.message : String(err)}`);
+        return null;
+      }
+    }
+
+    // Check file exists for local paths
+    if (ref.type === "path") {
+      try {
+        await fs.stat(targetPath);
+      } catch {
+        log.debug(`Native image: file not found: ${targetPath}`);
+        return null;
+      }
+    }
+
+    // loadWebMedia handles both file paths and HTTP(S) URLs
+    const media = await loadWebMedia(targetPath, options?.maxBytes);
+
+    if (media.kind !== "image") {
+      log.debug(`Native image: not an image file: ${targetPath} (got ${media.kind})`);
+      return null;
+    }
+
+    // EXIF orientation is already normalized by loadWebMedia -> resizeToJpeg
+    const mimeType = media.contentType ?? "image/png";
+    const data = media.buffer.toString("base64");
+
+    return { type: "image", data, mimeType };
+  } catch (err) {
+    // Log the actual error for debugging (size limits, network failures, etc.)
+    log.debug(`Native image: failed to load ${ref.resolved}: ${err instanceof Error ? err.message : String(err)}`);
+    return null;
+  }
+}
+
+/**
+ * Checks if a model supports image input based on its input capabilities.
+ *
+ * @param model The model object with input capability array
+ * @returns True if the model supports image input
+ */
+export function modelSupportsImages(model: { input?: string[] }): boolean {
+  return model.input?.includes("image") ?? false;
+}
+
+/**
+ * Extracts image references from conversation history messages.
+ * Scans user messages for image paths/URLs that can be loaded.
+ * Each ref includes the messageIndex so images can be injected at their original location.
+ *
+ * Note: Global deduplication is intentional - if the same image appears in multiple
+ * messages, we only inject it at the FIRST occurrence. This is sufficient because:
+ * 1. The model sees all message content including the image
+ * 2. Later references to "the image" or "that picture" will work since it's in context
+ * 3. Injecting duplicates would waste tokens and potentially hit size limits
+ */
+function detectImagesFromHistory(messages: unknown[]): DetectedImageRef[] {
+  const allRefs: DetectedImageRef[] = [];
+  const seen = new Set<string>();
+
+  for (let i = 0; i < messages.length; i++) {
+    const msg = messages[i];
+    if (!msg || typeof msg !== "object") continue;
+    const message = msg as { role?: string };
+    // Only scan user messages for image references
+    if (message.role !== "user") continue;
+
+    const text = extractTextFromMessage(msg);
+    if (!text) continue;
+
+    const refs = detectImageReferences(text);
+    for (const ref of refs) {
+      const key = ref.resolved.toLowerCase();
+      if (seen.has(key)) continue;
+      seen.add(key);
+      allRefs.push({ ...ref, messageIndex: i });
+    }
+  }
+
+  return allRefs;
+}
+
+/**
+ * Detects and loads images referenced in a prompt for models with vision capability.
+ *
+ * This function scans the prompt for image references (file paths and URLs),
+ * loads them, and returns them as ImageContent array ready to be passed to
+ * the model's prompt method.
+ *
+ * Also scans conversation history for images from previous turns and returns
+ * them mapped by message index so they can be injected at their original location.
+ *
+ * @param params Configuration for image detection and loading
+ * @returns Object with loaded images for current prompt and history images by message index
+ */
+export async function detectAndLoadPromptImages(params: {
+  prompt: string;
+  workspaceDir: string;
+  model: { input?: string[] };
+  existingImages?: ImageContent[];
+  historyMessages?: unknown[];
+  maxBytes?: number;
+  /** If set, enforce that file paths are within this sandbox root */
+  sandboxRoot?: string;
+}): Promise<{
+  /** Images for the current prompt (existingImages + detected in current prompt) */
+  images: ImageContent[];
+  /** Images from history messages, keyed by message index */
+  historyImagesByIndex: Map<number, ImageContent[]>;
+  detectedRefs: DetectedImageRef[];
+  loadedCount: number;
+  skippedCount: number;
+}> {
+  // If model doesn't support images, return empty results
+  if (!modelSupportsImages(params.model)) {
+    return {
+      images: params.existingImages ?? [],
+      historyImagesByIndex: new Map(),
+      detectedRefs: [],
+      loadedCount: 0,
+      skippedCount: 0,
+    };
+  }
+
+  // Detect images from current prompt
+  const promptRefs = detectImageReferences(params.prompt);
+
+  // Detect images from conversation history (with message indices)
+  const historyRefs = params.historyMessages
+    ? detectImagesFromHistory(params.historyMessages)
+    : [];
+
+  // Deduplicate: if an image is in the current prompt, don't also load it from history.
+  // Current prompt images are passed via the `images` parameter to prompt(), while history
+  // images are injected into their original message positions. We don't want the same
+  // image loaded and sent twice (wasting tokens and potentially causing confusion).
+  const seenPaths = new Set(promptRefs.map((r) => r.resolved.toLowerCase()));
+  const uniqueHistoryRefs = historyRefs.filter(
+    (r) => !seenPaths.has(r.resolved.toLowerCase()),
+  );
+
+  const allRefs = [...promptRefs, ...uniqueHistoryRefs];
+
+  if (allRefs.length === 0) {
+    return {
+      images: params.existingImages ?? [],
+      historyImagesByIndex: new Map(),
+      detectedRefs: [],
+      loadedCount: 0,
+      skippedCount: 0,
+    };
+  }
+
+  log.debug(
+    `Native image: detected ${allRefs.length} image refs (${promptRefs.length} in prompt, ${uniqueHistoryRefs.length} in history)`,
+  );
+
+  // Load images for current prompt
+  const promptImages: ImageContent[] = [...(params.existingImages ?? [])];
+  // Load images for history, grouped by message index
+  const historyImagesByIndex = new Map<number, ImageContent[]>();
+
+  let loadedCount = 0;
+  let skippedCount = 0;
+
+  for (const ref of allRefs) {
+    const image = await loadImageFromRef(ref, params.workspaceDir, {
+      maxBytes: params.maxBytes,
+      sandboxRoot: params.sandboxRoot,
+    });
+    if (image) {
+      if (ref.messageIndex !== undefined) {
+        // History image - add to the appropriate message index
+        const existing = historyImagesByIndex.get(ref.messageIndex);
+        if (existing) {
+          existing.push(image);
+        } else {
+          historyImagesByIndex.set(ref.messageIndex, [image]);
+        }
+      } else {
+        // Current prompt image
+        promptImages.push(image);
+      }
+      loadedCount++;
+      log.debug(`Native image: loaded ${ref.type} ${ref.resolved}`);
+    } else {
+      skippedCount++;
+    }
+  }
+
+  return {
+    images: promptImages,
+    historyImagesByIndex,
+    detectedRefs: allRefs,
+    loadedCount,
+    skippedCount,
+  };
+}