feat: native image injection for vision-capable models
- Auto-detect and load images referenced in user prompts - Inject history images at their original message positions - Fix EXIF orientation - rotate before resizing in resizeToJpeg - Sandbox security: validate paths, block remote URLs when sandbox enabled - Prevent duplicate history image injection across turns - Handle string-based user message content (convert to array) - Add bounds check for message index in history processing - Fix regex to properly match relative paths (./ ../) - Add multi-image support for iMessage attachments - Pass MAX_IMAGE_BYTES limit to image loading Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
committed by
Peter Steinberger
parent
f7123ec30a
commit
8d74578ceb
379
src/agents/pi-embedded-runner/run/images.ts
Normal file
379
src/agents/pi-embedded-runner/run/images.ts
Normal file
@@ -0,0 +1,379 @@
|
||||
import fs from "node:fs/promises";
|
||||
import path from "node:path";
|
||||
|
||||
import type { ImageContent } from "@mariozechner/pi-ai";
|
||||
|
||||
import { assertSandboxPath } from "../../sandbox-paths.js";
|
||||
import { extractTextFromMessage } from "../../../tui/tui-formatters.js";
|
||||
import { loadWebMedia } from "../../../web/media.js";
|
||||
import { resolveUserPath } from "../../../utils.js";
|
||||
import { log } from "../logger.js";
|
||||
|
||||
/**
|
||||
* Common image file extensions for detection.
|
||||
*/
|
||||
const IMAGE_EXTENSIONS = new Set([
|
||||
".png",
|
||||
".jpg",
|
||||
".jpeg",
|
||||
".gif",
|
||||
".webp",
|
||||
".bmp",
|
||||
".tiff",
|
||||
".tif",
|
||||
".heic",
|
||||
".heif",
|
||||
]);
|
||||
|
||||
/**
|
||||
* Result of detecting an image reference in text.
|
||||
*/
|
||||
export interface DetectedImageRef {
|
||||
/** The raw matched string from the prompt */
|
||||
raw: string;
|
||||
/** The type of reference (path or url) */
|
||||
type: "path" | "url";
|
||||
/** The resolved/normalized path or URL */
|
||||
resolved: string;
|
||||
/** Index of the message this ref was found in (for history images) */
|
||||
messageIndex?: number;
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks if a file extension indicates an image file.
|
||||
*/
|
||||
function isImageExtension(filePath: string): boolean {
|
||||
const ext = path.extname(filePath).toLowerCase();
|
||||
return IMAGE_EXTENSIONS.has(ext);
|
||||
}
|
||||
|
||||
/**
|
||||
* Detects image references in a user prompt.
|
||||
*
|
||||
* Patterns detected:
|
||||
* - Absolute paths: /path/to/image.png
|
||||
* - Relative paths: ./image.png, ../images/photo.jpg
|
||||
* - Home paths: ~/Pictures/screenshot.png
|
||||
* - HTTP(S) URLs: https://example.com/image.png
|
||||
* - Message attachments: [Image: source: /path/to/image.jpg]
|
||||
*
|
||||
* @param prompt The user prompt text to scan
|
||||
* @returns Array of detected image references
|
||||
*/
|
||||
export function detectImageReferences(prompt: string): DetectedImageRef[] {
|
||||
const refs: DetectedImageRef[] = [];
|
||||
const seen = new Set<string>();
|
||||
|
||||
// Helper to add a path ref
|
||||
const addPathRef = (raw: string) => {
|
||||
const trimmed = raw.trim();
|
||||
if (!trimmed || seen.has(trimmed.toLowerCase())) return;
|
||||
if (!isImageExtension(trimmed)) return;
|
||||
seen.add(trimmed.toLowerCase());
|
||||
const resolved = trimmed.startsWith("~") ? resolveUserPath(trimmed) : trimmed;
|
||||
refs.push({ raw: trimmed, type: "path", resolved });
|
||||
};
|
||||
|
||||
// Pattern for [media attached: path (type) | url] or [media attached N/M: path (type) | url] format
|
||||
// Each bracket = ONE file. The | separates path from URL, not multiple files.
|
||||
// Multi-file format uses separate brackets on separate lines.
|
||||
const mediaAttachedPattern = /\[media attached(?:\s+\d+\/\d+)?:\s*([^\]]+)\]/gi;
|
||||
let match: RegExpExecArray | null;
|
||||
while ((match = mediaAttachedPattern.exec(prompt)) !== null) {
|
||||
const content = match[1];
|
||||
|
||||
// Skip "[media attached: N files]" header lines
|
||||
if (/^\d+\s+files?$/i.test(content.trim())) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Extract path before the (mime/type) or | delimiter
|
||||
// Format is: path (type) | url OR just: path (type)
|
||||
// Path may contain spaces (e.g., "ChatGPT Image Apr 21.png")
|
||||
// Use non-greedy .+? to stop at first image extension
|
||||
const pathMatch = content.match(
|
||||
/^\s*(.+?\.(?:png|jpe?g|gif|webp|bmp|tiff?|heic|heif))\s*(?:\(|$|\|)/i,
|
||||
);
|
||||
if (pathMatch?.[1]) {
|
||||
addPathRef(pathMatch[1].trim());
|
||||
}
|
||||
}
|
||||
|
||||
// Pattern for [Image: source: /path/...] format from messaging systems
|
||||
const messageImagePattern = /\[Image:\s*source:\s*([^\]]+\.(?:png|jpe?g|gif|webp|bmp|tiff?|heic|heif))\]/gi;
|
||||
while ((match = messageImagePattern.exec(prompt)) !== null) {
|
||||
const raw = match[1]?.trim();
|
||||
if (raw) addPathRef(raw);
|
||||
}
|
||||
|
||||
// Pattern for HTTP(S) URLs ending in image extensions
|
||||
// Skip example.com URLs as they're often just documentation examples
|
||||
const urlPattern =
|
||||
/https?:\/\/[^\s<>"'`\]]+\.(?:png|jpe?g|gif|webp|bmp|tiff?|heic|heif)(?:\?[^\s<>"'`\]]*)?/gi;
|
||||
while ((match = urlPattern.exec(prompt)) !== null) {
|
||||
const raw = match[0];
|
||||
// Skip example.com URLs - they're documentation, not real images
|
||||
if (raw.includes("example.com")) continue;
|
||||
if (seen.has(raw.toLowerCase())) continue;
|
||||
seen.add(raw.toLowerCase());
|
||||
refs.push({ raw, type: "url", resolved: raw });
|
||||
}
|
||||
|
||||
// Pattern for file paths (absolute, relative, or home)
|
||||
// Matches:
|
||||
// - /absolute/path/to/file.ext (including paths with special chars like Messages/Attachments)
|
||||
// - ./relative/path.ext
|
||||
// - ../parent/path.ext
|
||||
// - ~/home/path.ext
|
||||
const pathPattern = /(?:^|\s|["'`(])((\.\.?\/|[~/])[^\s"'`()[\]]*\.(?:png|jpe?g|gif|webp|bmp|tiff?|heic|heif))/gi;
|
||||
while ((match = pathPattern.exec(prompt)) !== null) {
|
||||
const raw = match[1] || match[0];
|
||||
addPathRef(raw);
|
||||
}
|
||||
|
||||
return refs;
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads an image from a file path or URL and returns it as ImageContent.
|
||||
*
|
||||
* @param ref The detected image reference
|
||||
* @param workspaceDir The current workspace directory for resolving relative paths
|
||||
* @param options Optional settings for sandbox and size limits
|
||||
* @returns The loaded image content, or null if loading failed
|
||||
*/
|
||||
export async function loadImageFromRef(
|
||||
ref: DetectedImageRef,
|
||||
workspaceDir: string,
|
||||
options?: {
|
||||
maxBytes?: number;
|
||||
/** If set, enforce that file paths are within this sandbox root */
|
||||
sandboxRoot?: string;
|
||||
},
|
||||
): Promise<ImageContent | null> {
|
||||
try {
|
||||
let targetPath = ref.resolved;
|
||||
|
||||
// When sandbox is enabled, block remote URL loading to maintain network boundary
|
||||
if (ref.type === "url" && options?.sandboxRoot) {
|
||||
log.debug(`Native image: rejecting remote URL in sandboxed session: ${ref.resolved}`);
|
||||
return null;
|
||||
}
|
||||
|
||||
// For file paths, resolve relative to the appropriate root:
|
||||
// - When sandbox is enabled, resolve relative to sandboxRoot for security
|
||||
// - Otherwise, resolve relative to workspaceDir
|
||||
if (ref.type === "path" && !path.isAbsolute(targetPath)) {
|
||||
const resolveRoot = options?.sandboxRoot ?? workspaceDir;
|
||||
targetPath = path.resolve(resolveRoot, targetPath);
|
||||
}
|
||||
|
||||
// Enforce sandbox restrictions if sandboxRoot is set
|
||||
if (ref.type === "path" && options?.sandboxRoot) {
|
||||
try {
|
||||
const validated = await assertSandboxPath({
|
||||
filePath: targetPath,
|
||||
cwd: options.sandboxRoot,
|
||||
root: options.sandboxRoot,
|
||||
});
|
||||
targetPath = validated.resolved;
|
||||
} catch (err) {
|
||||
// Log the actual error for debugging (sandbox violation or other path error)
|
||||
log.debug(`Native image: sandbox validation failed for ${ref.resolved}: ${err instanceof Error ? err.message : String(err)}`);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
// Check file exists for local paths
|
||||
if (ref.type === "path") {
|
||||
try {
|
||||
await fs.stat(targetPath);
|
||||
} catch {
|
||||
log.debug(`Native image: file not found: ${targetPath}`);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
// loadWebMedia handles both file paths and HTTP(S) URLs
|
||||
const media = await loadWebMedia(targetPath, options?.maxBytes);
|
||||
|
||||
if (media.kind !== "image") {
|
||||
log.debug(`Native image: not an image file: ${targetPath} (got ${media.kind})`);
|
||||
return null;
|
||||
}
|
||||
|
||||
// EXIF orientation is already normalized by loadWebMedia -> resizeToJpeg
|
||||
const mimeType = media.contentType ?? "image/png";
|
||||
const data = media.buffer.toString("base64");
|
||||
|
||||
return { type: "image", data, mimeType };
|
||||
} catch (err) {
|
||||
// Log the actual error for debugging (size limits, network failures, etc.)
|
||||
log.debug(`Native image: failed to load ${ref.resolved}: ${err instanceof Error ? err.message : String(err)}`);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks if a model supports image input based on its input capabilities.
|
||||
*
|
||||
* @param model The model object with input capability array
|
||||
* @returns True if the model supports image input
|
||||
*/
|
||||
export function modelSupportsImages(model: { input?: string[] }): boolean {
|
||||
return model.input?.includes("image") ?? false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Extracts image references from conversation history messages.
|
||||
* Scans user messages for image paths/URLs that can be loaded.
|
||||
* Each ref includes the messageIndex so images can be injected at their original location.
|
||||
*
|
||||
* Note: Global deduplication is intentional - if the same image appears in multiple
|
||||
* messages, we only inject it at the FIRST occurrence. This is sufficient because:
|
||||
* 1. The model sees all message content including the image
|
||||
* 2. Later references to "the image" or "that picture" will work since it's in context
|
||||
* 3. Injecting duplicates would waste tokens and potentially hit size limits
|
||||
*/
|
||||
function detectImagesFromHistory(messages: unknown[]): DetectedImageRef[] {
|
||||
const allRefs: DetectedImageRef[] = [];
|
||||
const seen = new Set<string>();
|
||||
|
||||
for (let i = 0; i < messages.length; i++) {
|
||||
const msg = messages[i];
|
||||
if (!msg || typeof msg !== "object") continue;
|
||||
const message = msg as { role?: string };
|
||||
// Only scan user messages for image references
|
||||
if (message.role !== "user") continue;
|
||||
|
||||
const text = extractTextFromMessage(msg);
|
||||
if (!text) continue;
|
||||
|
||||
const refs = detectImageReferences(text);
|
||||
for (const ref of refs) {
|
||||
const key = ref.resolved.toLowerCase();
|
||||
if (seen.has(key)) continue;
|
||||
seen.add(key);
|
||||
allRefs.push({ ...ref, messageIndex: i });
|
||||
}
|
||||
}
|
||||
|
||||
return allRefs;
|
||||
}
|
||||
|
||||
/**
|
||||
* Detects and loads images referenced in a prompt for models with vision capability.
|
||||
*
|
||||
* This function scans the prompt for image references (file paths and URLs),
|
||||
* loads them, and returns them as ImageContent array ready to be passed to
|
||||
* the model's prompt method.
|
||||
*
|
||||
* Also scans conversation history for images from previous turns and returns
|
||||
* them mapped by message index so they can be injected at their original location.
|
||||
*
|
||||
* @param params Configuration for image detection and loading
|
||||
* @returns Object with loaded images for current prompt and history images by message index
|
||||
*/
|
||||
export async function detectAndLoadPromptImages(params: {
|
||||
prompt: string;
|
||||
workspaceDir: string;
|
||||
model: { input?: string[] };
|
||||
existingImages?: ImageContent[];
|
||||
historyMessages?: unknown[];
|
||||
maxBytes?: number;
|
||||
/** If set, enforce that file paths are within this sandbox root */
|
||||
sandboxRoot?: string;
|
||||
}): Promise<{
|
||||
/** Images for the current prompt (existingImages + detected in current prompt) */
|
||||
images: ImageContent[];
|
||||
/** Images from history messages, keyed by message index */
|
||||
historyImagesByIndex: Map<number, ImageContent[]>;
|
||||
detectedRefs: DetectedImageRef[];
|
||||
loadedCount: number;
|
||||
skippedCount: number;
|
||||
}> {
|
||||
// If model doesn't support images, return empty results
|
||||
if (!modelSupportsImages(params.model)) {
|
||||
return {
|
||||
images: params.existingImages ?? [],
|
||||
historyImagesByIndex: new Map(),
|
||||
detectedRefs: [],
|
||||
loadedCount: 0,
|
||||
skippedCount: 0,
|
||||
};
|
||||
}
|
||||
|
||||
// Detect images from current prompt
|
||||
const promptRefs = detectImageReferences(params.prompt);
|
||||
|
||||
// Detect images from conversation history (with message indices)
|
||||
const historyRefs = params.historyMessages
|
||||
? detectImagesFromHistory(params.historyMessages)
|
||||
: [];
|
||||
|
||||
// Deduplicate: if an image is in the current prompt, don't also load it from history.
|
||||
// Current prompt images are passed via the `images` parameter to prompt(), while history
|
||||
// images are injected into their original message positions. We don't want the same
|
||||
// image loaded and sent twice (wasting tokens and potentially causing confusion).
|
||||
const seenPaths = new Set(promptRefs.map((r) => r.resolved.toLowerCase()));
|
||||
const uniqueHistoryRefs = historyRefs.filter(
|
||||
(r) => !seenPaths.has(r.resolved.toLowerCase()),
|
||||
);
|
||||
|
||||
const allRefs = [...promptRefs, ...uniqueHistoryRefs];
|
||||
|
||||
if (allRefs.length === 0) {
|
||||
return {
|
||||
images: params.existingImages ?? [],
|
||||
historyImagesByIndex: new Map(),
|
||||
detectedRefs: [],
|
||||
loadedCount: 0,
|
||||
skippedCount: 0,
|
||||
};
|
||||
}
|
||||
|
||||
log.debug(
|
||||
`Native image: detected ${allRefs.length} image refs (${promptRefs.length} in prompt, ${uniqueHistoryRefs.length} in history)`,
|
||||
);
|
||||
|
||||
// Load images for current prompt
|
||||
const promptImages: ImageContent[] = [...(params.existingImages ?? [])];
|
||||
// Load images for history, grouped by message index
|
||||
const historyImagesByIndex = new Map<number, ImageContent[]>();
|
||||
|
||||
let loadedCount = 0;
|
||||
let skippedCount = 0;
|
||||
|
||||
for (const ref of allRefs) {
|
||||
const image = await loadImageFromRef(ref, params.workspaceDir, {
|
||||
maxBytes: params.maxBytes,
|
||||
sandboxRoot: params.sandboxRoot,
|
||||
});
|
||||
if (image) {
|
||||
if (ref.messageIndex !== undefined) {
|
||||
// History image - add to the appropriate message index
|
||||
const existing = historyImagesByIndex.get(ref.messageIndex);
|
||||
if (existing) {
|
||||
existing.push(image);
|
||||
} else {
|
||||
historyImagesByIndex.set(ref.messageIndex, [image]);
|
||||
}
|
||||
} else {
|
||||
// Current prompt image
|
||||
promptImages.push(image);
|
||||
}
|
||||
loadedCount++;
|
||||
log.debug(`Native image: loaded ${ref.type} ${ref.resolved}`);
|
||||
} else {
|
||||
skippedCount++;
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
images: promptImages,
|
||||
historyImagesByIndex,
|
||||
detectedRefs: allRefs,
|
||||
loadedCount,
|
||||
skippedCount,
|
||||
};
|
||||
}
|
||||
Reference in New Issue
Block a user