fix: tighten native image injection (#1098)

Thanks @tyler6204.

Co-authored-by: Tyler Yust <tyler6204@users.noreply.github.com>
This commit is contained in:
Peter Steinberger
2026-01-18 08:06:50 +00:00
parent ddcc05f5f4
commit 8b57f519c3
7 changed files with 81 additions and 119 deletions

View File

@@ -68,6 +68,7 @@ import { describeUnknownError, mapThinkingLevel } from "../utils.js";
import { resolveSandboxRuntimeStatus } from "../../sandbox/runtime-status.js";
import { isTimeoutError } from "../../failover-error.js";
import { getGlobalHookRunner } from "../../../plugins/hook-runner-global.js";
import { MAX_IMAGE_BYTES } from "../../../media/constants.js";
import { MAX_IMAGE_BYTES } from "../../../media/constants.js";
import type { EmbeddedRunAttemptParams, EmbeddedRunAttemptResult } from "./types.js";
@@ -137,7 +138,6 @@ export async function runEmbeddedAttempt(
// Check if the model supports native image input
const modelHasVision = params.model.input?.includes("image") ?? false;
const toolsRaw = createClawdbotCodingTools({
exec: {
...params.execOverrides,

View File

@@ -1,6 +1,6 @@
import { describe, expect, it } from "vitest";
import { detectImageReferences, modelSupportsImages } from "./images.js";
import { detectAndLoadPromptImages, detectImageReferences, modelSupportsImages } from "./images.js";
describe("detectImageReferences", () => {
it("detects absolute file paths with common extensions", () => {
@@ -44,27 +44,6 @@ describe("detectImageReferences", () => {
expect(refs[0]?.resolved).not.toContain("~");
});
it("detects HTTP URLs with image extensions", () => {
const prompt = "Check this URL: https://mysite.com/images/logo.png";
const refs = detectImageReferences(prompt);
expect(refs).toHaveLength(1);
expect(refs[0]).toEqual({
raw: "https://mysite.com/images/logo.png",
type: "url",
resolved: "https://mysite.com/images/logo.png",
});
});
it("detects HTTPS URLs with query parameters", () => {
const prompt = "Image from https://cdn.mysite.com/img.jpg?size=large&v=2";
const refs = detectImageReferences(prompt);
expect(refs).toHaveLength(1);
expect(refs[0]?.type).toBe("url");
expect(refs[0]?.raw).toContain("https://cdn.mysite.com/img.jpg");
});
it("detects multiple image references in a prompt", () => {
const prompt = `
Compare these two images:
@@ -73,9 +52,9 @@ describe("detectImageReferences", () => {
`;
const refs = detectImageReferences(prompt);
expect(refs).toHaveLength(2);
expect(refs).toHaveLength(1);
expect(refs.some((r) => r.type === "path")).toBe(true);
expect(refs.some((r) => r.type === "url")).toBe(true);
expect(refs.some((r) => r.type === "url")).toBe(false);
});
it("handles various image extensions", () => {
@@ -165,9 +144,10 @@ what about these images?`;
expect(refs[0]?.resolved).toContain("IMG_6430.jpeg");
});
it("skips example.com URLs as they are documentation examples", () => {
it("ignores remote URLs entirely (local-only)", () => {
const prompt = `To send an image: MEDIA:https://example.com/image.jpg
Here is my actual image: /path/to/real.png`;
Here is my actual image: /path/to/real.png
Also https://cdn.mysite.com/img.jpg`;
const refs = detectImageReferences(prompt);
expect(refs).toHaveLength(1);
@@ -216,3 +196,38 @@ describe("modelSupportsImages", () => {
expect(modelSupportsImages(model)).toBe(false);
});
});
// Tests for detectAndLoadPromptImages: covers the non-vision-model short
// circuit and the skipping of history messages that already carry image parts.
describe("detectAndLoadPromptImages", () => {
// The model below only lists "text" as an input, so even the caller-supplied
// existingImages entry is expected to be absent from the result.
it("returns no images for non-vision models even when existing images are provided", async () => {
const result = await detectAndLoadPromptImages({
prompt: "ignore",
workspaceDir: "/tmp",
model: { input: ["text"] },
existingImages: [{ type: "image", data: "abc", mimeType: "image/png" }],
});
expect(result.images).toHaveLength(0);
expect(result.detectedRefs).toHaveLength(0);
});
// The model lists "image" as an input and the prompt contains no image
// reference. The only path-like text lives in a user history message that
// already has an image content part, so nothing should be detected or loaded.
it("skips history messages that already include image content", async () => {
const result = await detectAndLoadPromptImages({
prompt: "no images here",
workspaceDir: "/tmp",
model: { input: ["text", "image"] },
historyMessages: [
{
role: "user",
content: [
// Text part mentions an image path that must NOT be picked up...
{ type: "text", text: "See /tmp/should-not-load.png" },
// ...because this sibling part already is an image.
{ type: "image", data: "abc", mimeType: "image/png" },
],
},
],
});
expect(result.detectedRefs).toHaveLength(0);
expect(result.images).toHaveLength(0);
expect(result.historyImagesByIndex.size).toBe(0);
});
});

View File

@@ -55,7 +55,7 @@ function isImageExtension(filePath: string): boolean {
* - Absolute paths: /path/to/image.png
* - Relative paths: ./image.png, ../images/photo.jpg
* - Home paths: ~/Pictures/screenshot.png
* - HTTP(S) URLs: https://example.com/image.png
* - file:// URLs: file:///path/to/image.png
* - Message attachments: [Image: source: /path/to/image.jpg]
*
* @param prompt The user prompt text to scan
@@ -69,6 +69,7 @@ export function detectImageReferences(prompt: string): DetectedImageRef[] {
const addPathRef = (raw: string) => {
const trimmed = raw.trim();
if (!trimmed || seen.has(trimmed.toLowerCase())) return;
if (trimmed.startsWith("http://") || trimmed.startsWith("https://")) return;
if (!isImageExtension(trimmed)) return;
seen.add(trimmed.toLowerCase());
const resolved = trimmed.startsWith("~") ? resolveUserPath(trimmed) : trimmed;
@@ -107,18 +108,7 @@ export function detectImageReferences(prompt: string): DetectedImageRef[] {
if (raw) addPathRef(raw);
}
// Pattern for HTTP(S) URLs ending in image extensions
// Skip example.com URLs as they're often just documentation examples
const urlPattern =
/https?:\/\/[^\s<>"'`\]]+\.(?:png|jpe?g|gif|webp|bmp|tiff?|heic|heif)(?:\?[^\s<>"'`\]]*)?/gi;
while ((match = urlPattern.exec(prompt)) !== null) {
const raw = match[0];
// Skip example.com URLs - they're documentation, not real images
if (raw.includes("example.com")) continue;
if (seen.has(raw.toLowerCase())) continue;
seen.add(raw.toLowerCase());
refs.push({ raw, type: "url", resolved: raw });
}
// Remote HTTP(S) URLs are intentionally ignored. Native image injection is local-only.
// Pattern for file:// URLs - treat as paths since loadWebMedia handles them
const fileUrlPattern =
@@ -171,9 +161,9 @@ export async function loadImageFromRef(
try {
let targetPath = ref.resolved;
// When sandbox is enabled, block remote URL loading to maintain network boundary
if (ref.type === "url" && options?.sandboxRoot) {
log.debug(`Native image: rejecting remote URL in sandboxed session: ${ref.resolved}`);
// Remote URL loading is disabled (local-only).
if (ref.type === "url") {
log.debug(`Native image: rejecting remote URL (local-only): ${ref.resolved}`);
return null;
}
@@ -213,7 +203,7 @@ export async function loadImageFromRef(
}
}
// loadWebMedia handles both file paths and HTTP(S) URLs
// loadWebMedia handles local file paths (including file:// URLs)
const media = await loadWebMedia(targetPath, options?.maxBytes);
if (media.kind !== "image") {
@@ -259,12 +249,26 @@ function detectImagesFromHistory(messages: unknown[]): DetectedImageRef[] {
const allRefs: DetectedImageRef[] = [];
const seen = new Set<string>();
// Reports whether a message's `content` array already holds at least one
// part tagged `type: "image"`. Anything that is not an object with an array
// `content` yields false.
const messageHasImageContent = (msg: unknown): boolean => {
  if (!msg || typeof msg !== "object") {
    return false;
  }
  const { content } = msg as { content?: unknown };
  if (!Array.isArray(content)) {
    return false;
  }
  for (const entry of content) {
    // Skip null/undefined entries and primitives; only objects can be tagged.
    if (entry == null || typeof entry !== "object") {
      continue;
    }
    if ((entry as { type?: string }).type === "image") {
      return true;
    }
  }
  return false;
};
for (let i = 0; i < messages.length; i++) {
const msg = messages[i];
if (!msg || typeof msg !== "object") continue;
const message = msg as { role?: string };
// Only scan user messages for image references
if (message.role !== "user") continue;
// Skip if message already has image content (prevents reloading each turn)
if (messageHasImageContent(msg)) continue;
const text = extractTextFromMessage(msg);
if (!text) continue;
@@ -315,7 +319,7 @@ export async function detectAndLoadPromptImages(params: {
// If model doesn't support images, return empty results
if (!modelSupportsImages(params.model)) {
return {
images: params.existingImages ?? [],
images: [],
historyImagesByIndex: new Map(),
detectedRefs: [],
loadedCount: 0,