From 8b57f519c38b8ea46ce31218d10eed71db38c837 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sun, 18 Jan 2026 08:06:50 +0000 Subject: [PATCH] fix: tighten native image injection (#1098) Thanks @tyler6204. Co-authored-by: Tyler Yust --- CHANGELOG.md | 1 + src/agents/pi-embedded-runner/run/attempt.ts | 2 +- .../pi-embedded-runner/run/images.test.ts | 67 ++++++++++------- src/agents/pi-embedded-runner/run/images.ts | 40 +++++----- src/agents/tools/image-tool.ts | 74 +------------------ src/config/types.tools.ts | 15 ++++ src/media/image-ops.ts | 1 - 7 files changed, 81 insertions(+), 119 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 949a6333b..9cefb0af4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,7 @@ Docs: https://docs.clawd.bot - Bridge: add `skills.bins` RPC to support node host auto-allow skill bins. - Slash commands: replace `/cost` with `/usage off|tokens|full` to control per-response usage footer; `/usage` no longer aliases `/status`. (Supersedes #1140) — thanks @Nachx639. - Sessions: add daily reset policy with per-type overrides and idle windows (default 4am local), preserving legacy idle-only configs. (#1146) — thanks @austinm911. +- Agents: auto-inject local image references for vision models and avoid reloading history images. (#1098) — thanks @tyler6204. - Docs: refresh exec/elevated/exec-approvals docs for the new flow. https://docs.clawd.bot/tools/exec-approvals - Docs: add node host CLI + update exec approvals/bridge protocol docs. https://docs.clawd.bot/cli/node - ACP: add experimental ACP support for IDE integrations (`clawdbot acp`). Thanks @visionik. diff --git a/src/agents/pi-embedded-runner/run/attempt.ts b/src/agents/pi-embedded-runner/run/attempt.ts index 94a80dcb5..da0d495e2 100644 --- a/src/agents/pi-embedded-runner/run/attempt.ts +++ b/src/agents/pi-embedded-runner/run/attempt.ts @@ -68,6 +68,7 @@ import { describeUnknownError, mapThinkingLevel } from "../utils.js"; import { resolveSandboxRuntimeStatus } from "../../sandbox/runtime-status.js"; import { isTimeoutError } from "../../failover-error.js"; import { getGlobalHookRunner } from "../../../plugins/hook-runner-global.js"; +import { MAX_IMAGE_BYTES } from "../../../media/constants.js"; import { MAX_IMAGE_BYTES } from "../../../media/constants.js"; import type { EmbeddedRunAttemptParams, EmbeddedRunAttemptResult } from "./types.js"; @@ -137,7 +138,6 @@ export async function runEmbeddedAttempt( // Check if the model supports native image input const modelHasVision = params.model.input?.includes("image") ?? false; - const toolsRaw = createClawdbotCodingTools({ exec: { ...params.execOverrides, diff --git a/src/agents/pi-embedded-runner/run/images.test.ts b/src/agents/pi-embedded-runner/run/images.test.ts index 918f2027c..3c25a8e6d 100644 --- a/src/agents/pi-embedded-runner/run/images.test.ts +++ b/src/agents/pi-embedded-runner/run/images.test.ts @@ -1,6 +1,6 @@ import { describe, expect, it } from "vitest"; -import { detectImageReferences, modelSupportsImages } from "./images.js"; +import { detectAndLoadPromptImages, detectImageReferences, modelSupportsImages } from "./images.js"; describe("detectImageReferences", () => { it("detects absolute file paths with common extensions", () => { @@ -44,27 +44,6 @@ describe("detectImageReferences", () => { expect(refs[0]?.resolved).not.toContain("~"); }); - it("detects HTTP URLs with image extensions", () => { - const prompt = "Check this URL: https://mysite.com/images/logo.png"; - const refs = detectImageReferences(prompt); - - expect(refs).toHaveLength(1); - expect(refs[0]).toEqual({ - raw: "https://mysite.com/images/logo.png", - type: "url", - resolved: "https://mysite.com/images/logo.png", - }); - }); - - it("detects HTTPS URLs with query parameters", () => { - const prompt = "Image from https://cdn.mysite.com/img.jpg?size=large&v=2"; - const refs = detectImageReferences(prompt); - - expect(refs).toHaveLength(1); - expect(refs[0]?.type).toBe("url"); - expect(refs[0]?.raw).toContain("https://cdn.mysite.com/img.jpg"); - }); - it("detects multiple image references in a prompt", () => { const prompt = ` Compare these two images: @@ -73,9 +52,9 @@ describe("detectImageReferences", () => { `; const refs = detectImageReferences(prompt); - expect(refs).toHaveLength(2); + expect(refs).toHaveLength(1); expect(refs.some((r) => r.type === "path")).toBe(true); - expect(refs.some((r) => r.type === "url")).toBe(true); + expect(refs.some((r) => r.type === "url")).toBe(false); }); it("handles various image extensions", () => { @@ -165,9 +144,10 @@ what about these images?`; expect(refs[0]?.resolved).toContain("IMG_6430.jpeg"); }); - it("skips example.com URLs as they are documentation examples", () => { + it("ignores remote URLs entirely (local-only)", () => { const prompt = `To send an image: MEDIA:https://example.com/image.jpg -Here is my actual image: /path/to/real.png`; +Here is my actual image: /path/to/real.png +Also https://cdn.mysite.com/img.jpg`; const refs = detectImageReferences(prompt); expect(refs).toHaveLength(1); @@ -216,3 +196,38 @@ describe("modelSupportsImages", () => { expect(modelSupportsImages(model)).toBe(false); }); }); + +describe("detectAndLoadPromptImages", () => { + it("returns no images for non-vision models even when existing images are provided", async () => { + const result = await detectAndLoadPromptImages({ + prompt: "ignore", + workspaceDir: "/tmp", + model: { input: ["text"] }, + existingImages: [{ type: "image", data: "abc", mimeType: "image/png" }], + }); + + expect(result.images).toHaveLength(0); + expect(result.detectedRefs).toHaveLength(0); + }); + + it("skips history messages that already include image content", async () => { + const result = await detectAndLoadPromptImages({ + prompt: "no images here", + workspaceDir: "/tmp", + model: { input: ["text", "image"] }, + historyMessages: [ + { + role: "user", + content: [ + { type: "text", text: "See /tmp/should-not-load.png" }, + { type: "image", data: "abc", mimeType: "image/png" }, + ], + }, + ], + }); + + expect(result.detectedRefs).toHaveLength(0); + expect(result.images).toHaveLength(0); + expect(result.historyImagesByIndex.size).toBe(0); + }); +}); diff --git a/src/agents/pi-embedded-runner/run/images.ts b/src/agents/pi-embedded-runner/run/images.ts index accaa8d73..6a5c4d828 100644 --- a/src/agents/pi-embedded-runner/run/images.ts +++ b/src/agents/pi-embedded-runner/run/images.ts @@ -55,7 +55,7 @@ function isImageExtension(filePath: string): boolean { * - Absolute paths: /path/to/image.png * - Relative paths: ./image.png, ../images/photo.jpg * - Home paths: ~/Pictures/screenshot.png - * - HTTP(S) URLs: https://example.com/image.png + * - file:// URLs: file:///path/to/image.png * - Message attachments: [Image: source: /path/to/image.jpg] * * @param prompt The user prompt text to scan @@ -69,6 +69,7 @@ export function detectImageReferences(prompt: string): DetectedImageRef[] { const addPathRef = (raw: string) => { const trimmed = raw.trim(); if (!trimmed || seen.has(trimmed.toLowerCase())) return; + if (trimmed.startsWith("http://") || trimmed.startsWith("https://")) return; if (!isImageExtension(trimmed)) return; seen.add(trimmed.toLowerCase()); const resolved = trimmed.startsWith("~") ? resolveUserPath(trimmed) : trimmed; @@ -107,18 +108,7 @@ export function detectImageReferences(prompt: string): DetectedImageRef[] { if (raw) addPathRef(raw); } - // Pattern for HTTP(S) URLs ending in image extensions - // Skip example.com URLs as they're often just documentation examples - const urlPattern = - /https?:\/\/[^\s<>"'`\]]+\.(?:png|jpe?g|gif|webp|bmp|tiff?|heic|heif)(?:\?[^\s<>"'`\]]*)?/gi; - while ((match = urlPattern.exec(prompt)) !== null) { - const raw = match[0]; - // Skip example.com URLs - they're documentation, not real images - if (raw.includes("example.com")) continue; - if (seen.has(raw.toLowerCase())) continue; - seen.add(raw.toLowerCase()); - refs.push({ raw, type: "url", resolved: raw }); - } + // Remote HTTP(S) URLs are intentionally ignored. Native image injection is local-only. // Pattern for file:// URLs - treat as paths since loadWebMedia handles them const fileUrlPattern = @@ -171,9 +161,9 @@ export async function loadImageFromRef( try { let targetPath = ref.resolved; - // When sandbox is enabled, block remote URL loading to maintain network boundary - if (ref.type === "url" && options?.sandboxRoot) { - log.debug(`Native image: rejecting remote URL in sandboxed session: ${ref.resolved}`); + // Remote URL loading is disabled (local-only). + if (ref.type === "url") { + log.debug(`Native image: rejecting remote URL (local-only): ${ref.resolved}`); return null; } @@ -213,7 +203,7 @@ export async function loadImageFromRef( } } - // loadWebMedia handles both file paths and HTTP(S) URLs + // loadWebMedia handles local file paths (including file:// URLs) const media = await loadWebMedia(targetPath, options?.maxBytes); if (media.kind !== "image") { @@ -259,12 +249,26 @@ function detectImagesFromHistory(messages: unknown[]): DetectedImageRef[] { const allRefs: DetectedImageRef[] = []; const seen = new Set(); + const messageHasImageContent = (msg: unknown): boolean => { + if (!msg || typeof msg !== "object") return false; + const content = (msg as { content?: unknown }).content; + if (!Array.isArray(content)) return false; + return content.some( + (part) => + part != null && + typeof part === "object" && + (part as { type?: string }).type === "image", + ); + }; + for (let i = 0; i < messages.length; i++) { const msg = messages[i]; if (!msg || typeof msg !== "object") continue; const message = msg as { role?: string }; // Only scan user messages for image references if (message.role !== "user") continue; + // Skip if message already has image content (prevents reloading each turn) + if (messageHasImageContent(msg)) continue; const text = extractTextFromMessage(msg); if (!text) continue; @@ -315,7 +319,7 @@ export async function detectAndLoadPromptImages(params: { // If model doesn't support images, return empty results if (!modelSupportsImages(params.model)) { return { - images: params.existingImages ?? [], + images: [], historyImagesByIndex: new Map(), detectedRefs: [], loadedCount: 0, diff --git a/src/agents/tools/image-tool.ts b/src/agents/tools/image-tool.ts index 119423353..59e5df47d 100644 --- a/src/agents/tools/image-tool.ts +++ b/src/agents/tools/image-tool.ts @@ -1,4 +1,3 @@ -import fsSync from "node:fs"; import fs from "node:fs/promises"; import path from "node:path"; @@ -20,7 +19,7 @@ import { DEFAULT_MODEL, DEFAULT_PROVIDER } from "../defaults.js"; import { minimaxUnderstandImage } from "../minimax-vlm.js"; import { getApiKeyForModel, resolveEnvApiKey } from "../model-auth.js"; import { runWithImageModelFallback } from "../model-fallback.js"; -import { normalizeProviderId, resolveConfiguredModelRef } from "../model-selection.js"; +import { resolveConfiguredModelRef } from "../model-selection.js"; import { ensureClawdbotModelsJson } from "../models-config.js"; import { assertSandboxPath } from "../sandbox-paths.js"; import type { AnyAgentTool } from "./common.js"; @@ -62,77 +61,6 @@ function hasAuthForProvider(params: { provider: string; agentDir: string }): boo return listProfilesForProvider(store, params.provider).length > 0; } -type ProviderModelEntry = { - id?: string; - input?: string[]; -}; - -type ProviderConfigLike = { - models?: ProviderModelEntry[]; -}; - -function resolveProviderConfig( - providers: Record | undefined, - provider: string, -): ProviderConfigLike | null { - if (!providers) return null; - const normalized = normalizeProviderId(provider); - for (const [key, value] of Object.entries(providers)) { - if (normalizeProviderId(key) === normalized) return value; - } - return null; -} - -function resolveModelSupportsImages(params: { - providerConfig: ProviderConfigLike | null; - modelId: string; -}): boolean | null { - const models = params.providerConfig?.models; - if (!Array.isArray(models) || models.length === 0) return null; - const trimmedId = params.modelId.trim(); - if (!trimmedId) return null; - const match = - models.find((model) => String(model?.id ?? "").trim() === trimmedId) ?? - models.find( - (model) => - String(model?.id ?? "") - .trim() - .toLowerCase() === trimmedId.toLowerCase(), - ); - if (!match) return null; - const input = Array.isArray(match.input) ? match.input : []; - return input.includes("image"); -} - -function resolvePrimaryModelSupportsImages(params: { - cfg?: ClawdbotConfig; - agentDir: string; -}): boolean | null { - if (!params.cfg) return null; - const primary = resolveDefaultModelRef(params.cfg); - const providerConfig = resolveProviderConfig( - params.cfg.models?.providers as Record | undefined, - primary.provider, - ); - const fromConfig = resolveModelSupportsImages({ - providerConfig, - modelId: primary.model, - }); - if (fromConfig !== null) return fromConfig; - try { - const modelsPath = path.join(params.agentDir, "models.json"); - const raw = fsSync.readFileSync(modelsPath, "utf8"); - const parsed = JSON.parse(raw) as { providers?: Record }; - const provider = resolveProviderConfig(parsed.providers, primary.provider); - return resolveModelSupportsImages({ - providerConfig: provider, - modelId: primary.model, - }); - } catch { - return null; - } -} - /** * Resolve the effective image model config for the `image` tool. * diff --git a/src/config/types.tools.ts b/src/config/types.tools.ts index 6f27b0b69..456060d06 100644 --- a/src/config/types.tools.ts +++ b/src/config/types.tools.ts @@ -120,6 +120,21 @@ export type ToolPolicyConfig = { profile?: ToolProfileId; }; +export type ExecToolConfig = { + host?: "sandbox" | "gateway" | "node"; + security?: "deny" | "allowlist" | "full"; + ask?: "off" | "on-miss" | "always"; + node?: string; + backgroundMs?: number; + timeoutSec?: number; + cleanupMs?: number; + notifyOnExit?: boolean; + applyPatch?: { + enabled?: boolean; + allowModels?: string[]; + }; +}; + export type AgentToolsConfig = { /** Base tool profile applied before allow/deny lists. */ profile?: ToolProfileId; diff --git a/src/media/image-ops.ts b/src/media/image-ops.ts index e97e14b1c..c79e537a9 100644 --- a/src/media/image-ops.ts +++ b/src/media/image-ops.ts @@ -59,7 +59,6 @@ function readJpegExifOrientation(buffer: Buffer): number | null { // APP1 marker (EXIF) if (marker === 0xe1) { - const segmentLength = buffer.readUInt16BE(offset + 2); const exifStart = offset + 4; // Check for "Exif\0\0" header