From 8b57f519c38b8ea46ce31218d10eed71db38c837 Mon Sep 17 00:00:00 2001
From: Peter Steinberger <steipete@gmail.com>
Date: Sun, 18 Jan 2026 08:06:50 +0000
Subject: [PATCH] fix: tighten native image injection (#1098)

Thanks @tyler6204.

Co-authored-by: Tyler Yust <tyler6204@users.noreply.github.com>
---
 CHANGELOG.md                                  |  1 +
 src/agents/pi-embedded-runner/run/attempt.ts  |  2 +-
 .../pi-embedded-runner/run/images.test.ts     | 67 ++++++++++-------
 src/agents/pi-embedded-runner/run/images.ts   | 40 +++++-----
 src/agents/tools/image-tool.ts                | 74 +------------------
 src/config/types.tools.ts                     | 15 ++++
 src/media/image-ops.ts                        |  1 -
 7 files changed, 81 insertions(+), 119 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 949a6333b..9cefb0af4 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -21,6 +21,7 @@ Docs: https://docs.clawd.bot
 - Bridge: add `skills.bins` RPC to support node host auto-allow skill bins.
 - Slash commands: replace `/cost` with `/usage off|tokens|full` to control per-response usage footer; `/usage` no longer aliases `/status`. (Supersedes #1140) — thanks @Nachx639.
 - Sessions: add daily reset policy with per-type overrides and idle windows (default 4am local), preserving legacy idle-only configs. (#1146) — thanks @austinm911.
+- Agents: auto-inject local image references for vision models and avoid reloading history images. (#1098) — thanks @tyler6204.
 - Docs: refresh exec/elevated/exec-approvals docs for the new flow. https://docs.clawd.bot/tools/exec-approvals
 - Docs: add node host CLI + update exec approvals/bridge protocol docs. https://docs.clawd.bot/cli/node
 - ACP: add experimental ACP support for IDE integrations (`clawdbot acp`). Thanks @visionik.
diff --git a/src/agents/pi-embedded-runner/run/attempt.ts b/src/agents/pi-embedded-runner/run/attempt.ts
index 94a80dcb5..da0d495e2 100644
--- a/src/agents/pi-embedded-runner/run/attempt.ts
+++ b/src/agents/pi-embedded-runner/run/attempt.ts
@@ -68,6 +68,7 @@ import { describeUnknownError, mapThinkingLevel } from "../utils.js";
 import { resolveSandboxRuntimeStatus } from "../../sandbox/runtime-status.js";
 import { isTimeoutError } from "../../failover-error.js";
 import { getGlobalHookRunner } from "../../../plugins/hook-runner-global.js";
+import { MAX_IMAGE_BYTES } from "../../../media/constants.js";
 
 import { MAX_IMAGE_BYTES } from "../../../media/constants.js";
 import type { EmbeddedRunAttemptParams, EmbeddedRunAttemptResult } from "./types.js";
@@ -137,7 +138,6 @@ export async function runEmbeddedAttempt(
 
     // Check if the model supports native image input
     const modelHasVision = params.model.input?.includes("image") ?? false;
-
     const toolsRaw = createClawdbotCodingTools({
       exec: {
         ...params.execOverrides,
diff --git a/src/agents/pi-embedded-runner/run/images.test.ts b/src/agents/pi-embedded-runner/run/images.test.ts
index 918f2027c..3c25a8e6d 100644
--- a/src/agents/pi-embedded-runner/run/images.test.ts
+++ b/src/agents/pi-embedded-runner/run/images.test.ts
@@ -1,6 +1,6 @@
 import { describe, expect, it } from "vitest";
 
-import { detectImageReferences, modelSupportsImages } from "./images.js";
+import { detectAndLoadPromptImages, detectImageReferences, modelSupportsImages } from "./images.js";
 
 describe("detectImageReferences", () => {
   it("detects absolute file paths with common extensions", () => {
@@ -44,27 +44,6 @@ describe("detectImageReferences", () => {
     expect(refs[0]?.resolved).not.toContain("~");
   });
 
-  it("detects HTTP URLs with image extensions", () => {
-    const prompt = "Check this URL: https://mysite.com/images/logo.png";
-    const refs = detectImageReferences(prompt);
-
-    expect(refs).toHaveLength(1);
-    expect(refs[0]).toEqual({
-      raw: "https://mysite.com/images/logo.png",
-      type: "url",
-      resolved: "https://mysite.com/images/logo.png",
-    });
-  });
-
-  it("detects HTTPS URLs with query parameters", () => {
-    const prompt = "Image from https://cdn.mysite.com/img.jpg?size=large&v=2";
-    const refs = detectImageReferences(prompt);
-
-    expect(refs).toHaveLength(1);
-    expect(refs[0]?.type).toBe("url");
-    expect(refs[0]?.raw).toContain("https://cdn.mysite.com/img.jpg");
-  });
-
   it("detects multiple image references in a prompt", () => {
     const prompt = `
       Compare these two images:
@@ -73,9 +52,9 @@ describe("detectImageReferences", () => {
     `;
     const refs = detectImageReferences(prompt);
 
-    expect(refs).toHaveLength(2);
+    expect(refs).toHaveLength(1);
     expect(refs.some((r) => r.type === "path")).toBe(true);
-    expect(refs.some((r) => r.type === "url")).toBe(true);
+    expect(refs.some((r) => r.type === "url")).toBe(false);
   });
 
   it("handles various image extensions", () => {
@@ -165,9 +144,10 @@ what about these images?`;
     expect(refs[0]?.resolved).toContain("IMG_6430.jpeg");
   });
 
-  it("skips example.com URLs as they are documentation examples", () => {
+  it("ignores remote URLs entirely (local-only)", () => {
     const prompt = `To send an image: MEDIA:https://example.com/image.jpg
-Here is my actual image: /path/to/real.png`;
+Here is my actual image: /path/to/real.png
+Also https://cdn.mysite.com/img.jpg`;
     const refs = detectImageReferences(prompt);
 
     expect(refs).toHaveLength(1);
@@ -216,3 +196,38 @@ describe("modelSupportsImages", () => {
     expect(modelSupportsImages(model)).toBe(false);
   });
 });
+
+describe("detectAndLoadPromptImages", () => {
+  it("returns no images for non-vision models even when existing images are provided", async () => {
+    const result = await detectAndLoadPromptImages({
+      prompt: "ignore",
+      workspaceDir: "/tmp",
+      model: { input: ["text"] },
+      existingImages: [{ type: "image", data: "abc", mimeType: "image/png" }],
+    });
+
+    expect(result.images).toHaveLength(0);
+    expect(result.detectedRefs).toHaveLength(0);
+  });
+
+  it("skips history messages that already include image content", async () => {
+    const result = await detectAndLoadPromptImages({
+      prompt: "no images here",
+      workspaceDir: "/tmp",
+      model: { input: ["text", "image"] },
+      historyMessages: [
+        {
+          role: "user",
+          content: [
+            { type: "text", text: "See /tmp/should-not-load.png" },
+            { type: "image", data: "abc", mimeType: "image/png" },
+          ],
+        },
+      ],
+    });
+
+    expect(result.detectedRefs).toHaveLength(0);
+    expect(result.images).toHaveLength(0);
+    expect(result.historyImagesByIndex.size).toBe(0);
+  });
+});
diff --git a/src/agents/pi-embedded-runner/run/images.ts b/src/agents/pi-embedded-runner/run/images.ts
index accaa8d73..6a5c4d828 100644
--- a/src/agents/pi-embedded-runner/run/images.ts
+++ b/src/agents/pi-embedded-runner/run/images.ts
@@ -55,7 +55,7 @@ function isImageExtension(filePath: string): boolean {
  * - Absolute paths: /path/to/image.png
  * - Relative paths: ./image.png, ../images/photo.jpg
  * - Home paths: ~/Pictures/screenshot.png
- * - HTTP(S) URLs: https://example.com/image.png
+ * - file:// URLs: file:///path/to/image.png
  * - Message attachments: [Image: source: /path/to/image.jpg]
  *
  * @param prompt The user prompt text to scan
@@ -69,6 +69,7 @@ export function detectImageReferences(prompt: string): DetectedImageRef[] {
   const addPathRef = (raw: string) => {
     const trimmed = raw.trim();
     if (!trimmed || seen.has(trimmed.toLowerCase())) return;
+    if (trimmed.startsWith("http://") || trimmed.startsWith("https://")) return;
     if (!isImageExtension(trimmed)) return;
     seen.add(trimmed.toLowerCase());
     const resolved = trimmed.startsWith("~") ? resolveUserPath(trimmed) : trimmed;
@@ -107,18 +108,7 @@ export function detectImageReferences(prompt: string): DetectedImageRef[] {
     if (raw) addPathRef(raw);
   }
 
-  // Pattern for HTTP(S) URLs ending in image extensions
-  // Skip example.com URLs as they're often just documentation examples
-  const urlPattern =
-    /https?:\/\/[^\s<>"'`\]]+\.(?:png|jpe?g|gif|webp|bmp|tiff?|heic|heif)(?:\?[^\s<>"'`\]]*)?/gi;
-  while ((match = urlPattern.exec(prompt)) !== null) {
-    const raw = match[0];
-    // Skip example.com URLs - they're documentation, not real images
-    if (raw.includes("example.com")) continue;
-    if (seen.has(raw.toLowerCase())) continue;
-    seen.add(raw.toLowerCase());
-    refs.push({ raw, type: "url", resolved: raw });
-  }
+  // Remote HTTP(S) URLs are intentionally ignored. Native image injection is local-only.
 
   // Pattern for file:// URLs - treat as paths since loadWebMedia handles them
   const fileUrlPattern =
@@ -171,9 +161,9 @@ export async function loadImageFromRef(
   try {
     let targetPath = ref.resolved;
 
-    // When sandbox is enabled, block remote URL loading to maintain network boundary
-    if (ref.type === "url" && options?.sandboxRoot) {
-      log.debug(`Native image: rejecting remote URL in sandboxed session: ${ref.resolved}`);
+    // Remote URL loading is disabled (local-only).
+    if (ref.type === "url") {
+      log.debug(`Native image: rejecting remote URL (local-only): ${ref.resolved}`);
       return null;
     }
 
@@ -213,7 +203,7 @@ export async function loadImageFromRef(
       }
     }
 
-    // loadWebMedia handles both file paths and HTTP(S) URLs
+    // loadWebMedia handles local file paths (including file:// URLs)
     const media = await loadWebMedia(targetPath, options?.maxBytes);
 
     if (media.kind !== "image") {
@@ -259,12 +249,26 @@ function detectImagesFromHistory(messages: unknown[]): DetectedImageRef[] {
   const allRefs: DetectedImageRef[] = [];
   const seen = new Set<string>();
 
+  const messageHasImageContent = (msg: unknown): boolean => {
+    if (!msg || typeof msg !== "object") return false;
+    const content = (msg as { content?: unknown }).content;
+    if (!Array.isArray(content)) return false;
+    return content.some(
+      (part) =>
+        part != null &&
+        typeof part === "object" &&
+        (part as { type?: string }).type === "image",
+    );
+  };
+
   for (let i = 0; i < messages.length; i++) {
     const msg = messages[i];
     if (!msg || typeof msg !== "object") continue;
     const message = msg as { role?: string };
     // Only scan user messages for image references
     if (message.role !== "user") continue;
+    // Skip if message already has image content (prevents reloading each turn)
+    if (messageHasImageContent(msg)) continue;
 
     const text = extractTextFromMessage(msg);
     if (!text) continue;
@@ -315,7 +319,7 @@ export async function detectAndLoadPromptImages(params: {
   // If model doesn't support images, return empty results
   if (!modelSupportsImages(params.model)) {
     return {
-      images: params.existingImages ?? [],
+      images: [],
       historyImagesByIndex: new Map(),
       detectedRefs: [],
       loadedCount: 0,
diff --git a/src/agents/tools/image-tool.ts b/src/agents/tools/image-tool.ts
index 119423353..59e5df47d 100644
--- a/src/agents/tools/image-tool.ts
+++ b/src/agents/tools/image-tool.ts
@@ -1,4 +1,3 @@
-import fsSync from "node:fs";
 import fs from "node:fs/promises";
 import path from "node:path";
 
@@ -20,7 +19,7 @@ import { DEFAULT_MODEL, DEFAULT_PROVIDER } from "../defaults.js";
 import { minimaxUnderstandImage } from "../minimax-vlm.js";
 import { getApiKeyForModel, resolveEnvApiKey } from "../model-auth.js";
 import { runWithImageModelFallback } from "../model-fallback.js";
-import { normalizeProviderId, resolveConfiguredModelRef } from "../model-selection.js";
+import { resolveConfiguredModelRef } from "../model-selection.js";
 import { ensureClawdbotModelsJson } from "../models-config.js";
 import { assertSandboxPath } from "../sandbox-paths.js";
 import type { AnyAgentTool } from "./common.js";
@@ -62,77 +61,6 @@ function hasAuthForProvider(params: { provider: string; agentDir: string }): boo
   return listProfilesForProvider(store, params.provider).length > 0;
 }
 
-type ProviderModelEntry = {
-  id?: string;
-  input?: string[];
-};
-
-type ProviderConfigLike = {
-  models?: ProviderModelEntry[];
-};
-
-function resolveProviderConfig(
-  providers: Record<string, ProviderConfigLike> | undefined,
-  provider: string,
-): ProviderConfigLike | null {
-  if (!providers) return null;
-  const normalized = normalizeProviderId(provider);
-  for (const [key, value] of Object.entries(providers)) {
-    if (normalizeProviderId(key) === normalized) return value;
-  }
-  return null;
-}
-
-function resolveModelSupportsImages(params: {
-  providerConfig: ProviderConfigLike | null;
-  modelId: string;
-}): boolean | null {
-  const models = params.providerConfig?.models;
-  if (!Array.isArray(models) || models.length === 0) return null;
-  const trimmedId = params.modelId.trim();
-  if (!trimmedId) return null;
-  const match =
-    models.find((model) => String(model?.id ?? "").trim() === trimmedId) ??
-    models.find(
-      (model) =>
-        String(model?.id ?? "")
-          .trim()
-          .toLowerCase() === trimmedId.toLowerCase(),
-    );
-  if (!match) return null;
-  const input = Array.isArray(match.input) ? match.input : [];
-  return input.includes("image");
-}
-
-function resolvePrimaryModelSupportsImages(params: {
-  cfg?: ClawdbotConfig;
-  agentDir: string;
-}): boolean | null {
-  if (!params.cfg) return null;
-  const primary = resolveDefaultModelRef(params.cfg);
-  const providerConfig = resolveProviderConfig(
-    params.cfg.models?.providers as Record<string, ProviderConfigLike> | undefined,
-    primary.provider,
-  );
-  const fromConfig = resolveModelSupportsImages({
-    providerConfig,
-    modelId: primary.model,
-  });
-  if (fromConfig !== null) return fromConfig;
-  try {
-    const modelsPath = path.join(params.agentDir, "models.json");
-    const raw = fsSync.readFileSync(modelsPath, "utf8");
-    const parsed = JSON.parse(raw) as { providers?: Record<string, ProviderConfigLike> };
-    const provider = resolveProviderConfig(parsed.providers, primary.provider);
-    return resolveModelSupportsImages({
-      providerConfig: provider,
-      modelId: primary.model,
-    });
-  } catch {
-    return null;
-  }
-}
-
 /**
  * Resolve the effective image model config for the `image` tool.
  *
diff --git a/src/config/types.tools.ts b/src/config/types.tools.ts
index 6f27b0b69..456060d06 100644
--- a/src/config/types.tools.ts
+++ b/src/config/types.tools.ts
@@ -120,6 +120,21 @@ export type ToolPolicyConfig = {
   profile?: ToolProfileId;
 };
 
+export type ExecToolConfig = {
+  host?: "sandbox" | "gateway" | "node";
+  security?: "deny" | "allowlist" | "full";
+  ask?: "off" | "on-miss" | "always";
+  node?: string;
+  backgroundMs?: number;
+  timeoutSec?: number;
+  cleanupMs?: number;
+  notifyOnExit?: boolean;
+  applyPatch?: {
+    enabled?: boolean;
+    allowModels?: string[];
+  };
+};
+
 export type AgentToolsConfig = {
   /** Base tool profile applied before allow/deny lists. */
   profile?: ToolProfileId;
diff --git a/src/media/image-ops.ts b/src/media/image-ops.ts
index e97e14b1c..c79e537a9 100644
--- a/src/media/image-ops.ts
+++ b/src/media/image-ops.ts
@@ -59,7 +59,6 @@ function readJpegExifOrientation(buffer: Buffer): number | null {
 
     // APP1 marker (EXIF)
     if (marker === 0xe1) {
-      const segmentLength = buffer.readUInt16BE(offset + 2);
       const exifStart = offset + 4;
 
       // Check for "Exif\0\0" header