fix: tighten native image injection (#1098)
Thanks @tyler6204. Co-authored-by: Tyler Yust <tyler6204@users.noreply.github.com>
This commit is contained in:
@@ -21,6 +21,7 @@ Docs: https://docs.clawd.bot
|
||||
- Bridge: add `skills.bins` RPC to support node host auto-allow skill bins.
|
||||
- Slash commands: replace `/cost` with `/usage off|tokens|full` to control per-response usage footer; `/usage` no longer aliases `/status`. (Supersedes #1140) — thanks @Nachx639.
|
||||
- Sessions: add daily reset policy with per-type overrides and idle windows (default 4am local), preserving legacy idle-only configs. (#1146) — thanks @austinm911.
|
||||
- Agents: auto-inject local image references for vision models and avoid reloading history images. (#1098) — thanks @tyler6204.
|
||||
- Docs: refresh exec/elevated/exec-approvals docs for the new flow. https://docs.clawd.bot/tools/exec-approvals
|
||||
- Docs: add node host CLI + update exec approvals/bridge protocol docs. https://docs.clawd.bot/cli/node
|
||||
- ACP: add experimental ACP support for IDE integrations (`clawdbot acp`). Thanks @visionik.
|
||||
|
||||
@@ -68,6 +68,7 @@ import { describeUnknownError, mapThinkingLevel } from "../utils.js";
|
||||
import { resolveSandboxRuntimeStatus } from "../../sandbox/runtime-status.js";
|
||||
import { isTimeoutError } from "../../failover-error.js";
|
||||
import { getGlobalHookRunner } from "../../../plugins/hook-runner-global.js";
|
||||
import { MAX_IMAGE_BYTES } from "../../../media/constants.js";
|
||||
|
||||
import { MAX_IMAGE_BYTES } from "../../../media/constants.js";
|
||||
import type { EmbeddedRunAttemptParams, EmbeddedRunAttemptResult } from "./types.js";
|
||||
@@ -137,7 +138,6 @@ export async function runEmbeddedAttempt(
|
||||
|
||||
// Check if the model supports native image input
|
||||
const modelHasVision = params.model.input?.includes("image") ?? false;
|
||||
|
||||
const toolsRaw = createClawdbotCodingTools({
|
||||
exec: {
|
||||
...params.execOverrides,
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
|
||||
import { detectImageReferences, modelSupportsImages } from "./images.js";
|
||||
import { detectAndLoadPromptImages, detectImageReferences, modelSupportsImages } from "./images.js";
|
||||
|
||||
describe("detectImageReferences", () => {
|
||||
it("detects absolute file paths with common extensions", () => {
|
||||
@@ -44,27 +44,6 @@ describe("detectImageReferences", () => {
|
||||
expect(refs[0]?.resolved).not.toContain("~");
|
||||
});
|
||||
|
||||
it("detects HTTP URLs with image extensions", () => {
|
||||
const prompt = "Check this URL: https://mysite.com/images/logo.png";
|
||||
const refs = detectImageReferences(prompt);
|
||||
|
||||
expect(refs).toHaveLength(1);
|
||||
expect(refs[0]).toEqual({
|
||||
raw: "https://mysite.com/images/logo.png",
|
||||
type: "url",
|
||||
resolved: "https://mysite.com/images/logo.png",
|
||||
});
|
||||
});
|
||||
|
||||
it("detects HTTPS URLs with query parameters", () => {
|
||||
const prompt = "Image from https://cdn.mysite.com/img.jpg?size=large&v=2";
|
||||
const refs = detectImageReferences(prompt);
|
||||
|
||||
expect(refs).toHaveLength(1);
|
||||
expect(refs[0]?.type).toBe("url");
|
||||
expect(refs[0]?.raw).toContain("https://cdn.mysite.com/img.jpg");
|
||||
});
|
||||
|
||||
it("detects multiple image references in a prompt", () => {
|
||||
const prompt = `
|
||||
Compare these two images:
|
||||
@@ -73,9 +52,9 @@ describe("detectImageReferences", () => {
|
||||
`;
|
||||
const refs = detectImageReferences(prompt);
|
||||
|
||||
expect(refs).toHaveLength(2);
|
||||
expect(refs).toHaveLength(1);
|
||||
expect(refs.some((r) => r.type === "path")).toBe(true);
|
||||
expect(refs.some((r) => r.type === "url")).toBe(true);
|
||||
expect(refs.some((r) => r.type === "url")).toBe(false);
|
||||
});
|
||||
|
||||
it("handles various image extensions", () => {
|
||||
@@ -165,9 +144,10 @@ what about these images?`;
|
||||
expect(refs[0]?.resolved).toContain("IMG_6430.jpeg");
|
||||
});
|
||||
|
||||
it("skips example.com URLs as they are documentation examples", () => {
|
||||
it("ignores remote URLs entirely (local-only)", () => {
|
||||
const prompt = `To send an image: MEDIA:https://example.com/image.jpg
|
||||
Here is my actual image: /path/to/real.png`;
|
||||
Here is my actual image: /path/to/real.png
|
||||
Also https://cdn.mysite.com/img.jpg`;
|
||||
const refs = detectImageReferences(prompt);
|
||||
|
||||
expect(refs).toHaveLength(1);
|
||||
@@ -216,3 +196,38 @@ describe("modelSupportsImages", () => {
|
||||
expect(modelSupportsImages(model)).toBe(false);
|
||||
});
|
||||
});
|
||||
|
||||
// Test suite for detectAndLoadPromptImages: covers the non-vision short-circuit and the
// rule that history messages already carrying image parts are not scanned again.
describe("detectAndLoadPromptImages", () => {
|
||||
// The model only lists "text" as input, so nothing may be returned, not even the
// images the caller passed in via existingImages.
it("returns no images for non-vision models even when existing images are provided", async () => {
|
||||
const result = await detectAndLoadPromptImages({
|
||||
prompt: "ignore",
|
||||
workspaceDir: "/tmp",
|
||||
model: { input: ["text"] },
|
||||
existingImages: [{ type: "image", data: "abc", mimeType: "image/png" }],
|
||||
});
|
||||
|
||||
expect(result.images).toHaveLength(0);
|
||||
expect(result.detectedRefs).toHaveLength(0);
|
||||
});
|
||||
|
||||
// The model accepts images, but the only history message that mentions an image path
// already contains an image part, so the path must not be detected or loaded.
it("skips history messages that already include image content", async () => {
|
||||
const result = await detectAndLoadPromptImages({
|
||||
prompt: "no images here",
|
||||
workspaceDir: "/tmp",
|
||||
model: { input: ["text", "image"] },
|
||||
historyMessages: [
|
||||
{
|
||||
role: "user",
|
||||
content: [
|
||||
// Text part references a local image path that would otherwise be picked up.
{ type: "text", text: "See /tmp/should-not-load.png" },
|
||||
// Existing image part: its presence is what makes the message get skipped.
{ type: "image", data: "abc", mimeType: "image/png" },
|
||||
],
|
||||
},
|
||||
],
|
||||
});
|
||||
|
||||
expect(result.detectedRefs).toHaveLength(0);
|
||||
expect(result.images).toHaveLength(0);
|
||||
expect(result.historyImagesByIndex.size).toBe(0);
|
||||
});
|
||||
});
|
||||
|
||||
@@ -55,7 +55,7 @@ function isImageExtension(filePath: string): boolean {
|
||||
* - Absolute paths: /path/to/image.png
|
||||
* - Relative paths: ./image.png, ../images/photo.jpg
|
||||
* - Home paths: ~/Pictures/screenshot.png
|
||||
* - HTTP(S) URLs: https://example.com/image.png
|
||||
* - file:// URLs: file:///path/to/image.png
|
||||
* - Message attachments: [Image: source: /path/to/image.jpg]
|
||||
*
|
||||
* @param prompt The user prompt text to scan
|
||||
@@ -69,6 +69,7 @@ export function detectImageReferences(prompt: string): DetectedImageRef[] {
|
||||
const addPathRef = (raw: string) => {
|
||||
const trimmed = raw.trim();
|
||||
if (!trimmed || seen.has(trimmed.toLowerCase())) return;
|
||||
if (trimmed.startsWith("http://") || trimmed.startsWith("https://")) return;
|
||||
if (!isImageExtension(trimmed)) return;
|
||||
seen.add(trimmed.toLowerCase());
|
||||
const resolved = trimmed.startsWith("~") ? resolveUserPath(trimmed) : trimmed;
|
||||
@@ -107,18 +108,7 @@ export function detectImageReferences(prompt: string): DetectedImageRef[] {
|
||||
if (raw) addPathRef(raw);
|
||||
}
|
||||
|
||||
// Pattern for HTTP(S) URLs ending in image extensions
|
||||
// Skip example.com URLs as they're often just documentation examples
|
||||
const urlPattern =
|
||||
/https?:\/\/[^\s<>"'`\]]+\.(?:png|jpe?g|gif|webp|bmp|tiff?|heic|heif)(?:\?[^\s<>"'`\]]*)?/gi;
|
||||
while ((match = urlPattern.exec(prompt)) !== null) {
|
||||
const raw = match[0];
|
||||
// Skip example.com URLs - they're documentation, not real images
|
||||
if (raw.includes("example.com")) continue;
|
||||
if (seen.has(raw.toLowerCase())) continue;
|
||||
seen.add(raw.toLowerCase());
|
||||
refs.push({ raw, type: "url", resolved: raw });
|
||||
}
|
||||
// Remote HTTP(S) URLs are intentionally ignored. Native image injection is local-only.
|
||||
|
||||
// Pattern for file:// URLs - treat as paths since loadWebMedia handles them
|
||||
const fileUrlPattern =
|
||||
@@ -171,9 +161,9 @@ export async function loadImageFromRef(
|
||||
try {
|
||||
let targetPath = ref.resolved;
|
||||
|
||||
// When sandbox is enabled, block remote URL loading to maintain network boundary
|
||||
if (ref.type === "url" && options?.sandboxRoot) {
|
||||
log.debug(`Native image: rejecting remote URL in sandboxed session: ${ref.resolved}`);
|
||||
// Remote URL loading is disabled (local-only).
|
||||
if (ref.type === "url") {
|
||||
log.debug(`Native image: rejecting remote URL (local-only): ${ref.resolved}`);
|
||||
return null;
|
||||
}
|
||||
|
||||
@@ -213,7 +203,7 @@ export async function loadImageFromRef(
|
||||
}
|
||||
}
|
||||
|
||||
// loadWebMedia handles both file paths and HTTP(S) URLs
|
||||
// loadWebMedia handles local file paths (including file:// URLs)
|
||||
const media = await loadWebMedia(targetPath, options?.maxBytes);
|
||||
|
||||
if (media.kind !== "image") {
|
||||
@@ -259,12 +249,26 @@ function detectImagesFromHistory(messages: unknown[]): DetectedImageRef[] {
|
||||
const allRefs: DetectedImageRef[] = [];
|
||||
const seen = new Set<string>();
|
||||
|
||||
// Returns true when `msg` is an object whose `content` is an array that contains at
// least one non-null object part with `type === "image"`; returns false otherwise.
const messageHasImageContent = (msg: unknown): boolean => {
|
||||
// Null, undefined and primitive values cannot carry content parts.
if (!msg || typeof msg !== "object") return false;
|
||||
const content = (msg as { content?: unknown }).content;
|
||||
// Non-array content (e.g. a plain string) has no typed parts to inspect.
if (!Array.isArray(content)) return false;
|
||||
return content.some(
|
||||
(part) =>
|
||||
part != null &&
|
||||
typeof part === "object" &&
|
||||
(part as { type?: string }).type === "image",
|
||||
);
|
||||
};
|
||||
|
||||
for (let i = 0; i < messages.length; i++) {
|
||||
const msg = messages[i];
|
||||
if (!msg || typeof msg !== "object") continue;
|
||||
const message = msg as { role?: string };
|
||||
// Only scan user messages for image references
|
||||
if (message.role !== "user") continue;
|
||||
// Skip if message already has image content (prevents reloading each turn)
|
||||
if (messageHasImageContent(msg)) continue;
|
||||
|
||||
const text = extractTextFromMessage(msg);
|
||||
if (!text) continue;
|
||||
@@ -315,7 +319,7 @@ export async function detectAndLoadPromptImages(params: {
|
||||
// If model doesn't support images, return empty results
|
||||
if (!modelSupportsImages(params.model)) {
|
||||
return {
|
||||
images: params.existingImages ?? [],
|
||||
images: [],
|
||||
historyImagesByIndex: new Map(),
|
||||
detectedRefs: [],
|
||||
loadedCount: 0,
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
import fsSync from "node:fs";
|
||||
import fs from "node:fs/promises";
|
||||
import path from "node:path";
|
||||
|
||||
@@ -20,7 +19,7 @@ import { DEFAULT_MODEL, DEFAULT_PROVIDER } from "../defaults.js";
|
||||
import { minimaxUnderstandImage } from "../minimax-vlm.js";
|
||||
import { getApiKeyForModel, resolveEnvApiKey } from "../model-auth.js";
|
||||
import { runWithImageModelFallback } from "../model-fallback.js";
|
||||
import { normalizeProviderId, resolveConfiguredModelRef } from "../model-selection.js";
|
||||
import { resolveConfiguredModelRef } from "../model-selection.js";
|
||||
import { ensureClawdbotModelsJson } from "../models-config.js";
|
||||
import { assertSandboxPath } from "../sandbox-paths.js";
|
||||
import type { AnyAgentTool } from "./common.js";
|
||||
@@ -62,77 +61,6 @@ function hasAuthForProvider(params: { provider: string; agentDir: string }): boo
|
||||
return listProfilesForProvider(store, params.provider).length > 0;
|
||||
}
|
||||
|
||||
// Minimal view of one model entry in a provider's model list: only the fields that the
// image-support lookup reads.
type ProviderModelEntry = {
|
||||
// Model identifier; compared (trimmed, then case-insensitively) against the requested model id.
id?: string;
|
||||
// Input kinds listed for the model; the lookup checks whether this includes "image".
input?: string[];
|
||||
};
|
||||
|
||||
// Minimal view of a provider config: just its list of model entries.
type ProviderConfigLike = {
|
||||
models?: ProviderModelEntry[];
|
||||
};
|
||||
|
||||
// Finds the config entry for `provider` in a providers map by comparing
// normalizeProviderId(key) with normalizeProviderId(provider).
// Returns the first matching value in Object.entries order, or null when `providers`
// is undefined or no key matches.
// NOTE(review): the exact normalization rules live in normalizeProviderId
// (imported from ../model-selection.js) and are not visible here.
function resolveProviderConfig(
|
||||
providers: Record<string, ProviderConfigLike> | undefined,
|
||||
provider: string,
|
||||
): ProviderConfigLike | null {
|
||||
if (!providers) return null;
|
||||
// Normalize the requested id once; every key is normalized the same way below.
const normalized = normalizeProviderId(provider);
|
||||
for (const [key, value] of Object.entries(providers)) {
|
||||
if (normalizeProviderId(key) === normalized) return value;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
// Determines whether a model entry in `providerConfig` lists "image" among its inputs.
// Returns:
//   - null  when the provider has no model list, the model id is blank, or no entry matches;
//   - true  when the matched entry's `input` array includes "image";
//   - false when an entry matches but does not include "image" (or has no `input` array).
function resolveModelSupportsImages(params: {
|
||||
providerConfig: ProviderConfigLike | null;
|
||||
modelId: string;
|
||||
}): boolean | null {
|
||||
const models = params.providerConfig?.models;
|
||||
// A missing or empty model list means "unknown", not "unsupported".
if (!Array.isArray(models) || models.length === 0) return null;
|
||||
const trimmedId = params.modelId.trim();
|
||||
if (!trimmedId) return null;
|
||||
// Prefer an exact (trimmed) id match; fall back to a case-insensitive match.
const match =
|
||||
models.find((model) => String(model?.id ?? "").trim() === trimmedId) ??
|
||||
models.find(
|
||||
(model) =>
|
||||
String(model?.id ?? "")
|
||||
.trim()
|
||||
.toLowerCase() === trimmedId.toLowerCase(),
|
||||
);
|
||||
if (!match) return null;
|
||||
// Treat a missing or non-array `input` as an empty list, which yields false.
const input = Array.isArray(match.input) ? match.input : [];
|
||||
return input.includes("image");
|
||||
}
|
||||
|
||||
// Determines whether the default model resolved from `cfg` lists "image" as an input.
// Lookup order:
//   1. the providers declared in `cfg.models.providers`;
//   2. the `models.json` file inside `agentDir` (read synchronously).
// Returns null when `cfg` is missing, when neither source has a matching model entry,
// or when reading or parsing `models.json` throws.
function resolvePrimaryModelSupportsImages(params: {
|
||||
cfg?: ClawdbotConfig;
|
||||
agentDir: string;
|
||||
}): boolean | null {
|
||||
if (!params.cfg) return null;
|
||||
// NOTE(review): resolveDefaultModelRef is defined outside this excerpt; it is used here
// as returning an object with `provider` and `model` fields.
const primary = resolveDefaultModelRef(params.cfg);
|
||||
const providerConfig = resolveProviderConfig(
|
||||
params.cfg.models?.providers as Record<string, ProviderConfigLike> | undefined,
|
||||
primary.provider,
|
||||
);
|
||||
const fromConfig = resolveModelSupportsImages({
|
||||
providerConfig,
|
||||
modelId: primary.model,
|
||||
});
|
||||
// A definite true/false from the config wins; only "unknown" (null) falls through to disk.
if (fromConfig !== null) return fromConfig;
|
||||
try {
|
||||
const modelsPath = path.join(params.agentDir, "models.json");
|
||||
const raw = fsSync.readFileSync(modelsPath, "utf8");
|
||||
// The parsed JSON is asserted to this shape, not validated.
const parsed = JSON.parse(raw) as { providers?: Record<string, ProviderConfigLike> };
|
||||
const provider = resolveProviderConfig(parsed.providers, primary.provider);
|
||||
return resolveModelSupportsImages({
|
||||
providerConfig: provider,
|
||||
modelId: primary.model,
|
||||
});
|
||||
} catch {
|
||||
// Best-effort: a missing or unreadable file, or invalid JSON, is reported as "unknown".
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Resolve the effective image model config for the `image` tool.
|
||||
*
|
||||
|
||||
@@ -120,6 +120,21 @@ export type ToolPolicyConfig = {
|
||||
profile?: ToolProfileId;
|
||||
};
|
||||
|
||||
// Configuration shape for the exec tool. Every field is optional.
// NOTE(review): this excerpt shows only the type; the per-field notes below are inferred
// from names and literal unions, so confirm them against the exec tool implementation.
export type ExecToolConfig = {
|
||||
// One of "sandbox", "gateway" or "node"; presumably selects where commands run (confirm).
host?: "sandbox" | "gateway" | "node";
|
||||
// One of "deny", "allowlist" or "full"; presumably the command security mode (confirm).
security?: "deny" | "allowlist" | "full";
|
||||
// One of "off", "on-miss" or "always"; presumably when to ask for approval (confirm).
ask?: "off" | "on-miss" | "always";
|
||||
// Presumably identifies the target node when host is "node" (confirm).
node?: string;
|
||||
// Numeric settings; units are suggested only by the Ms / Sec name suffixes (confirm).
backgroundMs?: number;
|
||||
timeoutSec?: number;
|
||||
cleanupMs?: number;
|
||||
notifyOnExit?: boolean;
|
||||
// Nested options for apply-patch: an enable flag and a list of model names.
applyPatch?: {
|
||||
enabled?: boolean;
|
||||
allowModels?: string[];
|
||||
};
|
||||
};
|
||||
|
||||
export type AgentToolsConfig = {
|
||||
/** Base tool profile applied before allow/deny lists. */
|
||||
profile?: ToolProfileId;
|
||||
|
||||
@@ -59,7 +59,6 @@ function readJpegExifOrientation(buffer: Buffer): number | null {
|
||||
|
||||
// APP1 marker (EXIF)
|
||||
if (marker === 0xe1) {
|
||||
const segmentLength = buffer.readUInt16BE(offset + 2);
|
||||
const exifStart = offset + 4;
|
||||
|
||||
// Check for "Exif\0\0" header
|
||||
|
||||
Reference in New Issue
Block a user