feat: native image injection for vision-capable models
- Auto-detect and load images referenced in user prompts
- Inject history images at their original message positions
- Fix EXIF orientation: rotate before resizing in resizeToJpeg
- Sandbox security: validate paths, block remote URLs when sandbox enabled
- Prevent duplicate history image injection across turns
- Handle string-based user message content (convert to array)
- Add bounds check for message index in history processing
- Fix regex to properly match relative paths (./ ../)
- Add multi-image support for iMessage attachments
- Pass MAX_IMAGE_BYTES limit to image loading

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
committed by
Peter Steinberger
parent
f7123ec30a
commit
8d74578ceb
@@ -102,7 +102,11 @@ describe("image tool implicit imageModel config", () => {
|
||||
});
|
||||
});
|
||||
|
||||
it("disables image tool when primary model already supports images", async () => {
|
||||
it("keeps image tool available when primary model supports images (for explicit requests)", async () => {
|
||||
// When the primary model supports images, we still keep the tool available
|
||||
// because images are auto-injected into prompts. The tool description is
|
||||
// adjusted via modelHasVision to discourage redundant usage.
|
||||
vi.stubEnv("OPENAI_API_KEY", "test-key");
|
||||
const agentDir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-image-"));
|
||||
const cfg: ClawdbotConfig = {
|
||||
agents: {
|
||||
@@ -119,8 +123,13 @@ describe("image tool implicit imageModel config", () => {
|
||||
},
|
||||
},
|
||||
};
|
||||
expect(resolveImageModelConfigForTool({ cfg, agentDir })).toBeNull();
|
||||
expect(createImageTool({ config: cfg, agentDir })).toBeNull();
|
||||
// Tool should still be available for explicit image analysis requests
|
||||
expect(resolveImageModelConfigForTool({ cfg, agentDir })).toEqual({
|
||||
primary: "openai/gpt-5-mini",
|
||||
});
|
||||
const tool = createImageTool({ config: cfg, agentDir, modelHasVision: true });
|
||||
expect(tool).not.toBeNull();
|
||||
expect(tool?.description).toContain("Only use this tool when the image was NOT already provided");
|
||||
});
|
||||
|
||||
it("sandboxes image paths like the read tool", async () => {
|
||||
|
||||
@@ -145,11 +145,10 @@ export function resolveImageModelConfigForTool(params: {
|
||||
cfg?: ClawdbotConfig;
|
||||
agentDir: string;
|
||||
}): ImageModelConfig | null {
|
||||
const primarySupportsImages = resolvePrimaryModelSupportsImages({
|
||||
cfg: params.cfg,
|
||||
agentDir: params.agentDir,
|
||||
});
|
||||
if (primarySupportsImages === true) return null;
|
||||
// Note: We intentionally do NOT gate based on primarySupportsImages here.
|
||||
// Even when the primary model supports images, we keep the tool available
|
||||
// because images are auto-injected into prompts (see attempt.ts detectAndLoadPromptImages).
|
||||
// The tool description is adjusted via modelHasVision to discourage redundant usage.
|
||||
const explicit = coerceImageModelConfig(params.cfg);
|
||||
if (explicit.primary?.trim() || (explicit.fallbacks?.length ?? 0) > 0) {
|
||||
return explicit;
|
||||
@@ -368,6 +367,8 @@ export function createImageTool(options?: {
|
||||
config?: ClawdbotConfig;
|
||||
agentDir?: string;
|
||||
sandboxRoot?: string;
|
||||
/** If true, the model has native vision capability and images in the prompt are auto-injected */
|
||||
modelHasVision?: boolean;
|
||||
}): AnyAgentTool | null {
|
||||
const agentDir = options?.agentDir?.trim();
|
||||
if (!agentDir) {
|
||||
@@ -382,11 +383,17 @@ export function createImageTool(options?: {
|
||||
agentDir,
|
||||
});
|
||||
if (!imageModelConfig) return null;
|
||||
|
||||
// If model has native vision, images in the prompt are auto-injected
|
||||
// so this tool is only needed when image wasn't provided in the prompt
|
||||
const description = options?.modelHasVision
|
||||
? "Analyze an image with a vision model. Only use this tool when the image was NOT already provided in the user's message. Images mentioned in the prompt are automatically visible to you."
|
||||
: "Analyze an image with the configured image model (agents.defaults.imageModel). Provide a prompt and image path or URL.";
|
||||
|
||||
return {
|
||||
label: "Image",
|
||||
name: "image",
|
||||
description:
|
||||
"Analyze an image with the configured image model (agents.defaults.imageModel). Provide a prompt and image path or URL.",
|
||||
description,
|
||||
parameters: Type.Object({
|
||||
prompt: Type.Optional(Type.String()),
|
||||
image: Type.String(),
|
||||
|
||||
Reference in New Issue
Block a user