From 8ff09f83375c3446cb10b905bbf4d96c005e27b5 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Mon, 12 Jan 2026 17:50:44 +0000 Subject: [PATCH] feat(image): auto-pair image model --- docs/tools/index.md | 2 +- src/agents/models-config.providers.ts | 10 ++ src/agents/tools/image-tool.test.ts | 107 +++++++++++++++ src/agents/tools/image-tool.ts | 184 +++++++++++++++++++++++++- 4 files changed, 295 insertions(+), 8 deletions(-) create mode 100644 src/agents/tools/image-tool.test.ts diff --git a/docs/tools/index.md b/docs/tools/index.md index d9f448722..3cfd44da6 100644 --- a/docs/tools/index.md +++ b/docs/tools/index.md @@ -153,7 +153,7 @@ Core parameters: - `maxBytesMb` (optional size cap) Notes: -- Only available when `agents.defaults.imageModel` is configured (primary or fallbacks). +- Only available when `agents.defaults.imageModel` is configured (primary or fallbacks), or when an implicit image model can be inferred from your default model + configured auth (best-effort pairing). - Uses the image model directly (independent of the main chat model). ### `message` diff --git a/src/agents/models-config.providers.ts b/src/agents/models-config.providers.ts index f62502ca4..457713237 100644 --- a/src/agents/models-config.providers.ts +++ b/src/agents/models-config.providers.ts @@ -10,6 +10,7 @@ export type ProviderConfig = NonNullable[string]; const MINIMAX_API_BASE_URL = "https://api.minimax.io/anthropic"; const MINIMAX_DEFAULT_MODEL_ID = "MiniMax-M2.1"; +const MINIMAX_DEFAULT_VISION_MODEL_ID = "MiniMax-VL-01"; const MINIMAX_DEFAULT_CONTEXT_WINDOW = 200000; const MINIMAX_DEFAULT_MAX_TOKENS = 8192; // Pricing: MiniMax doesn't publish public rates. Override in models.json for accurate costs. @@ -148,6 +149,15 @@ function buildMinimaxProvider(): ProviderConfig { contextWindow: MINIMAX_DEFAULT_CONTEXT_WINDOW, maxTokens: MINIMAX_DEFAULT_MAX_TOKENS, }, + { + id: MINIMAX_DEFAULT_VISION_MODEL_ID, + name: "MiniMax VL 01", + reasoning: false, + input: ["text", "image"], + cost: MINIMAX_API_COST, + contextWindow: MINIMAX_DEFAULT_CONTEXT_WINDOW, + maxTokens: MINIMAX_DEFAULT_MAX_TOKENS, + }, ], }; } diff --git a/src/agents/tools/image-tool.test.ts b/src/agents/tools/image-tool.test.ts new file mode 100644 index 000000000..987ab5348 --- /dev/null +++ b/src/agents/tools/image-tool.test.ts @@ -0,0 +1,107 @@ +import fs from "node:fs/promises"; +import os from "node:os"; +import path from "node:path"; + +import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; + +import type { ClawdbotConfig } from "../../config/config.js"; +import { + createImageTool, + resolveImageModelConfigForTool, +} from "./image-tool.js"; + +async function writeAuthProfiles(agentDir: string, profiles: unknown) { + await fs.mkdir(agentDir, { recursive: true }); + await fs.writeFile( + path.join(agentDir, "auth-profiles.json"), + `${JSON.stringify(profiles, null, 2)}\n`, + "utf8", + ); +} + +describe("image tool implicit imageModel config", () => { + beforeEach(() => { + vi.stubEnv("OPENAI_API_KEY", ""); + vi.stubEnv("ANTHROPIC_API_KEY", ""); + vi.stubEnv("ANTHROPIC_OAUTH_TOKEN", ""); + vi.stubEnv("MINIMAX_API_KEY", ""); + }); + + afterEach(() => { + vi.unstubAllEnvs(); + }); + + it("stays disabled without auth when no pairing is possible", async () => { + const agentDir = await fs.mkdtemp( + path.join(os.tmpdir(), "clawdbot-image-"), + ); + const cfg: ClawdbotConfig = { + agents: { defaults: { model: { primary: "openai/gpt-5.2" } } }, + }; + expect(resolveImageModelConfigForTool({ cfg, agentDir })).toBeNull(); + expect(createImageTool({ config: cfg, agentDir })).toBeNull(); + }); + + it("pairs minimax primary with MiniMax-VL-01 (and fallbacks) when auth exists", async () => { + const agentDir = await fs.mkdtemp( + path.join(os.tmpdir(), "clawdbot-image-"), + ); + vi.stubEnv("MINIMAX_API_KEY", "minimax-test"); + vi.stubEnv("OPENAI_API_KEY", "openai-test"); + vi.stubEnv("ANTHROPIC_API_KEY", "anthropic-test"); + const cfg: ClawdbotConfig = { + agents: { defaults: { model: { primary: "minimax/MiniMax-M2.1" } } }, + }; + expect(resolveImageModelConfigForTool({ cfg, agentDir })).toEqual({ + primary: "minimax/MiniMax-VL-01", + fallbacks: ["openai/gpt-5-mini", "anthropic/claude-opus-4-5"], + }); + expect(createImageTool({ config: cfg, agentDir })).not.toBeNull(); + }); + + it("pairs a custom provider when it declares an image-capable model", async () => { + const agentDir = await fs.mkdtemp( + path.join(os.tmpdir(), "clawdbot-image-"), + ); + await writeAuthProfiles(agentDir, { + version: 1, + profiles: { + "acme:default": { type: "api_key", provider: "acme", key: "sk-test" }, + }, + }); + const cfg: ClawdbotConfig = { + agents: { defaults: { model: { primary: "acme/text-1" } } }, + models: { + providers: { + acme: { + models: [ + { id: "text-1", input: ["text"] }, + { id: "vision-1", input: ["text", "image"] }, + ], + }, + }, + }, + }; + expect(resolveImageModelConfigForTool({ cfg, agentDir })).toEqual({ + primary: "acme/vision-1", + }); + expect(createImageTool({ config: cfg, agentDir })).not.toBeNull(); + }); + + it("prefers explicit agents.defaults.imageModel", async () => { + const agentDir = await fs.mkdtemp( + path.join(os.tmpdir(), "clawdbot-image-"), + ); + const cfg: ClawdbotConfig = { + agents: { + defaults: { + model: { primary: "minimax/MiniMax-M2.1" }, + imageModel: { primary: "openai/gpt-5-mini" }, + }, + }, + }; + expect(resolveImageModelConfigForTool({ cfg, agentDir })).toEqual({ + primary: "openai/gpt-5-mini", + }); + }); +}); diff --git a/src/agents/tools/image-tool.ts b/src/agents/tools/image-tool.ts index 5b8c56f56..9999e4bdf 100644 --- a/src/agents/tools/image-tool.ts +++ b/src/agents/tools/image-tool.ts @@ -14,15 +14,23 @@ import { Type } from "@sinclair/typebox"; import type { ClawdbotConfig } from "../../config/config.js"; import { resolveUserPath } from "../../utils.js"; import { loadWebMedia } from "../../web/media.js"; -import { getApiKeyForModel } from "../model-auth.js"; +import { + ensureAuthProfileStore, + listProfilesForProvider, +} from "../auth-profiles.js"; +import { DEFAULT_MODEL, DEFAULT_PROVIDER } from "../defaults.js"; +import { getApiKeyForModel, resolveEnvApiKey } from "../model-auth.js"; import { runWithImageModelFallback } from "../model-fallback.js"; +import { parseModelRef } from "../model-selection.js"; import { ensureClawdbotModelsJson } from "../models-config.js"; import { extractAssistantText } from "../pi-embedded-utils.js"; import type { AnyAgentTool } from "./common.js"; const DEFAULT_PROMPT = "Describe the image."; -function ensureImageToolConfigured(cfg?: ClawdbotConfig): boolean { +type ImageModelConfig = { primary?: string; fallbacks?: string[] }; + +function coerceImageModelConfig(cfg?: ClawdbotConfig): ImageModelConfig { const imageModel = cfg?.agents?.defaults?.imageModel as | { primary?: string; fallbacks?: string[] } | string @@ -31,7 +39,150 @@ function ensureImageToolConfigured(cfg?: ClawdbotConfig): boolean { typeof imageModel === "string" ? imageModel.trim() : imageModel?.primary; const fallbacks = typeof imageModel === "object" ? (imageModel?.fallbacks ?? []) : []; - return Boolean(primary?.trim() || fallbacks.length > 0); + return { + ...(primary?.trim() ? { primary: primary.trim() } : {}), + ...(fallbacks.length > 0 ? { fallbacks } : {}), + }; +} + +function resolveProviderVisionModelFromConfig(params: { + cfg?: ClawdbotConfig; + provider: string; +}): string | null { + const providerCfg = params.cfg?.models?.providers?.[ + params.provider + ] as unknown as + | { models?: Array<{ id?: string; input?: string[] }> } + | undefined; + const models = providerCfg?.models ?? []; + const preferMinimaxVl = + params.provider === "minimax" + ? models.find( + (m) => + (m?.id ?? "").trim() === "MiniMax-VL-01" && + Array.isArray(m?.input) && + m.input.includes("image"), + ) + : null; + const picked = + preferMinimaxVl ?? + models.find( + (m) => Boolean((m?.id ?? "").trim()) && m.input?.includes("image"), + ); + const id = (picked?.id ?? "").trim(); + return id ? `${params.provider}/${id}` : null; +} + +function resolveDefaultModelRef(cfg?: ClawdbotConfig): { + provider: string; + model: string; +} { + const modelConfig = cfg?.agents?.defaults?.model as + | { primary?: string } + | string + | undefined; + const raw = + typeof modelConfig === "string" + ? modelConfig.trim() + : modelConfig?.primary?.trim(); + const parsed = + parseModelRef(raw ?? "", DEFAULT_PROVIDER) ?? + ({ provider: DEFAULT_PROVIDER, model: DEFAULT_MODEL } as const); + return { provider: parsed.provider, model: parsed.model }; +} + +function hasAuthForProvider(params: { + provider: string; + agentDir: string; +}): boolean { + if (resolveEnvApiKey(params.provider)?.apiKey) return true; + const store = ensureAuthProfileStore(params.agentDir, { + allowKeychainPrompt: false, + }); + return listProfilesForProvider(store, params.provider).length > 0; +} + +/** + * Resolve the effective image model config for the `image` tool. + * + * - Prefer explicit config (`agents.defaults.imageModel`). + * - Otherwise, try to "pair" the primary model with an image-capable model: + * - same provider (best effort) + * - fall back to OpenAI/Anthropic when available + */ +export function resolveImageModelConfigForTool(params: { + cfg?: ClawdbotConfig; + agentDir: string; +}): ImageModelConfig | null { + const explicit = coerceImageModelConfig(params.cfg); + if (explicit.primary?.trim() || (explicit.fallbacks?.length ?? 0) > 0) { + return explicit; + } + + const primary = resolveDefaultModelRef(params.cfg); + const openaiOk = hasAuthForProvider({ + provider: "openai", + agentDir: params.agentDir, + }); + const anthropicOk = hasAuthForProvider({ + provider: "anthropic", + agentDir: params.agentDir, + }); + + const fallbacks: string[] = []; + const addFallback = (modelRef: string | null) => { + const ref = (modelRef ?? "").trim(); + if (!ref) return; + if (fallbacks.includes(ref)) return; + fallbacks.push(ref); + }; + + const providerVisionFromConfig = resolveProviderVisionModelFromConfig({ + cfg: params.cfg, + provider: primary.provider, + }); + const providerOk = hasAuthForProvider({ + provider: primary.provider, + agentDir: params.agentDir, + }); + + let preferred: string | null = null; + + // MiniMax users: always try the canonical vision model first when auth exists. + if (primary.provider === "minimax" && providerOk) { + preferred = "minimax/MiniMax-VL-01"; + } else if (providerOk && providerVisionFromConfig) { + preferred = providerVisionFromConfig; + } else if (primary.provider === "openai" && openaiOk) { + preferred = "openai/gpt-5-mini"; + } else if (primary.provider === "anthropic" && anthropicOk) { + preferred = "anthropic/claude-opus-4-5"; + } + + if (preferred?.trim()) { + if (openaiOk) addFallback("openai/gpt-5-mini"); + if (anthropicOk) addFallback("anthropic/claude-opus-4-5"); + // Don't duplicate primary in fallbacks. + const pruned = fallbacks.filter((ref) => ref !== preferred); + return { + primary: preferred, + ...(pruned.length > 0 ? { fallbacks: pruned } : {}), + }; + } + + // Cross-provider fallback when we can't pair with the primary provider. + if (openaiOk) { + if (anthropicOk) addFallback("anthropic/claude-opus-4-5"); + return { + primary: "openai/gpt-5-mini", + ...(fallbacks.length ? { fallbacks } : {}), + }; + } + if (anthropicOk) { + return { primary: "anthropic/claude-opus-4-5" }; + } + + return null; } function pickMaxBytes( @@ -78,17 +229,31 @@ function buildImageContext( async function runImagePrompt(params: { cfg?: ClawdbotConfig; agentDir: string; + imageModelConfig: ImageModelConfig; modelOverride?: string; prompt: string; base64: string; mimeType: string; }): Promise<{ text: string; provider: string; model: string }> { - await ensureClawdbotModelsJson(params.cfg, params.agentDir); + const effectiveCfg: ClawdbotConfig | undefined = params.cfg + ? { + ...params.cfg, + agents: { + ...params.cfg.agents, + defaults: { + ...params.cfg.agents?.defaults, + imageModel: params.imageModelConfig, + }, + }, + } + : undefined; + + await ensureClawdbotModelsJson(effectiveCfg, params.agentDir); const authStorage = discoverAuthStorage(params.agentDir); const modelRegistry = discoverModels(authStorage, params.agentDir); const result = await runWithImageModelFallback({ - cfg: params.cfg, + cfg: effectiveCfg, modelOverride: params.modelOverride, run: async (provider, modelId) => { const model = modelRegistry.find(provider, modelId) as Model | null; @@ -102,7 +267,7 @@ async function runImagePrompt(params: { } const apiKeyInfo = await getApiKeyForModel({ model, - cfg: params.cfg, + cfg: effectiveCfg, agentDir: params.agentDir, }); authStorage.setRuntimeApiKey(model.provider, apiKeyInfo.apiKey); @@ -132,11 +297,15 @@ export function createImageTool(options?: { config?: ClawdbotConfig; agentDir?: string; }): AnyAgentTool | null { - if (!ensureImageToolConfigured(options?.config)) return null; const agentDir = options?.agentDir; if (!agentDir?.trim()) { throw new Error("createImageTool requires agentDir when enabled"); } + const imageModelConfig = resolveImageModelConfigForTool({ + cfg: options?.config, + agentDir, + }); + if (!imageModelConfig) return null; return { label: "Image", name: "image", @@ -181,6 +350,7 @@ export function createImageTool(options?: { const result = await runImagePrompt({ cfg: options?.config, agentDir, + imageModelConfig, modelOverride, prompt: promptRaw, base64,