feat(image): auto-pair image model

This commit is contained in:
Peter Steinberger
2026-01-12 17:50:44 +00:00
parent e91aa0657e
commit 8ff09f8337
4 changed files with 295 additions and 8 deletions

View File

@@ -153,7 +153,7 @@ Core parameters:
- `maxBytesMb` (optional size cap)
Notes:
- Only available when `agents.defaults.imageModel` is configured (primary or fallbacks).
- Only available when `agents.defaults.imageModel` is configured (primary or fallbacks), or when an implicit image model can be inferred from your default model + configured auth (best-effort pairing).
- Uses the image model directly (independent of the main chat model).
### `message`

View File

@@ -10,6 +10,7 @@ export type ProviderConfig = NonNullable<ModelsConfig["providers"]>[string];
const MINIMAX_API_BASE_URL = "https://api.minimax.io/anthropic";
const MINIMAX_DEFAULT_MODEL_ID = "MiniMax-M2.1";
const MINIMAX_DEFAULT_VISION_MODEL_ID = "MiniMax-VL-01";
const MINIMAX_DEFAULT_CONTEXT_WINDOW = 200000;
const MINIMAX_DEFAULT_MAX_TOKENS = 8192;
// Pricing: MiniMax doesn't publish public rates. Override in models.json for accurate costs.
@@ -148,6 +149,15 @@ function buildMinimaxProvider(): ProviderConfig {
contextWindow: MINIMAX_DEFAULT_CONTEXT_WINDOW,
maxTokens: MINIMAX_DEFAULT_MAX_TOKENS,
},
{
id: MINIMAX_DEFAULT_VISION_MODEL_ID,
name: "MiniMax VL 01",
reasoning: false,
input: ["text", "image"],
cost: MINIMAX_API_COST,
contextWindow: MINIMAX_DEFAULT_CONTEXT_WINDOW,
maxTokens: MINIMAX_DEFAULT_MAX_TOKENS,
},
],
};
}

View File

@@ -0,0 +1,107 @@
import fs from "node:fs/promises";
import os from "node:os";
import path from "node:path";
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
import type { ClawdbotConfig } from "../../config/config.js";
import {
createImageTool,
resolveImageModelConfigForTool,
} from "./image-tool.js";
/**
 * Test helper: serialises `profiles` as pretty-printed JSON (2-space indent,
 * trailing newline) into `<agentDir>/auth-profiles.json`, creating `agentDir`
 * first when it does not exist yet.
 */
async function writeAuthProfiles(agentDir: string, profiles: unknown) {
  await fs.mkdir(agentDir, { recursive: true });
  const targetFile = path.join(agentDir, "auth-profiles.json");
  const payload = `${JSON.stringify(profiles, null, 2)}\n`;
  await fs.writeFile(targetFile, payload, "utf8");
}
describe("image tool implicit imageModel config", () => {
  // Temp agent directories created by the running test; emptied in afterEach.
  const tempDirs: string[] = [];

  /** Creates a fresh temp agent directory and registers it for cleanup. */
  async function makeAgentDir(): Promise<string> {
    const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-image-"));
    tempDirs.push(dir);
    return dir;
  }

  beforeEach(() => {
    // Start every test with the provider credential env vars stubbed to "",
    // so each test opts in to the auth it wants.
    vi.stubEnv("OPENAI_API_KEY", "");
    vi.stubEnv("ANTHROPIC_API_KEY", "");
    vi.stubEnv("ANTHROPIC_OAUTH_TOKEN", "");
    vi.stubEnv("MINIMAX_API_KEY", "");
  });

  afterEach(async () => {
    vi.unstubAllEnvs();
    // Remove the temp directories so repeated runs don't litter os.tmpdir().
    // `force: true` keeps cleanup quiet if a directory is already gone.
    await Promise.all(
      tempDirs
        .splice(0)
        .map((dir) => fs.rm(dir, { recursive: true, force: true })),
    );
  });

  it("stays disabled without auth when no pairing is possible", async () => {
    const agentDir = await makeAgentDir();
    const cfg: ClawdbotConfig = {
      agents: { defaults: { model: { primary: "openai/gpt-5.2" } } },
    };
    expect(resolveImageModelConfigForTool({ cfg, agentDir })).toBeNull();
    expect(createImageTool({ config: cfg, agentDir })).toBeNull();
  });

  it("pairs minimax primary with MiniMax-VL-01 (and fallbacks) when auth exists", async () => {
    const agentDir = await makeAgentDir();
    vi.stubEnv("MINIMAX_API_KEY", "minimax-test");
    vi.stubEnv("OPENAI_API_KEY", "openai-test");
    vi.stubEnv("ANTHROPIC_API_KEY", "anthropic-test");
    const cfg: ClawdbotConfig = {
      agents: { defaults: { model: { primary: "minimax/MiniMax-M2.1" } } },
    };
    expect(resolveImageModelConfigForTool({ cfg, agentDir })).toEqual({
      primary: "minimax/MiniMax-VL-01",
      fallbacks: ["openai/gpt-5-mini", "anthropic/claude-opus-4-5"],
    });
    expect(createImageTool({ config: cfg, agentDir })).not.toBeNull();
  });

  it("pairs a custom provider when it declares an image-capable model", async () => {
    const agentDir = await makeAgentDir();
    await writeAuthProfiles(agentDir, {
      version: 1,
      profiles: {
        "acme:default": { type: "api_key", provider: "acme", key: "sk-test" },
      },
    });
    const cfg: ClawdbotConfig = {
      agents: { defaults: { model: { primary: "acme/text-1" } } },
      models: {
        providers: {
          acme: {
            models: [
              { id: "text-1", input: ["text"] },
              { id: "vision-1", input: ["text", "image"] },
            ],
          },
        },
      },
    };
    expect(resolveImageModelConfigForTool({ cfg, agentDir })).toEqual({
      primary: "acme/vision-1",
    });
    expect(createImageTool({ config: cfg, agentDir })).not.toBeNull();
  });

  it("prefers explicit agents.defaults.imageModel", async () => {
    const agentDir = await makeAgentDir();
    const cfg: ClawdbotConfig = {
      agents: {
        defaults: {
          model: { primary: "minimax/MiniMax-M2.1" },
          imageModel: { primary: "openai/gpt-5-mini" },
        },
      },
    };
    expect(resolveImageModelConfigForTool({ cfg, agentDir })).toEqual({
      primary: "openai/gpt-5-mini",
    });
  });
});

View File

@@ -14,15 +14,23 @@ import { Type } from "@sinclair/typebox";
import type { ClawdbotConfig } from "../../config/config.js";
import { resolveUserPath } from "../../utils.js";
import { loadWebMedia } from "../../web/media.js";
import { getApiKeyForModel } from "../model-auth.js";
import {
ensureAuthProfileStore,
listProfilesForProvider,
} from "../auth-profiles.js";
import { DEFAULT_MODEL, DEFAULT_PROVIDER } from "../defaults.js";
import { getApiKeyForModel, resolveEnvApiKey } from "../model-auth.js";
import { runWithImageModelFallback } from "../model-fallback.js";
import { parseModelRef } from "../model-selection.js";
import { ensureClawdbotModelsJson } from "../models-config.js";
import { extractAssistantText } from "../pi-embedded-utils.js";
import type { AnyAgentTool } from "./common.js";
const DEFAULT_PROMPT = "Describe the image.";
function ensureImageToolConfigured(cfg?: ClawdbotConfig): boolean {
type ImageModelConfig = { primary?: string; fallbacks?: string[] };
function coerceImageModelConfig(cfg?: ClawdbotConfig): ImageModelConfig {
const imageModel = cfg?.agents?.defaults?.imageModel as
| { primary?: string; fallbacks?: string[] }
| string
@@ -31,7 +39,150 @@ function ensureImageToolConfigured(cfg?: ClawdbotConfig): boolean {
typeof imageModel === "string" ? imageModel.trim() : imageModel?.primary;
const fallbacks =
typeof imageModel === "object" ? (imageModel?.fallbacks ?? []) : [];
return Boolean(primary?.trim() || fallbacks.length > 0);
return {
...(primary?.trim() ? { primary: primary.trim() } : {}),
...(fallbacks.length > 0 ? { fallbacks } : {}),
};
}
/**
 * Looks up an image-capable model declared under `models.providers[provider]`
 * in the config and returns it as a `provider/modelId` reference.
 *
 * A model qualifies when it has a non-blank string `id` and its `input` is an
 * array containing `"image"`. For the `minimax` provider a qualifying
 * `MiniMax-VL-01` entry wins over earlier entries; otherwise the first
 * qualifying entry is used.
 *
 * The provider entry is read through an unchecked cast, so malformed shapes
 * (non-array `models`, null entries, non-string `id`, non-array `input`) are
 * skipped instead of throwing.
 *
 * @returns the `provider/modelId` reference, or `null` when nothing qualifies.
 */
function resolveProviderVisionModelFromConfig(params: {
  cfg?: ClawdbotConfig;
  provider: string;
}): string | null {
  type ModelEntry = { id?: unknown; input?: unknown } | null | undefined;

  const providerCfg = params.cfg?.models?.providers?.[
    params.provider
  ] as unknown as { models?: unknown } | undefined;
  const rawModels = providerCfg?.models;
  const models: ModelEntry[] = Array.isArray(rawModels) ? rawModels : [];

  // Trimmed id, or "" when the entry has no usable string id.
  const modelId = (entry: ModelEntry): string =>
    typeof entry?.id === "string" ? entry.id.trim() : "";

  // Require a real array: a bare string would otherwise match by substring.
  const acceptsImages = (entry: ModelEntry): boolean => {
    const input = entry?.input;
    return Array.isArray(input) && input.includes("image");
  };

  const candidates = models.filter(
    (entry) => modelId(entry) !== "" && acceptsImages(entry),
  );
  const preferredMinimaxVl =
    params.provider === "minimax"
      ? candidates.find((entry) => modelId(entry) === "MiniMax-VL-01")
      : undefined;
  const picked = preferredMinimaxVl ?? candidates[0];

  const id = modelId(picked);
  return id ? `${params.provider}/${id}` : null;
}
/**
 * Reads `agents.defaults.model` (either a plain string or an object with a
 * `primary` field) and parses it into `{ provider, model }` via
 * `parseModelRef`, using `DEFAULT_PROVIDER` as the provider argument.
 * When parsing yields nothing, `DEFAULT_PROVIDER` / `DEFAULT_MODEL` are
 * returned instead.
 */
function resolveDefaultModelRef(cfg?: ClawdbotConfig): {
  provider: string;
  model: string;
} {
  const configured = cfg?.agents?.defaults?.model as
    | { primary?: string }
    | string
    | undefined;

  let ref: string;
  if (typeof configured === "string") {
    ref = configured.trim();
  } else {
    ref = configured?.primary?.trim() ?? "";
  }

  const parsed = parseModelRef(ref, DEFAULT_PROVIDER);
  if (parsed == null) {
    return { provider: DEFAULT_PROVIDER, model: DEFAULT_MODEL };
  }
  const { provider, model } = parsed;
  return { provider, model };
}
/**
 * Reports whether credentials are available for `provider`.
 *
 * Checks `resolveEnvApiKey` first; if that yields no API key, falls back to
 * the auth profile store for `agentDir` (opened with keychain prompts
 * disabled) and returns true when it lists at least one profile for the
 * provider.
 */
function hasAuthForProvider(params: {
  provider: string;
  agentDir: string;
}): boolean {
  const { provider, agentDir } = params;

  const envKey = resolveEnvApiKey(provider)?.apiKey;
  if (envKey) {
    return true;
  }

  const profileStore = ensureAuthProfileStore(agentDir, {
    allowKeychainPrompt: false,
  });
  const profiles = listProfilesForProvider(profileStore, provider);
  return profiles.length > 0;
}
/**
 * Work out which image model(s) the `image` tool should use.
 *
 * Resolution order:
 * 1. An explicit `agents.defaults.imageModel` (primary and/or fallbacks) is
 *    returned as-is.
 * 2. Otherwise the default model's provider is "paired" with a vision model
 *    (best effort): MiniMax gets `minimax/MiniMax-VL-01`, a provider whose
 *    config declares an image-capable model gets that model, and
 *    OpenAI/Anthropic get their built-in picks. Authenticated OpenAI and
 *    Anthropic models are added as fallbacks (never duplicating the primary).
 * 3. Failing that, OpenAI (with Anthropic as fallback) or Anthropic alone is
 *    used when authenticated.
 *
 * @returns the resolved config, or `null` when no image model can be inferred.
 */
export function resolveImageModelConfigForTool(params: {
  cfg?: ClawdbotConfig;
  agentDir: string;
}): ImageModelConfig | null {
  const { cfg, agentDir } = params;
  const openaiVision = "openai/gpt-5-mini";
  const anthropicVision = "anthropic/claude-opus-4-5";

  // 1. Explicit configuration always wins.
  const configured = coerceImageModelConfig(cfg);
  const hasConfiguredPrimary = Boolean(configured.primary?.trim());
  if (hasConfiguredPrimary || (configured.fallbacks?.length ?? 0) > 0) {
    return configured;
  }

  // 2. Gather what is needed for pairing: auth for the two cross-provider
  //    candidates, then the default provider's declared vision model and auth.
  const { provider } = resolveDefaultModelRef(cfg);
  const openaiOk = hasAuthForProvider({ provider: "openai", agentDir });
  const anthropicOk = hasAuthForProvider({ provider: "anthropic", agentDir });
  const declaredVision = resolveProviderVisionModelFromConfig({
    cfg,
    provider,
  });
  const providerOk = hasAuthForProvider({ provider, agentDir });

  const pairedModel = ((): string | null => {
    // MiniMax users: always try the canonical vision model first when auth exists.
    if (provider === "minimax" && providerOk) return "minimax/MiniMax-VL-01";
    if (providerOk && declaredVision) return declaredVision;
    if (provider === "openai" && openaiOk) return openaiVision;
    if (provider === "anthropic" && anthropicOk) return anthropicVision;
    return null;
  })();

  if (pairedModel?.trim()) {
    // Authenticated cross-provider models, minus the one already chosen as primary.
    const extras = [
      ...(openaiOk ? [openaiVision] : []),
      ...(anthropicOk ? [anthropicVision] : []),
    ].filter((ref) => ref !== pairedModel);
    return extras.length > 0
      ? { primary: pairedModel, fallbacks: extras }
      : { primary: pairedModel };
  }

  // 3. Cross-provider fallback when we can't pair with the primary provider.
  if (openaiOk) {
    return anthropicOk
      ? { primary: openaiVision, fallbacks: [anthropicVision] }
      : { primary: openaiVision };
  }
  return anthropicOk ? { primary: anthropicVision } : null;
}
function pickMaxBytes(
@@ -78,17 +229,31 @@ function buildImageContext(
async function runImagePrompt(params: {
cfg?: ClawdbotConfig;
agentDir: string;
imageModelConfig: ImageModelConfig;
modelOverride?: string;
prompt: string;
base64: string;
mimeType: string;
}): Promise<{ text: string; provider: string; model: string }> {
await ensureClawdbotModelsJson(params.cfg, params.agentDir);
const effectiveCfg: ClawdbotConfig | undefined = params.cfg
? {
...params.cfg,
agents: {
...params.cfg.agents,
defaults: {
...params.cfg.agents?.defaults,
imageModel: params.imageModelConfig,
},
},
}
: undefined;
await ensureClawdbotModelsJson(effectiveCfg, params.agentDir);
const authStorage = discoverAuthStorage(params.agentDir);
const modelRegistry = discoverModels(authStorage, params.agentDir);
const result = await runWithImageModelFallback({
cfg: params.cfg,
cfg: effectiveCfg,
modelOverride: params.modelOverride,
run: async (provider, modelId) => {
const model = modelRegistry.find(provider, modelId) as Model<Api> | null;
@@ -102,7 +267,7 @@ async function runImagePrompt(params: {
}
const apiKeyInfo = await getApiKeyForModel({
model,
cfg: params.cfg,
cfg: effectiveCfg,
agentDir: params.agentDir,
});
authStorage.setRuntimeApiKey(model.provider, apiKeyInfo.apiKey);
@@ -132,11 +297,15 @@ export function createImageTool(options?: {
config?: ClawdbotConfig;
agentDir?: string;
}): AnyAgentTool | null {
if (!ensureImageToolConfigured(options?.config)) return null;
const agentDir = options?.agentDir;
if (!agentDir?.trim()) {
throw new Error("createImageTool requires agentDir when enabled");
}
const imageModelConfig = resolveImageModelConfigForTool({
cfg: options?.config,
agentDir,
});
if (!imageModelConfig) return null;
return {
label: "Image",
name: "image",
@@ -181,6 +350,7 @@ export function createImageTool(options?: {
const result = await runImagePrompt({
cfg: options?.config,
agentDir,
imageModelConfig,
modelOverride,
prompt: promptRaw,
base64,