feat(image): auto-pair image model
This commit is contained in:
@@ -153,7 +153,7 @@ Core parameters:
|
|||||||
- `maxBytesMb` (optional size cap)
|
- `maxBytesMb` (optional size cap)
|
||||||
|
|
||||||
Notes:
|
Notes:
|
||||||
- Only available when `agents.defaults.imageModel` is configured (primary or fallbacks).
|
- Only available when `agents.defaults.imageModel` is configured (primary or fallbacks), or when an image model can be inferred implicitly from your default model and the auth you have configured (best-effort pairing).
|
||||||
- Uses the image model directly (independent of the main chat model).
|
- Uses the image model directly (independent of the main chat model).
|
||||||
|
|
||||||
### `message`
|
### `message`
|
||||||
|
|||||||
@@ -10,6 +10,7 @@ export type ProviderConfig = NonNullable<ModelsConfig["providers"]>[string];
|
|||||||
|
|
||||||
const MINIMAX_API_BASE_URL = "https://api.minimax.io/anthropic";
|
const MINIMAX_API_BASE_URL = "https://api.minimax.io/anthropic";
|
||||||
const MINIMAX_DEFAULT_MODEL_ID = "MiniMax-M2.1";
|
const MINIMAX_DEFAULT_MODEL_ID = "MiniMax-M2.1";
|
||||||
|
const MINIMAX_DEFAULT_VISION_MODEL_ID = "MiniMax-VL-01";
|
||||||
const MINIMAX_DEFAULT_CONTEXT_WINDOW = 200000;
|
const MINIMAX_DEFAULT_CONTEXT_WINDOW = 200000;
|
||||||
const MINIMAX_DEFAULT_MAX_TOKENS = 8192;
|
const MINIMAX_DEFAULT_MAX_TOKENS = 8192;
|
||||||
// Pricing: MiniMax doesn't publish public rates. Override in models.json for accurate costs.
|
// Pricing: MiniMax doesn't publish public rates. Override in models.json for accurate costs.
|
||||||
@@ -148,6 +149,15 @@ function buildMinimaxProvider(): ProviderConfig {
|
|||||||
contextWindow: MINIMAX_DEFAULT_CONTEXT_WINDOW,
|
contextWindow: MINIMAX_DEFAULT_CONTEXT_WINDOW,
|
||||||
maxTokens: MINIMAX_DEFAULT_MAX_TOKENS,
|
maxTokens: MINIMAX_DEFAULT_MAX_TOKENS,
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
id: MINIMAX_DEFAULT_VISION_MODEL_ID,
|
||||||
|
name: "MiniMax VL 01",
|
||||||
|
reasoning: false,
|
||||||
|
input: ["text", "image"],
|
||||||
|
cost: MINIMAX_API_COST,
|
||||||
|
contextWindow: MINIMAX_DEFAULT_CONTEXT_WINDOW,
|
||||||
|
maxTokens: MINIMAX_DEFAULT_MAX_TOKENS,
|
||||||
|
},
|
||||||
],
|
],
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|||||||
107
src/agents/tools/image-tool.test.ts
Normal file
107
src/agents/tools/image-tool.test.ts
Normal file
@@ -0,0 +1,107 @@
|
|||||||
|
import fs from "node:fs/promises";
|
||||||
|
import os from "node:os";
|
||||||
|
import path from "node:path";
|
||||||
|
|
||||||
|
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
|
||||||
|
|
||||||
|
import type { ClawdbotConfig } from "../../config/config.js";
|
||||||
|
import {
|
||||||
|
createImageTool,
|
||||||
|
resolveImageModelConfigForTool,
|
||||||
|
} from "./image-tool.js";
|
||||||
|
|
||||||
|
/**
 * Test helper: persist `profiles` as pretty-printed JSON (2-space indent,
 * trailing newline) to `auth-profiles.json` inside `agentDir`, creating the
 * directory first if it does not exist yet.
 */
async function writeAuthProfiles(agentDir: string, profiles: unknown) {
  await fs.mkdir(agentDir, { recursive: true });

  const target = path.join(agentDir, "auth-profiles.json");
  const serialized = `${JSON.stringify(profiles, null, 2)}\n`;
  await fs.writeFile(target, serialized, "utf8");
}
|
||||||
|
|
||||||
|
describe("image tool implicit imageModel config", () => {
  // Temp agent directories created by the currently running test. They are
  // removed in afterEach so repeated runs do not leak folders into os.tmpdir().
  const tempDirs: string[] = [];

  /** Create a fresh, tracked temp directory to act as the agent dir. */
  async function makeAgentDir(): Promise<string> {
    const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-image-"));
    tempDirs.push(dir);
    return dir;
  }

  beforeEach(() => {
    // Blank out provider credentials so the host environment cannot
    // influence which providers look authenticated.
    vi.stubEnv("OPENAI_API_KEY", "");
    vi.stubEnv("ANTHROPIC_API_KEY", "");
    vi.stubEnv("ANTHROPIC_OAUTH_TOKEN", "");
    vi.stubEnv("MINIMAX_API_KEY", "");
  });

  afterEach(async () => {
    vi.unstubAllEnvs();
    // splice(0) empties the tracking list while handing us the dirs to delete.
    await Promise.all(
      tempDirs
        .splice(0)
        .map((dir) => fs.rm(dir, { recursive: true, force: true })),
    );
  });

  it("stays disabled without auth when no pairing is possible", async () => {
    const agentDir = await makeAgentDir();
    const cfg: ClawdbotConfig = {
      agents: { defaults: { model: { primary: "openai/gpt-5.2" } } },
    };
    expect(resolveImageModelConfigForTool({ cfg, agentDir })).toBeNull();
    expect(createImageTool({ config: cfg, agentDir })).toBeNull();
  });

  it("pairs minimax primary with MiniMax-VL-01 (and fallbacks) when auth exists", async () => {
    const agentDir = await makeAgentDir();
    vi.stubEnv("MINIMAX_API_KEY", "minimax-test");
    vi.stubEnv("OPENAI_API_KEY", "openai-test");
    vi.stubEnv("ANTHROPIC_API_KEY", "anthropic-test");
    const cfg: ClawdbotConfig = {
      agents: { defaults: { model: { primary: "minimax/MiniMax-M2.1" } } },
    };
    expect(resolveImageModelConfigForTool({ cfg, agentDir })).toEqual({
      primary: "minimax/MiniMax-VL-01",
      fallbacks: ["openai/gpt-5-mini", "anthropic/claude-opus-4-5"],
    });
    expect(createImageTool({ config: cfg, agentDir })).not.toBeNull();
  });

  it("pairs a custom provider when it declares an image-capable model", async () => {
    const agentDir = await makeAgentDir();
    await writeAuthProfiles(agentDir, {
      version: 1,
      profiles: {
        "acme:default": { type: "api_key", provider: "acme", key: "sk-test" },
      },
    });
    const cfg: ClawdbotConfig = {
      agents: { defaults: { model: { primary: "acme/text-1" } } },
      models: {
        providers: {
          acme: {
            models: [
              { id: "text-1", input: ["text"] },
              { id: "vision-1", input: ["text", "image"] },
            ],
          },
        },
      },
    };
    expect(resolveImageModelConfigForTool({ cfg, agentDir })).toEqual({
      primary: "acme/vision-1",
    });
    expect(createImageTool({ config: cfg, agentDir })).not.toBeNull();
  });

  it("prefers explicit agents.defaults.imageModel", async () => {
    const agentDir = await makeAgentDir();
    const cfg: ClawdbotConfig = {
      agents: {
        defaults: {
          model: { primary: "minimax/MiniMax-M2.1" },
          imageModel: { primary: "openai/gpt-5-mini" },
        },
      },
    };
    expect(resolveImageModelConfigForTool({ cfg, agentDir })).toEqual({
      primary: "openai/gpt-5-mini",
    });
  });
});
|
||||||
@@ -14,15 +14,23 @@ import { Type } from "@sinclair/typebox";
|
|||||||
import type { ClawdbotConfig } from "../../config/config.js";
|
import type { ClawdbotConfig } from "../../config/config.js";
|
||||||
import { resolveUserPath } from "../../utils.js";
|
import { resolveUserPath } from "../../utils.js";
|
||||||
import { loadWebMedia } from "../../web/media.js";
|
import { loadWebMedia } from "../../web/media.js";
|
||||||
import { getApiKeyForModel } from "../model-auth.js";
|
import {
|
||||||
|
ensureAuthProfileStore,
|
||||||
|
listProfilesForProvider,
|
||||||
|
} from "../auth-profiles.js";
|
||||||
|
import { DEFAULT_MODEL, DEFAULT_PROVIDER } from "../defaults.js";
|
||||||
|
import { getApiKeyForModel, resolveEnvApiKey } from "../model-auth.js";
|
||||||
import { runWithImageModelFallback } from "../model-fallback.js";
|
import { runWithImageModelFallback } from "../model-fallback.js";
|
||||||
|
import { parseModelRef } from "../model-selection.js";
|
||||||
import { ensureClawdbotModelsJson } from "../models-config.js";
|
import { ensureClawdbotModelsJson } from "../models-config.js";
|
||||||
import { extractAssistantText } from "../pi-embedded-utils.js";
|
import { extractAssistantText } from "../pi-embedded-utils.js";
|
||||||
import type { AnyAgentTool } from "./common.js";
|
import type { AnyAgentTool } from "./common.js";
|
||||||
|
|
||||||
const DEFAULT_PROMPT = "Describe the image.";
|
const DEFAULT_PROMPT = "Describe the image.";
|
||||||
|
|
||||||
function ensureImageToolConfigured(cfg?: ClawdbotConfig): boolean {
|
type ImageModelConfig = { primary?: string; fallbacks?: string[] };
|
||||||
|
|
||||||
|
function coerceImageModelConfig(cfg?: ClawdbotConfig): ImageModelConfig {
|
||||||
const imageModel = cfg?.agents?.defaults?.imageModel as
|
const imageModel = cfg?.agents?.defaults?.imageModel as
|
||||||
| { primary?: string; fallbacks?: string[] }
|
| { primary?: string; fallbacks?: string[] }
|
||||||
| string
|
| string
|
||||||
@@ -31,7 +39,150 @@ function ensureImageToolConfigured(cfg?: ClawdbotConfig): boolean {
|
|||||||
typeof imageModel === "string" ? imageModel.trim() : imageModel?.primary;
|
typeof imageModel === "string" ? imageModel.trim() : imageModel?.primary;
|
||||||
const fallbacks =
|
const fallbacks =
|
||||||
typeof imageModel === "object" ? (imageModel?.fallbacks ?? []) : [];
|
typeof imageModel === "object" ? (imageModel?.fallbacks ?? []) : [];
|
||||||
return Boolean(primary?.trim() || fallbacks.length > 0);
|
return {
|
||||||
|
...(primary?.trim() ? { primary: primary.trim() } : {}),
|
||||||
|
...(fallbacks.length > 0 ? { fallbacks } : {}),
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Find an image-capable model declared for `provider` under
 * `models.providers` in the user's config.
 *
 * A model entry counts as image-capable only when it has a non-blank string
 * `id` and an `input` ARRAY that contains `"image"`. The config is
 * user-authored, so malformed entries (non-array `models`, null entries,
 * non-string ids, string-valued `input`) are skipped instead of throwing or
 * accidentally substring-matching.
 *
 * For the `minimax` provider, `MiniMax-VL-01` wins over any other candidate
 * when it is declared; otherwise the first image-capable entry is used.
 *
 * @param params.cfg - Config to inspect (optional).
 * @param params.provider - Provider key to look up under `models.providers`.
 * @returns A `provider/modelId` reference, or `null` when none is declared.
 */
function resolveProviderVisionModelFromConfig(params: {
  cfg?: ClawdbotConfig;
  provider: string;
}): string | null {
  const providerCfg: unknown =
    params.cfg?.models?.providers?.[params.provider];
  const rawModels =
    typeof providerCfg === "object" && providerCfg !== null
      ? (providerCfg as { models?: unknown }).models
      : undefined;
  if (!Array.isArray(rawModels)) return null;

  // Collect trimmed ids of every well-formed, image-capable entry, in order.
  const visionIds: string[] = [];
  for (const entry of rawModels as unknown[]) {
    if (typeof entry !== "object" || entry === null) continue;
    const { id, input } = entry as { id?: unknown; input?: unknown };
    if (typeof id !== "string") continue;
    const trimmedId = id.trim();
    if (!trimmedId) continue;
    // Require a real array: a string like "image-generation" must not match.
    if (!Array.isArray(input) || !input.includes("image")) continue;
    visionIds.push(trimmedId);
  }

  const first = visionIds[0];
  if (first === undefined) return null;

  const picked =
    params.provider === "minimax" && visionIds.includes("MiniMax-VL-01")
      ? "MiniMax-VL-01"
      : first;
  return `${params.provider}/${picked}`;
}
|
||||||
|
|
||||||
|
/**
 * Read the default chat model from `agents.defaults.model` (either a plain
 * string or an object with `primary`) and split it into provider + model.
 *
 * The trimmed reference is handed to `parseModelRef` with `DEFAULT_PROVIDER`
 * as the provider argument; when that yields nothing, the
 * `DEFAULT_PROVIDER` / `DEFAULT_MODEL` pair is returned instead.
 */
function resolveDefaultModelRef(cfg?: ClawdbotConfig): {
  provider: string;
  model: string;
} {
  const configured = cfg?.agents?.defaults?.model as
    | { primary?: string }
    | string
    | undefined;

  let rawRef: string | undefined;
  if (typeof configured === "string") {
    rawRef = configured.trim();
  } else {
    rawRef = configured?.primary?.trim();
  }

  const parsedRef = parseModelRef(rawRef ?? "", DEFAULT_PROVIDER);
  if (parsedRef == null) {
    return { provider: DEFAULT_PROVIDER, model: DEFAULT_MODEL };
  }
  return { provider: parsedRef.provider, model: parsedRef.model };
}
|
||||||
|
|
||||||
|
/**
 * Report whether credentials are available for `provider`.
 *
 * Checks `resolveEnvApiKey` first; if that yields no API key, loads the auth
 * profile store for `agentDir` (with `allowKeychainPrompt: false`) and looks
 * for at least one profile listed for the provider.
 */
function hasAuthForProvider(params: {
  provider: string;
  agentDir: string;
}): boolean {
  const { provider, agentDir } = params;

  const envApiKey = resolveEnvApiKey(provider)?.apiKey;
  if (envApiKey) {
    return true;
  }

  const profileStore = ensureAuthProfileStore(agentDir, {
    allowKeychainPrompt: false,
  });
  const matchingProfiles = listProfilesForProvider(profileStore, provider);
  return matchingProfiles.length > 0;
}
|
||||||
|
|
||||||
|
/**
 * Work out which image model configuration the `image` tool should run with.
 *
 * Resolution order:
 * 1. An explicit `agents.defaults.imageModel` (primary and/or fallbacks) is
 *    returned as-is.
 * 2. Otherwise the default chat model's provider is "paired" with a vision
 *    model: MiniMax with auth -> `minimax/MiniMax-VL-01`; any provider with
 *    auth and an image-capable model declared in config -> that model;
 *    OpenAI / Anthropic with auth -> their built-in picks. Authenticated
 *    OpenAI / Anthropic picks are appended as fallbacks (never duplicating
 *    the primary).
 * 3. If no pairing is possible, fall back across providers: OpenAI first
 *    (with Anthropic as fallback when authenticated), then Anthropic alone.
 *
 * @returns The resolved config, or `null` when no image model is usable.
 */
export function resolveImageModelConfigForTool(params: {
  cfg?: ClawdbotConfig;
  agentDir: string;
}): ImageModelConfig | null {
  const OPENAI_VISION_REF = "openai/gpt-5-mini";
  const ANTHROPIC_VISION_REF = "anthropic/claude-opus-4-5";

  const fromConfig = coerceImageModelConfig(params.cfg);
  const hasExplicitPrimary = Boolean(fromConfig.primary?.trim());
  const hasExplicitFallbacks = (fromConfig.fallbacks?.length ?? 0) > 0;
  if (hasExplicitPrimary || hasExplicitFallbacks) {
    return fromConfig;
  }

  const { agentDir } = params;
  const { provider } = resolveDefaultModelRef(params.cfg);
  const openaiOk = hasAuthForProvider({ provider: "openai", agentDir });
  const anthropicOk = hasAuthForProvider({ provider: "anthropic", agentDir });
  const visionFromConfig = resolveProviderVisionModelFromConfig({
    cfg: params.cfg,
    provider,
  });
  const providerOk = hasAuthForProvider({ provider, agentDir });

  // Pick the model paired with the default provider, if any. MiniMax users
  // always get the canonical vision model first when auth exists.
  let paired: string | null = null;
  if (providerOk && provider === "minimax") {
    paired = "minimax/MiniMax-VL-01";
  } else if (providerOk && visionFromConfig) {
    paired = visionFromConfig;
  } else if (openaiOk && provider === "openai") {
    paired = OPENAI_VISION_REF;
  } else if (anthropicOk && provider === "anthropic") {
    paired = ANTHROPIC_VISION_REF;
  }

  if (paired) {
    // Authenticated cross-provider picks become fallbacks, minus the primary.
    const extras = [
      ...(openaiOk ? [OPENAI_VISION_REF] : []),
      ...(anthropicOk ? [ANTHROPIC_VISION_REF] : []),
    ].filter((ref) => ref !== paired);
    return extras.length > 0
      ? { primary: paired, fallbacks: extras }
      : { primary: paired };
  }

  // Cross-provider fallback when we can't pair with the primary provider.
  if (openaiOk) {
    return anthropicOk
      ? { primary: OPENAI_VISION_REF, fallbacks: [ANTHROPIC_VISION_REF] }
      : { primary: OPENAI_VISION_REF };
  }
  return anthropicOk ? { primary: ANTHROPIC_VISION_REF } : null;
}
|
||||||
|
|
||||||
function pickMaxBytes(
|
function pickMaxBytes(
|
||||||
@@ -78,17 +229,31 @@ function buildImageContext(
|
|||||||
async function runImagePrompt(params: {
|
async function runImagePrompt(params: {
|
||||||
cfg?: ClawdbotConfig;
|
cfg?: ClawdbotConfig;
|
||||||
agentDir: string;
|
agentDir: string;
|
||||||
|
imageModelConfig: ImageModelConfig;
|
||||||
modelOverride?: string;
|
modelOverride?: string;
|
||||||
prompt: string;
|
prompt: string;
|
||||||
base64: string;
|
base64: string;
|
||||||
mimeType: string;
|
mimeType: string;
|
||||||
}): Promise<{ text: string; provider: string; model: string }> {
|
}): Promise<{ text: string; provider: string; model: string }> {
|
||||||
await ensureClawdbotModelsJson(params.cfg, params.agentDir);
|
const effectiveCfg: ClawdbotConfig | undefined = params.cfg
|
||||||
|
? {
|
||||||
|
...params.cfg,
|
||||||
|
agents: {
|
||||||
|
...params.cfg.agents,
|
||||||
|
defaults: {
|
||||||
|
...params.cfg.agents?.defaults,
|
||||||
|
imageModel: params.imageModelConfig,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
: undefined;
|
||||||
|
|
||||||
|
await ensureClawdbotModelsJson(effectiveCfg, params.agentDir);
|
||||||
const authStorage = discoverAuthStorage(params.agentDir);
|
const authStorage = discoverAuthStorage(params.agentDir);
|
||||||
const modelRegistry = discoverModels(authStorage, params.agentDir);
|
const modelRegistry = discoverModels(authStorage, params.agentDir);
|
||||||
|
|
||||||
const result = await runWithImageModelFallback({
|
const result = await runWithImageModelFallback({
|
||||||
cfg: params.cfg,
|
cfg: effectiveCfg,
|
||||||
modelOverride: params.modelOverride,
|
modelOverride: params.modelOverride,
|
||||||
run: async (provider, modelId) => {
|
run: async (provider, modelId) => {
|
||||||
const model = modelRegistry.find(provider, modelId) as Model<Api> | null;
|
const model = modelRegistry.find(provider, modelId) as Model<Api> | null;
|
||||||
@@ -102,7 +267,7 @@ async function runImagePrompt(params: {
|
|||||||
}
|
}
|
||||||
const apiKeyInfo = await getApiKeyForModel({
|
const apiKeyInfo = await getApiKeyForModel({
|
||||||
model,
|
model,
|
||||||
cfg: params.cfg,
|
cfg: effectiveCfg,
|
||||||
agentDir: params.agentDir,
|
agentDir: params.agentDir,
|
||||||
});
|
});
|
||||||
authStorage.setRuntimeApiKey(model.provider, apiKeyInfo.apiKey);
|
authStorage.setRuntimeApiKey(model.provider, apiKeyInfo.apiKey);
|
||||||
@@ -132,11 +297,15 @@ export function createImageTool(options?: {
|
|||||||
config?: ClawdbotConfig;
|
config?: ClawdbotConfig;
|
||||||
agentDir?: string;
|
agentDir?: string;
|
||||||
}): AnyAgentTool | null {
|
}): AnyAgentTool | null {
|
||||||
if (!ensureImageToolConfigured(options?.config)) return null;
|
|
||||||
const agentDir = options?.agentDir;
|
const agentDir = options?.agentDir;
|
||||||
if (!agentDir?.trim()) {
|
if (!agentDir?.trim()) {
|
||||||
throw new Error("createImageTool requires agentDir when enabled");
|
throw new Error("createImageTool requires agentDir when enabled");
|
||||||
}
|
}
|
||||||
|
const imageModelConfig = resolveImageModelConfigForTool({
|
||||||
|
cfg: options?.config,
|
||||||
|
agentDir,
|
||||||
|
});
|
||||||
|
if (!imageModelConfig) return null;
|
||||||
return {
|
return {
|
||||||
label: "Image",
|
label: "Image",
|
||||||
name: "image",
|
name: "image",
|
||||||
@@ -181,6 +350,7 @@ export function createImageTool(options?: {
|
|||||||
const result = await runImagePrompt({
|
const result = await runImagePrompt({
|
||||||
cfg: options?.config,
|
cfg: options?.config,
|
||||||
agentDir,
|
agentDir,
|
||||||
|
imageModelConfig,
|
||||||
modelOverride,
|
modelOverride,
|
||||||
prompt: promptRaw,
|
prompt: promptRaw,
|
||||||
base64,
|
base64,
|
||||||
|
|||||||
Reference in New Issue
Block a user