fix(image): route MiniMax vision to VLM

This commit is contained in:
Peter Steinberger
2026-01-12 20:38:11 +00:00
parent b73042500e
commit 36a02b3e67
4 changed files with 338 additions and 22 deletions

View File

@@ -5,6 +5,10 @@
### Changes
- Subagents: add config to set default sub-agent model (`agents.defaults.subagents.model` + per-agent override); still overridden by `sessions_spawn.model`.
### Fixes
- Tools/Models: MiniMax vision now uses the Coding Plan VLM endpoint (`/v1/coding_plan/vlm`) so the `image` tool works with MiniMax keys.
- Gateway/macOS: reduce noisy loopback WS “closed before connect” logs during tests.
## 2026.1.12-1
### Changes

115
src/agents/minimax-vlm.ts Normal file
View File

@@ -0,0 +1,115 @@
/**
 * Status envelope read from the `base_resp` field of a MiniMax VLM response.
 * Both fields are optional; callers in this file treat `status_code === 0`
 * as success and surface `status_msg` in error messages.
 */
interface MinimaxBaseResp {
  status_code?: number;
  status_msg?: string;
}
/**
 * Resolves the MiniMax API origin (scheme + host + optional port) to call.
 *
 * The first non-empty candidate wins, in this order: `params.apiHost`, the
 * `MINIMAX_API_HOST` environment variable, `params.modelBaseUrl`, and finally
 * the built-in default. Any path, query or fragment on the chosen value is
 * dropped; only the origin is returned.
 *
 * @param params.apiHost - Explicit host or URL override.
 * @param params.modelBaseUrl - Base URL of the configured model, used as a fallback.
 * @param params.env - Environment to read `MINIMAX_API_HOST` from; defaults to `process.env`.
 * @returns An `http:`/`https:` origin, or `https://api.minimax.io` when the
 *   chosen value cannot be turned into one.
 */
function coerceApiHost(params: {
  apiHost?: string;
  modelBaseUrl?: string;
  env?: NodeJS.ProcessEnv;
}): string {
  const defaultOrigin = "https://api.minimax.io";
  const env = params.env ?? process.env;
  const raw =
    params.apiHost?.trim() ||
    env.MINIMAX_API_HOST?.trim() ||
    params.modelBaseUrl?.trim() ||
    defaultOrigin;

  // Returns the origin only for http(s) URLs. A scheme-less "host:port" value
  // parses successfully with the host name as its scheme and an origin of the
  // literal string "null", so a successful parse alone is not sufficient.
  const toHttpOrigin = (candidate: string): string | undefined => {
    try {
      const url = new URL(candidate);
      return url.protocol === "http:" || url.protocol === "https:"
        ? url.origin
        : undefined;
    } catch {
      return undefined;
    }
  };

  const direct = toHttpOrigin(raw);
  if (direct) return direct;
  // An explicit non-http(s) scheme (e.g. "ftp://host") cannot be used for the
  // HTTP call; prefixing it with "https://" would only produce a bogus host.
  if (raw.includes("://")) return defaultOrigin;
  // Bare host or host:port: assume https.
  return toHttpOrigin(`https://${raw}`) ?? defaultOrigin;
}
/**
 * Type guard: true for any non-null, non-array value whose `typeof` is
 * `"object"`; false for primitives, `null`, `undefined` and arrays.
 */
function isRecord(value: unknown): value is Record<string, unknown> {
  if (value === null || typeof value !== "object") {
    return false;
  }
  return !Array.isArray(value);
}
/**
 * Reads `rec[key]` and returns it when it is a string; any other value
 * (including a missing key) yields the empty string.
 */
function pickString(rec: Record<string, unknown>, key: string): string {
  const candidate = rec[key];
  if (typeof candidate !== "string") {
    return "";
  }
  return candidate;
}
/**
 * Sends a prompt plus a base64 image data URL to the MiniMax
 * `/v1/coding_plan/vlm` endpoint and returns the trimmed `content` string of
 * the response.
 *
 * @param params.apiKey - Sent as a `Bearer` token; must be non-blank.
 * @param params.prompt - Text prompt; must be non-blank.
 * @param params.imageDataUrl - Must start with
 *   `data:image/(png|jpeg|webp);base64,` (case-insensitive).
 * @param params.apiHost - Optional host override passed to `coerceApiHost`.
 * @param params.modelBaseUrl - Optional fallback host source passed to `coerceApiHost`.
 * @returns The non-empty, trimmed `content` field of the JSON response.
 * @throws Error when an argument is blank or malformed, the HTTP status is not
 *   OK, the body is not a JSON object, `base_resp.status_code` is not the
 *   number 0, or `content` is empty. Messages include the `Trace-Id` response
 *   header when one is present.
 */
export async function minimaxUnderstandImage(params: {
  apiKey: string;
  prompt: string;
  imageDataUrl: string;
  apiHost?: string;
  modelBaseUrl?: string;
}): Promise<string> {
  const apiKey = params.apiKey.trim();
  if (!apiKey) throw new Error("MiniMax VLM: apiKey required");

  const prompt = params.prompt.trim();
  if (!prompt) throw new Error("MiniMax VLM: prompt required");

  const imageDataUrl = params.imageDataUrl.trim();
  if (!imageDataUrl) throw new Error("MiniMax VLM: imageDataUrl required");
  if (!/^data:image\/(png|jpeg|webp);base64,/i.test(imageDataUrl)) {
    throw new Error(
      "MiniMax VLM: imageDataUrl must be a base64 data:image/(png|jpeg|webp) URL",
    );
  }

  const host = coerceApiHost({
    apiHost: params.apiHost,
    modelBaseUrl: params.modelBaseUrl,
  });
  const url = new URL("/v1/coding_plan/vlm", host).toString();

  const res = await fetch(url, {
    method: "POST",
    headers: {
      Authorization: `Bearer ${apiKey}`,
      "Content-Type": "application/json",
      "MM-API-Source": "Clawdbot",
    },
    body: JSON.stringify({
      prompt,
      image_url: imageDataUrl,
    }),
  });

  // Computed once; appended to every error raised after the response arrives.
  const traceId = res.headers.get("Trace-Id") ?? "";
  const trace = traceId ? ` Trace-Id: ${traceId}` : "";

  if (!res.ok) {
    // Best effort: an unreadable body must not hide the HTTP status.
    const body = await res.text().catch(() => "");
    throw new Error(
      `MiniMax VLM request failed (${res.status} ${res.statusText}).${trace}${
        body ? ` Body: ${body.slice(0, 400)}` : ""
      }`,
    );
  }

  // A body that fails to parse is treated the same as a non-object body.
  const json: unknown = await res.json().catch(() => null);
  if (!isRecord(json)) {
    throw new Error(`MiniMax VLM response was not JSON.${trace}`);
  }

  // Validate each field at runtime instead of asserting the shape: the payload
  // is untrusted, and a non-string status_msg would otherwise make `.trim()`
  // throw a TypeError that hides the real API error.
  const rawBaseResp = isRecord(json.base_resp) ? json.base_resp : {};
  const baseResp: MinimaxBaseResp = {};
  if (typeof rawBaseResp.status_code === "number") {
    baseResp.status_code = rawBaseResp.status_code;
  }
  if (typeof rawBaseResp.status_msg === "string") {
    baseResp.status_msg = rawBaseResp.status_msg;
  }

  // A missing or non-numeric status code is reported as -1 (an error).
  const code = baseResp.status_code ?? -1;
  if (code !== 0) {
    const msg = (baseResp.status_msg ?? "").trim();
    throw new Error(
      `MiniMax VLM API error (${code})${msg ? `: ${msg}` : ""}.${trace}`,
    );
  }

  const content = pickString(json, "content").trim();
  if (!content) {
    throw new Error(`MiniMax VLM returned no content.${trace}`);
  }
  return content;
}

View File

@@ -21,6 +21,8 @@ async function writeAuthProfiles(agentDir: string, profiles: unknown) {
}
describe("image tool implicit imageModel config", () => {
const priorFetch = global.fetch;
beforeEach(() => {
vi.stubEnv("OPENAI_API_KEY", "");
vi.stubEnv("ANTHROPIC_API_KEY", "");
@@ -30,6 +32,8 @@ describe("image tool implicit imageModel config", () => {
afterEach(() => {
vi.unstubAllEnvs();
// @ts-expect-error global fetch cleanup
global.fetch = priorFetch;
});
it("stays disabled without auth when no pairing is possible", async () => {
@@ -132,6 +136,60 @@ describe("image tool implicit imageModel config", () => {
tool.execute("t2", { image: "../escape.png" }),
).rejects.toThrow(/escapes sandbox root/i);
});
it("rewrites inbound absolute paths into sandbox media/inbound", async () => {
  // Lay out a throwaway state dir holding an agent dir and a sandbox whose
  // media/inbound folder already contains photo.png.
  const tmpRoot = await fs.mkdtemp(
    path.join(os.tmpdir(), "clawdbot-image-sandbox-"),
  );
  const agentDir = path.join(tmpRoot, "agent");
  const sandboxRoot = path.join(tmpRoot, "sandbox");
  const inboundDir = path.join(sandboxRoot, "media", "inbound");
  await fs.mkdir(agentDir, { recursive: true });
  await fs.mkdir(inboundDir, { recursive: true });

  const pngFixtureB64 =
    "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/woAAn8B9FD5fHAAAAAASUVORK5CYII=";
  await fs.writeFile(
    path.join(inboundDir, "photo.png"),
    Buffer.from(pngFixtureB64, "base64"),
  );

  // Replace global fetch with a mock that reports a successful VLM reply.
  const fetchMock = vi.fn().mockResolvedValue({
    ok: true,
    status: 200,
    statusText: "OK",
    headers: new Headers(),
    json: async () => ({
      content: "ok",
      base_resp: { status_code: 0, status_msg: "" },
    }),
  });
  // @ts-expect-error partial global
  global.fetch = fetchMock;
  vi.stubEnv("MINIMAX_API_KEY", "minimax-test");

  const config: ClawdbotConfig = {
    agents: {
      defaults: {
        model: { primary: "minimax/MiniMax-M2.1" },
        imageModel: { primary: "minimax/MiniMax-VL-01" },
      },
    },
  };
  const tool = createImageTool({ config, agentDir, sandboxRoot });
  expect(tool).not.toBeNull();
  if (!tool) throw new Error("expected image tool");

  // The requested absolute path lies outside the sandbox; only its basename
  // matches the file written above.
  const outcome = await tool.execute("t1", {
    prompt: "Describe the image.",
    image: "/Users/steipete/.clawdbot/media/inbound/photo.png",
  });

  expect(fetchMock).toHaveBeenCalledTimes(1);
  const details = outcome.details as { rewrittenFrom?: string };
  expect(details.rewrittenFrom).toContain("photo.png");
});
});
describe("image tool data URL support", () => {
@@ -151,6 +209,99 @@ describe("image tool data URL support", () => {
});
});
describe("image tool MiniMax VLM routing", () => {
  const pngB64 =
    "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/woAAn8B9FD5fHAAAAAASUVORK5CYII=";
  const priorFetch = global.fetch;

  beforeEach(() => {
    vi.stubEnv("MINIMAX_API_KEY", "");
  });

  afterEach(() => {
    vi.unstubAllEnvs();
    // @ts-expect-error global fetch cleanup
    global.fetch = priorFetch;
  });

  // Installs a global fetch mock resolving to an HTTP 200 response whose JSON
  // body is `payload`, and returns the mock for call inspection.
  const installFetchMock = (payload: Record<string, unknown>) => {
    const fetchMock = vi.fn().mockResolvedValue({
      ok: true,
      status: 200,
      statusText: "OK",
      headers: new Headers(),
      json: async () => payload,
    });
    // @ts-expect-error partial global
    global.fetch = fetchMock;
    return fetchMock;
  };

  // Creates the image tool for a MiniMax primary model in a fresh temp agent
  // dir, with MINIMAX_API_KEY stubbed to "minimax-test".
  const createMinimaxTool = async () => {
    const agentDir = await fs.mkdtemp(
      path.join(os.tmpdir(), "clawdbot-minimax-vlm-"),
    );
    vi.stubEnv("MINIMAX_API_KEY", "minimax-test");
    const config: ClawdbotConfig = {
      agents: { defaults: { model: { primary: "minimax/MiniMax-M2.1" } } },
    };
    const tool = createImageTool({ config, agentDir });
    expect(tool).not.toBeNull();
    if (!tool) throw new Error("expected image tool");
    return tool;
  };

  it("calls /v1/coding_plan/vlm for minimax image models", async () => {
    const fetchMock = installFetchMock({
      content: "ok",
      base_resp: { status_code: 0, status_msg: "" },
    });
    const tool = await createMinimaxTool();

    const outcome = await tool.execute("t1", {
      prompt: "Describe the image.",
      image: `data:image/png;base64,${pngB64}`,
    });

    expect(fetchMock).toHaveBeenCalledTimes(1);
    const [requestUrl, requestInit] = fetchMock.mock.calls[0];
    expect(String(requestUrl)).toBe("https://api.minimax.io/v1/coding_plan/vlm");
    expect(requestInit?.method).toBe("POST");
    expect(
      String((requestInit?.headers as Record<string, string>)?.Authorization),
    ).toBe("Bearer minimax-test");

    const sentBody = String(requestInit?.body);
    expect(sentBody).toContain('"prompt":"Describe the image."');
    expect(sentBody).toContain('"image_url":"data:image/png;base64,');

    const text = outcome.content?.find((b) => b.type === "text")?.text ?? "";
    expect(text).toBe("ok");
  });

  it("surfaces MiniMax API errors from /v1/coding_plan/vlm", async () => {
    installFetchMock({
      content: "",
      base_resp: { status_code: 1004, status_msg: "bad key" },
    });
    const tool = await createMinimaxTool();

    const pending = tool.execute("t1", {
      prompt: "Describe the image.",
      image: `data:image/png;base64,${pngB64}`,
    });
    await expect(pending).rejects.toThrow(/MiniMax VLM API error/i);
  });
});
describe("image tool response validation", () => {
it("rejects image-model responses with no final text", () => {
expect(() =>

View File

@@ -1,3 +1,6 @@
import fs from "node:fs/promises";
import path from "node:path";
import {
type Api,
type AssistantMessage,
@@ -19,6 +22,7 @@ import {
listProfilesForProvider,
} from "../auth-profiles.js";
import { DEFAULT_MODEL, DEFAULT_PROVIDER } from "../defaults.js";
import { minimaxUnderstandImage } from "../minimax-vlm.js";
import { getApiKeyForModel, resolveEnvApiKey } from "../model-auth.js";
import { runWithImageModelFallback } from "../model-fallback.js";
import { parseModelRef } from "../model-selection.js";
@@ -278,6 +282,38 @@ function buildImageContext(
};
}
/**
 * Resolves an image path against the sandbox root, with a fallback for paths
 * that `assertSandboxPath` rejects.
 *
 * A leading `file://` is stripped first. If `assertSandboxPath` accepts the
 * path, its resolved form is returned. Otherwise the path's basename is looked
 * up under `<sandboxRoot>/media/inbound/`; when a regular file exists there,
 * that file is used instead and the original path is reported as
 * `rewrittenFrom`.
 *
 * @param params.sandboxRoot - Root directory, used as both `cwd` and `root`
 *   for `assertSandboxPath`.
 * @param params.imagePath - Requested path, optionally prefixed with `file://`.
 * @returns The resolved path, plus `rewrittenFrom` when the fallback was used.
 * @throws The original `assertSandboxPath` error when no matching regular file
 *   exists under `media/inbound`.
 */
async function resolveSandboxedImagePath(params: {
  sandboxRoot: string;
  imagePath: string;
}): Promise<{ resolved: string; rewrittenFrom?: string }> {
  const filePath = params.imagePath.startsWith("file://")
    ? params.imagePath.slice("file://".length)
    : params.imagePath;
  try {
    const out = await assertSandboxPath({
      filePath,
      cwd: params.sandboxRoot,
      root: params.sandboxRoot,
    });
    return { resolved: out.resolved };
  } catch (err) {
    const name = path.basename(filePath);
    const candidateRel = path.join("media", "inbound", name);
    const candidateAbs = path.join(params.sandboxRoot, candidateRel);

    // Require a regular file, not mere existence: a basename of "" or ".."
    // makes the candidate a directory, and returning that would mask the
    // original sandbox error with an unrelated read failure later on.
    let candidateIsFile = false;
    try {
      candidateIsFile = (await fs.stat(candidateAbs)).isFile();
    } catch {
      candidateIsFile = false;
    }
    if (!candidateIsFile) throw err;

    // Re-validate the rewritten path through the same sandbox check.
    const out = await assertSandboxPath({
      filePath: candidateRel,
      cwd: params.sandboxRoot,
      root: params.sandboxRoot,
    });
    return { resolved: out.resolved, rewrittenFrom: filePath };
  }
}
async function runImagePrompt(params: {
cfg?: ClawdbotConfig;
agentDir: string;
@@ -328,6 +364,18 @@ async function runImagePrompt(params: {
agentDir: params.agentDir,
});
authStorage.setRuntimeApiKey(model.provider, apiKeyInfo.apiKey);
const imageDataUrl = `data:${params.mimeType};base64,${params.base64}`;
if (model.provider === "minimax") {
const text = await minimaxUnderstandImage({
apiKey: apiKeyInfo.apiKey,
prompt: params.prompt,
imageDataUrl,
modelBaseUrl: model.baseUrl,
});
return { text, provider: model.provider, model: model.id };
}
const context = buildImageContext(
params.prompt,
params.base64,
@@ -337,23 +385,19 @@ async function runImagePrompt(params: {
apiKey: apiKeyInfo.apiKey,
maxTokens: 512,
})) as AssistantMessage;
return {
const text = coerceImageAssistantText({
message,
provider: model.provider,
model: model.id,
};
});
return { text, provider: model.provider, model: model.id };
},
});
const text = coerceImageAssistantText({
message: result.result.message,
return {
text: result.result.text,
provider: result.result.provider,
model: result.result.model,
});
return {
text,
provider: result.provider,
model: result.model,
attempts: result.attempts.map((attempt) => ({
provider: attempt.provider,
model: attempt.model,
@@ -423,21 +467,20 @@ export function createImageTool(options?: {
if (imageRaw.startsWith("~")) return resolveUserPath(imageRaw);
return imageRaw;
})();
const resolvedPath = isDataUrl
? null
: sandboxRoot
? (
await assertSandboxPath({
filePath: resolvedImage.startsWith("file://")
const resolvedPathInfo: { resolved: string; rewrittenFrom?: string } =
isDataUrl
? { resolved: "" }
: sandboxRoot
? await resolveSandboxedImagePath({
sandboxRoot,
imagePath: resolvedImage,
})
: {
resolved: resolvedImage.startsWith("file://")
? resolvedImage.slice("file://".length)
: resolvedImage,
cwd: sandboxRoot,
root: sandboxRoot,
})
).resolved
: resolvedImage.startsWith("file://")
? resolvedImage.slice("file://".length)
: resolvedImage;
};
const resolvedPath = isDataUrl ? null : resolvedPathInfo.resolved;
const media = isDataUrl
? decodeDataUrl(resolvedImage)
@@ -465,6 +508,9 @@ export function createImageTool(options?: {
details: {
model: `${result.provider}/${result.model}`,
image: resolvedImage,
...(resolvedPathInfo.rewrittenFrom
? { rewrittenFrom: resolvedPathInfo.rewrittenFrom }
: {}),
attempts: result.attempts,
},
};