diff --git a/CHANGELOG.md b/CHANGELOG.md index 4c94cc440..942d13b16 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,10 @@ ### Changes - Subagents: add config to set default sub-agent model (`agents.defaults.subagents.model` + per-agent override); still overridden by `sessions_spawn.model`. +### Fixes +- Tools/Models: MiniMax vision now uses the Coding Plan VLM endpoint (`/v1/coding_plan/vlm`) so the `image` tool works with MiniMax keys. +- Gateway/macOS: reduce noisy loopback WS “closed before connect” logs during tests. + ## 2026.1.12-1 ### Changes diff --git a/src/agents/minimax-vlm.ts b/src/agents/minimax-vlm.ts new file mode 100644 index 000000000..0c7fd6ab0 --- /dev/null +++ b/src/agents/minimax-vlm.ts @@ -0,0 +1,115 @@ +type MinimaxBaseResp = { + status_code?: number; + status_msg?: string; +}; + +function coerceApiHost(params: { + apiHost?: string; + modelBaseUrl?: string; + env?: NodeJS.ProcessEnv; +}): string { + const env = params.env ?? process.env; + const raw = + params.apiHost?.trim() || + env.MINIMAX_API_HOST?.trim() || + params.modelBaseUrl?.trim() || + "https://api.minimax.io"; + + try { + const url = new URL(raw); + return url.origin; + } catch {} + + try { + const url = new URL(`https://${raw}`); + return url.origin; + } catch { + return "https://api.minimax.io"; + } +} + +function isRecord(value: unknown): value is Record<string, unknown> { + return Boolean(value && typeof value === "object" && !Array.isArray(value)); +} + +function pickString(rec: Record<string, unknown>, key: string): string { + const v = rec[key]; + return typeof v === "string" ? 
v : ""; +} + +export async function minimaxUnderstandImage(params: { + apiKey: string; + prompt: string; + imageDataUrl: string; + apiHost?: string; + modelBaseUrl?: string; +}): Promise<string> { + const apiKey = params.apiKey.trim(); + if (!apiKey) throw new Error("MiniMax VLM: apiKey required"); + const prompt = params.prompt.trim(); + if (!prompt) throw new Error("MiniMax VLM: prompt required"); + const imageDataUrl = params.imageDataUrl.trim(); + if (!imageDataUrl) throw new Error("MiniMax VLM: imageDataUrl required"); + if (!/^data:image\/(png|jpeg|webp);base64,/i.test(imageDataUrl)) { + throw new Error( + "MiniMax VLM: imageDataUrl must be a base64 data:image/(png|jpeg|webp) URL", + ); + } + + const host = coerceApiHost({ + apiHost: params.apiHost, + modelBaseUrl: params.modelBaseUrl, + }); + const url = new URL("/v1/coding_plan/vlm", host).toString(); + + const res = await fetch(url, { + method: "POST", + headers: { + Authorization: `Bearer ${apiKey}`, + "Content-Type": "application/json", + "MM-API-Source": "Clawdbot", + }, + body: JSON.stringify({ + prompt, + image_url: imageDataUrl, + }), + }); + + const traceId = res.headers.get("Trace-Id") ?? ""; + if (!res.ok) { + const body = await res.text().catch(() => ""); + const trace = traceId ? ` Trace-Id: ${traceId}` : ""; + throw new Error( + `MiniMax VLM request failed (${res.status} ${res.statusText}).${trace}${ + body ? ` Body: ${body.slice(0, 400)}` : "" + }`, + ); + } + + const json = (await res.json().catch(() => null)) as unknown; + if (!isRecord(json)) { + const trace = traceId ? ` Trace-Id: ${traceId}` : ""; + throw new Error(`MiniMax VLM response was not JSON.${trace}`); + } + + const baseResp = isRecord(json.base_resp) + ? (json.base_resp as MinimaxBaseResp) + : {}; + const code = + typeof baseResp.status_code === "number" ? baseResp.status_code : -1; + if (code !== 0) { + const msg = (baseResp.status_msg ?? "").trim(); + const trace = traceId ? 
` Trace-Id: ${traceId}` : ""; + throw new Error( + `MiniMax VLM API error (${code})${msg ? `: ${msg}` : ""}.${trace}`, + ); + } + + const content = pickString(json, "content").trim(); + if (!content) { + const trace = traceId ? ` Trace-Id: ${traceId}` : ""; + throw new Error(`MiniMax VLM returned no content.${trace}`); + } + + return content; +} diff --git a/src/agents/tools/image-tool.test.ts b/src/agents/tools/image-tool.test.ts index c0d194542..bfdea82db 100644 --- a/src/agents/tools/image-tool.test.ts +++ b/src/agents/tools/image-tool.test.ts @@ -21,6 +21,8 @@ async function writeAuthProfiles(agentDir: string, profiles: unknown) { } describe("image tool implicit imageModel config", () => { + const priorFetch = global.fetch; + beforeEach(() => { vi.stubEnv("OPENAI_API_KEY", ""); vi.stubEnv("ANTHROPIC_API_KEY", ""); @@ -30,6 +32,8 @@ describe("image tool implicit imageModel config", () => { afterEach(() => { vi.unstubAllEnvs(); + // @ts-expect-error global fetch cleanup + global.fetch = priorFetch; }); it("stays disabled without auth when no pairing is possible", async () => { @@ -132,6 +136,60 @@ describe("image tool implicit imageModel config", () => { tool.execute("t2", { image: "../escape.png" }), ).rejects.toThrow(/escapes sandbox root/i); }); + + it("rewrites inbound absolute paths into sandbox media/inbound", async () => { + const stateDir = await fs.mkdtemp( + path.join(os.tmpdir(), "clawdbot-image-sandbox-"), + ); + const agentDir = path.join(stateDir, "agent"); + const sandboxRoot = path.join(stateDir, "sandbox"); + await fs.mkdir(agentDir, { recursive: true }); + await fs.mkdir(path.join(sandboxRoot, "media", "inbound"), { + recursive: true, + }); + const pngB64 = + "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/woAAn8B9FD5fHAAAAAASUVORK5CYII="; + await fs.writeFile( + path.join(sandboxRoot, "media", "inbound", "photo.png"), + Buffer.from(pngB64, "base64"), + ); + + const fetch = vi.fn().mockResolvedValue({ + ok: true, + status: 200, + 
statusText: "OK", + headers: new Headers(), + json: async () => ({ + content: "ok", + base_resp: { status_code: 0, status_msg: "" }, + }), + }); + // @ts-expect-error partial global + global.fetch = fetch; + vi.stubEnv("MINIMAX_API_KEY", "minimax-test"); + + const cfg: ClawdbotConfig = { + agents: { + defaults: { + model: { primary: "minimax/MiniMax-M2.1" }, + imageModel: { primary: "minimax/MiniMax-VL-01" }, + }, + }, + }; + const tool = createImageTool({ config: cfg, agentDir, sandboxRoot }); + expect(tool).not.toBeNull(); + if (!tool) throw new Error("expected image tool"); + + const res = await tool.execute("t1", { + prompt: "Describe the image.", + image: "/Users/steipete/.clawdbot/media/inbound/photo.png", + }); + + expect(fetch).toHaveBeenCalledTimes(1); + expect((res.details as { rewrittenFrom?: string }).rewrittenFrom).toContain( + "photo.png", + ); + }); }); describe("image tool data URL support", () => { @@ -151,6 +209,99 @@ describe("image tool data URL support", () => { }); }); +describe("image tool MiniMax VLM routing", () => { + const pngB64 = + "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/woAAn8B9FD5fHAAAAAASUVORK5CYII="; + const priorFetch = global.fetch; + + beforeEach(() => { + vi.stubEnv("MINIMAX_API_KEY", ""); + }); + + afterEach(() => { + vi.unstubAllEnvs(); + // @ts-expect-error global fetch cleanup + global.fetch = priorFetch; + }); + + it("calls /v1/coding_plan/vlm for minimax image models", async () => { + const fetch = vi.fn().mockResolvedValue({ + ok: true, + status: 200, + statusText: "OK", + headers: new Headers(), + json: async () => ({ + content: "ok", + base_resp: { status_code: 0, status_msg: "" }, + }), + }); + // @ts-expect-error partial global + global.fetch = fetch; + + const agentDir = await fs.mkdtemp( + path.join(os.tmpdir(), "clawdbot-minimax-vlm-"), + ); + vi.stubEnv("MINIMAX_API_KEY", "minimax-test"); + const cfg: ClawdbotConfig = { + agents: { defaults: { model: { primary: "minimax/MiniMax-M2.1" } } }, + 
}; + const tool = createImageTool({ config: cfg, agentDir }); + expect(tool).not.toBeNull(); + if (!tool) throw new Error("expected image tool"); + + const res = await tool.execute("t1", { + prompt: "Describe the image.", + image: `data:image/png;base64,${pngB64}`, + }); + + expect(fetch).toHaveBeenCalledTimes(1); + const [url, init] = fetch.mock.calls[0]; + expect(String(url)).toBe("https://api.minimax.io/v1/coding_plan/vlm"); + expect(init?.method).toBe("POST"); + expect( + String((init?.headers as Record<string, string>)?.Authorization), + ).toBe("Bearer minimax-test"); + expect(String(init?.body)).toContain('"prompt":"Describe the image."'); + expect(String(init?.body)).toContain('"image_url":"data:image/png;base64,'); + + const text = res.content?.find((b) => b.type === "text")?.text ?? ""; + expect(text).toBe("ok"); + }); + + it("surfaces MiniMax API errors from /v1/coding_plan/vlm", async () => { + const fetch = vi.fn().mockResolvedValue({ + ok: true, + status: 200, + statusText: "OK", + headers: new Headers(), + json: async () => ({ + content: "", + base_resp: { status_code: 1004, status_msg: "bad key" }, + }), + }); + // @ts-expect-error partial global + global.fetch = fetch; + + const agentDir = await fs.mkdtemp( + path.join(os.tmpdir(), "clawdbot-minimax-vlm-"), + ); + vi.stubEnv("MINIMAX_API_KEY", "minimax-test"); + const cfg: ClawdbotConfig = { + agents: { defaults: { model: { primary: "minimax/MiniMax-M2.1" } } }, + }; + const tool = createImageTool({ config: cfg, agentDir }); + expect(tool).not.toBeNull(); + if (!tool) throw new Error("expected image tool"); + + await expect( + tool.execute("t1", { + prompt: "Describe the image.", + image: `data:image/png;base64,${pngB64}`, + }), + ).rejects.toThrow(/MiniMax VLM API error/i); + }); +}); + describe("image tool response validation", () => { it("rejects image-model responses with no final text", () => { expect(() => diff --git a/src/agents/tools/image-tool.ts b/src/agents/tools/image-tool.ts index c3b1b9eab..46a16e3d2 
100644 --- a/src/agents/tools/image-tool.ts +++ b/src/agents/tools/image-tool.ts @@ -1,3 +1,6 @@ +import fs from "node:fs/promises"; +import path from "node:path"; + import { type Api, type AssistantMessage, @@ -19,6 +22,7 @@ import { listProfilesForProvider, } from "../auth-profiles.js"; import { DEFAULT_MODEL, DEFAULT_PROVIDER } from "../defaults.js"; +import { minimaxUnderstandImage } from "../minimax-vlm.js"; import { getApiKeyForModel, resolveEnvApiKey } from "../model-auth.js"; import { runWithImageModelFallback } from "../model-fallback.js"; import { parseModelRef } from "../model-selection.js"; @@ -278,6 +282,38 @@ function buildImageContext( }; } +async function resolveSandboxedImagePath(params: { + sandboxRoot: string; + imagePath: string; +}): Promise<{ resolved: string; rewrittenFrom?: string }> { + const normalize = (p: string) => + p.startsWith("file://") ? p.slice("file://".length) : p; + const filePath = normalize(params.imagePath); + try { + const out = await assertSandboxPath({ + filePath, + cwd: params.sandboxRoot, + root: params.sandboxRoot, + }); + return { resolved: out.resolved }; + } catch (err) { + const name = path.basename(filePath); + const candidateRel = path.join("media", "inbound", name); + const candidateAbs = path.join(params.sandboxRoot, candidateRel); + try { + await fs.stat(candidateAbs); + } catch { + throw err; + } + const out = await assertSandboxPath({ + filePath: candidateRel, + cwd: params.sandboxRoot, + root: params.sandboxRoot, + }); + return { resolved: out.resolved, rewrittenFrom: filePath }; + } +} + async function runImagePrompt(params: { cfg?: ClawdbotConfig; agentDir: string; @@ -328,6 +364,18 @@ async function runImagePrompt(params: { agentDir: params.agentDir, }); authStorage.setRuntimeApiKey(model.provider, apiKeyInfo.apiKey); + const imageDataUrl = `data:${params.mimeType};base64,${params.base64}`; + + if (model.provider === "minimax") { + const text = await minimaxUnderstandImage({ + apiKey: apiKeyInfo.apiKey, 
+ prompt: params.prompt, + imageDataUrl, + modelBaseUrl: model.baseUrl, + }); + return { text, provider: model.provider, model: model.id }; + } + const context = buildImageContext( params.prompt, params.base64, @@ -337,23 +385,19 @@ async function runImagePrompt(params: { apiKey: apiKeyInfo.apiKey, maxTokens: 512, })) as AssistantMessage; - return { + const text = coerceImageAssistantText({ message, provider: model.provider, model: model.id, - }; + }); + return { text, provider: model.provider, model: model.id }; }, }); - const text = coerceImageAssistantText({ - message: result.result.message, + return { + text: result.result.text, provider: result.result.provider, model: result.result.model, - }); - return { - text, - provider: result.provider, - model: result.model, attempts: result.attempts.map((attempt) => ({ provider: attempt.provider, model: attempt.model, @@ -423,21 +467,20 @@ export function createImageTool(options?: { if (imageRaw.startsWith("~")) return resolveUserPath(imageRaw); return imageRaw; })(); - const resolvedPath = isDataUrl - ? null - : sandboxRoot - ? ( - await assertSandboxPath({ - filePath: resolvedImage.startsWith("file://") + const resolvedPathInfo: { resolved: string; rewrittenFrom?: string } = + isDataUrl + ? { resolved: "" } + : sandboxRoot + ? await resolveSandboxedImagePath({ + sandboxRoot, + imagePath: resolvedImage, + }) + : { + resolved: resolvedImage.startsWith("file://") ? resolvedImage.slice("file://".length) : resolvedImage, - cwd: sandboxRoot, - root: sandboxRoot, - }) - ).resolved - : resolvedImage.startsWith("file://") - ? resolvedImage.slice("file://".length) - : resolvedImage; + }; + const resolvedPath = isDataUrl ? null : resolvedPathInfo.resolved; const media = isDataUrl ? decodeDataUrl(resolvedImage) @@ -465,6 +508,9 @@ export function createImageTool(options?: { details: { model: `${result.provider}/${result.model}`, image: resolvedImage, + ...(resolvedPathInfo.rewrittenFrom + ? 
{ rewrittenFrom: resolvedPathInfo.rewrittenFrom } + : {}), attempts: result.attempts, }, };