feat: add image model config + tool
This commit is contained in:
157
src/agents/tools/image-tool.ts
Normal file
157
src/agents/tools/image-tool.ts
Normal file
@@ -0,0 +1,157 @@
|
||||
import { type Api, type AssistantMessage, complete, type Context, type Model } from "@mariozechner/pi-ai";
|
||||
import { discoverAuthStorage, discoverModels } from "@mariozechner/pi-coding-agent";
|
||||
import { Type } from "@sinclair/typebox";
|
||||
|
||||
import type { ClawdbotConfig } from "../../config/config.js";
|
||||
import { loadWebMedia } from "../../web/media.js";
|
||||
import { resolveClawdbotAgentDir } from "../agent-paths.js";
|
||||
import { getApiKeyForModel } from "../model-auth.js";
|
||||
import { runWithImageModelFallback } from "../model-fallback.js";
|
||||
import { ensureClawdbotModelsJson } from "../models-config.js";
|
||||
import { extractAssistantText } from "../pi-embedded-utils.js";
|
||||
import { resolveUserPath } from "../../utils.js";
|
||||
import type { AnyAgentTool } from "./common.js";
|
||||
|
||||
/** Prompt used when a tool call supplies no prompt, or only a blank one. */
const DEFAULT_PROMPT = "Describe the image.";
|
||||
|
||||
/**
 * Reports whether the configuration names at least one image model.
 *
 * The tool counts as configured when `agent.imageModel` is a non-blank string
 * or when `agent.imageModelFallbacks` contains at least one non-blank entry.
 * Blank fallback entries are ignored, mirroring the trim applied to the
 * primary model, so a list such as `[""]` does not enable the tool.
 *
 * @param cfg - Loaded configuration; may be omitted.
 * @returns `true` when a usable image model reference is present.
 */
function ensureImageToolConfigured(cfg?: ClawdbotConfig): boolean {
  const primary = cfg?.agent?.imageModel?.trim();
  if (primary) return true;

  const fallbacks = cfg?.agent?.imageModelFallbacks ?? [];
  // Apply the same "non-blank after trim" rule to fallbacks as to the primary.
  return fallbacks.some(
    (entry: unknown) => typeof entry === "string" && entry.trim().length > 0,
  );
}
|
||||
|
||||
/**
 * Converts a size limit in megabytes (1 MB = 1024 * 1024 bytes) to whole bytes.
 *
 * @param value - Candidate limit; anything other than a positive finite number is rejected.
 * @returns The limit in bytes, or `undefined` when the input is unusable.
 */
function megabytesToBytes(value: unknown): number | undefined {
  if (typeof value !== "number" || !Number.isFinite(value) || value <= 0) {
    return undefined;
  }
  const bytes = Math.floor(value * 1024 * 1024);
  // Tiny fractions floor to 0 and very large values overflow to Infinity;
  // neither is a meaningful byte limit, so treat them as "not provided".
  return Number.isFinite(bytes) && bytes >= 1 ? bytes : undefined;
}

/**
 * Chooses the maximum media size in bytes for an image load.
 *
 * The per-call `maxBytesMb` wins when it is valid; otherwise the configured
 * `agent.mediaMaxMb` is used. Invalid values (non-numbers, non-finite,
 * non-positive, or values that do not convert to at least one finite byte)
 * are skipped.
 *
 * @param cfg - Loaded configuration; may be omitted.
 * @param maxBytesMb - Optional per-call limit in megabytes.
 * @returns The limit in bytes, or `undefined` when neither source gives a usable value.
 */
function pickMaxBytes(cfg?: ClawdbotConfig, maxBytesMb?: number): number | undefined {
  return megabytesToBytes(maxBytesMb) ?? megabytesToBytes(cfg?.agent?.mediaMaxMb);
}
|
||||
|
||||
/**
 * Builds a context holding a single user message: the text prompt followed by
 * the image, stamped with the current time.
 *
 * @param prompt - Text placed in the message's text part.
 * @param base64 - Image payload, passed through unchanged as the image part's `data`.
 * @param mimeType - MIME type recorded on the image part.
 * @returns A context whose `messages` array contains exactly that one user message.
 */
function buildImageContext(prompt: string, base64: string, mimeType: string): Context {
  // `as const` keeps the discriminator fields as literal types once the parts
  // are lifted out of the returned object literal.
  const textPart = { type: "text" as const, text: prompt };
  const imagePart = { type: "image" as const, data: base64, mimeType };
  const userMessage = {
    role: "user" as const,
    content: [textPart, imagePart],
    timestamp: Date.now(),
  };
  return { messages: [userMessage] };
}
|
||||
|
||||
/**
 * Sends one prompt-plus-image request through `runWithImageModelFallback` and
 * returns the reply text together with the provider/model that produced it.
 *
 * For each candidate handed to the `run` callback, the model is looked up in
 * the discovered registry, checked for image input support, given an API key,
 * and then invoked with `maxTokens: 512` and `temperature: 0`.
 *
 * @param params.cfg - Configuration forwarded to the models-json and fallback helpers.
 * @param params.modelOverride - Optional model reference forwarded to the fallback helper.
 * @param params.prompt - Text prompt sent alongside the image.
 * @param params.base64 - Image payload forwarded to the model context.
 * @param params.mimeType - MIME type of the image.
 * @returns The extracted assistant text (or `"(no text returned)"` when empty)
 *   plus the provider and model reported by the fallback helper.
 * @throws Error from the `run` callback when a candidate is unknown or lacks
 *   image input; NOTE(review): whether such errors move on to the next
 *   candidate is decided inside `runWithImageModelFallback` — confirm there.
 */
async function runImagePrompt(params: {
  cfg?: ClawdbotConfig;
  modelOverride?: string;
  prompt: string;
  base64: string;
  mimeType: string;
}): Promise<{ text: string; provider: string; model: string }> {
  const { cfg, modelOverride, prompt, base64, mimeType } = params;

  const agentDir = resolveClawdbotAgentDir();
  await ensureClawdbotModelsJson(cfg);
  const authStorage = discoverAuthStorage(agentDir);
  const registry = discoverModels(authStorage, agentDir);

  const outcome = await runWithImageModelFallback({
    cfg,
    modelOverride,
    run: async (provider, modelId) => {
      const candidate = registry.find(provider, modelId) as Model<Api> | null;
      if (!candidate) {
        throw new Error(`Unknown model: ${provider}/${modelId}`);
      }

      const acceptsImages = candidate.input?.includes("image") ?? false;
      if (!acceptsImages) {
        throw new Error(`Model does not support images: ${provider}/${modelId}`);
      }

      // The key is both registered on the auth storage and passed explicitly
      // to the completion call.
      const apiKey = await getApiKeyForModel(candidate, authStorage);
      authStorage.setRuntimeApiKey(candidate.provider, apiKey);

      const reply = await complete(
        candidate,
        buildImageContext(prompt, base64, mimeType),
        { apiKey, maxTokens: 512, temperature: 0 },
      );
      return reply as AssistantMessage;
    },
  });

  const text = extractAssistantText(outcome.result) || "(no text returned)";
  return { text, provider: outcome.provider, model: outcome.model };
}
|
||||
|
||||
/**
 * Creates the `image` agent tool, which loads an image from a path or URL and
 * asks the configured image model about it.
 *
 * Tool arguments:
 * - `image` (required): path or URL; a leading `~` is resolved via `resolveUserPath`.
 * - `prompt` (optional): falls back to `DEFAULT_PROMPT` when missing or blank.
 * - `model` (optional): model override forwarded to the prompt runner.
 * - `maxBytesMb` (optional): per-call size limit, resolved through `pickMaxBytes`.
 *
 * The tool throws `"image required"` when `image` is missing or blank, and
 * `"Unsupported media type: …"` when the loaded media is not an image.
 *
 * @param options.config - Configuration used to decide availability and passed to the helpers.
 * @returns The tool definition, or `null` when `ensureImageToolConfigured` reports no image model.
 */
export function createImageTool(options?: {
  config?: ClawdbotConfig;
}): AnyAgentTool | null {
  if (!ensureImageToolConfigured(options?.config)) return null;

  // Trimmed text of a string argument; any non-string value becomes "".
  const trimmedString = (value: unknown): string =>
    typeof value === "string" ? value.trim() : "";

  return {
    label: "Image",
    name: "image",
    description:
      "Analyze an image with the configured image model (agent.imageModel). Provide a prompt and image path or URL.",
    parameters: Type.Object({
      prompt: Type.Optional(Type.String()),
      image: Type.String(),
      model: Type.Optional(Type.String()),
      maxBytesMb: Type.Optional(Type.Number()),
    }),
    execute: async (_toolCallId, args) => {
      const input: Record<string, unknown> =
        args && typeof args === "object" ? (args as Record<string, unknown>) : {};

      const imageRaw = trimmedString(input.image);
      if (!imageRaw) throw new Error("image required");

      const promptRaw = trimmedString(input.prompt) || DEFAULT_PROMPT;
      const modelOverride = trimmedString(input.model) || undefined;
      const maxBytes = pickMaxBytes(
        options?.config,
        typeof input.maxBytesMb === "number" ? input.maxBytesMb : undefined,
      );

      let resolvedImage = imageRaw;
      if (imageRaw.startsWith("~")) {
        resolvedImage = resolveUserPath(imageRaw);
      }

      const media = await loadWebMedia(resolvedImage, maxBytes);
      if (media.kind !== "image") {
        throw new Error(`Unsupported media type: ${media.kind}`);
      }

      const answer = await runImagePrompt({
        cfg: options?.config,
        modelOverride,
        prompt: promptRaw,
        base64: media.buffer.toString("base64"),
        // Use PNG when the loader reports no content type.
        mimeType: media.contentType ?? "image/png",
      });

      return {
        content: [{ type: "text", text: answer.text }],
        details: {
          model: `${answer.provider}/${answer.model}`,
          image: resolvedImage,
        },
      };
    },
  };
}
|
||||
Reference in New Issue
Block a user