feat: add image model config + tool

2026-01-04 19:35:00 +01:00
parent 0716a624a8
commit 78998dba9e
20 changed files with 856 additions and 144 deletions
--- a/src/agents/tools/image-tool.ts
+++ b/src/agents/tools/image-tool.ts
@@ -0,0 +1,157 @@
+import { type Api, type AssistantMessage, complete, type Context, type Model } from "@mariozechner/pi-ai";
+import { discoverAuthStorage, discoverModels } from "@mariozechner/pi-coding-agent";
+import { Type } from "@sinclair/typebox";
+
+import type { ClawdbotConfig } from "../../config/config.js";
+import { loadWebMedia } from "../../web/media.js";
+import { resolveClawdbotAgentDir } from "../agent-paths.js";
+import { getApiKeyForModel } from "../model-auth.js";
+import { runWithImageModelFallback } from "../model-fallback.js";
+import { ensureClawdbotModelsJson } from "../models-config.js";
+import { extractAssistantText } from "../pi-embedded-utils.js";
+import { resolveUserPath } from "../../utils.js";
+import type { AnyAgentTool } from "./common.js";
+
+const DEFAULT_PROMPT = "Describe the image."; // used by the tool's execute handler when the call has no non-empty string prompt
+
+/**
+ * Reports whether the config names at least one image model.
+ *
+ * @param cfg - Clawdbot config; may be absent.
+ * @returns `true` when `agent.imageModel` is non-blank after trimming or
+ *   `agent.imageModelFallbacks` has at least one entry, otherwise `false`.
+ */
+function ensureImageToolConfigured(cfg?: ClawdbotConfig): boolean {
+  const agent = cfg?.agent;
+  if (agent?.imageModel?.trim()) {
+    return true;
+  }
+  const fallbackModels = agent?.imageModelFallbacks ?? [];
+  return fallbackModels.length > 0;
+}
+
+/**
+ * Resolves the byte limit for loading an image.
+ *
+ * The per-call value wins; `agent.mediaMaxMb` from the config is consulted
+ * only when the per-call value is not a positive finite number.
+ *
+ * @param cfg - Clawdbot config; may be absent.
+ * @param maxBytesMb - Per-call limit in megabytes.
+ * @returns The limit in bytes (rounded down), or `undefined` when neither
+ *   source provides a positive finite number.
+ */
+function pickMaxBytes(cfg?: ClawdbotConfig, maxBytesMb?: number): number | undefined {
+  // Converts megabytes to whole bytes; anything that is not a positive finite number yields undefined.
+  const megabytesToBytes = (megabytes: unknown): number | undefined => {
+    const usable =
+      typeof megabytes === "number" && Number.isFinite(megabytes) && megabytes > 0;
+    return usable ? Math.floor(megabytes * 1024 * 1024) : undefined;
+  };
+
+  // `??` (not `||`) so that a limit which floors to 0 bytes is still returned.
+  return megabytesToBytes(maxBytesMb) ?? megabytesToBytes(cfg?.agent?.mediaMaxMb);
+}
+
+/**
+ * Wraps a prompt and an image into a completion context holding a single
+ * user message.
+ *
+ * @param prompt - Text part of the message.
+ * @param base64 - Image data placed in the image part's `data` field.
+ * @param mimeType - MIME type attached to the image part.
+ * @returns A context whose only message has the text part followed by the
+ *   image part and is stamped with the current time.
+ */
+function buildImageContext(prompt: string, base64: string, mimeType: string): Context {
+  const userMessage: Context["messages"][number] = {
+    role: "user",
+    content: [
+      { type: "text", text: prompt },
+      { type: "image", data: base64, mimeType },
+    ],
+    timestamp: Date.now(),
+  };
+  return { messages: [userMessage] };
+}
+
+/**
+ * Sends one prompt-plus-image request to an image-capable model, delegating
+ * candidate selection and retry to `runWithImageModelFallback`.
+ *
+ * @param params.cfg - Clawdbot config forwarded to models.json setup and to
+ *   the fallback runner.
+ * @param params.modelOverride - Optional model reference forwarded to the
+ *   fallback runner.
+ * @param params.prompt - Text part of the user message.
+ * @param params.base64 - Base64-encoded image data.
+ * @param params.mimeType - MIME type attached to the image part.
+ * @returns The assistant text (or "(no text returned)" when it is empty)
+ *   together with the provider and model reported by the fallback runner.
+ * @throws Per candidate: when the model is not in the registry, does not list
+ *   "image" among its inputs, or its completion ends in an error/aborted
+ *   state. Presumably the fallback runner rethrows once every candidate has
+ *   failed - confirm against `runWithImageModelFallback`.
+ */
+async function runImagePrompt(params: {
+  cfg?: ClawdbotConfig;
+  modelOverride?: string;
+  prompt: string;
+  base64: string;
+  mimeType: string;
+}): Promise<{ text: string; provider: string; model: string }> {
+  const agentDir = resolveClawdbotAgentDir();
+  // Runs before discovery; presumably so configured providers are present in models.json - TODO confirm.
+  await ensureClawdbotModelsJson(params.cfg);
+  const authStorage = discoverAuthStorage(agentDir);
+  const modelRegistry = discoverModels(authStorage, agentDir);
+
+  const result = await runWithImageModelFallback({
+    cfg: params.cfg,
+    modelOverride: params.modelOverride,
+    run: async (provider, modelId) => {
+      const model = modelRegistry.find(provider, modelId) as Model<Api> | null;
+      if (!model) {
+        throw new Error(`Unknown model: ${provider}/${modelId}`);
+      }
+      if (!model.input?.includes("image")) {
+        throw new Error(`Model does not support images: ${provider}/${modelId}`);
+      }
+      const apiKey = await getApiKeyForModel(model, authStorage);
+      authStorage.setRuntimeApiKey(model.provider, apiKey);
+      const context = buildImageContext(
+        params.prompt,
+        params.base64,
+        params.mimeType,
+      );
+      const message = (await complete(model, context, {
+        apiKey,
+        maxTokens: 512,
+        temperature: 0,
+      })) as AssistantMessage;
+      // pi-ai can resolve with a message whose stopReason marks a failure
+      // instead of rejecting. Throw here so the fallback runner sees the
+      // failure and moves on, rather than accepting an empty reply as success.
+      if (message.stopReason === "error" || message.stopReason === "aborted") {
+        const reason = message.errorMessage?.trim();
+        throw new Error(
+          reason
+            ? `Image model failed (${provider}/${modelId}): ${reason}`
+            : `Image model failed (${provider}/${modelId})`,
+        );
+      }
+      return message;
+    },
+  });
+
+  const text = extractAssistantText(result.result);
+  return {
+    text: text || "(no text returned)",
+    provider: result.provider,
+    model: result.model,
+  };
+}
+
+/**
+ * Builds the `image` agent tool, which loads an image from a path or URL and
+ * asks the configured image model about it.
+ *
+ * @param options.config - Clawdbot config; decides whether the tool exists
+ *   and is forwarded to size-limit resolution and the model run.
+ * @returns The tool definition, or `null` when the config names neither an
+ *   image model nor any image model fallbacks.
+ */
+export function createImageTool(options?: {
+  config?: ClawdbotConfig;
+}): AnyAgentTool | null {
+  if (!ensureImageToolConfigured(options?.config)) {
+    return null;
+  }
+
+  // Non-string values collapse to "" so call sites can apply defaults with `||`.
+  const trimmedString = (value: unknown): string =>
+    typeof value === "string" ? value.trim() : "";
+
+  const parameters = Type.Object({
+    prompt: Type.Optional(Type.String()),
+    image: Type.String(),
+    model: Type.Optional(Type.String()),
+    maxBytesMb: Type.Optional(Type.Number()),
+  });
+
+  return {
+    label: "Image",
+    name: "image",
+    description:
+      "Analyze an image with the configured image model (agent.imageModel). Provide a prompt and image path or URL.",
+    parameters,
+    execute: async (_toolCallId, args) => {
+      // Arguments are re-checked by hand instead of trusting the schema.
+      const input: Record<string, unknown> =
+        args && typeof args === "object"
+          ? (args as Record<string, unknown>)
+          : {};
+
+      const imageRef = trimmedString(input.image);
+      if (!imageRef) {
+        throw new Error("image required");
+      }
+      const prompt = trimmedString(input.prompt) || DEFAULT_PROMPT;
+      const modelOverride = trimmedString(input.model) || undefined;
+      const requestedMb =
+        typeof input.maxBytesMb === "number" ? input.maxBytesMb : undefined;
+      const byteLimit = pickMaxBytes(options?.config, requestedMb);
+
+      // Only references starting with "~" go through resolveUserPath; everything else is passed on untouched.
+      let imageLocation = imageRef;
+      if (imageRef.startsWith("~")) {
+        imageLocation = resolveUserPath(imageRef);
+      }
+
+      const media = await loadWebMedia(imageLocation, byteLimit);
+      if (media.kind !== "image") {
+        throw new Error(`Unsupported media type: ${media.kind}`);
+      }
+
+      const { text, provider, model } = await runImagePrompt({
+        cfg: options?.config,
+        modelOverride,
+        prompt,
+        base64: media.buffer.toString("base64"),
+        // Falls back to PNG when the loader reports no content type.
+        mimeType: media.contentType ?? "image/png",
+      });
+
+      return {
+        content: [{ type: "text", text }],
+        details: {
+          model: `${provider}/${model}`,
+          image: imageLocation,
+        },
+      };
+    },
+  };
+}