diff --git a/src/agents/model-catalog.ts b/src/agents/model-catalog.ts
index 3a3db7a28..5463542cb 100644
--- a/src/agents/model-catalog.ts
+++ b/src/agents/model-catalog.ts
@@ -8,6 +8,7 @@ export type ModelCatalogEntry = {
   provider: string;
   contextWindow?: number;
   reasoning?: boolean;
+  input?: Array<"text" | "image">;
 };
 
 type DiscoveredModel = {
@@ -16,6 +17,7 @@ type DiscoveredModel = {
   provider: string;
   contextWindow?: number;
   reasoning?: boolean;
+  input?: Array<"text" | "image">;
 };
 
 type PiSdkModule = typeof import("@mariozechner/pi-coding-agent");
@@ -80,7 +82,10 @@ export async function loadModelCatalog(params?: {
             ? entry.contextWindow
             : undefined;
         const reasoning = typeof entry?.reasoning === "boolean" ? entry.reasoning : undefined;
-        models.push({ id, name, provider, contextWindow, reasoning });
+        const input = Array.isArray(entry?.input)
+          ? (entry.input as Array<"text" | "image">)
+          : undefined;
+        models.push({ id, name, provider, contextWindow, reasoning, input });
       }
 
       if (models.length === 0) {
@@ -105,3 +110,40 @@ export async function loadModelCatalog(params?: {
 
   return modelCatalogPromise;
 }
+
+/**
+ * Check if a model supports image input based on its catalog entry.
+ *
+ * @param entry - Catalog entry to inspect, or `undefined` when no entry was found.
+ * @returns `true` only when the entry's `input` list includes `"image"`; `false` when the
+ *   entry or its `input` list is missing.
+ */
+export function modelSupportsVision(entry: ModelCatalogEntry | undefined): boolean {
+  return entry?.input?.includes("image") ?? false;
+}
+
+/**
+ * Find a model in the catalog by provider and model ID.
+ *
+ * Matching ignores letter case and surrounding whitespace on both the query values and
+ * the catalog entries.
+ *
+ * @param catalog - Entries to search.
+ * @param provider - Provider name to look for.
+ * @param modelId - Model ID to look for.
+ * @returns The first matching entry, or `undefined` when nothing matches.
+ */
+export function findModelInCatalog(
+  catalog: ModelCatalogEntry[],
+  provider: string,
+  modelId: string,
+): ModelCatalogEntry | undefined {
+  const normalize = (value: string): string => value.trim().toLowerCase();
+  const normalizedProvider = normalize(provider);
+  const normalizedModelId = normalize(modelId);
+  return catalog.find(
+    (entry) =>
+      normalize(entry.provider) === normalizedProvider &&
+      normalize(entry.id) === normalizedModelId,
+  );
+}
diff --git a/src/media-understanding/runner.ts b/src/media-understanding/runner.ts
index 2e9bccb08..0bff2513e 100644
--- a/src/media-understanding/runner.ts
+++ b/src/media-understanding/runner.ts
@@ -4,6 +4,11 @@ import os from "node:os";
 import path from "node:path";
 
 import type { ClawdbotConfig } from "../config/config.js";
+import {
+  findModelInCatalog,
+  loadModelCatalog,
+  modelSupportsVision,
+} from "../agents/model-catalog.js";
 import type { MsgContext } from "../auto-reply/templating.js";
 import { applyTemplate } from "../auto-reply/templating.js";
 import { requireApiKey, resolveApiKeyForProvider } from "../agents/model-auth.js";
@@ -986,6 +991,26 @@ export async function runCapability(params: {
     };
   }
 
+  // Skip image understanding when the primary model supports vision natively.
+  // The image will be injected directly into the model context instead.
+  // Both provider and model ID are required: without a model ID no catalog entry can
+  // match, so the catalog is not loaded at all in that case.
+  const activeProvider = params.activeModel?.provider;
+  const activeModelId = params.activeModel?.model;
+  if (capability === "image" && activeProvider && activeModelId) {
+    const catalog = await loadModelCatalog({ config: cfg });
+    const entry = findModelInCatalog(catalog, activeProvider, activeModelId);
+    if (modelSupportsVision(entry)) {
+      if (shouldLogVerbose()) {
+        logVerbose("Skipping image understanding: primary model supports vision natively");
+      }
+      return {
+        outputs: [],
+        decision: { capability, outcome: "skipped", attachments: [] },
+      };
+    }
+  }
+
   const attachmentPolicy = config?.attachments;
   const selected = selectAttachments({
     capability,