From fdecf5c59a40c86459bf6528666566631bfbbd73 Mon Sep 17 00:00:00 2001
From: Tyler Yust
Date: Sun, 25 Jan 2026 00:40:26 -0800
Subject: [PATCH] fix: skip image understanding when primary model has vision

When the primary model supports vision natively (e.g., Claude Opus 4.5),
skip the image understanding call entirely. The image will be injected
directly into the model context instead, saving an API call and avoiding
redundant descriptions.

Co-Authored-By: Claude Opus 4.5
---
 src/agents/model-catalog.ts       | 43 ++++++++++++++++++++++++++++++++++++++++++-
 src/media-understanding/runner.ts | 26 ++++++++++++++++++++++++++
 2 files changed, 68 insertions(+), 1 deletion(-)

diff --git a/src/agents/model-catalog.ts b/src/agents/model-catalog.ts
index 3a3db7a28..5463542cb 100644
--- a/src/agents/model-catalog.ts
+++ b/src/agents/model-catalog.ts
@@ -8,6 +8,7 @@ export type ModelCatalogEntry = {
   provider: string;
   contextWindow?: number;
   reasoning?: boolean;
+  input?: Array<"text" | "image">;
 };
 
 type DiscoveredModel = {
@@ -16,6 +17,7 @@ type DiscoveredModel = {
   provider: string;
   contextWindow?: number;
   reasoning?: boolean;
+  input?: Array<"text" | "image">;
 };
 
 type PiSdkModule = typeof import("@mariozechner/pi-coding-agent");
@@ -80,7 +82,10 @@ export async function loadModelCatalog(params?: {
             ? entry.contextWindow
             : undefined;
         const reasoning = typeof entry?.reasoning === "boolean" ? entry.reasoning : undefined;
-        models.push({ id, name, provider, contextWindow, reasoning });
+        const input = Array.isArray(entry?.input)
+          ? (entry.input as Array<"text" | "image">)
+          : undefined;
+        models.push({ id, name, provider, contextWindow, reasoning, input });
       }
 
       if (models.length === 0) {
@@ -105,3 +110,39 @@ export async function loadModelCatalog(params?: {
 
   return modelCatalogPromise;
 }
+
+/**
+ * Report whether a catalog entry advertises image input.
+ *
+ * @param entry - Catalog entry to inspect; `undefined` (e.g. a failed lookup) is allowed.
+ * @returns `true` only when `entry.input` lists `"image"`; a missing entry or
+ *   missing `input` list yields `false`.
+ */
+export function modelSupportsVision(entry: ModelCatalogEntry | undefined): boolean {
+  return entry?.input?.includes("image") ?? false;
+}
+
+/**
+ * Find a model in the catalog by provider and model ID.
+ *
+ * Matching ignores letter case and surrounding whitespace on both the query
+ * and the catalog entries, so the two sides are always normalized the same way.
+ *
+ * @param catalog - Entries to search; the array is not modified.
+ * @param provider - Provider name to match.
+ * @param modelId - Model ID to match.
+ * @returns The first matching entry, or `undefined` when nothing matches.
+ */
+export function findModelInCatalog(
+  catalog: readonly ModelCatalogEntry[],
+  provider: string,
+  modelId: string,
+): ModelCatalogEntry | undefined {
+  const normalize = (value: string): string => value.trim().toLowerCase();
+  const wantedProvider = normalize(provider);
+  const wantedModelId = normalize(modelId);
+  return catalog.find(
+    (entry) =>
+      normalize(entry.provider) === wantedProvider && normalize(entry.id) === wantedModelId,
+  );
+}
diff --git a/src/media-understanding/runner.ts b/src/media-understanding/runner.ts
index 2e9bccb08..0bff2513e 100644
--- a/src/media-understanding/runner.ts
+++ b/src/media-understanding/runner.ts
@@ -4,6 +4,11 @@ import os from "node:os";
 import path from "node:path";
 
 import type { ClawdbotConfig } from "../config/config.js";
+import {
+  findModelInCatalog,
+  loadModelCatalog,
+  modelSupportsVision,
+} from "../agents/model-catalog.js";
 import type { MsgContext } from "../auto-reply/templating.js";
 import { applyTemplate } from "../auto-reply/templating.js";
 import { requireApiKey, resolveApiKeyForProvider } from "../agents/model-auth.js";
@@ -986,6 +991,27 @@ export async function runCapability(params: {
     };
   }
 
+  // Skip image understanding when the primary model supports vision natively.
+  // The image will be injected directly into the model context instead.
+  // Only consult the catalog when both provider and model ID are known.
+  if (capability === "image" && params.activeModel?.provider && params.activeModel.model) {
+    const catalog = await loadModelCatalog({ config: cfg });
+    const entry = findModelInCatalog(
+      catalog,
+      params.activeModel.provider,
+      params.activeModel.model,
+    );
+    if (modelSupportsVision(entry)) {
+      if (shouldLogVerbose()) {
+        logVerbose("Skipping image understanding: primary model supports vision natively");
+      }
+      return {
+        outputs: [],
+        decision: { capability, outcome: "skipped", attachments: [] },
+      };
+    }
+  }
+
   const attachmentPolicy = config?.attachments;
   const selected = selectAttachments({
     capability,