fix: skip image understanding when primary model has vision

When the primary model supports vision natively (e.g., Claude Opus 4.5),
skip the image understanding call entirely. The image will be injected
directly into the model context instead, saving an API call and avoiding
redundant descriptions.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Tyler Yust
2026-01-25 00:40:26 -08:00
committed by Peter Steinberger
parent 83f92e34af
commit fdecf5c59a
2 changed files with 55 additions and 1 deletions

View File

@@ -8,6 +8,7 @@ export type ModelCatalogEntry = {
provider: string;
contextWindow?: number;
reasoning?: boolean;
input?: Array<"text" | "image">;
};
type DiscoveredModel = {
@@ -16,6 +17,7 @@ type DiscoveredModel = {
provider: string;
contextWindow?: number;
reasoning?: boolean;
input?: Array<"text" | "image">;
};
type PiSdkModule = typeof import("@mariozechner/pi-coding-agent");
@@ -80,7 +82,10 @@ export async function loadModelCatalog(params?: {
? entry.contextWindow
: undefined;
const reasoning = typeof entry?.reasoning === "boolean" ? entry.reasoning : undefined;
models.push({ id, name, provider, contextWindow, reasoning });
const input = Array.isArray(entry?.input)
? (entry.input as Array<"text" | "image">)
: undefined;
models.push({ id, name, provider, contextWindow, reasoning, input });
}
if (models.length === 0) {
@@ -105,3 +110,27 @@ export async function loadModelCatalog(params?: {
return modelCatalogPromise;
}
/**
 * Report whether a catalog entry advertises image input.
 *
 * @param entry - Catalog entry to inspect; may be `undefined` when the model
 *   was not found in the catalog.
 * @returns `true` only when the entry's `input` list contains `"image"`;
 *   `false` when the entry or its `input` list is absent.
 */
export function modelSupportsVision(entry: ModelCatalogEntry | undefined): boolean {
  const acceptedInputs = entry?.input;
  // A missing entry or a missing input list means no declared vision support.
  if (acceptedInputs == null) {
    return false;
  }
  return acceptedInputs.includes("image");
}
/**
 * Find a model in the catalog by provider and model ID.
 *
 * Matching is case-insensitive and ignores leading/trailing whitespace. The
 * same normalization is applied to the requested values and to each catalog
 * entry, so stray whitespace on either side cannot cause a false miss.
 *
 * @param catalog - Catalog entries to search.
 * @param provider - Provider name to match against `entry.provider`.
 * @param modelId - Model identifier to match against `entry.id`.
 * @returns The first entry whose provider and id both match, or `undefined`
 *   when no entry matches.
 */
export function findModelInCatalog(
  catalog: ModelCatalogEntry[],
  provider: string,
  modelId: string,
): ModelCatalogEntry | undefined {
  // Single normalizer used for both sides of the comparison.
  const normalize = (value: string): string => value.trim().toLowerCase();
  const wantedProvider = normalize(provider);
  const wantedModelId = normalize(modelId);
  return catalog.find(
    (entry) =>
      normalize(entry.provider) === wantedProvider && normalize(entry.id) === wantedModelId,
  );
}

View File

@@ -4,6 +4,11 @@ import os from "node:os";
import path from "node:path";
import type { ClawdbotConfig } from "../config/config.js";
import {
findModelInCatalog,
loadModelCatalog,
modelSupportsVision,
} from "../agents/model-catalog.js";
import type { MsgContext } from "../auto-reply/templating.js";
import { applyTemplate } from "../auto-reply/templating.js";
import { requireApiKey, resolveApiKeyForProvider } from "../agents/model-auth.js";
@@ -986,6 +991,26 @@ export async function runCapability(params: {
};
}
// Skip image understanding when the primary model supports vision natively.
// The image will be injected directly into the model context instead.
if (capability === "image" && params.activeModel?.provider) {
const catalog = await loadModelCatalog({ config: cfg });
const entry = findModelInCatalog(
catalog,
params.activeModel.provider,
params.activeModel.model ?? "",
);
if (modelSupportsVision(entry)) {
if (shouldLogVerbose()) {
logVerbose("Skipping image understanding: primary model supports vision natively");
}
return {
outputs: [],
decision: { capability, outcome: "skipped", attachments: [] },
};
}
}
const attachmentPolicy = config?.attachments;
const selected = selectAttachments({
capability,