fix: skip image understanding when primary model has vision
When the primary model supports vision natively (e.g., Claude Opus 4.5), skip the image understanding call entirely. The image will be injected directly into the model context instead, saving an API call and avoiding redundant descriptions.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Committed by: Peter Steinberger
Parent: 83f92e34af
Commit: fdecf5c59a
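To make the change concrete: the decision described in the commit message reduces to roughly the sketch below. The `shouldSkipImageUnderstanding` wrapper and its parameters are illustrative only and do not exist in the repo; the imported helpers and the `loadModelCatalog({ config })` call are taken from the diff that follows, and the import paths assume the same relative layout as the second changed file.

import type { ClawdbotConfig } from "../config/config.js";
import {
  findModelInCatalog,
  loadModelCatalog,
  modelSupportsVision,
} from "../agents/model-catalog.js";

// Illustrative wrapper, not part of the repo: true when the separate
// image-understanding call can be skipped for the active primary model.
async function shouldSkipImageUnderstanding(
  cfg: ClawdbotConfig,
  provider: string | undefined,
  model: string | undefined,
): Promise<boolean> {
  if (!provider) return false; // no active model info: keep the existing path
  const catalog = await loadModelCatalog({ config: cfg });
  const entry = findModelInCatalog(catalog, provider, model ?? "");
  // A vision-capable primary model gets the image injected into its context
  // directly, so a separate image-understanding API call would be redundant.
  return modelSupportsVision(entry);
}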
@@ -8,6 +8,7 @@ export type ModelCatalogEntry = {
   provider: string;
   contextWindow?: number;
   reasoning?: boolean;
+  input?: Array<"text" | "image">;
 };

 type DiscoveredModel = {
@@ -16,6 +17,7 @@ type DiscoveredModel = {
   provider: string;
   contextWindow?: number;
   reasoning?: boolean;
+  input?: Array<"text" | "image">;
 };

 type PiSdkModule = typeof import("@mariozechner/pi-coding-agent");
@@ -80,7 +82,10 @@ export async function loadModelCatalog(params?: {
         ? entry.contextWindow
         : undefined;
       const reasoning = typeof entry?.reasoning === "boolean" ? entry.reasoning : undefined;
-      models.push({ id, name, provider, contextWindow, reasoning });
+      const input = Array.isArray(entry?.input)
+        ? (entry.input as Array<"text" | "image">)
+        : undefined;
+      models.push({ id, name, provider, contextWindow, reasoning, input });
     }

     if (models.length === 0) {
@@ -105,3 +110,27 @@ export async function loadModelCatalog(params?: {

   return modelCatalogPromise;
 }
+
+/**
+ * Check if a model supports image input based on its catalog entry.
+ */
+export function modelSupportsVision(entry: ModelCatalogEntry | undefined): boolean {
+  return entry?.input?.includes("image") ?? false;
+}
+
+/**
+ * Find a model in the catalog by provider and model ID.
+ */
+export function findModelInCatalog(
+  catalog: ModelCatalogEntry[],
+  provider: string,
+  modelId: string,
+): ModelCatalogEntry | undefined {
+  const normalizedProvider = provider.toLowerCase().trim();
+  const normalizedModelId = modelId.toLowerCase().trim();
+  return catalog.find(
+    (entry) =>
+      entry.provider.toLowerCase() === normalizedProvider &&
+      entry.id.toLowerCase() === normalizedModelId,
+  );
+}
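Before the second file's hunks, a quick illustration of how the two new helpers behave. The catalog entry is hand-written for the example, its `id`/`name` fields are assumed from the `models.push()` call above rather than shown directly in the diff, and the import path is illustrative.

import {
  findModelInCatalog,
  modelSupportsVision,
  type ModelCatalogEntry,
} from "./model-catalog.js";

// Hand-written entry for illustration; real entries come from loadModelCatalog().
const catalog: ModelCatalogEntry[] = [
  {
    id: "claude-opus-4-5",
    name: "Claude Opus 4.5",
    provider: "anthropic",
    input: ["text", "image"],
  },
];

// The provider and model ID arguments are lower-cased and trimmed before comparison.
const entry = findModelInCatalog(catalog, " Anthropic ", "CLAUDE-OPUS-4-5");

modelSupportsVision(entry);     // true: "image" is listed in the entry's input
modelSupportsVision(undefined); // false: unknown models are treated as text-only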
@@ -4,6 +4,11 @@ import os from "node:os";
 import path from "node:path";

 import type { ClawdbotConfig } from "../config/config.js";
+import {
+  findModelInCatalog,
+  loadModelCatalog,
+  modelSupportsVision,
+} from "../agents/model-catalog.js";
 import type { MsgContext } from "../auto-reply/templating.js";
 import { applyTemplate } from "../auto-reply/templating.js";
 import { requireApiKey, resolveApiKeyForProvider } from "../agents/model-auth.js";
@@ -986,6 +991,26 @@ export async function runCapability(params: {
     };
   }

+  // Skip image understanding when the primary model supports vision natively.
+  // The image will be injected directly into the model context instead.
+  if (capability === "image" && params.activeModel?.provider) {
+    const catalog = await loadModelCatalog({ config: cfg });
+    const entry = findModelInCatalog(
+      catalog,
+      params.activeModel.provider,
+      params.activeModel.model ?? "",
+    );
+    if (modelSupportsVision(entry)) {
+      if (shouldLogVerbose()) {
+        logVerbose("Skipping image understanding: primary model supports vision natively");
+      }
+      return {
+        outputs: [],
+        decision: { capability, outcome: "skipped", attachments: [] },
+      };
+    }
+  }
+
   const attachmentPolicy = config?.attachments;
   const selected = selectAttachments({
     capability,