fix: skip image understanding when primary model has vision
When the primary model supports vision natively (e.g., Claude Opus 4.5), skip the image understanding call entirely. The image will be injected directly into the model context instead, saving an API call and avoiding redundant descriptions.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Committed by: Peter Steinberger
Parent: 83f92e34af
Commit: fdecf5c59a
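To make the change concrete: the decision described in the commit message reduces to roughly the sketch below. The `shouldSkipImageUnderstanding` wrapper and its parameters are illustrative only and do not exist in the repo; the imported helpers and the `loadModelCatalog({ config })` call are taken from the diff that follows, and the import paths assume the same relative layout as the second changed file.

import type { ClawdbotConfig } from "../config/config.js";
import {
  findModelInCatalog,
  loadModelCatalog,
  modelSupportsVision,
} from "../agents/model-catalog.js";

// Illustrative wrapper, not part of the repo: true when the separate
// image-understanding call can be skipped for the active primary model.
async function shouldSkipImageUnderstanding(
  cfg: ClawdbotConfig,
  provider: string | undefined,
  model: string | undefined,
): Promise<boolean> {
  if (!provider) return false; // no active model info: keep the existing path
  const catalog = await loadModelCatalog({ config: cfg });
  const entry = findModelInCatalog(catalog, provider, model ?? "");
  // A vision-capable primary model gets the image injected into its context
  // directly, so a separate image-understanding API call would be redundant.
  return modelSupportsVision(entry);
}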
@@ -8,6 +8,7 @@ export type ModelCatalogEntry = {
   provider: string;
   contextWindow?: number;
   reasoning?: boolean;
+  input?: Array<"text" | "image">;
 };

 type DiscoveredModel = {
@@ -16,6 +17,7 @@ type DiscoveredModel = {
   provider: string;
   contextWindow?: number;
   reasoning?: boolean;
+  input?: Array<"text" | "image">;
 };

 type PiSdkModule = typeof import("@mariozechner/pi-coding-agent");
@@ -80,7 +82,10 @@ export async function loadModelCatalog(params?: {
         ? entry.contextWindow
         : undefined;
       const reasoning = typeof entry?.reasoning === "boolean" ? entry.reasoning : undefined;
-      models.push({ id, name, provider, contextWindow, reasoning });
+      const input = Array.isArray(entry?.input)
+        ? (entry.input as Array<"text" | "image">)
+        : undefined;
+      models.push({ id, name, provider, contextWindow, reasoning, input });
     }

     if (models.length === 0) {
@@ -105,3 +110,27 @@ export async function loadModelCatalog(params?: {

   return modelCatalogPromise;
 }
+
+/**
+ * Check if a model supports image input based on its catalog entry.
+ */
+export function modelSupportsVision(entry: ModelCatalogEntry | undefined): boolean {
+  return entry?.input?.includes("image") ?? false;
+}
+
+/**
+ * Find a model in the catalog by provider and model ID.
+ */
+export function findModelInCatalog(
+  catalog: ModelCatalogEntry[],
+  provider: string,
+  modelId: string,
+): ModelCatalogEntry | undefined {
+  const normalizedProvider = provider.toLowerCase().trim();
+  const normalizedModelId = modelId.toLowerCase().trim();
+  return catalog.find(
+    (entry) =>
+      entry.provider.toLowerCase() === normalizedProvider &&
+      entry.id.toLowerCase() === normalizedModelId,
+  );
+}
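Before the second file's hunks, a quick illustration of how the two new helpers behave. The catalog entry is hand-written for the example, its `id`/`name` fields are assumed from the `models.push()` call above rather than shown directly in the diff, and the import path is illustrative.

import {
  findModelInCatalog,
  modelSupportsVision,
  type ModelCatalogEntry,
} from "./model-catalog.js";

// Hand-written entry for illustration; real entries come from loadModelCatalog().
const catalog: ModelCatalogEntry[] = [
  {
    id: "claude-opus-4-5",
    name: "Claude Opus 4.5",
    provider: "anthropic",
    input: ["text", "image"],
  },
];

// The provider and model ID arguments are lower-cased and trimmed before comparison.
const entry = findModelInCatalog(catalog, " Anthropic ", "CLAUDE-OPUS-4-5");

modelSupportsVision(entry);     // true: "image" is listed in the entry's input
modelSupportsVision(undefined); // false: unknown models are treated as text-only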
@@ -4,6 +4,11 @@ import os from "node:os";
 import path from "node:path";

 import type { ClawdbotConfig } from "../config/config.js";
+import {
+  findModelInCatalog,
+  loadModelCatalog,
+  modelSupportsVision,
+} from "../agents/model-catalog.js";
 import type { MsgContext } from "../auto-reply/templating.js";
 import { applyTemplate } from "../auto-reply/templating.js";
 import { requireApiKey, resolveApiKeyForProvider } from "../agents/model-auth.js";
@@ -986,6 +991,26 @@ export async function runCapability(params: {
     };
   }

+  // Skip image understanding when the primary model supports vision natively.
+  // The image will be injected directly into the model context instead.
+  if (capability === "image" && params.activeModel?.provider) {
+    const catalog = await loadModelCatalog({ config: cfg });
+    const entry = findModelInCatalog(
+      catalog,
+      params.activeModel.provider,
+      params.activeModel.model ?? "",
+    );
+    if (modelSupportsVision(entry)) {
+      if (shouldLogVerbose()) {
+        logVerbose("Skipping image understanding: primary model supports vision natively");
+      }
+      return {
+        outputs: [],
+        decision: { capability, outcome: "skipped", attachments: [] },
+      };
+    }
+  }
+
   const attachmentPolicy = config?.attachments;
   const selected = selectAttachments({
     capability,