From 5f9863098be0e0fd8839af2e5b64fd7e599040bb Mon Sep 17 00:00:00 2001
From: Peter Steinberger
Date: Sun, 25 Jan 2026 09:56:57 +0000
Subject: [PATCH] fix: skip image understanding for vision models (#1747)

Thanks @tyler6204.

Co-authored-by: Tyler Yust <64381258+tyler6204@users.noreply.github.com>
---
 CHANGELOG.md                            |  1 +
 src/media-understanding/runner.ts       | 56 +++++++++++------
 .../runner.vision-skip.test.ts          | 61 +++++++++++++++++++
 3 files changed, 98 insertions(+), 20 deletions(-)
 create mode 100644 src/media-understanding/runner.vision-skip.test.ts

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 126d379d4..efccc2942 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -38,6 +38,7 @@ Docs: https://docs.clawd.bot
 - Agents: auto-compact on context overflow prompt errors before failing. (#1627) Thanks @rodrigouroz.
 - Agents: use the active auth profile for auto-compaction recovery.
 - Models: default missing custom provider fields so minimal configs are accepted.
+- Media understanding: skip image understanding when the primary model already supports vision. (#1747) Thanks @tyler6204.
 - Gateway: skip Tailscale DNS probing when tailscale.mode is off. (#1671)
 - Gateway: reduce log noise for late invokes + remote node probes; debounce skills refresh. (#1607) Thanks @petter-b.
 - Gateway: clarify Control UI/WebChat auth error hints for missing tokens. (#1690)
diff --git a/src/media-understanding/runner.ts b/src/media-understanding/runner.ts
index 0bff2513e..9e92d67c0 100644
--- a/src/media-understanding/runner.ts
+++ b/src/media-understanding/runner.ts
@@ -991,26 +991,6 @@ export async function runCapability(params: {
     };
   }
 
-  // Skip image understanding when the primary model supports vision natively.
-  // The image will be injected directly into the model context instead.
-  if (capability === "image" && params.activeModel?.provider) {
-    const catalog = await loadModelCatalog({ config: cfg });
-    const entry = findModelInCatalog(
-      catalog,
-      params.activeModel.provider,
-      params.activeModel.model ?? "",
-    );
-    if (modelSupportsVision(entry)) {
-      if (shouldLogVerbose()) {
-        logVerbose("Skipping image understanding: primary model supports vision natively");
-      }
-      return {
-        outputs: [],
-        decision: { capability, outcome: "skipped", attachments: [] },
-      };
-    }
-  }
-
   const attachmentPolicy = config?.attachments;
   const selected = selectAttachments({
     capability,
@@ -1039,6 +1019,42 @@
     };
   }
 
+  // Skip image understanding when the primary model supports vision natively.
+  // The image will be injected directly into the model context instead.
+  const activeProvider = params.activeModel?.provider?.trim();
+  if (capability === "image" && activeProvider) {
+    const catalog = await loadModelCatalog({ config: cfg });
+    const entry = findModelInCatalog(catalog, activeProvider, params.activeModel?.model ?? "");
+    if (modelSupportsVision(entry)) {
+      if (shouldLogVerbose()) {
+        logVerbose("Skipping image understanding: primary model supports vision natively");
+      }
+      const model = params.activeModel?.model?.trim();
+      const reason = "primary model supports vision natively";
+      return {
+        outputs: [],
+        decision: {
+          capability,
+          outcome: "skipped",
+          attachments: selected.map((item) => {
+            const attempt = {
+              type: "provider" as const,
+              provider: activeProvider,
+              model: model || undefined,
+              outcome: "skipped" as const,
+              reason,
+            };
+            return {
+              attachmentIndex: item.index,
+              attempts: [attempt],
+              chosen: attempt,
+            };
+          }),
+        },
+      };
+    }
+  }
+
   const entries = resolveModelEntries({
     cfg,
     capability,
diff --git a/src/media-understanding/runner.vision-skip.test.ts b/src/media-understanding/runner.vision-skip.test.ts
new file mode 100644
index 000000000..7d8371949
--- /dev/null
+++ b/src/media-understanding/runner.vision-skip.test.ts
@@ -0,0 +1,61 @@
+import { describe, expect, it, vi } from "vitest";
+
+import type { MsgContext } from "../auto-reply/templating.js";
+import type { ClawdbotConfig } from "../config/config.js";
+import {
+  buildProviderRegistry,
+  createMediaAttachmentCache,
+  normalizeMediaAttachments,
+  runCapability,
+} from "./runner.js";
+
+const catalog = [
+  {
+    id: "gpt-4.1",
+    name: "GPT-4.1",
+    provider: "openai",
+    input: ["text", "image"] as const,
+  },
+];
+
+vi.mock("../agents/model-catalog.js", async () => {
+  const actual = await vi.importActual(
+    "../agents/model-catalog.js",
+  );
+  return {
+    ...actual,
+    loadModelCatalog: vi.fn(async () => catalog),
+  };
+});
+
+describe("runCapability image skip", () => {
+  it("skips image understanding when the active model supports vision", async () => {
+    const ctx: MsgContext = { MediaPath: "/tmp/image.png", MediaType: "image/png" };
+    const media = normalizeMediaAttachments(ctx);
+    const cache = createMediaAttachmentCache(media);
+    const cfg = {} as ClawdbotConfig;
+
+    try {
+      const result = await runCapability({
+        capability: "image",
+        cfg,
+        ctx,
+        attachments: cache,
+        media,
+        providerRegistry: buildProviderRegistry(),
+        activeModel: { provider: "openai", model: "gpt-4.1" },
+      });
+
+      expect(result.outputs).toHaveLength(0);
+      expect(result.decision.outcome).toBe("skipped");
+      expect(result.decision.attachments).toHaveLength(1);
+      expect(result.decision.attachments[0]?.attachmentIndex).toBe(0);
+      expect(result.decision.attachments[0]?.attempts[0]?.outcome).toBe("skipped");
+      expect(result.decision.attachments[0]?.attempts[0]?.reason).toBe(
+        "primary model supports vision natively",
+      );
+    } finally {
+      await cache.cleanup();
+    }
+  });
+});