fix: skip image understanding for vision models (#1747)
Thanks @tyler6204. Co-authored-by: Tyler Yust <64381258+tyler6204@users.noreply.github.com>
@@ -38,6 +38,7 @@ Docs: https://docs.clawd.bot
 - Agents: auto-compact on context overflow prompt errors before failing. (#1627) Thanks @rodrigouroz.
 - Agents: use the active auth profile for auto-compaction recovery.
 - Models: default missing custom provider fields so minimal configs are accepted.
+- Media understanding: skip image understanding when the primary model already supports vision. (#1747) Thanks @tyler6204.
 - Gateway: skip Tailscale DNS probing when tailscale.mode is off. (#1671)
 - Gateway: reduce log noise for late invokes + remote node probes; debounce skills refresh. (#1607) Thanks @petter-b.
 - Gateway: clarify Control UI/WebChat auth error hints for missing tokens. (#1690)
@@ -991,26 +991,6 @@ export async function runCapability(params: {
     };
   }
 
-  // Skip image understanding when the primary model supports vision natively.
-  // The image will be injected directly into the model context instead.
-  if (capability === "image" && params.activeModel?.provider) {
-    const catalog = await loadModelCatalog({ config: cfg });
-    const entry = findModelInCatalog(
-      catalog,
-      params.activeModel.provider,
-      params.activeModel.model ?? "",
-    );
-    if (modelSupportsVision(entry)) {
-      if (shouldLogVerbose()) {
-        logVerbose("Skipping image understanding: primary model supports vision natively");
-      }
-      return {
-        outputs: [],
-        decision: { capability, outcome: "skipped", attachments: [] },
-      };
-    }
-  }
-
   const attachmentPolicy = config?.attachments;
   const selected = selectAttachments({
     capability,
@@ -1039,6 +1019,42 @@ export async function runCapability(params: {
     };
   }
 
+  // Skip image understanding when the primary model supports vision natively.
+  // The image will be injected directly into the model context instead.
+  const activeProvider = params.activeModel?.provider?.trim();
+  if (capability === "image" && activeProvider) {
+    const catalog = await loadModelCatalog({ config: cfg });
+    const entry = findModelInCatalog(catalog, activeProvider, params.activeModel?.model ?? "");
+    if (modelSupportsVision(entry)) {
+      if (shouldLogVerbose()) {
+        logVerbose("Skipping image understanding: primary model supports vision natively");
+      }
+      const model = params.activeModel?.model?.trim();
+      const reason = "primary model supports vision natively";
+      return {
+        outputs: [],
+        decision: {
+          capability,
+          outcome: "skipped",
+          attachments: selected.map((item) => {
+            const attempt = {
+              type: "provider" as const,
+              provider: activeProvider,
+              model: model || undefined,
+              outcome: "skipped" as const,
+              reason,
+            };
+            return {
+              attachmentIndex: item.index,
+              attempts: [attempt],
+              chosen: attempt,
+            };
+          }),
+        },
+      };
+    }
+  }
+
   const entries = resolveModelEntries({
     cfg,
     capability,
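Note: modelSupportsVision and findModelInCatalog live elsewhere in the repo and are not part of this diff. Going only by the catalog entry used in the test below (input: ["text", "image"]), a minimal sketch of what such a vision check could look like follows; the ModelCatalogEntry shape and its input field are assumptions for illustration, not the repo's actual types.

// Hypothetical sketch only: treat a catalog entry as vision-capable when its
// declared input modalities include "image". Shape and field names are assumed.
type ModelCatalogEntry = {
  id: string;
  provider: string;
  input?: readonly string[];
};

function modelSupportsVision(entry: ModelCatalogEntry | undefined): boolean {
  return entry?.input?.includes("image") ?? false;
}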
src/media-understanding/runner.vision-skip.test.ts (new file, 61 lines)
@@ -0,0 +1,61 @@
+import { describe, expect, it, vi } from "vitest";
+
+import type { MsgContext } from "../auto-reply/templating.js";
+import type { ClawdbotConfig } from "../config/config.js";
+import {
+  buildProviderRegistry,
+  createMediaAttachmentCache,
+  normalizeMediaAttachments,
+  runCapability,
+} from "./runner.js";
+
+const catalog = [
+  {
+    id: "gpt-4.1",
+    name: "GPT-4.1",
+    provider: "openai",
+    input: ["text", "image"] as const,
+  },
+];
+
+vi.mock("../agents/model-catalog.js", async () => {
+  const actual = await vi.importActual<typeof import("../agents/model-catalog.js")>(
+    "../agents/model-catalog.js",
+  );
+  return {
+    ...actual,
+    loadModelCatalog: vi.fn(async () => catalog),
+  };
+});
+
+describe("runCapability image skip", () => {
+  it("skips image understanding when the active model supports vision", async () => {
+    const ctx: MsgContext = { MediaPath: "/tmp/image.png", MediaType: "image/png" };
+    const media = normalizeMediaAttachments(ctx);
+    const cache = createMediaAttachmentCache(media);
+    const cfg = {} as ClawdbotConfig;
+
+    try {
+      const result = await runCapability({
+        capability: "image",
+        cfg,
+        ctx,
+        attachments: cache,
+        media,
+        providerRegistry: buildProviderRegistry(),
+        activeModel: { provider: "openai", model: "gpt-4.1" },
+      });
+
+      expect(result.outputs).toHaveLength(0);
+      expect(result.decision.outcome).toBe("skipped");
+      expect(result.decision.attachments).toHaveLength(1);
+      expect(result.decision.attachments[0]?.attachmentIndex).toBe(0);
+      expect(result.decision.attachments[0]?.attempts[0]?.outcome).toBe("skipped");
+      expect(result.decision.attachments[0]?.attempts[0]?.reason).toBe(
+        "primary model supports vision natively",
+      );
+    } finally {
+      await cache.cleanup();
+    }
+  });
+});
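Assuming the repo uses the standard Vitest CLI, this spec can be run on its own with something like: npx vitest run src/media-understanding/runner.vision-skip.test.ts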