From 5f9863098be0e0fd8839af2e5b64fd7e599040bb Mon Sep 17 00:00:00 2001
From: Peter Steinberger
Date: Sun, 25 Jan 2026 09:56:57 +0000
Subject: [PATCH] fix: skip image understanding for vision models (#1747)

Thanks @tyler6204.

Co-authored-by: Tyler Yust <64381258+tyler6204@users.noreply.github.com>
---
 CHANGELOG.md                            |  1 +
 src/media-understanding/runner.ts       | 56 +++++++++++------
 .../runner.vision-skip.test.ts          | 61 +++++++++++++++++++
 3 files changed, 98 insertions(+), 20 deletions(-)
 create mode 100644 src/media-understanding/runner.vision-skip.test.ts

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 126d379d4..efccc2942 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -38,6 +38,7 @@ Docs: https://docs.clawd.bot
 - Agents: auto-compact on context overflow prompt errors before failing. (#1627) Thanks @rodrigouroz.
 - Agents: use the active auth profile for auto-compaction recovery.
 - Models: default missing custom provider fields so minimal configs are accepted.
+- Media understanding: skip image understanding when the primary model already supports vision. (#1747) Thanks @tyler6204.
 - Gateway: skip Tailscale DNS probing when tailscale.mode is off. (#1671)
 - Gateway: reduce log noise for late invokes + remote node probes; debounce skills refresh. (#1607) Thanks @petter-b.
 - Gateway: clarify Control UI/WebChat auth error hints for missing tokens. (#1690)
diff --git a/src/media-understanding/runner.ts b/src/media-understanding/runner.ts
index 0bff2513e..9e92d67c0 100644
--- a/src/media-understanding/runner.ts
+++ b/src/media-understanding/runner.ts
@@ -991,26 +991,6 @@ export async function runCapability(params: {
     };
   }
 
-  // Skip image understanding when the primary model supports vision natively.
-  // The image will be injected directly into the model context instead.
-  if (capability === "image" && params.activeModel?.provider) {
-    const catalog = await loadModelCatalog({ config: cfg });
-    const entry = findModelInCatalog(
-      catalog,
-      params.activeModel.provider,
-      params.activeModel.model ?? "",
-    );
-    if (modelSupportsVision(entry)) {
-      if (shouldLogVerbose()) {
-        logVerbose("Skipping image understanding: primary model supports vision natively");
-      }
-      return {
-        outputs: [],
-        decision: { capability, outcome: "skipped", attachments: [] },
-      };
-    }
-  }
-
   const attachmentPolicy = config?.attachments;
   const selected = selectAttachments({
     capability,
@@ -1039,6 +1019,42 @@
     };
   }
 
+  // Skip image understanding when the primary model supports vision natively.
+  // The image will be injected directly into the model context instead.
+  const activeProvider = params.activeModel?.provider?.trim();
+  if (capability === "image" && activeProvider) {
+    const catalog = await loadModelCatalog({ config: cfg });
+    const entry = findModelInCatalog(catalog, activeProvider, params.activeModel?.model ?? "");
+    if (modelSupportsVision(entry)) {
+      if (shouldLogVerbose()) {
+        logVerbose("Skipping image understanding: primary model supports vision natively");
+      }
+      const model = params.activeModel?.model?.trim();
+      const reason = "primary model supports vision natively";
+      return {
+        outputs: [],
+        decision: {
+          capability,
+          outcome: "skipped",
+          attachments: selected.map((item) => {
+            const attempt = {
+              type: "provider" as const,
+              provider: activeProvider,
+              model: model || undefined,
+              outcome: "skipped" as const,
+              reason,
+            };
+            return {
+              attachmentIndex: item.index,
+              attempts: [attempt],
+              chosen: attempt,
+            };
+          }),
+        },
+      };
+    }
+  }
+
   const entries = resolveModelEntries({
     cfg,
     capability,
diff --git a/src/media-understanding/runner.vision-skip.test.ts b/src/media-understanding/runner.vision-skip.test.ts
new file mode 100644
index 000000000..7d8371949
--- /dev/null
+++ b/src/media-understanding/runner.vision-skip.test.ts
@@ -0,0 +1,61 @@
+import { describe, expect, it, vi } from "vitest";
+
+import type { MsgContext } from "../auto-reply/templating.js";
+import type { ClawdbotConfig } from "../config/config.js";
+import {
+  buildProviderRegistry,
+  createMediaAttachmentCache,
+  normalizeMediaAttachments,
+  runCapability,
+} from "./runner.js";
+
+const catalog = [
+  {
+    id: "gpt-4.1",
+    name: "GPT-4.1",
+    provider: "openai",
+    input: ["text", "image"] as const,
+  },
+];
+
+vi.mock("../agents/model-catalog.js", async () => {
+  const actual = await vi.importActual(
+    "../agents/model-catalog.js",
+  );
+  return {
+    ...actual,
+    loadModelCatalog: vi.fn(async () => catalog),
+  };
+});
+
+describe("runCapability image skip", () => {
+  it("skips image understanding when the active model supports vision", async () => {
+    const ctx: MsgContext = { MediaPath: "/tmp/image.png", MediaType: "image/png" };
+    const media = normalizeMediaAttachments(ctx);
+    const cache = createMediaAttachmentCache(media);
+    const cfg = {} as ClawdbotConfig;
+
+    try {
+      const result = await runCapability({
+        capability: "image",
+        cfg,
+        ctx,
+        attachments: cache,
+        media,
+        providerRegistry: buildProviderRegistry(),
+        activeModel: { provider: "openai", model: "gpt-4.1" },
+      });
+
+      expect(result.outputs).toHaveLength(0);
+      expect(result.decision.outcome).toBe("skipped");
+      expect(result.decision.attachments).toHaveLength(1);
+      expect(result.decision.attachments[0]?.attachmentIndex).toBe(0);
+      expect(result.decision.attachments[0]?.attempts[0]?.outcome).toBe("skipped");
+      expect(result.decision.attachments[0]?.attempts[0]?.reason).toBe(
+        "primary model supports vision natively",
+      );
+    } finally {
+      await cache.cleanup();
+    }
+  });
+});