fix: skip image understanding for vision models (#1747)

Thanks @tyler6204.

Co-authored-by: Tyler Yust <64381258+tyler6204@users.noreply.github.com>
Author: Peter Steinberger
Date: 2026-01-25 09:56:57 +00:00
Parent: fdecf5c59a
Commit: 5f9863098b
3 changed files with 98 additions and 20 deletions


@@ -38,6 +38,7 @@ Docs: https://docs.clawd.bot
- Agents: auto-compact on context overflow prompt errors before failing. (#1627) Thanks @rodrigouroz.
- Agents: use the active auth profile for auto-compaction recovery.
- Models: default missing custom provider fields so minimal configs are accepted.
- Media understanding: skip image understanding when the primary model already supports vision. (#1747) Thanks @tyler6204.
- Gateway: skip Tailscale DNS probing when tailscale.mode is off. (#1671)
- Gateway: reduce log noise for late invokes + remote node probes; debounce skills refresh. (#1607) Thanks @petter-b.
- Gateway: clarify Control UI/WebChat auth error hints for missing tokens. (#1690)


@@ -991,26 +991,6 @@ export async function runCapability(params: {
    };
  }
  // Skip image understanding when the primary model supports vision natively.
  // The image will be injected directly into the model context instead.
  if (capability === "image" && params.activeModel?.provider) {
    const catalog = await loadModelCatalog({ config: cfg });
    const entry = findModelInCatalog(
      catalog,
      params.activeModel.provider,
      params.activeModel.model ?? "",
    );
    if (modelSupportsVision(entry)) {
      if (shouldLogVerbose()) {
        logVerbose("Skipping image understanding: primary model supports vision natively");
      }
      return {
        outputs: [],
        decision: { capability, outcome: "skipped", attachments: [] },
      };
    }
  }
  const attachmentPolicy = config?.attachments;
  const selected = selectAttachments({
    capability,
@@ -1039,6 +1019,42 @@ export async function runCapability(params: {
    };
  }
  // Skip image understanding when the primary model supports vision natively.
  // The image will be injected directly into the model context instead.
  const activeProvider = params.activeModel?.provider?.trim();
  if (capability === "image" && activeProvider) {
    const catalog = await loadModelCatalog({ config: cfg });
    const entry = findModelInCatalog(catalog, activeProvider, params.activeModel?.model ?? "");
    if (modelSupportsVision(entry)) {
      if (shouldLogVerbose()) {
        logVerbose("Skipping image understanding: primary model supports vision natively");
      }
      const model = params.activeModel?.model?.trim();
      const reason = "primary model supports vision natively";
      return {
        outputs: [],
        decision: {
          capability,
          outcome: "skipped",
          attachments: selected.map((item) => {
            const attempt = {
              type: "provider" as const,
              provider: activeProvider,
              model: model || undefined,
              outcome: "skipped" as const,
              reason,
            };
            return {
              attachmentIndex: item.index,
              attempts: [attempt],
              chosen: attempt,
            };
          }),
        },
      };
    }
  }
  const entries = resolveModelEntries({
    cfg,
    capability,

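For reference, the skip path above returns a decision of roughly this shape (a sketch inferred from the diff; these type names are not exported by this commit and are only illustrative):

// Hypothetical types, inferred from the skip path above; field names beyond
// those shown in the diff are not guaranteed by this commit.
type SkippedImageAttempt = {
  type: "provider";
  provider: string;
  model?: string;
  outcome: "skipped";
  reason: string;
};

type SkippedImageDecision = {
  capability: "image";
  outcome: "skipped";
  attachments: Array<{
    attachmentIndex: number;
    attempts: SkippedImageAttempt[];
    chosen: SkippedImageAttempt;
  }>;
};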

@@ -0,0 +1,61 @@
import { describe, expect, it, vi } from "vitest";

import type { MsgContext } from "../auto-reply/templating.js";
import type { ClawdbotConfig } from "../config/config.js";
import {
  buildProviderRegistry,
  createMediaAttachmentCache,
  normalizeMediaAttachments,
  runCapability,
} from "./runner.js";

const catalog = [
  {
    id: "gpt-4.1",
    name: "GPT-4.1",
    provider: "openai",
    input: ["text", "image"] as const,
  },
];

vi.mock("../agents/model-catalog.js", async () => {
  const actual = await vi.importActual<typeof import("../agents/model-catalog.js")>(
    "../agents/model-catalog.js",
  );
  return {
    ...actual,
    loadModelCatalog: vi.fn(async () => catalog),
  };
});

describe("runCapability image skip", () => {
  it("skips image understanding when the active model supports vision", async () => {
    const ctx: MsgContext = { MediaPath: "/tmp/image.png", MediaType: "image/png" };
    const media = normalizeMediaAttachments(ctx);
    const cache = createMediaAttachmentCache(media);
    const cfg = {} as ClawdbotConfig;

    try {
      const result = await runCapability({
        capability: "image",
        cfg,
        ctx,
        attachments: cache,
        media,
        providerRegistry: buildProviderRegistry(),
        activeModel: { provider: "openai", model: "gpt-4.1" },
      });

      expect(result.outputs).toHaveLength(0);
      expect(result.decision.outcome).toBe("skipped");
      expect(result.decision.attachments).toHaveLength(1);
      expect(result.decision.attachments[0]?.attachmentIndex).toBe(0);
      expect(result.decision.attachments[0]?.attempts[0]?.outcome).toBe("skipped");
      expect(result.decision.attachments[0]?.attempts[0]?.reason).toBe(
        "primary model supports vision natively",
      );
    } finally {
      await cache.cleanup();
    }
  });
});
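The catalog fixture in this test marks gpt-4.1 as accepting "image" input, which is presumably what modelSupportsVision keys on. A minimal sketch of that check, assuming catalog entries expose an input capability list like the fixture's (the real implementation lives in the model-catalog module and may differ):

// Hypothetical sketch, not the actual modelSupportsVision implementation;
// assumes catalog entries carry an `input` list like the fixture above.
function modelSupportsVisionSketch(entry?: { input?: readonly string[] }): boolean {
  return entry?.input?.includes("image") ?? false;
}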