fix: skip image understanding for vision models (#1747)
Thanks @tyler6204. Co-authored-by: Tyler Yust <64381258+tyler6204@users.noreply.github.com>
This commit is contained in:
@@ -38,6 +38,7 @@ Docs: https://docs.clawd.bot
|
||||
- Agents: auto-compact on context overflow prompt errors before failing. (#1627) Thanks @rodrigouroz.
|
||||
- Agents: use the active auth profile for auto-compaction recovery.
|
||||
- Models: default missing custom provider fields so minimal configs are accepted.
|
||||
- Media understanding: skip image understanding when the primary model already supports vision. (#1747) Thanks @tyler6204.
|
||||
- Gateway: skip Tailscale DNS probing when tailscale.mode is off. (#1671)
|
||||
- Gateway: reduce log noise for late invokes + remote node probes; debounce skills refresh. (#1607) Thanks @petter-b.
|
||||
- Gateway: clarify Control UI/WebChat auth error hints for missing tokens. (#1690)
|
||||
|
||||
@@ -991,26 +991,6 @@ export async function runCapability(params: {
|
||||
};
|
||||
}
|
||||
|
||||
// Skip image understanding when the primary model supports vision natively.
|
||||
// The image will be injected directly into the model context instead.
|
||||
if (capability === "image" && params.activeModel?.provider) {
|
||||
const catalog = await loadModelCatalog({ config: cfg });
|
||||
const entry = findModelInCatalog(
|
||||
catalog,
|
||||
params.activeModel.provider,
|
||||
params.activeModel.model ?? "",
|
||||
);
|
||||
if (modelSupportsVision(entry)) {
|
||||
if (shouldLogVerbose()) {
|
||||
logVerbose("Skipping image understanding: primary model supports vision natively");
|
||||
}
|
||||
return {
|
||||
outputs: [],
|
||||
decision: { capability, outcome: "skipped", attachments: [] },
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
const attachmentPolicy = config?.attachments;
|
||||
const selected = selectAttachments({
|
||||
capability,
|
||||
@@ -1039,6 +1019,42 @@ export async function runCapability(params: {
|
||||
};
|
||||
}
|
||||
|
||||
// Skip image understanding when the primary model supports vision natively.
|
||||
// The image will be injected directly into the model context instead.
|
||||
const activeProvider = params.activeModel?.provider?.trim();
|
||||
if (capability === "image" && activeProvider) {
|
||||
const catalog = await loadModelCatalog({ config: cfg });
|
||||
const entry = findModelInCatalog(catalog, activeProvider, params.activeModel?.model ?? "");
|
||||
if (modelSupportsVision(entry)) {
|
||||
if (shouldLogVerbose()) {
|
||||
logVerbose("Skipping image understanding: primary model supports vision natively");
|
||||
}
|
||||
const model = params.activeModel?.model?.trim();
|
||||
const reason = "primary model supports vision natively";
|
||||
return {
|
||||
outputs: [],
|
||||
decision: {
|
||||
capability,
|
||||
outcome: "skipped",
|
||||
attachments: selected.map((item) => {
|
||||
const attempt = {
|
||||
type: "provider" as const,
|
||||
provider: activeProvider,
|
||||
model: model || undefined,
|
||||
outcome: "skipped" as const,
|
||||
reason,
|
||||
};
|
||||
return {
|
||||
attachmentIndex: item.index,
|
||||
attempts: [attempt],
|
||||
chosen: attempt,
|
||||
};
|
||||
}),
|
||||
},
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
const entries = resolveModelEntries({
|
||||
cfg,
|
||||
capability,
|
||||
|
||||
61
src/media-understanding/runner.vision-skip.test.ts
Normal file
61
src/media-understanding/runner.vision-skip.test.ts
Normal file
@@ -0,0 +1,61 @@
|
||||
import { describe, expect, it, vi } from "vitest";
|
||||
|
||||
import type { MsgContext } from "../auto-reply/templating.js";
|
||||
import type { ClawdbotConfig } from "../config/config.js";
|
||||
import {
|
||||
buildProviderRegistry,
|
||||
createMediaAttachmentCache,
|
||||
normalizeMediaAttachments,
|
||||
runCapability,
|
||||
} from "./runner.js";
|
||||
|
||||
// Stub model catalog: a single entry whose `input` list includes "image".
// The mocked loadModelCatalog in this file resolves to this array.
const catalog = [
|
||||
{
|
||||
id: "gpt-4.1",
|
||||
name: "GPT-4.1",
|
||||
provider: "openai",
|
||||
input: ["text", "image"] as const,
|
||||
},
|
||||
];
|
||||
|
||||
// Partially mock the model-catalog module: every real export is kept via
// vi.importActual, and only loadModelCatalog is replaced with a stub that
// resolves to the local `catalog` array.
vi.mock("../agents/model-catalog.js", async () => {
|
||||
const actual = await vi.importActual<typeof import("../agents/model-catalog.js")>(
|
||||
"../agents/model-catalog.js",
|
||||
);
|
||||
return {
|
||||
...actual,
|
||||
loadModelCatalog: vi.fn(async () => catalog),
|
||||
};
|
||||
});
|
||||
|
||||
// Checks the result runCapability returns for the "image" capability when the
// active model matches a catalog entry that lists "image" as an input.
describe("runCapability image skip", () => {
|
||||
it("skips image understanding when the active model supports vision", async () => {
|
||||
// Single PNG attachment described through the message context.
const ctx: MsgContext = { MediaPath: "/tmp/image.png", MediaType: "image/png" };
|
||||
const media = normalizeMediaAttachments(ctx);
|
||||
const cache = createMediaAttachmentCache(media);
|
||||
// Empty object cast to ClawdbotConfig; no config fields are set for this test.
const cfg = {} as ClawdbotConfig;
|
||||
|
||||
// The finally clause calls cache.cleanup() even if an assertion below throws.
try {
|
||||
const result = await runCapability({
|
||||
capability: "image",
|
||||
cfg,
|
||||
ctx,
|
||||
attachments: cache,
|
||||
media,
|
||||
providerRegistry: buildProviderRegistry(),
|
||||
// Same provider and model id as the single entry in the stub catalog.
activeModel: { provider: "openai", model: "gpt-4.1" },
|
||||
});
|
||||
|
||||
// Expected: no outputs, a "skipped" decision, and exactly one attachment
// record (index 0) whose first attempt is "skipped" with the fixed reason text.
expect(result.outputs).toHaveLength(0);
|
||||
expect(result.decision.outcome).toBe("skipped");
|
||||
expect(result.decision.attachments).toHaveLength(1);
|
||||
expect(result.decision.attachments[0]?.attachmentIndex).toBe(0);
|
||||
expect(result.decision.attachments[0]?.attempts[0]?.outcome).toBe("skipped");
|
||||
expect(result.decision.attachments[0]?.attempts[0]?.reason).toBe(
|
||||
"primary model supports vision natively",
|
||||
);
|
||||
} finally {
|
||||
await cache.cleanup();
|
||||
}
|
||||
});
|
||||
});
|
||||
Reference in New Issue
Block a user