fix: skip image understanding for vision models (#1747)
Thanks @tyler6204. Co-authored-by: Tyler Yust <64381258+tyler6204@users.noreply.github.com>
@@ -38,6 +38,7 @@ Docs: https://docs.clawd.bot
 - Agents: auto-compact on context overflow prompt errors before failing. (#1627) Thanks @rodrigouroz.
 - Agents: use the active auth profile for auto-compaction recovery.
 - Models: default missing custom provider fields so minimal configs are accepted.
+- Media understanding: skip image understanding when the primary model already supports vision. (#1747) Thanks @tyler6204.
 - Gateway: skip Tailscale DNS probing when tailscale.mode is off. (#1671)
 - Gateway: reduce log noise for late invokes + remote node probes; debounce skills refresh. (#1607) Thanks @petter-b.
 - Gateway: clarify Control UI/WebChat auth error hints for missing tokens. (#1690)
@@ -991,26 +991,6 @@ export async function runCapability(params: {
     };
   }
 
-  // Skip image understanding when the primary model supports vision natively.
-  // The image will be injected directly into the model context instead.
-  if (capability === "image" && params.activeModel?.provider) {
-    const catalog = await loadModelCatalog({ config: cfg });
-    const entry = findModelInCatalog(
-      catalog,
-      params.activeModel.provider,
-      params.activeModel.model ?? "",
-    );
-    if (modelSupportsVision(entry)) {
-      if (shouldLogVerbose()) {
-        logVerbose("Skipping image understanding: primary model supports vision natively");
-      }
-      return {
-        outputs: [],
-        decision: { capability, outcome: "skipped", attachments: [] },
-      };
-    }
-  }
-
   const attachmentPolicy = config?.attachments;
   const selected = selectAttachments({
     capability,
@@ -1039,6 +1019,42 @@ export async function runCapability(params: {
     };
   }
 
+  // Skip image understanding when the primary model supports vision natively.
+  // The image will be injected directly into the model context instead.
+  const activeProvider = params.activeModel?.provider?.trim();
+  if (capability === "image" && activeProvider) {
+    const catalog = await loadModelCatalog({ config: cfg });
+    const entry = findModelInCatalog(catalog, activeProvider, params.activeModel?.model ?? "");
+    if (modelSupportsVision(entry)) {
+      if (shouldLogVerbose()) {
+        logVerbose("Skipping image understanding: primary model supports vision natively");
+      }
+      const model = params.activeModel?.model?.trim();
+      const reason = "primary model supports vision natively";
+      return {
+        outputs: [],
+        decision: {
+          capability,
+          outcome: "skipped",
+          attachments: selected.map((item) => {
+            const attempt = {
+              type: "provider" as const,
+              provider: activeProvider,
+              model: model || undefined,
+              outcome: "skipped" as const,
+              reason,
+            };
+            return {
+              attachmentIndex: item.index,
+              attempts: [attempt],
+              chosen: attempt,
+            };
+          }),
+        },
+      };
+    }
+  }
+
   const entries = resolveModelEntries({
     cfg,
     capability,
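Note: modelSupportsVision and findModelInCatalog live elsewhere in the repo and are not part of this diff. Going only by the catalog entry used in the test below (input: ["text", "image"]), a minimal sketch of what such a vision check could look like follows; the ModelCatalogEntry shape and its input field are assumptions for illustration, not the repo's actual types.

// Hypothetical sketch only: treat a catalog entry as vision-capable when its
// declared input modalities include "image". Shape and field names are assumed.
type ModelCatalogEntry = {
  id: string;
  provider: string;
  input?: readonly string[];
};

function modelSupportsVision(entry: ModelCatalogEntry | undefined): boolean {
  return entry?.input?.includes("image") ?? false;
}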
src/media-understanding/runner.vision-skip.test.ts (new file, 61 lines)
@@ -0,0 +1,61 @@
+import { describe, expect, it, vi } from "vitest";
+
+import type { MsgContext } from "../auto-reply/templating.js";
+import type { ClawdbotConfig } from "../config/config.js";
+import {
+  buildProviderRegistry,
+  createMediaAttachmentCache,
+  normalizeMediaAttachments,
+  runCapability,
+} from "./runner.js";
+
+const catalog = [
+  {
+    id: "gpt-4.1",
+    name: "GPT-4.1",
+    provider: "openai",
+    input: ["text", "image"] as const,
+  },
+];
+
+vi.mock("../agents/model-catalog.js", async () => {
+  const actual = await vi.importActual<typeof import("../agents/model-catalog.js")>(
+    "../agents/model-catalog.js",
+  );
+  return {
+    ...actual,
+    loadModelCatalog: vi.fn(async () => catalog),
+  };
+});
+
+describe("runCapability image skip", () => {
+  it("skips image understanding when the active model supports vision", async () => {
+    const ctx: MsgContext = { MediaPath: "/tmp/image.png", MediaType: "image/png" };
+    const media = normalizeMediaAttachments(ctx);
+    const cache = createMediaAttachmentCache(media);
+    const cfg = {} as ClawdbotConfig;
+
+    try {
+      const result = await runCapability({
+        capability: "image",
+        cfg,
+        ctx,
+        attachments: cache,
+        media,
+        providerRegistry: buildProviderRegistry(),
+        activeModel: { provider: "openai", model: "gpt-4.1" },
+      });
+
+      expect(result.outputs).toHaveLength(0);
+      expect(result.decision.outcome).toBe("skipped");
+      expect(result.decision.attachments).toHaveLength(1);
+      expect(result.decision.attachments[0]?.attachmentIndex).toBe(0);
+      expect(result.decision.attachments[0]?.attempts[0]?.outcome).toBe("skipped");
+      expect(result.decision.attachments[0]?.attempts[0]?.reason).toBe(
+        "primary model supports vision natively",
+      );
+    } finally {
+      await cache.cleanup();
+    }
+  });
+});
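Assuming the repo uses the standard Vitest CLI, this spec can be run on its own with something like: npx vitest run src/media-understanding/runner.vision-skip.test.ts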