refactor: tighten media diagnostics

Peter Steinberger
2026-01-17 07:27:38 +00:00
parent 0c0e1e4226
commit 2ee45d50a4
5 changed files with 252 additions and 10 deletions


@@ -256,6 +256,15 @@ When `mode: "all"`, outputs are labeled `[Image 1/2]`, `[Audio 2/2]`, etc.
}
```
## Status output
When media understanding runs, `/status` includes a short summary line:
```
📎 Media: image ok (openai/gpt-5.2) · audio skipped (maxBytes)
```
This shows per-capability outcomes and the chosen provider/model when applicable.
## Notes
- Understanding is **best-effort**. Errors do not block replies.
- Attachments are still passed to models even when understanding is disabled.
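
For reference, the per-capability decision data that both the status summary and the attachment notes build on looks roughly like this. This is a sketch assembled from the test fixtures later in this commit; the actual type names, additional outcome values, and field optionality in the codebase may differ.

```ts
// Sketch only: decision shape as exercised by the tests in this commit.
type MediaUnderstandingAttempt = {
  type: "provider" | "cli";        // "cli" is inferred from the config examples (assumption)
  outcome: "success" | "skipped";  // an "error" outcome likely exists too; errors never block replies
  provider?: string;               // e.g. "openai"
  model?: string;                  // e.g. "gpt-5.2"
  reason?: string;                 // e.g. "maxBytes: too large"
};

type MediaUnderstandingDecision = {
  capability: "image" | "audio" | "video";
  outcome: "success" | "skipped";
  attachments: Array<{
    attachmentIndex: number;             // index into MediaPaths/MediaUrls, as supplied by the channel
    attempts: MediaUnderstandingAttempt[];
    chosen?: MediaUnderstandingAttempt;  // the attempt whose output was used, when one succeeded
  }>;
};
```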


@@ -41,4 +41,69 @@ describe("buildInboundMediaNote", () => {
});
expect(note).toBe("[media attached: /tmp/b.png | https://example.com/b.png]");
});
it("only suppresses attachments when media understanding succeeded", () => {
const note = buildInboundMediaNote({
MediaPaths: ["/tmp/a.png", "/tmp/b.png"],
MediaUrls: ["https://example.com/a.png", "https://example.com/b.png"],
MediaUnderstandingDecisions: [
{
capability: "image",
outcome: "skipped",
attachments: [
{
attachmentIndex: 0,
attempts: [
{
type: "provider",
outcome: "skipped",
reason: "maxBytes: too large",
},
],
},
],
},
],
});
expect(note).toBe(
[
"[media attached: 2 files]",
"[media attached 1/2: /tmp/a.png | https://example.com/a.png]",
"[media attached 2/2: /tmp/b.png | https://example.com/b.png]",
].join("\n"),
);
});
it("suppresses attachments when media understanding succeeds via decisions", () => {
const note = buildInboundMediaNote({
MediaPaths: ["/tmp/a.png", "/tmp/b.png"],
MediaUrls: ["https://example.com/a.png", "https://example.com/b.png"],
MediaUnderstandingDecisions: [
{
capability: "image",
outcome: "success",
attachments: [
{
attachmentIndex: 0,
attempts: [
{
type: "provider",
outcome: "success",
provider: "openai",
model: "gpt-5.2",
},
],
chosen: {
type: "provider",
outcome: "success",
provider: "openai",
model: "gpt-5.2",
},
},
],
},
],
});
expect(note).toBe("[media attached: /tmp/b.png | https://example.com/b.png]");
});
});


@@ -19,11 +19,22 @@ function formatMediaAttachedLine(params: {
export function buildInboundMediaNote(ctx: MsgContext): string | undefined {
// Attachment indices follow MediaPaths/MediaUrls ordering as supplied by the channel.
const suppressed = new Set(
Array.isArray(ctx.MediaUnderstanding)
? ctx.MediaUnderstanding.map((output) => output.attachmentIndex)
: [],
);
const suppressed = new Set<number>();
if (Array.isArray(ctx.MediaUnderstanding)) {
for (const output of ctx.MediaUnderstanding) {
suppressed.add(output.attachmentIndex);
}
}
if (Array.isArray(ctx.MediaUnderstandingDecisions)) {
for (const decision of ctx.MediaUnderstandingDecisions) {
if (decision.outcome !== "success") continue;
for (const attachment of decision.attachments) {
if (attachment.chosen?.outcome === "success") {
suppressed.add(attachment.attachmentIndex);
}
}
}
}
const pathsFromArray = Array.isArray(ctx.MediaPaths) ? ctx.MediaPaths : undefined;
const paths =
pathsFromArray && pathsFromArray.length > 0


@@ -0,0 +1,136 @@
import { describe, expect, it } from "vitest";
import type { ClawdbotConfig } from "../config/config.js";
import {
resolveEntriesWithActiveFallback,
resolveModelEntries,
} from "./resolve.js";
const providerRegistry = new Map([
["openai", { capabilities: ["image"] }],
["groq", { capabilities: ["audio"] }],
]);
describe("resolveModelEntries", () => {
it("uses provider capabilities for shared entries without explicit caps", () => {
const cfg: ClawdbotConfig = {
tools: {
media: {
models: [{ provider: "openai", model: "gpt-5.2" }],
},
},
};
const imageEntries = resolveModelEntries({
cfg,
capability: "image",
providerRegistry,
});
expect(imageEntries).toHaveLength(1);
const audioEntries = resolveModelEntries({
cfg,
capability: "audio",
providerRegistry,
});
expect(audioEntries).toHaveLength(0);
});
it("keeps per-capability entries even without explicit caps", () => {
const cfg: ClawdbotConfig = {
tools: {
media: {
image: {
models: [{ provider: "openai", model: "gpt-5.2" }],
},
},
},
};
const imageEntries = resolveModelEntries({
cfg,
capability: "image",
config: cfg.tools?.media?.image,
providerRegistry,
});
expect(imageEntries).toHaveLength(1);
});
it("skips shared CLI entries without capabilities", () => {
const cfg: ClawdbotConfig = {
tools: {
media: {
models: [{ type: "cli", command: "gemini", args: ["--file", "{{MediaPath}}"] }],
},
},
};
const entries = resolveModelEntries({
cfg,
capability: "image",
providerRegistry,
});
expect(entries).toHaveLength(0);
});
});
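
Reading these three cases together, shared `tools.media.models` entries appear to inherit capabilities from the provider registry, per-capability blocks are trusted as already scoped, and shared CLI entries without explicit caps are dropped. A rough sketch of that filter, written here as an assumption since resolve.ts itself is not part of this diff:

```ts
// Sketch only: how a shared entry might be matched against a capability.
type MediaEntry =
  | { provider: string; model: string; capabilities?: string[] }
  | { type: "cli"; command: string; args: string[]; capabilities?: string[] };

function entryMatchesCapability(
  entry: MediaEntry,
  capability: string,
  providerRegistry: Map<string, { capabilities: string[] }>,
  isSharedEntry: boolean,
): boolean {
  // Explicit caps always decide.
  if (entry.capabilities) return entry.capabilities.includes(capability);
  // Per-capability blocks (tools.media.image.models, ...) are already scoped to one capability.
  if (!isSharedEntry) return true;
  // Shared provider entries fall back to the provider's registered capabilities.
  if ("provider" in entry) {
    return providerRegistry.get(entry.provider)?.capabilities.includes(capability) ?? false;
  }
  // Shared CLI entries have no provider to consult, so they are skipped.
  return false;
}
```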
describe("resolveEntriesWithActiveFallback", () => {
it("uses active model when enabled and no models are configured", () => {
const cfg: ClawdbotConfig = {
tools: {
media: {
audio: { enabled: true },
},
},
};
const entries = resolveEntriesWithActiveFallback({
cfg,
capability: "audio",
config: cfg.tools?.media?.audio,
providerRegistry,
activeModel: { provider: "groq", model: "whisper-large-v3" },
});
expect(entries).toHaveLength(1);
expect(entries[0]?.provider).toBe("groq");
});
it("ignores active model when configured entries exist", () => {
const cfg: ClawdbotConfig = {
tools: {
media: {
audio: { enabled: true, models: [{ provider: "openai", model: "whisper-1" }] },
},
},
};
const entries = resolveEntriesWithActiveFallback({
cfg,
capability: "audio",
config: cfg.tools?.media?.audio,
providerRegistry,
activeModel: { provider: "groq", model: "whisper-large-v3" },
});
expect(entries).toHaveLength(1);
expect(entries[0]?.provider).toBe("openai");
});
it("skips active model when provider lacks capability", () => {
const cfg: ClawdbotConfig = {
tools: {
media: {
video: { enabled: true },
},
},
};
const entries = resolveEntriesWithActiveFallback({
cfg,
capability: "video",
config: cfg.tools?.media?.video,
providerRegistry,
activeModel: { provider: "groq", model: "whisper-large-v3" },
});
expect(entries).toHaveLength(0);
});
});
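
The active-model fallback these tests pin down looks roughly like the rule below: explicit config wins, otherwise the active session model is used only when the capability is enabled and its provider advertises that capability. Again a sketch under that assumption, not the actual resolveEntriesWithActiveFallback implementation:

```ts
// Sketch only: fallback behavior asserted by the tests above.
function resolveWithActiveFallbackSketch(params: {
  configuredEntries: Array<{ provider: string; model: string }>;
  enabled: boolean;
  capability: string;
  providerRegistry: Map<string, { capabilities: string[] }>;
  activeModel?: { provider: string; model: string };
}): Array<{ provider: string; model: string }> {
  // 1. Configured entries always take precedence over the active model.
  if (params.configuredEntries.length > 0) return params.configuredEntries;
  // 2. Fall back to the active model only when the capability is enabled and
  //    the active provider actually supports it (e.g. groq: audio but not video).
  const caps = params.activeModel
    ? params.providerRegistry.get(params.activeModel.provider)?.capabilities
    : undefined;
  if (params.enabled && params.activeModel && caps?.includes(params.capability)) {
    return [params.activeModel];
  }
  // 3. Otherwise there is nothing to run for this capability.
  return [];
}
```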


@@ -98,6 +98,23 @@ function buildModelDecision(params: {
};
}
function formatDecisionSummary(decision: MediaUnderstandingDecision): string {
const total = decision.attachments.length;
const success = decision.attachments.filter((entry) => entry.chosen?.outcome === "success").length;
const chosen = decision.attachments.find((entry) => entry.chosen)?.chosen;
const provider = chosen?.provider?.trim();
const model = chosen?.model?.trim();
const modelLabel = provider ? (model ? `${provider}/${model}` : provider) : undefined;
const reason = decision.attachments
.flatMap((entry) => entry.attempts.map((attempt) => attempt.reason).filter(Boolean))
.find(Boolean);
const shortReason = reason ? reason.split(":")[0]?.trim() : undefined;
const countLabel = total > 0 ? ` (${success}/${total})` : "";
const viaLabel = modelLabel ? ` via ${modelLabel}` : "";
const reasonLabel = shortReason ? ` reason=${shortReason}` : "";
return `${decision.capability}: ${decision.outcome}${countLabel}${viaLabel}${reasonLabel}`;
}
async function runProviderEntry(params: {
capability: MediaUnderstandingCapability;
entry: MediaUnderstandingModelConfig;
@@ -495,12 +512,16 @@ export async function runCapability(params: {
chosen: attempts.find((attempt) => attempt.outcome === "success"),
});
}
const decision: MediaUnderstandingDecision = {
capability,
outcome: outputs.length > 0 ? "success" : "skipped",
attachments: attachmentDecisions,
};
if (shouldLogVerbose()) {
logVerbose(`Media understanding ${formatDecisionSummary(decision)}`);
}
return {
outputs,
decision: {
capability,
outcome: outputs.length > 0 ? "success" : "skipped",
attachments: attachmentDecisions,
},
decision,
};
}
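
For a concrete read on the new verbose line: take an image decision covering two attachments where the first was described by openai/gpt-5.2 and the second was skipped with reason "maxBytes: too large". Working through formatDecisionSummary above (the values are illustrative; the format is taken from the function), the logged line comes out as:

```
Media understanding image: success (1/2) via openai/gpt-5.2 reason=maxBytes
```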