diff --git a/docs/nodes/media-understanding.md b/docs/nodes/media-understanding.md index b9d2abdc1..38b2931e3 100644 --- a/docs/nodes/media-understanding.md +++ b/docs/nodes/media-understanding.md @@ -256,6 +256,15 @@ When `mode: "all"`, outputs are labeled `[Image 1/2]`, `[Audio 2/2]`, etc. } ``` +## Status output +When media understanding runs, `/status` includes a short summary line: + +``` +📎 Media: image ok (openai/gpt-5.2) · audio skipped (maxBytes) +``` + +This shows per‑capability outcomes and the chosen provider/model when applicable. + ## Notes - Understanding is **best‑effort**. Errors do not block replies. - Attachments are still passed to models even when understanding is disabled. diff --git a/src/auto-reply/media-note.test.ts b/src/auto-reply/media-note.test.ts index 0dd403386..5d9ae04cb 100644 --- a/src/auto-reply/media-note.test.ts +++ b/src/auto-reply/media-note.test.ts @@ -41,4 +41,69 @@ describe("buildInboundMediaNote", () => { }); expect(note).toBe("[media attached: /tmp/b.png | https://example.com/b.png]"); }); + + it("only suppresses attachments when media understanding succeeded", () => { + const note = buildInboundMediaNote({ + MediaPaths: ["/tmp/a.png", "/tmp/b.png"], + MediaUrls: ["https://example.com/a.png", "https://example.com/b.png"], + MediaUnderstandingDecisions: [ + { + capability: "image", + outcome: "skipped", + attachments: [ + { + attachmentIndex: 0, + attempts: [ + { + type: "provider", + outcome: "skipped", + reason: "maxBytes: too large", + }, + ], + }, + ], + }, + ], + }); + expect(note).toBe( + [ + "[media attached: 2 files]", + "[media attached 1/2: /tmp/a.png | https://example.com/a.png]", + "[media attached 2/2: /tmp/b.png | https://example.com/b.png]", + ].join("\n"), + ); + }); + + it("suppresses attachments when media understanding succeeds via decisions", () => { + const note = buildInboundMediaNote({ + MediaPaths: ["/tmp/a.png", "/tmp/b.png"], + MediaUrls: ["https://example.com/a.png", "https://example.com/b.png"], + MediaUnderstandingDecisions: [ + { + capability: "image", + outcome: "success", + attachments: [ + { + attachmentIndex: 0, + attempts: [ + { + type: "provider", + outcome: "success", + provider: "openai", + model: "gpt-5.2", + }, + ], + chosen: { + type: "provider", + outcome: "success", + provider: "openai", + model: "gpt-5.2", + }, + }, + ], + }, + ], + }); + expect(note).toBe("[media attached: /tmp/b.png | https://example.com/b.png]"); + }); }); diff --git a/src/auto-reply/media-note.ts b/src/auto-reply/media-note.ts index aafadb999..cd8617c30 100644 --- a/src/auto-reply/media-note.ts +++ b/src/auto-reply/media-note.ts @@ -19,11 +19,22 @@ function formatMediaAttachedLine(params: { export function buildInboundMediaNote(ctx: MsgContext): string | undefined { // Attachment indices follow MediaPaths/MediaUrls ordering as supplied by the channel. - const suppressed = new Set( - Array.isArray(ctx.MediaUnderstanding) - ? 
ctx.MediaUnderstanding.map((output) => output.attachmentIndex) - : [], - ); + const suppressed = new Set(); + if (Array.isArray(ctx.MediaUnderstanding)) { + for (const output of ctx.MediaUnderstanding) { + suppressed.add(output.attachmentIndex); + } + } + if (Array.isArray(ctx.MediaUnderstandingDecisions)) { + for (const decision of ctx.MediaUnderstandingDecisions) { + if (decision.outcome !== "success") continue; + for (const attachment of decision.attachments) { + if (attachment.chosen?.outcome === "success") { + suppressed.add(attachment.attachmentIndex); + } + } + } + } const pathsFromArray = Array.isArray(ctx.MediaPaths) ? ctx.MediaPaths : undefined; const paths = pathsFromArray && pathsFromArray.length > 0 diff --git a/src/media-understanding/resolve.test.ts b/src/media-understanding/resolve.test.ts new file mode 100644 index 000000000..f2a42c582 --- /dev/null +++ b/src/media-understanding/resolve.test.ts @@ -0,0 +1,136 @@ +import { describe, expect, it } from "vitest"; + +import type { ClawdbotConfig } from "../config/config.js"; +import { + resolveEntriesWithActiveFallback, + resolveModelEntries, +} from "./resolve.js"; + +const providerRegistry = new Map([ + ["openai", { capabilities: ["image"] }], + ["groq", { capabilities: ["audio"] }], +]); + +describe("resolveModelEntries", () => { + it("uses provider capabilities for shared entries without explicit caps", () => { + const cfg: ClawdbotConfig = { + tools: { + media: { + models: [{ provider: "openai", model: "gpt-5.2" }], + }, + }, + }; + + const imageEntries = resolveModelEntries({ + cfg, + capability: "image", + providerRegistry, + }); + expect(imageEntries).toHaveLength(1); + + const audioEntries = resolveModelEntries({ + cfg, + capability: "audio", + providerRegistry, + }); + expect(audioEntries).toHaveLength(0); + }); + + it("keeps per-capability entries even without explicit caps", () => { + const cfg: ClawdbotConfig = { + tools: { + media: { + image: { + models: [{ provider: "openai", model: "gpt-5.2" }], + }, + }, + }, + }; + + const imageEntries = resolveModelEntries({ + cfg, + capability: "image", + config: cfg.tools?.media?.image, + providerRegistry, + }); + expect(imageEntries).toHaveLength(1); + }); + + it("skips shared CLI entries without capabilities", () => { + const cfg: ClawdbotConfig = { + tools: { + media: { + models: [{ type: "cli", command: "gemini", args: ["--file", "{{MediaPath}}"] }], + }, + }, + }; + + const entries = resolveModelEntries({ + cfg, + capability: "image", + providerRegistry, + }); + expect(entries).toHaveLength(0); + }); +}); + +describe("resolveEntriesWithActiveFallback", () => { + it("uses active model when enabled and no models are configured", () => { + const cfg: ClawdbotConfig = { + tools: { + media: { + audio: { enabled: true }, + }, + }, + }; + + const entries = resolveEntriesWithActiveFallback({ + cfg, + capability: "audio", + config: cfg.tools?.media?.audio, + providerRegistry, + activeModel: { provider: "groq", model: "whisper-large-v3" }, + }); + expect(entries).toHaveLength(1); + expect(entries[0]?.provider).toBe("groq"); + }); + + it("ignores active model when configured entries exist", () => { + const cfg: ClawdbotConfig = { + tools: { + media: { + audio: { enabled: true, models: [{ provider: "openai", model: "whisper-1" }] }, + }, + }, + }; + + const entries = resolveEntriesWithActiveFallback({ + cfg, + capability: "audio", + config: cfg.tools?.media?.audio, + providerRegistry, + activeModel: { provider: "groq", model: "whisper-large-v3" }, + }); + 
expect(entries).toHaveLength(1); + expect(entries[0]?.provider).toBe("openai"); + }); + + it("skips active model when provider lacks capability", () => { + const cfg: ClawdbotConfig = { + tools: { + media: { + video: { enabled: true }, + }, + }, + }; + + const entries = resolveEntriesWithActiveFallback({ + cfg, + capability: "video", + config: cfg.tools?.media?.video, + providerRegistry, + activeModel: { provider: "groq", model: "whisper-large-v3" }, + }); + expect(entries).toHaveLength(0); + }); +}); diff --git a/src/media-understanding/runner.ts b/src/media-understanding/runner.ts index d33ab6296..675d47e37 100644 --- a/src/media-understanding/runner.ts +++ b/src/media-understanding/runner.ts @@ -98,6 +98,23 @@ function buildModelDecision(params: { }; } +function formatDecisionSummary(decision: MediaUnderstandingDecision): string { + const total = decision.attachments.length; + const success = decision.attachments.filter((entry) => entry.chosen?.outcome === "success").length; + const chosen = decision.attachments.find((entry) => entry.chosen)?.chosen; + const provider = chosen?.provider?.trim(); + const model = chosen?.model?.trim(); + const modelLabel = provider ? (model ? `${provider}/${model}` : provider) : undefined; + const reason = decision.attachments + .flatMap((entry) => entry.attempts.map((attempt) => attempt.reason).filter(Boolean)) + .find(Boolean); + const shortReason = reason ? reason.split(":")[0]?.trim() : undefined; + const countLabel = total > 0 ? ` (${success}/${total})` : ""; + const viaLabel = modelLabel ? ` via ${modelLabel}` : ""; + const reasonLabel = shortReason ? ` reason=${shortReason}` : ""; + return `${decision.capability}: ${decision.outcome}${countLabel}${viaLabel}${reasonLabel}`; +} + async function runProviderEntry(params: { capability: MediaUnderstandingCapability; entry: MediaUnderstandingModelConfig; @@ -495,12 +512,16 @@ export async function runCapability(params: { chosen: attempts.find((attempt) => attempt.outcome === "success"), }); } + const decision: MediaUnderstandingDecision = { + capability, + outcome: outputs.length > 0 ? "success" : "skipped", + attachments: attachmentDecisions, + }; + if (shouldLogVerbose()) { + logVerbose(`Media understanding ${formatDecisionSummary(decision)}`); + } return { outputs, - decision: { - capability, - outcome: outputs.length > 0 ? "success" : "skipped", - attachments: attachmentDecisions, - }, + decision, }; }
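
For orientation, below is a minimal, self-contained sketch of the summary format that the new `formatDecisionSummary` helper in `runner.ts` emits. The types here are simplified stand-ins rather than the repo's actual exports, and the fixture mirrors the successful-image decision used in `media-note.test.ts`; the real helper logs the same string prefixed with `Media understanding `.

```ts
// Sketch only: a trimmed-down restatement of formatDecisionSummary with
// hypothetical local types, to show what the verbose log line looks like.
type Attempt = {
  type: "provider" | "cli";
  outcome: "success" | "skipped" | "error";
  provider?: string;
  model?: string;
  reason?: string;
};

type AttachmentDecision = {
  attachmentIndex: number;
  attempts: Attempt[];
  chosen?: Attempt;
};

type Decision = {
  capability: string;
  outcome: "success" | "skipped";
  attachments: AttachmentDecision[];
};

function summarize(decision: Decision): string {
  const total = decision.attachments.length;
  const success = decision.attachments.filter((a) => a.chosen?.outcome === "success").length;
  const chosen = decision.attachments.find((a) => a.chosen)?.chosen;
  const provider = chosen?.provider?.trim();
  const model = chosen?.model?.trim();
  const modelLabel = provider ? (model ? `${provider}/${model}` : provider) : undefined;
  // First non-empty reason across all attempts, shortened to the part before ":".
  const reason = decision.attachments
    .flatMap((a) => a.attempts.map((attempt) => attempt.reason))
    .find(Boolean);
  const shortReason = reason?.split(":")[0]?.trim();
  return [
    `${decision.capability}: ${decision.outcome}`,
    total > 0 ? ` (${success}/${total})` : "",
    modelLabel ? ` via ${modelLabel}` : "",
    shortReason ? ` reason=${shortReason}` : "",
  ].join("");
}

// Prints: image: success (1/1) via openai/gpt-5.2
console.log(
  summarize({
    capability: "image",
    outcome: "success",
    attachments: [
      {
        attachmentIndex: 0,
        attempts: [{ type: "provider", outcome: "success", provider: "openai", model: "gpt-5.2" }],
        chosen: { type: "provider", outcome: "success", provider: "openai", model: "gpt-5.2" },
      },
    ],
  }),
);
```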