refactor: tighten media diagnostics

Peter Steinberger
2026-01-17 07:27:38 +00:00
parent 0c0e1e4226
commit 2ee45d50a4
5 changed files with 252 additions and 10 deletions


@@ -256,6 +256,15 @@ When `mode: "all"`, outputs are labeled `[Image 1/2]`, `[Audio 2/2]`, etc.
}
```
## Status output
When media understanding runs, `/status` includes a short summary line:
```
📎 Media: image ok (openai/gpt-5.2) · audio skipped (maxBytes)
```
This shows per-capability outcomes and the chosen provider/model when applicable.
## Notes
- Understanding is **best-effort**. Errors do not block replies.
- Attachments are still passed to models even when understanding is disabled.
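
For reference, the per-capability decision data that both the status summary and the attachment notes build on looks roughly like this. This is a sketch assembled from the test fixtures later in this commit; the actual type names, additional outcome values, and field optionality in the codebase may differ.

```ts
// Sketch only: decision shape as exercised by the tests in this commit.
type MediaUnderstandingAttempt = {
  type: "provider" | "cli";        // "cli" is inferred from the config examples (assumption)
  outcome: "success" | "skipped";  // an "error" outcome likely exists too; errors never block replies
  provider?: string;               // e.g. "openai"
  model?: string;                  // e.g. "gpt-5.2"
  reason?: string;                 // e.g. "maxBytes: too large"
};

type MediaUnderstandingDecision = {
  capability: "image" | "audio" | "video";
  outcome: "success" | "skipped";
  attachments: Array<{
    attachmentIndex: number;             // index into MediaPaths/MediaUrls, as supplied by the channel
    attempts: MediaUnderstandingAttempt[];
    chosen?: MediaUnderstandingAttempt;  // the attempt whose output was used, when one succeeded
  }>;
};
```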


@@ -41,4 +41,69 @@ describe("buildInboundMediaNote", () => {
});
expect(note).toBe("[media attached: /tmp/b.png | https://example.com/b.png]");
});
it("only suppresses attachments when media understanding succeeded", () => {
const note = buildInboundMediaNote({
MediaPaths: ["/tmp/a.png", "/tmp/b.png"],
MediaUrls: ["https://example.com/a.png", "https://example.com/b.png"],
MediaUnderstandingDecisions: [
{
capability: "image",
outcome: "skipped",
attachments: [
{
attachmentIndex: 0,
attempts: [
{
type: "provider",
outcome: "skipped",
reason: "maxBytes: too large",
},
],
},
],
},
],
});
expect(note).toBe(
[
"[media attached: 2 files]",
"[media attached 1/2: /tmp/a.png | https://example.com/a.png]",
"[media attached 2/2: /tmp/b.png | https://example.com/b.png]",
].join("\n"),
);
});
it("suppresses attachments when media understanding succeeds via decisions", () => {
const note = buildInboundMediaNote({
MediaPaths: ["/tmp/a.png", "/tmp/b.png"],
MediaUrls: ["https://example.com/a.png", "https://example.com/b.png"],
MediaUnderstandingDecisions: [
{
capability: "image",
outcome: "success",
attachments: [
{
attachmentIndex: 0,
attempts: [
{
type: "provider",
outcome: "success",
provider: "openai",
model: "gpt-5.2",
},
],
chosen: {
type: "provider",
outcome: "success",
provider: "openai",
model: "gpt-5.2",
},
},
],
},
],
});
expect(note).toBe("[media attached: /tmp/b.png | https://example.com/b.png]");
});
});


@@ -19,11 +19,22 @@ function formatMediaAttachedLine(params: {
export function buildInboundMediaNote(ctx: MsgContext): string | undefined {
// Attachment indices follow MediaPaths/MediaUrls ordering as supplied by the channel.
const suppressed = new Set(
Array.isArray(ctx.MediaUnderstanding)
? ctx.MediaUnderstanding.map((output) => output.attachmentIndex)
: [],
);
const suppressed = new Set<number>();
if (Array.isArray(ctx.MediaUnderstanding)) {
for (const output of ctx.MediaUnderstanding) {
suppressed.add(output.attachmentIndex);
}
}
if (Array.isArray(ctx.MediaUnderstandingDecisions)) {
for (const decision of ctx.MediaUnderstandingDecisions) {
if (decision.outcome !== "success") continue;
for (const attachment of decision.attachments) {
if (attachment.chosen?.outcome === "success") {
suppressed.add(attachment.attachmentIndex);
}
}
}
}
const pathsFromArray = Array.isArray(ctx.MediaPaths) ? ctx.MediaPaths : undefined;
const paths =
pathsFromArray && pathsFromArray.length > 0


@@ -0,0 +1,136 @@
import { describe, expect, it } from "vitest";
import type { ClawdbotConfig } from "../config/config.js";
import {
resolveEntriesWithActiveFallback,
resolveModelEntries,
} from "./resolve.js";
const providerRegistry = new Map([
["openai", { capabilities: ["image"] }],
["groq", { capabilities: ["audio"] }],
]);
describe("resolveModelEntries", () => {
it("uses provider capabilities for shared entries without explicit caps", () => {
const cfg: ClawdbotConfig = {
tools: {
media: {
models: [{ provider: "openai", model: "gpt-5.2" }],
},
},
};
const imageEntries = resolveModelEntries({
cfg,
capability: "image",
providerRegistry,
});
expect(imageEntries).toHaveLength(1);
const audioEntries = resolveModelEntries({
cfg,
capability: "audio",
providerRegistry,
});
expect(audioEntries).toHaveLength(0);
});
it("keeps per-capability entries even without explicit caps", () => {
const cfg: ClawdbotConfig = {
tools: {
media: {
image: {
models: [{ provider: "openai", model: "gpt-5.2" }],
},
},
},
};
const imageEntries = resolveModelEntries({
cfg,
capability: "image",
config: cfg.tools?.media?.image,
providerRegistry,
});
expect(imageEntries).toHaveLength(1);
});
it("skips shared CLI entries without capabilities", () => {
const cfg: ClawdbotConfig = {
tools: {
media: {
models: [{ type: "cli", command: "gemini", args: ["--file", "{{MediaPath}}"] }],
},
},
};
const entries = resolveModelEntries({
cfg,
capability: "image",
providerRegistry,
});
expect(entries).toHaveLength(0);
});
});
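
Reading these three cases together, shared `tools.media.models` entries appear to inherit capabilities from the provider registry, per-capability blocks are trusted as already scoped, and shared CLI entries without explicit caps are dropped. A rough sketch of that filter, written here as an assumption since resolve.ts itself is not part of this diff:

```ts
// Sketch only: how a shared entry might be matched against a capability.
type MediaEntry =
  | { provider: string; model: string; capabilities?: string[] }
  | { type: "cli"; command: string; args: string[]; capabilities?: string[] };

function entryMatchesCapability(
  entry: MediaEntry,
  capability: string,
  providerRegistry: Map<string, { capabilities: string[] }>,
  isSharedEntry: boolean,
): boolean {
  // Explicit caps always decide.
  if (entry.capabilities) return entry.capabilities.includes(capability);
  // Per-capability blocks (tools.media.image.models, ...) are already scoped to one capability.
  if (!isSharedEntry) return true;
  // Shared provider entries fall back to the provider's registered capabilities.
  if ("provider" in entry) {
    return providerRegistry.get(entry.provider)?.capabilities.includes(capability) ?? false;
  }
  // Shared CLI entries have no provider to consult, so they are skipped.
  return false;
}
```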
describe("resolveEntriesWithActiveFallback", () => {
it("uses active model when enabled and no models are configured", () => {
const cfg: ClawdbotConfig = {
tools: {
media: {
audio: { enabled: true },
},
},
};
const entries = resolveEntriesWithActiveFallback({
cfg,
capability: "audio",
config: cfg.tools?.media?.audio,
providerRegistry,
activeModel: { provider: "groq", model: "whisper-large-v3" },
});
expect(entries).toHaveLength(1);
expect(entries[0]?.provider).toBe("groq");
});
it("ignores active model when configured entries exist", () => {
const cfg: ClawdbotConfig = {
tools: {
media: {
audio: { enabled: true, models: [{ provider: "openai", model: "whisper-1" }] },
},
},
};
const entries = resolveEntriesWithActiveFallback({
cfg,
capability: "audio",
config: cfg.tools?.media?.audio,
providerRegistry,
activeModel: { provider: "groq", model: "whisper-large-v3" },
});
expect(entries).toHaveLength(1);
expect(entries[0]?.provider).toBe("openai");
});
it("skips active model when provider lacks capability", () => {
const cfg: ClawdbotConfig = {
tools: {
media: {
video: { enabled: true },
},
},
};
const entries = resolveEntriesWithActiveFallback({
cfg,
capability: "video",
config: cfg.tools?.media?.video,
providerRegistry,
activeModel: { provider: "groq", model: "whisper-large-v3" },
});
expect(entries).toHaveLength(0);
});
});
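
The active-model fallback these tests pin down looks roughly like the rule below: explicit config wins, otherwise the active session model is used only when the capability is enabled and its provider advertises that capability. Again a sketch under that assumption, not the actual resolveEntriesWithActiveFallback implementation:

```ts
// Sketch only: fallback behavior asserted by the tests above.
function resolveWithActiveFallbackSketch(params: {
  configuredEntries: Array<{ provider: string; model: string }>;
  enabled: boolean;
  capability: string;
  providerRegistry: Map<string, { capabilities: string[] }>;
  activeModel?: { provider: string; model: string };
}): Array<{ provider: string; model: string }> {
  // 1. Configured entries always take precedence over the active model.
  if (params.configuredEntries.length > 0) return params.configuredEntries;
  // 2. Fall back to the active model only when the capability is enabled and
  //    the active provider actually supports it (e.g. groq: audio but not video).
  const caps = params.activeModel
    ? params.providerRegistry.get(params.activeModel.provider)?.capabilities
    : undefined;
  if (params.enabled && params.activeModel && caps?.includes(params.capability)) {
    return [params.activeModel];
  }
  // 3. Otherwise there is nothing to run for this capability.
  return [];
}
```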


@@ -98,6 +98,23 @@ function buildModelDecision(params: {
};
}
function formatDecisionSummary(decision: MediaUnderstandingDecision): string {
const total = decision.attachments.length;
const success = decision.attachments.filter((entry) => entry.chosen?.outcome === "success").length;
const chosen = decision.attachments.find((entry) => entry.chosen)?.chosen;
const provider = chosen?.provider?.trim();
const model = chosen?.model?.trim();
const modelLabel = provider ? (model ? `${provider}/${model}` : provider) : undefined;
const reason = decision.attachments
.flatMap((entry) => entry.attempts.map((attempt) => attempt.reason).filter(Boolean))
.find(Boolean);
const shortReason = reason ? reason.split(":")[0]?.trim() : undefined;
const countLabel = total > 0 ? ` (${success}/${total})` : "";
const viaLabel = modelLabel ? ` via ${modelLabel}` : "";
const reasonLabel = shortReason ? ` reason=${shortReason}` : "";
return `${decision.capability}: ${decision.outcome}${countLabel}${viaLabel}${reasonLabel}`;
}
async function runProviderEntry(params: {
capability: MediaUnderstandingCapability;
entry: MediaUnderstandingModelConfig;
@@ -495,12 +512,16 @@ export async function runCapability(params: {
chosen: attempts.find((attempt) => attempt.outcome === "success"),
});
}
const decision: MediaUnderstandingDecision = {
capability,
outcome: outputs.length > 0 ? "success" : "skipped",
attachments: attachmentDecisions,
};
if (shouldLogVerbose()) {
logVerbose(`Media understanding ${formatDecisionSummary(decision)}`);
}
return {
outputs,
decision: {
capability,
outcome: outputs.length > 0 ? "success" : "skipped",
attachments: attachmentDecisions,
},
decision,
};
}
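
For a concrete read on the new verbose line: take an image decision covering two attachments where the first was described by openai/gpt-5.2 and the second was skipped with reason "maxBytes: too large". Working through formatDecisionSummary above (the values are illustrative; the format is taken from the function), the logged line comes out as:

```
Media understanding image: success (1/2) via openai/gpt-5.2 reason=maxBytes
```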