refactor: tighten media diagnostics

2026-01-17 07:27:38 +00:00
parent 0c0e1e4226
commit 2ee45d50a4
5 changed files with 252 additions and 10 deletions
--- a/docs/nodes/media-understanding.md
+++ b/docs/nodes/media-understanding.md
@@ -256,6 +256,15 @@ When `mode: "all"`, outputs are labeled `[Image 1/2]`, `[Audio 2/2]`, etc.
 }
 ```
 ## Status output
 When media understanding runs, `/status` includes a short summary line:
 ```
 📎 Media: image ok (openai/gpt-5.2) · audio skipped (maxBytes)
 ```
 This shows per‑capability outcomes and the chosen provider/model when applicable.
 ## Notes
 - Understanding is **best‑effort**. Errors do not block replies.
 - Attachments are still passed to models even when understanding is disabled.
--- a/src/auto-reply/media-note.test.ts
+++ b/src/auto-reply/media-note.test.ts
@@ -41,4 +41,69 @@ describe("buildInboundMediaNote", () => {
    });
    expect(note).toBe("[media attached: /tmp/b.png | https://example.com/b.png]");
  });
  it("only suppresses attachments when media understanding succeeded", () => {
    const note = buildInboundMediaNote({
      MediaPaths: ["/tmp/a.png", "/tmp/b.png"],
      MediaUrls: ["https://example.com/a.png", "https://example.com/b.png"],
      MediaUnderstandingDecisions: [
        {
          capability: "image",
          outcome: "skipped",
          attachments: [
            {
              attachmentIndex: 0,
              attempts: [
                {
                  type: "provider",
                  outcome: "skipped",
                  reason: "maxBytes: too large",
                },
              ],
            },
          ],
        },
      ],
    });
    expect(note).toBe(
      [
        "[media attached: 2 files]",
        "[media attached 1/2: /tmp/a.png | https://example.com/a.png]",
        "[media attached 2/2: /tmp/b.png | https://example.com/b.png]",
      ].join("\n"),
    );
  });
  it("suppresses attachments when media understanding succeeds via decisions", () => {
    const note = buildInboundMediaNote({
      MediaPaths: ["/tmp/a.png", "/tmp/b.png"],
      MediaUrls: ["https://example.com/a.png", "https://example.com/b.png"],
      MediaUnderstandingDecisions: [
        {
          capability: "image",
          outcome: "success",
          attachments: [
            {
              attachmentIndex: 0,
              attempts: [
                {
                  type: "provider",
                  outcome: "success",
                  provider: "openai",
                  model: "gpt-5.2",
                },
              ],
              chosen: {
                type: "provider",
                outcome: "success",
                provider: "openai",
                model: "gpt-5.2",
              },
            },
          ],
        },
      ],
    });
    expect(note).toBe("[media attached: /tmp/b.png | https://example.com/b.png]");
  });
 });
--- a/src/auto-reply/media-note.ts
+++ b/src/auto-reply/media-note.ts
@@ -19,11 +19,22 @@ function formatMediaAttachedLine(params: {
 export function buildInboundMediaNote(ctx: MsgContext): string | undefined {
  // Attachment indices follow MediaPaths/MediaUrls ordering as supplied by the channel.
-  const suppressed = new Set(
+  const suppressed = new Set<number>();
-    Array.isArray(ctx.MediaUnderstanding)
+  if (Array.isArray(ctx.MediaUnderstanding)) {
-      ? ctx.MediaUnderstanding.map((output) => output.attachmentIndex)
+    for (const output of ctx.MediaUnderstanding) {
-      : [],
+      suppressed.add(output.attachmentIndex);
-  );
+    }
  }
  if (Array.isArray(ctx.MediaUnderstandingDecisions)) {
    for (const decision of ctx.MediaUnderstandingDecisions) {
      if (decision.outcome !== "success") continue;
      for (const attachment of decision.attachments) {
        if (attachment.chosen?.outcome === "success") {
          suppressed.add(attachment.attachmentIndex);
        }
      }
    }
  }
  const pathsFromArray = Array.isArray(ctx.MediaPaths) ? ctx.MediaPaths : undefined;
  const paths =
    pathsFromArray && pathsFromArray.length > 0
--- a/src/media-understanding/resolve.test.ts
+++ b/src/media-understanding/resolve.test.ts
@@ -0,0 +1,136 @@
 import { describe, expect, it } from "vitest";
 import type { ClawdbotConfig } from "../config/config.js";
 import {
  resolveEntriesWithActiveFallback,
  resolveModelEntries,
 } from "./resolve.js";
 const providerRegistry = new Map([
  ["openai", { capabilities: ["image"] }],
  ["groq", { capabilities: ["audio"] }],
 ]);
 describe("resolveModelEntries", () => {
  it("uses provider capabilities for shared entries without explicit caps", () => {
    const cfg: ClawdbotConfig = {
      tools: {
        media: {
          models: [{ provider: "openai", model: "gpt-5.2" }],
        },
      },
    };
    const imageEntries = resolveModelEntries({
      cfg,
      capability: "image",
      providerRegistry,
    });
    expect(imageEntries).toHaveLength(1);
    const audioEntries = resolveModelEntries({
      cfg,
      capability: "audio",
      providerRegistry,
    });
    expect(audioEntries).toHaveLength(0);
  });
  it("keeps per-capability entries even without explicit caps", () => {
    const cfg: ClawdbotConfig = {
      tools: {
        media: {
          image: {
            models: [{ provider: "openai", model: "gpt-5.2" }],
          },
        },
      },
    };
    const imageEntries = resolveModelEntries({
      cfg,
      capability: "image",
      config: cfg.tools?.media?.image,
      providerRegistry,
    });
    expect(imageEntries).toHaveLength(1);
  });
  it("skips shared CLI entries without capabilities", () => {
    const cfg: ClawdbotConfig = {
      tools: {
        media: {
          models: [{ type: "cli", command: "gemini", args: ["--file", "{{MediaPath}}"] }],
        },
      },
    };
    const entries = resolveModelEntries({
      cfg,
      capability: "image",
      providerRegistry,
    });
    expect(entries).toHaveLength(0);
  });
 });
 describe("resolveEntriesWithActiveFallback", () => {
  it("uses active model when enabled and no models are configured", () => {
    const cfg: ClawdbotConfig = {
      tools: {
        media: {
          audio: { enabled: true },
        },
      },
    };
    const entries = resolveEntriesWithActiveFallback({
      cfg,
      capability: "audio",
      config: cfg.tools?.media?.audio,
      providerRegistry,
      activeModel: { provider: "groq", model: "whisper-large-v3" },
    });
    expect(entries).toHaveLength(1);
    expect(entries[0]?.provider).toBe("groq");
  });
  it("ignores active model when configured entries exist", () => {
    const cfg: ClawdbotConfig = {
      tools: {
        media: {
          audio: { enabled: true, models: [{ provider: "openai", model: "whisper-1" }] },
        },
      },
    };
    const entries = resolveEntriesWithActiveFallback({
      cfg,
      capability: "audio",
      config: cfg.tools?.media?.audio,
      providerRegistry,
      activeModel: { provider: "groq", model: "whisper-large-v3" },
    });
    expect(entries).toHaveLength(1);
    expect(entries[0]?.provider).toBe("openai");
  });
  it("skips active model when provider lacks capability", () => {
    const cfg: ClawdbotConfig = {
      tools: {
        media: {
          video: { enabled: true },
        },
      },
    };
    const entries = resolveEntriesWithActiveFallback({
      cfg,
      capability: "video",
      config: cfg.tools?.media?.video,
      providerRegistry,
      activeModel: { provider: "groq", model: "whisper-large-v3" },
    });
    expect(entries).toHaveLength(0);
  });
 });
--- a/src/media-understanding/runner.ts
+++ b/src/media-understanding/runner.ts
@@ -98,6 +98,23 @@ function buildModelDecision(params: {
  };
 }
 function formatDecisionSummary(decision: MediaUnderstandingDecision): string {
  const total = decision.attachments.length;
  const success = decision.attachments.filter((entry) => entry.chosen?.outcome === "success").length;
  const chosen = decision.attachments.find((entry) => entry.chosen)?.chosen;
  const provider = chosen?.provider?.trim();
  const model = chosen?.model?.trim();
  const modelLabel = provider ? (model ? `${provider}/${model}` : provider) : undefined;
  const reason = decision.attachments
    .flatMap((entry) => entry.attempts.map((attempt) => attempt.reason).filter(Boolean))
    .find(Boolean);
  const shortReason = reason ? reason.split(":")[0]?.trim() : undefined;
  const countLabel = total > 0 ? ` (${success}/${total})` : "";
  const viaLabel = modelLabel ? ` via ${modelLabel}` : "";
  const reasonLabel = shortReason ? ` reason=${shortReason}` : "";
  return `${decision.capability}: ${decision.outcome}${countLabel}${viaLabel}${reasonLabel}`;
 }
 async function runProviderEntry(params: {
  capability: MediaUnderstandingCapability;
  entry: MediaUnderstandingModelConfig;
@@ -495,12 +512,16 @@ export async function runCapability(params: {
      chosen: attempts.find((attempt) => attempt.outcome === "success"),
    });
  }
  const decision: MediaUnderstandingDecision = {
    capability,
    outcome: outputs.length > 0 ? "success" : "skipped",
    attachments: attachmentDecisions,
  };
  if (shouldLogVerbose()) {
    logVerbose(`Media understanding ${formatDecisionSummary(decision)}`);
  }
  return {
    outputs,
-    decision: {
+    decision,
      capability,
      outcome: outputs.length > 0 ? "success" : "skipped",
      attachments: attachmentDecisions,
    },
  };
 }