feat: add inbound media understanding

Co-authored-by: Tristan Manchester <tmanchester96@gmail.com>
2026-01-17 03:52:37 +00:00
parent 4b749f1b8f
commit 1b973f7506
42 changed files with 2547 additions and 101 deletions
--- a/src/auto-reply/media-note.test.ts
+++ b/src/auto-reply/media-note.test.ts
@@ -25,4 +25,20 @@ describe("buildInboundMediaNote", () => {
      ].join("\n"),
    );
  });
+
+  it("skips media notes for attachments with understanding output", () => { // attachments that already have understanding output must not be re-announced in the note
+    const note = buildInboundMediaNote({
+      MediaPaths: ["/tmp/a.png", "/tmp/b.png"],
+      MediaUrls: ["https://example.com/a.png", "https://example.com/b.png"],
+      MediaUnderstanding: [ // a single understanding output, tied to the first attachment
+        {
+          kind: "audio.transcription",
+          attachmentIndex: 0, // position within MediaPaths/MediaUrls, i.e. /tmp/a.png
+          text: "hello",
+          provider: "groq",
+        },
+      ],
+    }); // index 0 is suppressed, so only /tmp/b.png is left to report
+    expect(note).toBe("[media attached: /tmp/b.png | https://example.com/b.png]"); // single-entry form: no "N files" header line
+  });
 });
--- a/src/auto-reply/media-note.ts
+++ b/src/auto-reply/media-note.ts
@@ -18,6 +18,12 @@ function formatMediaAttachedLine(params: {
 }

 export function buildInboundMediaNote(ctx: MsgContext): string | undefined {
+  // Attachment indices follow MediaPaths/MediaUrls ordering as supplied by the channel.
+  const suppressed = new Set(
+    Array.isArray(ctx.MediaUnderstanding)
+      ? ctx.MediaUnderstanding.map((output) => output.attachmentIndex)
+      : [],
+  );
  const pathsFromArray = Array.isArray(ctx.MediaPaths) ? ctx.MediaPaths : undefined;
  const paths =
    pathsFromArray && pathsFromArray.length > 0
@@ -36,24 +42,33 @@ export function buildInboundMediaNote(ctx: MsgContext): string | undefined {
      ? ctx.MediaTypes
      : undefined;

-  if (paths.length === 1) {
+  const entries = paths
+    .map((entry, index) => ({
+      path: entry ?? "",
+      type: types?.[index] ?? ctx.MediaType,
+      url: urls?.[index] ?? ctx.MediaUrl,
+      index,
+    }))
+    .filter((entry) => !suppressed.has(entry.index));
+  if (entries.length === 0) return undefined;
+  if (entries.length === 1) {
    return formatMediaAttachedLine({
-      path: paths[0] ?? "",
-      type: types?.[0] ?? ctx.MediaType,
-      url: urls?.[0] ?? ctx.MediaUrl,
+      path: entries[0]?.path ?? "",
+      type: entries[0]?.type,
+      url: entries[0]?.url,
    });
  }

-  const count = paths.length;
+  const count = entries.length;
  const lines: string[] = [`[media attached: ${count} files]`];
-  for (const [idx, mediaPath] of paths.entries()) {
+  for (const [idx, entry] of entries.entries()) {
    lines.push(
      formatMediaAttachedLine({
-        path: mediaPath,
+        path: entry.path,
        index: idx + 1,
        total: count,
-        type: types?.[idx],
-        url: urls?.[idx],
+        type: entry.type,
+        url: entry.url,
      }),
    );
  }
--- a/src/auto-reply/reply/get-reply-directives.ts
+++ b/src/auto-reply/reply/get-reply-directives.ts
@@ -122,6 +122,7 @@ export async function resolveReplyDirectives(params: {
  const commandSource =
    sessionCtx.CommandBody ??
    sessionCtx.RawBody ??
+    sessionCtx.Transcript ??
    sessionCtx.BodyStripped ??
    sessionCtx.Body ??
    "";
--- a/src/auto-reply/reply/get-reply-run.ts
+++ b/src/auto-reply/reply/get-reply-run.ts
@@ -87,7 +87,6 @@ type RunPreparedReplyParams = {
    cap?: number;
    dropPolicy?: InlineDirectives["dropPolicy"];
  };
-  transcribedText?: string;
  typing: TypingController;
  opts?: GetReplyOptions;
  defaultModel: string;
@@ -210,7 +209,6 @@ export async function runPreparedReply(
    model,
    perMessageQueueMode,
    perMessageQueueOptions,
-    transcribedText,
    typing,
    opts,
    defaultModel,
@@ -325,11 +323,7 @@ export async function runPreparedReply(
  sessionEntry = skillResult.sessionEntry ?? sessionEntry;
  currentSystemSent = skillResult.systemSent;
  const skillsSnapshot = skillResult.skillsSnapshot;
-  const prefixedBody = transcribedText
-    ? [threadStarterNote, prefixedBodyBase, `Transcript:\n${transcribedText}`]
-        .filter(Boolean)
-        .join("\n\n")
-    : [threadStarterNote, prefixedBodyBase].filter(Boolean).join("\n\n");
+  const prefixedBody = [threadStarterNote, prefixedBodyBase].filter(Boolean).join("\n\n");
  const mediaNote = buildInboundMediaNote(ctx);
  const mediaReplyHint = mediaNote
    ? "To send an image back, add a line like: MEDIA:https://example.com/image.jpg (no spaces). Keep caption in the text body."
@@ -370,11 +364,7 @@ export async function runPreparedReply(
  }
  const sessionIdFinal = sessionId ?? crypto.randomUUID();
  const sessionFile = resolveSessionFilePath(sessionIdFinal, sessionEntry);
-  const queueBodyBase = transcribedText
-    ? [threadStarterNote, baseBodyFinal, `Transcript:\n${transcribedText}`]
-        .filter(Boolean)
-        .join("\n\n")
-    : [threadStarterNote, baseBodyFinal].filter(Boolean).join("\n\n");
+  const queueBodyBase = [threadStarterNote, baseBodyFinal].filter(Boolean).join("\n\n");
  const queuedBody = mediaNote
    ? [mediaNote, mediaReplyHint, queueBodyBase].filter(Boolean).join("\n").trim()
    : queueBodyBase;
--- a/src/auto-reply/reply/get-reply.ts
+++ b/src/auto-reply/reply/get-reply.ts
@@ -7,12 +7,11 @@ import { resolveModelRefFromString } from "../../agents/model-selection.js";
 import { resolveAgentTimeoutMs } from "../../agents/timeout.js";
 import { DEFAULT_AGENT_WORKSPACE_DIR, ensureAgentWorkspace } from "../../agents/workspace.js";
 import { type ClawdbotConfig, loadConfig } from "../../config/config.js";
-import { logVerbose } from "../../globals.js";
 import { defaultRuntime } from "../../runtime.js";
 import { resolveCommandAuthorization } from "../command-auth.js";
 import type { MsgContext } from "../templating.js";
 import { SILENT_REPLY_TOKEN } from "../tokens.js";
-import { hasAudioTranscriptionConfig, isAudio, transcribeInboundAudio } from "../transcription.js";
+import { applyMediaUnderstanding } from "../../media-understanding/apply.js";
 import type { GetReplyOptions, ReplyPayload } from "../types.js";
 import { resolveDefaultModel } from "./directive-handling.js";
 import { resolveReplyDirectives } from "./get-reply-directives.js";
@@ -75,16 +74,11 @@ export async function getReplyFromConfig(
  });
  opts?.onTypingController?.(typing);

-  let transcribedText: string | undefined;
-  if (hasAudioTranscriptionConfig(cfg) && isAudio(ctx.MediaType)) {
-    const transcribed = await transcribeInboundAudio(cfg, ctx, defaultRuntime);
-    if (transcribed?.text) {
-      transcribedText = transcribed.text;
-      ctx.Body = transcribed.text;
-      ctx.Transcript = transcribed.text;
-      logVerbose("Replaced Body with audio transcript for reply flow");
-    }
-  }
+  await applyMediaUnderstanding({
+    ctx,
+    cfg,
+    agentDir,
+  });

  const commandAuthorized = ctx.CommandAuthorized ?? true;
  resolveCommandAuthorization({
@@ -253,7 +247,6 @@ export async function getReplyFromConfig(
    model,
    perMessageQueueMode,
    perMessageQueueOptions,
-    transcribedText,
    typing,
    opts,
    defaultModel,
--- a/src/auto-reply/templating.ts
+++ b/src/auto-reply/templating.ts
@@ -1,6 +1,7 @@
 import type { ChannelId } from "../channels/plugins/types.js";
 import type { InternalMessageChannel } from "../utils/message-channel.js";
 import type { CommandArgs } from "./commands-registry.types.js";
+import type { MediaUnderstandingOutput } from "../media-understanding/types.js";

 /** Valid message channels for routing. */
 export type OriginatingChannelType = ChannelId | InternalMessageChannel;
@@ -41,6 +42,9 @@ export type MsgContext = {
  /** Remote host for SCP when media lives on a different machine (e.g., clawdbot@192.168.64.3). */
  MediaRemoteHost?: string;
  Transcript?: string;
+  MediaUnderstanding?: MediaUnderstandingOutput[];
+  Prompt?: string;
+  MaxChars?: number;
  ChatType?: string;
  GroupSubject?: string;
  GroupRoom?: string;