feat: add inbound media understanding

Co-authored-by: Tristan Manchester <tmanchester96@gmail.com>
2026-01-17 03:52:37 +00:00
parent 4b749f1b8f
commit 1b973f7506
42 changed files with 2547 additions and 101 deletions
--- a/src/auto-reply/reply/get-reply.ts
+++ b/src/auto-reply/reply/get-reply.ts
@@ -7,12 +7,11 @@ import { resolveModelRefFromString } from "../../agents/model-selection.js";
 import { resolveAgentTimeoutMs } from "../../agents/timeout.js";
 import { DEFAULT_AGENT_WORKSPACE_DIR, ensureAgentWorkspace } from "../../agents/workspace.js";
 import { type ClawdbotConfig, loadConfig } from "../../config/config.js";
-import { logVerbose } from "../../globals.js";
 import { defaultRuntime } from "../../runtime.js";
 import { resolveCommandAuthorization } from "../command-auth.js";
 import type { MsgContext } from "../templating.js";
 import { SILENT_REPLY_TOKEN } from "../tokens.js";
-import { hasAudioTranscriptionConfig, isAudio, transcribeInboundAudio } from "../transcription.js";
+import { applyMediaUnderstanding } from "../../media-understanding/apply.js";
 import type { GetReplyOptions, ReplyPayload } from "../types.js";
 import { resolveDefaultModel } from "./directive-handling.js";
 import { resolveReplyDirectives } from "./get-reply-directives.js";
@@ -75,16 +74,11 @@ export async function getReplyFromConfig(
  });
  opts?.onTypingController?.(typing);

-  let transcribedText: string | undefined;
-  if (hasAudioTranscriptionConfig(cfg) && isAudio(ctx.MediaType)) {
-    const transcribed = await transcribeInboundAudio(cfg, ctx, defaultRuntime);
-    if (transcribed?.text) {
-      transcribedText = transcribed.text;
-      ctx.Body = transcribed.text;
-      ctx.Transcript = transcribed.text;
-      logVerbose("Replaced Body with audio transcript for reply flow");
-    }
-  }
+  await applyMediaUnderstanding({
+    ctx,
+    cfg,
+    agentDir,
+  });

  const commandAuthorized = ctx.CommandAuthorized ?? true;
  resolveCommandAuthorization({
@@ -253,7 +247,6 @@ export async function getReplyFromConfig(
    model,
    perMessageQueueMode,
    perMessageQueueOptions,
-    transcribedText,
    typing,
    opts,
    defaultModel,