feat: add inbound media understanding

Co-authored-by: Tristan Manchester <tmanchester96@gmail.com>
This commit is contained in:
Peter Steinberger
2026-01-17 03:52:37 +00:00
parent 4b749f1b8f
commit 1b973f7506
42 changed files with 2547 additions and 101 deletions

View File

@@ -122,6 +122,7 @@ export async function resolveReplyDirectives(params: {
const commandSource =
sessionCtx.CommandBody ??
sessionCtx.RawBody ??
sessionCtx.Transcript ??
sessionCtx.BodyStripped ??
sessionCtx.Body ??
"";

View File

@@ -87,7 +87,6 @@ type RunPreparedReplyParams = {
cap?: number;
dropPolicy?: InlineDirectives["dropPolicy"];
};
transcribedText?: string;
typing: TypingController;
opts?: GetReplyOptions;
defaultModel: string;
@@ -210,7 +209,6 @@ export async function runPreparedReply(
model,
perMessageQueueMode,
perMessageQueueOptions,
transcribedText,
typing,
opts,
defaultModel,
@@ -325,11 +323,7 @@ export async function runPreparedReply(
sessionEntry = skillResult.sessionEntry ?? sessionEntry;
currentSystemSent = skillResult.systemSent;
const skillsSnapshot = skillResult.skillsSnapshot;
const prefixedBody = transcribedText
? [threadStarterNote, prefixedBodyBase, `Transcript:\n${transcribedText}`]
.filter(Boolean)
.join("\n\n")
: [threadStarterNote, prefixedBodyBase].filter(Boolean).join("\n\n");
const prefixedBody = [threadStarterNote, prefixedBodyBase].filter(Boolean).join("\n\n");
const mediaNote = buildInboundMediaNote(ctx);
const mediaReplyHint = mediaNote
? "To send an image back, add a line like: MEDIA:https://example.com/image.jpg (no spaces). Keep caption in the text body."
@@ -370,11 +364,7 @@ export async function runPreparedReply(
}
const sessionIdFinal = sessionId ?? crypto.randomUUID();
const sessionFile = resolveSessionFilePath(sessionIdFinal, sessionEntry);
const queueBodyBase = transcribedText
? [threadStarterNote, baseBodyFinal, `Transcript:\n${transcribedText}`]
.filter(Boolean)
.join("\n\n")
: [threadStarterNote, baseBodyFinal].filter(Boolean).join("\n\n");
const queueBodyBase = [threadStarterNote, baseBodyFinal].filter(Boolean).join("\n\n");
const queuedBody = mediaNote
? [mediaNote, mediaReplyHint, queueBodyBase].filter(Boolean).join("\n").trim()
: queueBodyBase;

View File

@@ -7,12 +7,11 @@ import { resolveModelRefFromString } from "../../agents/model-selection.js";
import { resolveAgentTimeoutMs } from "../../agents/timeout.js";
import { DEFAULT_AGENT_WORKSPACE_DIR, ensureAgentWorkspace } from "../../agents/workspace.js";
import { type ClawdbotConfig, loadConfig } from "../../config/config.js";
import { logVerbose } from "../../globals.js";
import { defaultRuntime } from "../../runtime.js";
import { resolveCommandAuthorization } from "../command-auth.js";
import type { MsgContext } from "../templating.js";
import { SILENT_REPLY_TOKEN } from "../tokens.js";
import { hasAudioTranscriptionConfig, isAudio, transcribeInboundAudio } from "../transcription.js";
import { applyMediaUnderstanding } from "../../media-understanding/apply.js";
import type { GetReplyOptions, ReplyPayload } from "../types.js";
import { resolveDefaultModel } from "./directive-handling.js";
import { resolveReplyDirectives } from "./get-reply-directives.js";
@@ -75,16 +74,11 @@ export async function getReplyFromConfig(
});
opts?.onTypingController?.(typing);
let transcribedText: string | undefined;
if (hasAudioTranscriptionConfig(cfg) && isAudio(ctx.MediaType)) {
const transcribed = await transcribeInboundAudio(cfg, ctx, defaultRuntime);
if (transcribed?.text) {
transcribedText = transcribed.text;
ctx.Body = transcribed.text;
ctx.Transcript = transcribed.text;
logVerbose("Replaced Body with audio transcript for reply flow");
}
}
await applyMediaUnderstanding({
ctx,
cfg,
agentDir,
});
const commandAuthorized = ctx.CommandAuthorized ?? true;
resolveCommandAuthorization({
@@ -253,7 +247,6 @@ export async function getReplyFromConfig(
model,
perMessageQueueMode,
perMessageQueueOptions,
transcribedText,
typing,
opts,
defaultModel,