feat: add inbound media understanding

Co-authored-by: Tristan Manchester <tmanchester96@gmail.com>
This commit is contained in:
Peter Steinberger
2026-01-17 03:52:37 +00:00
parent 4b749f1b8f
commit 1b973f7506
42 changed files with 2547 additions and 101 deletions

View File

@@ -25,4 +25,20 @@ describe("buildInboundMediaNote", () => {
].join("\n"),
);
});
// Two attachments are supplied, but the first one already has a MediaUnderstanding
// output, so it should be left out of the generated media note.
it("skips media notes for attachments with understanding output", () => {
const note = buildInboundMediaNote({
MediaPaths: ["/tmp/a.png", "/tmp/b.png"],
MediaUrls: ["https://example.com/a.png", "https://example.com/b.png"],
MediaUnderstanding: [
{
kind: "audio.transcription",
// Position within MediaPaths/MediaUrls; 0 points at /tmp/a.png.
attachmentIndex: 0,
text: "hello",
provider: "groq",
},
],
});
// Only the second attachment (b.png) remains, so the note is a single line for it.
expect(note).toBe("[media attached: /tmp/b.png | https://example.com/b.png]");
});
});

View File

@@ -18,6 +18,12 @@ function formatMediaAttachedLine(params: {
}
export function buildInboundMediaNote(ctx: MsgContext): string | undefined {
// Attachment indices follow MediaPaths/MediaUrls ordering as supplied by the channel.
const suppressed = new Set(
Array.isArray(ctx.MediaUnderstanding)
? ctx.MediaUnderstanding.map((output) => output.attachmentIndex)
: [],
);
const pathsFromArray = Array.isArray(ctx.MediaPaths) ? ctx.MediaPaths : undefined;
const paths =
pathsFromArray && pathsFromArray.length > 0
@@ -36,24 +42,33 @@ export function buildInboundMediaNote(ctx: MsgContext): string | undefined {
? ctx.MediaTypes
: undefined;
if (paths.length === 1) {
const entries = paths
.map((entry, index) => ({
path: entry ?? "",
type: types?.[index] ?? ctx.MediaType,
url: urls?.[index] ?? ctx.MediaUrl,
index,
}))
.filter((entry) => !suppressed.has(entry.index));
if (entries.length === 0) return undefined;
if (entries.length === 1) {
return formatMediaAttachedLine({
path: paths[0] ?? "",
type: types?.[0] ?? ctx.MediaType,
url: urls?.[0] ?? ctx.MediaUrl,
path: entries[0]?.path ?? "",
type: entries[0]?.type,
url: entries[0]?.url,
});
}
const count = paths.length;
const count = entries.length;
const lines: string[] = [`[media attached: ${count} files]`];
for (const [idx, mediaPath] of paths.entries()) {
for (const [idx, entry] of entries.entries()) {
lines.push(
formatMediaAttachedLine({
path: mediaPath,
path: entry.path,
index: idx + 1,
total: count,
type: types?.[idx],
url: urls?.[idx],
type: entry.type,
url: entry.url,
}),
);
}

View File

@@ -122,6 +122,7 @@ export async function resolveReplyDirectives(params: {
// commandSource is the first non-nullish of these fields, checked in this order;
// it defaults to an empty string when none of them is set.
const commandSource =
sessionCtx.CommandBody ??
sessionCtx.RawBody ??
sessionCtx.Transcript ??
sessionCtx.BodyStripped ??
sessionCtx.Body ??
"";

View File

@@ -87,7 +87,6 @@ type RunPreparedReplyParams = {
cap?: number;
dropPolicy?: InlineDirectives["dropPolicy"];
};
transcribedText?: string;
typing: TypingController;
opts?: GetReplyOptions;
defaultModel: string;
@@ -210,7 +209,6 @@ export async function runPreparedReply(
model,
perMessageQueueMode,
perMessageQueueOptions,
transcribedText,
typing,
opts,
defaultModel,
@@ -325,11 +323,7 @@ export async function runPreparedReply(
sessionEntry = skillResult.sessionEntry ?? sessionEntry;
currentSystemSent = skillResult.systemSent;
const skillsSnapshot = skillResult.skillsSnapshot;
const prefixedBody = transcribedText
? [threadStarterNote, prefixedBodyBase, `Transcript:\n${transcribedText}`]
.filter(Boolean)
.join("\n\n")
: [threadStarterNote, prefixedBodyBase].filter(Boolean).join("\n\n");
const prefixedBody = [threadStarterNote, prefixedBodyBase].filter(Boolean).join("\n\n");
const mediaNote = buildInboundMediaNote(ctx);
const mediaReplyHint = mediaNote
? "To send an image back, add a line like: MEDIA:https://example.com/image.jpg (no spaces). Keep caption in the text body."
@@ -370,11 +364,7 @@ export async function runPreparedReply(
}
const sessionIdFinal = sessionId ?? crypto.randomUUID();
const sessionFile = resolveSessionFilePath(sessionIdFinal, sessionEntry);
const queueBodyBase = transcribedText
? [threadStarterNote, baseBodyFinal, `Transcript:\n${transcribedText}`]
.filter(Boolean)
.join("\n\n")
: [threadStarterNote, baseBodyFinal].filter(Boolean).join("\n\n");
const queueBodyBase = [threadStarterNote, baseBodyFinal].filter(Boolean).join("\n\n");
const queuedBody = mediaNote
? [mediaNote, mediaReplyHint, queueBodyBase].filter(Boolean).join("\n").trim()
: queueBodyBase;

View File

@@ -7,12 +7,11 @@ import { resolveModelRefFromString } from "../../agents/model-selection.js";
import { resolveAgentTimeoutMs } from "../../agents/timeout.js";
import { DEFAULT_AGENT_WORKSPACE_DIR, ensureAgentWorkspace } from "../../agents/workspace.js";
import { type ClawdbotConfig, loadConfig } from "../../config/config.js";
import { logVerbose } from "../../globals.js";
import { defaultRuntime } from "../../runtime.js";
import { resolveCommandAuthorization } from "../command-auth.js";
import type { MsgContext } from "../templating.js";
import { SILENT_REPLY_TOKEN } from "../tokens.js";
import { hasAudioTranscriptionConfig, isAudio, transcribeInboundAudio } from "../transcription.js";
import { applyMediaUnderstanding } from "../../media-understanding/apply.js";
import type { GetReplyOptions, ReplyPayload } from "../types.js";
import { resolveDefaultModel } from "./directive-handling.js";
import { resolveReplyDirectives } from "./get-reply-directives.js";
@@ -75,16 +74,11 @@ export async function getReplyFromConfig(
});
opts?.onTypingController?.(typing);
let transcribedText: string | undefined;
if (hasAudioTranscriptionConfig(cfg) && isAudio(ctx.MediaType)) {
const transcribed = await transcribeInboundAudio(cfg, ctx, defaultRuntime);
if (transcribed?.text) {
transcribedText = transcribed.text;
ctx.Body = transcribed.text;
ctx.Transcript = transcribed.text;
logVerbose("Replaced Body with audio transcript for reply flow");
}
}
await applyMediaUnderstanding({
ctx,
cfg,
agentDir,
});
const commandAuthorized = ctx.CommandAuthorized ?? true;
resolveCommandAuthorization({
@@ -253,7 +247,6 @@ export async function getReplyFromConfig(
model,
perMessageQueueMode,
perMessageQueueOptions,
transcribedText,
typing,
opts,
defaultModel,

View File

@@ -1,6 +1,7 @@
import type { ChannelId } from "../channels/plugins/types.js";
import type { InternalMessageChannel } from "../utils/message-channel.js";
import type { CommandArgs } from "./commands-registry.types.js";
import type { MediaUnderstandingOutput } from "../media-understanding/types.js";
/** Valid message channels for routing. */
export type OriginatingChannelType = ChannelId | InternalMessageChannel;
@@ -41,6 +42,9 @@ export type MsgContext = {
/** Remote host for SCP when media lives on a different machine (e.g., clawdbot@192.168.64.3). */
MediaRemoteHost?: string;
Transcript?: string;
MediaUnderstanding?: MediaUnderstandingOutput[];
Prompt?: string;
MaxChars?: number;
ChatType?: string;
GroupSubject?: string;
GroupRoom?: string;