feat: add inbound media understanding
Co-authored-by: Tristan Manchester <tmanchester96@gmail.com>
This commit is contained in:
@@ -25,4 +25,20 @@ describe("buildInboundMediaNote", () => {
|
||||
].join("\n"),
|
||||
);
|
||||
});
|
||||
|
||||
// Verifies that an attachment referenced by a MediaUnderstanding entry is
// left out of the media note produced by buildInboundMediaNote.
it("skips media notes for attachments with understanding output", () => {
|
||||
// Two attachments are supplied; the single understanding entry below
// references attachmentIndex 0, i.e. the first path/URL pair.
const note = buildInboundMediaNote({
|
||||
MediaPaths: ["/tmp/a.png", "/tmp/b.png"],
|
||||
MediaUrls: ["https://example.com/a.png", "https://example.com/b.png"],
|
||||
MediaUnderstanding: [
|
||||
{
|
||||
kind: "audio.transcription",
|
||||
attachmentIndex: 0,
|
||||
text: "hello",
|
||||
provider: "groq",
|
||||
},
|
||||
],
|
||||
});
|
||||
// Only the second attachment is expected to remain, rendered as a single
// "[media attached: <path> | <url>]" line with no file count.
expect(note).toBe("[media attached: /tmp/b.png | https://example.com/b.png]");
|
||||
});
|
||||
});
|
||||
|
||||
@@ -18,6 +18,12 @@ function formatMediaAttachedLine(params: {
|
||||
}
|
||||
|
||||
export function buildInboundMediaNote(ctx: MsgContext): string | undefined {
|
||||
// Attachment indices follow MediaPaths/MediaUrls ordering as supplied by the channel.
|
||||
const suppressed = new Set(
|
||||
Array.isArray(ctx.MediaUnderstanding)
|
||||
? ctx.MediaUnderstanding.map((output) => output.attachmentIndex)
|
||||
: [],
|
||||
);
|
||||
const pathsFromArray = Array.isArray(ctx.MediaPaths) ? ctx.MediaPaths : undefined;
|
||||
const paths =
|
||||
pathsFromArray && pathsFromArray.length > 0
|
||||
@@ -36,24 +42,33 @@ export function buildInboundMediaNote(ctx: MsgContext): string | undefined {
|
||||
? ctx.MediaTypes
|
||||
: undefined;
|
||||
|
||||
if (paths.length === 1) {
|
||||
const entries = paths
|
||||
.map((entry, index) => ({
|
||||
path: entry ?? "",
|
||||
type: types?.[index] ?? ctx.MediaType,
|
||||
url: urls?.[index] ?? ctx.MediaUrl,
|
||||
index,
|
||||
}))
|
||||
.filter((entry) => !suppressed.has(entry.index));
|
||||
if (entries.length === 0) return undefined;
|
||||
if (entries.length === 1) {
|
||||
return formatMediaAttachedLine({
|
||||
path: paths[0] ?? "",
|
||||
type: types?.[0] ?? ctx.MediaType,
|
||||
url: urls?.[0] ?? ctx.MediaUrl,
|
||||
path: entries[0]?.path ?? "",
|
||||
type: entries[0]?.type,
|
||||
url: entries[0]?.url,
|
||||
});
|
||||
}
|
||||
|
||||
const count = paths.length;
|
||||
const count = entries.length;
|
||||
const lines: string[] = [`[media attached: ${count} files]`];
|
||||
for (const [idx, mediaPath] of paths.entries()) {
|
||||
for (const [idx, entry] of entries.entries()) {
|
||||
lines.push(
|
||||
formatMediaAttachedLine({
|
||||
path: mediaPath,
|
||||
path: entry.path,
|
||||
index: idx + 1,
|
||||
total: count,
|
||||
type: types?.[idx],
|
||||
url: urls?.[idx],
|
||||
type: entry.type,
|
||||
url: entry.url,
|
||||
}),
|
||||
);
|
||||
}
|
||||
|
||||
@@ -122,6 +122,7 @@ export async function resolveReplyDirectives(params: {
|
||||
const commandSource =
|
||||
sessionCtx.CommandBody ??
|
||||
sessionCtx.RawBody ??
|
||||
sessionCtx.Transcript ??
|
||||
sessionCtx.BodyStripped ??
|
||||
sessionCtx.Body ??
|
||||
"";
|
||||
|
||||
@@ -87,7 +87,6 @@ type RunPreparedReplyParams = {
|
||||
cap?: number;
|
||||
dropPolicy?: InlineDirectives["dropPolicy"];
|
||||
};
|
||||
transcribedText?: string;
|
||||
typing: TypingController;
|
||||
opts?: GetReplyOptions;
|
||||
defaultModel: string;
|
||||
@@ -210,7 +209,6 @@ export async function runPreparedReply(
|
||||
model,
|
||||
perMessageQueueMode,
|
||||
perMessageQueueOptions,
|
||||
transcribedText,
|
||||
typing,
|
||||
opts,
|
||||
defaultModel,
|
||||
@@ -325,11 +323,7 @@ export async function runPreparedReply(
|
||||
sessionEntry = skillResult.sessionEntry ?? sessionEntry;
|
||||
currentSystemSent = skillResult.systemSent;
|
||||
const skillsSnapshot = skillResult.skillsSnapshot;
|
||||
const prefixedBody = transcribedText
|
||||
? [threadStarterNote, prefixedBodyBase, `Transcript:\n${transcribedText}`]
|
||||
.filter(Boolean)
|
||||
.join("\n\n")
|
||||
: [threadStarterNote, prefixedBodyBase].filter(Boolean).join("\n\n");
|
||||
const prefixedBody = [threadStarterNote, prefixedBodyBase].filter(Boolean).join("\n\n");
|
||||
const mediaNote = buildInboundMediaNote(ctx);
|
||||
const mediaReplyHint = mediaNote
|
||||
? "To send an image back, add a line like: MEDIA:https://example.com/image.jpg (no spaces). Keep caption in the text body."
|
||||
@@ -370,11 +364,7 @@ export async function runPreparedReply(
|
||||
}
|
||||
const sessionIdFinal = sessionId ?? crypto.randomUUID();
|
||||
const sessionFile = resolveSessionFilePath(sessionIdFinal, sessionEntry);
|
||||
const queueBodyBase = transcribedText
|
||||
? [threadStarterNote, baseBodyFinal, `Transcript:\n${transcribedText}`]
|
||||
.filter(Boolean)
|
||||
.join("\n\n")
|
||||
: [threadStarterNote, baseBodyFinal].filter(Boolean).join("\n\n");
|
||||
const queueBodyBase = [threadStarterNote, baseBodyFinal].filter(Boolean).join("\n\n");
|
||||
const queuedBody = mediaNote
|
||||
? [mediaNote, mediaReplyHint, queueBodyBase].filter(Boolean).join("\n").trim()
|
||||
: queueBodyBase;
|
||||
|
||||
@@ -7,12 +7,11 @@ import { resolveModelRefFromString } from "../../agents/model-selection.js";
|
||||
import { resolveAgentTimeoutMs } from "../../agents/timeout.js";
|
||||
import { DEFAULT_AGENT_WORKSPACE_DIR, ensureAgentWorkspace } from "../../agents/workspace.js";
|
||||
import { type ClawdbotConfig, loadConfig } from "../../config/config.js";
|
||||
import { logVerbose } from "../../globals.js";
|
||||
import { defaultRuntime } from "../../runtime.js";
|
||||
import { resolveCommandAuthorization } from "../command-auth.js";
|
||||
import type { MsgContext } from "../templating.js";
|
||||
import { SILENT_REPLY_TOKEN } from "../tokens.js";
|
||||
import { hasAudioTranscriptionConfig, isAudio, transcribeInboundAudio } from "../transcription.js";
|
||||
import { applyMediaUnderstanding } from "../../media-understanding/apply.js";
|
||||
import type { GetReplyOptions, ReplyPayload } from "../types.js";
|
||||
import { resolveDefaultModel } from "./directive-handling.js";
|
||||
import { resolveReplyDirectives } from "./get-reply-directives.js";
|
||||
@@ -75,16 +74,11 @@ export async function getReplyFromConfig(
|
||||
});
|
||||
opts?.onTypingController?.(typing);
|
||||
|
||||
let transcribedText: string | undefined;
|
||||
if (hasAudioTranscriptionConfig(cfg) && isAudio(ctx.MediaType)) {
|
||||
const transcribed = await transcribeInboundAudio(cfg, ctx, defaultRuntime);
|
||||
if (transcribed?.text) {
|
||||
transcribedText = transcribed.text;
|
||||
ctx.Body = transcribed.text;
|
||||
ctx.Transcript = transcribed.text;
|
||||
logVerbose("Replaced Body with audio transcript for reply flow");
|
||||
}
|
||||
}
|
||||
await applyMediaUnderstanding({
|
||||
ctx,
|
||||
cfg,
|
||||
agentDir,
|
||||
});
|
||||
|
||||
const commandAuthorized = ctx.CommandAuthorized ?? true;
|
||||
resolveCommandAuthorization({
|
||||
@@ -253,7 +247,6 @@ export async function getReplyFromConfig(
|
||||
model,
|
||||
perMessageQueueMode,
|
||||
perMessageQueueOptions,
|
||||
transcribedText,
|
||||
typing,
|
||||
opts,
|
||||
defaultModel,
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
import type { ChannelId } from "../channels/plugins/types.js";
|
||||
import type { InternalMessageChannel } from "../utils/message-channel.js";
|
||||
import type { CommandArgs } from "./commands-registry.types.js";
|
||||
import type { MediaUnderstandingOutput } from "../media-understanding/types.js";
|
||||
|
||||
/** Valid message channels for routing. */
|
||||
export type OriginatingChannelType = ChannelId | InternalMessageChannel;
|
||||
@@ -41,6 +42,9 @@ export type MsgContext = {
|
||||
/** Remote host for SCP when media lives on a different machine (e.g., clawdbot@192.168.64.3). */
|
||||
MediaRemoteHost?: string;
|
||||
Transcript?: string;
|
||||
MediaUnderstanding?: MediaUnderstandingOutput[];
|
||||
Prompt?: string;
|
||||
MaxChars?: number;
|
||||
ChatType?: string;
|
||||
GroupSubject?: string;
|
||||
GroupRoom?: string;
|
||||
|
||||
Reference in New Issue
Block a user