feat: add inbound media understanding

Co-authored-by: Tristan Manchester <tmanchester96@gmail.com>
This commit is contained in:
Peter Steinberger
2026-01-17 03:52:37 +00:00
parent 4b749f1b8f
commit 1b973f7506
42 changed files with 2547 additions and 101 deletions

View File

@@ -254,7 +254,10 @@ export function createExecTool(
let yielded = false;
let yieldTimer: NodeJS.Timeout | null = null;
let timeoutTimer: NodeJS.Timeout | null = null;
let timeoutFinalizeTimer: NodeJS.Timeout | null = null;
let timedOut = false;
const timeoutFinalizeMs = 1000;
let rejectFn: ((err: Error) => void) | null = null;
const settle = (fn: () => void) => {
if (settled) return;
@@ -262,6 +265,18 @@ export function createExecTool(
fn();
};
const effectiveTimeout =
typeof params.timeout === "number" ? params.timeout : defaultTimeoutSec;
const finalizeTimeout = () => {
if (session.exited) return;
markExited(session, null, "SIGKILL", "failed");
if (settled || !rejectFn) return;
const aggregated = session.aggregated.trim();
const reason = `Command timed out after ${effectiveTimeout} seconds`;
const message = aggregated ? `${aggregated}\n\n${reason}` : reason;
settle(() => rejectFn?.(new Error(message)));
};
// Tool-call abort should not kill backgrounded sessions; timeouts still must.
const onAbortSignal = () => {
if (yielded || session.backgrounded) return;
@@ -272,15 +287,17 @@ export function createExecTool(
const onTimeout = () => {
timedOut = true;
killSession(session);
if (!timeoutFinalizeTimer) {
timeoutFinalizeTimer = setTimeout(() => {
finalizeTimeout();
}, timeoutFinalizeMs);
}
};
if (signal?.aborted) onAbortSignal();
else if (signal) {
signal.addEventListener("abort", onAbortSignal, { once: true });
}
const effectiveTimeout =
typeof params.timeout === "number" ? params.timeout : defaultTimeoutSec;
if (effectiveTimeout > 0) {
timeoutTimer = setTimeout(() => {
onTimeout();
@@ -321,6 +338,7 @@ export function createExecTool(
});
return new Promise<AgentToolResult<ExecToolDetails>>((resolve, reject) => {
rejectFn = reject;
const resolveRunning = () => {
settle(() =>
resolve({
@@ -369,6 +387,7 @@ export function createExecTool(
const handleExit = (code: number | null, exitSignal: NodeJS.Signals | number | null) => {
if (yieldTimer) clearTimeout(yieldTimer);
if (timeoutTimer) clearTimeout(timeoutTimer);
if (timeoutFinalizeTimer) clearTimeout(timeoutFinalizeTimer);
const durationMs = Date.now() - startedAt;
const wasSignal = exitSignal != null;
const isSuccess = code === 0 && !wasSignal && !signal?.aborted && !timedOut;
@@ -421,6 +440,7 @@ export function createExecTool(
child.once("error", (err) => {
if (yieldTimer) clearTimeout(yieldTimer);
if (timeoutTimer) clearTimeout(timeoutTimer);
if (timeoutFinalizeTimer) clearTimeout(timeoutFinalizeTimer);
markExited(session, null, null, "failed");
settle(() => reject(err));
});

View File

@@ -25,4 +25,20 @@ describe("buildInboundMediaNote", () => {
].join("\n"),
);
});
it("skips media notes for attachments with understanding output", () => {
const note = buildInboundMediaNote({
MediaPaths: ["/tmp/a.png", "/tmp/b.png"],
MediaUrls: ["https://example.com/a.png", "https://example.com/b.png"],
MediaUnderstanding: [
{
kind: "audio.transcription",
attachmentIndex: 0,
text: "hello",
provider: "groq",
},
],
});
expect(note).toBe("[media attached: /tmp/b.png | https://example.com/b.png]");
});
});

View File

@@ -18,6 +18,12 @@ function formatMediaAttachedLine(params: {
}
export function buildInboundMediaNote(ctx: MsgContext): string | undefined {
// Attachment indices follow MediaPaths/MediaUrls ordering as supplied by the channel.
const suppressed = new Set(
Array.isArray(ctx.MediaUnderstanding)
? ctx.MediaUnderstanding.map((output) => output.attachmentIndex)
: [],
);
const pathsFromArray = Array.isArray(ctx.MediaPaths) ? ctx.MediaPaths : undefined;
const paths =
pathsFromArray && pathsFromArray.length > 0
@@ -36,24 +42,33 @@ export function buildInboundMediaNote(ctx: MsgContext): string | undefined {
? ctx.MediaTypes
: undefined;
if (paths.length === 1) {
const entries = paths
.map((entry, index) => ({
path: entry ?? "",
type: types?.[index] ?? ctx.MediaType,
url: urls?.[index] ?? ctx.MediaUrl,
index,
}))
.filter((entry) => !suppressed.has(entry.index));
if (entries.length === 0) return undefined;
if (entries.length === 1) {
return formatMediaAttachedLine({
path: paths[0] ?? "",
type: types?.[0] ?? ctx.MediaType,
url: urls?.[0] ?? ctx.MediaUrl,
path: entries[0]?.path ?? "",
type: entries[0]?.type,
url: entries[0]?.url,
});
}
const count = paths.length;
const count = entries.length;
const lines: string[] = [`[media attached: ${count} files]`];
for (const [idx, mediaPath] of paths.entries()) {
for (const [idx, entry] of entries.entries()) {
lines.push(
formatMediaAttachedLine({
path: mediaPath,
path: entry.path,
index: idx + 1,
total: count,
type: types?.[idx],
url: urls?.[idx],
type: entry.type,
url: entry.url,
}),
);
}

View File

@@ -122,6 +122,7 @@ export async function resolveReplyDirectives(params: {
const commandSource =
sessionCtx.CommandBody ??
sessionCtx.RawBody ??
sessionCtx.Transcript ??
sessionCtx.BodyStripped ??
sessionCtx.Body ??
"";

View File

@@ -87,7 +87,6 @@ type RunPreparedReplyParams = {
cap?: number;
dropPolicy?: InlineDirectives["dropPolicy"];
};
transcribedText?: string;
typing: TypingController;
opts?: GetReplyOptions;
defaultModel: string;
@@ -210,7 +209,6 @@ export async function runPreparedReply(
model,
perMessageQueueMode,
perMessageQueueOptions,
transcribedText,
typing,
opts,
defaultModel,
@@ -325,11 +323,7 @@ export async function runPreparedReply(
sessionEntry = skillResult.sessionEntry ?? sessionEntry;
currentSystemSent = skillResult.systemSent;
const skillsSnapshot = skillResult.skillsSnapshot;
const prefixedBody = transcribedText
? [threadStarterNote, prefixedBodyBase, `Transcript:\n${transcribedText}`]
.filter(Boolean)
.join("\n\n")
: [threadStarterNote, prefixedBodyBase].filter(Boolean).join("\n\n");
const prefixedBody = [threadStarterNote, prefixedBodyBase].filter(Boolean).join("\n\n");
const mediaNote = buildInboundMediaNote(ctx);
const mediaReplyHint = mediaNote
? "To send an image back, add a line like: MEDIA:https://example.com/image.jpg (no spaces). Keep caption in the text body."
@@ -370,11 +364,7 @@ export async function runPreparedReply(
}
const sessionIdFinal = sessionId ?? crypto.randomUUID();
const sessionFile = resolveSessionFilePath(sessionIdFinal, sessionEntry);
const queueBodyBase = transcribedText
? [threadStarterNote, baseBodyFinal, `Transcript:\n${transcribedText}`]
.filter(Boolean)
.join("\n\n")
: [threadStarterNote, baseBodyFinal].filter(Boolean).join("\n\n");
const queueBodyBase = [threadStarterNote, baseBodyFinal].filter(Boolean).join("\n\n");
const queuedBody = mediaNote
? [mediaNote, mediaReplyHint, queueBodyBase].filter(Boolean).join("\n").trim()
: queueBodyBase;

View File

@@ -7,12 +7,11 @@ import { resolveModelRefFromString } from "../../agents/model-selection.js";
import { resolveAgentTimeoutMs } from "../../agents/timeout.js";
import { DEFAULT_AGENT_WORKSPACE_DIR, ensureAgentWorkspace } from "../../agents/workspace.js";
import { type ClawdbotConfig, loadConfig } from "../../config/config.js";
import { logVerbose } from "../../globals.js";
import { defaultRuntime } from "../../runtime.js";
import { resolveCommandAuthorization } from "../command-auth.js";
import type { MsgContext } from "../templating.js";
import { SILENT_REPLY_TOKEN } from "../tokens.js";
import { hasAudioTranscriptionConfig, isAudio, transcribeInboundAudio } from "../transcription.js";
import { applyMediaUnderstanding } from "../../media-understanding/apply.js";
import type { GetReplyOptions, ReplyPayload } from "../types.js";
import { resolveDefaultModel } from "./directive-handling.js";
import { resolveReplyDirectives } from "./get-reply-directives.js";
@@ -75,16 +74,11 @@ export async function getReplyFromConfig(
});
opts?.onTypingController?.(typing);
let transcribedText: string | undefined;
if (hasAudioTranscriptionConfig(cfg) && isAudio(ctx.MediaType)) {
const transcribed = await transcribeInboundAudio(cfg, ctx, defaultRuntime);
if (transcribed?.text) {
transcribedText = transcribed.text;
ctx.Body = transcribed.text;
ctx.Transcript = transcribed.text;
logVerbose("Replaced Body with audio transcript for reply flow");
}
}
await applyMediaUnderstanding({
ctx,
cfg,
agentDir,
});
const commandAuthorized = ctx.CommandAuthorized ?? true;
resolveCommandAuthorization({
@@ -253,7 +247,6 @@ export async function getReplyFromConfig(
model,
perMessageQueueMode,
perMessageQueueOptions,
transcribedText,
typing,
opts,
defaultModel,

View File

@@ -1,6 +1,7 @@
import type { ChannelId } from "../channels/plugins/types.js";
import type { InternalMessageChannel } from "../utils/message-channel.js";
import type { CommandArgs } from "./commands-registry.types.js";
import type { MediaUnderstandingOutput } from "../media-understanding/types.js";
/** Valid message channels for routing. */
export type OriginatingChannelType = ChannelId | InternalMessageChannel;
@@ -41,6 +42,9 @@ export type MsgContext = {
/** Remote host for SCP when media lives on a different machine (e.g., clawdbot@192.168.64.3). */
MediaRemoteHost?: string;
Transcript?: string;
MediaUnderstanding?: MediaUnderstandingOutput[];
Prompt?: string;
MaxChars?: number;
ChatType?: string;
GroupSubject?: string;
GroupRoom?: string;

View File

@@ -65,7 +65,7 @@ describe("legacy config detection", () => {
expect(res.config?.messages?.groupChat?.mentionPatterns).toEqual(["@clawd"]);
expect(res.config?.routing?.groupChat?.mentionPatterns).toBeUndefined();
});
it("migrates routing agentToAgent/queue/transcribeAudio to tools/messages/audio", async () => {
it("migrates routing agentToAgent/queue/transcribeAudio to tools/messages/media", async () => {
vi.resetModules();
const { migrateLegacyConfig } = await import("./config.js");
const res = migrateLegacyConfig({
@@ -80,7 +80,7 @@ describe("legacy config detection", () => {
});
expect(res.changes).toContain("Moved routing.agentToAgent → tools.agentToAgent.");
expect(res.changes).toContain("Moved routing.queue → messages.queue.");
expect(res.changes).toContain("Moved routing.transcribeAudio → tools.audio.transcription.");
expect(res.changes).toContain("Moved routing.transcribeAudio → tools.media.audio.models.");
expect(res.config?.tools?.agentToAgent).toEqual({
enabled: true,
allow: ["main"],
@@ -89,9 +89,16 @@ describe("legacy config detection", () => {
mode: "queue",
cap: 3,
});
expect(res.config?.tools?.audio?.transcription).toEqual({
args: ["--model", "base"],
timeoutSeconds: 2,
expect(res.config?.tools?.media?.audio).toEqual({
enabled: true,
models: [
{
command: "whisper",
type: "cli",
args: ["--model", "base"],
timeoutSeconds: 2,
},
],
});
expect(res.config?.routing).toBeUndefined();
});

View File

@@ -330,14 +330,17 @@ export const LEGACY_CONFIG_MIGRATIONS_PART_2: LegacyConfigMigration[] = [
const mapped = mapLegacyAudioTranscription(routing.transcribeAudio);
if (mapped) {
const tools = ensureRecord(raw, "tools");
const toolsAudio = ensureRecord(tools, "audio");
if (toolsAudio.transcription === undefined) {
toolsAudio.transcription = mapped;
changes.push("Moved routing.transcribeAudio → tools.audio.transcription.");
const media = ensureRecord(tools, "media");
const mediaAudio = ensureRecord(media, "audio");
const models = Array.isArray(mediaAudio.models)
? (mediaAudio.models as unknown[])
: [];
if (models.length === 0) {
mediaAudio.enabled = true;
mediaAudio.models = [mapped];
changes.push("Moved routing.transcribeAudio → tools.media.audio.models.");
} else {
changes.push(
"Removed routing.transcribeAudio (tools.audio.transcription already set).",
);
changes.push("Removed routing.transcribeAudio (tools.media.audio.models already set).");
}
} else {
changes.push("Removed routing.transcribeAudio (unsupported transcription CLI).");
@@ -350,12 +353,17 @@ export const LEGACY_CONFIG_MIGRATIONS_PART_2: LegacyConfigMigration[] = [
const mapped = mapLegacyAudioTranscription(audio.transcription);
if (mapped) {
const tools = ensureRecord(raw, "tools");
const toolsAudio = ensureRecord(tools, "audio");
if (toolsAudio.transcription === undefined) {
toolsAudio.transcription = mapped;
changes.push("Moved audio.transcription → tools.audio.transcription.");
const media = ensureRecord(tools, "media");
const mediaAudio = ensureRecord(media, "audio");
const models = Array.isArray(mediaAudio.models)
? (mediaAudio.models as unknown[])
: [];
if (models.length === 0) {
mediaAudio.enabled = true;
mediaAudio.models = [mapped];
changes.push("Moved audio.transcription → tools.media.audio.models.");
} else {
changes.push("Removed audio.transcription (tools.audio.transcription already set).");
changes.push("Removed audio.transcription (tools.media.audio.models already set).");
}
delete audio.transcription;
if (Object.keys(audio).length === 0) delete raw.audio;

View File

@@ -69,7 +69,7 @@ export const LEGACY_CONFIG_RULES: LegacyConfigRule[] = [
{
path: ["routing", "transcribeAudio"],
message:
"routing.transcribeAudio was moved; use tools.audio.transcription instead (auto-migrated on load).",
"routing.transcribeAudio was moved; use tools.media.audio.models instead (auto-migrated on load).",
},
{
path: ["telegram", "requireMention"],

View File

@@ -56,7 +56,7 @@ export const mapLegacyAudioTranscription = (value: unknown): Record<string, unkn
const timeoutSeconds =
typeof transcriber?.timeoutSeconds === "number" ? transcriber?.timeoutSeconds : undefined;
const result: Record<string, unknown> = {};
const result: Record<string, unknown> = { command: rawExecutable, type: "cli" };
if (args.length > 0) result.args = args;
if (timeoutSeconds !== undefined) result.timeoutSeconds = timeoutSeconds;
return result;

View File

@@ -102,8 +102,28 @@ const FIELD_LABELS: Record<string, string> = {
"gateway.remote.password": "Remote Gateway Password",
"gateway.auth.token": "Gateway Token",
"gateway.auth.password": "Gateway Password",
"tools.audio.transcription.args": "Audio Transcription Args",
"tools.audio.transcription.timeoutSeconds": "Audio Transcription Timeout (sec)",
"tools.media.image.enabled": "Enable Image Understanding",
"tools.media.image.maxBytes": "Image Understanding Max Bytes",
"tools.media.image.maxChars": "Image Understanding Max Chars",
"tools.media.image.prompt": "Image Understanding Prompt",
"tools.media.image.timeoutSeconds": "Image Understanding Timeout (sec)",
"tools.media.image.models": "Image Understanding Models",
"tools.media.image.scope": "Image Understanding Scope",
"tools.media.audio.enabled": "Enable Audio Understanding",
"tools.media.audio.maxBytes": "Audio Understanding Max Bytes",
"tools.media.audio.maxChars": "Audio Understanding Max Chars",
"tools.media.audio.prompt": "Audio Understanding Prompt",
"tools.media.audio.timeoutSeconds": "Audio Understanding Timeout (sec)",
"tools.media.audio.language": "Audio Understanding Language",
"tools.media.audio.models": "Audio Understanding Models",
"tools.media.audio.scope": "Audio Understanding Scope",
"tools.media.video.enabled": "Enable Video Understanding",
"tools.media.video.maxBytes": "Video Understanding Max Bytes",
"tools.media.video.maxChars": "Video Understanding Max Chars",
"tools.media.video.prompt": "Video Understanding Prompt",
"tools.media.video.timeoutSeconds": "Video Understanding Timeout (sec)",
"tools.media.video.models": "Video Understanding Models",
"tools.media.video.scope": "Video Understanding Scope",
"tools.profile": "Tool Profile",
"agents.list[].tools.profile": "Agent Tool Profile",
"tools.byProvider": "Tool Policy by Provider",

View File

@@ -47,7 +47,7 @@ export type BroadcastConfig = {
};
export type AudioConfig = {
/** @deprecated Use tools.audio.transcription instead. */
/** @deprecated Use tools.media.audio.models instead. */
transcription?: {
// Optional CLI to turn inbound audio into text; templated args, must output transcript to stdout.
command: string[];

View File

@@ -1,4 +1,76 @@
import type { AgentElevatedAllowFromConfig } from "./types.base.js";
import type { AgentElevatedAllowFromConfig, SessionSendPolicyAction } from "./types.base.js";
/** Narrowing criteria for a scope rule; an absent field matches anything. */
export type MediaUnderstandingScopeMatch = {
  /** Channel id the rule applies to (e.g. a messaging channel name). */
  channel?: string;
  /** Chat shape the rule applies to. */
  chatType?: "direct" | "group" | "room";
  /** Matches keys that start with this prefix — presumably session keys; confirm against scope resolution. */
  keyPrefix?: string;
};

/** One allow/deny rule for gating media understanding. */
export type MediaUnderstandingScopeRule = {
  /** Whether matching messages are allowed or denied understanding. */
  action: SessionSendPolicyAction;
  /** Criteria for the rule; omit to match every message. */
  match?: MediaUnderstandingScopeMatch;
};

/** Scope gating config: `default` presumably applies when no rule matches — confirm in scope.ts. */
export type MediaUnderstandingScopeConfig = {
  default?: SessionSendPolicyAction;
  rules?: MediaUnderstandingScopeRule[];
};

/** Media kinds a model entry can handle. */
export type MediaUnderstandingCapability = "image" | "audio" | "video";

/** A single understanding model entry; either a provider/model pair or a CLI command. */
export type MediaUnderstandingModelConfig = {
  /** provider API id (e.g. openai, google). */
  provider?: string;
  /** Model id for provider-based understanding. */
  model?: string;
  /** Optional capability tags for shared model lists. */
  capabilities?: MediaUnderstandingCapability[];
  /** Use a CLI command instead of provider API. */
  type?: "provider" | "cli";
  /** CLI binary (required when type=cli). */
  command?: string;
  /** CLI args (template-enabled). */
  args?: string[];
  /** Optional prompt override for this model entry. */
  prompt?: string;
  /** Optional max output characters for this model entry. */
  maxChars?: number;
  /** Optional max bytes for this model entry. */
  maxBytes?: number;
  /** Optional timeout override (seconds) for this model entry. */
  timeoutSeconds?: number;
  /** Optional language hint for audio transcription. */
  language?: string;
  /** Auth profile id to use for this provider. */
  profile?: string;
  /** Preferred profile id if multiple are available. */
  preferredProfile?: string;
};

/** Per-capability understanding config; entry-level fields in `models` override these defaults. */
export type MediaUnderstandingConfig = {
  /** Enable media understanding when models are configured. */
  enabled?: boolean;
  /** Optional scope gating for understanding. */
  scope?: MediaUnderstandingScopeConfig;
  /** Default max bytes to send. */
  maxBytes?: number;
  /** Default max output characters. */
  maxChars?: number;
  /** Default prompt. */
  prompt?: string;
  /** Default timeout (seconds). */
  timeoutSeconds?: number;
  /** Default language hint (audio). */
  language?: string;
  /** Ordered model list (fallbacks in order). */
  models?: MediaUnderstandingModelConfig[];
};

/** tools.media — one understanding config per media kind. */
export type MediaToolsConfig = {
  image?: MediaUnderstandingConfig;
  audio?: MediaUnderstandingConfig;
  video?: MediaUnderstandingConfig;
};
export type ToolProfileId = "minimal" | "coding" | "messaging" | "full";
@@ -127,13 +199,7 @@ export type ToolsConfig = {
};
};
};
audio?: {
transcription?: {
/** CLI args (template-enabled). */
args?: string[];
timeoutSeconds?: number;
};
};
media?: MediaToolsConfig;
/** Message tool configuration. */
message?: {
/**

View File

@@ -5,7 +5,7 @@ import {
GroupChatSchema,
HumanDelaySchema,
IdentitySchema,
ToolsAudioTranscriptionSchema,
ToolsMediaSchema,
} from "./zod-schema.core.js";
export const HeartbeatSchema = z
@@ -283,11 +283,7 @@ export const ToolsSchema = z
deny: z.array(z.string()).optional(),
byProvider: z.record(z.string(), ToolPolicyWithProfileSchema).optional(),
web: ToolsWebSchema,
audio: z
.object({
transcription: ToolsAudioTranscriptionSchema,
})
.optional(),
media: ToolsMediaSchema,
message: z
.object({
allowCrossContextSend: z.boolean().optional(),

View File

@@ -240,10 +240,68 @@ export const ExecutableTokenSchema = z
.string()
.refine(isSafeExecutableValue, "expected safe executable name or path");
export const ToolsAudioTranscriptionSchema = z
/** allow/deny scope gating for media understanding (mirrors MediaUnderstandingScopeConfig). */
export const MediaUnderstandingScopeSchema = z
  .object({
    default: z.union([z.literal("allow"), z.literal("deny")]).optional(),
    rules: z
      .array(
        z.object({
          action: z.union([z.literal("allow"), z.literal("deny")]),
          match: z
            .object({
              channel: z.string().optional(),
              chatType: z
                .union([z.literal("direct"), z.literal("group"), z.literal("room")])
                .optional(),
              keyPrefix: z.string().optional(),
            })
            .optional(),
        }),
      )
      .optional(),
  })
  .optional();

/** Capability tags for shared model lists. */
export const MediaUnderstandingCapabilitiesSchema = z
  .array(z.union([z.literal("image"), z.literal("audio"), z.literal("video")]))
  .optional();

// Non-optional base object so `models` arrays cannot contain `undefined`
// entries — z.array(<optional schema>) would accept [undefined], which
// mismatches MediaUnderstandingModelConfig[].
const MediaUnderstandingModelObjectSchema = z.object({
  provider: z.string().optional(),
  model: z.string().optional(),
  capabilities: MediaUnderstandingCapabilitiesSchema,
  type: z.union([z.literal("provider"), z.literal("cli")]).optional(),
  command: z.string().optional(),
  args: z.array(z.string()).optional(),
  prompt: z.string().optional(),
  maxChars: z.number().int().positive().optional(),
  maxBytes: z.number().int().positive().optional(),
  timeoutSeconds: z.number().int().positive().optional(),
  language: z.string().optional(),
  profile: z.string().optional(),
  preferredProfile: z.string().optional(),
});

/** One model entry; kept optional for backward compatibility at field positions. */
export const MediaUnderstandingModelSchema = MediaUnderstandingModelObjectSchema.optional();

/** Per-capability understanding config (image/audio/video share this shape). */
export const ToolsMediaUnderstandingSchema = z
  .object({
    enabled: z.boolean().optional(),
    scope: MediaUnderstandingScopeSchema,
    maxBytes: z.number().int().positive().optional(),
    maxChars: z.number().int().positive().optional(),
    prompt: z.string().optional(),
    timeoutSeconds: z.number().int().positive().optional(),
    language: z.string().optional(),
    models: z.array(MediaUnderstandingModelObjectSchema).optional(),
  })
  .optional();

/** tools.media — image/audio/video understanding settings. */
export const ToolsMediaSchema = z
  .object({
    // ToolsMediaUnderstandingSchema is already optional; no extra .optional() needed.
    image: ToolsMediaUnderstandingSchema,
    audio: ToolsMediaUnderstandingSchema,
    video: ToolsMediaUnderstandingSchema,
  })
  .optional();

View File

@@ -0,0 +1,258 @@
import fs from "node:fs/promises";
import os from "node:os";
import path from "node:path";
import { beforeEach, describe, expect, it, vi } from "vitest";
import type { ClawdbotConfig } from "../config/config.js";
import type { MsgContext } from "../auto-reply/templating.js";
import { resolveApiKeyForProvider } from "../agents/model-auth.js";
import { fetchRemoteMedia } from "../media/fetch.js";
vi.mock("../agents/model-auth.js", () => ({
resolveApiKeyForProvider: vi.fn(async () => ({
apiKey: "test-key",
source: "test",
})),
}));
vi.mock("../media/fetch.js", () => ({
fetchRemoteMedia: vi.fn(),
}));
vi.mock("../process/exec.js", () => ({
runExec: vi.fn(),
}));
// Load the module under test dynamically, after the vi.mock declarations
// above — presumably so the mocks are in effect before apply.js resolves
// its imports; confirm against vitest mock-hoisting behavior.
async function loadApply() {
  return await import("./apply.js");
}
describe("applyMediaUnderstanding", () => {
const mockedResolveApiKey = vi.mocked(resolveApiKeyForProvider);
const mockedFetchRemoteMedia = vi.mocked(fetchRemoteMedia);
beforeEach(() => {
mockedResolveApiKey.mockClear();
mockedFetchRemoteMedia.mockReset();
mockedFetchRemoteMedia.mockResolvedValue({
buffer: Buffer.from("audio-bytes"),
contentType: "audio/ogg",
fileName: "note.ogg",
});
});
it("sets Transcript and replaces Body when audio transcription succeeds", async () => {
const { applyMediaUnderstanding } = await loadApply();
const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-media-"));
const audioPath = path.join(dir, "note.ogg");
await fs.writeFile(audioPath, "hello");
const ctx: MsgContext = {
Body: "<media:audio>",
MediaPath: audioPath,
MediaType: "audio/ogg",
};
const cfg: ClawdbotConfig = {
tools: {
media: {
audio: {
enabled: true,
maxBytes: 1024 * 1024,
models: [{ provider: "groq" }],
},
},
},
};
const result = await applyMediaUnderstanding({
ctx,
cfg,
providers: {
groq: {
id: "groq",
transcribeAudio: async () => ({ text: "transcribed text" }),
},
},
});
expect(result.appliedAudio).toBe(true);
expect(ctx.Transcript).toBe("transcribed text");
expect(ctx.Body).toBe("[Audio]\nTranscript:\ntranscribed text");
expect(ctx.CommandBody).toBe("transcribed text");
expect(ctx.RawBody).toBe("transcribed text");
});
it("handles URL-only attachments for audio transcription", async () => {
const { applyMediaUnderstanding } = await loadApply();
const ctx: MsgContext = {
Body: "<media:audio>",
MediaUrl: "https://example.com/note.ogg",
MediaType: "audio/ogg",
ChatType: "dm",
};
const cfg: ClawdbotConfig = {
tools: {
media: {
audio: {
enabled: true,
maxBytes: 1024 * 1024,
scope: {
default: "deny",
rules: [{ action: "allow", match: { chatType: "direct" } }],
},
models: [{ provider: "groq" }],
},
},
},
};
const result = await applyMediaUnderstanding({
ctx,
cfg,
providers: {
groq: {
id: "groq",
transcribeAudio: async () => ({ text: "remote transcript" }),
},
},
});
expect(result.appliedAudio).toBe(true);
expect(ctx.Transcript).toBe("remote transcript");
expect(ctx.Body).toBe("[Audio]\nTranscript:\nremote transcript");
});
it("skips audio transcription when attachment exceeds maxBytes", async () => {
const { applyMediaUnderstanding } = await loadApply();
const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-media-"));
const audioPath = path.join(dir, "large.wav");
await fs.writeFile(audioPath, "0123456789");
const ctx: MsgContext = {
Body: "<media:audio>",
MediaPath: audioPath,
MediaType: "audio/wav",
};
const transcribeAudio = vi.fn(async () => ({ text: "should-not-run" }));
const cfg: ClawdbotConfig = {
tools: {
media: {
audio: {
enabled: true,
maxBytes: 4,
models: [{ provider: "groq" }],
},
},
},
};
const result = await applyMediaUnderstanding({
ctx,
cfg,
providers: { groq: { id: "groq", transcribeAudio } },
});
expect(result.appliedAudio).toBe(false);
expect(transcribeAudio).not.toHaveBeenCalled();
expect(ctx.Body).toBe("<media:audio>");
});
it("falls back to CLI model when provider fails", async () => {
const { applyMediaUnderstanding } = await loadApply();
const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-media-"));
const audioPath = path.join(dir, "note.ogg");
await fs.writeFile(audioPath, "hello");
const ctx: MsgContext = {
Body: "<media:audio>",
MediaPath: audioPath,
MediaType: "audio/ogg",
};
const cfg: ClawdbotConfig = {
tools: {
media: {
audio: {
enabled: true,
models: [
{ provider: "groq" },
{
type: "cli",
command: "whisper",
args: ["{{MediaPath}}"],
},
],
},
},
},
};
const execModule = await import("../process/exec.js");
vi.mocked(execModule.runExec).mockResolvedValue({
stdout: "cli transcript\n",
stderr: "",
});
const result = await applyMediaUnderstanding({
ctx,
cfg,
providers: {
groq: {
id: "groq",
transcribeAudio: async () => {
throw new Error("boom");
},
},
},
});
expect(result.appliedAudio).toBe(true);
expect(ctx.Transcript).toBe("cli transcript");
expect(ctx.Body).toBe("[Audio]\nTranscript:\ncli transcript");
});
it("uses CLI image understanding and preserves caption for commands", async () => {
const { applyMediaUnderstanding } = await loadApply();
const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-media-"));
const imagePath = path.join(dir, "photo.jpg");
await fs.writeFile(imagePath, "image-bytes");
const ctx: MsgContext = {
Body: "<media:image> show Dom",
MediaPath: imagePath,
MediaType: "image/jpeg",
};
const cfg: ClawdbotConfig = {
tools: {
media: {
image: {
enabled: true,
models: [
{
type: "cli",
command: "gemini",
args: ["--file", "{{MediaPath}}", "--prompt", "{{Prompt}}"],
},
],
},
},
},
};
const execModule = await import("../process/exec.js");
vi.mocked(execModule.runExec).mockResolvedValue({
stdout: "image description\n",
stderr: "",
});
const result = await applyMediaUnderstanding({
ctx,
cfg,
});
expect(result.appliedImage).toBe(true);
expect(ctx.Body).toBe("[Image]\nUser text:\nshow Dom\nDescription:\nimage description");
expect(ctx.CommandBody).toBe("show Dom");
expect(ctx.RawBody).toBe("show Dom");
});
});

View File

@@ -0,0 +1,804 @@
import crypto from "node:crypto";
import fs from "node:fs/promises";
import os from "node:os";
import path from "node:path";
import { fileURLToPath } from "node:url";
import type { Api, AssistantMessage, Context, Model } from "@mariozechner/pi-ai";
import { complete } from "@mariozechner/pi-ai";
import { discoverAuthStorage, discoverModels } from "@mariozechner/pi-coding-agent";
import type { ClawdbotConfig } from "../config/config.js";
import type { MsgContext } from "../auto-reply/templating.js";
import { applyTemplate } from "../auto-reply/templating.js";
import { getApiKeyForModel, resolveApiKeyForProvider } from "../agents/model-auth.js";
import { ensureClawdbotModelsJson } from "../agents/models-config.js";
import { minimaxUnderstandImage } from "../agents/minimax-vlm.js";
import { logVerbose, shouldLogVerbose } from "../globals.js";
import { fetchRemoteMedia } from "../media/fetch.js";
import { detectMime, getFileExtension, isAudioFileName } from "../media/mime.js";
import { runExec } from "../process/exec.js";
import type {
MediaUnderstandingConfig,
MediaUnderstandingModelConfig,
MediaUnderstandingScopeConfig,
} from "../config/types.tools.js";
import { extractMediaUserText, formatMediaUnderstandingBody } from "./format.js";
import {
buildMediaUnderstandingRegistry,
getMediaUnderstandingProvider,
normalizeMediaProviderId,
} from "./providers/index.js";
import { fetchWithTimeout } from "./providers/shared.js";
import { normalizeMediaUnderstandingChatType, resolveMediaUnderstandingScope } from "./scope.js";
import type {
MediaAttachment,
MediaUnderstandingOutput,
MediaUnderstandingProvider,
} from "./types.js";
import { coerceImageAssistantText } from "../agents/tools/image-tool.helpers.js";
const MB = 1024 * 1024;
/** Default cap on understanding output length (chars). */
const DEFAULT_MAX_CHARS = 500;
// Audio is deliberately uncapped by default; image/video descriptions get the default cap.
const DEFAULT_MAX_CHARS_BY_CAPABILITY: Record<Capability, number | undefined> = {
  image: DEFAULT_MAX_CHARS,
  audio: undefined,
  video: DEFAULT_MAX_CHARS,
};
/** Default per-capability attachment size limits (bytes). */
const DEFAULT_MAX_BYTES: Record<Capability, number> = {
  image: 10 * MB,
  audio: 20 * MB,
  video: 50 * MB,
};
/** Default per-capability timeouts (seconds). */
const DEFAULT_TIMEOUT_SECONDS: Record<Capability, number> = {
  image: 60,
  audio: 60,
  video: 120,
};
/** Default prompts when neither the config nor the model entry sets one. */
const DEFAULT_PROMPT: Record<Capability, string> = {
  image: "Describe the image.",
  audio: "Transcribe the audio.",
  video: "Describe the video.",
};
/** Hard ceiling on the base64-encoded video payload size. */
const DEFAULT_VIDEO_MAX_BASE64_BYTES = 70 * MB;
// Fallback transcription model per provider — presumably used when a model
// entry names only a provider; confirm at the audio call site.
const DEFAULT_AUDIO_MODELS: Record<string, string> = {
  groq: "whisper-large-v3-turbo",
  openai: "whisper-1",
};
// NOTE(review): name suggests a cap on buffered CLI output; confirm where runExec is invoked.
const CLI_OUTPUT_MAX_BUFFER = 5 * MB;

/** Aggregate result of running media understanding over a message context. */
export type ApplyMediaUnderstandingResult = {
  outputs: MediaUnderstandingOutput[];
  appliedImage: boolean;
  appliedAudio: boolean;
  appliedVideo: boolean;
};

type Capability = "image" | "audio" | "video";

/** In-memory media payload plus best-known metadata. */
type MediaBufferResult = {
  buffer: Buffer;
  mime?: string;
  fileName: string;
};

/** Filesystem location of media, with optional temp-file cleanup. */
type MediaPathResult = {
  path: string;
  cleanup?: () => Promise<void> | void;
};
/**
 * Trim an attachment path and convert file:// URLs to plain filesystem
 * paths. Returns undefined for empty input or an unparseable file URL.
 */
function normalizeAttachmentPath(raw?: string | null): string | undefined {
  const trimmed = (raw ?? "").trim();
  if (trimmed.length === 0) return undefined;
  if (!trimmed.startsWith("file://")) return trimmed;
  try {
    return fileURLToPath(trimmed);
  } catch {
    return undefined;
  }
}
/**
 * Flatten the various MsgContext media fields (array-valued or scalar)
 * into an ordered attachment list. Indices follow MediaPaths/MediaUrls
 * ordering as supplied by the channel; entries with neither a usable
 * path nor URL are dropped.
 */
function normalizeAttachments(ctx: MsgContext): MediaAttachment[] {
  const mediaPaths = Array.isArray(ctx.MediaPaths) ? ctx.MediaPaths : undefined;
  const mediaUrls = Array.isArray(ctx.MediaUrls) ? ctx.MediaUrls : undefined;
  const mediaTypes = Array.isArray(ctx.MediaTypes) ? ctx.MediaTypes : undefined;
  // Per-index mime: an explicit non-blank type hint wins; a single
  // attachment may fall back to the scalar MediaType.
  const mimeAt = (total: number, index: number): string | undefined => {
    const hint = mediaTypes?.[index];
    const normalized = typeof hint === "string" ? hint.trim() : "";
    if (normalized) return normalized;
    return total === 1 ? ctx.MediaType : undefined;
  };
  if (mediaPaths && mediaPaths.length > 0) {
    const total = mediaPaths.length;
    const fallbackUrls = mediaUrls && mediaUrls.length > 0 ? mediaUrls : undefined;
    const entries: MediaAttachment[] = mediaPaths.map((value, index) => ({
      path: value?.trim() || undefined,
      url: fallbackUrls?.[index] ?? ctx.MediaUrl,
      mime: mimeAt(total, index),
      index,
    }));
    return entries.filter((entry) => Boolean(entry.path?.trim() || entry.url?.trim()));
  }
  if (mediaUrls && mediaUrls.length > 0) {
    const total = mediaUrls.length;
    const entries: MediaAttachment[] = mediaUrls.map((value, index) => ({
      path: undefined,
      url: value?.trim() || undefined,
      mime: mimeAt(total, index),
      index,
    }));
    return entries.filter((entry) => Boolean(entry.url?.trim()));
  }
  // Scalar fallback: a single MediaPath/MediaUrl pair.
  const singlePath = ctx.MediaPath?.trim();
  const singleUrl = ctx.MediaUrl?.trim();
  if (!singlePath && !singleUrl) return [];
  return [
    {
      path: singlePath || undefined,
      url: singleUrl || undefined,
      mime: ctx.MediaType,
      index: 0,
    },
  ];
}
/** True when the attachment looks like video, by mime prefix or file extension. */
function isVideoAttachment(attachment: MediaAttachment): boolean {
  if (attachment.mime?.startsWith("video/")) return true;
  const extension = getFileExtension(attachment.path ?? attachment.url);
  const videoExtensions = [".mp4", ".mov", ".mkv", ".webm", ".avi", ".m4v"];
  return extension ? videoExtensions.includes(extension) : false;
}
/** True when the MIME hint or the file-name heuristic marks this attachment as audio. */
function isAudioAttachment(attachment: MediaAttachment): boolean {
  return attachment.mime?.startsWith("audio/")
    ? true
    : isAudioFileName(attachment.path ?? attachment.url);
}
/** True when the MIME hint or the file extension marks this attachment as an image. */
function isImageAttachment(attachment: MediaAttachment): boolean {
  if (attachment.mime?.startsWith("image/")) return true;
  const ext = getFileExtension(attachment.path ?? attachment.url);
  if (!ext) return false;
  const imageExtensions = new Set([
    ".png",
    ".jpg",
    ".jpeg",
    ".webp",
    ".gif",
    ".bmp",
    ".tiff",
    ".tif",
  ]);
  return imageExtensions.has(ext);
}
/** Size of `bytes` raw bytes after base64 encoding: 4 output chars per 3-byte group. */
function estimateBase64Size(bytes: number): number {
  const groups = Math.ceil(bytes / 3);
  return groups * 4;
}
// Budget for a video payload AFTER base64 expansion: the raw maxBytes budget
// inflated by 4/3 (the base64 overhead), clamped to the hard global ceiling.
function resolveVideoMaxBase64Bytes(maxBytes: number): number {
  const expanded = Math.floor(maxBytes * (4 / 3));
  return Math.min(expanded, DEFAULT_VIDEO_MAX_BASE64_BYTES);
}
/** Convert a configured timeout in seconds to milliseconds, clamped to at least 1s. */
function resolveTimeoutMs(seconds: number | undefined, fallbackSeconds: number): number {
  const chosen =
    typeof seconds === "number" && Number.isFinite(seconds) ? seconds : fallbackSeconds;
  return Math.max(1000, Math.floor(chosen * 1000));
}
/** Pick the effective prompt, appending a length cap except for audio (transcripts are not capped). */
function resolvePrompt(capability: Capability, prompt?: string, maxChars?: number): string {
  const base = prompt?.trim() || DEFAULT_PROMPT[capability];
  if (capability === "audio" || !maxChars) return base;
  return `${base} Respond in at most ${maxChars} characters.`;
}
/** Normalize any fetch() input (string, URL, or Request) into its URL string. */
function resolveRequestUrl(input: RequestInfo | URL): string {
  if (input instanceof URL) return input.toString();
  return typeof input === "string" ? input : input.url;
}
/** Best-effort conversion of an arbitrary thrown value into a loggable string. */
function normalizeErrorMessage(err: unknown): string {
  if (!err) return "";
  if (err instanceof Error) return err.message;
  if (typeof err === "string") return err;
  try {
    return JSON.stringify(err);
  } catch {
    // Circular or otherwise unserializable values yield an empty message.
    return "";
  }
}
/** Resolve the output character budget: entry override, then capability config, then default. */
function resolveMaxChars(params: {
  capability: Capability;
  entry: MediaUnderstandingModelConfig;
  cfg: ClawdbotConfig;
}): number | undefined {
  const { capability, entry, cfg } = params;
  const fromConfig = entry.maxChars ?? cfg.tools?.media?.[capability]?.maxChars;
  return typeof fromConfig === "number"
    ? fromConfig
    : DEFAULT_MAX_CHARS_BY_CAPABILITY[capability];
}
/** Trim whitespace and, when a positive budget is given, cut to at most maxChars. */
function trimOutput(text: string, maxChars?: number): string {
  const trimmed = text.trim();
  if (maxChars && trimmed.length > maxChars) {
    return trimmed.slice(0, maxChars).trim();
  }
  return trimmed;
}
/** Return `primary` when explicitly set; otherwise fall back to the default. */
function resolveConfigValue<T>(primary: T | undefined, fallback: T): T {
  if (primary === undefined) return fallback;
  return primary;
}
// Per-capability media-understanding config lookup (tools.media.image/.audio/.video).
function resolveCapabilityConfig(
  cfg: ClawdbotConfig,
  capability: Capability,
): MediaUnderstandingConfig | undefined {
  return cfg.tools?.media?.[capability];
}
// Evaluate the configured scope policy for this message. The channel is the
// message's surface when present, falling back to the provider id.
function resolveScopeDecision(params: {
  scope?: MediaUnderstandingScopeConfig;
  ctx: MsgContext;
}): "allow" | "deny" {
  return resolveMediaUnderstandingScope({
    scope: params.scope,
    sessionKey: params.ctx.SessionKey,
    channel: params.ctx.Surface ?? params.ctx.Provider,
    chatType: normalizeMediaUnderstandingChatType(params.ctx.ChatType),
  });
}
/** Filter the configured model entries down to those usable for `capability`. */
function resolveModelEntries(
  cfg: MediaUnderstandingConfig | undefined,
  capability: Capability,
): MediaUnderstandingModelConfig[] {
  const models = cfg?.models;
  if (!models || models.length === 0) return [];
  // An entry with no capability list is treated as supporting every capability.
  return models.filter((entry) => {
    const caps = entry.capabilities;
    return !caps || caps.length === 0 || caps.includes(capability);
  });
}
/** Detect the size-limit errors raised by the media fetch path so callers can skip, not fail. */
function isMaxBytesError(err: unknown): boolean {
  const message = normalizeErrorMessage(err);
  return Boolean(
    message &&
      (message.includes("exceeds maxBytes") || message.includes("payload exceeds maxBytes")),
  );
}
/**
 * Materialize an attachment as an in-memory buffer plus MIME/file-name info.
 *
 * A local path is tried first (relative paths resolve against the CWD); any
 * stat/read failure falls through to the remote URL when one is present.
 * Attachments over `maxBytes` are skipped — the function returns undefined
 * rather than truncating. MIME preference: explicit attachment hint, then the
 * remote content-type, then sniffing via detectMime.
 */
async function loadAttachmentBuffer(params: {
  attachment: MediaAttachment;
  maxBytes: number;
  timeoutMs: number;
}): Promise<MediaBufferResult | undefined> {
  const { attachment, maxBytes, timeoutMs } = params;
  const rawPath = normalizeAttachmentPath(attachment.path);
  if (rawPath) {
    const resolved = path.isAbsolute(rawPath) ? rawPath : path.resolve(rawPath);
    try {
      const stat = await fs.stat(resolved);
      if (!stat.isFile()) return undefined;
      if (stat.size > maxBytes) {
        if (shouldLogVerbose()) {
          logVerbose(
            `Skipping media attachment ${attachment.index + 1}: ${stat.size} bytes exceeds ${maxBytes}`,
          );
        }
        return undefined;
      }
      const buffer = await fs.readFile(resolved);
      const mime =
        attachment.mime ??
        (await detectMime({
          buffer,
          filePath: resolved,
        }));
      const fileName = path.basename(resolved) || `media-${attachment.index + 1}`;
      return { buffer, mime, fileName };
    } catch (err) {
      // Local read failed; fall through and try the URL instead.
      if (shouldLogVerbose()) {
        logVerbose(`Failed to read attachment ${attachment.index + 1}: ${String(err)}`);
      }
    }
  }
  const url = attachment.url?.trim();
  if (!url) return undefined;
  try {
    const fetchImpl = (input: RequestInfo | URL, init?: RequestInit) =>
      fetchWithTimeout(resolveRequestUrl(input), init ?? {}, timeoutMs, fetch);
    const fetched = await fetchRemoteMedia({ url, fetchImpl, maxBytes });
    // fetchRemoteMedia is given maxBytes too; this re-check is defensive.
    if (fetched.buffer.length > maxBytes) {
      if (shouldLogVerbose()) {
        logVerbose(
          `Skipping media attachment ${attachment.index + 1}: ${fetched.buffer.length} bytes exceeds ${maxBytes}`,
        );
      }
      return undefined;
    }
    const mime =
      attachment.mime ??
      fetched.contentType ??
      (await detectMime({
        buffer: fetched.buffer,
        filePath: fetched.fileName ?? url,
      }));
    const fileName = fetched.fileName ?? `media-${attachment.index + 1}`;
    return { buffer: fetched.buffer, mime, fileName };
  } catch (err) {
    if (shouldLogVerbose()) {
      logVerbose(`Failed to fetch attachment ${attachment.index + 1}: ${String(err)}`);
    }
  }
  return undefined;
}
/**
 * Resolve an attachment to a local filesystem path (for CLI-based handlers).
 *
 * Local paths are used in place after an optional maxBytes check; remote media
 * is downloaded into os.tmpdir() and the result carries a `cleanup` callback
 * that the caller MUST invoke to delete the temp file. Returns undefined when
 * the attachment is missing, unreadable, or too large.
 */
async function resolveAttachmentPath(params: {
  attachment: MediaAttachment;
  maxBytes?: number;
  timeoutMs: number;
}): Promise<MediaPathResult | undefined> {
  const { attachment, maxBytes, timeoutMs } = params;
  const rawPath = normalizeAttachmentPath(attachment.path);
  if (rawPath) {
    const resolved = path.isAbsolute(rawPath) ? rawPath : path.resolve(rawPath);
    try {
      const stat = await fs.stat(resolved);
      if (!stat.isFile()) return undefined;
      if (maxBytes && stat.size > maxBytes) {
        if (shouldLogVerbose()) {
          logVerbose(
            `Skipping media attachment ${attachment.index + 1}: ${stat.size} bytes exceeds ${maxBytes}`,
          );
        }
        return undefined;
      }
      // Local file: no cleanup callback, the caller does not own this path.
      return { path: resolved };
    } catch (err) {
      if (shouldLogVerbose()) {
        logVerbose(`Failed to read attachment ${attachment.index + 1}: ${String(err)}`);
      }
    }
  }
  const url = attachment.url?.trim();
  if (!url) return undefined;
  try {
    const fetchImpl = (input: RequestInfo | URL, init?: RequestInit) =>
      fetchWithTimeout(resolveRequestUrl(input), init ?? {}, timeoutMs, fetch);
    const fetched = await fetchRemoteMedia({ url, fetchImpl, maxBytes });
    const buffer = fetched.buffer;
    if (maxBytes && buffer.length > maxBytes) {
      if (shouldLogVerbose()) {
        logVerbose(
          `Skipping media attachment ${attachment.index + 1}: ${buffer.length} bytes exceeds ${maxBytes}`,
        );
      }
      return undefined;
    }
    // Preserve the remote extension (when known) so downstream tools can sniff by suffix.
    const extension = fetched.fileName ? path.extname(fetched.fileName) : "";
    const tmpPath = path.join(
      os.tmpdir(),
      `clawdbot-media-${crypto.randomUUID()}${extension || ""}`,
    );
    await fs.writeFile(tmpPath, buffer);
    return {
      path: tmpPath,
      cleanup: async () => {
        await fs.unlink(tmpPath).catch(() => {});
      },
    };
  } catch (err) {
    if (shouldLogVerbose()) {
      logVerbose(`Failed to fetch attachment ${attachment.index + 1}: ${String(err)}`);
    }
  }
  return undefined;
}
/**
 * Describe an image with a chat model from the main model registry.
 *
 * Validates that the model advertises image input, resolves an API key for it,
 * then sends prompt + base64 image. MiniMax goes through its dedicated
 * image-understanding endpoint; every other provider goes through complete()
 * with a 512-token response cap.
 *
 * @throws when the model is unknown or does not accept image input.
 */
async function describeImageWithModel(params: {
  cfg: ClawdbotConfig;
  agentDir: string;
  provider: string;
  model: string;
  prompt: string;
  maxChars?: number;
  buffer: Buffer;
  mimeType: string;
  profile?: string;
  preferredProfile?: string;
}): Promise<{ text: string; model: string }> {
  // Make sure the models.json registry exists before discovery runs.
  await ensureClawdbotModelsJson(params.cfg, params.agentDir);
  const authStorage = discoverAuthStorage(params.agentDir);
  const modelRegistry = discoverModels(authStorage, params.agentDir);
  const model = modelRegistry.find(params.provider, params.model) as Model<Api> | null;
  if (!model) {
    throw new Error(`Unknown model: ${params.provider}/${params.model}`);
  }
  if (!model.input?.includes("image")) {
    throw new Error(`Model does not support images: ${params.provider}/${params.model}`);
  }
  const apiKeyInfo = await getApiKeyForModel({
    model,
    cfg: params.cfg,
    agentDir: params.agentDir,
    profileId: params.profile,
    preferredProfile: params.preferredProfile,
  });
  authStorage.setRuntimeApiKey(model.provider, apiKeyInfo.apiKey);
  const base64 = params.buffer.toString("base64");
  if (model.provider === "minimax") {
    const text = await minimaxUnderstandImage({
      apiKey: apiKeyInfo.apiKey,
      prompt: params.prompt,
      imageDataUrl: `data:${params.mimeType};base64,${base64}`,
      modelBaseUrl: model.baseUrl,
    });
    return { text, model: model.id };
  }
  const context: Context = {
    messages: [
      {
        role: "user",
        content: [
          { type: "text", text: params.prompt },
          { type: "image", data: base64, mimeType: params.mimeType },
        ],
        timestamp: Date.now(),
      },
    ],
  };
  const message = (await complete(model, context, {
    apiKey: apiKeyInfo.apiKey,
    maxTokens: 512,
  })) as AssistantMessage;
  const text = coerceImageAssistantText({
    message,
    provider: model.provider,
    model: model.id,
  });
  return { text, model: model.id };
}
/**
 * Run a single configured provider entry against one attachment.
 *
 * Size/length/timeout/prompt settings resolve entry-level values first, then
 * the capability-level config, then built-in defaults. Dispatch by capability:
 * - image: requires agentDir and an explicit model id; served by the main
 *   model registry (describeImageWithModel) rather than the media registry.
 * - audio/video: looked up in the media provider registry; video payloads are
 *   additionally capped by their base64-expanded size.
 * Returns null when the attachment is skipped (unreadable or over budget).
 *
 * @throws when the entry is malformed or the provider/capability is unavailable.
 */
async function runProviderEntry(params: {
  capability: Capability;
  entry: MediaUnderstandingModelConfig;
  cfg: ClawdbotConfig;
  ctx: MsgContext;
  attachment: MediaAttachment;
  agentDir?: string;
  providerRegistry: Map<string, MediaUnderstandingProvider>;
}): Promise<MediaUnderstandingOutput | null> {
  const { entry, capability, cfg, attachment } = params;
  const providerIdRaw = entry.provider?.trim();
  if (!providerIdRaw) {
    throw new Error(`Provider entry missing provider for ${capability}`);
  }
  const providerId = normalizeMediaProviderId(providerIdRaw);
  const maxBytes =
    entry.maxBytes ??
    resolveConfigValue(cfg.tools?.media?.[capability]?.maxBytes, DEFAULT_MAX_BYTES[capability]);
  const maxChars = resolveMaxChars({ capability, entry, cfg });
  const timeoutMs = resolveTimeoutMs(
    entry.timeoutSeconds ?? cfg.tools?.media?.[capability]?.timeoutSeconds,
    DEFAULT_TIMEOUT_SECONDS[capability],
  );
  const prompt = resolvePrompt(
    capability,
    entry.prompt ?? cfg.tools?.media?.[capability]?.prompt,
    maxChars,
  );
  if (capability === "image") {
    if (!params.agentDir) {
      throw new Error("Image understanding requires agentDir");
    }
    const modelId = entry.model?.trim();
    if (!modelId) {
      throw new Error("Image understanding requires model id");
    }
    const media = await loadAttachmentBuffer({ attachment, maxBytes, timeoutMs });
    if (!media) return null;
    const mimeType = media.mime ?? "image/jpeg";
    const result = await describeImageWithModel({
      cfg,
      agentDir: params.agentDir,
      provider: providerId,
      model: modelId,
      prompt,
      maxChars,
      buffer: media.buffer,
      mimeType,
      profile: entry.profile,
      preferredProfile: entry.preferredProfile,
    });
    return {
      kind: "image.description",
      attachmentIndex: attachment.index,
      text: trimOutput(result.text, maxChars),
      provider: providerId,
      model: result.model,
    };
  }
  const provider = getMediaUnderstandingProvider(providerId, params.providerRegistry);
  if (!provider) {
    throw new Error(`Media provider not available: ${providerId}`);
  }
  if (capability === "audio") {
    if (!provider.transcribeAudio) {
      throw new Error(`Audio transcription provider "${providerId}" not available.`);
    }
    const media = await loadAttachmentBuffer({ attachment, maxBytes, timeoutMs });
    if (!media) return null;
    const key = await resolveApiKeyForProvider({
      provider: providerId,
      cfg,
      profileId: entry.profile,
      preferredProfile: entry.preferredProfile,
      agentDir: params.agentDir,
    });
    const providerConfig = cfg.models?.providers?.[providerId];
    // No explicit model: fall back to the provider's default audio model.
    const model = entry.model?.trim() || DEFAULT_AUDIO_MODELS[providerId] || entry.model;
    const result = await provider.transcribeAudio({
      buffer: media.buffer,
      fileName: media.fileName,
      mime: media.mime,
      apiKey: key.apiKey,
      baseUrl: providerConfig?.baseUrl,
      headers: providerConfig?.headers,
      model,
      language: entry.language ?? cfg.tools?.media?.audio?.language,
      prompt,
      timeoutMs,
    });
    return {
      kind: "audio.transcription",
      attachmentIndex: attachment.index,
      text: trimOutput(result.text, maxChars),
      provider: providerId,
      model: result.model ?? model,
    };
  }
  if (capability === "video") {
    if (!provider.describeVideo) {
      throw new Error(`Video understanding provider "${providerId}" not available.`);
    }
    const media = await loadAttachmentBuffer({ attachment, maxBytes, timeoutMs });
    if (!media) return null;
    // Videos are sent inline as base64, so enforce the post-expansion budget too.
    const estimatedBase64Bytes = estimateBase64Size(media.buffer.length);
    const maxBase64Bytes = resolveVideoMaxBase64Bytes(maxBytes);
    if (estimatedBase64Bytes > maxBase64Bytes) {
      if (shouldLogVerbose()) {
        logVerbose(
          `Skipping video attachment ${attachment.index + 1}: base64 payload ${estimatedBase64Bytes} exceeds ${maxBase64Bytes}`,
        );
      }
      return null;
    }
    const key = await resolveApiKeyForProvider({
      provider: providerId,
      cfg,
      profileId: entry.profile,
      preferredProfile: entry.preferredProfile,
      agentDir: params.agentDir,
    });
    const providerConfig = cfg.models?.providers?.[providerId];
    const result = await provider.describeVideo({
      buffer: media.buffer,
      fileName: media.fileName,
      mime: media.mime,
      apiKey: key.apiKey,
      baseUrl: providerConfig?.baseUrl,
      headers: providerConfig?.headers,
      model: entry.model,
      prompt,
      timeoutMs,
    });
    return {
      kind: "video.description",
      attachmentIndex: attachment.index,
      text: trimOutput(result.text, maxChars),
      provider: providerId,
      model: result.model ?? entry.model,
    };
  }
  return null;
}
/**
 * Run a user-configured CLI command against one attachment.
 *
 * The attachment is resolved to a local path (remote media is downloaded to a
 * temp file and removed afterwards via the cleanup callback). Every argv
 * element except the command itself is template-expanded against the message
 * context extended with MediaPath, Prompt, and MaxChars. Trimmed/capped
 * stdout becomes the output text; empty output yields null.
 *
 * @throws when the entry has no command, or when the command itself fails.
 */
async function runCliEntry(params: {
  capability: Capability;
  entry: MediaUnderstandingModelConfig;
  cfg: ClawdbotConfig;
  ctx: MsgContext;
  attachment: MediaAttachment;
}): Promise<MediaUnderstandingOutput | null> {
  const { entry, capability, cfg, ctx, attachment } = params;
  const command = entry.command?.trim();
  const args = entry.args ?? [];
  if (!command) {
    throw new Error(`CLI entry missing command for ${capability}`);
  }
  const maxBytes =
    entry.maxBytes ??
    resolveConfigValue(cfg.tools?.media?.[capability]?.maxBytes, DEFAULT_MAX_BYTES[capability]);
  const maxChars = resolveMaxChars({ capability, entry, cfg });
  const timeoutMs = resolveTimeoutMs(
    entry.timeoutSeconds ?? cfg.tools?.media?.[capability]?.timeoutSeconds,
    DEFAULT_TIMEOUT_SECONDS[capability],
  );
  const prompt = resolvePrompt(
    capability,
    entry.prompt ?? cfg.tools?.media?.[capability]?.prompt,
    maxChars,
  );
  const pathResult = await resolveAttachmentPath({
    attachment,
    maxBytes,
    timeoutMs,
  });
  if (!pathResult) return null;
  const templCtx: MsgContext = {
    ...ctx,
    MediaPath: pathResult.path,
    Prompt: prompt,
    MaxChars: maxChars,
  };
  // index 0 is the command itself and is never template-expanded.
  const argv = [command, ...args].map((part, index) =>
    index === 0 ? part : applyTemplate(part, templCtx),
  );
  if (shouldLogVerbose()) {
    logVerbose(`Media understanding via CLI: ${argv.join(" ")}`);
  }
  try {
    const { stdout } = await runExec(argv[0], argv.slice(1), {
      timeoutMs,
      maxBuffer: CLI_OUTPUT_MAX_BUFFER,
    });
    const text = trimOutput(stdout, maxChars);
    if (!text) return null;
    return {
      kind: capability === "audio" ? "audio.transcription" : `${capability}.description`,
      attachmentIndex: attachment.index,
      text,
      provider: "cli",
      model: command,
    };
  } finally {
    // Always remove the temp file for downloaded media, even when the CLI fails.
    if (pathResult.cleanup) {
      await pathResult.cleanup();
    }
  }
}
/**
 * Run one capability (image/audio/video) over the message's attachments.
 *
 * Honors the enabled flag and scope policy, picks the FIRST attachment that
 * matches the capability, and tries configured entries in order until one
 * produces output. Per-entry failures are logged (verbose only) and do not
 * abort the remaining entries; size errors get a dedicated log message.
 */
async function runCapability(params: {
  capability: Capability;
  cfg: ClawdbotConfig;
  ctx: MsgContext;
  attachments: MediaAttachment[];
  agentDir?: string;
  providerRegistry: Map<string, MediaUnderstandingProvider>;
}): Promise<MediaUnderstandingOutput | null> {
  const { capability, cfg, ctx, attachments } = params;
  const config = resolveCapabilityConfig(cfg, capability);
  if (!config || config.enabled === false) return null;
  const entries = resolveModelEntries(config, capability);
  if (entries.length === 0) return null;
  const scopeDecision = resolveScopeDecision({ scope: config.scope, ctx });
  if (scopeDecision === "deny") {
    if (shouldLogVerbose()) {
      logVerbose(`${capability} understanding disabled by scope policy.`);
    }
    return null;
  }
  const attachment = attachments.find((item) => {
    if (capability === "image") return isImageAttachment(item);
    if (capability === "audio") return isAudioAttachment(item);
    return isVideoAttachment(item);
  });
  if (!attachment) return null;
  for (const entry of entries) {
    try {
      // An entry with a command but no explicit type is treated as a CLI entry.
      const entryType = entry.type ?? (entry.command ? "cli" : "provider");
      const result =
        entryType === "cli"
          ? await runCliEntry({ capability, entry, cfg, ctx, attachment })
          : await runProviderEntry({
              capability,
              entry,
              cfg,
              ctx,
              attachment,
              agentDir: params.agentDir,
              providerRegistry: params.providerRegistry,
            });
      if (result) return result;
    } catch (err) {
      if (isMaxBytesError(err)) {
        if (shouldLogVerbose()) {
          logVerbose(`Skipping ${capability} model due to size: ${String(err)}`);
        }
      } else if (shouldLogVerbose()) {
        logVerbose(`${capability} understanding failed: ${String(err)}`);
      }
    }
  }
  return null;
}
/**
 * Entry point: enrich an inbound message with media understanding.
 *
 * Runs the image, audio, and video capabilities in turn (at most one output
 * each) and rewrites ctx.Body with the formatted results. An audio
 * transcription additionally replaces Transcript/CommandBody/RawBody so
 * downstream command parsing sees the spoken text; otherwise the user's
 * original text (with media placeholders stripped) is restored there.
 * Mutates `ctx` in place and also returns a summary of what was applied.
 */
export async function applyMediaUnderstanding(params: {
  ctx: MsgContext;
  cfg: ClawdbotConfig;
  agentDir?: string;
  providers?: Record<string, MediaUnderstandingProvider>;
}): Promise<ApplyMediaUnderstandingResult> {
  const { ctx, cfg } = params;
  // Capture the human-written text before Body is rewritten below.
  const commandCandidates = [ctx.CommandBody, ctx.RawBody, ctx.Body];
  const originalUserText =
    commandCandidates
      .map((value) => extractMediaUserText(value))
      .find((value) => value && value.trim()) ?? undefined;
  const attachments = normalizeAttachments(ctx);
  const providerRegistry = buildMediaUnderstandingRegistry(params.providers);
  const outputs: MediaUnderstandingOutput[] = [];
  const imageOutput = await runCapability({
    capability: "image",
    cfg,
    ctx,
    attachments,
    agentDir: params.agentDir,
    providerRegistry,
  });
  if (imageOutput) outputs.push(imageOutput);
  const audioOutput = await runCapability({
    capability: "audio",
    cfg,
    ctx,
    attachments,
    agentDir: params.agentDir,
    providerRegistry,
  });
  if (audioOutput) outputs.push(audioOutput);
  const videoOutput = await runCapability({
    capability: "video",
    cfg,
    ctx,
    attachments,
    agentDir: params.agentDir,
    providerRegistry,
  });
  if (videoOutput) outputs.push(videoOutput);
  if (outputs.length > 0) {
    ctx.Body = formatMediaUnderstandingBody({ body: ctx.Body, outputs });
    const audioResult = outputs.find((output) => output.kind === "audio.transcription");
    if (audioResult) {
      ctx.Transcript = audioResult.text;
      ctx.CommandBody = audioResult.text;
      ctx.RawBody = audioResult.text;
    } else if (originalUserText) {
      ctx.CommandBody = originalUserText;
      ctx.RawBody = originalUserText;
    }
    ctx.MediaUnderstanding = [...(ctx.MediaUnderstanding ?? []), ...outputs];
  }
  return {
    outputs,
    appliedImage: outputs.some((output) => output.kind === "image.description"),
    appliedAudio: outputs.some((output) => output.kind === "audio.transcription"),
    appliedVideo: outputs.some((output) => output.kind === "video.description"),
  };
}

View File

@@ -0,0 +1,91 @@
import { describe, expect, it } from "vitest";
import { formatMediaUnderstandingBody } from "./format.js";
// Vitest suite for formatMediaUnderstandingBody: pins the exact rendered body
// (section headers, "User text:"/"Transcript:"/"Description:" labels, and
// newline layout) for placeholder-only, captioned, and multi-output inputs.
describe("formatMediaUnderstandingBody", () => {
  it("replaces placeholder body with transcript", () => {
    const body = formatMediaUnderstandingBody({
      body: "<media:audio>",
      outputs: [
        {
          kind: "audio.transcription",
          attachmentIndex: 0,
          text: "hello world",
          provider: "groq",
        },
      ],
    });
    expect(body).toBe("[Audio]\nTranscript:\nhello world");
  });
  it("includes user text when body is meaningful", () => {
    const body = formatMediaUnderstandingBody({
      body: "caption here",
      outputs: [
        {
          kind: "audio.transcription",
          attachmentIndex: 0,
          text: "transcribed",
          provider: "groq",
        },
      ],
    });
    expect(body).toBe("[Audio]\nUser text:\ncaption here\nTranscript:\ntranscribed");
  });
  it("strips leading media placeholders from user text", () => {
    const body = formatMediaUnderstandingBody({
      body: "<media:audio> caption here",
      outputs: [
        {
          kind: "audio.transcription",
          attachmentIndex: 0,
          text: "transcribed",
          provider: "groq",
        },
      ],
    });
    expect(body).toBe("[Audio]\nUser text:\ncaption here\nTranscript:\ntranscribed");
  });
  it("keeps user text once when multiple outputs exist", () => {
    const body = formatMediaUnderstandingBody({
      body: "caption here",
      outputs: [
        {
          kind: "audio.transcription",
          attachmentIndex: 0,
          text: "audio text",
          provider: "groq",
        },
        {
          kind: "video.description",
          attachmentIndex: 1,
          text: "video text",
          provider: "google",
        },
      ],
    });
    // With more than one output the user text becomes its own leading section.
    expect(body).toBe(
      [
        "User text:\ncaption here",
        "[Audio]\nTranscript:\naudio text",
        "[Video]\nDescription:\nvideo text",
      ].join("\n\n"),
    );
  });
  it("formats image outputs", () => {
    const body = formatMediaUnderstandingBody({
      body: "<media:image>",
      outputs: [
        {
          kind: "image.description",
          attachmentIndex: 0,
          text: "a cat",
          provider: "openai",
        },
      ],
    });
    expect(body).toBe("[Image]\nDescription:\na cat");
  });
});

View File

@@ -0,0 +1,77 @@
import type { MediaUnderstandingOutput } from "./types.js";
// Matches a body that is ONLY a media placeholder, e.g. "<media:audio>" or
// "<media:image> (caption)".
const MEDIA_PLACEHOLDER_RE = /^<media:[^>]+>(\s*\([^)]*\))?$/i;
// Matches a LEADING media placeholder so it can be stripped from mixed bodies.
const MEDIA_PLACEHOLDER_TOKEN_RE = /^<media:[^>]+>(\s*\([^)]*\))?\s*/i;

/** Extract the human-written text from a message body, dropping media placeholders. */
export function extractMediaUserText(body?: string): string | undefined {
  const trimmed = body?.trim() ?? "";
  if (!trimmed || MEDIA_PLACEHOLDER_RE.test(trimmed)) return undefined;
  const withoutPlaceholder = trimmed.replace(MEDIA_PLACEHOLDER_TOKEN_RE, "").trim();
  return withoutPlaceholder || undefined;
}
/** Render one output section: "[Title]" header, optional user text, then the payload. */
function formatSection(
  title: "Audio" | "Video" | "Image",
  kind: "Transcript" | "Description",
  text: string,
  userText?: string,
): string {
  const userPart = userText ? `User text:\n${userText}\n` : "";
  return `[${title}]\n${userPart}${kind}:\n${text}`;
}
/**
 * Render media-understanding outputs into a message body. Outputs with empty
 * text are dropped; with none left, the original body is returned untouched.
 * The user's own text is preserved: inline inside the single section when
 * there is one output, or as its own leading section when there are several.
 */
export function formatMediaUnderstandingBody(params: {
  body?: string;
  outputs: MediaUnderstandingOutput[];
}): string {
  const usable = params.outputs.filter((output) => output.text.trim());
  if (usable.length === 0) return params.body ?? "";
  const userText = extractMediaUserText(params.body);
  const inlineUserText = usable.length === 1 ? userText : undefined;
  const sections: string[] = [];
  if (userText && usable.length > 1) {
    sections.push(`User text:\n${userText}`);
  }
  for (const output of usable) {
    switch (output.kind) {
      case "audio.transcription":
        sections.push(formatSection("Audio", "Transcript", output.text, inlineUserText));
        break;
      case "image.description":
        sections.push(formatSection("Image", "Description", output.text, inlineUserText));
        break;
      default:
        // Any remaining kind is a video description.
        sections.push(formatSection("Video", "Description", output.text, inlineUserText));
        break;
    }
  }
  return sections.join("\n\n").trim();
}

View File

@@ -0,0 +1,9 @@
export { applyMediaUnderstanding } from "./apply.js";
export { formatMediaUnderstandingBody } from "./format.js";
export { resolveMediaUnderstandingScope } from "./scope.js";
export type {
MediaAttachment,
MediaUnderstandingOutput,
MediaUnderstandingProvider,
MediaUnderstandingKind,
} from "./types.js";

View File

@@ -0,0 +1,7 @@
import type { MediaUnderstandingProvider } from "../../types.js";
import { describeGeminiVideo } from "./video.js";
// Google entry for the media-understanding registry; currently video-only.
export const googleProvider: MediaUnderstandingProvider = {
  id: "google",
  describeVideo: describeGeminiVideo,
};

View File

@@ -0,0 +1,93 @@
import { describe, expect, it } from "vitest";
import { describeGeminiVideo } from "./video.js";
// Local copy of the URL-normalizing helper so assertions can record fetch targets.
const resolveRequestUrl = (input: RequestInfo | URL) => {
  if (typeof input === "string") return input;
  if (input instanceof URL) return input.toString();
  return input.url;
};
// Vitest suite for describeGeminiVideo: verifies header override behavior and
// the exact generateContent request (URL, method, headers, inline_data body).
describe("describeGeminiVideo", () => {
  it("respects case-insensitive x-goog-api-key overrides", async () => {
    let seenKey: string | null = null;
    const fetchFn = async (_input: RequestInfo | URL, init?: RequestInit) => {
      const headers = new Headers(init?.headers);
      seenKey = headers.get("x-goog-api-key");
      return new Response(
        JSON.stringify({
          candidates: [{ content: { parts: [{ text: "video ok" }] } }],
        }),
        { status: 200, headers: { "content-type": "application/json" } },
      );
    };
    const result = await describeGeminiVideo({
      buffer: Buffer.from("video"),
      fileName: "clip.mp4",
      apiKey: "test-key",
      timeoutMs: 1000,
      headers: { "X-Goog-Api-Key": "override" },
      fetchFn,
    });
    expect(seenKey).toBe("override");
    expect(result.text).toBe("video ok");
  });
  it("builds the expected request payload", async () => {
    let seenUrl: string | null = null;
    let seenInit: RequestInit | undefined;
    const fetchFn = async (input: RequestInfo | URL, init?: RequestInit) => {
      seenUrl = resolveRequestUrl(input);
      seenInit = init;
      return new Response(
        JSON.stringify({
          candidates: [
            {
              content: {
                parts: [{ text: "first" }, { text: " second " }, { text: "" }],
              },
            },
          ],
        }),
        { status: 200, headers: { "content-type": "application/json" } },
      );
    };
    const result = await describeGeminiVideo({
      buffer: Buffer.from("video-bytes"),
      fileName: "clip.mp4",
      apiKey: "test-key",
      timeoutMs: 1500,
      baseUrl: "https://example.com/v1beta/",
      model: "gemini-3-pro",
      headers: { "X-Other": "1" },
      fetchFn,
    });
    expect(result.model).toBe("gemini-3-pro-preview");
    // Empty parts are dropped and the rest are trimmed and newline-joined.
    expect(result.text).toBe("first\nsecond");
    expect(seenUrl).toBe("https://example.com/v1beta/models/gemini-3-pro-preview:generateContent");
    expect(seenInit?.method).toBe("POST");
    expect(seenInit?.signal).toBeInstanceOf(AbortSignal);
    const headers = new Headers(seenInit?.headers);
    expect(headers.get("x-goog-api-key")).toBe("test-key");
    expect(headers.get("content-type")).toBe("application/json");
    expect(headers.get("x-other")).toBe("1");
    const bodyText =
      typeof seenInit?.body === "string"
        ? seenInit.body
        : Buffer.isBuffer(seenInit?.body)
          ? seenInit.body.toString("utf8")
          : "";
    const body = JSON.parse(bodyText);
    expect(body.contents?.[0]?.parts?.[0]?.text).toBe("Describe the video.");
    expect(body.contents?.[0]?.parts?.[1]?.inline_data?.mime_type).toBe("video/mp4");
    expect(body.contents?.[0]?.parts?.[1]?.inline_data?.data).toBe(
      Buffer.from("video-bytes").toString("base64"),
    );
  });
});

View File

@@ -0,0 +1,84 @@
import type { VideoDescriptionRequest, VideoDescriptionResult } from "../../types.js";
import { normalizeGoogleModelId } from "../../../agents/models-config.providers.js";
import { fetchWithTimeout, normalizeBaseUrl, readErrorResponse } from "../shared.js";
export const DEFAULT_GOOGLE_VIDEO_BASE_URL = "https://generativelanguage.googleapis.com/v1beta";
const DEFAULT_GOOGLE_VIDEO_MODEL = "gemini-3-flash-preview";
const DEFAULT_GOOGLE_VIDEO_PROMPT = "Describe the video.";

/** Pick the Gemini model id, normalizing aliases; blank input selects the default. */
function resolveModel(model?: string): string {
  const requested = model?.trim();
  return requested ? normalizeGoogleModelId(requested) : DEFAULT_GOOGLE_VIDEO_MODEL;
}
/** Use the caller's prompt when it has content; otherwise the default video prompt. */
function resolvePrompt(prompt?: string): string {
  return prompt?.trim() || DEFAULT_GOOGLE_VIDEO_PROMPT;
}
/**
 * Describe a video clip via the Gemini `generateContent` REST endpoint.
 *
 * The clip is sent inline as base64 (no Files API upload), so callers are
 * responsible for keeping payloads within inline-data limits. Caller headers
 * are case-insensitive and may override both content-type and x-goog-api-key.
 *
 * @throws on non-2xx responses or when no candidate text comes back.
 */
export async function describeGeminiVideo(
  params: VideoDescriptionRequest,
): Promise<VideoDescriptionResult> {
  const fetchFn = params.fetchFn ?? fetch;
  const baseUrl = normalizeBaseUrl(params.baseUrl, DEFAULT_GOOGLE_VIDEO_BASE_URL);
  const model = resolveModel(params.model);
  const url = `${baseUrl}/models/${model}:generateContent`;
  const headers = new Headers(params.headers);
  if (!headers.has("content-type")) {
    headers.set("content-type", "application/json");
  }
  if (!headers.has("x-goog-api-key")) {
    headers.set("x-goog-api-key", params.apiKey);
  }
  const body = {
    contents: [
      {
        role: "user",
        parts: [
          { text: resolvePrompt(params.prompt) },
          {
            inline_data: {
              mime_type: params.mime ?? "video/mp4",
              data: params.buffer.toString("base64"),
            },
          },
        ],
      },
    ],
  };
  const res = await fetchWithTimeout(
    url,
    {
      method: "POST",
      headers,
      body: JSON.stringify(body),
    },
    params.timeoutMs,
    fetchFn,
  );
  if (!res.ok) {
    const detail = await readErrorResponse(res);
    const suffix = detail ? `: ${detail}` : "";
    throw new Error(`Video description failed (HTTP ${res.status})${suffix}`);
  }
  const payload = (await res.json()) as {
    candidates?: Array<{
      content?: { parts?: Array<{ text?: string }> };
    }>;
  };
  // Only the first candidate is consulted; its parts are trimmed, empties
  // dropped, and the rest newline-joined.
  const parts = payload.candidates?.[0]?.content?.parts ?? [];
  const text = parts
    .map((part) => part?.text?.trim())
    .filter(Boolean)
    .join("\n");
  if (!text) {
    throw new Error("Video description response missing text");
  }
  return { text, model };
}

View File

@@ -0,0 +1,13 @@
import type { MediaUnderstandingProvider } from "../../types.js";
import { transcribeOpenAiCompatibleAudio } from "../openai/audio.js";
// Groq exposes an OpenAI-compatible audio API, so reuse the OpenAI
// transcription client, defaulting the base URL to Groq's endpoint when the
// request does not specify one.
const DEFAULT_GROQ_AUDIO_BASE_URL = "https://api.groq.com/openai/v1";
export const groqProvider: MediaUnderstandingProvider = {
  id: "groq",
  transcribeAudio: (req) =>
    transcribeOpenAiCompatibleAudio({
      ...req,
      baseUrl: req.baseUrl ?? DEFAULT_GROQ_AUDIO_BASE_URL,
    }),
};

View File

@@ -0,0 +1,35 @@
import { normalizeProviderId } from "../../agents/model-selection.js";
import type { MediaUnderstandingProvider } from "../types.js";
import { googleProvider } from "./google/index.js";
import { groqProvider } from "./groq/index.js";
import { openaiProvider } from "./openai/index.js";
const PROVIDERS: MediaUnderstandingProvider[] = [groqProvider, openaiProvider, googleProvider];
/** Normalize a provider id, folding the "gemini" alias into "google". */
export function normalizeMediaProviderId(id: string): string {
  const normalized = normalizeProviderId(id);
  return normalized === "gemini" ? "google" : normalized;
}
/**
 * Build the provider registry keyed by normalized provider id. Built-in
 * providers register first; overrides (e.g. from tests) replace them.
 */
export function buildMediaUnderstandingRegistry(
  overrides?: Record<string, MediaUnderstandingProvider>,
): Map<string, MediaUnderstandingProvider> {
  const registry = new Map<string, MediaUnderstandingProvider>();
  const register = (key: string, provider: MediaUnderstandingProvider) => {
    registry.set(normalizeMediaProviderId(key), provider);
  };
  for (const provider of PROVIDERS) {
    register(provider.id, provider);
  }
  for (const [key, provider] of Object.entries(overrides ?? {})) {
    register(key, provider);
  }
  return registry;
}
/** Look up a provider by id, applying the same normalization used at registration. */
export function getMediaUnderstandingProvider(
  id: string,
  registry: Map<string, MediaUnderstandingProvider>,
): MediaUnderstandingProvider | undefined {
  const key = normalizeMediaProviderId(id);
  return registry.get(key);
}

View File

@@ -0,0 +1,86 @@
import { describe, expect, it } from "vitest";
import { transcribeOpenAiCompatibleAudio } from "./audio.js";
// Local copy of the URL-normalizing helper so assertions can record fetch targets.
const resolveRequestUrl = (input: RequestInfo | URL) => {
  if (typeof input === "string") return input;
  if (input instanceof URL) return input.toString();
  return input.url;
};
// Vitest suite for transcribeOpenAiCompatibleAudio: verifies auth-header
// override behavior and the exact multipart request (URL, method, headers,
// form fields, and uploaded file metadata).
describe("transcribeOpenAiCompatibleAudio", () => {
  it("respects lowercase authorization header overrides", async () => {
    let seenAuth: string | null = null;
    const fetchFn = async (_input: RequestInfo | URL, init?: RequestInit) => {
      const headers = new Headers(init?.headers);
      seenAuth = headers.get("authorization");
      return new Response(JSON.stringify({ text: "ok" }), {
        status: 200,
        headers: { "content-type": "application/json" },
      });
    };
    const result = await transcribeOpenAiCompatibleAudio({
      buffer: Buffer.from("audio"),
      fileName: "note.mp3",
      apiKey: "test-key",
      timeoutMs: 1000,
      headers: { authorization: "Bearer override" },
      fetchFn,
    });
    expect(seenAuth).toBe("Bearer override");
    expect(result.text).toBe("ok");
  });
  it("builds the expected request payload", async () => {
    let seenUrl: string | null = null;
    let seenInit: RequestInit | undefined;
    const fetchFn = async (input: RequestInfo | URL, init?: RequestInit) => {
      seenUrl = resolveRequestUrl(input);
      seenInit = init;
      return new Response(JSON.stringify({ text: "hello" }), {
        status: 200,
        headers: { "content-type": "application/json" },
      });
    };
    // Blank model falls back to whisper-1; language/prompt are trimmed.
    const result = await transcribeOpenAiCompatibleAudio({
      buffer: Buffer.from("audio-bytes"),
      fileName: "voice.wav",
      apiKey: "test-key",
      timeoutMs: 1234,
      baseUrl: "https://api.example.com/v1/",
      model: " ",
      language: " en ",
      prompt: " hello ",
      mime: "audio/wav",
      headers: { "X-Custom": "1" },
      fetchFn,
    });
    expect(result.model).toBe("whisper-1");
    expect(result.text).toBe("hello");
    expect(seenUrl).toBe("https://api.example.com/v1/audio/transcriptions");
    expect(seenInit?.method).toBe("POST");
    expect(seenInit?.signal).toBeInstanceOf(AbortSignal);
    const headers = new Headers(seenInit?.headers);
    expect(headers.get("authorization")).toBe("Bearer test-key");
    expect(headers.get("x-custom")).toBe("1");
    const form = seenInit?.body as FormData;
    expect(form).toBeInstanceOf(FormData);
    expect(form.get("model")).toBe("whisper-1");
    expect(form.get("language")).toBe("en");
    expect(form.get("prompt")).toBe("hello");
    const file = form.get("file") as Blob | { type?: string; name?: string } | null;
    expect(file).not.toBeNull();
    if (file) {
      expect(file.type).toBe("audio/wav");
      if ("name" in file && typeof file.name === "string") {
        expect(file.name).toBe("voice.wav");
      }
    }
  });
});

View File

@@ -0,0 +1,61 @@
import path from "node:path";
import type { AudioTranscriptionRequest, AudioTranscriptionResult } from "../../types.js";
import { fetchWithTimeout, normalizeBaseUrl, readErrorResponse } from "../shared.js";
// Defaults used when the caller does not override base URL / model.
export const DEFAULT_OPENAI_AUDIO_BASE_URL = "https://api.openai.com/v1";
const DEFAULT_OPENAI_AUDIO_MODEL = "whisper-1";

/** Returns the caller's model (trimmed) when non-blank, else the default. */
function resolveModel(model?: string): string {
  const requested = model?.trim() ?? "";
  return requested.length > 0 ? requested : DEFAULT_OPENAI_AUDIO_MODEL;
}
/**
 * Transcribes an audio buffer via an OpenAI-compatible
 * `/audio/transcriptions` endpoint (Whisper-style multipart upload).
 *
 * Defaults: base URL https://api.openai.com/v1, model whisper-1 (via
 * resolveModel). Optional language/prompt fields are trimmed and omitted when
 * blank. Caller-supplied headers win: an Authorization header is only added
 * when the caller did not provide one.
 *
 * @throws Error when the HTTP response is not OK, or the JSON payload has no
 *   non-blank `text` field.
 */
export async function transcribeOpenAiCompatibleAudio(
  params: AudioTranscriptionRequest,
): Promise<AudioTranscriptionResult> {
  const fetchFn = params.fetchFn ?? fetch;
  const baseUrl = normalizeBaseUrl(params.baseUrl, DEFAULT_OPENAI_AUDIO_BASE_URL);
  const url = `${baseUrl}/audio/transcriptions`;
  const model = resolveModel(params.model);

  const form = new FormData();
  // Fix: the original only applied path.basename() when the trimmed name was
  // already empty, so directory components were never stripped and a
  // whitespace-only name (basename("  ") === "  ") skipped the fallback.
  // Strip directories so local paths are not leaked to the API, and fall back
  // to "audio" for empty/whitespace-only names.
  const fileName = path.basename(params.fileName.trim()) || "audio";
  const bytes = new Uint8Array(params.buffer);
  const blob = new Blob([bytes], {
    type: params.mime ?? "application/octet-stream",
  });
  form.append("file", blob, fileName);
  form.append("model", model);
  // Trim once; omit blank optional fields entirely.
  const language = params.language?.trim();
  if (language) form.append("language", language);
  const prompt = params.prompt?.trim();
  if (prompt) form.append("prompt", prompt);

  const headers = new Headers(params.headers);
  // Headers lookup is case-insensitive, so any caller-provided Authorization
  // (whatever its casing) suppresses the default bearer token.
  if (!headers.has("authorization")) {
    headers.set("authorization", `Bearer ${params.apiKey}`);
  }

  const res = await fetchWithTimeout(
    url,
    { method: "POST", headers, body: form },
    params.timeoutMs,
    fetchFn,
  );
  if (!res.ok) {
    const detail = await readErrorResponse(res);
    const suffix = detail ? `: ${detail}` : "";
    throw new Error(`Audio transcription failed (HTTP ${res.status})${suffix}`);
  }
  const payload = (await res.json()) as { text?: string };
  const text = payload.text?.trim();
  if (!text) {
    throw new Error("Audio transcription response missing text");
  }
  return { text, model };
}

View File

@@ -0,0 +1,7 @@
import type { MediaUnderstandingProvider } from "../../types.js";
import { transcribeOpenAiCompatibleAudio } from "./audio.js";
/**
 * OpenAI-backed media-understanding provider.
 * Currently wires up audio transcription only; no describeVideo hook is set.
 */
export const openaiProvider: MediaUnderstandingProvider = {
  id: "openai",
  transcribeAudio: transcribeOpenAiCompatibleAudio,
};

View File

@@ -0,0 +1,33 @@
// Cap on how much of an error body is surfaced in diagnostic messages.
const MAX_ERROR_CHARS = 300;

/** Uses the fallback when baseUrl is blank, then strips trailing slashes. */
export function normalizeBaseUrl(baseUrl: string | undefined, fallback: string): string {
  const candidate = baseUrl?.trim();
  const chosen = candidate ? candidate : fallback;
  return chosen.replace(/\/+$/, "");
}
/**
 * Runs fetchFn with an AbortSignal that fires after timeoutMs (minimum 1ms).
 * The timer is always cleared, whether the request resolves or rejects.
 */
export async function fetchWithTimeout(
  url: string,
  init: RequestInit,
  timeoutMs: number,
  fetchFn: typeof fetch,
): Promise<Response> {
  const abort = new AbortController();
  const deadline = setTimeout(() => abort.abort(), Math.max(1, timeoutMs));
  try {
    return await fetchFn(url, { ...init, signal: abort.signal });
  } finally {
    clearTimeout(deadline);
  }
}
/**
 * Best-effort read of an error response body for diagnostics.
 * Collapses whitespace and caps the result at MAX_ERROR_CHARS; returns
 * undefined when the body is empty or unreadable.
 */
export async function readErrorResponse(res: Response): Promise<string | undefined> {
  try {
    const raw = await res.text();
    const collapsed = raw.replace(/\s+/g, " ").trim();
    if (!collapsed) return undefined;
    // NOTE(review): truncation appends no ellipsis marker — confirm intended.
    return collapsed.length > MAX_ERROR_CHARS
      ? collapsed.slice(0, MAX_ERROR_CHARS)
      : collapsed;
  } catch {
    return undefined;
  }
}

View File

@@ -0,0 +1,46 @@
import { describe, expect, it } from "vitest";
import { resolveMediaUnderstandingScope } from "./scope.js";
describe("resolveMediaUnderstandingScope", () => {
  it("defaults to allow when scope is undefined", () => {
    expect(resolveMediaUnderstandingScope({})).toBe("allow");
  });

  it("uses first matching rule", () => {
    // Both rules match this session, so rule order decides and "allow" wins.
    const result = resolveMediaUnderstandingScope({
      scope: {
        default: "deny",
        rules: [
          { action: "allow", match: { channel: "whatsapp" } },
          { action: "deny", match: { channel: "whatsapp", chatType: "direct" } },
        ],
      },
      channel: "whatsapp",
      chatType: "direct",
      sessionKey: "whatsapp:direct:123",
    });
    expect(result).toBe("allow");
  });

  it("matches keyPrefix when provided", () => {
    const result = resolveMediaUnderstandingScope({
      scope: {
        default: "deny",
        rules: [{ action: "allow", match: { keyPrefix: "agent:main:" } }],
      },
      sessionKey: "agent:main:whatsapp:group:123",
    });
    expect(result).toBe("allow");
  });

  it("matches keyPrefix case-insensitively", () => {
    // Session keys are lower-cased before the prefix comparison.
    const result = resolveMediaUnderstandingScope({
      scope: {
        default: "deny",
        rules: [{ action: "allow", match: { keyPrefix: "agent:main:" } }],
      },
      sessionKey: "AGENT:MAIN:WHATSAPP:GROUP:123",
    });
    expect(result).toBe("allow");
  });
});

View File

@@ -0,0 +1,54 @@
import type { MediaUnderstandingScopeConfig } from "../config/types.tools.js";
export type MediaUnderstandingScopeDecision = "allow" | "deny";

/** Parses a raw decision string; anything other than allow/deny is undefined. */
function normalizeDecision(value?: string | null): MediaUnderstandingScopeDecision | undefined {
  switch (value?.trim().toLowerCase()) {
    case "allow":
      return "allow";
    case "deny":
      return "deny";
    default:
      return undefined;
  }
}
/** Lower-cases and trims a match value; blank values become undefined. */
function normalizeMatch(value?: string | null): string | undefined {
  const cleaned = value == null ? "" : value.trim().toLowerCase();
  return cleaned.length > 0 ? cleaned : undefined;
}
/**
 * Canonicalizes chat-type labels from different channels: dm/direct_message/
 * private -> "direct", groups -> "group", channel -> "room". Unknown values
 * pass through lower-cased; blank input yields undefined.
 */
export function normalizeMediaUnderstandingChatType(raw?: string | null): string | undefined {
  const value = raw?.trim().toLowerCase();
  if (!value) return undefined;
  const aliases: Record<string, string> = {
    dm: "direct",
    direct_message: "direct",
    private: "direct",
    groups: "group",
    channel: "room",
  };
  return aliases[value] ?? value;
}
/**
 * Evaluates scope rules top-to-bottom and returns the first matching rule's
 * action, falling back to the scope default, then to "allow".
 * Matching is case-insensitive; absent match fields act as wildcards, and an
 * unparseable rule action counts as "allow".
 */
export function resolveMediaUnderstandingScope(params: {
  scope?: MediaUnderstandingScopeConfig;
  sessionKey?: string;
  channel?: string;
  chatType?: string;
}): MediaUnderstandingScopeDecision {
  const { scope } = params;
  if (!scope) return "allow";
  const channel = normalizeMatch(params.channel);
  const chatType = normalizeMatch(params.chatType);
  const sessionKey = normalizeMatch(params.sessionKey) ?? "";
  for (const rule of scope.rules ?? []) {
    if (!rule) continue;
    const match = rule.match ?? {};
    const wantChannel = normalizeMatch(match.channel);
    const wantChatType = normalizeMatch(match.chatType);
    const wantPrefix = normalizeMatch(match.keyPrefix);
    const channelOk = !wantChannel || wantChannel === channel;
    const chatTypeOk = !wantChatType || wantChatType === chatType;
    const prefixOk = !wantPrefix || sessionKey.startsWith(wantPrefix);
    if (channelOk && chatTypeOk && prefixOk) {
      return normalizeDecision(rule.action) ?? "allow";
    }
  }
  return normalizeDecision(scope.default) ?? "allow";
}

View File

@@ -0,0 +1,62 @@
/** The kinds of inbound-media analysis this module can produce. */
export type MediaUnderstandingKind =
  | "audio.transcription"
  | "video.description"
  | "image.description";

/** An inbound attachment to analyze, addressed by message position. */
// NOTE(review): nothing here enforces that at least one of path/url is set —
// confirm callers guarantee it.
export type MediaAttachment = {
  path?: string;
  url?: string;
  mime?: string;
  // Position of this attachment within the originating message.
  index: number;
};

/** The result of analyzing a single attachment. */
export type MediaUnderstandingOutput = {
  kind: MediaUnderstandingKind;
  // Index of the MediaAttachment this text was derived from.
  attachmentIndex: number;
  // Transcript or description text.
  text: string;
  // Provider id that produced the output (e.g. "openai").
  provider: string;
  // Model actually used, when the provider reports one.
  model?: string;
};

/** Request payload for a provider's transcribeAudio hook. */
export type AudioTranscriptionRequest = {
  // Raw audio bytes to upload.
  buffer: Buffer;
  // File name sent with the multipart upload.
  fileName: string;
  mime?: string;
  apiKey: string;
  // Endpoint base; providers fall back to their own default when blank.
  baseUrl?: string;
  // Extra request headers; a caller-supplied Authorization header takes
  // precedence over the default bearer token.
  headers?: Record<string, string>;
  // Model override; blank values fall back to the provider default.
  model?: string;
  // Language hint forwarded to the provider (trimmed; omitted when blank).
  language?: string;
  // Prompt forwarded to the provider (trimmed; omitted when blank).
  prompt?: string;
  // Upper bound for the HTTP request, in milliseconds.
  timeoutMs: number;
  // Injectable fetch implementation (used by tests).
  fetchFn?: typeof fetch;
};

/** Transcript plus the model that produced it (when known). */
export type AudioTranscriptionResult = {
  text: string;
  model?: string;
};

/** Request payload for a provider's describeVideo hook. */
export type VideoDescriptionRequest = {
  // Raw video bytes to upload.
  buffer: Buffer;
  fileName: string;
  mime?: string;
  apiKey: string;
  baseUrl?: string;
  headers?: Record<string, string>;
  model?: string;
  prompt?: string;
  // Upper bound for the HTTP request, in milliseconds.
  timeoutMs: number;
  fetchFn?: typeof fetch;
};

/** Description text plus the model that produced it (when known). */
export type VideoDescriptionResult = {
  text: string;
  model?: string;
};

/** A pluggable backend; hooks are optional per capability. */
// NOTE(review): MediaUnderstandingKind includes "image.description" but no
// describeImage hook exists here yet — confirm whether that is pending work.
export type MediaUnderstandingProvider = {
  id: string;
  transcribeAudio?: (req: AudioTranscriptionRequest) => Promise<AudioTranscriptionResult>;
  describeVideo?: (req: VideoDescriptionRequest) => Promise<VideoDescriptionResult>;
};

47
src/media/fetch.test.ts Normal file
View File

@@ -0,0 +1,47 @@
import { describe, expect, it } from "vitest";
import { fetchRemoteMedia } from "./fetch.js";
// Builds a one-shot ReadableStream that emits the given chunks then closes.
function makeStream(chunks: Uint8Array[]) {
  return new ReadableStream<Uint8Array>({
    start(controller) {
      chunks.forEach((chunk) => controller.enqueue(chunk));
      controller.close();
    },
  });
}
describe("fetchRemoteMedia", () => {
  it("rejects when content-length exceeds maxBytes", async () => {
    // The header already advertises 5 bytes, so the limit check can fire
    // before any streaming happens.
    const fetchImpl = async () =>
      new Response(makeStream([new Uint8Array([1, 2, 3, 4, 5])]), {
        status: 200,
        headers: { "content-length": "5" },
      });
    await expect(
      fetchRemoteMedia({ url: "https://example.com/file.bin", fetchImpl, maxBytes: 4 }),
    ).rejects.toThrow("exceeds maxBytes");
  });

  it("rejects when streamed payload exceeds maxBytes", async () => {
    // No content-length header: the limit must be enforced chunk-by-chunk
    // while reading the body.
    const fetchImpl = async () =>
      new Response(makeStream([new Uint8Array([1, 2, 3]), new Uint8Array([4, 5, 6])]), {
        status: 200,
      });
    await expect(
      fetchRemoteMedia({ url: "https://example.com/file.bin", fetchImpl, maxBytes: 4 }),
    ).rejects.toThrow("exceeds maxBytes");
  });
});

View File

@@ -14,6 +14,7 @@ type FetchMediaOptions = {
url: string;
fetchImpl?: FetchLike;
filePathHint?: string;
maxBytes?: number;
};
function stripQuotes(value: string): string {
@@ -51,7 +52,7 @@ async function readErrorBodySnippet(res: Response, maxChars = 200): Promise<stri
}
export async function fetchRemoteMedia(options: FetchMediaOptions): Promise<FetchMediaResult> {
const { url, fetchImpl, filePathHint } = options;
const { url, fetchImpl, filePathHint, maxBytes } = options;
const fetcher: FetchLike | undefined = fetchImpl ?? globalThis.fetch;
if (!fetcher) {
throw new Error("fetch is not available");
@@ -77,7 +78,19 @@ export async function fetchRemoteMedia(options: FetchMediaOptions): Promise<Fetc
throw new Error(`Failed to fetch media from ${url}${redirected}: ${detail}`);
}
const buffer = Buffer.from(await res.arrayBuffer());
const contentLength = res.headers.get("content-length");
if (maxBytes && contentLength) {
const length = Number(contentLength);
if (Number.isFinite(length) && length > maxBytes) {
throw new Error(
`Failed to fetch media from ${url}: content length ${length} exceeds maxBytes ${maxBytes}`,
);
}
}
const buffer = maxBytes
? await readResponseWithLimit(res, maxBytes)
: Buffer.from(await res.arrayBuffer());
let fileNameFromUrl: string | undefined;
try {
const parsed = new URL(url);
@@ -109,3 +122,47 @@ export async function fetchRemoteMedia(options: FetchMediaOptions): Promise<Fetc
fileName,
};
}
/**
 * Reads a Response body while enforcing a hard byte limit.
 * Streams chunk-by-chunk when a reader is available so oversized payloads are
 * aborted early (the reader is cancelled); otherwise buffers the whole body
 * and checks the limit after the fact.
 * @throws Error when the payload exceeds maxBytes.
 */
async function readResponseWithLimit(res: Response, maxBytes: number): Promise<Buffer> {
  const source = res.body;
  if (!source || typeof source.getReader !== "function") {
    // No streaming reader (e.g. non-standard fetch polyfill): buffer fully.
    const whole = Buffer.from(await res.arrayBuffer());
    if (whole.length > maxBytes) {
      throw new Error(
        `Failed to fetch media from ${res.url || "response"}: payload exceeds maxBytes ${maxBytes}`,
      );
    }
    return whole;
  }
  const reader = source.getReader();
  const parts: Buffer[] = [];
  let received = 0;
  try {
    for (;;) {
      const { done, value } = await reader.read();
      if (done) break;
      if (!value?.length) continue;
      received += value.length;
      if (received > maxBytes) {
        // Stop the transfer before throwing; cancel failures are non-fatal.
        try {
          await reader.cancel();
        } catch {}
        throw new Error(
          `Failed to fetch media from ${res.url || "response"}: payload exceeds maxBytes ${maxBytes}`,
        );
      }
      parts.push(Buffer.from(value));
    }
  } finally {
    try {
      reader.releaseLock();
    } catch {}
  }
  return Buffer.concat(parts, received);
}