feat: add inbound media understanding
Co-authored-by: Tristan Manchester <tmanchester96@gmail.com>
This commit is contained in:
@@ -254,7 +254,10 @@ export function createExecTool(
|
||||
let yielded = false;
|
||||
let yieldTimer: NodeJS.Timeout | null = null;
|
||||
let timeoutTimer: NodeJS.Timeout | null = null;
|
||||
let timeoutFinalizeTimer: NodeJS.Timeout | null = null;
|
||||
let timedOut = false;
|
||||
const timeoutFinalizeMs = 1000;
|
||||
let rejectFn: ((err: Error) => void) | null = null;
|
||||
|
||||
const settle = (fn: () => void) => {
|
||||
if (settled) return;
|
||||
@@ -262,6 +265,18 @@ export function createExecTool(
|
||||
fn();
|
||||
};
|
||||
|
||||
const effectiveTimeout =
|
||||
typeof params.timeout === "number" ? params.timeout : defaultTimeoutSec;
|
||||
const finalizeTimeout = () => {
|
||||
if (session.exited) return;
|
||||
markExited(session, null, "SIGKILL", "failed");
|
||||
if (settled || !rejectFn) return;
|
||||
const aggregated = session.aggregated.trim();
|
||||
const reason = `Command timed out after ${effectiveTimeout} seconds`;
|
||||
const message = aggregated ? `${aggregated}\n\n${reason}` : reason;
|
||||
settle(() => rejectFn?.(new Error(message)));
|
||||
};
|
||||
|
||||
// Tool-call abort should not kill backgrounded sessions; timeouts still must.
|
||||
const onAbortSignal = () => {
|
||||
if (yielded || session.backgrounded) return;
|
||||
@@ -272,15 +287,17 @@ export function createExecTool(
|
||||
const onTimeout = () => {
|
||||
timedOut = true;
|
||||
killSession(session);
|
||||
if (!timeoutFinalizeTimer) {
|
||||
timeoutFinalizeTimer = setTimeout(() => {
|
||||
finalizeTimeout();
|
||||
}, timeoutFinalizeMs);
|
||||
}
|
||||
};
|
||||
|
||||
if (signal?.aborted) onAbortSignal();
|
||||
else if (signal) {
|
||||
signal.addEventListener("abort", onAbortSignal, { once: true });
|
||||
}
|
||||
|
||||
const effectiveTimeout =
|
||||
typeof params.timeout === "number" ? params.timeout : defaultTimeoutSec;
|
||||
if (effectiveTimeout > 0) {
|
||||
timeoutTimer = setTimeout(() => {
|
||||
onTimeout();
|
||||
@@ -321,6 +338,7 @@ export function createExecTool(
|
||||
});
|
||||
|
||||
return new Promise<AgentToolResult<ExecToolDetails>>((resolve, reject) => {
|
||||
rejectFn = reject;
|
||||
const resolveRunning = () => {
|
||||
settle(() =>
|
||||
resolve({
|
||||
@@ -369,6 +387,7 @@ export function createExecTool(
|
||||
const handleExit = (code: number | null, exitSignal: NodeJS.Signals | number | null) => {
|
||||
if (yieldTimer) clearTimeout(yieldTimer);
|
||||
if (timeoutTimer) clearTimeout(timeoutTimer);
|
||||
if (timeoutFinalizeTimer) clearTimeout(timeoutFinalizeTimer);
|
||||
const durationMs = Date.now() - startedAt;
|
||||
const wasSignal = exitSignal != null;
|
||||
const isSuccess = code === 0 && !wasSignal && !signal?.aborted && !timedOut;
|
||||
@@ -421,6 +440,7 @@ export function createExecTool(
|
||||
child.once("error", (err) => {
|
||||
if (yieldTimer) clearTimeout(yieldTimer);
|
||||
if (timeoutTimer) clearTimeout(timeoutTimer);
|
||||
if (timeoutFinalizeTimer) clearTimeout(timeoutFinalizeTimer);
|
||||
markExited(session, null, null, "failed");
|
||||
settle(() => reject(err));
|
||||
});
|
||||
|
||||
@@ -25,4 +25,20 @@ describe("buildInboundMediaNote", () => {
|
||||
].join("\n"),
|
||||
);
|
||||
});
|
||||
|
||||
it("skips media notes for attachments with understanding output", () => {
|
||||
const note = buildInboundMediaNote({
|
||||
MediaPaths: ["/tmp/a.png", "/tmp/b.png"],
|
||||
MediaUrls: ["https://example.com/a.png", "https://example.com/b.png"],
|
||||
MediaUnderstanding: [
|
||||
{
|
||||
kind: "audio.transcription",
|
||||
attachmentIndex: 0,
|
||||
text: "hello",
|
||||
provider: "groq",
|
||||
},
|
||||
],
|
||||
});
|
||||
expect(note).toBe("[media attached: /tmp/b.png | https://example.com/b.png]");
|
||||
});
|
||||
});
|
||||
|
||||
@@ -18,6 +18,12 @@ function formatMediaAttachedLine(params: {
|
||||
}
|
||||
|
||||
export function buildInboundMediaNote(ctx: MsgContext): string | undefined {
|
||||
// Attachment indices follow MediaPaths/MediaUrls ordering as supplied by the channel.
|
||||
const suppressed = new Set(
|
||||
Array.isArray(ctx.MediaUnderstanding)
|
||||
? ctx.MediaUnderstanding.map((output) => output.attachmentIndex)
|
||||
: [],
|
||||
);
|
||||
const pathsFromArray = Array.isArray(ctx.MediaPaths) ? ctx.MediaPaths : undefined;
|
||||
const paths =
|
||||
pathsFromArray && pathsFromArray.length > 0
|
||||
@@ -36,24 +42,33 @@ export function buildInboundMediaNote(ctx: MsgContext): string | undefined {
|
||||
? ctx.MediaTypes
|
||||
: undefined;
|
||||
|
||||
if (paths.length === 1) {
|
||||
const entries = paths
|
||||
.map((entry, index) => ({
|
||||
path: entry ?? "",
|
||||
type: types?.[index] ?? ctx.MediaType,
|
||||
url: urls?.[index] ?? ctx.MediaUrl,
|
||||
index,
|
||||
}))
|
||||
.filter((entry) => !suppressed.has(entry.index));
|
||||
if (entries.length === 0) return undefined;
|
||||
if (entries.length === 1) {
|
||||
return formatMediaAttachedLine({
|
||||
path: paths[0] ?? "",
|
||||
type: types?.[0] ?? ctx.MediaType,
|
||||
url: urls?.[0] ?? ctx.MediaUrl,
|
||||
path: entries[0]?.path ?? "",
|
||||
type: entries[0]?.type,
|
||||
url: entries[0]?.url,
|
||||
});
|
||||
}
|
||||
|
||||
const count = paths.length;
|
||||
const count = entries.length;
|
||||
const lines: string[] = [`[media attached: ${count} files]`];
|
||||
for (const [idx, mediaPath] of paths.entries()) {
|
||||
for (const [idx, entry] of entries.entries()) {
|
||||
lines.push(
|
||||
formatMediaAttachedLine({
|
||||
path: mediaPath,
|
||||
path: entry.path,
|
||||
index: idx + 1,
|
||||
total: count,
|
||||
type: types?.[idx],
|
||||
url: urls?.[idx],
|
||||
type: entry.type,
|
||||
url: entry.url,
|
||||
}),
|
||||
);
|
||||
}
|
||||
|
||||
@@ -122,6 +122,7 @@ export async function resolveReplyDirectives(params: {
|
||||
const commandSource =
|
||||
sessionCtx.CommandBody ??
|
||||
sessionCtx.RawBody ??
|
||||
sessionCtx.Transcript ??
|
||||
sessionCtx.BodyStripped ??
|
||||
sessionCtx.Body ??
|
||||
"";
|
||||
|
||||
@@ -87,7 +87,6 @@ type RunPreparedReplyParams = {
|
||||
cap?: number;
|
||||
dropPolicy?: InlineDirectives["dropPolicy"];
|
||||
};
|
||||
transcribedText?: string;
|
||||
typing: TypingController;
|
||||
opts?: GetReplyOptions;
|
||||
defaultModel: string;
|
||||
@@ -210,7 +209,6 @@ export async function runPreparedReply(
|
||||
model,
|
||||
perMessageQueueMode,
|
||||
perMessageQueueOptions,
|
||||
transcribedText,
|
||||
typing,
|
||||
opts,
|
||||
defaultModel,
|
||||
@@ -325,11 +323,7 @@ export async function runPreparedReply(
|
||||
sessionEntry = skillResult.sessionEntry ?? sessionEntry;
|
||||
currentSystemSent = skillResult.systemSent;
|
||||
const skillsSnapshot = skillResult.skillsSnapshot;
|
||||
const prefixedBody = transcribedText
|
||||
? [threadStarterNote, prefixedBodyBase, `Transcript:\n${transcribedText}`]
|
||||
.filter(Boolean)
|
||||
.join("\n\n")
|
||||
: [threadStarterNote, prefixedBodyBase].filter(Boolean).join("\n\n");
|
||||
const prefixedBody = [threadStarterNote, prefixedBodyBase].filter(Boolean).join("\n\n");
|
||||
const mediaNote = buildInboundMediaNote(ctx);
|
||||
const mediaReplyHint = mediaNote
|
||||
? "To send an image back, add a line like: MEDIA:https://example.com/image.jpg (no spaces). Keep caption in the text body."
|
||||
@@ -370,11 +364,7 @@ export async function runPreparedReply(
|
||||
}
|
||||
const sessionIdFinal = sessionId ?? crypto.randomUUID();
|
||||
const sessionFile = resolveSessionFilePath(sessionIdFinal, sessionEntry);
|
||||
const queueBodyBase = transcribedText
|
||||
? [threadStarterNote, baseBodyFinal, `Transcript:\n${transcribedText}`]
|
||||
.filter(Boolean)
|
||||
.join("\n\n")
|
||||
: [threadStarterNote, baseBodyFinal].filter(Boolean).join("\n\n");
|
||||
const queueBodyBase = [threadStarterNote, baseBodyFinal].filter(Boolean).join("\n\n");
|
||||
const queuedBody = mediaNote
|
||||
? [mediaNote, mediaReplyHint, queueBodyBase].filter(Boolean).join("\n").trim()
|
||||
: queueBodyBase;
|
||||
|
||||
@@ -7,12 +7,11 @@ import { resolveModelRefFromString } from "../../agents/model-selection.js";
|
||||
import { resolveAgentTimeoutMs } from "../../agents/timeout.js";
|
||||
import { DEFAULT_AGENT_WORKSPACE_DIR, ensureAgentWorkspace } from "../../agents/workspace.js";
|
||||
import { type ClawdbotConfig, loadConfig } from "../../config/config.js";
|
||||
import { logVerbose } from "../../globals.js";
|
||||
import { defaultRuntime } from "../../runtime.js";
|
||||
import { resolveCommandAuthorization } from "../command-auth.js";
|
||||
import type { MsgContext } from "../templating.js";
|
||||
import { SILENT_REPLY_TOKEN } from "../tokens.js";
|
||||
import { hasAudioTranscriptionConfig, isAudio, transcribeInboundAudio } from "../transcription.js";
|
||||
import { applyMediaUnderstanding } from "../../media-understanding/apply.js";
|
||||
import type { GetReplyOptions, ReplyPayload } from "../types.js";
|
||||
import { resolveDefaultModel } from "./directive-handling.js";
|
||||
import { resolveReplyDirectives } from "./get-reply-directives.js";
|
||||
@@ -75,16 +74,11 @@ export async function getReplyFromConfig(
|
||||
});
|
||||
opts?.onTypingController?.(typing);
|
||||
|
||||
let transcribedText: string | undefined;
|
||||
if (hasAudioTranscriptionConfig(cfg) && isAudio(ctx.MediaType)) {
|
||||
const transcribed = await transcribeInboundAudio(cfg, ctx, defaultRuntime);
|
||||
if (transcribed?.text) {
|
||||
transcribedText = transcribed.text;
|
||||
ctx.Body = transcribed.text;
|
||||
ctx.Transcript = transcribed.text;
|
||||
logVerbose("Replaced Body with audio transcript for reply flow");
|
||||
}
|
||||
}
|
||||
await applyMediaUnderstanding({
|
||||
ctx,
|
||||
cfg,
|
||||
agentDir,
|
||||
});
|
||||
|
||||
const commandAuthorized = ctx.CommandAuthorized ?? true;
|
||||
resolveCommandAuthorization({
|
||||
@@ -253,7 +247,6 @@ export async function getReplyFromConfig(
|
||||
model,
|
||||
perMessageQueueMode,
|
||||
perMessageQueueOptions,
|
||||
transcribedText,
|
||||
typing,
|
||||
opts,
|
||||
defaultModel,
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
import type { ChannelId } from "../channels/plugins/types.js";
|
||||
import type { InternalMessageChannel } from "../utils/message-channel.js";
|
||||
import type { CommandArgs } from "./commands-registry.types.js";
|
||||
import type { MediaUnderstandingOutput } from "../media-understanding/types.js";
|
||||
|
||||
/** Valid message channels for routing. */
|
||||
export type OriginatingChannelType = ChannelId | InternalMessageChannel;
|
||||
@@ -41,6 +42,9 @@ export type MsgContext = {
|
||||
/** Remote host for SCP when media lives on a different machine (e.g., clawdbot@192.168.64.3). */
|
||||
MediaRemoteHost?: string;
|
||||
Transcript?: string;
|
||||
MediaUnderstanding?: MediaUnderstandingOutput[];
|
||||
Prompt?: string;
|
||||
MaxChars?: number;
|
||||
ChatType?: string;
|
||||
GroupSubject?: string;
|
||||
GroupRoom?: string;
|
||||
|
||||
@@ -65,7 +65,7 @@ describe("legacy config detection", () => {
|
||||
expect(res.config?.messages?.groupChat?.mentionPatterns).toEqual(["@clawd"]);
|
||||
expect(res.config?.routing?.groupChat?.mentionPatterns).toBeUndefined();
|
||||
});
|
||||
it("migrates routing agentToAgent/queue/transcribeAudio to tools/messages/audio", async () => {
|
||||
it("migrates routing agentToAgent/queue/transcribeAudio to tools/messages/media", async () => {
|
||||
vi.resetModules();
|
||||
const { migrateLegacyConfig } = await import("./config.js");
|
||||
const res = migrateLegacyConfig({
|
||||
@@ -80,7 +80,7 @@ describe("legacy config detection", () => {
|
||||
});
|
||||
expect(res.changes).toContain("Moved routing.agentToAgent → tools.agentToAgent.");
|
||||
expect(res.changes).toContain("Moved routing.queue → messages.queue.");
|
||||
expect(res.changes).toContain("Moved routing.transcribeAudio → tools.audio.transcription.");
|
||||
expect(res.changes).toContain("Moved routing.transcribeAudio → tools.media.audio.models.");
|
||||
expect(res.config?.tools?.agentToAgent).toEqual({
|
||||
enabled: true,
|
||||
allow: ["main"],
|
||||
@@ -89,9 +89,16 @@ describe("legacy config detection", () => {
|
||||
mode: "queue",
|
||||
cap: 3,
|
||||
});
|
||||
expect(res.config?.tools?.audio?.transcription).toEqual({
|
||||
args: ["--model", "base"],
|
||||
timeoutSeconds: 2,
|
||||
expect(res.config?.tools?.media?.audio).toEqual({
|
||||
enabled: true,
|
||||
models: [
|
||||
{
|
||||
command: "whisper",
|
||||
type: "cli",
|
||||
args: ["--model", "base"],
|
||||
timeoutSeconds: 2,
|
||||
},
|
||||
],
|
||||
});
|
||||
expect(res.config?.routing).toBeUndefined();
|
||||
});
|
||||
|
||||
@@ -330,14 +330,17 @@ export const LEGACY_CONFIG_MIGRATIONS_PART_2: LegacyConfigMigration[] = [
|
||||
const mapped = mapLegacyAudioTranscription(routing.transcribeAudio);
|
||||
if (mapped) {
|
||||
const tools = ensureRecord(raw, "tools");
|
||||
const toolsAudio = ensureRecord(tools, "audio");
|
||||
if (toolsAudio.transcription === undefined) {
|
||||
toolsAudio.transcription = mapped;
|
||||
changes.push("Moved routing.transcribeAudio → tools.audio.transcription.");
|
||||
const media = ensureRecord(tools, "media");
|
||||
const mediaAudio = ensureRecord(media, "audio");
|
||||
const models = Array.isArray(mediaAudio.models)
|
||||
? (mediaAudio.models as unknown[])
|
||||
: [];
|
||||
if (models.length === 0) {
|
||||
mediaAudio.enabled = true;
|
||||
mediaAudio.models = [mapped];
|
||||
changes.push("Moved routing.transcribeAudio → tools.media.audio.models.");
|
||||
} else {
|
||||
changes.push(
|
||||
"Removed routing.transcribeAudio (tools.audio.transcription already set).",
|
||||
);
|
||||
changes.push("Removed routing.transcribeAudio (tools.media.audio.models already set).");
|
||||
}
|
||||
} else {
|
||||
changes.push("Removed routing.transcribeAudio (unsupported transcription CLI).");
|
||||
@@ -350,12 +353,17 @@ export const LEGACY_CONFIG_MIGRATIONS_PART_2: LegacyConfigMigration[] = [
|
||||
const mapped = mapLegacyAudioTranscription(audio.transcription);
|
||||
if (mapped) {
|
||||
const tools = ensureRecord(raw, "tools");
|
||||
const toolsAudio = ensureRecord(tools, "audio");
|
||||
if (toolsAudio.transcription === undefined) {
|
||||
toolsAudio.transcription = mapped;
|
||||
changes.push("Moved audio.transcription → tools.audio.transcription.");
|
||||
const media = ensureRecord(tools, "media");
|
||||
const mediaAudio = ensureRecord(media, "audio");
|
||||
const models = Array.isArray(mediaAudio.models)
|
||||
? (mediaAudio.models as unknown[])
|
||||
: [];
|
||||
if (models.length === 0) {
|
||||
mediaAudio.enabled = true;
|
||||
mediaAudio.models = [mapped];
|
||||
changes.push("Moved audio.transcription → tools.media.audio.models.");
|
||||
} else {
|
||||
changes.push("Removed audio.transcription (tools.audio.transcription already set).");
|
||||
changes.push("Removed audio.transcription (tools.media.audio.models already set).");
|
||||
}
|
||||
delete audio.transcription;
|
||||
if (Object.keys(audio).length === 0) delete raw.audio;
|
||||
|
||||
@@ -69,7 +69,7 @@ export const LEGACY_CONFIG_RULES: LegacyConfigRule[] = [
|
||||
{
|
||||
path: ["routing", "transcribeAudio"],
|
||||
message:
|
||||
"routing.transcribeAudio was moved; use tools.audio.transcription instead (auto-migrated on load).",
|
||||
"routing.transcribeAudio was moved; use tools.media.audio.models instead (auto-migrated on load).",
|
||||
},
|
||||
{
|
||||
path: ["telegram", "requireMention"],
|
||||
|
||||
@@ -56,7 +56,7 @@ export const mapLegacyAudioTranscription = (value: unknown): Record<string, unkn
|
||||
const timeoutSeconds =
|
||||
typeof transcriber?.timeoutSeconds === "number" ? transcriber?.timeoutSeconds : undefined;
|
||||
|
||||
const result: Record<string, unknown> = {};
|
||||
const result: Record<string, unknown> = { command: rawExecutable, type: "cli" };
|
||||
if (args.length > 0) result.args = args;
|
||||
if (timeoutSeconds !== undefined) result.timeoutSeconds = timeoutSeconds;
|
||||
return result;
|
||||
|
||||
@@ -102,8 +102,28 @@ const FIELD_LABELS: Record<string, string> = {
|
||||
"gateway.remote.password": "Remote Gateway Password",
|
||||
"gateway.auth.token": "Gateway Token",
|
||||
"gateway.auth.password": "Gateway Password",
|
||||
"tools.audio.transcription.args": "Audio Transcription Args",
|
||||
"tools.audio.transcription.timeoutSeconds": "Audio Transcription Timeout (sec)",
|
||||
"tools.media.image.enabled": "Enable Image Understanding",
|
||||
"tools.media.image.maxBytes": "Image Understanding Max Bytes",
|
||||
"tools.media.image.maxChars": "Image Understanding Max Chars",
|
||||
"tools.media.image.prompt": "Image Understanding Prompt",
|
||||
"tools.media.image.timeoutSeconds": "Image Understanding Timeout (sec)",
|
||||
"tools.media.image.models": "Image Understanding Models",
|
||||
"tools.media.image.scope": "Image Understanding Scope",
|
||||
"tools.media.audio.enabled": "Enable Audio Understanding",
|
||||
"tools.media.audio.maxBytes": "Audio Understanding Max Bytes",
|
||||
"tools.media.audio.maxChars": "Audio Understanding Max Chars",
|
||||
"tools.media.audio.prompt": "Audio Understanding Prompt",
|
||||
"tools.media.audio.timeoutSeconds": "Audio Understanding Timeout (sec)",
|
||||
"tools.media.audio.language": "Audio Understanding Language",
|
||||
"tools.media.audio.models": "Audio Understanding Models",
|
||||
"tools.media.audio.scope": "Audio Understanding Scope",
|
||||
"tools.media.video.enabled": "Enable Video Understanding",
|
||||
"tools.media.video.maxBytes": "Video Understanding Max Bytes",
|
||||
"tools.media.video.maxChars": "Video Understanding Max Chars",
|
||||
"tools.media.video.prompt": "Video Understanding Prompt",
|
||||
"tools.media.video.timeoutSeconds": "Video Understanding Timeout (sec)",
|
||||
"tools.media.video.models": "Video Understanding Models",
|
||||
"tools.media.video.scope": "Video Understanding Scope",
|
||||
"tools.profile": "Tool Profile",
|
||||
"agents.list[].tools.profile": "Agent Tool Profile",
|
||||
"tools.byProvider": "Tool Policy by Provider",
|
||||
|
||||
@@ -47,7 +47,7 @@ export type BroadcastConfig = {
|
||||
};
|
||||
|
||||
export type AudioConfig = {
|
||||
/** @deprecated Use tools.audio.transcription instead. */
|
||||
/** @deprecated Use tools.media.audio.models instead. */
|
||||
transcription?: {
|
||||
// Optional CLI to turn inbound audio into text; templated args, must output transcript to stdout.
|
||||
command: string[];
|
||||
|
||||
@@ -1,4 +1,76 @@
|
||||
import type { AgentElevatedAllowFromConfig } from "./types.base.js";
|
||||
import type { AgentElevatedAllowFromConfig, SessionSendPolicyAction } from "./types.base.js";
|
||||
|
||||
export type MediaUnderstandingScopeMatch = {
|
||||
channel?: string;
|
||||
chatType?: "direct" | "group" | "room";
|
||||
keyPrefix?: string;
|
||||
};
|
||||
|
||||
export type MediaUnderstandingScopeRule = {
|
||||
action: SessionSendPolicyAction;
|
||||
match?: MediaUnderstandingScopeMatch;
|
||||
};
|
||||
|
||||
export type MediaUnderstandingScopeConfig = {
|
||||
default?: SessionSendPolicyAction;
|
||||
rules?: MediaUnderstandingScopeRule[];
|
||||
};
|
||||
|
||||
export type MediaUnderstandingCapability = "image" | "audio" | "video";
|
||||
|
||||
export type MediaUnderstandingModelConfig = {
|
||||
/** provider API id (e.g. openai, google). */
|
||||
provider?: string;
|
||||
/** Model id for provider-based understanding. */
|
||||
model?: string;
|
||||
/** Optional capability tags for shared model lists. */
|
||||
capabilities?: MediaUnderstandingCapability[];
|
||||
/** Use a CLI command instead of provider API. */
|
||||
type?: "provider" | "cli";
|
||||
/** CLI binary (required when type=cli). */
|
||||
command?: string;
|
||||
/** CLI args (template-enabled). */
|
||||
args?: string[];
|
||||
/** Optional prompt override for this model entry. */
|
||||
prompt?: string;
|
||||
/** Optional max output characters for this model entry. */
|
||||
maxChars?: number;
|
||||
/** Optional max bytes for this model entry. */
|
||||
maxBytes?: number;
|
||||
/** Optional timeout override (seconds) for this model entry. */
|
||||
timeoutSeconds?: number;
|
||||
/** Optional language hint for audio transcription. */
|
||||
language?: string;
|
||||
/** Auth profile id to use for this provider. */
|
||||
profile?: string;
|
||||
/** Preferred profile id if multiple are available. */
|
||||
preferredProfile?: string;
|
||||
};
|
||||
|
||||
export type MediaUnderstandingConfig = {
|
||||
/** Enable media understanding when models are configured. */
|
||||
enabled?: boolean;
|
||||
/** Optional scope gating for understanding. */
|
||||
scope?: MediaUnderstandingScopeConfig;
|
||||
/** Default max bytes to send. */
|
||||
maxBytes?: number;
|
||||
/** Default max output characters. */
|
||||
maxChars?: number;
|
||||
/** Default prompt. */
|
||||
prompt?: string;
|
||||
/** Default timeout (seconds). */
|
||||
timeoutSeconds?: number;
|
||||
/** Default language hint (audio). */
|
||||
language?: string;
|
||||
/** Ordered model list (fallbacks in order). */
|
||||
models?: MediaUnderstandingModelConfig[];
|
||||
};
|
||||
|
||||
export type MediaToolsConfig = {
|
||||
image?: MediaUnderstandingConfig;
|
||||
audio?: MediaUnderstandingConfig;
|
||||
video?: MediaUnderstandingConfig;
|
||||
};
|
||||
|
||||
export type ToolProfileId = "minimal" | "coding" | "messaging" | "full";
|
||||
|
||||
@@ -127,13 +199,7 @@ export type ToolsConfig = {
|
||||
};
|
||||
};
|
||||
};
|
||||
audio?: {
|
||||
transcription?: {
|
||||
/** CLI args (template-enabled). */
|
||||
args?: string[];
|
||||
timeoutSeconds?: number;
|
||||
};
|
||||
};
|
||||
media?: MediaToolsConfig;
|
||||
/** Message tool configuration. */
|
||||
message?: {
|
||||
/**
|
||||
|
||||
@@ -5,7 +5,7 @@ import {
|
||||
GroupChatSchema,
|
||||
HumanDelaySchema,
|
||||
IdentitySchema,
|
||||
ToolsAudioTranscriptionSchema,
|
||||
ToolsMediaSchema,
|
||||
} from "./zod-schema.core.js";
|
||||
|
||||
export const HeartbeatSchema = z
|
||||
@@ -283,11 +283,7 @@ export const ToolsSchema = z
|
||||
deny: z.array(z.string()).optional(),
|
||||
byProvider: z.record(z.string(), ToolPolicyWithProfileSchema).optional(),
|
||||
web: ToolsWebSchema,
|
||||
audio: z
|
||||
.object({
|
||||
transcription: ToolsAudioTranscriptionSchema,
|
||||
})
|
||||
.optional(),
|
||||
media: ToolsMediaSchema,
|
||||
message: z
|
||||
.object({
|
||||
allowCrossContextSend: z.boolean().optional(),
|
||||
|
||||
@@ -240,10 +240,68 @@ export const ExecutableTokenSchema = z
|
||||
.string()
|
||||
.refine(isSafeExecutableValue, "expected safe executable name or path");
|
||||
|
||||
export const ToolsAudioTranscriptionSchema = z
|
||||
export const MediaUnderstandingScopeSchema = z
|
||||
.object({
|
||||
default: z.union([z.literal("allow"), z.literal("deny")]).optional(),
|
||||
rules: z
|
||||
.array(
|
||||
z.object({
|
||||
action: z.union([z.literal("allow"), z.literal("deny")]),
|
||||
match: z
|
||||
.object({
|
||||
channel: z.string().optional(),
|
||||
chatType: z
|
||||
.union([z.literal("direct"), z.literal("group"), z.literal("room")])
|
||||
.optional(),
|
||||
keyPrefix: z.string().optional(),
|
||||
})
|
||||
.optional(),
|
||||
}),
|
||||
)
|
||||
.optional(),
|
||||
})
|
||||
.optional();
|
||||
|
||||
export const MediaUnderstandingCapabilitiesSchema = z
|
||||
.array(z.union([z.literal("image"), z.literal("audio"), z.literal("video")]))
|
||||
.optional();
|
||||
|
||||
export const MediaUnderstandingModelSchema = z
|
||||
.object({
|
||||
provider: z.string().optional(),
|
||||
model: z.string().optional(),
|
||||
capabilities: MediaUnderstandingCapabilitiesSchema,
|
||||
type: z.union([z.literal("provider"), z.literal("cli")]).optional(),
|
||||
command: z.string().optional(),
|
||||
args: z.array(z.string()).optional(),
|
||||
prompt: z.string().optional(),
|
||||
maxChars: z.number().int().positive().optional(),
|
||||
maxBytes: z.number().int().positive().optional(),
|
||||
timeoutSeconds: z.number().int().positive().optional(),
|
||||
language: z.string().optional(),
|
||||
profile: z.string().optional(),
|
||||
preferredProfile: z.string().optional(),
|
||||
})
|
||||
.optional();
|
||||
|
||||
export const ToolsMediaUnderstandingSchema = z
|
||||
.object({
|
||||
enabled: z.boolean().optional(),
|
||||
scope: MediaUnderstandingScopeSchema,
|
||||
maxBytes: z.number().int().positive().optional(),
|
||||
maxChars: z.number().int().positive().optional(),
|
||||
prompt: z.string().optional(),
|
||||
timeoutSeconds: z.number().int().positive().optional(),
|
||||
language: z.string().optional(),
|
||||
models: z.array(MediaUnderstandingModelSchema).optional(),
|
||||
})
|
||||
.optional();
|
||||
|
||||
export const ToolsMediaSchema = z
|
||||
.object({
|
||||
image: ToolsMediaUnderstandingSchema.optional(),
|
||||
audio: ToolsMediaUnderstandingSchema.optional(),
|
||||
video: ToolsMediaUnderstandingSchema.optional(),
|
||||
})
|
||||
.optional();
|
||||
|
||||
|
||||
258
src/media-understanding/apply.test.ts
Normal file
258
src/media-understanding/apply.test.ts
Normal file
@@ -0,0 +1,258 @@
|
||||
import fs from "node:fs/promises";
|
||||
import os from "node:os";
|
||||
import path from "node:path";
|
||||
|
||||
import { beforeEach, describe, expect, it, vi } from "vitest";
|
||||
|
||||
import type { ClawdbotConfig } from "../config/config.js";
|
||||
import type { MsgContext } from "../auto-reply/templating.js";
|
||||
import { resolveApiKeyForProvider } from "../agents/model-auth.js";
|
||||
import { fetchRemoteMedia } from "../media/fetch.js";
|
||||
|
||||
vi.mock("../agents/model-auth.js", () => ({
|
||||
resolveApiKeyForProvider: vi.fn(async () => ({
|
||||
apiKey: "test-key",
|
||||
source: "test",
|
||||
})),
|
||||
}));
|
||||
|
||||
vi.mock("../media/fetch.js", () => ({
|
||||
fetchRemoteMedia: vi.fn(),
|
||||
}));
|
||||
|
||||
vi.mock("../process/exec.js", () => ({
|
||||
runExec: vi.fn(),
|
||||
}));
|
||||
|
||||
async function loadApply() {
|
||||
return await import("./apply.js");
|
||||
}
|
||||
|
||||
describe("applyMediaUnderstanding", () => {
|
||||
const mockedResolveApiKey = vi.mocked(resolveApiKeyForProvider);
|
||||
const mockedFetchRemoteMedia = vi.mocked(fetchRemoteMedia);
|
||||
|
||||
beforeEach(() => {
|
||||
mockedResolveApiKey.mockClear();
|
||||
mockedFetchRemoteMedia.mockReset();
|
||||
mockedFetchRemoteMedia.mockResolvedValue({
|
||||
buffer: Buffer.from("audio-bytes"),
|
||||
contentType: "audio/ogg",
|
||||
fileName: "note.ogg",
|
||||
});
|
||||
});
|
||||
|
||||
it("sets Transcript and replaces Body when audio transcription succeeds", async () => {
|
||||
const { applyMediaUnderstanding } = await loadApply();
|
||||
const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-media-"));
|
||||
const audioPath = path.join(dir, "note.ogg");
|
||||
await fs.writeFile(audioPath, "hello");
|
||||
|
||||
const ctx: MsgContext = {
|
||||
Body: "<media:audio>",
|
||||
MediaPath: audioPath,
|
||||
MediaType: "audio/ogg",
|
||||
};
|
||||
const cfg: ClawdbotConfig = {
|
||||
tools: {
|
||||
media: {
|
||||
audio: {
|
||||
enabled: true,
|
||||
maxBytes: 1024 * 1024,
|
||||
models: [{ provider: "groq" }],
|
||||
},
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
const result = await applyMediaUnderstanding({
|
||||
ctx,
|
||||
cfg,
|
||||
providers: {
|
||||
groq: {
|
||||
id: "groq",
|
||||
transcribeAudio: async () => ({ text: "transcribed text" }),
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
expect(result.appliedAudio).toBe(true);
|
||||
expect(ctx.Transcript).toBe("transcribed text");
|
||||
expect(ctx.Body).toBe("[Audio]\nTranscript:\ntranscribed text");
|
||||
expect(ctx.CommandBody).toBe("transcribed text");
|
||||
expect(ctx.RawBody).toBe("transcribed text");
|
||||
});
|
||||
|
||||
it("handles URL-only attachments for audio transcription", async () => {
|
||||
const { applyMediaUnderstanding } = await loadApply();
|
||||
const ctx: MsgContext = {
|
||||
Body: "<media:audio>",
|
||||
MediaUrl: "https://example.com/note.ogg",
|
||||
MediaType: "audio/ogg",
|
||||
ChatType: "dm",
|
||||
};
|
||||
const cfg: ClawdbotConfig = {
|
||||
tools: {
|
||||
media: {
|
||||
audio: {
|
||||
enabled: true,
|
||||
maxBytes: 1024 * 1024,
|
||||
scope: {
|
||||
default: "deny",
|
||||
rules: [{ action: "allow", match: { chatType: "direct" } }],
|
||||
},
|
||||
models: [{ provider: "groq" }],
|
||||
},
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
const result = await applyMediaUnderstanding({
|
||||
ctx,
|
||||
cfg,
|
||||
providers: {
|
||||
groq: {
|
||||
id: "groq",
|
||||
transcribeAudio: async () => ({ text: "remote transcript" }),
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
expect(result.appliedAudio).toBe(true);
|
||||
expect(ctx.Transcript).toBe("remote transcript");
|
||||
expect(ctx.Body).toBe("[Audio]\nTranscript:\nremote transcript");
|
||||
});
|
||||
|
||||
it("skips audio transcription when attachment exceeds maxBytes", async () => {
|
||||
const { applyMediaUnderstanding } = await loadApply();
|
||||
const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-media-"));
|
||||
const audioPath = path.join(dir, "large.wav");
|
||||
await fs.writeFile(audioPath, "0123456789");
|
||||
|
||||
const ctx: MsgContext = {
|
||||
Body: "<media:audio>",
|
||||
MediaPath: audioPath,
|
||||
MediaType: "audio/wav",
|
||||
};
|
||||
const transcribeAudio = vi.fn(async () => ({ text: "should-not-run" }));
|
||||
const cfg: ClawdbotConfig = {
|
||||
tools: {
|
||||
media: {
|
||||
audio: {
|
||||
enabled: true,
|
||||
maxBytes: 4,
|
||||
models: [{ provider: "groq" }],
|
||||
},
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
const result = await applyMediaUnderstanding({
|
||||
ctx,
|
||||
cfg,
|
||||
providers: { groq: { id: "groq", transcribeAudio } },
|
||||
});
|
||||
|
||||
expect(result.appliedAudio).toBe(false);
|
||||
expect(transcribeAudio).not.toHaveBeenCalled();
|
||||
expect(ctx.Body).toBe("<media:audio>");
|
||||
});
|
||||
|
||||
// Verifies that when the first (provider-backed) audio model throws, the
// pipeline falls through to the next configured CLI model and uses its stdout.
it("falls back to CLI model when provider fails", async () => {
  const { applyMediaUnderstanding } = await loadApply();
  const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-media-"));
  const audioPath = path.join(dir, "note.ogg");
  await fs.writeFile(audioPath, "hello");

  const ctx: MsgContext = {
    Body: "<media:audio>",
    MediaPath: audioPath,
    MediaType: "audio/ogg",
  };
  // Two entries: the groq provider (which will fail) then a CLI fallback.
  const cfg: ClawdbotConfig = {
    tools: {
      media: {
        audio: {
          enabled: true,
          models: [
            { provider: "groq" },
            {
              type: "cli",
              command: "whisper",
              args: ["{{MediaPath}}"],
            },
          ],
        },
      },
    },
  };

  // Mock the CLI runner so the fallback "whisper" invocation succeeds.
  const execModule = await import("../process/exec.js");
  vi.mocked(execModule.runExec).mockResolvedValue({
    stdout: "cli transcript\n",
    stderr: "",
  });

  const result = await applyMediaUnderstanding({
    ctx,
    cfg,
    providers: {
      groq: {
        id: "groq",
        transcribeAudio: async () => {
          throw new Error("boom");
        },
      },
    },
  });

  expect(result.appliedAudio).toBe(true);
  // Trailing newline from stdout is trimmed by the pipeline.
  expect(ctx.Transcript).toBe("cli transcript");
  expect(ctx.Body).toBe("[Audio]\nTranscript:\ncli transcript");
});

// Verifies CLI-based image understanding: the description is folded into
// Body while the original caption is preserved for command parsing.
it("uses CLI image understanding and preserves caption for commands", async () => {
  const { applyMediaUnderstanding } = await loadApply();
  const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-media-"));
  const imagePath = path.join(dir, "photo.jpg");
  await fs.writeFile(imagePath, "image-bytes");

  const ctx: MsgContext = {
    Body: "<media:image> show Dom",
    MediaPath: imagePath,
    MediaType: "image/jpeg",
  };
  const cfg: ClawdbotConfig = {
    tools: {
      media: {
        image: {
          enabled: true,
          models: [
            {
              type: "cli",
              command: "gemini",
              args: ["--file", "{{MediaPath}}", "--prompt", "{{Prompt}}"],
            },
          ],
        },
      },
    },
  };

  const execModule = await import("../process/exec.js");
  vi.mocked(execModule.runExec).mockResolvedValue({
    stdout: "image description\n",
    stderr: "",
  });

  const result = await applyMediaUnderstanding({
    ctx,
    cfg,
  });

  expect(result.appliedImage).toBe(true);
  expect(ctx.Body).toBe("[Image]\nUser text:\nshow Dom\nDescription:\nimage description");
  // The caption (sans placeholder) must survive for command routing.
  expect(ctx.CommandBody).toBe("show Dom");
  expect(ctx.RawBody).toBe("show Dom");
});
|
||||
});
|
||||
804
src/media-understanding/apply.ts
Normal file
804
src/media-understanding/apply.ts
Normal file
@@ -0,0 +1,804 @@
|
||||
import crypto from "node:crypto";
|
||||
import fs from "node:fs/promises";
|
||||
import os from "node:os";
|
||||
import path from "node:path";
|
||||
import { fileURLToPath } from "node:url";
|
||||
|
||||
import type { Api, AssistantMessage, Context, Model } from "@mariozechner/pi-ai";
|
||||
import { complete } from "@mariozechner/pi-ai";
|
||||
import { discoverAuthStorage, discoverModels } from "@mariozechner/pi-coding-agent";
|
||||
|
||||
import type { ClawdbotConfig } from "../config/config.js";
|
||||
import type { MsgContext } from "../auto-reply/templating.js";
|
||||
import { applyTemplate } from "../auto-reply/templating.js";
|
||||
import { getApiKeyForModel, resolveApiKeyForProvider } from "../agents/model-auth.js";
|
||||
import { ensureClawdbotModelsJson } from "../agents/models-config.js";
|
||||
import { minimaxUnderstandImage } from "../agents/minimax-vlm.js";
|
||||
import { logVerbose, shouldLogVerbose } from "../globals.js";
|
||||
import { fetchRemoteMedia } from "../media/fetch.js";
|
||||
import { detectMime, getFileExtension, isAudioFileName } from "../media/mime.js";
|
||||
import { runExec } from "../process/exec.js";
|
||||
import type {
|
||||
MediaUnderstandingConfig,
|
||||
MediaUnderstandingModelConfig,
|
||||
MediaUnderstandingScopeConfig,
|
||||
} from "../config/types.tools.js";
|
||||
import { extractMediaUserText, formatMediaUnderstandingBody } from "./format.js";
|
||||
import {
|
||||
buildMediaUnderstandingRegistry,
|
||||
getMediaUnderstandingProvider,
|
||||
normalizeMediaProviderId,
|
||||
} from "./providers/index.js";
|
||||
import { fetchWithTimeout } from "./providers/shared.js";
|
||||
import { normalizeMediaUnderstandingChatType, resolveMediaUnderstandingScope } from "./scope.js";
|
||||
import type {
|
||||
MediaAttachment,
|
||||
MediaUnderstandingOutput,
|
||||
MediaUnderstandingProvider,
|
||||
} from "./types.js";
|
||||
import { coerceImageAssistantText } from "../agents/tools/image-tool.helpers.js";
|
||||
|
||||
const MB = 1024 * 1024;
|
||||
const DEFAULT_MAX_CHARS = 500;
|
||||
const DEFAULT_MAX_CHARS_BY_CAPABILITY: Record<Capability, number | undefined> = {
|
||||
image: DEFAULT_MAX_CHARS,
|
||||
audio: undefined,
|
||||
video: DEFAULT_MAX_CHARS,
|
||||
};
|
||||
const DEFAULT_MAX_BYTES: Record<Capability, number> = {
|
||||
image: 10 * MB,
|
||||
audio: 20 * MB,
|
||||
video: 50 * MB,
|
||||
};
|
||||
const DEFAULT_TIMEOUT_SECONDS: Record<Capability, number> = {
|
||||
image: 60,
|
||||
audio: 60,
|
||||
video: 120,
|
||||
};
|
||||
const DEFAULT_PROMPT: Record<Capability, string> = {
|
||||
image: "Describe the image.",
|
||||
audio: "Transcribe the audio.",
|
||||
video: "Describe the video.",
|
||||
};
|
||||
const DEFAULT_VIDEO_MAX_BASE64_BYTES = 70 * MB;
|
||||
const DEFAULT_AUDIO_MODELS: Record<string, string> = {
|
||||
groq: "whisper-large-v3-turbo",
|
||||
openai: "whisper-1",
|
||||
};
|
||||
const CLI_OUTPUT_MAX_BUFFER = 5 * MB;
|
||||
|
||||
export type ApplyMediaUnderstandingResult = {
|
||||
outputs: MediaUnderstandingOutput[];
|
||||
appliedImage: boolean;
|
||||
appliedAudio: boolean;
|
||||
appliedVideo: boolean;
|
||||
};
|
||||
|
||||
type Capability = "image" | "audio" | "video";
|
||||
|
||||
type MediaBufferResult = {
|
||||
buffer: Buffer;
|
||||
mime?: string;
|
||||
fileName: string;
|
||||
};
|
||||
|
||||
type MediaPathResult = {
|
||||
path: string;
|
||||
cleanup?: () => Promise<void> | void;
|
||||
};
|
||||
|
||||
function normalizeAttachmentPath(raw?: string | null): string | undefined {
|
||||
const value = raw?.trim();
|
||||
if (!value) return undefined;
|
||||
if (value.startsWith("file://")) {
|
||||
try {
|
||||
return fileURLToPath(value);
|
||||
} catch {
|
||||
return undefined;
|
||||
}
|
||||
}
|
||||
return value;
|
||||
}
|
||||
|
||||
function normalizeAttachments(ctx: MsgContext): MediaAttachment[] {
|
||||
const pathsFromArray = Array.isArray(ctx.MediaPaths) ? ctx.MediaPaths : undefined;
|
||||
const urlsFromArray = Array.isArray(ctx.MediaUrls) ? ctx.MediaUrls : undefined;
|
||||
const typesFromArray = Array.isArray(ctx.MediaTypes) ? ctx.MediaTypes : undefined;
|
||||
const resolveMime = (count: number, index: number) => {
|
||||
const typeHint = typesFromArray?.[index];
|
||||
const trimmed = typeof typeHint === "string" ? typeHint.trim() : "";
|
||||
if (trimmed) return trimmed;
|
||||
return count === 1 ? ctx.MediaType : undefined;
|
||||
};
|
||||
|
||||
if (pathsFromArray && pathsFromArray.length > 0) {
|
||||
const count = pathsFromArray.length;
|
||||
const urls = urlsFromArray && urlsFromArray.length > 0 ? urlsFromArray : undefined;
|
||||
return pathsFromArray
|
||||
.map((value, index) => ({
|
||||
path: value?.trim() || undefined,
|
||||
url: urls?.[index] ?? ctx.MediaUrl,
|
||||
mime: resolveMime(count, index),
|
||||
index,
|
||||
}))
|
||||
.filter((entry) => Boolean(entry.path?.trim() || entry.url?.trim()));
|
||||
}
|
||||
|
||||
if (urlsFromArray && urlsFromArray.length > 0) {
|
||||
const count = urlsFromArray.length;
|
||||
return urlsFromArray
|
||||
.map((value, index) => ({
|
||||
path: undefined,
|
||||
url: value?.trim() || undefined,
|
||||
mime: resolveMime(count, index),
|
||||
index,
|
||||
}))
|
||||
.filter((entry) => Boolean(entry.url?.trim()));
|
||||
}
|
||||
|
||||
const pathValue = ctx.MediaPath?.trim();
|
||||
const url = ctx.MediaUrl?.trim();
|
||||
if (!pathValue && !url) return [];
|
||||
return [
|
||||
{
|
||||
path: pathValue || undefined,
|
||||
url: url || undefined,
|
||||
mime: ctx.MediaType,
|
||||
index: 0,
|
||||
},
|
||||
];
|
||||
}
|
||||
|
||||
function isVideoAttachment(attachment: MediaAttachment): boolean {
|
||||
if (attachment.mime?.startsWith("video/")) return true;
|
||||
const ext = getFileExtension(attachment.path ?? attachment.url);
|
||||
if (!ext) return false;
|
||||
return [".mp4", ".mov", ".mkv", ".webm", ".avi", ".m4v"].includes(ext);
|
||||
}
|
||||
|
||||
function isAudioAttachment(attachment: MediaAttachment): boolean {
|
||||
if (attachment.mime?.startsWith("audio/")) return true;
|
||||
return isAudioFileName(attachment.path ?? attachment.url);
|
||||
}
|
||||
|
||||
function isImageAttachment(attachment: MediaAttachment): boolean {
|
||||
if (attachment.mime?.startsWith("image/")) return true;
|
||||
const ext = getFileExtension(attachment.path ?? attachment.url);
|
||||
if (!ext) return false;
|
||||
return [".png", ".jpg", ".jpeg", ".webp", ".gif", ".bmp", ".tiff", ".tif"].includes(ext);
|
||||
}
|
||||
|
||||
function estimateBase64Size(bytes: number): number {
|
||||
return Math.ceil(bytes / 3) * 4;
|
||||
}
|
||||
|
||||
function resolveVideoMaxBase64Bytes(maxBytes: number): number {
|
||||
const expanded = Math.floor(maxBytes * (4 / 3));
|
||||
return Math.min(expanded, DEFAULT_VIDEO_MAX_BASE64_BYTES);
|
||||
}
|
||||
|
||||
function resolveTimeoutMs(seconds: number | undefined, fallbackSeconds: number): number {
|
||||
const value = typeof seconds === "number" && Number.isFinite(seconds) ? seconds : fallbackSeconds;
|
||||
return Math.max(1000, Math.floor(value * 1000));
|
||||
}
|
||||
|
||||
function resolvePrompt(capability: Capability, prompt?: string, maxChars?: number): string {
|
||||
const base = prompt?.trim() || DEFAULT_PROMPT[capability];
|
||||
if (!maxChars || capability === "audio") return base;
|
||||
return `${base} Respond in at most ${maxChars} characters.`;
|
||||
}
|
||||
|
||||
function resolveRequestUrl(input: RequestInfo | URL): string {
|
||||
if (typeof input === "string") return input;
|
||||
if (input instanceof URL) return input.toString();
|
||||
return input.url;
|
||||
}
|
||||
|
||||
function normalizeErrorMessage(err: unknown): string {
|
||||
if (!err) return "";
|
||||
if (typeof err === "string") return err;
|
||||
if (err instanceof Error) return err.message;
|
||||
try {
|
||||
return JSON.stringify(err);
|
||||
} catch {
|
||||
return "";
|
||||
}
|
||||
}
|
||||
|
||||
function resolveMaxChars(params: {
|
||||
capability: Capability;
|
||||
entry: MediaUnderstandingModelConfig;
|
||||
cfg: ClawdbotConfig;
|
||||
}): number | undefined {
|
||||
const { capability, entry, cfg } = params;
|
||||
const configured = entry.maxChars ?? cfg.tools?.media?.[capability]?.maxChars;
|
||||
if (typeof configured === "number") return configured;
|
||||
return DEFAULT_MAX_CHARS_BY_CAPABILITY[capability];
|
||||
}
|
||||
|
||||
function trimOutput(text: string, maxChars?: number): string {
|
||||
const trimmed = text.trim();
|
||||
if (!maxChars || trimmed.length <= maxChars) return trimmed;
|
||||
return trimmed.slice(0, maxChars).trim();
|
||||
}
|
||||
|
||||
function resolveConfigValue<T>(primary: T | undefined, fallback: T): T {
|
||||
return primary === undefined ? fallback : primary;
|
||||
}
|
||||
|
||||
function resolveCapabilityConfig(
|
||||
cfg: ClawdbotConfig,
|
||||
capability: Capability,
|
||||
): MediaUnderstandingConfig | undefined {
|
||||
return cfg.tools?.media?.[capability];
|
||||
}
|
||||
|
||||
function resolveScopeDecision(params: {
|
||||
scope?: MediaUnderstandingScopeConfig;
|
||||
ctx: MsgContext;
|
||||
}): "allow" | "deny" {
|
||||
return resolveMediaUnderstandingScope({
|
||||
scope: params.scope,
|
||||
sessionKey: params.ctx.SessionKey,
|
||||
channel: params.ctx.Surface ?? params.ctx.Provider,
|
||||
chatType: normalizeMediaUnderstandingChatType(params.ctx.ChatType),
|
||||
});
|
||||
}
|
||||
|
||||
function resolveModelEntries(
|
||||
cfg: MediaUnderstandingConfig | undefined,
|
||||
capability: Capability,
|
||||
): MediaUnderstandingModelConfig[] {
|
||||
const models = cfg?.models ?? [];
|
||||
if (models.length === 0) return [];
|
||||
return models.filter((entry) => {
|
||||
const caps = entry.capabilities;
|
||||
if (!caps || caps.length === 0) return true;
|
||||
return caps.includes(capability);
|
||||
});
|
||||
}
|
||||
|
||||
function isMaxBytesError(err: unknown): boolean {
|
||||
const message = normalizeErrorMessage(err);
|
||||
if (!message) return false;
|
||||
return message.includes("exceeds maxBytes") || message.includes("payload exceeds maxBytes");
|
||||
}
|
||||
|
||||
/**
 * Loads an attachment's bytes into memory, preferring a local path and
 * falling back to fetching the URL. Returns undefined (never throws) when
 * the attachment is missing, unreadable, or larger than maxBytes.
 *
 * @param attachment - normalized attachment (local path and/or remote URL).
 * @param maxBytes - hard size limit; oversized media is skipped.
 * @param timeoutMs - per-request timeout for the remote fetch.
 * @returns buffer + resolved mime + display file name, or undefined.
 */
async function loadAttachmentBuffer(params: {
  attachment: MediaAttachment;
  maxBytes: number;
  timeoutMs: number;
}): Promise<MediaBufferResult | undefined> {
  const { attachment, maxBytes, timeoutMs } = params;
  const rawPath = normalizeAttachmentPath(attachment.path);
  if (rawPath) {
    const resolved = path.isAbsolute(rawPath) ? rawPath : path.resolve(rawPath);
    try {
      const stat = await fs.stat(resolved);
      // Reject directories, sockets, etc.
      if (!stat.isFile()) return undefined;
      if (stat.size > maxBytes) {
        if (shouldLogVerbose()) {
          logVerbose(
            `Skipping media attachment ${attachment.index + 1}: ${stat.size} bytes exceeds ${maxBytes}`,
          );
        }
        return undefined;
      }
      const buffer = await fs.readFile(resolved);
      // Explicit mime hint wins; otherwise sniff from content/extension.
      const mime =
        attachment.mime ??
        (await detectMime({
          buffer,
          filePath: resolved,
        }));
      const fileName = path.basename(resolved) || `media-${attachment.index + 1}`;
      return { buffer, mime, fileName };
    } catch (err) {
      // Local read failed — fall through and try the URL instead.
      if (shouldLogVerbose()) {
        logVerbose(`Failed to read attachment ${attachment.index + 1}: ${String(err)}`);
      }
    }
  }

  const url = attachment.url?.trim();
  if (!url) return undefined;

  try {
    // Wrap fetch so every request carries the configured timeout.
    const fetchImpl = (input: RequestInfo | URL, init?: RequestInit) =>
      fetchWithTimeout(resolveRequestUrl(input), init ?? {}, timeoutMs, fetch);
    const fetched = await fetchRemoteMedia({ url, fetchImpl, maxBytes });
    // Double-check size: the fetcher's limit may be advisory.
    if (fetched.buffer.length > maxBytes) {
      if (shouldLogVerbose()) {
        logVerbose(
          `Skipping media attachment ${attachment.index + 1}: ${fetched.buffer.length} bytes exceeds ${maxBytes}`,
        );
      }
      return undefined;
    }
    // Mime precedence: explicit hint, then HTTP content type, then sniffing.
    const mime =
      attachment.mime ??
      fetched.contentType ??
      (await detectMime({
        buffer: fetched.buffer,
        filePath: fetched.fileName ?? url,
      }));
    const fileName = fetched.fileName ?? `media-${attachment.index + 1}`;
    return { buffer: fetched.buffer, mime, fileName };
  } catch (err) {
    if (shouldLogVerbose()) {
      logVerbose(`Failed to fetch attachment ${attachment.index + 1}: ${String(err)}`);
    }
  }

  return undefined;
}
|
||||
|
||||
/**
 * Resolves an attachment to an on-disk path for CLI-based processing.
 * Local paths are validated in place; remote URLs are downloaded to a
 * temp file whose `cleanup` callback the caller must invoke when done.
 * Returns undefined (never throws) on missing/oversized/unreachable media.
 *
 * @param maxBytes - optional size limit; enforced for both local and remote.
 * @param timeoutMs - per-request timeout for the remote fetch.
 */
async function resolveAttachmentPath(params: {
  attachment: MediaAttachment;
  maxBytes?: number;
  timeoutMs: number;
}): Promise<MediaPathResult | undefined> {
  const { attachment, maxBytes, timeoutMs } = params;
  const rawPath = normalizeAttachmentPath(attachment.path);
  if (rawPath) {
    const resolved = path.isAbsolute(rawPath) ? rawPath : path.resolve(rawPath);
    try {
      const stat = await fs.stat(resolved);
      if (!stat.isFile()) return undefined;
      if (maxBytes && stat.size > maxBytes) {
        if (shouldLogVerbose()) {
          logVerbose(
            `Skipping media attachment ${attachment.index + 1}: ${stat.size} bytes exceeds ${maxBytes}`,
          );
        }
        return undefined;
      }
      // Existing local file: no cleanup needed.
      return { path: resolved };
    } catch (err) {
      // Local stat failed — fall through and try the URL instead.
      if (shouldLogVerbose()) {
        logVerbose(`Failed to read attachment ${attachment.index + 1}: ${String(err)}`);
      }
    }
  }

  const url = attachment.url?.trim();
  if (!url) return undefined;

  try {
    const fetchImpl = (input: RequestInfo | URL, init?: RequestInit) =>
      fetchWithTimeout(resolveRequestUrl(input), init ?? {}, timeoutMs, fetch);
    const fetched = await fetchRemoteMedia({ url, fetchImpl, maxBytes });
    const buffer = fetched.buffer;
    if (maxBytes && buffer.length > maxBytes) {
      if (shouldLogVerbose()) {
        logVerbose(
          `Skipping media attachment ${attachment.index + 1}: ${buffer.length} bytes exceeds ${maxBytes}`,
        );
      }
      return undefined;
    }
    // Preserve the remote extension so downstream tools can sniff by name.
    const extension = fetched.fileName ? path.extname(fetched.fileName) : "";
    const tmpPath = path.join(
      os.tmpdir(),
      `clawdbot-media-${crypto.randomUUID()}${extension || ""}`,
    );
    await fs.writeFile(tmpPath, buffer);
    return {
      path: tmpPath,
      // Caller is responsible for invoking cleanup; unlink errors are ignored.
      cleanup: async () => {
        await fs.unlink(tmpPath).catch(() => {});
      },
    };
  } catch (err) {
    if (shouldLogVerbose()) {
      logVerbose(`Failed to fetch attachment ${attachment.index + 1}: ${String(err)}`);
    }
  }

  return undefined;
}
|
||||
|
||||
/**
 * Describes an image with a configured LLM provider/model.
 * Resolves the model from the registry, checks image-input support,
 * obtains credentials, and dispatches either to the MiniMax VLM helper
 * or the generic chat-completion path with an inline base64 image.
 *
 * @throws when the model is unknown or does not accept image input.
 * @returns the description text and the concrete model id used.
 */
async function describeImageWithModel(params: {
  cfg: ClawdbotConfig;
  agentDir: string;
  provider: string;
  model: string;
  prompt: string;
  maxChars?: number;
  buffer: Buffer;
  mimeType: string;
  profile?: string;
  preferredProfile?: string;
}): Promise<{ text: string; model: string }> {
  // Make sure the models registry file exists before discovery.
  await ensureClawdbotModelsJson(params.cfg, params.agentDir);
  const authStorage = discoverAuthStorage(params.agentDir);
  const modelRegistry = discoverModels(authStorage, params.agentDir);
  const model = modelRegistry.find(params.provider, params.model) as Model<Api> | null;
  if (!model) {
    throw new Error(`Unknown model: ${params.provider}/${params.model}`);
  }
  if (!model.input?.includes("image")) {
    throw new Error(`Model does not support images: ${params.provider}/${params.model}`);
  }
  const apiKeyInfo = await getApiKeyForModel({
    model,
    cfg: params.cfg,
    agentDir: params.agentDir,
    profileId: params.profile,
    preferredProfile: params.preferredProfile,
  });
  // Store the key so downstream provider calls can authenticate.
  authStorage.setRuntimeApiKey(model.provider, apiKeyInfo.apiKey);

  const base64 = params.buffer.toString("base64");
  // MiniMax uses a dedicated VLM endpoint rather than the chat API.
  if (model.provider === "minimax") {
    const text = await minimaxUnderstandImage({
      apiKey: apiKeyInfo.apiKey,
      prompt: params.prompt,
      imageDataUrl: `data:${params.mimeType};base64,${base64}`,
      modelBaseUrl: model.baseUrl,
    });
    return { text, model: model.id };
  }

  // Generic path: single user message with the prompt plus inline image.
  const context: Context = {
    messages: [
      {
        role: "user",
        content: [
          { type: "text", text: params.prompt },
          { type: "image", data: base64, mimeType: params.mimeType },
        ],
        timestamp: Date.now(),
      },
    ],
  };
  const message = (await complete(model, context, {
    apiKey: apiKeyInfo.apiKey,
    maxTokens: 512,
  })) as AssistantMessage;
  // Normalize the assistant response into plain text.
  const text = coerceImageAssistantText({
    message,
    provider: model.provider,
    model: model.id,
  });
  return { text, model: model.id };
}
|
||||
|
||||
/**
 * Runs one provider-backed model entry against a single attachment.
 * Dispatches per capability: image goes through the model registry
 * (requires agentDir); audio/video go through the media-provider registry.
 * Returns null when the attachment is skipped (e.g. oversized); throws on
 * misconfiguration or provider failure so the caller can try the next entry.
 */
async function runProviderEntry(params: {
  capability: Capability;
  entry: MediaUnderstandingModelConfig;
  cfg: ClawdbotConfig;
  ctx: MsgContext;
  attachment: MediaAttachment;
  agentDir?: string;
  providerRegistry: Map<string, MediaUnderstandingProvider>;
}): Promise<MediaUnderstandingOutput | null> {
  const { entry, capability, cfg, attachment } = params;
  const providerIdRaw = entry.provider?.trim();
  if (!providerIdRaw) {
    throw new Error(`Provider entry missing provider for ${capability}`);
  }
  const providerId = normalizeMediaProviderId(providerIdRaw);
  // Per-entry limits override capability-level config, which overrides defaults.
  const maxBytes = entry.maxBytes ?? resolveConfigValue(cfg.tools?.media?.[capability]?.maxBytes, DEFAULT_MAX_BYTES[capability]);
  const maxChars = resolveMaxChars({ capability, entry, cfg });
  const timeoutMs = resolveTimeoutMs(
    entry.timeoutSeconds ?? cfg.tools?.media?.[capability]?.timeoutSeconds,
    DEFAULT_TIMEOUT_SECONDS[capability],
  );
  const prompt = resolvePrompt(
    capability,
    entry.prompt ?? cfg.tools?.media?.[capability]?.prompt,
    maxChars,
  );

  if (capability === "image") {
    // Image understanding runs through the model registry, not the
    // media-provider registry, so it needs an agent directory and model id.
    if (!params.agentDir) {
      throw new Error("Image understanding requires agentDir");
    }
    const modelId = entry.model?.trim();
    if (!modelId) {
      throw new Error("Image understanding requires model id");
    }
    const media = await loadAttachmentBuffer({ attachment, maxBytes, timeoutMs });
    if (!media) return null;
    const mimeType = media.mime ?? "image/jpeg";
    const result = await describeImageWithModel({
      cfg,
      agentDir: params.agentDir,
      provider: providerId,
      model: modelId,
      prompt,
      maxChars,
      buffer: media.buffer,
      mimeType,
      profile: entry.profile,
      preferredProfile: entry.preferredProfile,
    });
    return {
      kind: "image.description",
      attachmentIndex: attachment.index,
      text: trimOutput(result.text, maxChars),
      provider: providerId,
      model: result.model,
    };
  }

  const provider = getMediaUnderstandingProvider(providerId, params.providerRegistry);
  if (!provider) {
    throw new Error(`Media provider not available: ${providerId}`);
  }

  if (capability === "audio") {
    if (!provider.transcribeAudio) {
      throw new Error(`Audio transcription provider "${providerId}" not available.`);
    }
    const media = await loadAttachmentBuffer({ attachment, maxBytes, timeoutMs });
    if (!media) return null;
    const key = await resolveApiKeyForProvider({
      provider: providerId,
      cfg,
      profileId: entry.profile,
      preferredProfile: entry.preferredProfile,
      agentDir: params.agentDir,
    });
    const providerConfig = cfg.models?.providers?.[providerId];
    // Fall back to the provider's default whisper-class model when unset.
    const model = entry.model?.trim() || DEFAULT_AUDIO_MODELS[providerId] || entry.model;
    const result = await provider.transcribeAudio({
      buffer: media.buffer,
      fileName: media.fileName,
      mime: media.mime,
      apiKey: key.apiKey,
      baseUrl: providerConfig?.baseUrl,
      headers: providerConfig?.headers,
      model,
      language: entry.language ?? cfg.tools?.media?.audio?.language,
      prompt,
      timeoutMs,
    });
    return {
      kind: "audio.transcription",
      attachmentIndex: attachment.index,
      text: trimOutput(result.text, maxChars),
      provider: providerId,
      model: result.model ?? model,
    };
  }

  if (capability === "video") {
    if (!provider.describeVideo) {
      throw new Error(`Video understanding provider "${providerId}" not available.`);
    }
    const media = await loadAttachmentBuffer({ attachment, maxBytes, timeoutMs });
    if (!media) return null;
    // Video is sent base64-encoded; skip payloads the encoding would bloat
    // past the provider budget.
    const estimatedBase64Bytes = estimateBase64Size(media.buffer.length);
    const maxBase64Bytes = resolveVideoMaxBase64Bytes(maxBytes);
    if (estimatedBase64Bytes > maxBase64Bytes) {
      if (shouldLogVerbose()) {
        logVerbose(
          `Skipping video attachment ${attachment.index + 1}: base64 payload ${estimatedBase64Bytes} exceeds ${maxBase64Bytes}`,
        );
      }
      return null;
    }
    const key = await resolveApiKeyForProvider({
      provider: providerId,
      cfg,
      profileId: entry.profile,
      preferredProfile: entry.preferredProfile,
      agentDir: params.agentDir,
    });
    const providerConfig = cfg.models?.providers?.[providerId];
    const result = await provider.describeVideo({
      buffer: media.buffer,
      fileName: media.fileName,
      mime: media.mime,
      apiKey: key.apiKey,
      baseUrl: providerConfig?.baseUrl,
      headers: providerConfig?.headers,
      model: entry.model,
      prompt,
      timeoutMs,
    });
    return {
      kind: "video.description",
      attachmentIndex: attachment.index,
      text: trimOutput(result.text, maxChars),
      provider: providerId,
      model: result.model ?? entry.model,
    };
  }

  return null;
}
|
||||
|
||||
/**
 * Runs one CLI-based model entry: materializes the attachment to disk,
 * expands {{...}} templates in the configured args (the command itself is
 * NOT templated), executes the command, and returns trimmed stdout as the
 * understanding output. Temp files from remote fetches are always cleaned up.
 * Returns null when the attachment is skipped or stdout is empty.
 */
async function runCliEntry(params: {
  capability: Capability;
  entry: MediaUnderstandingModelConfig;
  cfg: ClawdbotConfig;
  ctx: MsgContext;
  attachment: MediaAttachment;
}): Promise<MediaUnderstandingOutput | null> {
  const { entry, capability, cfg, ctx, attachment } = params;
  const command = entry.command?.trim();
  const args = entry.args ?? [];
  if (!command) {
    throw new Error(`CLI entry missing command for ${capability}`);
  }
  const maxBytes = entry.maxBytes ?? resolveConfigValue(cfg.tools?.media?.[capability]?.maxBytes, DEFAULT_MAX_BYTES[capability]);
  const maxChars = resolveMaxChars({ capability, entry, cfg });
  const timeoutMs = resolveTimeoutMs(
    entry.timeoutSeconds ?? cfg.tools?.media?.[capability]?.timeoutSeconds,
    DEFAULT_TIMEOUT_SECONDS[capability],
  );
  const prompt = resolvePrompt(
    capability,
    entry.prompt ?? cfg.tools?.media?.[capability]?.prompt,
    maxChars,
  );
  const pathResult = await resolveAttachmentPath({
    attachment,
    maxBytes,
    timeoutMs,
  });
  if (!pathResult) return null;

  // Template context exposes the resolved path/prompt to arg placeholders.
  const templCtx: MsgContext = {
    ...ctx,
    MediaPath: pathResult.path,
    Prompt: prompt,
    MaxChars: maxChars,
  };
  // index 0 is the command itself and is deliberately left untemplated.
  const argv = [command, ...args].map((part, index) =>
    index === 0 ? part : applyTemplate(part, templCtx),
  );
  if (shouldLogVerbose()) {
    logVerbose(`Media understanding via CLI: ${argv.join(" ")}`);
  }
  try {
    const { stdout } = await runExec(argv[0], argv.slice(1), {
      timeoutMs,
      maxBuffer: CLI_OUTPUT_MAX_BUFFER,
    });
    const text = trimOutput(stdout, maxChars);
    // Empty output is treated as "no result", letting the caller try the next entry.
    if (!text) return null;
    return {
      kind: capability === "audio" ? "audio.transcription" : `${capability}.description`,
      attachmentIndex: attachment.index,
      text,
      provider: "cli",
      model: command,
    };
  } finally {
    // Remove any temp file created for a remote attachment, even on failure.
    if (pathResult.cleanup) {
      await pathResult.cleanup();
    }
  }
}
|
||||
|
||||
/**
 * Runs one capability (image/audio/video) over the message's attachments.
 * Checks the capability config, scope policy, and the first matching
 * attachment, then tries each configured model entry in order until one
 * produces an output. Per-entry failures are logged and swallowed so the
 * next entry can act as a fallback. Returns null when nothing applies.
 */
async function runCapability(params: {
  capability: Capability;
  cfg: ClawdbotConfig;
  ctx: MsgContext;
  attachments: MediaAttachment[];
  agentDir?: string;
  providerRegistry: Map<string, MediaUnderstandingProvider>;
}): Promise<MediaUnderstandingOutput | null> {
  const { capability, cfg, ctx, attachments } = params;
  const config = resolveCapabilityConfig(cfg, capability);
  if (!config || config.enabled === false) return null;
  const entries = resolveModelEntries(config, capability);
  if (entries.length === 0) return null;

  const scopeDecision = resolveScopeDecision({ scope: config.scope, ctx });
  if (scopeDecision === "deny") {
    if (shouldLogVerbose()) {
      logVerbose(`${capability} understanding disabled by scope policy.`);
    }
    return null;
  }

  // Only the FIRST attachment matching the capability is processed.
  const attachment = attachments.find((item) => {
    if (capability === "image") return isImageAttachment(item);
    if (capability === "audio") return isAudioAttachment(item);
    return isVideoAttachment(item);
  });
  if (!attachment) return null;

  for (const entry of entries) {
    try {
      // Entries without an explicit type infer "cli" from the presence of a command.
      const entryType = entry.type ?? (entry.command ? "cli" : "provider");
      const result =
        entryType === "cli"
          ? await runCliEntry({ capability, entry, cfg, ctx, attachment })
          : await runProviderEntry({
              capability,
              entry,
              cfg,
              ctx,
              attachment,
              agentDir: params.agentDir,
              providerRegistry: params.providerRegistry,
            });
      if (result) return result;
    } catch (err) {
      // Size-limit errors and generic failures are both non-fatal; the
      // loop simply advances to the next configured entry.
      if (isMaxBytesError(err)) {
        if (shouldLogVerbose()) {
          logVerbose(`Skipping ${capability} model due to size: ${String(err)}`);
        }
      } else if (shouldLogVerbose()) {
        logVerbose(`${capability} understanding failed: ${String(err)}`);
      }
    }
  }

  return null;
}
|
||||
|
||||
/**
 * Entry point for inbound media understanding. Runs the image, audio, and
 * video capabilities (in that order) over the message's attachments and
 * mutates ctx in place: Body is rewritten with the formatted outputs;
 * a transcription replaces CommandBody/RawBody (and sets Transcript);
 * otherwise the original caption is restored for command routing.
 *
 * @param providers - optional provider overrides (used by tests/mocking).
 * @returns the raw outputs plus per-capability applied flags.
 */
export async function applyMediaUnderstanding(params: {
  ctx: MsgContext;
  cfg: ClawdbotConfig;
  agentDir?: string;
  providers?: Record<string, MediaUnderstandingProvider>;
}): Promise<ApplyMediaUnderstandingResult> {
  const { ctx, cfg } = params;
  // Capture the user's caption (minus media placeholders) before Body is
  // rewritten, so command parsing can still see it afterwards.
  const commandCandidates = [ctx.CommandBody, ctx.RawBody, ctx.Body];
  const originalUserText =
    commandCandidates
      .map((value) => extractMediaUserText(value))
      .find((value) => value && value.trim()) ?? undefined;

  const attachments = normalizeAttachments(ctx);
  const providerRegistry = buildMediaUnderstandingRegistry(params.providers);
  const outputs: MediaUnderstandingOutput[] = [];

  const imageOutput = await runCapability({
    capability: "image",
    cfg,
    ctx,
    attachments,
    agentDir: params.agentDir,
    providerRegistry,
  });
  if (imageOutput) outputs.push(imageOutput);

  const audioOutput = await runCapability({
    capability: "audio",
    cfg,
    ctx,
    attachments,
    agentDir: params.agentDir,
    providerRegistry,
  });
  if (audioOutput) outputs.push(audioOutput);

  const videoOutput = await runCapability({
    capability: "video",
    cfg,
    ctx,
    attachments,
    agentDir: params.agentDir,
    providerRegistry,
  });
  if (videoOutput) outputs.push(videoOutput);

  if (outputs.length > 0) {
    ctx.Body = formatMediaUnderstandingBody({ body: ctx.Body, outputs });
    const audioResult = outputs.find((output) => output.kind === "audio.transcription");
    if (audioResult) {
      // A transcript becomes the effective message text for commands.
      ctx.Transcript = audioResult.text;
      ctx.CommandBody = audioResult.text;
      ctx.RawBody = audioResult.text;
    } else if (originalUserText) {
      // No transcript: keep the original caption usable for commands.
      ctx.CommandBody = originalUserText;
      ctx.RawBody = originalUserText;
    }
    // Append rather than overwrite: earlier pipeline stages may have run too.
    ctx.MediaUnderstanding = [...(ctx.MediaUnderstanding ?? []), ...outputs];
  }

  return {
    outputs,
    appliedImage: outputs.some((output) => output.kind === "image.description"),
    appliedAudio: outputs.some((output) => output.kind === "audio.transcription"),
    appliedVideo: outputs.some((output) => output.kind === "video.description"),
  };
}
|
||||
91
src/media-understanding/format.test.ts
Normal file
91
src/media-understanding/format.test.ts
Normal file
@@ -0,0 +1,91 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
import { formatMediaUnderstandingBody } from "./format.js";
|
||||
|
||||
// Covers the body-formatting rules: placeholder replacement, caption
// preservation/stripping, single "User text" header for multiple outputs,
// and the image section shape.
describe("formatMediaUnderstandingBody", () => {
  it("replaces placeholder body with transcript", () => {
    const body = formatMediaUnderstandingBody({
      body: "<media:audio>",
      outputs: [
        {
          kind: "audio.transcription",
          attachmentIndex: 0,
          text: "hello world",
          provider: "groq",
        },
      ],
    });
    expect(body).toBe("[Audio]\nTranscript:\nhello world");
  });

  it("includes user text when body is meaningful", () => {
    const body = formatMediaUnderstandingBody({
      body: "caption here",
      outputs: [
        {
          kind: "audio.transcription",
          attachmentIndex: 0,
          text: "transcribed",
          provider: "groq",
        },
      ],
    });
    expect(body).toBe("[Audio]\nUser text:\ncaption here\nTranscript:\ntranscribed");
  });

  it("strips leading media placeholders from user text", () => {
    const body = formatMediaUnderstandingBody({
      body: "<media:audio> caption here",
      outputs: [
        {
          kind: "audio.transcription",
          attachmentIndex: 0,
          text: "transcribed",
          provider: "groq",
        },
      ],
    });
    expect(body).toBe("[Audio]\nUser text:\ncaption here\nTranscript:\ntranscribed");
  });

  it("keeps user text once when multiple outputs exist", () => {
    const body = formatMediaUnderstandingBody({
      body: "caption here",
      outputs: [
        {
          kind: "audio.transcription",
          attachmentIndex: 0,
          text: "audio text",
          provider: "groq",
        },
        {
          kind: "video.description",
          attachmentIndex: 1,
          text: "video text",
          provider: "google",
        },
      ],
    });
    // With >1 output the caption is hoisted into its own leading section.
    expect(body).toBe(
      [
        "User text:\ncaption here",
        "[Audio]\nTranscript:\naudio text",
        "[Video]\nDescription:\nvideo text",
      ].join("\n\n"),
    );
  });

  it("formats image outputs", () => {
    const body = formatMediaUnderstandingBody({
      body: "<media:image>",
      outputs: [
        {
          kind: "image.description",
          attachmentIndex: 0,
          text: "a cat",
          provider: "openai",
        },
      ],
    });
    expect(body).toBe("[Image]\nDescription:\na cat");
  });
});
|
||||
77
src/media-understanding/format.ts
Normal file
77
src/media-understanding/format.ts
Normal file
@@ -0,0 +1,77 @@
|
||||
import type { MediaUnderstandingOutput } from "./types.js";
|
||||
|
||||
const MEDIA_PLACEHOLDER_RE = /^<media:[^>]+>(\s*\([^)]*\))?$/i;
|
||||
const MEDIA_PLACEHOLDER_TOKEN_RE = /^<media:[^>]+>(\s*\([^)]*\))?\s*/i;
|
||||
|
||||
export function extractMediaUserText(body?: string): string | undefined {
|
||||
const trimmed = body?.trim() ?? "";
|
||||
if (!trimmed) return undefined;
|
||||
if (MEDIA_PLACEHOLDER_RE.test(trimmed)) return undefined;
|
||||
const cleaned = trimmed.replace(MEDIA_PLACEHOLDER_TOKEN_RE, "").trim();
|
||||
return cleaned || undefined;
|
||||
}
|
||||
|
||||
function formatSection(
|
||||
title: "Audio" | "Video" | "Image",
|
||||
kind: "Transcript" | "Description",
|
||||
text: string,
|
||||
userText?: string,
|
||||
): string {
|
||||
const lines = [`[${title}]`];
|
||||
if (userText) {
|
||||
lines.push(`User text:\n${userText}`);
|
||||
}
|
||||
lines.push(`${kind}:\n${text}`);
|
||||
return lines.join("\n");
|
||||
}
|
||||
|
||||
export function formatMediaUnderstandingBody(params: {
|
||||
body?: string;
|
||||
outputs: MediaUnderstandingOutput[];
|
||||
}): string {
|
||||
const outputs = params.outputs.filter((output) => output.text.trim());
|
||||
if (outputs.length === 0) {
|
||||
return params.body ?? "";
|
||||
}
|
||||
|
||||
const userText = extractMediaUserText(params.body);
|
||||
const sections: string[] = [];
|
||||
if (userText && outputs.length > 1) {
|
||||
sections.push(`User text:\n${userText}`);
|
||||
}
|
||||
|
||||
for (const output of outputs) {
|
||||
if (output.kind === "audio.transcription") {
|
||||
sections.push(
|
||||
formatSection(
|
||||
"Audio",
|
||||
"Transcript",
|
||||
output.text,
|
||||
outputs.length === 1 ? userText : undefined,
|
||||
),
|
||||
);
|
||||
continue;
|
||||
}
|
||||
if (output.kind === "image.description") {
|
||||
sections.push(
|
||||
formatSection(
|
||||
"Image",
|
||||
"Description",
|
||||
output.text,
|
||||
outputs.length === 1 ? userText : undefined,
|
||||
),
|
||||
);
|
||||
continue;
|
||||
}
|
||||
sections.push(
|
||||
formatSection(
|
||||
"Video",
|
||||
"Description",
|
||||
output.text,
|
||||
outputs.length === 1 ? userText : undefined,
|
||||
),
|
||||
);
|
||||
}
|
||||
|
||||
return sections.join("\n\n").trim();
|
||||
}
|
||||
9
src/media-understanding/index.ts
Normal file
9
src/media-understanding/index.ts
Normal file
@@ -0,0 +1,9 @@
|
||||
export { applyMediaUnderstanding } from "./apply.js";
|
||||
export { formatMediaUnderstandingBody } from "./format.js";
|
||||
export { resolveMediaUnderstandingScope } from "./scope.js";
|
||||
export type {
|
||||
MediaAttachment,
|
||||
MediaUnderstandingOutput,
|
||||
MediaUnderstandingProvider,
|
||||
MediaUnderstandingKind,
|
||||
} from "./types.js";
|
||||
7
src/media-understanding/providers/google/index.ts
Normal file
7
src/media-understanding/providers/google/index.ts
Normal file
@@ -0,0 +1,7 @@
|
||||
import type { MediaUnderstandingProvider } from "../../types.js";
|
||||
import { describeGeminiVideo } from "./video.js";
|
||||
|
||||
export const googleProvider: MediaUnderstandingProvider = {
|
||||
id: "google",
|
||||
describeVideo: describeGeminiVideo,
|
||||
};
|
||||
93
src/media-understanding/providers/google/video.test.ts
Normal file
93
src/media-understanding/providers/google/video.test.ts
Normal file
@@ -0,0 +1,93 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
|
||||
import { describeGeminiVideo } from "./video.js";
|
||||
|
||||
const resolveRequestUrl = (input: RequestInfo | URL) => {
|
||||
if (typeof input === "string") return input;
|
||||
if (input instanceof URL) return input.toString();
|
||||
return input.url;
|
||||
};
|
||||
|
||||
describe("describeGeminiVideo", () => {
|
||||
it("respects case-insensitive x-goog-api-key overrides", async () => {
|
||||
let seenKey: string | null = null;
|
||||
const fetchFn = async (_input: RequestInfo | URL, init?: RequestInit) => {
|
||||
const headers = new Headers(init?.headers);
|
||||
seenKey = headers.get("x-goog-api-key");
|
||||
return new Response(
|
||||
JSON.stringify({
|
||||
candidates: [{ content: { parts: [{ text: "video ok" }] } }],
|
||||
}),
|
||||
{ status: 200, headers: { "content-type": "application/json" } },
|
||||
);
|
||||
};
|
||||
|
||||
const result = await describeGeminiVideo({
|
||||
buffer: Buffer.from("video"),
|
||||
fileName: "clip.mp4",
|
||||
apiKey: "test-key",
|
||||
timeoutMs: 1000,
|
||||
headers: { "X-Goog-Api-Key": "override" },
|
||||
fetchFn,
|
||||
});
|
||||
|
||||
expect(seenKey).toBe("override");
|
||||
expect(result.text).toBe("video ok");
|
||||
});
|
||||
|
||||
it("builds the expected request payload", async () => {
|
||||
let seenUrl: string | null = null;
|
||||
let seenInit: RequestInit | undefined;
|
||||
const fetchFn = async (input: RequestInfo | URL, init?: RequestInit) => {
|
||||
seenUrl = resolveRequestUrl(input);
|
||||
seenInit = init;
|
||||
return new Response(
|
||||
JSON.stringify({
|
||||
candidates: [
|
||||
{
|
||||
content: {
|
||||
parts: [{ text: "first" }, { text: " second " }, { text: "" }],
|
||||
},
|
||||
},
|
||||
],
|
||||
}),
|
||||
{ status: 200, headers: { "content-type": "application/json" } },
|
||||
);
|
||||
};
|
||||
|
||||
const result = await describeGeminiVideo({
|
||||
buffer: Buffer.from("video-bytes"),
|
||||
fileName: "clip.mp4",
|
||||
apiKey: "test-key",
|
||||
timeoutMs: 1500,
|
||||
baseUrl: "https://example.com/v1beta/",
|
||||
model: "gemini-3-pro",
|
||||
headers: { "X-Other": "1" },
|
||||
fetchFn,
|
||||
});
|
||||
|
||||
expect(result.model).toBe("gemini-3-pro-preview");
|
||||
expect(result.text).toBe("first\nsecond");
|
||||
expect(seenUrl).toBe("https://example.com/v1beta/models/gemini-3-pro-preview:generateContent");
|
||||
expect(seenInit?.method).toBe("POST");
|
||||
expect(seenInit?.signal).toBeInstanceOf(AbortSignal);
|
||||
|
||||
const headers = new Headers(seenInit?.headers);
|
||||
expect(headers.get("x-goog-api-key")).toBe("test-key");
|
||||
expect(headers.get("content-type")).toBe("application/json");
|
||||
expect(headers.get("x-other")).toBe("1");
|
||||
|
||||
const bodyText =
|
||||
typeof seenInit?.body === "string"
|
||||
? seenInit.body
|
||||
: Buffer.isBuffer(seenInit?.body)
|
||||
? seenInit.body.toString("utf8")
|
||||
: "";
|
||||
const body = JSON.parse(bodyText);
|
||||
expect(body.contents?.[0]?.parts?.[0]?.text).toBe("Describe the video.");
|
||||
expect(body.contents?.[0]?.parts?.[1]?.inline_data?.mime_type).toBe("video/mp4");
|
||||
expect(body.contents?.[0]?.parts?.[1]?.inline_data?.data).toBe(
|
||||
Buffer.from("video-bytes").toString("base64"),
|
||||
);
|
||||
});
|
||||
});
|
||||
84
src/media-understanding/providers/google/video.ts
Normal file
84
src/media-understanding/providers/google/video.ts
Normal file
@@ -0,0 +1,84 @@
|
||||
import type { VideoDescriptionRequest, VideoDescriptionResult } from "../../types.js";
|
||||
import { normalizeGoogleModelId } from "../../../agents/models-config.providers.js";
|
||||
import { fetchWithTimeout, normalizeBaseUrl, readErrorResponse } from "../shared.js";
|
||||
|
||||
export const DEFAULT_GOOGLE_VIDEO_BASE_URL = "https://generativelanguage.googleapis.com/v1beta";
|
||||
const DEFAULT_GOOGLE_VIDEO_MODEL = "gemini-3-flash-preview";
|
||||
const DEFAULT_GOOGLE_VIDEO_PROMPT = "Describe the video.";
|
||||
|
||||
function resolveModel(model?: string): string {
|
||||
const trimmed = model?.trim();
|
||||
if (!trimmed) return DEFAULT_GOOGLE_VIDEO_MODEL;
|
||||
return normalizeGoogleModelId(trimmed);
|
||||
}
|
||||
|
||||
function resolvePrompt(prompt?: string): string {
|
||||
const trimmed = prompt?.trim();
|
||||
return trimmed || DEFAULT_GOOGLE_VIDEO_PROMPT;
|
||||
}
|
||||
|
||||
export async function describeGeminiVideo(
|
||||
params: VideoDescriptionRequest,
|
||||
): Promise<VideoDescriptionResult> {
|
||||
const fetchFn = params.fetchFn ?? fetch;
|
||||
const baseUrl = normalizeBaseUrl(params.baseUrl, DEFAULT_GOOGLE_VIDEO_BASE_URL);
|
||||
const model = resolveModel(params.model);
|
||||
const url = `${baseUrl}/models/${model}:generateContent`;
|
||||
|
||||
const headers = new Headers(params.headers);
|
||||
if (!headers.has("content-type")) {
|
||||
headers.set("content-type", "application/json");
|
||||
}
|
||||
if (!headers.has("x-goog-api-key")) {
|
||||
headers.set("x-goog-api-key", params.apiKey);
|
||||
}
|
||||
|
||||
const body = {
|
||||
contents: [
|
||||
{
|
||||
role: "user",
|
||||
parts: [
|
||||
{ text: resolvePrompt(params.prompt) },
|
||||
{
|
||||
inline_data: {
|
||||
mime_type: params.mime ?? "video/mp4",
|
||||
data: params.buffer.toString("base64"),
|
||||
},
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
};
|
||||
|
||||
const res = await fetchWithTimeout(
|
||||
url,
|
||||
{
|
||||
method: "POST",
|
||||
headers,
|
||||
body: JSON.stringify(body),
|
||||
},
|
||||
params.timeoutMs,
|
||||
fetchFn,
|
||||
);
|
||||
|
||||
if (!res.ok) {
|
||||
const detail = await readErrorResponse(res);
|
||||
const suffix = detail ? `: ${detail}` : "";
|
||||
throw new Error(`Video description failed (HTTP ${res.status})${suffix}`);
|
||||
}
|
||||
|
||||
const payload = (await res.json()) as {
|
||||
candidates?: Array<{
|
||||
content?: { parts?: Array<{ text?: string }> };
|
||||
}>;
|
||||
};
|
||||
const parts = payload.candidates?.[0]?.content?.parts ?? [];
|
||||
const text = parts
|
||||
.map((part) => part?.text?.trim())
|
||||
.filter(Boolean)
|
||||
.join("\n");
|
||||
if (!text) {
|
||||
throw new Error("Video description response missing text");
|
||||
}
|
||||
return { text, model };
|
||||
}
|
||||
13
src/media-understanding/providers/groq/index.ts
Normal file
13
src/media-understanding/providers/groq/index.ts
Normal file
@@ -0,0 +1,13 @@
|
||||
import type { MediaUnderstandingProvider } from "../../types.js";
|
||||
import { transcribeOpenAiCompatibleAudio } from "../openai/audio.js";
|
||||
|
||||
const DEFAULT_GROQ_AUDIO_BASE_URL = "https://api.groq.com/openai/v1";
|
||||
|
||||
export const groqProvider: MediaUnderstandingProvider = {
|
||||
id: "groq",
|
||||
transcribeAudio: (req) =>
|
||||
transcribeOpenAiCompatibleAudio({
|
||||
...req,
|
||||
baseUrl: req.baseUrl ?? DEFAULT_GROQ_AUDIO_BASE_URL,
|
||||
}),
|
||||
};
|
||||
35
src/media-understanding/providers/index.ts
Normal file
35
src/media-understanding/providers/index.ts
Normal file
@@ -0,0 +1,35 @@
|
||||
import { normalizeProviderId } from "../../agents/model-selection.js";
|
||||
import type { MediaUnderstandingProvider } from "../types.js";
|
||||
import { googleProvider } from "./google/index.js";
|
||||
import { groqProvider } from "./groq/index.js";
|
||||
import { openaiProvider } from "./openai/index.js";
|
||||
|
||||
const PROVIDERS: MediaUnderstandingProvider[] = [groqProvider, openaiProvider, googleProvider];
|
||||
|
||||
export function normalizeMediaProviderId(id: string): string {
|
||||
const normalized = normalizeProviderId(id);
|
||||
if (normalized === "gemini") return "google";
|
||||
return normalized;
|
||||
}
|
||||
|
||||
export function buildMediaUnderstandingRegistry(
|
||||
overrides?: Record<string, MediaUnderstandingProvider>,
|
||||
): Map<string, MediaUnderstandingProvider> {
|
||||
const registry = new Map<string, MediaUnderstandingProvider>();
|
||||
for (const provider of PROVIDERS) {
|
||||
registry.set(normalizeMediaProviderId(provider.id), provider);
|
||||
}
|
||||
if (overrides) {
|
||||
for (const [key, provider] of Object.entries(overrides)) {
|
||||
registry.set(normalizeMediaProviderId(key), provider);
|
||||
}
|
||||
}
|
||||
return registry;
|
||||
}
|
||||
|
||||
export function getMediaUnderstandingProvider(
|
||||
id: string,
|
||||
registry: Map<string, MediaUnderstandingProvider>,
|
||||
): MediaUnderstandingProvider | undefined {
|
||||
return registry.get(normalizeMediaProviderId(id));
|
||||
}
|
||||
86
src/media-understanding/providers/openai/audio.test.ts
Normal file
86
src/media-understanding/providers/openai/audio.test.ts
Normal file
@@ -0,0 +1,86 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
|
||||
import { transcribeOpenAiCompatibleAudio } from "./audio.js";
|
||||
|
||||
const resolveRequestUrl = (input: RequestInfo | URL) => {
|
||||
if (typeof input === "string") return input;
|
||||
if (input instanceof URL) return input.toString();
|
||||
return input.url;
|
||||
};
|
||||
|
||||
describe("transcribeOpenAiCompatibleAudio", () => {
|
||||
it("respects lowercase authorization header overrides", async () => {
|
||||
let seenAuth: string | null = null;
|
||||
const fetchFn = async (_input: RequestInfo | URL, init?: RequestInit) => {
|
||||
const headers = new Headers(init?.headers);
|
||||
seenAuth = headers.get("authorization");
|
||||
return new Response(JSON.stringify({ text: "ok" }), {
|
||||
status: 200,
|
||||
headers: { "content-type": "application/json" },
|
||||
});
|
||||
};
|
||||
|
||||
const result = await transcribeOpenAiCompatibleAudio({
|
||||
buffer: Buffer.from("audio"),
|
||||
fileName: "note.mp3",
|
||||
apiKey: "test-key",
|
||||
timeoutMs: 1000,
|
||||
headers: { authorization: "Bearer override" },
|
||||
fetchFn,
|
||||
});
|
||||
|
||||
expect(seenAuth).toBe("Bearer override");
|
||||
expect(result.text).toBe("ok");
|
||||
});
|
||||
|
||||
it("builds the expected request payload", async () => {
|
||||
let seenUrl: string | null = null;
|
||||
let seenInit: RequestInit | undefined;
|
||||
const fetchFn = async (input: RequestInfo | URL, init?: RequestInit) => {
|
||||
seenUrl = resolveRequestUrl(input);
|
||||
seenInit = init;
|
||||
return new Response(JSON.stringify({ text: "hello" }), {
|
||||
status: 200,
|
||||
headers: { "content-type": "application/json" },
|
||||
});
|
||||
};
|
||||
|
||||
const result = await transcribeOpenAiCompatibleAudio({
|
||||
buffer: Buffer.from("audio-bytes"),
|
||||
fileName: "voice.wav",
|
||||
apiKey: "test-key",
|
||||
timeoutMs: 1234,
|
||||
baseUrl: "https://api.example.com/v1/",
|
||||
model: " ",
|
||||
language: " en ",
|
||||
prompt: " hello ",
|
||||
mime: "audio/wav",
|
||||
headers: { "X-Custom": "1" },
|
||||
fetchFn,
|
||||
});
|
||||
|
||||
expect(result.model).toBe("whisper-1");
|
||||
expect(result.text).toBe("hello");
|
||||
expect(seenUrl).toBe("https://api.example.com/v1/audio/transcriptions");
|
||||
expect(seenInit?.method).toBe("POST");
|
||||
expect(seenInit?.signal).toBeInstanceOf(AbortSignal);
|
||||
|
||||
const headers = new Headers(seenInit?.headers);
|
||||
expect(headers.get("authorization")).toBe("Bearer test-key");
|
||||
expect(headers.get("x-custom")).toBe("1");
|
||||
|
||||
const form = seenInit?.body as FormData;
|
||||
expect(form).toBeInstanceOf(FormData);
|
||||
expect(form.get("model")).toBe("whisper-1");
|
||||
expect(form.get("language")).toBe("en");
|
||||
expect(form.get("prompt")).toBe("hello");
|
||||
const file = form.get("file") as Blob | { type?: string; name?: string } | null;
|
||||
expect(file).not.toBeNull();
|
||||
if (file) {
|
||||
expect(file.type).toBe("audio/wav");
|
||||
if ("name" in file && typeof file.name === "string") {
|
||||
expect(file.name).toBe("voice.wav");
|
||||
}
|
||||
}
|
||||
});
|
||||
});
|
||||
61
src/media-understanding/providers/openai/audio.ts
Normal file
61
src/media-understanding/providers/openai/audio.ts
Normal file
@@ -0,0 +1,61 @@
|
||||
import path from "node:path";
|
||||
|
||||
import type { AudioTranscriptionRequest, AudioTranscriptionResult } from "../../types.js";
|
||||
import { fetchWithTimeout, normalizeBaseUrl, readErrorResponse } from "../shared.js";
|
||||
|
||||
export const DEFAULT_OPENAI_AUDIO_BASE_URL = "https://api.openai.com/v1";
|
||||
const DEFAULT_OPENAI_AUDIO_MODEL = "whisper-1";
|
||||
|
||||
function resolveModel(model?: string): string {
|
||||
const trimmed = model?.trim();
|
||||
return trimmed || DEFAULT_OPENAI_AUDIO_MODEL;
|
||||
}
|
||||
|
||||
export async function transcribeOpenAiCompatibleAudio(
|
||||
params: AudioTranscriptionRequest,
|
||||
): Promise<AudioTranscriptionResult> {
|
||||
const fetchFn = params.fetchFn ?? fetch;
|
||||
const baseUrl = normalizeBaseUrl(params.baseUrl, DEFAULT_OPENAI_AUDIO_BASE_URL);
|
||||
const url = `${baseUrl}/audio/transcriptions`;
|
||||
|
||||
const model = resolveModel(params.model);
|
||||
const form = new FormData();
|
||||
const fileName = params.fileName?.trim() || path.basename(params.fileName) || "audio";
|
||||
const bytes = new Uint8Array(params.buffer);
|
||||
const blob = new Blob([bytes], {
|
||||
type: params.mime ?? "application/octet-stream",
|
||||
});
|
||||
form.append("file", blob, fileName);
|
||||
form.append("model", model);
|
||||
if (params.language?.trim()) form.append("language", params.language.trim());
|
||||
if (params.prompt?.trim()) form.append("prompt", params.prompt.trim());
|
||||
|
||||
const headers = new Headers(params.headers);
|
||||
if (!headers.has("authorization")) {
|
||||
headers.set("authorization", `Bearer ${params.apiKey}`);
|
||||
}
|
||||
|
||||
const res = await fetchWithTimeout(
|
||||
url,
|
||||
{
|
||||
method: "POST",
|
||||
headers,
|
||||
body: form,
|
||||
},
|
||||
params.timeoutMs,
|
||||
fetchFn,
|
||||
);
|
||||
|
||||
if (!res.ok) {
|
||||
const detail = await readErrorResponse(res);
|
||||
const suffix = detail ? `: ${detail}` : "";
|
||||
throw new Error(`Audio transcription failed (HTTP ${res.status})${suffix}`);
|
||||
}
|
||||
|
||||
const payload = (await res.json()) as { text?: string };
|
||||
const text = payload.text?.trim();
|
||||
if (!text) {
|
||||
throw new Error("Audio transcription response missing text");
|
||||
}
|
||||
return { text, model };
|
||||
}
|
||||
7
src/media-understanding/providers/openai/index.ts
Normal file
7
src/media-understanding/providers/openai/index.ts
Normal file
@@ -0,0 +1,7 @@
|
||||
import type { MediaUnderstandingProvider } from "../../types.js";
|
||||
import { transcribeOpenAiCompatibleAudio } from "./audio.js";
|
||||
|
||||
export const openaiProvider: MediaUnderstandingProvider = {
|
||||
id: "openai",
|
||||
transcribeAudio: transcribeOpenAiCompatibleAudio,
|
||||
};
|
||||
33
src/media-understanding/providers/shared.ts
Normal file
33
src/media-understanding/providers/shared.ts
Normal file
@@ -0,0 +1,33 @@
|
||||
const MAX_ERROR_CHARS = 300;
|
||||
|
||||
export function normalizeBaseUrl(baseUrl: string | undefined, fallback: string): string {
|
||||
const raw = baseUrl?.trim() || fallback;
|
||||
return raw.replace(/\/+$/, "");
|
||||
}
|
||||
|
||||
export async function fetchWithTimeout(
|
||||
url: string,
|
||||
init: RequestInit,
|
||||
timeoutMs: number,
|
||||
fetchFn: typeof fetch,
|
||||
): Promise<Response> {
|
||||
const controller = new AbortController();
|
||||
const timer = setTimeout(() => controller.abort(), Math.max(1, timeoutMs));
|
||||
try {
|
||||
return await fetchFn(url, { ...init, signal: controller.signal });
|
||||
} finally {
|
||||
clearTimeout(timer);
|
||||
}
|
||||
}
|
||||
|
||||
export async function readErrorResponse(res: Response): Promise<string | undefined> {
|
||||
try {
|
||||
const text = await res.text();
|
||||
const collapsed = text.replace(/\s+/g, " ").trim();
|
||||
if (!collapsed) return undefined;
|
||||
if (collapsed.length <= MAX_ERROR_CHARS) return collapsed;
|
||||
return `${collapsed.slice(0, MAX_ERROR_CHARS)}…`;
|
||||
} catch {
|
||||
return undefined;
|
||||
}
|
||||
}
|
||||
46
src/media-understanding/scope.test.ts
Normal file
46
src/media-understanding/scope.test.ts
Normal file
@@ -0,0 +1,46 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
import { resolveMediaUnderstandingScope } from "./scope.js";
|
||||
|
||||
describe("resolveMediaUnderstandingScope", () => {
|
||||
it("defaults to allow when scope is undefined", () => {
|
||||
expect(resolveMediaUnderstandingScope({})).toBe("allow");
|
||||
});
|
||||
|
||||
it("uses first matching rule", () => {
|
||||
const decision = resolveMediaUnderstandingScope({
|
||||
scope: {
|
||||
default: "deny",
|
||||
rules: [
|
||||
{ action: "allow", match: { channel: "whatsapp" } },
|
||||
{ action: "deny", match: { channel: "whatsapp", chatType: "direct" } },
|
||||
],
|
||||
},
|
||||
channel: "whatsapp",
|
||||
chatType: "direct",
|
||||
sessionKey: "whatsapp:direct:123",
|
||||
});
|
||||
expect(decision).toBe("allow");
|
||||
});
|
||||
|
||||
it("matches keyPrefix when provided", () => {
|
||||
const decision = resolveMediaUnderstandingScope({
|
||||
scope: {
|
||||
default: "deny",
|
||||
rules: [{ action: "allow", match: { keyPrefix: "agent:main:" } }],
|
||||
},
|
||||
sessionKey: "agent:main:whatsapp:group:123",
|
||||
});
|
||||
expect(decision).toBe("allow");
|
||||
});
|
||||
|
||||
it("matches keyPrefix case-insensitively", () => {
|
||||
const decision = resolveMediaUnderstandingScope({
|
||||
scope: {
|
||||
default: "deny",
|
||||
rules: [{ action: "allow", match: { keyPrefix: "agent:main:" } }],
|
||||
},
|
||||
sessionKey: "AGENT:MAIN:WHATSAPP:GROUP:123",
|
||||
});
|
||||
expect(decision).toBe("allow");
|
||||
});
|
||||
});
|
||||
54
src/media-understanding/scope.ts
Normal file
54
src/media-understanding/scope.ts
Normal file
@@ -0,0 +1,54 @@
|
||||
import type { MediaUnderstandingScopeConfig } from "../config/types.tools.js";
|
||||
|
||||
export type MediaUnderstandingScopeDecision = "allow" | "deny";
|
||||
|
||||
function normalizeDecision(value?: string | null): MediaUnderstandingScopeDecision | undefined {
|
||||
const normalized = value?.trim().toLowerCase();
|
||||
if (normalized === "allow") return "allow";
|
||||
if (normalized === "deny") return "deny";
|
||||
return undefined;
|
||||
}
|
||||
|
||||
function normalizeMatch(value?: string | null): string | undefined {
|
||||
const normalized = value?.trim().toLowerCase();
|
||||
return normalized || undefined;
|
||||
}
|
||||
|
||||
export function normalizeMediaUnderstandingChatType(raw?: string | null): string | undefined {
|
||||
const value = raw?.trim().toLowerCase();
|
||||
if (!value) return undefined;
|
||||
if (value === "dm" || value === "direct_message" || value === "private") return "direct";
|
||||
if (value === "groups") return "group";
|
||||
if (value === "channel") return "room";
|
||||
return value;
|
||||
}
|
||||
|
||||
export function resolveMediaUnderstandingScope(params: {
|
||||
scope?: MediaUnderstandingScopeConfig;
|
||||
sessionKey?: string;
|
||||
channel?: string;
|
||||
chatType?: string;
|
||||
}): MediaUnderstandingScopeDecision {
|
||||
const scope = params.scope;
|
||||
if (!scope) return "allow";
|
||||
|
||||
const channel = normalizeMatch(params.channel);
|
||||
const chatType = normalizeMatch(params.chatType);
|
||||
const sessionKey = normalizeMatch(params.sessionKey) ?? "";
|
||||
|
||||
for (const rule of scope.rules ?? []) {
|
||||
if (!rule) continue;
|
||||
const action = normalizeDecision(rule.action) ?? "allow";
|
||||
const match = rule.match ?? {};
|
||||
const matchChannel = normalizeMatch(match.channel);
|
||||
const matchChatType = normalizeMatch(match.chatType);
|
||||
const matchPrefix = normalizeMatch(match.keyPrefix);
|
||||
|
||||
if (matchChannel && matchChannel !== channel) continue;
|
||||
if (matchChatType && matchChatType !== chatType) continue;
|
||||
if (matchPrefix && !sessionKey.startsWith(matchPrefix)) continue;
|
||||
return action;
|
||||
}
|
||||
|
||||
return normalizeDecision(scope.default) ?? "allow";
|
||||
}
|
||||
62
src/media-understanding/types.ts
Normal file
62
src/media-understanding/types.ts
Normal file
@@ -0,0 +1,62 @@
|
||||
// The classes of derived text the media-understanding pipeline can produce.
export type MediaUnderstandingKind =
  | "audio.transcription"
  | "video.description"
  | "image.description";

// A single inbound media attachment. Either `path` (local file) or `url`
// (remote) may be set; `index` is the attachment's position in the message.
export type MediaAttachment = {
  path?: string;
  url?: string;
  mime?: string;
  index: number;
};

// One piece of derived text (transcript or description) for an attachment,
// tagged with the provider id (and optionally the model) that produced it.
export type MediaUnderstandingOutput = {
  kind: MediaUnderstandingKind;
  attachmentIndex: number;
  text: string;
  provider: string;
  model?: string;
};

// Request for an OpenAI-compatible audio transcription call.
// `headers` override the defaults; `fetchFn` lets tests inject a mock fetch.
export type AudioTranscriptionRequest = {
  buffer: Buffer;
  fileName: string;
  mime?: string;
  apiKey: string;
  baseUrl?: string;
  headers?: Record<string, string>;
  model?: string;
  language?: string;
  prompt?: string;
  timeoutMs: number;
  fetchFn?: typeof fetch;
};

// Transcript text plus the model that produced it (when known).
export type AudioTranscriptionResult = {
  text: string;
  model?: string;
};

// Request for a Gemini-style video description call; mirrors the audio
// request minus the language field.
export type VideoDescriptionRequest = {
  buffer: Buffer;
  fileName: string;
  mime?: string;
  apiKey: string;
  baseUrl?: string;
  headers?: Record<string, string>;
  model?: string;
  prompt?: string;
  timeoutMs: number;
  fetchFn?: typeof fetch;
};

// Description text plus the model that produced it (when known).
export type VideoDescriptionResult = {
  text: string;
  model?: string;
};

// A pluggable provider; capabilities are optional so a provider may support
// only a subset of media kinds (e.g. audio-only or video-only).
export type MediaUnderstandingProvider = {
  id: string;
  transcribeAudio?: (req: AudioTranscriptionRequest) => Promise<AudioTranscriptionResult>;
  describeVideo?: (req: VideoDescriptionRequest) => Promise<VideoDescriptionResult>;
};
|
||||
47
src/media/fetch.test.ts
Normal file
47
src/media/fetch.test.ts
Normal file
@@ -0,0 +1,47 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
|
||||
import { fetchRemoteMedia } from "./fetch.js";
|
||||
|
||||
function makeStream(chunks: Uint8Array[]) {
|
||||
return new ReadableStream<Uint8Array>({
|
||||
start(controller) {
|
||||
for (const chunk of chunks) {
|
||||
controller.enqueue(chunk);
|
||||
}
|
||||
controller.close();
|
||||
},
|
||||
});
|
||||
}
|
||||
|
||||
describe("fetchRemoteMedia", () => {
|
||||
it("rejects when content-length exceeds maxBytes", async () => {
|
||||
const fetchImpl = async () =>
|
||||
new Response(makeStream([new Uint8Array([1, 2, 3, 4, 5])]), {
|
||||
status: 200,
|
||||
headers: { "content-length": "5" },
|
||||
});
|
||||
|
||||
await expect(
|
||||
fetchRemoteMedia({
|
||||
url: "https://example.com/file.bin",
|
||||
fetchImpl,
|
||||
maxBytes: 4,
|
||||
}),
|
||||
).rejects.toThrow("exceeds maxBytes");
|
||||
});
|
||||
|
||||
it("rejects when streamed payload exceeds maxBytes", async () => {
|
||||
const fetchImpl = async () =>
|
||||
new Response(makeStream([new Uint8Array([1, 2, 3]), new Uint8Array([4, 5, 6])]), {
|
||||
status: 200,
|
||||
});
|
||||
|
||||
await expect(
|
||||
fetchRemoteMedia({
|
||||
url: "https://example.com/file.bin",
|
||||
fetchImpl,
|
||||
maxBytes: 4,
|
||||
}),
|
||||
).rejects.toThrow("exceeds maxBytes");
|
||||
});
|
||||
});
|
||||
@@ -14,6 +14,7 @@ type FetchMediaOptions = {
|
||||
url: string;
|
||||
fetchImpl?: FetchLike;
|
||||
filePathHint?: string;
|
||||
maxBytes?: number;
|
||||
};
|
||||
|
||||
function stripQuotes(value: string): string {
|
||||
@@ -51,7 +52,7 @@ async function readErrorBodySnippet(res: Response, maxChars = 200): Promise<stri
|
||||
}
|
||||
|
||||
export async function fetchRemoteMedia(options: FetchMediaOptions): Promise<FetchMediaResult> {
|
||||
const { url, fetchImpl, filePathHint } = options;
|
||||
const { url, fetchImpl, filePathHint, maxBytes } = options;
|
||||
const fetcher: FetchLike | undefined = fetchImpl ?? globalThis.fetch;
|
||||
if (!fetcher) {
|
||||
throw new Error("fetch is not available");
|
||||
@@ -77,7 +78,19 @@ export async function fetchRemoteMedia(options: FetchMediaOptions): Promise<Fetc
|
||||
throw new Error(`Failed to fetch media from ${url}${redirected}: ${detail}`);
|
||||
}
|
||||
|
||||
const buffer = Buffer.from(await res.arrayBuffer());
|
||||
const contentLength = res.headers.get("content-length");
|
||||
if (maxBytes && contentLength) {
|
||||
const length = Number(contentLength);
|
||||
if (Number.isFinite(length) && length > maxBytes) {
|
||||
throw new Error(
|
||||
`Failed to fetch media from ${url}: content length ${length} exceeds maxBytes ${maxBytes}`,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
const buffer = maxBytes
|
||||
? await readResponseWithLimit(res, maxBytes)
|
||||
: Buffer.from(await res.arrayBuffer());
|
||||
let fileNameFromUrl: string | undefined;
|
||||
try {
|
||||
const parsed = new URL(url);
|
||||
@@ -109,3 +122,47 @@ export async function fetchRemoteMedia(options: FetchMediaOptions): Promise<Fetc
|
||||
fileName,
|
||||
};
|
||||
}
|
||||
|
||||
async function readResponseWithLimit(res: Response, maxBytes: number): Promise<Buffer> {
|
||||
const body = res.body;
|
||||
if (!body || typeof body.getReader !== "function") {
|
||||
const fallback = Buffer.from(await res.arrayBuffer());
|
||||
if (fallback.length > maxBytes) {
|
||||
throw new Error(
|
||||
`Failed to fetch media from ${res.url || "response"}: payload exceeds maxBytes ${maxBytes}`,
|
||||
);
|
||||
}
|
||||
return fallback;
|
||||
}
|
||||
|
||||
const reader = body.getReader();
|
||||
const chunks: Uint8Array[] = [];
|
||||
let total = 0;
|
||||
try {
|
||||
while (true) {
|
||||
const { done, value } = await reader.read();
|
||||
if (done) break;
|
||||
if (value?.length) {
|
||||
total += value.length;
|
||||
if (total > maxBytes) {
|
||||
try {
|
||||
await reader.cancel();
|
||||
} catch {}
|
||||
throw new Error(
|
||||
`Failed to fetch media from ${res.url || "response"}: payload exceeds maxBytes ${maxBytes}`,
|
||||
);
|
||||
}
|
||||
chunks.push(value);
|
||||
}
|
||||
}
|
||||
} finally {
|
||||
try {
|
||||
reader.releaseLock();
|
||||
} catch {}
|
||||
}
|
||||
|
||||
return Buffer.concat(
|
||||
chunks.map((chunk) => Buffer.from(chunk)),
|
||||
total,
|
||||
);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user