refactor: unify media understanding pipeline

This commit is contained in:
Peter Steinberger
2026-01-17 04:38:20 +00:00
parent 49ecbd8fea
commit fcb7c9ff65
24 changed files with 1250 additions and 643 deletions

View File

@@ -255,4 +255,90 @@ describe("applyMediaUnderstanding", () => {
expect(ctx.CommandBody).toBe("show Dom");
expect(ctx.RawBody).toBe("show Dom");
});
// Shared `tools.media.models` must be used when no image-specific config
// exists; the CLI entry's {{MediaPath}} template should receive the file path.
it("uses shared media models list when capability config is missing", async () => {
const { applyMediaUnderstanding } = await loadApply();
const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-media-"));
const imagePath = path.join(dir, "shared.jpg");
await fs.writeFile(imagePath, "image-bytes");
const ctx: MsgContext = {
Body: "<media:image>",
MediaPath: imagePath,
MediaType: "image/jpeg",
};
const cfg: ClawdbotConfig = {
tools: {
media: {
models: [
{
type: "cli",
command: "gemini",
args: ["--allowed-tools", "read_file", "{{MediaPath}}"],
capabilities: ["image"],
},
],
},
},
};
const execModule = await import("../process/exec.js");
// Stub the CLI execution so no external binary is spawned.
vi.mocked(execModule.runExec).mockResolvedValue({
stdout: "shared description\n",
stderr: "",
});
const result = await applyMediaUnderstanding({
ctx,
cfg,
});
expect(result.appliedImage).toBe(true);
// Body is rewritten with the trimmed model output.
expect(ctx.Body).toBe("[Image]\nDescription:\nshared description");
});
// With attachments mode "all", every audio attachment (up to maxAttachments)
// should be transcribed and numbered in both Transcript and Body.
it("handles multiple audio attachments when attachment mode is all", async () => {
const { applyMediaUnderstanding } = await loadApply();
const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-media-"));
const audioPathA = path.join(dir, "note-a.ogg");
const audioPathB = path.join(dir, "note-b.ogg");
await fs.writeFile(audioPathA, "hello");
await fs.writeFile(audioPathB, "world");
const ctx: MsgContext = {
Body: "<media:audio>",
MediaPaths: [audioPathA, audioPathB],
MediaTypes: ["audio/ogg", "audio/ogg"],
};
const cfg: ClawdbotConfig = {
tools: {
media: {
audio: {
enabled: true,
attachments: { mode: "all", maxAttachments: 2 },
models: [{ provider: "groq" }],
},
},
},
};
const result = await applyMediaUnderstanding({
ctx,
cfg,
providers: {
groq: {
id: "groq",
// Fake provider echoes the file name so ordering is observable.
transcribeAudio: async (req) => ({ text: req.fileName }),
},
},
});
expect(result.appliedAudio).toBe(true);
expect(ctx.Transcript).toBe("Audio 1:\nnote-a.ogg\n\nAudio 2:\nnote-b.ogg");
expect(ctx.Body).toBe(
["[Audio 1/2]\nTranscript:\nnote-a.ogg", "[Audio 2/2]\nTranscript:\nnote-b.ogg"].join(
"\n\n",
),
);
});
});

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,386 @@
import crypto from "node:crypto";
import fs from "node:fs/promises";
import os from "node:os";
import path from "node:path";
import { fileURLToPath } from "node:url";
import type { MsgContext } from "../auto-reply/templating.js";
import type { MediaUnderstandingAttachmentsConfig } from "../config/types.tools.js";
import { fetchRemoteMedia, MediaFetchError } from "../media/fetch.js";
import { detectMime, getFileExtension, isAudioFileName } from "../media/mime.js";
import { logVerbose, shouldLogVerbose } from "../globals.js";
import { fetchWithTimeout } from "./providers/shared.js";
import type { MediaAttachment, MediaUnderstandingCapability } from "./types.js";
import { MediaUnderstandingSkipError } from "./errors.js";
// Result of materializing an attachment's bytes in memory.
type MediaBufferResult = {
buffer: Buffer;
mime?: string;
fileName: string;
size: number;
};
// Result of materializing an attachment on disk; cleanup removes any temp file.
type MediaPathResult = {
path: string;
cleanup?: () => Promise<void> | void;
};
// Per-attachment cache slot maintained by MediaAttachmentCache.
type AttachmentCacheEntry = {
attachment: MediaAttachment;
resolvedPath?: string;
statSize?: number;
buffer?: Buffer;
bufferMime?: string;
bufferFileName?: string;
tempPath?: string;
tempCleanup?: () => Promise<void>;
};
// Default cap on attachments processed per capability when none is configured.
const DEFAULT_MAX_ATTACHMENTS = 1;
/**
 * Normalizes a raw attachment path: trims whitespace, converts file:// URLs
 * to filesystem paths, and returns undefined for empty/unparseable input.
 */
function normalizeAttachmentPath(raw?: string | null): string | undefined {
  const trimmed = raw?.trim();
  if (!trimmed) return undefined;
  if (!trimmed.startsWith("file://")) return trimmed;
  try {
    return fileURLToPath(trimmed);
  } catch {
    // Malformed file URL — treat as if no path was supplied.
    return undefined;
  }
}
/**
 * Collects the message's media references into a uniform attachment list.
 * Precedence: MediaPaths array, then MediaUrls array, then the single
 * MediaPath/MediaUrl pair. Entries with neither a path nor a URL are dropped.
 */
export function normalizeAttachments(ctx: MsgContext): MediaAttachment[] {
  const mediaPaths = Array.isArray(ctx.MediaPaths) ? ctx.MediaPaths : undefined;
  const mediaUrls = Array.isArray(ctx.MediaUrls) ? ctx.MediaUrls : undefined;
  const mediaTypes = Array.isArray(ctx.MediaTypes) ? ctx.MediaTypes : undefined;
  // Per-index mime hint; falls back to the single MediaType only when there
  // is exactly one attachment (otherwise the hint would be ambiguous).
  const mimeAt = (total: number, index: number) => {
    const hint = mediaTypes?.[index];
    const trimmedHint = typeof hint === "string" ? hint.trim() : "";
    if (trimmedHint) return trimmedHint;
    return total === 1 ? ctx.MediaType : undefined;
  };
  if (mediaPaths && mediaPaths.length > 0) {
    const total = mediaPaths.length;
    const fallbackUrls = mediaUrls && mediaUrls.length > 0 ? mediaUrls : undefined;
    const mapped = mediaPaths.map((value, index) => ({
      path: value?.trim() || undefined,
      url: fallbackUrls?.[index] ?? ctx.MediaUrl,
      mime: mimeAt(total, index),
      index,
    }));
    return mapped.filter((item) => Boolean(item.path?.trim() || item.url?.trim()));
  }
  if (mediaUrls && mediaUrls.length > 0) {
    const total = mediaUrls.length;
    const mapped = mediaUrls.map((value, index) => ({
      path: undefined,
      url: value?.trim() || undefined,
      mime: mimeAt(total, index),
      index,
    }));
    return mapped.filter((item) => Boolean(item.url?.trim()));
  }
  const singlePath = ctx.MediaPath?.trim();
  const singleUrl = ctx.MediaUrl?.trim();
  if (!singlePath && !singleUrl) return [];
  return [
    {
      path: singlePath || undefined,
      url: singleUrl || undefined,
      mime: ctx.MediaType,
      index: 0,
    },
  ];
}
/** True when the attachment's mime or file extension marks it as video. */
export function isVideoAttachment(attachment: MediaAttachment): boolean {
  if (attachment.mime?.startsWith("video/")) return true;
  const extension = getFileExtension(attachment.path ?? attachment.url);
  if (!extension) return false;
  return [".mp4", ".mov", ".mkv", ".webm", ".avi", ".m4v"].includes(extension);
}
/** True when the attachment's mime or file name marks it as audio. */
export function isAudioAttachment(attachment: MediaAttachment): boolean {
  const mime = attachment.mime;
  if (mime && mime.startsWith("audio/")) return true;
  return isAudioFileName(attachment.path ?? attachment.url);
}
/** True when the attachment's mime or file extension marks it as an image. */
export function isImageAttachment(attachment: MediaAttachment): boolean {
  if (attachment.mime?.startsWith("image/")) return true;
  const extension = getFileExtension(attachment.path ?? attachment.url);
  if (!extension) return false;
  return [".png", ".jpg", ".jpeg", ".webp", ".gif", ".bmp", ".tiff", ".tif"].includes(extension);
}
/**
 * Detects an abort/timeout failure from fetch-style APIs.
 * Aborted fetches reject with a DOMException named "AbortError"; a
 * DOMException is not guaranteed to be `instanceof Error` in every runtime,
 * so check the `name` on any object instead of requiring an Error subclass.
 * Strictly broader than the old `instanceof Error` check — everything that
 * matched before still matches.
 */
function isAbortError(err: unknown): boolean {
  if (!err || typeof err !== "object") return false;
  return (err as { name?: unknown }).name === "AbortError";
}
/** Extracts the URL string from any fetch input (string, URL, or Request). */
function resolveRequestUrl(input: RequestInfo | URL): string {
  if (input instanceof URL) return input.toString();
  return typeof input === "string" ? input : input.url;
}
/**
 * Orders attachments according to the configured preference:
 * "first" (default) keeps order, "last" reverses, "path"/"url" move entries
 * that have that field to the front (stable within each group).
 */
function orderAttachments(
  attachments: MediaAttachment[],
  prefer?: MediaUnderstandingAttachmentsConfig["prefer"],
): MediaAttachment[] {
  switch (prefer) {
    case "last":
      return [...attachments].reverse();
    case "path": {
      const having = attachments.filter((item) => item.path);
      const lacking = attachments.filter((item) => !item.path);
      return [...having, ...lacking];
    }
    case "url": {
      const having = attachments.filter((item) => item.url);
      const lacking = attachments.filter((item) => !item.url);
      return [...having, ...lacking];
    }
    default:
      // undefined, "first", or any unknown value: keep original order.
      return attachments;
  }
}
/**
 * Picks the attachments a capability should process: filter to the matching
 * media kind, apply the ordering preference, then take either the first one
 * (default) or up to maxAttachments when mode is "all".
 */
export function selectAttachments(params: {
  capability: MediaUnderstandingCapability;
  attachments: MediaAttachment[];
  policy?: MediaUnderstandingAttachmentsConfig;
}): MediaAttachment[] {
  const { capability, attachments, policy } = params;
  const predicate =
    capability === "image"
      ? isImageAttachment
      : capability === "audio"
        ? isAudioAttachment
        : isVideoAttachment;
  const matching = attachments.filter(predicate);
  if (matching.length === 0) return [];
  const ordered = orderAttachments(matching, policy?.prefer);
  if ((policy?.mode ?? "first") !== "all") {
    return ordered.slice(0, 1);
  }
  // "all" still honors maxAttachments, never dropping below one.
  const cap = Math.max(1, policy?.maxAttachments ?? DEFAULT_MAX_ATTACHMENTS);
  return ordered.slice(0, cap);
}
/**
 * Resolves media attachments to in-memory buffers or on-disk paths, caching
 * per attachment index so repeated capability runs reuse the work (local
 * stat, remote fetch, temp-file spill). Temp files created here are removed
 * via cleanup().
 */
export class MediaAttachmentCache {
private readonly entries = new Map<number, AttachmentCacheEntry>();
private readonly attachments: MediaAttachment[];
constructor(attachments: MediaAttachment[]) {
this.attachments = attachments;
for (const attachment of attachments) {
this.entries.set(attachment.index, { attachment });
}
}
/**
 * Returns the attachment's bytes, preferring (in order) a cached buffer, a
 * readable local file, then a remote fetch of the URL.
 * Throws MediaUnderstandingSkipError with reason "maxBytes" (too large),
 * "timeout" (fetch aborted), or "empty" (no path/URL); other errors rethrow.
 */
async getBuffer(params: {
attachmentIndex: number;
maxBytes: number;
timeoutMs: number;
}): Promise<MediaBufferResult> {
const entry = await this.ensureEntry(params.attachmentIndex);
// Cached bytes: still enforce this call's maxBytes, which may differ.
if (entry.buffer) {
if (entry.buffer.length > params.maxBytes) {
throw new MediaUnderstandingSkipError(
"maxBytes",
`Attachment ${params.attachmentIndex + 1} exceeds maxBytes ${params.maxBytes}`,
);
}
return {
buffer: entry.buffer,
mime: entry.bufferMime,
fileName: entry.bufferFileName ?? `media-${params.attachmentIndex + 1}`,
size: entry.buffer.length,
};
}
if (entry.resolvedPath) {
const size = await this.ensureLocalStat(entry);
// ensureLocalStat clears resolvedPath when the file is missing or not a
// regular file; re-check so we fall through to the URL branch below.
if (entry.resolvedPath) {
if (size !== undefined && size > params.maxBytes) {
throw new MediaUnderstandingSkipError(
"maxBytes",
`Attachment ${params.attachmentIndex + 1} exceeds maxBytes ${params.maxBytes}`,
);
}
const buffer = await fs.readFile(entry.resolvedPath);
entry.buffer = buffer;
// Mime priority: previously detected > message hint > sniffed from bytes.
entry.bufferMime =
entry.bufferMime ??
entry.attachment.mime ??
(await detectMime({
buffer,
filePath: entry.resolvedPath,
}));
entry.bufferFileName =
path.basename(entry.resolvedPath) || `media-${params.attachmentIndex + 1}`;
return {
buffer,
mime: entry.bufferMime,
fileName: entry.bufferFileName,
size: buffer.length,
};
}
}
const url = entry.attachment.url?.trim();
if (!url) {
throw new MediaUnderstandingSkipError(
"empty",
`Attachment ${params.attachmentIndex + 1} has no path or URL.`,
);
}
try {
// Wrap fetch with the per-call timeout before handing it to the fetcher.
const fetchImpl = (input: RequestInfo | URL, init?: RequestInit) =>
fetchWithTimeout(resolveRequestUrl(input), init ?? {}, params.timeoutMs, fetch);
const fetched = await fetchRemoteMedia({ url, fetchImpl, maxBytes: params.maxBytes });
entry.buffer = fetched.buffer;
// Mime priority: message hint > HTTP content type > sniffed from bytes.
entry.bufferMime =
entry.attachment.mime ??
fetched.contentType ??
(await detectMime({
buffer: fetched.buffer,
filePath: fetched.fileName ?? url,
}));
entry.bufferFileName = fetched.fileName ?? `media-${params.attachmentIndex + 1}`;
return {
buffer: fetched.buffer,
mime: entry.bufferMime,
fileName: entry.bufferFileName,
size: fetched.buffer.length,
};
} catch (err) {
// Translate known fetch failures into skip errors; rethrow everything else.
if (err instanceof MediaFetchError && err.code === "max_bytes") {
throw new MediaUnderstandingSkipError(
"maxBytes",
`Attachment ${params.attachmentIndex + 1} exceeds maxBytes ${params.maxBytes}`,
);
}
if (isAbortError(err)) {
throw new MediaUnderstandingSkipError(
"timeout",
`Attachment ${params.attachmentIndex + 1} timed out while fetching.`,
);
}
throw err;
}
}
/**
 * Returns a filesystem path for the attachment: the original local path when
 * present, a previously spilled temp file, or a fresh temp file written from
 * the (possibly fetched) buffer. The returned cleanup, when set, deletes the
 * temp file; original local paths return no cleanup.
 */
async getPath(params: {
attachmentIndex: number;
maxBytes?: number;
timeoutMs: number;
}): Promise<MediaPathResult> {
const entry = await this.ensureEntry(params.attachmentIndex);
if (entry.resolvedPath) {
if (params.maxBytes) {
const size = await this.ensureLocalStat(entry);
// ensureLocalStat may have cleared resolvedPath; guard before using size.
if (entry.resolvedPath) {
if (size !== undefined && size > params.maxBytes) {
throw new MediaUnderstandingSkipError(
"maxBytes",
`Attachment ${params.attachmentIndex + 1} exceeds maxBytes ${params.maxBytes}`,
);
}
}
}
if (entry.resolvedPath) {
return { path: entry.resolvedPath };
}
}
if (entry.tempPath) {
// Re-check size caps against the cached buffer backing the temp file.
if (params.maxBytes && entry.buffer && entry.buffer.length > params.maxBytes) {
throw new MediaUnderstandingSkipError(
"maxBytes",
`Attachment ${params.attachmentIndex + 1} exceeds maxBytes ${params.maxBytes}`,
);
}
return { path: entry.tempPath, cleanup: entry.tempCleanup };
}
const maxBytes = params.maxBytes ?? Number.POSITIVE_INFINITY;
const bufferResult = await this.getBuffer({
attachmentIndex: params.attachmentIndex,
maxBytes,
timeoutMs: params.timeoutMs,
});
// Preserve the extension so downstream tools can sniff the media type.
const extension = path.extname(bufferResult.fileName || "") || "";
const tmpPath = path.join(
os.tmpdir(),
`clawdbot-media-${crypto.randomUUID()}${extension}`,
);
await fs.writeFile(tmpPath, bufferResult.buffer);
entry.tempPath = tmpPath;
entry.tempCleanup = async () => {
await fs.unlink(tmpPath).catch(() => {});
};
return { path: tmpPath, cleanup: entry.tempCleanup };
}
/** Deletes all temp files created by getPath; safe to call more than once. */
async cleanup(): Promise<void> {
const cleanups: Array<Promise<void> | void> = [];
for (const entry of this.entries.values()) {
if (entry.tempCleanup) {
cleanups.push(Promise.resolve(entry.tempCleanup()));
entry.tempCleanup = undefined;
}
}
await Promise.all(cleanups);
}
// Returns the cache slot for an index, creating one (and resolving the local
// path) for indices not seen at construction time.
private async ensureEntry(attachmentIndex: number): Promise<AttachmentCacheEntry> {
const existing = this.entries.get(attachmentIndex);
if (existing) {
if (!existing.resolvedPath) {
existing.resolvedPath = this.resolveLocalPath(existing.attachment);
}
return existing;
}
const attachment =
this.attachments.find((item) => item.index === attachmentIndex) ?? { index: attachmentIndex };
const entry: AttachmentCacheEntry = {
attachment,
resolvedPath: this.resolveLocalPath(attachment),
};
this.entries.set(attachmentIndex, entry);
return entry;
}
// Normalizes the attachment path (file:// URLs included) to an absolute path.
private resolveLocalPath(attachment: MediaAttachment): string | undefined {
const rawPath = normalizeAttachmentPath(attachment.path);
if (!rawPath) return undefined;
return path.isAbsolute(rawPath) ? rawPath : path.resolve(rawPath);
}
// Stats the resolved path once and caches the size. On failure (missing file,
// directory, permission error) clears resolvedPath so callers fall back to
// the attachment URL.
private async ensureLocalStat(entry: AttachmentCacheEntry): Promise<number | undefined> {
if (!entry.resolvedPath) return undefined;
if (entry.statSize !== undefined) return entry.statSize;
try {
const stat = await fs.stat(entry.resolvedPath);
if (!stat.isFile()) {
entry.resolvedPath = undefined;
return undefined;
}
entry.statSize = stat.size;
return stat.size;
} catch (err) {
entry.resolvedPath = undefined;
if (shouldLogVerbose()) {
logVerbose(`Failed to read attachment ${entry.attachment.index + 1}: ${String(err)}`);
}
return undefined;
}
}
}

View File

@@ -0,0 +1,29 @@
import { logVerbose, shouldLogVerbose } from "../globals.js";
/**
 * Runs the given tasks with at most `limit` in flight at once, preserving
 * input order in the results. A failed task logs (when verbose) and leaves
 * its result slot undefined; failures never reject the returned promise.
 */
export async function runWithConcurrency<T>(
  tasks: Array<() => Promise<T>>,
  limit: number,
): Promise<T[]> {
  if (tasks.length === 0) return [];
  const workerCount = Math.max(1, Math.min(limit, tasks.length));
  const results: T[] = Array.from({ length: tasks.length });
  let cursor = 0;
  // Each worker repeatedly claims the next unclaimed index until exhausted.
  const worker = async (): Promise<void> => {
    for (;;) {
      const index = cursor;
      cursor += 1;
      if (index >= tasks.length) return;
      try {
        results[index] = await tasks[index]();
      } catch (err) {
        if (shouldLogVerbose()) {
          logVerbose(`Media understanding task failed: ${String(err)}`);
        }
      }
    }
  };
  await Promise.allSettled(Array.from({ length: workerCount }, () => worker()));
  return results;
}

View File

@@ -0,0 +1,35 @@
import type { MediaUnderstandingCapability } from "./types.js";
const MB = 1024 * 1024;
// Default character budget appended to description prompts.
export const DEFAULT_MAX_CHARS = 500;
// Per-capability character budget; audio transcripts are never truncated.
export const DEFAULT_MAX_CHARS_BY_CAPABILITY: Record<
MediaUnderstandingCapability,
number | undefined
> = {
image: DEFAULT_MAX_CHARS,
audio: undefined,
video: DEFAULT_MAX_CHARS,
};
// Default size caps for attachment payloads, per capability.
export const DEFAULT_MAX_BYTES: Record<MediaUnderstandingCapability, number> = {
image: 10 * MB,
audio: 20 * MB,
video: 50 * MB,
};
// Default per-request timeouts; video gets longer because of upload size.
export const DEFAULT_TIMEOUT_SECONDS: Record<MediaUnderstandingCapability, number> = {
image: 60,
audio: 60,
video: 120,
};
// Prompts used when no custom prompt is configured.
export const DEFAULT_PROMPT: Record<MediaUnderstandingCapability, string> = {
image: "Describe the image.",
audio: "Transcribe the audio.",
video: "Describe the video.",
};
// Hard ceiling on base64-encoded video payload size.
export const DEFAULT_VIDEO_MAX_BASE64_BYTES = 70 * MB;
// Cap on captured stdout/stderr from CLI model invocations.
export const CLI_OUTPUT_MAX_BUFFER = 5 * MB;
// Default number of media tasks processed in parallel.
export const DEFAULT_MEDIA_CONCURRENCY = 2;

View File

@@ -0,0 +1,17 @@
/** Reasons a media-understanding step is skipped rather than failed. */
export type MediaUnderstandingSkipReason = "maxBytes" | "timeout" | "unsupported" | "empty";

/**
 * Signals that processing of one attachment should be skipped (size cap,
 * timeout, unsupported media, or nothing to process) instead of aborting
 * the whole pipeline.
 */
export class MediaUnderstandingSkipError extends Error {
  constructor(
    readonly reason: MediaUnderstandingSkipReason,
    message: string,
  ) {
    super(message);
    this.name = "MediaUnderstandingSkipError";
  }
}

/** Type guard for MediaUnderstandingSkipError. */
export function isMediaUnderstandingSkipError(
  err: unknown,
): err is MediaUnderstandingSkipError {
  return err instanceof MediaUnderstandingSkipError;
}

View File

@@ -12,7 +12,7 @@ export function extractMediaUserText(body?: string): string | undefined {
}
function formatSection(
title: "Audio" | "Video" | "Image",
title: string,
kind: "Transcript" | "Description",
text: string,
userText?: string,
@@ -40,11 +40,21 @@ export function formatMediaUnderstandingBody(params: {
sections.push(`User text:\n${userText}`);
}
const counts = new Map<MediaUnderstandingOutput["kind"], number>();
for (const output of outputs) {
counts.set(output.kind, (counts.get(output.kind) ?? 0) + 1);
}
const seen = new Map<MediaUnderstandingOutput["kind"], number>();
for (const output of outputs) {
const count = counts.get(output.kind) ?? 1;
const next = (seen.get(output.kind) ?? 0) + 1;
seen.set(output.kind, next);
const suffix = count > 1 ? ` ${next}/${count}` : "";
if (output.kind === "audio.transcription") {
sections.push(
formatSection(
"Audio",
`Audio${suffix}`,
"Transcript",
output.text,
outputs.length === 1 ? userText : undefined,
@@ -55,7 +65,7 @@ export function formatMediaUnderstandingBody(params: {
if (output.kind === "image.description") {
sections.push(
formatSection(
"Image",
`Image${suffix}`,
"Description",
output.text,
outputs.length === 1 ? userText : undefined,
@@ -65,7 +75,7 @@ export function formatMediaUnderstandingBody(params: {
}
sections.push(
formatSection(
"Video",
`Video${suffix}`,
"Description",
output.text,
outputs.length === 1 ? userText : undefined,
@@ -75,3 +85,10 @@ export function formatMediaUnderstandingBody(params: {
return sections.join("\n\n").trim();
}
/**
 * Joins audio transcripts into one string. A single transcript is returned
 * as-is; multiple transcripts are numbered "Audio N:" and separated by a
 * blank line.
 */
export function formatAudioTranscripts(outputs: MediaUnderstandingOutput[]): string {
  if (outputs.length === 1) return outputs[0].text;
  const sections = outputs.map((output, index) => `Audio ${index + 1}:\n${output.text}`);
  return sections.join("\n\n");
}

View File

@@ -0,0 +1,7 @@
import type { MediaUnderstandingProvider } from "../../types.js";
import { describeImageWithModel } from "../image.js";
// Anthropic media provider: image description only, via the shared
// model-based describe helper.
export const anthropicProvider: MediaUnderstandingProvider = {
id: "anthropic",
describeImage: describeImageWithModel,
};

View File

@@ -1,7 +1,9 @@
import type { MediaUnderstandingProvider } from "../../types.js";
import { describeImageWithModel } from "../image.js";
import { describeGeminiVideo } from "./video.js";
// Google media provider: image description via the shared helper plus a
// Gemini-specific video description path.
export const googleProvider: MediaUnderstandingProvider = {
id: "google",
describeImage: describeImageWithModel,
describeVideo: describeGeminiVideo,
};

View File

@@ -0,0 +1,66 @@
import type { Api, AssistantMessage, Context, Model } from "@mariozechner/pi-ai";
import { complete } from "@mariozechner/pi-ai";
import { discoverAuthStorage, discoverModels } from "@mariozechner/pi-coding-agent";
import { getApiKeyForModel } from "../../agents/model-auth.js";
import { ensureClawdbotModelsJson } from "../../agents/models-config.js";
import { minimaxUnderstandImage } from "../../agents/minimax-vlm.js";
import { coerceImageAssistantText } from "../../agents/tools/image-tool.helpers.js";
import type { ImageDescriptionRequest, ImageDescriptionResult } from "../types.js";
/**
 * Describes an image using a model resolved from the local model registry.
 * Flow: ensure the models config exists, discover auth + models, validate the
 * model accepts image input, resolve an API key, then either call the
 * MiniMax VLM endpoint (special-cased) or send a standard image+prompt
 * completion.
 * @throws Error when the model is unknown or does not support image input.
 */
export async function describeImageWithModel(
params: ImageDescriptionRequest,
): Promise<ImageDescriptionResult> {
// Make sure the models registry file is in place before discovery runs.
await ensureClawdbotModelsJson(params.cfg, params.agentDir);
const authStorage = discoverAuthStorage(params.agentDir);
const modelRegistry = discoverModels(authStorage, params.agentDir);
const model = modelRegistry.find(params.provider, params.model) as Model<Api> | null;
if (!model) {
throw new Error(`Unknown model: ${params.provider}/${params.model}`);
}
if (!model.input?.includes("image")) {
throw new Error(`Model does not support images: ${params.provider}/${params.model}`);
}
const apiKeyInfo = await getApiKeyForModel({
model,
cfg: params.cfg,
agentDir: params.agentDir,
profileId: params.profile,
preferredProfile: params.preferredProfile,
});
// Inject the resolved key so downstream calls authenticate with it.
authStorage.setRuntimeApiKey(model.provider, apiKeyInfo.apiKey);
const base64 = params.buffer.toString("base64");
// MiniMax uses its own VLM endpoint rather than the generic completion API.
if (model.provider === "minimax") {
const text = await minimaxUnderstandImage({
apiKey: apiKeyInfo.apiKey,
prompt: params.prompt ?? "Describe the image.",
imageDataUrl: `data:${params.mime ?? "image/jpeg"};base64,${base64}`,
modelBaseUrl: model.baseUrl,
});
return { text, model: model.id };
}
const context: Context = {
messages: [
{
role: "user",
content: [
{ type: "text", text: params.prompt ?? "Describe the image." },
{ type: "image", data: base64, mimeType: params.mime ?? "image/jpeg" },
],
timestamp: Date.now(),
},
],
};
const message = (await complete(model, context, {
apiKey: apiKeyInfo.apiKey,
maxTokens: params.maxTokens ?? 512,
})) as AssistantMessage;
// Normalize the assistant message into plain description text.
const text = coerceImageAssistantText({
message,
provider: model.provider,
model: model.id,
});
return { text, model: model.id };
}

View File

@@ -1,10 +1,18 @@
import { normalizeProviderId } from "../../agents/model-selection.js";
import type { MediaUnderstandingProvider } from "../types.js";
import { anthropicProvider } from "./anthropic/index.js";
import { googleProvider } from "./google/index.js";
import { groqProvider } from "./groq/index.js";
import { minimaxProvider } from "./minimax/index.js";
import { openaiProvider } from "./openai/index.js";
const PROVIDERS: MediaUnderstandingProvider[] = [groqProvider, openaiProvider, googleProvider];
// Built-in media-understanding providers, looked up by normalized id.
const PROVIDERS: MediaUnderstandingProvider[] = [
groqProvider,
openaiProvider,
googleProvider,
anthropicProvider,
minimaxProvider,
];
export function normalizeMediaProviderId(id: string): string {
const normalized = normalizeProviderId(id);

View File

@@ -0,0 +1,7 @@
import type { MediaUnderstandingProvider } from "../../types.js";
import { describeImageWithModel } from "../image.js";
// MiniMax media provider: image description only, via the shared
// model-based describe helper (which routes MiniMax to its VLM endpoint).
export const minimaxProvider: MediaUnderstandingProvider = {
id: "minimax",
describeImage: describeImageWithModel,
};

View File

@@ -1,7 +1,9 @@
import type { MediaUnderstandingProvider } from "../../types.js";
import { describeImageWithModel } from "../image.js";
import { transcribeOpenAiCompatibleAudio } from "./audio.js";
// OpenAI media provider: image description via the shared helper and audio
// transcription via the OpenAI-compatible audio endpoint.
export const openaiProvider: MediaUnderstandingProvider = {
id: "openai",
describeImage: describeImageWithModel,
transcribeAudio: transcribeOpenAiCompatibleAudio,
};

View File

@@ -0,0 +1,154 @@
import type { ClawdbotConfig } from "../config/config.js";
import type { MsgContext } from "../auto-reply/templating.js";
import type {
MediaUnderstandingConfig,
MediaUnderstandingModelConfig,
MediaUnderstandingScopeConfig,
} from "../config/types.tools.js";
import { logVerbose, shouldLogVerbose } from "../globals.js";
import {
DEFAULT_MAX_BYTES,
DEFAULT_MAX_CHARS_BY_CAPABILITY,
DEFAULT_MEDIA_CONCURRENCY,
DEFAULT_PROMPT,
} from "./defaults.js";
import { normalizeMediaProviderId } from "./providers/index.js";
import { normalizeMediaUnderstandingChatType, resolveMediaUnderstandingScope } from "./scope.js";
import type { MediaUnderstandingCapability } from "./types.js";
/**
 * Converts a configured timeout in seconds to milliseconds, substituting the
 * fallback for missing/non-finite values and flooring the result at 1000 ms.
 */
export function resolveTimeoutMs(seconds: number | undefined, fallbackSeconds: number): number {
  const effective =
    typeof seconds === "number" && Number.isFinite(seconds) ? seconds : fallbackSeconds;
  const ms = Math.floor(effective * 1000);
  return ms < 1000 ? 1000 : ms;
}
/**
 * Builds the prompt for a capability: the configured prompt (trimmed) or the
 * capability default, with a character-budget instruction appended for
 * image/video when maxChars is set. Audio prompts are never length-capped.
 */
export function resolvePrompt(
  capability: MediaUnderstandingCapability,
  prompt?: string,
  maxChars?: number,
): string {
  const trimmedPrompt = prompt?.trim();
  const base = trimmedPrompt || DEFAULT_PROMPT[capability];
  if (capability === "audio" || !maxChars) return base;
  return `${base} Respond in at most ${maxChars} characters.`;
}
/**
 * Resolves the description character budget for a capability.
 * Precedence: model entry > capability config passed in > capability config
 * on cfg > per-capability default (undefined for audio).
 */
export function resolveMaxChars(params: {
  capability: MediaUnderstandingCapability;
  entry: MediaUnderstandingModelConfig;
  cfg: ClawdbotConfig;
  config?: MediaUnderstandingConfig;
}): number | undefined {
  const { capability, entry, cfg, config } = params;
  const configured = entry.maxChars ?? config?.maxChars ?? cfg.tools?.media?.[capability]?.maxChars;
  return typeof configured === "number" ? configured : DEFAULT_MAX_CHARS_BY_CAPABILITY[capability];
}
/**
 * Resolves the attachment size cap (bytes) for a capability.
 * Precedence: model entry > capability config passed in > capability config
 * on cfg > per-capability default.
 */
export function resolveMaxBytes(params: {
  capability: MediaUnderstandingCapability;
  entry: MediaUnderstandingModelConfig;
  cfg: ClawdbotConfig;
  config?: MediaUnderstandingConfig;
}): number {
  const { capability, entry, cfg, config } = params;
  const configured = entry.maxBytes ?? config?.maxBytes ?? cfg.tools?.media?.[capability]?.maxBytes;
  if (typeof configured === "number") return configured;
  return DEFAULT_MAX_BYTES[capability];
}
/** Looks up the capability-specific config block under tools.media, if any. */
export function resolveCapabilityConfig(
  cfg: ClawdbotConfig,
  capability: MediaUnderstandingCapability,
): MediaUnderstandingConfig | undefined {
  const media = cfg.tools?.media;
  return media ? media[capability] : undefined;
}
// Decides whether media understanding may run for this message, based on the
// configured scope and the message's session/channel/chat-type. Delegates the
// actual policy evaluation to resolveMediaUnderstandingScope.
export function resolveScopeDecision(params: {
scope?: MediaUnderstandingScopeConfig;
ctx: MsgContext;
}): "allow" | "deny" {
return resolveMediaUnderstandingScope({
scope: params.scope,
sessionKey: params.ctx.SessionKey,
// Surface is preferred over Provider as the channel identifier.
channel: params.ctx.Surface ?? params.ctx.Provider,
chatType: normalizeMediaUnderstandingChatType(params.ctx.ChatType),
});
}
/**
 * Infers which capabilities a shared model entry supports when none are
 * declared: CLI entries are assumed fully capable; provider entries map to
 * known provider abilities; unknown providers yield undefined.
 */
function inferCapabilities(
  entry: MediaUnderstandingModelConfig,
): MediaUnderstandingCapability[] | undefined {
  const kind = entry.type ?? (entry.command ? "cli" : "provider");
  if (kind === "cli") return ["image", "audio", "video"];
  const provider = normalizeMediaProviderId(entry.provider ?? "");
  if (!provider) return undefined;
  switch (provider) {
    case "openai":
    case "anthropic":
    case "minimax":
      return ["image"];
    case "google":
      return ["image", "audio", "video"];
    case "groq":
      return ["audio"];
    default:
      return undefined;
  }
}
/**
 * Collects the model entries eligible for a capability: capability-specific
 * models first, then shared `tools.media.models`.
 * Entries with explicit capabilities are filtered on them. Shared entries
 * without capabilities fall back to inferCapabilities(); if still unknown
 * they are skipped (logged when verbose). Capability-specific entries without
 * capabilities are always kept.
 */
export function resolveModelEntries(params: {
cfg: ClawdbotConfig;
capability: MediaUnderstandingCapability;
config?: MediaUnderstandingConfig;
}): MediaUnderstandingModelConfig[] {
const { cfg, capability, config } = params;
const sharedModels = cfg.tools?.media?.models ?? [];
// Tag each entry with its source so shared-only fallback rules apply below.
const entries = [
...(config?.models ?? []).map((entry) => ({ entry, source: "capability" as const })),
...sharedModels.map((entry) => ({ entry, source: "shared" as const })),
];
if (entries.length === 0) return [];
return entries
.filter(({ entry, source }) => {
const caps =
entry.capabilities && entry.capabilities.length > 0
? entry.capabilities
: source === "shared"
? inferCapabilities(entry)
: undefined;
if (!caps || caps.length === 0) {
// Shared entries with unknown capabilities are dropped; entries from
// the capability-specific list implicitly support that capability.
if (source === "shared") {
if (shouldLogVerbose()) {
logVerbose(
`Skipping shared media model without capabilities: ${entry.provider ?? entry.command ?? "unknown"}`,
);
}
return false;
}
return true;
}
return caps.includes(capability);
})
.map(({ entry }) => entry);
}
/**
 * Resolves the media task concurrency: the configured positive finite number
 * (floored to an integer), otherwise the default.
 */
export function resolveConcurrency(cfg: ClawdbotConfig): number {
  const requested = cfg.tools?.media?.concurrency;
  if (typeof requested === "number" && Number.isFinite(requested) && requested > 0) {
    return Math.floor(requested);
  }
  return DEFAULT_MEDIA_CONCURRENCY;
}
/**
 * Determines whether a media capability is active: an explicit
 * `enabled: false` always wins; otherwise the capability is on exactly when
 * at least one model is configured (capability-specific or shared).
 * (Same behavior as before; the trailing `if (!x) return false; return true;`
 * is collapsed into a direct boolean return.)
 */
export function resolveCapabilityEnabled(params: {
  cfg: ClawdbotConfig;
  config?: MediaUnderstandingConfig;
}): boolean {
  if (params.config?.enabled === false) return false;
  const capabilityModelCount = params.config?.models?.length ?? 0;
  const sharedModelCount = params.cfg.tools?.media?.models?.length ?? 0;
  return capabilityModelCount > 0 || sharedModelCount > 0;
}

View File

@@ -3,6 +3,8 @@ export type MediaUnderstandingKind =
| "video.description"
| "image.description";
export type MediaUnderstandingCapability = "image" | "audio" | "video";
export type MediaAttachment = {
path?: string;
url?: string;
@@ -55,8 +57,29 @@ export type VideoDescriptionResult = {
model?: string;
};
// Request to describe one image attachment with a provider-resolved model.
export type ImageDescriptionRequest = {
buffer: Buffer;
fileName: string;
mime?: string;
model: string;
provider: string;
prompt?: string;
maxTokens?: number;
timeoutMs: number;
profile?: string;
preferredProfile?: string;
agentDir: string;
cfg: import("../config/config.js").ClawdbotConfig;
};
// Description text plus the id of the model that produced it (when known).
export type ImageDescriptionResult = {
text: string;
model?: string;
};
// A media-understanding backend; each capability method is optional, and a
// missing method means the provider does not support that capability.
export type MediaUnderstandingProvider = {
id: string;
transcribeAudio?: (req: AudioTranscriptionRequest) => Promise<AudioTranscriptionResult>;
describeVideo?: (req: VideoDescriptionRequest) => Promise<VideoDescriptionResult>;
describeImage?: (req: ImageDescriptionRequest) => Promise<ImageDescriptionResult>;
};

View File

@@ -0,0 +1,10 @@
import { DEFAULT_VIDEO_MAX_BASE64_BYTES } from "./defaults.js";
/**
 * Size in characters of the base64 encoding of `bytes` raw bytes: every
 * group of up to 3 bytes becomes 4 output characters (with padding).
 */
export function estimateBase64Size(bytes: number): number {
  const groups = Math.ceil(bytes / 3);
  return groups * 4;
}
export function resolveVideoMaxBase64Bytes(maxBytes: number): number {
const expanded = Math.floor(maxBytes * (4 / 3));
return Math.min(expanded, DEFAULT_VIDEO_MAX_BASE64_BYTES);
}