diff --git a/docs/gateway/configuration.md b/docs/gateway/configuration.md index 3e2b55d6c..076c30704 100644 --- a/docs/gateway/configuration.md +++ b/docs/gateway/configuration.md @@ -1770,13 +1770,16 @@ Legacy: `tools.bash` is still accepted as an alias. - `tools.web.fetch.firecrawl.timeoutSeconds` (optional) `tools.media` configures inbound media understanding (image/audio/video): +- `tools.media.models`: shared model list (capability-tagged; used after per-cap lists). +- `tools.media.concurrency`: max concurrent capability runs (default 2). - `tools.media.image` / `tools.media.audio` / `tools.media.video`: - - `enabled`: opt-out switch (default true). + - `enabled`: opt-out switch (default true when models are configured). - `prompt`: optional prompt override (image/video append a `maxChars` hint automatically). - `maxChars`: max output characters (default 500 for image/video; unset for audio). - `maxBytes`: max media size to send (defaults: image 10MB, audio 20MB, video 50MB). - `timeoutSeconds`: request timeout (defaults: image 60s, audio 60s, video 120s). - `language`: optional audio hint. + - `attachments`: attachment policy (`mode`, `maxAttachments`, `prefer`). - `scope`: optional gating (first match wins) with `match.channel`, `match.chatType`, or `match.keyPrefix`. - `models`: ordered list of model entries; failures or oversize media fall back to the next entry. - Each `models[]` entry: @@ -1787,7 +1790,7 @@ Legacy: `tools.bash` is still accepted as an alias. - CLI entry (`type: "cli"`): - `command`: executable to run. - `args`: templated args (supports `{{MediaPath}}`, `{{Prompt}}`, `{{MaxChars}}`, etc). - - `capabilities`: optional list (`image`, `audio`, `video`) to gate a shared entry. + - `capabilities`: optional list (`image`, `audio`, `video`) to gate a shared entry. Defaults when omitted: `openai`/`anthropic`/`minimax` → image, `google` → image+audio+video, `groq` → audio. - `prompt`, `maxChars`, `maxBytes`, `timeoutSeconds`, `language` can be overridden per entry. If no models are configured (or `enabled: false`), understanding is skipped; the model still receives the original attachments. @@ -2900,7 +2903,7 @@ clawdbot dns setup --apply ## Template variables -Template placeholders are expanded in `tools.media.*.models[].args` (and any future templated argument fields). +Template placeholders are expanded in `tools.media.*.models[].args` and `tools.media.models[].args` (and any future templated argument fields). | Variable | Description | |----------|-------------| diff --git a/docs/nodes/audio.md b/docs/nodes/audio.md index b6019b26e..ba68b35e9 100644 --- a/docs/nodes/audio.md +++ b/docs/nodes/audio.md @@ -6,7 +6,7 @@ read_when: # Audio / Voice Notes — 2026-01-17 ## What works -- **Media understanding (audio)**: If `tools.media.audio` is enabled and has `models`, Clawdbot: +- **Media understanding (audio)**: If `tools.media.audio` is enabled (or a shared `tools.media.models` entry supports audio), Clawdbot: 1) Locates the first audio attachment (local path or URL) and downloads it if needed. 2) Enforces `maxBytes` before sending to each model entry. 3) Runs the first eligible model entry in order (provider or CLI). @@ -66,6 +66,7 @@ read_when: - Provider auth follows the standard model auth order (auth profiles, env vars, `models.providers.*.apiKey`). - Default size cap is 20MB (`tools.media.audio.maxBytes`). Oversize audio is skipped for that model and the next entry is tried. - Default `maxChars` for audio is **unset** (full transcript). 
Set `tools.media.audio.maxChars` or per-entry `maxChars` to trim output.
+- Use `tools.media.audio.attachments` to process multiple voice notes (`mode: "all"` + `maxAttachments`).
 - Transcript is available to templates as `{{Transcript}}`.
 - CLI stdout is capped (5MB); keep CLI output concise.
diff --git a/docs/nodes/images.md b/docs/nodes/images.md
index 4d163e535..bb9188738 100644
--- a/docs/nodes/images.md
+++ b/docs/nodes/images.md
@@ -38,10 +38,10 @@ The WhatsApp channel runs via **Baileys Web**. This document captures the curren
 - `{{MediaUrl}}` pseudo-URL for the inbound media.
 - `{{MediaPath}}` local temp path written before running the command.
 - When a per-session Docker sandbox is enabled, inbound media is copied into the sandbox workspace and `MediaPath`/`MediaUrl` are rewritten to a relative path like `media/inbound/`.
-- Media understanding (if configured via `tools.media.*`) runs before templating and can insert `[Image]`, `[Audio]`, and `[Video]` blocks into `Body`.
+- Media understanding (if configured via `tools.media.*` or shared `tools.media.models`) runs before templating and can insert `[Image]`, `[Audio]`, and `[Video]` blocks into `Body`.
   - Audio sets `{{Transcript}}` and uses the transcript for command parsing so slash commands still work.
   - Video and image descriptions preserve any caption text for command parsing.
-- Only the first matching image/audio/video attachment is processed; remaining attachments are left untouched.
+- By default only the first matching image/audio/video attachment is processed; set `tools.media.<capability>.attachments` to process multiple attachments.
 
 ## Limits & Errors
 **Outbound send caps (WhatsApp web send)**
diff --git a/docs/nodes/media-understanding.md b/docs/nodes/media-understanding.md
index fe8e69a52..12f4c2317 100644
--- a/docs/nodes/media-understanding.md
+++ b/docs/nodes/media-understanding.md
@@ -16,7 +16,7 @@ Clawdbot can optionally **summarize inbound media** (image/audio/video) before t
 ## High‑level behavior
 1) Collect inbound attachments (`MediaPaths`, `MediaUrls`, `MediaTypes`).
-2) For each enabled capability (image/audio/video), pick the **first matching attachment**.
+2) For each enabled capability (image/audio/video), select attachments per policy (default: **first**).
 3) Choose the first eligible model entry (size + capability + auth).
 4) If a model fails or the media is too large, **fall back to the next entry**.
 5) On success:
@@ -27,18 +27,23 @@ Clawdbot can optionally **summarize inbound media** (image/audio/video) before t
 If understanding fails or is disabled, **the reply flow continues** with the original body + attachments.
 
 ## Config overview
-Use **per‑capability configs** under `tools.media`. Each capability can define:
-- defaults (`prompt`, `maxChars`, `maxBytes`, `timeoutSeconds`, `language`)
-- **ordered `models` list** (fallback order)
-- `scope` (optional gating by channel/chatType/session key)
+`tools.media` supports **shared models** plus per‑capability overrides:
+- `tools.media.models`: shared model list (use `capabilities` to gate).
+- `tools.media.image` / `tools.media.audio` / `tools.media.video`:
+  - defaults (`prompt`, `maxChars`, `maxBytes`, `timeoutSeconds`, `language`)
+  - optional **per‑capability `models` list** (preferred before shared models)
+  - `attachments` policy (`mode`, `maxAttachments`, `prefer`)
+  - `scope` (optional gating by channel/chatType/session key)
+- `tools.media.concurrency`: max concurrent capability runs (default **2**).
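+
+For example, a minimal sketch of a shared multi-modal entry plus an image-only override (model ids are illustrative, borrowed from the examples below; per‑capability `models` are tried before the shared list):
+
+```json5
+{
+  tools: {
+    media: {
+      concurrency: 2,
+      models: [
+        { provider: "google", model: "gemini-3-flash-preview", capabilities: ["image", "audio", "video"] }
+      ],
+      image: {
+        models: [{ provider: "openai", model: "gpt-5.2" }]
+      }
+    }
+  }
+}
+```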
```json5 { tools: { media: { - image: { /* config */ }, - audio: { /* config */ }, - video: { /* config */ } + models: [ /* shared list */ ], + image: { /* optional overrides */ }, + audio: { /* optional overrides */ }, + video: { /* optional overrides */ } } } } @@ -95,12 +100,13 @@ Rules: - `prompt` defaults to simple “Describe the {media}.” plus the `maxChars` guidance (image/video only). ## Capabilities (optional) -If you set `capabilities`, the entry only runs for those media types. Suggested -defaults when you opt in: -- `openai`, `anthropic`: **image** +If you set `capabilities`, the entry only runs for those media types. For shared +lists, Clawdbot can infer defaults: +- `openai`, `anthropic`, `minimax`: **image** - `google` (Gemini API): **image + audio + video** -- CLI entries: declare the exact capabilities you support. +- `groq`: **audio** +For CLI entries, **set `capabilities` explicitly** to avoid surprising matches. If you omit `capabilities`, the entry is eligible for the list it appears in. ## Provider support matrix (Clawdbot integrations) @@ -123,9 +129,49 @@ If you omit `capabilities`, the entry is eligible for the list it appears in. - `google/gemini-3-flash-preview` (fast), `google/gemini-3-pro-preview` (richer). - CLI fallback: `gemini` CLI (supports `read_file` on video/audio). +## Attachment policy +Per‑capability `attachments` controls which attachments are processed: +- `mode`: `first` (default) or `all` +- `maxAttachments`: cap the number processed (default **1**) +- `prefer`: `first`, `last`, `path`, `url` + +When `mode: "all"`, outputs are labeled `[Image 1/2]`, `[Audio 2/2]`, etc. + ## Config examples -### 1) Audio + Video only (image off) +### 1) Shared models list + overrides +```json5 +{ + tools: { + media: { + models: [ + { provider: "openai", model: "gpt-5.2", capabilities: ["image"] }, + { provider: "google", model: "gemini-3-flash-preview", capabilities: ["image", "audio", "video"] }, + { + type: "cli", + command: "gemini", + args: [ + "-m", + "gemini-3-flash", + "--allowed-tools", + "read_file", + "Read the media at {{MediaPath}} and describe it in <= {{MaxChars}} characters." + ], + capabilities: ["image", "video"] + } + ], + audio: { + attachments: { mode: "all", maxAttachments: 2 } + }, + video: { + maxChars: 500 + } + } + } +} +``` + +### 2) Audio + Video only (image off) ```json5 { tools: { @@ -164,7 +210,7 @@ If you omit `capabilities`, the entry is eligible for the list it appears in. } ``` -### 2) Optional image understanding +### 3) Optional image understanding ```json5 { tools: { @@ -194,7 +240,7 @@ If you omit `capabilities`, the entry is eligible for the list it appears in. 
 }
 ```
 
-### 3) Multi‑modal single entry (explicit capabilities)
+### 4) Multi‑modal single entry (explicit capabilities)
 ```json5
 {
   tools: {
diff --git a/src/config/schema.ts b/src/config/schema.ts
index a7a9b60a6..60bb21961 100644
--- a/src/config/schema.ts
+++ b/src/config/schema.ts
@@ -107,14 +107,18 @@ const FIELD_LABELS: Record<string, string> = {
   "tools.media.image.maxChars": "Image Understanding Max Chars",
   "tools.media.image.prompt": "Image Understanding Prompt",
   "tools.media.image.timeoutSeconds": "Image Understanding Timeout (sec)",
+  "tools.media.image.attachments": "Image Understanding Attachment Policy",
   "tools.media.image.models": "Image Understanding Models",
   "tools.media.image.scope": "Image Understanding Scope",
+  "tools.media.models": "Media Understanding Shared Models",
+  "tools.media.concurrency": "Media Understanding Concurrency",
   "tools.media.audio.enabled": "Enable Audio Understanding",
   "tools.media.audio.maxBytes": "Audio Understanding Max Bytes",
   "tools.media.audio.maxChars": "Audio Understanding Max Chars",
   "tools.media.audio.prompt": "Audio Understanding Prompt",
   "tools.media.audio.timeoutSeconds": "Audio Understanding Timeout (sec)",
   "tools.media.audio.language": "Audio Understanding Language",
+  "tools.media.audio.attachments": "Audio Understanding Attachment Policy",
   "tools.media.audio.models": "Audio Understanding Models",
   "tools.media.audio.scope": "Audio Understanding Scope",
   "tools.media.video.enabled": "Enable Video Understanding",
@@ -122,6 +126,7 @@ const FIELD_LABELS: Record<string, string> = {
   "tools.media.video.maxChars": "Video Understanding Max Chars",
   "tools.media.video.prompt": "Video Understanding Prompt",
   "tools.media.video.timeoutSeconds": "Video Understanding Timeout (sec)",
+  "tools.media.video.attachments": "Video Understanding Attachment Policy",
   "tools.media.video.models": "Video Understanding Models",
   "tools.media.video.scope": "Video Understanding Scope",
   "tools.profile": "Tool Profile",
diff --git a/src/config/types.tools.ts b/src/config/types.tools.ts
index f749c1814..f3e9736e7 100644
--- a/src/config/types.tools.ts
+++ b/src/config/types.tools.ts
@@ -18,6 +18,15 @@ export type MediaUnderstandingScopeConfig = {
 
 export type MediaUnderstandingCapability = "image" | "audio" | "video";
 
+export type MediaUnderstandingAttachmentsConfig = {
+  /** Select the first matching attachment or process multiple. */
+  mode?: "first" | "all";
+  /** Max number of attachments to process (default: 1). */
+  maxAttachments?: number;
+  /** Attachment ordering preference. */
+  prefer?: "first" | "last" | "path" | "url";
+};
+
 export type MediaUnderstandingModelConfig = {
   /** provider API id (e.g. openai, google). */
   provider?: string;
@@ -62,11 +71,17 @@ export type MediaUnderstandingConfig = {
   timeoutSeconds?: number;
   /** Default language hint (audio). */
   language?: string;
+  /** Attachment selection policy. */
+  attachments?: MediaUnderstandingAttachmentsConfig;
   /** Ordered model list (fallbacks in order). */
   models?: MediaUnderstandingModelConfig[];
 };
 
 export type MediaToolsConfig = {
+  /** Shared model list applied across image/audio/video. */
+  models?: MediaUnderstandingModelConfig[];
+  /** Max concurrent media understanding runs.
*/ + concurrency?: number; image?: MediaUnderstandingConfig; audio?: MediaUnderstandingConfig; video?: MediaUnderstandingConfig; diff --git a/src/config/zod-schema.core.ts b/src/config/zod-schema.core.ts index 1af60b7a5..c8479ef37 100644 --- a/src/config/zod-schema.core.ts +++ b/src/config/zod-schema.core.ts @@ -271,6 +271,14 @@ export const MediaUnderstandingCapabilitiesSchema = z .array(z.union([z.literal("image"), z.literal("audio"), z.literal("video")])) .optional(); +export const MediaUnderstandingAttachmentsSchema = z + .object({ + mode: z.union([z.literal("first"), z.literal("all")]).optional(), + maxAttachments: z.number().int().positive().optional(), + prefer: z.union([z.literal("first"), z.literal("last"), z.literal("path"), z.literal("url")]).optional(), + }) + .optional(); + export const MediaUnderstandingModelSchema = z .object({ provider: z.string().optional(), @@ -298,12 +306,15 @@ export const ToolsMediaUnderstandingSchema = z prompt: z.string().optional(), timeoutSeconds: z.number().int().positive().optional(), language: z.string().optional(), + attachments: MediaUnderstandingAttachmentsSchema, models: z.array(MediaUnderstandingModelSchema).optional(), }) .optional(); export const ToolsMediaSchema = z .object({ + models: z.array(MediaUnderstandingModelSchema).optional(), + concurrency: z.number().int().positive().optional(), image: ToolsMediaUnderstandingSchema.optional(), audio: ToolsMediaUnderstandingSchema.optional(), video: ToolsMediaUnderstandingSchema.optional(), diff --git a/src/media-understanding/apply.test.ts b/src/media-understanding/apply.test.ts index f52685e6f..b753cac31 100644 --- a/src/media-understanding/apply.test.ts +++ b/src/media-understanding/apply.test.ts @@ -255,4 +255,90 @@ describe("applyMediaUnderstanding", () => { expect(ctx.CommandBody).toBe("show Dom"); expect(ctx.RawBody).toBe("show Dom"); }); + + it("uses shared media models list when capability config is missing", async () => { + const { applyMediaUnderstanding } = await loadApply(); + const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-media-")); + const imagePath = path.join(dir, "shared.jpg"); + await fs.writeFile(imagePath, "image-bytes"); + + const ctx: MsgContext = { + Body: "", + MediaPath: imagePath, + MediaType: "image/jpeg", + }; + const cfg: ClawdbotConfig = { + tools: { + media: { + models: [ + { + type: "cli", + command: "gemini", + args: ["--allowed-tools", "read_file", "{{MediaPath}}"], + capabilities: ["image"], + }, + ], + }, + }, + }; + + const execModule = await import("../process/exec.js"); + vi.mocked(execModule.runExec).mockResolvedValue({ + stdout: "shared description\n", + stderr: "", + }); + + const result = await applyMediaUnderstanding({ + ctx, + cfg, + }); + + expect(result.appliedImage).toBe(true); + expect(ctx.Body).toBe("[Image]\nDescription:\nshared description"); + }); + + it("handles multiple audio attachments when attachment mode is all", async () => { + const { applyMediaUnderstanding } = await loadApply(); + const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-media-")); + const audioPathA = path.join(dir, "note-a.ogg"); + const audioPathB = path.join(dir, "note-b.ogg"); + await fs.writeFile(audioPathA, "hello"); + await fs.writeFile(audioPathB, "world"); + + const ctx: MsgContext = { + Body: "", + MediaPaths: [audioPathA, audioPathB], + MediaTypes: ["audio/ogg", "audio/ogg"], + }; + const cfg: ClawdbotConfig = { + tools: { + media: { + audio: { + enabled: true, + attachments: { mode: "all", maxAttachments: 2 }, + models: [{ provider: 
"groq" }], + }, + }, + }, + }; + + const result = await applyMediaUnderstanding({ + ctx, + cfg, + providers: { + groq: { + id: "groq", + transcribeAudio: async (req) => ({ text: req.fileName }), + }, + }, + }); + + expect(result.appliedAudio).toBe(true); + expect(ctx.Transcript).toBe("Audio 1:\nnote-a.ogg\n\nAudio 2:\nnote-b.ogg"); + expect(ctx.Body).toBe( + ["[Audio 1/2]\nTranscript:\nnote-a.ogg", "[Audio 2/2]\nTranscript:\nnote-b.ogg"].join( + "\n\n", + ), + ); + }); }); diff --git a/src/media-understanding/apply.ts b/src/media-understanding/apply.ts index 558b76f57..1b05348a6 100644 --- a/src/media-understanding/apply.ts +++ b/src/media-understanding/apply.ts @@ -1,71 +1,53 @@ -import crypto from "node:crypto"; -import fs from "node:fs/promises"; -import os from "node:os"; -import path from "node:path"; -import { fileURLToPath } from "node:url"; - -import type { Api, AssistantMessage, Context, Model } from "@mariozechner/pi-ai"; -import { complete } from "@mariozechner/pi-ai"; -import { discoverAuthStorage, discoverModels } from "@mariozechner/pi-coding-agent"; - import type { ClawdbotConfig } from "../config/config.js"; import type { MsgContext } from "../auto-reply/templating.js"; import { applyTemplate } from "../auto-reply/templating.js"; -import { getApiKeyForModel, resolveApiKeyForProvider } from "../agents/model-auth.js"; -import { ensureClawdbotModelsJson } from "../agents/models-config.js"; -import { minimaxUnderstandImage } from "../agents/minimax-vlm.js"; +import { resolveApiKeyForProvider } from "../agents/model-auth.js"; import { logVerbose, shouldLogVerbose } from "../globals.js"; -import { fetchRemoteMedia } from "../media/fetch.js"; -import { detectMime, getFileExtension, isAudioFileName } from "../media/mime.js"; import { runExec } from "../process/exec.js"; import type { MediaUnderstandingConfig, MediaUnderstandingModelConfig, - MediaUnderstandingScopeConfig, } from "../config/types.tools.js"; -import { extractMediaUserText, formatMediaUnderstandingBody } from "./format.js"; +import { + MediaAttachmentCache, + normalizeAttachments, + selectAttachments, +} from "./attachments.js"; +import { + CLI_OUTPUT_MAX_BUFFER, + DEFAULT_AUDIO_MODELS, + DEFAULT_TIMEOUT_SECONDS, +} from "./defaults.js"; +import { isMediaUnderstandingSkipError, MediaUnderstandingSkipError } from "./errors.js"; +import { + extractMediaUserText, + formatAudioTranscripts, + formatMediaUnderstandingBody, +} from "./format.js"; import { buildMediaUnderstandingRegistry, getMediaUnderstandingProvider, normalizeMediaProviderId, } from "./providers/index.js"; -import { fetchWithTimeout } from "./providers/shared.js"; -import { normalizeMediaUnderstandingChatType, resolveMediaUnderstandingScope } from "./scope.js"; +import { describeImageWithModel } from "./providers/image.js"; +import { + resolveCapabilityConfig, + resolveCapabilityEnabled, + resolveConcurrency, + resolveMaxBytes, + resolveMaxChars, + resolveModelEntries, + resolvePrompt, + resolveScopeDecision, + resolveTimeoutMs, +} from "./resolve.js"; import type { - MediaAttachment, + MediaUnderstandingCapability, MediaUnderstandingOutput, MediaUnderstandingProvider, } from "./types.js"; -import { coerceImageAssistantText } from "../agents/tools/image-tool.helpers.js"; - -const MB = 1024 * 1024; -const DEFAULT_MAX_CHARS = 500; -const DEFAULT_MAX_CHARS_BY_CAPABILITY: Record = { - image: DEFAULT_MAX_CHARS, - audio: undefined, - video: DEFAULT_MAX_CHARS, -}; -const DEFAULT_MAX_BYTES: Record = { - image: 10 * MB, - audio: 20 * MB, - video: 50 * MB, -}; -const 
DEFAULT_TIMEOUT_SECONDS: Record = { - image: 60, - audio: 60, - video: 120, -}; -const DEFAULT_PROMPT: Record = { - image: "Describe the image.", - audio: "Transcribe the audio.", - video: "Describe the video.", -}; -const DEFAULT_VIDEO_MAX_BASE64_BYTES = 70 * MB; -const DEFAULT_AUDIO_MODELS: Record = { - groq: "whisper-large-v3-turbo", - openai: "whisper-1", -}; -const CLI_OUTPUT_MAX_BUFFER = 5 * MB; +import { runWithConcurrency } from "./concurrency.js"; +import { estimateBase64Size, resolveVideoMaxBase64Bytes } from "./video.js"; export type ApplyMediaUnderstandingResult = { outputs: MediaUnderstandingOutput[]; @@ -74,147 +56,7 @@ export type ApplyMediaUnderstandingResult = { appliedVideo: boolean; }; -type Capability = "image" | "audio" | "video"; - -type MediaBufferResult = { - buffer: Buffer; - mime?: string; - fileName: string; -}; - -type MediaPathResult = { - path: string; - cleanup?: () => Promise | void; -}; - -function normalizeAttachmentPath(raw?: string | null): string | undefined { - const value = raw?.trim(); - if (!value) return undefined; - if (value.startsWith("file://")) { - try { - return fileURLToPath(value); - } catch { - return undefined; - } - } - return value; -} - -function normalizeAttachments(ctx: MsgContext): MediaAttachment[] { - const pathsFromArray = Array.isArray(ctx.MediaPaths) ? ctx.MediaPaths : undefined; - const urlsFromArray = Array.isArray(ctx.MediaUrls) ? ctx.MediaUrls : undefined; - const typesFromArray = Array.isArray(ctx.MediaTypes) ? ctx.MediaTypes : undefined; - const resolveMime = (count: number, index: number) => { - const typeHint = typesFromArray?.[index]; - const trimmed = typeof typeHint === "string" ? typeHint.trim() : ""; - if (trimmed) return trimmed; - return count === 1 ? ctx.MediaType : undefined; - }; - - if (pathsFromArray && pathsFromArray.length > 0) { - const count = pathsFromArray.length; - const urls = urlsFromArray && urlsFromArray.length > 0 ? urlsFromArray : undefined; - return pathsFromArray - .map((value, index) => ({ - path: value?.trim() || undefined, - url: urls?.[index] ?? ctx.MediaUrl, - mime: resolveMime(count, index), - index, - })) - .filter((entry) => Boolean(entry.path?.trim() || entry.url?.trim())); - } - - if (urlsFromArray && urlsFromArray.length > 0) { - const count = urlsFromArray.length; - return urlsFromArray - .map((value, index) => ({ - path: undefined, - url: value?.trim() || undefined, - mime: resolveMime(count, index), - index, - })) - .filter((entry) => Boolean(entry.url?.trim())); - } - - const pathValue = ctx.MediaPath?.trim(); - const url = ctx.MediaUrl?.trim(); - if (!pathValue && !url) return []; - return [ - { - path: pathValue || undefined, - url: url || undefined, - mime: ctx.MediaType, - index: 0, - }, - ]; -} - -function isVideoAttachment(attachment: MediaAttachment): boolean { - if (attachment.mime?.startsWith("video/")) return true; - const ext = getFileExtension(attachment.path ?? attachment.url); - if (!ext) return false; - return [".mp4", ".mov", ".mkv", ".webm", ".avi", ".m4v"].includes(ext); -} - -function isAudioAttachment(attachment: MediaAttachment): boolean { - if (attachment.mime?.startsWith("audio/")) return true; - return isAudioFileName(attachment.path ?? attachment.url); -} - -function isImageAttachment(attachment: MediaAttachment): boolean { - if (attachment.mime?.startsWith("image/")) return true; - const ext = getFileExtension(attachment.path ?? 
attachment.url); - if (!ext) return false; - return [".png", ".jpg", ".jpeg", ".webp", ".gif", ".bmp", ".tiff", ".tif"].includes(ext); -} - -function estimateBase64Size(bytes: number): number { - return Math.ceil(bytes / 3) * 4; -} - -function resolveVideoMaxBase64Bytes(maxBytes: number): number { - const expanded = Math.floor(maxBytes * (4 / 3)); - return Math.min(expanded, DEFAULT_VIDEO_MAX_BASE64_BYTES); -} - -function resolveTimeoutMs(seconds: number | undefined, fallbackSeconds: number): number { - const value = typeof seconds === "number" && Number.isFinite(seconds) ? seconds : fallbackSeconds; - return Math.max(1000, Math.floor(value * 1000)); -} - -function resolvePrompt(capability: Capability, prompt?: string, maxChars?: number): string { - const base = prompt?.trim() || DEFAULT_PROMPT[capability]; - if (!maxChars || capability === "audio") return base; - return `${base} Respond in at most ${maxChars} characters.`; -} - -function resolveRequestUrl(input: RequestInfo | URL): string { - if (typeof input === "string") return input; - if (input instanceof URL) return input.toString(); - return input.url; -} - -function normalizeErrorMessage(err: unknown): string { - if (!err) return ""; - if (typeof err === "string") return err; - if (err instanceof Error) return err.message; - try { - return JSON.stringify(err); - } catch { - return ""; - } -} - -function resolveMaxChars(params: { - capability: Capability; - entry: MediaUnderstandingModelConfig; - cfg: ClawdbotConfig; -}): number | undefined { - const { capability, entry, cfg } = params; - const configured = entry.maxChars ?? cfg.tools?.media?.[capability]?.maxChars; - if (typeof configured === "number") return configured; - return DEFAULT_MAX_CHARS_BY_CAPABILITY[capability]; -} +const CAPABILITY_ORDER: MediaUnderstandingCapability[] = ["image", "audio", "video"]; function trimOutput(text: string, maxChars?: number): string { const trimmed = text.trim(); @@ -222,272 +64,32 @@ function trimOutput(text: string, maxChars?: number): string { return trimmed.slice(0, maxChars).trim(); } -function resolveConfigValue(primary: T | undefined, fallback: T): T { - return primary === undefined ? fallback : primary; -} - -function resolveCapabilityConfig( - cfg: ClawdbotConfig, - capability: Capability, -): MediaUnderstandingConfig | undefined { - return cfg.tools?.media?.[capability]; -} - -function resolveScopeDecision(params: { - scope?: MediaUnderstandingScopeConfig; - ctx: MsgContext; -}): "allow" | "deny" { - return resolveMediaUnderstandingScope({ - scope: params.scope, - sessionKey: params.ctx.SessionKey, - channel: params.ctx.Surface ?? params.ctx.Provider, - chatType: normalizeMediaUnderstandingChatType(params.ctx.ChatType), - }); -} - -function resolveModelEntries( - cfg: MediaUnderstandingConfig | undefined, - capability: Capability, -): MediaUnderstandingModelConfig[] { - const models = cfg?.models ?? 
[]; - if (models.length === 0) return []; - return models.filter((entry) => { - const caps = entry.capabilities; - if (!caps || caps.length === 0) return true; - return caps.includes(capability); - }); -} - -function isMaxBytesError(err: unknown): boolean { - const message = normalizeErrorMessage(err); - if (!message) return false; - return message.includes("exceeds maxBytes") || message.includes("payload exceeds maxBytes"); -} - -async function loadAttachmentBuffer(params: { - attachment: MediaAttachment; - maxBytes: number; - timeoutMs: number; -}): Promise { - const { attachment, maxBytes, timeoutMs } = params; - const rawPath = normalizeAttachmentPath(attachment.path); - if (rawPath) { - const resolved = path.isAbsolute(rawPath) ? rawPath : path.resolve(rawPath); - try { - const stat = await fs.stat(resolved); - if (!stat.isFile()) return undefined; - if (stat.size > maxBytes) { - if (shouldLogVerbose()) { - logVerbose( - `Skipping media attachment ${attachment.index + 1}: ${stat.size} bytes exceeds ${maxBytes}`, - ); - } - return undefined; - } - const buffer = await fs.readFile(resolved); - const mime = - attachment.mime ?? - (await detectMime({ - buffer, - filePath: resolved, - })); - const fileName = path.basename(resolved) || `media-${attachment.index + 1}`; - return { buffer, mime, fileName }; - } catch (err) { - if (shouldLogVerbose()) { - logVerbose(`Failed to read attachment ${attachment.index + 1}: ${String(err)}`); - } - } - } - - const url = attachment.url?.trim(); - if (!url) return undefined; - - try { - const fetchImpl = (input: RequestInfo | URL, init?: RequestInit) => - fetchWithTimeout(resolveRequestUrl(input), init ?? {}, timeoutMs, fetch); - const fetched = await fetchRemoteMedia({ url, fetchImpl, maxBytes }); - if (fetched.buffer.length > maxBytes) { - if (shouldLogVerbose()) { - logVerbose( - `Skipping media attachment ${attachment.index + 1}: ${fetched.buffer.length} bytes exceeds ${maxBytes}`, - ); - } - return undefined; - } - const mime = - attachment.mime ?? - fetched.contentType ?? - (await detectMime({ - buffer: fetched.buffer, - filePath: fetched.fileName ?? url, - })); - const fileName = fetched.fileName ?? `media-${attachment.index + 1}`; - return { buffer: fetched.buffer, mime, fileName }; - } catch (err) { - if (shouldLogVerbose()) { - logVerbose(`Failed to fetch attachment ${attachment.index + 1}: ${String(err)}`); - } - } - - return undefined; -} - -async function resolveAttachmentPath(params: { - attachment: MediaAttachment; - maxBytes?: number; - timeoutMs: number; -}): Promise { - const { attachment, maxBytes, timeoutMs } = params; - const rawPath = normalizeAttachmentPath(attachment.path); - if (rawPath) { - const resolved = path.isAbsolute(rawPath) ? rawPath : path.resolve(rawPath); - try { - const stat = await fs.stat(resolved); - if (!stat.isFile()) return undefined; - if (maxBytes && stat.size > maxBytes) { - if (shouldLogVerbose()) { - logVerbose( - `Skipping media attachment ${attachment.index + 1}: ${stat.size} bytes exceeds ${maxBytes}`, - ); - } - return undefined; - } - return { path: resolved }; - } catch (err) { - if (shouldLogVerbose()) { - logVerbose(`Failed to read attachment ${attachment.index + 1}: ${String(err)}`); - } - } - } - - const url = attachment.url?.trim(); - if (!url) return undefined; - - try { - const fetchImpl = (input: RequestInfo | URL, init?: RequestInit) => - fetchWithTimeout(resolveRequestUrl(input), init ?? 
{}, timeoutMs, fetch);
-    const fetched = await fetchRemoteMedia({ url, fetchImpl, maxBytes });
-    const buffer = fetched.buffer;
-    if (maxBytes && buffer.length > maxBytes) {
-      if (shouldLogVerbose()) {
-        logVerbose(
-          `Skipping media attachment ${attachment.index + 1}: ${buffer.length} bytes exceeds ${maxBytes}`,
-        );
-      }
-      return undefined;
-    }
-    const extension = fetched.fileName ? path.extname(fetched.fileName) : "";
-    const tmpPath = path.join(
-      os.tmpdir(),
-      `clawdbot-media-${crypto.randomUUID()}${extension || ""}`,
-    );
-    await fs.writeFile(tmpPath, buffer);
-    return {
-      path: tmpPath,
-      cleanup: async () => {
-        await fs.unlink(tmpPath).catch(() => {});
-      },
-    };
-  } catch (err) {
-    if (shouldLogVerbose()) {
-      logVerbose(`Failed to fetch attachment ${attachment.index + 1}: ${String(err)}`);
-    }
-  }
-
-  return undefined;
-}
-
-async function describeImageWithModel(params: {
-  cfg: ClawdbotConfig;
-  agentDir: string;
-  provider: string;
-  model: string;
-  prompt: string;
-  maxChars?: number;
-  buffer: Buffer;
-  mimeType: string;
-  profile?: string;
-  preferredProfile?: string;
-}): Promise<{ text: string; model: string }> {
-  await ensureClawdbotModelsJson(params.cfg, params.agentDir);
-  const authStorage = discoverAuthStorage(params.agentDir);
-  const modelRegistry = discoverModels(authStorage, params.agentDir);
-  const model = modelRegistry.find(params.provider, params.model) as Model | null;
-  if (!model) {
-    throw new Error(`Unknown model: ${params.provider}/${params.model}`);
-  }
-  if (!model.input?.includes("image")) {
-    throw new Error(`Model does not support images: ${params.provider}/${params.model}`);
-  }
-  const apiKeyInfo = await getApiKeyForModel({
-    model,
-    cfg: params.cfg,
-    agentDir: params.agentDir,
-    profileId: params.profile,
-    preferredProfile: params.preferredProfile,
-  });
-  authStorage.setRuntimeApiKey(model.provider, apiKeyInfo.apiKey);
-
-  const base64 = params.buffer.toString("base64");
-  if (model.provider === "minimax") {
-    const text = await minimaxUnderstandImage({
-      apiKey: apiKeyInfo.apiKey,
-      prompt: params.prompt,
-      imageDataUrl: `data:${params.mimeType};base64,${base64}`,
-      modelBaseUrl: model.baseUrl,
-    });
-    return { text, model: model.id };
-  }
-
-  const context: Context = {
-    messages: [
-      {
-        role: "user",
-        content: [
-          { type: "text", text: params.prompt },
-          { type: "image", data: base64, mimeType: params.mimeType },
-        ],
-        timestamp: Date.now(),
-      },
-    ],
-  };
-  const message = (await complete(model, context, {
-    apiKey: apiKeyInfo.apiKey,
-    maxTokens: 512,
-  })) as AssistantMessage;
-  const text = coerceImageAssistantText({
-    message,
-    provider: model.provider,
-    model: model.id,
-  });
-  return { text, model: model.id };
-}
-
 async function runProviderEntry(params: {
-  capability: Capability;
+  capability: MediaUnderstandingCapability;
   entry: MediaUnderstandingModelConfig;
   cfg: ClawdbotConfig;
   ctx: MsgContext;
-  attachment: MediaAttachment;
+  attachmentIndex: number;
+  cache: MediaAttachmentCache;
   agentDir?: string;
   providerRegistry: Map<string, MediaUnderstandingProvider>;
+  config?: MediaUnderstandingConfig;
 }): Promise<MediaUnderstandingOutput | null> {
-  const { entry, capability, cfg, attachment } = params;
+  const { entry, capability, cfg } = params;
   const providerIdRaw = entry.provider?.trim();
   if (!providerIdRaw) {
     throw new Error(`Provider entry missing provider for ${capability}`);
   }
   const providerId = normalizeMediaProviderId(providerIdRaw);
-  const maxBytes = entry.maxBytes ??
resolveConfigValue(cfg.tools?.media?.[capability]?.maxBytes, DEFAULT_MAX_BYTES[capability]); - const maxChars = resolveMaxChars({ capability, entry, cfg }); + const maxBytes = resolveMaxBytes({ capability, entry, cfg, config: params.config }); + const maxChars = resolveMaxChars({ capability, entry, cfg, config: params.config }); const timeoutMs = resolveTimeoutMs( - entry.timeoutSeconds ?? cfg.tools?.media?.[capability]?.timeoutSeconds, + entry.timeoutSeconds ?? params.config?.timeoutSeconds ?? cfg.tools?.media?.[capability]?.timeoutSeconds, DEFAULT_TIMEOUT_SECONDS[capability], ); const prompt = resolvePrompt( capability, - entry.prompt ?? cfg.tools?.media?.[capability]?.prompt, + entry.prompt ?? params.config?.prompt ?? cfg.tools?.media?.[capability]?.prompt, maxChars, ); @@ -499,27 +101,45 @@ async function runProviderEntry(params: { if (!modelId) { throw new Error("Image understanding requires model id"); } - const media = await loadAttachmentBuffer({ attachment, maxBytes, timeoutMs }); - if (!media) return null; - const mimeType = media.mime ?? "image/jpeg"; - const result = await describeImageWithModel({ - cfg, - agentDir: params.agentDir, - provider: providerId, - model: modelId, - prompt, - maxChars, - buffer: media.buffer, - mimeType, - profile: entry.profile, - preferredProfile: entry.preferredProfile, + const media = await params.cache.getBuffer({ + attachmentIndex: params.attachmentIndex, + maxBytes, + timeoutMs, }); + const provider = getMediaUnderstandingProvider(providerId, params.providerRegistry); + const result = provider?.describeImage + ? await provider.describeImage({ + buffer: media.buffer, + fileName: media.fileName, + mime: media.mime, + model: modelId, + provider: providerId, + prompt, + timeoutMs, + profile: entry.profile, + preferredProfile: entry.preferredProfile, + agentDir: params.agentDir, + cfg: params.cfg, + }) + : await describeImageWithModel({ + buffer: media.buffer, + fileName: media.fileName, + mime: media.mime, + model: modelId, + provider: providerId, + prompt, + timeoutMs, + profile: entry.profile, + preferredProfile: entry.preferredProfile, + agentDir: params.agentDir, + cfg: params.cfg, + }); return { kind: "image.description", - attachmentIndex: attachment.index, + attachmentIndex: params.attachmentIndex, text: trimOutput(result.text, maxChars), provider: providerId, - model: result.model, + model: result.model ?? modelId, }; } @@ -532,8 +152,11 @@ async function runProviderEntry(params: { if (!provider.transcribeAudio) { throw new Error(`Audio transcription provider "${providerId}" not available.`); } - const media = await loadAttachmentBuffer({ attachment, maxBytes, timeoutMs }); - if (!media) return null; + const media = await params.cache.getBuffer({ + attachmentIndex: params.attachmentIndex, + maxBytes, + timeoutMs, + }); const key = await resolveApiKeyForProvider({ provider: providerId, cfg, @@ -551,96 +174,94 @@ async function runProviderEntry(params: { baseUrl: providerConfig?.baseUrl, headers: providerConfig?.headers, model, - language: entry.language ?? cfg.tools?.media?.audio?.language, + language: entry.language ?? params.config?.language ?? cfg.tools?.media?.audio?.language, prompt, timeoutMs, }); return { kind: "audio.transcription", - attachmentIndex: attachment.index, + attachmentIndex: params.attachmentIndex, text: trimOutput(result.text, maxChars), provider: providerId, model: result.model ?? 
model,
   };
 }
 
-  if (capability === "video") {
-    if (!provider.describeVideo) {
-      throw new Error(`Video understanding provider "${providerId}" not available.`);
-    }
-    const media = await loadAttachmentBuffer({ attachment, maxBytes, timeoutMs });
-    if (!media) return null;
-    const estimatedBase64Bytes = estimateBase64Size(media.buffer.length);
-    const maxBase64Bytes = resolveVideoMaxBase64Bytes(maxBytes);
-    if (estimatedBase64Bytes > maxBase64Bytes) {
-      if (shouldLogVerbose()) {
-        logVerbose(
-          `Skipping video attachment ${attachment.index + 1}: base64 payload ${estimatedBase64Bytes} exceeds ${maxBase64Bytes}`,
-        );
-      }
-      return null;
-    }
-    const key = await resolveApiKeyForProvider({
-      provider: providerId,
-      cfg,
-      profileId: entry.profile,
-      preferredProfile: entry.preferredProfile,
-      agentDir: params.agentDir,
-    });
-    const providerConfig = cfg.models?.providers?.[providerId];
-    const result = await provider.describeVideo({
-      buffer: media.buffer,
-      fileName: media.fileName,
-      mime: media.mime,
-      apiKey: key.apiKey,
-      baseUrl: providerConfig?.baseUrl,
-      headers: providerConfig?.headers,
-      model: entry.model,
-      prompt,
-      timeoutMs,
-    });
-    return {
-      kind: "video.description",
-      attachmentIndex: attachment.index,
-      text: trimOutput(result.text, maxChars),
-      provider: providerId,
-      model: result.model ?? entry.model,
-    };
+  if (!provider.describeVideo) {
+    throw new Error(`Video understanding provider "${providerId}" not available.`);
   }
-
-  return null;
+  const media = await params.cache.getBuffer({
+    attachmentIndex: params.attachmentIndex,
+    maxBytes,
+    timeoutMs,
+  });
+  const estimatedBase64Bytes = estimateBase64Size(media.size);
+  const maxBase64Bytes = resolveVideoMaxBase64Bytes(maxBytes);
+  if (estimatedBase64Bytes > maxBase64Bytes) {
+    throw new MediaUnderstandingSkipError(
+      "maxBytes",
+      `Video attachment ${params.attachmentIndex + 1} base64 payload ${estimatedBase64Bytes} exceeds ${maxBase64Bytes}`,
+    );
+  }
+  const key = await resolveApiKeyForProvider({
+    provider: providerId,
+    cfg,
+    profileId: entry.profile,
+    preferredProfile: entry.preferredProfile,
+    agentDir: params.agentDir,
+  });
+  const providerConfig = cfg.models?.providers?.[providerId];
+  const result = await provider.describeVideo({
+    buffer: media.buffer,
+    fileName: media.fileName,
+    mime: media.mime,
+    apiKey: key.apiKey,
+    baseUrl: providerConfig?.baseUrl,
+    headers: providerConfig?.headers,
+    model: entry.model,
+    prompt,
+    timeoutMs,
+  });
+  return {
+    kind: "video.description",
+    attachmentIndex: params.attachmentIndex,
+    text: trimOutput(result.text, maxChars),
+    provider: providerId,
+    model: result.model ?? entry.model,
+  };
 }
 
 async function runCliEntry(params: {
-  capability: Capability;
+  capability: MediaUnderstandingCapability;
   entry: MediaUnderstandingModelConfig;
   cfg: ClawdbotConfig;
   ctx: MsgContext;
-  attachment: MediaAttachment;
+  attachmentIndex: number;
+  cache: MediaAttachmentCache;
+  config?: MediaUnderstandingConfig;
 }): Promise<MediaUnderstandingOutput | null> {
-  const { entry, capability, cfg, ctx, attachment } = params;
+  const { entry, capability, cfg, ctx } = params;
   const command = entry.command?.trim();
   const args = entry.args ?? [];
   if (!command) {
     throw new Error(`CLI entry missing command for ${capability}`);
   }
-  const maxBytes = entry.maxBytes ??
-    resolveConfigValue(cfg.tools?.media?.[capability]?.maxBytes, DEFAULT_MAX_BYTES[capability]);
-  const maxChars = resolveMaxChars({ capability, entry, cfg });
+  const maxBytes = resolveMaxBytes({ capability, entry, cfg, config: params.config });
+  const maxChars = resolveMaxChars({ capability, entry, cfg, config: params.config });
   const timeoutMs = resolveTimeoutMs(
-    entry.timeoutSeconds ?? cfg.tools?.media?.[capability]?.timeoutSeconds,
+    entry.timeoutSeconds ?? params.config?.timeoutSeconds ?? cfg.tools?.media?.[capability]?.timeoutSeconds,
     DEFAULT_TIMEOUT_SECONDS[capability],
   );
   const prompt = resolvePrompt(
     capability,
-    entry.prompt ?? cfg.tools?.media?.[capability]?.prompt,
+    entry.prompt ?? params.config?.prompt ?? cfg.tools?.media?.[capability]?.prompt,
     maxChars,
   );
 
-  const pathResult = await resolveAttachmentPath({
-    attachment,
+  const pathResult = await params.cache.getPath({
+    attachmentIndex: params.attachmentIndex,
     maxBytes,
     timeoutMs,
   });
-  if (!pathResult) return null;
 
   const templCtx: MsgContext = {
     ...ctx,
@@ -654,78 +275,67 @@
   if (shouldLogVerbose()) {
     logVerbose(`Media understanding via CLI: ${argv.join(" ")}`);
   }
-  try {
-    const { stdout } = await runExec(argv[0], argv.slice(1), {
-      timeoutMs,
-      maxBuffer: CLI_OUTPUT_MAX_BUFFER,
-    });
-    const text = trimOutput(stdout, maxChars);
-    if (!text) return null;
-    return {
-      kind: capability === "audio" ? "audio.transcription" : `${capability}.description`,
-      attachmentIndex: attachment.index,
-      text,
-      provider: "cli",
-      model: command,
-    };
-  } finally {
-    if (pathResult.cleanup) {
-      await pathResult.cleanup();
-    }
-  }
+  const { stdout } = await runExec(argv[0], argv.slice(1), {
+    timeoutMs,
+    maxBuffer: CLI_OUTPUT_MAX_BUFFER,
+  });
+  const text = trimOutput(stdout, maxChars);
+  if (!text) return null;
+  return {
+    kind: capability === "audio" ? "audio.transcription" : `${capability}.description`,
+    attachmentIndex: params.attachmentIndex,
+    text,
+    provider: "cli",
+    model: command,
+  };
 }
 
-async function runCapability(params: {
-  capability: Capability;
+async function runAttachmentEntries(params: {
+  capability: MediaUnderstandingCapability;
   cfg: ClawdbotConfig;
   ctx: MsgContext;
-  attachments: MediaAttachment[];
+  attachmentIndex: number;
   agentDir?: string;
   providerRegistry: Map<string, MediaUnderstandingProvider>;
+  cache: MediaAttachmentCache;
+  entries: MediaUnderstandingModelConfig[];
+  config?: MediaUnderstandingConfig;
 }): Promise<MediaUnderstandingOutput | null> {
-  const { capability, cfg, ctx, attachments } = params;
-  const config = resolveCapabilityConfig(cfg, capability);
-  if (!config || config.enabled === false) return null;
-  const entries = resolveModelEntries(config, capability);
-  if (entries.length === 0) return null;
-
-  const scopeDecision = resolveScopeDecision({ scope: config.scope, ctx });
-  if (scopeDecision === "deny") {
-    if (shouldLogVerbose()) {
-      logVerbose(`${capability} understanding disabled by scope policy.`);
-    }
-    return null;
-  }
-
-  const attachment = attachments.find((item) => {
-    if (capability === "image") return isImageAttachment(item);
-    if (capability === "audio") return isAudioAttachment(item);
-    return isVideoAttachment(item);
-  });
-  if (!attachment) return null;
-
+  const { entries, capability } = params;
   for (const entry of entries) {
     try {
       const entryType = entry.type ?? (entry.command ? "cli" : "provider");
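+      // Entries without an explicit `type` are treated as CLI when they define a
+      // `command`, otherwise as provider API entries.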
       const result =
         entryType === "cli"
-          ? await runCliEntry({ capability, entry, cfg, ctx, attachment })
+          ? await runCliEntry({
+              capability,
+              entry,
+              cfg: params.cfg,
+              ctx: params.ctx,
+              attachmentIndex: params.attachmentIndex,
+              cache: params.cache,
+              config: params.config,
+            })
           : await runProviderEntry({
               capability,
               entry,
-              cfg,
-              ctx,
-              attachment,
+              cfg: params.cfg,
+              ctx: params.ctx,
+              attachmentIndex: params.attachmentIndex,
+              cache: params.cache,
               agentDir: params.agentDir,
               providerRegistry: params.providerRegistry,
+              config: params.config,
             });
       if (result) return result;
     } catch (err) {
-      if (isMaxBytesError(err)) {
+      if (isMediaUnderstandingSkipError(err)) {
         if (shouldLogVerbose()) {
-          logVerbose(`Skipping ${capability} model due to size: ${String(err)}`);
+          logVerbose(`Skipping ${capability} model due to ${err.reason}: ${err.message}`);
         }
-      } else if (shouldLogVerbose()) {
+        continue;
+      }
+      if (shouldLogVerbose()) {
         logVerbose(`${capability} understanding failed: ${String(err)}`);
       }
     }
@@ -734,6 +344,49 @@
   }
 
   return null;
 }
 
+async function runCapability(params: {
+  capability: MediaUnderstandingCapability;
+  cfg: ClawdbotConfig;
+  ctx: MsgContext;
+  attachments: MediaAttachmentCache;
+  attachmentIds: number[];
+  agentDir?: string;
+  providerRegistry: Map<string, MediaUnderstandingProvider>;
+  config?: MediaUnderstandingConfig;
+}): Promise<MediaUnderstandingOutput[]> {
+  const { capability, cfg, ctx } = params;
+  const config = params.config ?? resolveCapabilityConfig(cfg, capability);
+  if (!resolveCapabilityEnabled({ cfg, config })) return [];
+
+  const entries = resolveModelEntries({ cfg, capability, config });
+  if (entries.length === 0) return [];
+
+  const scopeDecision = resolveScopeDecision({ scope: config?.scope, ctx });
+  if (scopeDecision === "deny") {
+    if (shouldLogVerbose()) {
+      logVerbose(`${capability} understanding disabled by scope policy.`);
+    }
+    return [];
+  }
+
+  const outputs: MediaUnderstandingOutput[] = [];
+  for (const attachmentIndex of params.attachmentIds) {
+    const output = await runAttachmentEntries({
+      capability,
+      cfg,
+      ctx,
+      attachmentIndex,
+      agentDir: params.agentDir,
+      providerRegistry: params.providerRegistry,
+      cache: params.attachments,
+      entries,
+      config,
+    });
+    if (output) outputs.push(output);
+  }
+  return outputs;
+}
+
 export async function applyMediaUnderstanding(params: {
   ctx: MsgContext;
   cfg: ClawdbotConfig;
@@ -749,56 +402,62 @@
   const attachments = normalizeAttachments(ctx);
   const providerRegistry = buildMediaUnderstandingRegistry(params.providers);
 
-  const outputs: MediaUnderstandingOutput[] = [];
+  const cache = new MediaAttachmentCache(attachments);
 
-  const imageOutput = await runCapability({
-    capability: "image",
-    cfg,
-    ctx,
-    attachments,
-    agentDir: params.agentDir,
-    providerRegistry,
-  });
-  if (imageOutput) outputs.push(imageOutput);
+  try {
+    const tasks = CAPABILITY_ORDER.map((capability) => async () => {
+      const config = resolveCapabilityConfig(cfg, capability);
+      const attachmentPolicy = config?.attachments;
+      const selected = selectAttachments({
+        capability,
+        attachments,
+        policy: attachmentPolicy,
+      });
+      if (selected.length === 0) return [] as MediaUnderstandingOutput[];
+      return await runCapability({
+        capability,
+        cfg,
+        ctx,
+        attachments: cache,
+        attachmentIds: selected.map((item) => item.index),
+        agentDir: params.agentDir,
+        providerRegistry,
+        config,
+      });
+    });
 
-  const audioOutput = await runCapability({
-    capability: "audio",
-    cfg,
-    ctx,
-    attachments,
-    agentDir: params.agentDir,
-    providerRegistry,
-  });
-  if (audioOutput) outputs.push(audioOutput);
-
-  const
videoOutput = await runCapability({ - capability: "video", - cfg, - ctx, - attachments, - agentDir: params.agentDir, - providerRegistry, - }); - if (videoOutput) outputs.push(videoOutput); - - if (outputs.length > 0) { - ctx.Body = formatMediaUnderstandingBody({ body: ctx.Body, outputs }); - const audioResult = outputs.find((output) => output.kind === "audio.transcription"); - if (audioResult) { - ctx.Transcript = audioResult.text; - ctx.CommandBody = audioResult.text; - ctx.RawBody = audioResult.text; - } else if (originalUserText) { - ctx.CommandBody = originalUserText; - ctx.RawBody = originalUserText; + const results = await runWithConcurrency(tasks, resolveConcurrency(cfg)); + const outputs: MediaUnderstandingOutput[] = []; + for (const [index] of CAPABILITY_ORDER.entries()) { + const entries = results[index] ?? []; + if (!Array.isArray(entries)) continue; + for (const entry of entries) { + outputs.push(entry); + } } - ctx.MediaUnderstanding = [...(ctx.MediaUnderstanding ?? []), ...outputs]; - } - return { - outputs, - appliedImage: outputs.some((output) => output.kind === "image.description"), - appliedAudio: outputs.some((output) => output.kind === "audio.transcription"), - appliedVideo: outputs.some((output) => output.kind === "video.description"), - }; + if (outputs.length > 0) { + ctx.Body = formatMediaUnderstandingBody({ body: ctx.Body, outputs }); + const audioOutputs = outputs.filter((output) => output.kind === "audio.transcription"); + if (audioOutputs.length > 0) { + const transcript = formatAudioTranscripts(audioOutputs); + ctx.Transcript = transcript; + ctx.CommandBody = transcript; + ctx.RawBody = transcript; + } else if (originalUserText) { + ctx.CommandBody = originalUserText; + ctx.RawBody = originalUserText; + } + ctx.MediaUnderstanding = [...(ctx.MediaUnderstanding ?? 
[]), ...outputs];
+    }
+
+    return {
+      outputs,
+      appliedImage: outputs.some((output) => output.kind === "image.description"),
+      appliedAudio: outputs.some((output) => output.kind === "audio.transcription"),
+      appliedVideo: outputs.some((output) => output.kind === "video.description"),
+    };
+  } finally {
+    await cache.cleanup();
+  }
 }
diff --git a/src/media-understanding/attachments.ts b/src/media-understanding/attachments.ts
new file mode 100644
index 000000000..4f1df9df2
--- /dev/null
+++ b/src/media-understanding/attachments.ts
@@ -0,0 +1,386 @@
+import crypto from "node:crypto";
+import fs from "node:fs/promises";
+import os from "node:os";
+import path from "node:path";
+import { fileURLToPath } from "node:url";
+
+import type { MsgContext } from "../auto-reply/templating.js";
+import type { MediaUnderstandingAttachmentsConfig } from "../config/types.tools.js";
+import { fetchRemoteMedia, MediaFetchError } from "../media/fetch.js";
+import { detectMime, getFileExtension, isAudioFileName } from "../media/mime.js";
+import { logVerbose, shouldLogVerbose } from "../globals.js";
+import { fetchWithTimeout } from "./providers/shared.js";
+import type { MediaAttachment, MediaUnderstandingCapability } from "./types.js";
+import { MediaUnderstandingSkipError } from "./errors.js";
+
+type MediaBufferResult = {
+  buffer: Buffer;
+  mime?: string;
+  fileName: string;
+  size: number;
+};
+
+type MediaPathResult = {
+  path: string;
+  cleanup?: () => Promise<void> | void;
+};
+
+type AttachmentCacheEntry = {
+  attachment: MediaAttachment;
+  resolvedPath?: string;
+  statSize?: number;
+  buffer?: Buffer;
+  bufferMime?: string;
+  bufferFileName?: string;
+  tempPath?: string;
+  tempCleanup?: () => Promise<void>;
+};
+
+const DEFAULT_MAX_ATTACHMENTS = 1;
+
+function normalizeAttachmentPath(raw?: string | null): string | undefined {
+  const value = raw?.trim();
+  if (!value) return undefined;
+  if (value.startsWith("file://")) {
+    try {
+      return fileURLToPath(value);
+    } catch {
+      return undefined;
+    }
+  }
+  return value;
+}
+
+export function normalizeAttachments(ctx: MsgContext): MediaAttachment[] {
+  const pathsFromArray = Array.isArray(ctx.MediaPaths) ? ctx.MediaPaths : undefined;
+  const urlsFromArray = Array.isArray(ctx.MediaUrls) ? ctx.MediaUrls : undefined;
+  const typesFromArray = Array.isArray(ctx.MediaTypes) ? ctx.MediaTypes : undefined;
+  const resolveMime = (count: number, index: number) => {
+    const typeHint = typesFromArray?.[index];
+    const trimmed = typeof typeHint === "string" ? typeHint.trim() : "";
+    if (trimmed) return trimmed;
+    return count === 1 ? ctx.MediaType : undefined;
+  };
+
+  if (pathsFromArray && pathsFromArray.length > 0) {
+    const count = pathsFromArray.length;
+    const urls = urlsFromArray && urlsFromArray.length > 0 ? urlsFromArray : undefined;
+    return pathsFromArray
+      .map((value, index) => ({
+        path: value?.trim() || undefined,
+        url: urls?.[index] ??
ctx.MediaUrl, + mime: resolveMime(count, index), + index, + })) + .filter((entry) => Boolean(entry.path?.trim() || entry.url?.trim())); + } + + if (urlsFromArray && urlsFromArray.length > 0) { + const count = urlsFromArray.length; + return urlsFromArray + .map((value, index) => ({ + path: undefined, + url: value?.trim() || undefined, + mime: resolveMime(count, index), + index, + })) + .filter((entry) => Boolean(entry.url?.trim())); + } + + const pathValue = ctx.MediaPath?.trim(); + const url = ctx.MediaUrl?.trim(); + if (!pathValue && !url) return []; + return [ + { + path: pathValue || undefined, + url: url || undefined, + mime: ctx.MediaType, + index: 0, + }, + ]; +} + +export function isVideoAttachment(attachment: MediaAttachment): boolean { + if (attachment.mime?.startsWith("video/")) return true; + const ext = getFileExtension(attachment.path ?? attachment.url); + if (!ext) return false; + return [".mp4", ".mov", ".mkv", ".webm", ".avi", ".m4v"].includes(ext); +} + +export function isAudioAttachment(attachment: MediaAttachment): boolean { + if (attachment.mime?.startsWith("audio/")) return true; + return isAudioFileName(attachment.path ?? attachment.url); +} + +export function isImageAttachment(attachment: MediaAttachment): boolean { + if (attachment.mime?.startsWith("image/")) return true; + const ext = getFileExtension(attachment.path ?? attachment.url); + if (!ext) return false; + return [".png", ".jpg", ".jpeg", ".webp", ".gif", ".bmp", ".tiff", ".tif"].includes(ext); +} + +function isAbortError(err: unknown): boolean { + if (!err) return false; + if (err instanceof Error && err.name === "AbortError") return true; + return false; +} + +function resolveRequestUrl(input: RequestInfo | URL): string { + if (typeof input === "string") return input; + if (input instanceof URL) return input.toString(); + return input.url; +} + +function orderAttachments( + attachments: MediaAttachment[], + prefer?: MediaUnderstandingAttachmentsConfig["prefer"], +): MediaAttachment[] { + if (!prefer || prefer === "first") return attachments; + if (prefer === "last") return [...attachments].reverse(); + if (prefer === "path") { + const withPath = attachments.filter((item) => item.path); + const withoutPath = attachments.filter((item) => !item.path); + return [...withPath, ...withoutPath]; + } + if (prefer === "url") { + const withUrl = attachments.filter((item) => item.url); + const withoutUrl = attachments.filter((item) => !item.url); + return [...withUrl, ...withoutUrl]; + } + return attachments; +} + +export function selectAttachments(params: { + capability: MediaUnderstandingCapability; + attachments: MediaAttachment[]; + policy?: MediaUnderstandingAttachmentsConfig; +}): MediaAttachment[] { + const { capability, attachments, policy } = params; + const matches = attachments.filter((item) => { + if (capability === "image") return isImageAttachment(item); + if (capability === "audio") return isAudioAttachment(item); + return isVideoAttachment(item); + }); + if (matches.length === 0) return []; + + const ordered = orderAttachments(matches, policy?.prefer); + const mode = policy?.mode ?? "first"; + const maxAttachments = policy?.maxAttachments ?? 
DEFAULT_MAX_ATTACHMENTS;
+  if (mode === "all") {
+    return ordered.slice(0, Math.max(1, maxAttachments));
+  }
+  return ordered.slice(0, 1);
+}
+
+export class MediaAttachmentCache {
+  private readonly entries = new Map<number, AttachmentCacheEntry>();
+  private readonly attachments: MediaAttachment[];
+
+  constructor(attachments: MediaAttachment[]) {
+    this.attachments = attachments;
+    for (const attachment of attachments) {
+      this.entries.set(attachment.index, { attachment });
+    }
+  }
+
+  async getBuffer(params: {
+    attachmentIndex: number;
+    maxBytes: number;
+    timeoutMs: number;
+  }): Promise<MediaBufferResult> {
+    const entry = await this.ensureEntry(params.attachmentIndex);
+    if (entry.buffer) {
+      if (entry.buffer.length > params.maxBytes) {
+        throw new MediaUnderstandingSkipError(
+          "maxBytes",
+          `Attachment ${params.attachmentIndex + 1} exceeds maxBytes ${params.maxBytes}`,
+        );
+      }
+      return {
+        buffer: entry.buffer,
+        mime: entry.bufferMime,
+        fileName: entry.bufferFileName ?? `media-${params.attachmentIndex + 1}`,
+        size: entry.buffer.length,
+      };
+    }
+
+    if (entry.resolvedPath) {
+      const size = await this.ensureLocalStat(entry);
+      if (entry.resolvedPath) {
+        if (size !== undefined && size > params.maxBytes) {
+          throw new MediaUnderstandingSkipError(
+            "maxBytes",
+            `Attachment ${params.attachmentIndex + 1} exceeds maxBytes ${params.maxBytes}`,
+          );
+        }
+        const buffer = await fs.readFile(entry.resolvedPath);
+        entry.buffer = buffer;
+        entry.bufferMime =
+          entry.bufferMime ??
+          entry.attachment.mime ??
+          (await detectMime({
+            buffer,
+            filePath: entry.resolvedPath,
+          }));
+        entry.bufferFileName =
+          path.basename(entry.resolvedPath) || `media-${params.attachmentIndex + 1}`;
+        return {
+          buffer,
+          mime: entry.bufferMime,
+          fileName: entry.bufferFileName,
+          size: buffer.length,
+        };
+      }
+    }
+
+    const url = entry.attachment.url?.trim();
+    if (!url) {
+      throw new MediaUnderstandingSkipError(
+        "empty",
+        `Attachment ${params.attachmentIndex + 1} has no path or URL.`,
+      );
+    }
+
+    try {
+      const fetchImpl = (input: RequestInfo | URL, init?: RequestInit) =>
+        fetchWithTimeout(resolveRequestUrl(input), init ?? {}, params.timeoutMs, fetch);
+      const fetched = await fetchRemoteMedia({ url, fetchImpl, maxBytes: params.maxBytes });
+      entry.buffer = fetched.buffer;
+      entry.bufferMime =
+        entry.attachment.mime ??
+        fetched.contentType ??
+        (await detectMime({
+          buffer: fetched.buffer,
+          filePath: fetched.fileName ?? url,
+        }));
+      entry.bufferFileName = fetched.fileName ?? `media-${params.attachmentIndex + 1}`;
+      return {
+        buffer: fetched.buffer,
+        mime: entry.bufferMime,
+        fileName: entry.bufferFileName,
+        size: fetched.buffer.length,
+      };
+    } catch (err) {
+      if (err instanceof MediaFetchError && err.code === "max_bytes") {
+        throw new MediaUnderstandingSkipError(
+          "maxBytes",
+          `Attachment ${params.attachmentIndex + 1} exceeds maxBytes ${params.maxBytes}`,
+        );
+      }
+      if (isAbortError(err)) {
+        throw new MediaUnderstandingSkipError(
+          "timeout",
+          `Attachment ${params.attachmentIndex + 1} timed out while fetching.`,
+        );
+      }
+      throw err;
+    }
+  }
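+
+  /**
+   * Like getBuffer(), but guarantees a local file path; remote media is
+   * written to a temp file that cleanup() removes afterwards.
+   */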
+  async getPath(params: {
+    attachmentIndex: number;
+    maxBytes?: number;
+    timeoutMs: number;
+  }): Promise<{ path: string; cleanup?: () => Promise<void> | void }> {
+    const entry = await this.ensureEntry(params.attachmentIndex);
+    if (entry.resolvedPath) {
+      if (params.maxBytes) {
+        const size = await this.ensureLocalStat(entry);
+        if (entry.resolvedPath) {
+          if (size !== undefined && size > params.maxBytes) {
+            throw new MediaUnderstandingSkipError(
+              "maxBytes",
+              `Attachment ${params.attachmentIndex + 1} exceeds maxBytes ${params.maxBytes}`,
+            );
+          }
+        }
+      }
+      if (entry.resolvedPath) {
+        return { path: entry.resolvedPath };
+      }
+    }
+
+    if (entry.tempPath) {
+      if (params.maxBytes && entry.buffer && entry.buffer.length > params.maxBytes) {
+        throw new MediaUnderstandingSkipError(
+          "maxBytes",
+          `Attachment ${params.attachmentIndex + 1} exceeds maxBytes ${params.maxBytes}`,
+        );
+      }
+      return { path: entry.tempPath, cleanup: entry.tempCleanup };
+    }
+
+    const maxBytes = params.maxBytes ?? Number.POSITIVE_INFINITY;
+    const bufferResult = await this.getBuffer({
+      attachmentIndex: params.attachmentIndex,
+      maxBytes,
+      timeoutMs: params.timeoutMs,
+    });
+    const extension = path.extname(bufferResult.fileName || "") || "";
+    const tmpPath = path.join(
+      os.tmpdir(),
+      `clawdbot-media-${crypto.randomUUID()}${extension}`,
+    );
+    await fs.writeFile(tmpPath, bufferResult.buffer);
+    entry.tempPath = tmpPath;
+    entry.tempCleanup = async () => {
+      await fs.unlink(tmpPath).catch(() => {});
+    };
+    return { path: tmpPath, cleanup: entry.tempCleanup };
+  }
+
+  async cleanup(): Promise<void> {
+    const cleanups: Array<Promise<void> | void> = [];
+    for (const entry of this.entries.values()) {
+      if (entry.tempCleanup) {
+        cleanups.push(Promise.resolve(entry.tempCleanup()));
+        entry.tempCleanup = undefined;
+      }
+    }
+    await Promise.all(cleanups);
+  }
+
+  private async ensureEntry(attachmentIndex: number): Promise<AttachmentCacheEntry> {
+    const existing = this.entries.get(attachmentIndex);
+    if (existing) {
+      if (!existing.resolvedPath) {
+        existing.resolvedPath = this.resolveLocalPath(existing.attachment);
+      }
+      return existing;
+    }
+    const attachment =
+      this.attachments.find((item) => item.index === attachmentIndex) ?? { index: attachmentIndex };
+    const entry: AttachmentCacheEntry = {
+      attachment,
+      resolvedPath: this.resolveLocalPath(attachment),
+    };
+    this.entries.set(attachmentIndex, entry);
+    return entry;
+  }
+
+  private resolveLocalPath(attachment: MediaAttachment): string | undefined {
+    const rawPath = normalizeAttachmentPath(attachment.path);
+    if (!rawPath) return undefined;
+    return path.isAbsolute(rawPath) ? rawPath : path.resolve(rawPath);
+  }
+
+  private async ensureLocalStat(entry: AttachmentCacheEntry): Promise<number | undefined> {
+    if (!entry.resolvedPath) return undefined;
+    if (entry.statSize !== undefined) return entry.statSize;
+    try {
+      const stat = await fs.stat(entry.resolvedPath);
+      if (!stat.isFile()) {
+        entry.resolvedPath = undefined;
+        return undefined;
+      }
+      entry.statSize = stat.size;
+      return stat.size;
+    } catch (err) {
+      entry.resolvedPath = undefined;
+      if (shouldLogVerbose()) {
+        logVerbose(`Failed to read attachment ${entry.attachment.index + 1}: ${String(err)}`);
+      }
+      return undefined;
+    }
+  }
+}
diff --git a/src/media-understanding/concurrency.ts b/src/media-understanding/concurrency.ts
new file mode 100644
index 000000000..8ccba85f4
--- /dev/null
+++ b/src/media-understanding/concurrency.ts
@@ -0,0 +1,29 @@
+import { logVerbose, shouldLogVerbose } from "../globals.js";
+
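+// Minimal usage sketch (hypothetical capability runners; the limit normally
+// comes from resolveConcurrency(cfg), default 2):
+//
+//   const results = await runWithConcurrency(
+//     [() => runImage(), () => runAudio(), () => runVideo()],
+//     2,
+//   );
+//
+// Results keep task order; a task that throws leaves its slot undefined and is
+// only logged in verbose mode.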
+export async function runWithConcurrency<T>(
+  tasks: Array<() => Promise<T>>,
+  limit: number,
+): Promise<T[]> {
+  if (tasks.length === 0) return [];
+  const resolvedLimit = Math.max(1, Math.min(limit, tasks.length));
+  const results: T[] = Array.from({ length: tasks.length });
+  let next = 0;
+
+  const workers = Array.from({ length: resolvedLimit }, async () => {
+    while (true) {
+      const index = next;
+      next += 1;
+      if (index >= tasks.length) return;
+      try {
+        results[index] = await tasks[index]();
+      } catch (err) {
+        if (shouldLogVerbose()) {
+          logVerbose(`Media understanding task failed: ${String(err)}`);
+        }
+      }
+    }
+  });
+
+  await Promise.allSettled(workers);
+  return results;
+}
diff --git a/src/media-understanding/defaults.ts b/src/media-understanding/defaults.ts
new file mode 100644
index 000000000..92ce8835c
--- /dev/null
+++ b/src/media-understanding/defaults.ts
@@ -0,0 +1,35 @@
+import type { MediaUnderstandingCapability } from "./types.js";
+
+const MB = 1024 * 1024;
+
+export const DEFAULT_MAX_CHARS = 500;
+export const DEFAULT_MAX_CHARS_BY_CAPABILITY: Record<
+  MediaUnderstandingCapability,
+  number | undefined
+> = {
+  image: DEFAULT_MAX_CHARS,
+  audio: undefined,
+  video: DEFAULT_MAX_CHARS,
+};
+export const DEFAULT_MAX_BYTES: Record<MediaUnderstandingCapability, number> = {
+  image: 10 * MB,
+  audio: 20 * MB,
+  video: 50 * MB,
+};
+export const DEFAULT_TIMEOUT_SECONDS: Record<MediaUnderstandingCapability, number> = {
+  image: 60,
+  audio: 60,
+  video: 120,
+};
+export const DEFAULT_PROMPT: Record<MediaUnderstandingCapability, string> = {
+  image: "Describe the image.",
+  audio: "Transcribe the audio.",
+  video: "Describe the video.",
+};
+export const DEFAULT_VIDEO_MAX_BASE64_BYTES = 70 * MB;
+export const DEFAULT_AUDIO_MODELS: Record<string, string> = {
+  groq: "whisper-large-v3-turbo",
+  openai: "whisper-1",
+};
+export const CLI_OUTPUT_MAX_BUFFER = 5 * MB;
+export const DEFAULT_MEDIA_CONCURRENCY = 2;
diff --git a/src/media-understanding/errors.ts b/src/media-understanding/errors.ts
new file mode 100644
index 000000000..738670b1d
--- /dev/null
+++ b/src/media-understanding/errors.ts
@@ -0,0 +1,17 @@
+export type MediaUnderstandingSkipReason = "maxBytes" | "timeout" | "unsupported" | "empty";
+
+export class MediaUnderstandingSkipError extends Error {
+  readonly reason: MediaUnderstandingSkipReason;
+
+  constructor(reason: MediaUnderstandingSkipReason, message: string) {
+    super(message);
+    this.reason = reason;
+    this.name = "MediaUnderstandingSkipError";
+  }
+}
+
+export function isMediaUnderstandingSkipError(
+  err: unknown,
+): err is MediaUnderstandingSkipError {
+  return err instanceof MediaUnderstandingSkipError;
+}
diff --git a/src/media-understanding/format.ts b/src/media-understanding/format.ts
index ffa6f0145..f99cd8d3e 100644
--- a/src/media-understanding/format.ts
+++ b/src/media-understanding/format.ts
@@ -12,7 +12,7 @@ export function extractMediaUserText(body?: string): string | undefined {
 }
 
 function formatSection(
-  title: "Audio" | "Video" | "Image",
+  title: string,
   kind: "Transcript" | "Description",
   text: string,
   userText?: string,
@@ -40,11 +40,21 @@ export function formatMediaUnderstandingBody(params: {
     sections.push(`User text:\n${userText}`);
   }
 
+  const counts = new Map<string, number>();
   for (const output of outputs) {
+    counts.set(output.kind, (counts.get(output.kind) ?? 0) + 1);
+  }
+  const seen = new Map<string, number>();
+
+  for (const output of outputs) {
+    const count = counts.get(output.kind) ?? 1;
+    const next = (seen.get(output.kind) ?? 0) + 1;
+    seen.set(output.kind, next);
+    const suffix = count > 1 ? ` ${next}/${count}` : "";
     if (output.kind === "audio.transcription") {
       sections.push(
         formatSection(
-          "Audio",
+          `Audio${suffix}`,
           "Transcript",
           output.text,
           outputs.length === 1 ? userText : undefined,
@@ -55,7 +65,7 @@ export function formatMediaUnderstandingBody(params: {
     if (output.kind === "image.description") {
       sections.push(
         formatSection(
-          "Image",
+          `Image${suffix}`,
           "Description",
           output.text,
           outputs.length === 1 ? userText : undefined,
@@ -65,7 +75,7 @@ export function formatMediaUnderstandingBody(params: {
     }
     sections.push(
       formatSection(
-        "Video",
+        `Video${suffix}`,
        "Description",
         output.text,
         outputs.length === 1 ? userText : undefined,
@@ -75,3 +85,10 @@
 
   return sections.join("\n\n").trim();
 }
+
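+// Joins several transcripts into one block (assumption: this is what backs
+// {{Transcript}} when the audio attachments policy uses mode: "all"); a single
+// transcript passes through unchanged, e.g. two voice notes yield:
+//   Audio 1:\n<first>\n\nAudio 2:\n<second>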
+export function formatAudioTranscripts(outputs: MediaUnderstandingOutput[]): string {
+  if (outputs.length === 1) return outputs[0].text;
+  return outputs
+    .map((output, index) => `Audio ${index + 1}:\n${output.text}`)
+    .join("\n\n");
+}
diff --git a/src/media-understanding/providers/anthropic/index.ts b/src/media-understanding/providers/anthropic/index.ts
new file mode 100644
index 000000000..3f9fc584c
--- /dev/null
+++ b/src/media-understanding/providers/anthropic/index.ts
@@ -0,0 +1,7 @@
+import type { MediaUnderstandingProvider } from "../../types.js";
+import { describeImageWithModel } from "../image.js";
+
+export const anthropicProvider: MediaUnderstandingProvider = {
+  id: "anthropic",
+  describeImage: describeImageWithModel,
+};
diff --git a/src/media-understanding/providers/google/index.ts b/src/media-understanding/providers/google/index.ts
index 285195dc7..d0f8bae3b 100644
--- a/src/media-understanding/providers/google/index.ts
+++ b/src/media-understanding/providers/google/index.ts
@@ -1,7 +1,9 @@
 import type { MediaUnderstandingProvider } from "../../types.js";
+import { describeImageWithModel } from "../image.js";
 import { describeGeminiVideo } from "./video.js";
 
 export const googleProvider: MediaUnderstandingProvider = {
   id: "google",
+  describeImage: describeImageWithModel,
   describeVideo: describeGeminiVideo,
 };
diff --git a/src/media-understanding/providers/image.ts b/src/media-understanding/providers/image.ts
new file mode 100644
index 000000000..bd056253a
--- /dev/null
+++ b/src/media-understanding/providers/image.ts
@@ -0,0 +1,66 @@
+import type { Api, AssistantMessage, Context, Model } from "@mariozechner/pi-ai";
+import { complete } from "@mariozechner/pi-ai";
+import { discoverAuthStorage, discoverModels } from "@mariozechner/pi-coding-agent";
+
+import { getApiKeyForModel } from "../../agents/model-auth.js";
+import { ensureClawdbotModelsJson } from "../../agents/models-config.js";
+import { minimaxUnderstandImage } from "../../agents/minimax-vlm.js";
+import { coerceImageAssistantText } from "../../agents/tools/image-tool.helpers.js";
+import type { ImageDescriptionRequest, ImageDescriptionResult } from "../types.js";
+
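+// Shared provider path for image description: resolve the model from the
+// registry, verify it accepts image input, resolve auth, then send the prompt
+// plus base64-encoded image. MiniMax goes through its dedicated VLM endpoint;
+// all other providers use the generic pi-ai `complete` call.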
"../../agents/minimax-vlm.js"; +import { coerceImageAssistantText } from "../../agents/tools/image-tool.helpers.js"; +import type { ImageDescriptionRequest, ImageDescriptionResult } from "../types.js"; + +export async function describeImageWithModel( + params: ImageDescriptionRequest, +): Promise { + await ensureClawdbotModelsJson(params.cfg, params.agentDir); + const authStorage = discoverAuthStorage(params.agentDir); + const modelRegistry = discoverModels(authStorage, params.agentDir); + const model = modelRegistry.find(params.provider, params.model) as Model | null; + if (!model) { + throw new Error(`Unknown model: ${params.provider}/${params.model}`); + } + if (!model.input?.includes("image")) { + throw new Error(`Model does not support images: ${params.provider}/${params.model}`); + } + const apiKeyInfo = await getApiKeyForModel({ + model, + cfg: params.cfg, + agentDir: params.agentDir, + profileId: params.profile, + preferredProfile: params.preferredProfile, + }); + authStorage.setRuntimeApiKey(model.provider, apiKeyInfo.apiKey); + + const base64 = params.buffer.toString("base64"); + if (model.provider === "minimax") { + const text = await minimaxUnderstandImage({ + apiKey: apiKeyInfo.apiKey, + prompt: params.prompt ?? "Describe the image.", + imageDataUrl: `data:${params.mime ?? "image/jpeg"};base64,${base64}`, + modelBaseUrl: model.baseUrl, + }); + return { text, model: model.id }; + } + + const context: Context = { + messages: [ + { + role: "user", + content: [ + { type: "text", text: params.prompt ?? "Describe the image." }, + { type: "image", data: base64, mimeType: params.mime ?? "image/jpeg" }, + ], + timestamp: Date.now(), + }, + ], + }; + const message = (await complete(model, context, { + apiKey: apiKeyInfo.apiKey, + maxTokens: params.maxTokens ?? 
+export function resolveTimeoutMs(seconds: number | undefined, fallbackSeconds: number): number {
+  const value =
+    typeof seconds === "number" && Number.isFinite(seconds) ? seconds : fallbackSeconds;
+  return Math.max(1000, Math.floor(value * 1000));
+}
+
+export function resolvePrompt(
+  capability: MediaUnderstandingCapability,
+  prompt?: string,
+  maxChars?: number,
+): string {
+  const base = prompt?.trim() || DEFAULT_PROMPT[capability];
+  if (!maxChars || capability === "audio") return base;
+  return `${base} Respond in at most ${maxChars} characters.`;
+}
+
+export function resolveMaxChars(params: {
+  capability: MediaUnderstandingCapability;
+  entry: MediaUnderstandingModelConfig;
+  cfg: ClawdbotConfig;
+  config?: MediaUnderstandingConfig;
+}): number | undefined {
+  const { capability, entry, cfg } = params;
+  const configured =
+    entry.maxChars ?? params.config?.maxChars ?? cfg.tools?.media?.[capability]?.maxChars;
+  if (typeof configured === "number") return configured;
+  return DEFAULT_MAX_CHARS_BY_CAPABILITY[capability];
+}
+
+export function resolveMaxBytes(params: {
+  capability: MediaUnderstandingCapability;
+  entry: MediaUnderstandingModelConfig;
+  cfg: ClawdbotConfig;
+  config?: MediaUnderstandingConfig;
+}): number {
+  const configured =
+    params.entry.maxBytes ??
+    params.config?.maxBytes ??
+    params.cfg.tools?.media?.[params.capability]?.maxBytes;
+  if (typeof configured === "number") return configured;
+  return DEFAULT_MAX_BYTES[params.capability];
+}
+
+export function resolveCapabilityConfig(
+  cfg: ClawdbotConfig,
+  capability: MediaUnderstandingCapability,
+): MediaUnderstandingConfig | undefined {
+  return cfg.tools?.media?.[capability];
+}
+
+export function resolveScopeDecision(params: {
+  scope?: MediaUnderstandingScopeConfig;
+  ctx: MsgContext;
+}): "allow" | "deny" {
+  return resolveMediaUnderstandingScope({
+    scope: params.scope,
+    sessionKey: params.ctx.SessionKey,
+    channel: params.ctx.Surface ?? params.ctx.Provider,
+    chatType: normalizeMediaUnderstandingChatType(params.ctx.ChatType),
+  });
+}
+
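+// Capability inference for entries without an explicit `capabilities` list,
+// matching the documented defaults: CLI entries are assumed to handle all
+// three capabilities; openai/anthropic/minimax → image,
+// google → image+audio+video, groq → audio.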
+function inferCapabilities(
+  entry: MediaUnderstandingModelConfig,
+): MediaUnderstandingCapability[] | undefined {
+  if ((entry.type ?? (entry.command ? "cli" : "provider")) === "cli") {
+    return ["image", "audio", "video"];
+  }
+  const provider = normalizeMediaProviderId(entry.provider ?? "");
+  if (!provider) return undefined;
+  if (provider === "openai" || provider === "anthropic" || provider === "minimax") {
+    return ["image"];
+  }
+  if (provider === "google") {
+    return ["image", "audio", "video"];
+  }
+  if (provider === "groq") {
+    return ["audio"];
+  }
+  return undefined;
+}
+
+export function resolveModelEntries(params: {
+  cfg: ClawdbotConfig;
+  capability: MediaUnderstandingCapability;
+  config?: MediaUnderstandingConfig;
+}): MediaUnderstandingModelConfig[] {
+  const { cfg, capability, config } = params;
+  const sharedModels = cfg.tools?.media?.models ?? [];
+  const entries = [
+    ...(config?.models ?? []).map((entry) => ({ entry, source: "capability" as const })),
+    ...sharedModels.map((entry) => ({ entry, source: "shared" as const })),
+  ];
+  if (entries.length === 0) return [];
+
+  return entries
+    .filter(({ entry, source }) => {
+      const caps =
+        entry.capabilities && entry.capabilities.length > 0
+          ? entry.capabilities
+          : source === "shared"
+            ? inferCapabilities(entry)
+            : undefined;
+      if (!caps || caps.length === 0) {
+        if (source === "shared") {
+          if (shouldLogVerbose()) {
+            logVerbose(
+              `Skipping shared media model without capabilities: ${entry.provider ?? entry.command ?? "unknown"}`,
+            );
+          }
+          return false;
+        }
+        return true;
+      }
+      return caps.includes(capability);
+    })
+    .map(({ entry }) => entry);
+}
+
+export function resolveConcurrency(cfg: ClawdbotConfig): number {
+  const configured = cfg.tools?.media?.concurrency;
+  if (typeof configured === "number" && Number.isFinite(configured) && configured > 0) {
+    return Math.floor(configured);
+  }
+  return DEFAULT_MEDIA_CONCURRENCY;
+}
+
+export function resolveCapabilityEnabled(params: {
+  cfg: ClawdbotConfig;
+  config?: MediaUnderstandingConfig;
+}): boolean {
+  if (params.config?.enabled === false) return false;
+  const sharedModels = params.cfg.tools?.media?.models ?? [];
+  const hasModels = (params.config?.models?.length ?? 0) > 0 || sharedModels.length > 0;
+  if (!hasModels) return false;
+  return true;
+}
diff --git a/src/media-understanding/types.ts b/src/media-understanding/types.ts
index 85c897275..a74c79757 100644
--- a/src/media-understanding/types.ts
+++ b/src/media-understanding/types.ts
@@ -3,6 +3,8 @@ export type MediaUnderstandingKind =
   | "video.description"
   | "image.description";
 
+export type MediaUnderstandingCapability = "image" | "audio" | "video";
+
 export type MediaAttachment = {
   path?: string;
   url?: string;
@@ -55,8 +57,29 @@ export type VideoDescriptionResult = {
   model?: string;
 };
 
+export type ImageDescriptionRequest = {
+  buffer: Buffer;
+  fileName: string;
+  mime?: string;
+  model: string;
+  provider: string;
+  prompt?: string;
+  maxTokens?: number;
+  timeoutMs: number;
+  profile?: string;
+  preferredProfile?: string;
+  agentDir: string;
+  cfg: import("../config/config.js").ClawdbotConfig;
+};
+
+export type ImageDescriptionResult = {
+  text: string;
+  model?: string;
+};
+
 export type MediaUnderstandingProvider = {
   id: string;
   transcribeAudio?: (req: AudioTranscriptionRequest) => Promise<AudioTranscriptionResult>;
   describeVideo?: (req: VideoDescriptionRequest) => Promise<VideoDescriptionResult>;
+  describeImage?: (req: ImageDescriptionRequest) => Promise<ImageDescriptionResult>;
 };
diff --git a/src/media-understanding/video.ts b/src/media-understanding/video.ts
new file mode 100644
index 000000000..00773f40c
--- /dev/null
+++ b/src/media-understanding/video.ts
@@ -0,0 +1,10 @@
+import { DEFAULT_VIDEO_MAX_BASE64_BYTES } from "./defaults.js";
+
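+// Base64 inflates payloads by 4/3 (every 3 input bytes become 4 output chars),
+// so a 30MB video encodes to roughly 40MB. resolveVideoMaxBase64Bytes widens
+// the configured byte cap by that factor but clamps to the
+// DEFAULT_VIDEO_MAX_BASE64_BYTES ceiling (70MB).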
"unknown"}`, + ); + } + return false; + } + return true; + } + return caps.includes(capability); + }) + .map(({ entry }) => entry); +} + +export function resolveConcurrency(cfg: ClawdbotConfig): number { + const configured = cfg.tools?.media?.concurrency; + if (typeof configured === "number" && Number.isFinite(configured) && configured > 0) { + return Math.floor(configured); + } + return DEFAULT_MEDIA_CONCURRENCY; +} + +export function resolveCapabilityEnabled(params: { + cfg: ClawdbotConfig; + config?: MediaUnderstandingConfig; +}): boolean { + if (params.config?.enabled === false) return false; + const sharedModels = params.cfg.tools?.media?.models ?? []; + const hasModels = (params.config?.models?.length ?? 0) > 0 || sharedModels.length > 0; + if (!hasModels) return false; + return true; +} diff --git a/src/media-understanding/types.ts b/src/media-understanding/types.ts index 85c897275..a74c79757 100644 --- a/src/media-understanding/types.ts +++ b/src/media-understanding/types.ts @@ -3,6 +3,8 @@ export type MediaUnderstandingKind = | "video.description" | "image.description"; +export type MediaUnderstandingCapability = "image" | "audio" | "video"; + export type MediaAttachment = { path?: string; url?: string; @@ -55,8 +57,29 @@ export type VideoDescriptionResult = { model?: string; }; +export type ImageDescriptionRequest = { + buffer: Buffer; + fileName: string; + mime?: string; + model: string; + provider: string; + prompt?: string; + maxTokens?: number; + timeoutMs: number; + profile?: string; + preferredProfile?: string; + agentDir: string; + cfg: import("../config/config.js").ClawdbotConfig; +}; + +export type ImageDescriptionResult = { + text: string; + model?: string; +}; + export type MediaUnderstandingProvider = { id: string; transcribeAudio?: (req: AudioTranscriptionRequest) => Promise; describeVideo?: (req: VideoDescriptionRequest) => Promise; + describeImage?: (req: ImageDescriptionRequest) => Promise; }; diff --git a/src/media-understanding/video.ts b/src/media-understanding/video.ts new file mode 100644 index 000000000..00773f40c --- /dev/null +++ b/src/media-understanding/video.ts @@ -0,0 +1,10 @@ +import { DEFAULT_VIDEO_MAX_BASE64_BYTES } from "./defaults.js"; + +export function estimateBase64Size(bytes: number): number { + return Math.ceil(bytes / 3) * 4; +} + +export function resolveVideoMaxBase64Bytes(maxBytes: number): number { + const expanded = Math.floor(maxBytes * (4 / 3)); + return Math.min(expanded, DEFAULT_VIDEO_MAX_BASE64_BYTES); +} diff --git a/src/media/fetch.ts b/src/media/fetch.ts index 6ee706d97..727ab7a5d 100644 --- a/src/media/fetch.ts +++ b/src/media/fetch.ts @@ -8,6 +8,18 @@ type FetchMediaResult = { fileName?: string; }; +export type MediaFetchErrorCode = "max_bytes" | "http_error" | "fetch_failed"; + +export class MediaFetchError extends Error { + readonly code: MediaFetchErrorCode; + + constructor(code: MediaFetchErrorCode, message: string) { + super(message); + this.code = code; + this.name = "MediaFetchError"; + } +} + export type FetchLike = (input: RequestInfo | URL, init?: RequestInit) => Promise; type FetchMediaOptions = { @@ -62,7 +74,7 @@ export async function fetchRemoteMedia(options: FetchMediaOptions): Promise maxBytes) { - throw new Error( + throw new MediaFetchError( + "max_bytes", `Failed to fetch media from ${url}: content length ${length} exceeds maxBytes ${maxBytes}`, ); } @@ -128,7 +144,8 @@ async function readResponseWithLimit(res: Response, maxBytes: number): Promise maxBytes) { - throw new Error( + throw new MediaFetchError( + 
"max_bytes", `Failed to fetch media from ${res.url || "response"}: payload exceeds maxBytes ${maxBytes}`, ); } @@ -148,7 +165,8 @@ async function readResponseWithLimit(res: Response, maxBytes: number): Promise