From 1b973f75062104348d3dde1239120bdaa8213632 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sat, 17 Jan 2026 03:52:37 +0000 Subject: [PATCH] feat: add inbound media understanding Co-authored-by: Tristan Manchester --- CHANGELOG.md | 1 + docs/gateway/configuration-examples.md | 17 +- docs/gateway/configuration.md | 56 +- docs/gateway/doctor.md | 2 +- docs/nodes/audio.md | 73 +- docs/nodes/images.md | 12 +- docs/nodes/media-understanding.md | 217 +++++ src/agents/bash-tools.exec.ts | 26 +- src/auto-reply/media-note.test.ts | 16 + src/auto-reply/media-note.ts | 33 +- src/auto-reply/reply/get-reply-directives.ts | 1 + src/auto-reply/reply/get-reply-run.ts | 14 +- src/auto-reply/reply/get-reply.ts | 19 +- src/auto-reply/templating.ts | 4 + ...etection.rejects-routing-allowfrom.test.ts | 17 +- src/config/legacy.migrations.part-2.ts | 32 +- src/config/legacy.rules.ts | 2 +- src/config/legacy.shared.ts | 2 +- src/config/schema.ts | 24 +- src/config/types.messages.ts | 2 +- src/config/types.tools.ts | 82 +- src/config/zod-schema.agent-runtime.ts | 8 +- src/config/zod-schema.core.ts | 60 +- src/media-understanding/apply.test.ts | 258 ++++++ src/media-understanding/apply.ts | 804 ++++++++++++++++++ src/media-understanding/format.test.ts | 91 ++ src/media-understanding/format.ts | 77 ++ src/media-understanding/index.ts | 9 + .../providers/google/index.ts | 7 + .../providers/google/video.test.ts | 93 ++ .../providers/google/video.ts | 84 ++ .../providers/groq/index.ts | 13 + src/media-understanding/providers/index.ts | 35 + .../providers/openai/audio.test.ts | 86 ++ .../providers/openai/audio.ts | 61 ++ .../providers/openai/index.ts | 7 + src/media-understanding/providers/shared.ts | 33 + src/media-understanding/scope.test.ts | 46 + src/media-understanding/scope.ts | 54 ++ src/media-understanding/types.ts | 62 ++ src/media/fetch.test.ts | 47 + src/media/fetch.ts | 61 +- 42 files changed, 2547 insertions(+), 101 deletions(-) create mode 100644 docs/nodes/media-understanding.md create mode 100644 src/media-understanding/apply.test.ts create mode 100644 src/media-understanding/apply.ts create mode 100644 src/media-understanding/format.test.ts create mode 100644 src/media-understanding/format.ts create mode 100644 src/media-understanding/index.ts create mode 100644 src/media-understanding/providers/google/index.ts create mode 100644 src/media-understanding/providers/google/video.test.ts create mode 100644 src/media-understanding/providers/google/video.ts create mode 100644 src/media-understanding/providers/groq/index.ts create mode 100644 src/media-understanding/providers/index.ts create mode 100644 src/media-understanding/providers/openai/audio.test.ts create mode 100644 src/media-understanding/providers/openai/audio.ts create mode 100644 src/media-understanding/providers/openai/index.ts create mode 100644 src/media-understanding/providers/shared.ts create mode 100644 src/media-understanding/scope.test.ts create mode 100644 src/media-understanding/scope.ts create mode 100644 src/media-understanding/types.ts create mode 100644 src/media/fetch.test.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index b0c9ca8f9..7123c9353 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -89,6 +89,7 @@ - Docs: add Date & Time guide and update prompt/timezone configuration docs. - Messages: debounce rapid inbound messages across channels with per-connector overrides. (#971) — thanks @juanpablodlc. - Messages: allow media-only sends (CLI/tool) and show Telegram voice recording status for voice notes. (#957) — thanks @rdev. 
+- Media: add optional inbound media understanding for image/audio/video with provider + CLI fallbacks. (#1005) — thanks @tristanmanchester. - Auth/Status: keep auth profiles sticky per session (rotate on compaction/new), surface provider usage headers in `/status` and `clawdbot models status`, and update docs. - CLI: add `--json` output for `clawdbot daemon` lifecycle/install commands. - Memory: make `node-llama-cpp` an optional dependency (avoid Node 25 install failures) and improve local-embeddings fallback/errors. diff --git a/docs/gateway/configuration-examples.md b/docs/gateway/configuration-examples.md index 0aecc034d..cf95229c3 100644 --- a/docs/gateway/configuration-examples.md +++ b/docs/gateway/configuration-examples.md @@ -124,10 +124,21 @@ Save to `~/.clawdbot/clawdbot.json` and you can DM the bot from that number. // Tooling tools: { - audio: { - transcription: { - args: ["--model", "base", "{{MediaPath}}"], + media: { + audio: { + enabled: true, + maxBytes: 20971520, + models: [ + { provider: "openai", model: "whisper-1" }, + // Optional CLI fallback (Whisper binary): + // { type: "cli", command: "whisper", args: ["--model", "base", "{{MediaPath}}"] } + ], timeoutSeconds: 120 + }, + video: { + enabled: true, + maxBytes: 52428800, + models: [{ provider: "google", model: "gemini-3-flash-preview" }] } } }, diff --git a/docs/gateway/configuration.md b/docs/gateway/configuration.md index 5b85428b2..3e2b55d6c 100644 --- a/docs/gateway/configuration.md +++ b/docs/gateway/configuration.md @@ -1769,6 +1769,58 @@ Legacy: `tools.bash` is still accepted as an alias. - `tools.web.fetch.firecrawl.maxAgeMs` (optional) - `tools.web.fetch.firecrawl.timeoutSeconds` (optional) +`tools.media` configures inbound media understanding (image/audio/video): +- `tools.media.image` / `tools.media.audio` / `tools.media.video`: + - `enabled`: opt-out switch (default true). + - `prompt`: optional prompt override (image/video append a `maxChars` hint automatically). + - `maxChars`: max output characters (default 500 for image/video; unset for audio). + - `maxBytes`: max media size to send (defaults: image 10MB, audio 20MB, video 50MB). + - `timeoutSeconds`: request timeout (defaults: image 60s, audio 60s, video 120s). + - `language`: optional audio hint. + - `scope`: optional gating (first match wins) with `match.channel`, `match.chatType`, or `match.keyPrefix`. + - `models`: ordered list of model entries; failures or oversize media fall back to the next entry. +- Each `models[]` entry: + - Provider entry (`type: "provider"` or omitted): + - `provider`: API provider id (`openai`, `anthropic`, `google`/`gemini`, `groq`, etc). + - `model`: model id override (required for image; defaults to `whisper-1`/`whisper-large-v3-turbo` for audio providers, and `gemini-3-flash-preview` for video). + - `profile` / `preferredProfile`: auth profile selection. + - CLI entry (`type: "cli"`): + - `command`: executable to run. + - `args`: templated args (supports `{{MediaPath}}`, `{{Prompt}}`, `{{MaxChars}}`, etc). + - `capabilities`: optional list (`image`, `audio`, `video`) to gate a shared entry. + - `prompt`, `maxChars`, `maxBytes`, `timeoutSeconds`, `language` can be overridden per entry. + +If no models are configured (or `enabled: false`), understanding is skipped; the model still receives the original attachments. + +Provider auth follows the standard model auth order (auth profiles, env vars like `OPENAI_API_KEY`/`GROQ_API_KEY`/`GEMINI_API_KEY`, or `models.providers.*.apiKey`). 
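A minimal image-understanding sketch (model ids are illustrative, taken from the recommendations in the media-understanding doc; the CLI fallback entry is optional and its command/args should match whatever tool you actually have installed):
```json5
{
  tools: {
    media: {
      image: {
        enabled: true,
        maxBytes: 10485760,   // documented image default (10MB)
        maxChars: 500,        // documented image default
        models: [
          // Provider entry: any image-capable model in the registry works.
          { provider: "openai", model: "gpt-5.2" },
          // Optional CLI fallback (example invocation; adjust to your tool):
          {
            type: "cli",
            command: "gemini",
            args: [
              "-m", "gemini-3-flash",
              "--allowed-tools", "read_file",
              "Read the media at {{MediaPath}} and describe it in <= {{MaxChars}} characters."
            ]
          }
        ]
      }
    }
  }
}
```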
+ +Example: +```json5 +{ + tools: { + media: { + audio: { + enabled: true, + maxBytes: 20971520, + scope: { + default: "deny", + rules: [{ action: "allow", match: { chatType: "direct" } }] + }, + models: [ + { provider: "openai", model: "whisper-1" }, + { type: "cli", command: "whisper", args: ["--model", "base", "{{MediaPath}}"] } + ] + }, + video: { + enabled: true, + maxBytes: 52428800, + models: [{ provider: "google", model: "gemini-3-flash-preview" }] + } + } + } +} +``` + `agents.defaults.subagents` configures sub-agent defaults: - `model`: default model for spawned sub-agents (string or `{ primary, fallbacks }`). If omitted, sub-agents inherit the caller’s model unless overridden per agent or per call. - `maxConcurrent`: max concurrent sub-agent runs (default 1) @@ -2848,7 +2900,7 @@ clawdbot dns setup --apply ## Template variables -Template placeholders are expanded in `tools.audio.transcription.args` (and any future templated argument fields). +Template placeholders are expanded in `tools.media.*.models[].args` (and any future templated argument fields). | Variable | Description | |----------|-------------| @@ -2864,6 +2916,8 @@ Template placeholders are expanded in `tools.audio.transcription.args` (and any | `{{MediaPath}}` | Local media path (if downloaded) | | `{{MediaType}}` | Media type (image/audio/document/…) | | `{{Transcript}}` | Audio transcript (when enabled) | +| `{{Prompt}}` | Resolved media prompt for CLI entries | +| `{{MaxChars}}` | Resolved max output chars for CLI entries | | `{{ChatType}}` | `"direct"` or `"group"` | | `{{GroupSubject}}` | Group subject (best effort) | | `{{GroupMembers}}` | Group members preview (best effort) | diff --git a/docs/gateway/doctor.md b/docs/gateway/doctor.md index 0574566b0..b3c77e848 100644 --- a/docs/gateway/doctor.md +++ b/docs/gateway/doctor.md @@ -111,7 +111,7 @@ Current migrations: - `routing.bindings` → top-level `bindings` - `routing.agents`/`routing.defaultAgentId` → `agents.list` + `agents.list[].default` - `routing.agentToAgent` → `tools.agentToAgent` -- `routing.transcribeAudio` → `tools.audio.transcription` +- `routing.transcribeAudio` → `tools.media.audio.models` - `bindings[].match.accountID` → `bindings[].match.accountId` - `identity` → `agents.list[].identity` - `agent.*` → `agents.defaults` + `tools.*` (tools/elevated/exec/sandbox/subagents) diff --git a/docs/nodes/audio.md b/docs/nodes/audio.md index 349a8f212..b6019b26e 100644 --- a/docs/nodes/audio.md +++ b/docs/nodes/audio.md @@ -3,25 +3,59 @@ summary: "How inbound audio/voice notes are downloaded, transcribed, and injecte read_when: - Changing audio transcription or media handling --- -# Audio / Voice Notes — 2025-12-05 +# Audio / Voice Notes — 2026-01-17 ## What works -- **Optional transcription**: If `tools.audio.transcription` is set in `~/.clawdbot/clawdbot.json`, Clawdbot will: - 1) Download inbound audio to a temp path when WhatsApp only provides a URL. - 2) Run the configured CLI args (templated with `{{MediaPath}}`), expecting transcript on stdout. - 3) Replace `Body` with the transcript, set `{{Transcript}}`, and prepend the original media path plus a `Transcript:` section in the command prompt so models see both. - 4) Continue through the normal auto-reply pipeline (templating, sessions, Pi command). -- **Verbose logging**: In `--verbose`, we log when transcription runs and when the transcript replaces the body. 
+- **Media understanding (audio)**: If `tools.media.audio` is enabled and has `models`, Clawdbot: + 1) Locates the first audio attachment (local path or URL) and downloads it if needed. + 2) Enforces `maxBytes` before sending to each model entry. + 3) Runs the first eligible model entry in order (provider or CLI). + 4) If it fails or skips (size/timeout), it tries the next entry. + 5) On success, it replaces `Body` with an `[Audio]` block and sets `{{Transcript}}`. +- **Command parsing**: When transcription succeeds, `CommandBody`/`RawBody` are set to the transcript so slash commands still work. +- **Verbose logging**: In `--verbose`, we log when transcription runs and when it replaces the body. -## Config example (Whisper CLI) -Requires `whisper` CLI installed: +## Config examples + +### Provider + CLI fallback (OpenAI + Whisper CLI) ```json5 { tools: { - audio: { - transcription: { - args: ["--model", "base", "{{MediaPath}}"], - timeoutSeconds: 45 + media: { + audio: { + enabled: true, + maxBytes: 20971520, + models: [ + { provider: "openai", model: "whisper-1" }, + { + type: "cli", + command: "whisper", + args: ["--model", "base", "{{MediaPath}}"], + timeoutSeconds: 45 + } + ] + } + } + } +} +``` + +### Provider-only with scope gating +```json5 +{ + tools: { + media: { + audio: { + enabled: true, + scope: { + default: "allow", + rules: [ + { action: "deny", match: { chatType: "group" } } + ] + }, + models: [ + { provider: "openai", model: "whisper-1" } + ] } } } @@ -29,12 +63,13 @@ Requires `whisper` CLI installed: ``` ## Notes & limits -- We don’t ship a transcriber; you opt in with the Whisper CLI on your PATH. -- Size guard: inbound audio must be ≤5 MB (matches the temp media store and transcript pipeline). -- Outbound caps: web send supports audio/voice up to 16 MB (sent as a voice note with `ptt: true`). -- If transcription fails, we fall back to the original body/media note; replies still go through. -- Transcript is available to templates as `{{Transcript}}`; models get both the media path and a `Transcript:` block in the prompt when using command mode. +- Provider auth follows the standard model auth order (auth profiles, env vars, `models.providers.*.apiKey`). +- Default size cap is 20MB (`tools.media.audio.maxBytes`). Oversize audio is skipped for that model and the next entry is tried. +- Default `maxChars` for audio is **unset** (full transcript). Set `tools.media.audio.maxChars` or per-entry `maxChars` to trim output. +- Transcript is available to templates as `{{Transcript}}`. +- CLI stdout is capped (5MB); keep CLI output concise. ## Gotchas +- Scope rules use first-match wins. `chatType` is normalized to `direct`, `group`, or `room`. - Ensure your CLI exits 0 and prints plain text; JSON needs to be massaged via `jq -r .text`. -- Keep timeouts reasonable (`timeoutSeconds`, default 45s) to avoid blocking the reply queue. +- Keep timeouts reasonable (`timeoutSeconds`, default 60s) to avoid blocking the reply queue. diff --git a/docs/nodes/images.md b/docs/nodes/images.md index ff3664f7a..4d163e535 100644 --- a/docs/nodes/images.md +++ b/docs/nodes/images.md @@ -38,13 +38,23 @@ The WhatsApp channel runs via **Baileys Web**. This document captures the curren - `{{MediaUrl}}` pseudo-URL for the inbound media. - `{{MediaPath}}` local temp path written before running the command. - When a per-session Docker sandbox is enabled, inbound media is copied into the sandbox workspace and `MediaPath`/`MediaUrl` are rewritten to a relative path like `media/inbound/`. 
-- Audio transcription (if configured via `tools.audio.transcription`) runs before templating and can replace `Body` with the transcript. +- Media understanding (if configured via `tools.media.*`) runs before templating and can insert `[Image]`, `[Audio]`, and `[Video]` blocks into `Body`. + - Audio sets `{{Transcript}}` and uses the transcript for command parsing so slash commands still work. + - Video and image descriptions preserve any caption text for command parsing. +- Only the first matching image/audio/video attachment is processed; remaining attachments are left untouched. ## Limits & Errors +**Outbound send caps (WhatsApp web send)** - Images: ~6 MB cap after recompression. - Audio/voice/video: 16 MB cap; documents: 100 MB cap. - Oversize or unreadable media → clear error in logs and the reply is skipped. +**Media understanding caps (transcription/description)** +- Image default: 10 MB (`tools.media.image.maxBytes`). +- Audio default: 20 MB (`tools.media.audio.maxBytes`). +- Video default: 50 MB (`tools.media.video.maxBytes`). +- Oversize media skips understanding, but replies still go through with the original body. + ## Notes for Tests - Cover send + reply flows for image/audio/document cases. - Validate recompression for images (size bound) and voice-note flag for audio. diff --git a/docs/nodes/media-understanding.md b/docs/nodes/media-understanding.md new file mode 100644 index 000000000..fe8e69a52 --- /dev/null +++ b/docs/nodes/media-understanding.md @@ -0,0 +1,217 @@ +--- +summary: "Inbound image/audio/video understanding (optional) with provider + CLI fallbacks" +read_when: + - Designing or refactoring media understanding + - Tuning inbound audio/video/image preprocessing +--- +# Media Understanding (Inbound) — 2026-01-17 + +Clawdbot can optionally **summarize inbound media** (image/audio/video) before the reply pipeline runs. This is **opt-in** and separate from the base attachment flow—if understanding is off, models still receive the original files/URLs as usual. + +## Goals +- Optional: pre‑digest inbound media into short text for faster routing + better command parsing. +- Preserve original media delivery to the model (always). +- Support **provider APIs** and **CLI fallbacks**. +- Allow multiple models with ordered fallback (error/size/timeout). + +## High‑level behavior +1) Collect inbound attachments (`MediaPaths`, `MediaUrls`, `MediaTypes`). +2) For each enabled capability (image/audio/video), pick the **first matching attachment**. +3) Choose the first eligible model entry (size + capability + auth). +4) If a model fails or the media is too large, **fall back to the next entry**. +5) On success: + - `Body` becomes `[Image]`, `[Audio]`, or `[Video]` block. + - Audio sets `{{Transcript}}` and `CommandBody`/`RawBody` for command parsing. + - Captions are preserved as `User text:` inside the block. + +If understanding fails or is disabled, **the reply flow continues** with the original body + attachments. + +## Config overview +Use **per‑capability configs** under `tools.media`. 
Each capability can define: +- defaults (`prompt`, `maxChars`, `maxBytes`, `timeoutSeconds`, `language`) +- **ordered `models` list** (fallback order) +- `scope` (optional gating by channel/chatType/session key) + +```json5 +{ + tools: { + media: { + image: { /* config */ }, + audio: { /* config */ }, + video: { /* config */ } + } + } +} +``` + +### Model entries +Each `models[]` entry can be **provider** or **CLI**: + +```json5 +{ + type: "provider", // default if omitted + provider: "openai", + model: "gpt-5.2", + prompt: "Describe the image in <= 500 chars.", + maxChars: 500, + maxBytes: 10485760, + timeoutSeconds: 60, + capabilities: ["image"], // optional, used for multi‑modal entries + profile: "vision-profile", + preferredProfile: "vision-fallback" +} +``` + +```json5 +{ + type: "cli", + command: "gemini", + args: [ + "-m", + "gemini-3-flash", + "--allowed-tools", + "read_file", + "Read the media at {{MediaPath}} and describe it in <= {{MaxChars}} characters." + ], + maxChars: 500, + maxBytes: 52428800, + timeoutSeconds: 120, + capabilities: ["video", "image"] +} +``` + +## Defaults and limits +Recommended defaults: +- `maxChars`: **500** for image/video (short, command‑friendly) +- `maxChars`: **unset** for audio (full transcript unless you set a limit) +- `maxBytes`: + - image: **10MB** + - audio: **20MB** + - video: **50MB** + +Rules: +- If media exceeds `maxBytes`, that model is skipped and the **next model is tried**. +- If the model returns more than `maxChars`, output is trimmed. +- `prompt` defaults to simple “Describe the {media}.” plus the `maxChars` guidance (image/video only). + +## Capabilities (optional) +If you set `capabilities`, the entry only runs for those media types. Suggested +defaults when you opt in: +- `openai`, `anthropic`: **image** +- `google` (Gemini API): **image + audio + video** +- CLI entries: declare the exact capabilities you support. + +If you omit `capabilities`, the entry is eligible for the list it appears in. + +## Provider support matrix (Clawdbot integrations) +| Capability | Provider integration | Notes | +|------------|----------------------|-------| +| Image | OpenAI / Anthropic / Google / others via `pi-ai` | Any image-capable model in the registry works. | +| Audio | OpenAI, Groq | Provider transcription (Whisper). | +| Video | Google (Gemini API) | Provider video understanding. | + +## Recommended providers +**Image** +- Prefer your active model if it supports images. +- Good defaults: `openai/gpt-5.2`, `anthropic/claude-opus-4-5`, `google/gemini-3-pro-preview`. + +**Audio** +- `openai/whisper-1` or `groq/whisper-large-v3-turbo`. +- CLI fallback: `whisper` binary. + +**Video** +- `google/gemini-3-flash-preview` (fast), `google/gemini-3-pro-preview` (richer). +- CLI fallback: `gemini` CLI (supports `read_file` on video/audio). + +## Config examples + +### 1) Audio + Video only (image off) +```json5 +{ + tools: { + media: { + audio: { + enabled: true, + models: [ + { provider: "openai", model: "whisper-1" }, + { + type: "cli", + command: "whisper", + args: ["--model", "base", "{{MediaPath}}"] + } + ] + }, + video: { + enabled: true, + maxChars: 500, + models: [ + { provider: "google", model: "gemini-3-flash-preview" }, + { + type: "cli", + command: "gemini", + args: [ + "-m", + "gemini-3-flash", + "--allowed-tools", + "read_file", + "Read the media at {{MediaPath}} and describe it in <= {{MaxChars}} characters." 
+ ] + } + ] + } + } + } +} +``` + +### 2) Optional image understanding +```json5 +{ + tools: { + media: { + image: { + enabled: true, + maxBytes: 10485760, + maxChars: 500, + models: [ + { provider: "openai", model: "gpt-5.2" }, + { provider: "anthropic", model: "claude-opus-4-5" }, + { + type: "cli", + command: "gemini", + args: [ + "-m", + "gemini-3-flash", + "--allowed-tools", + "read_file", + "Read the media at {{MediaPath}} and describe it in <= {{MaxChars}} characters." + ] + } + ] + } + } + } +} +``` + +### 3) Multi‑modal single entry (explicit capabilities) +```json5 +{ + tools: { + media: { + image: { models: [{ provider: "google", model: "gemini-3-pro-preview", capabilities: ["image", "video", "audio"] }] }, + audio: { models: [{ provider: "google", model: "gemini-3-pro-preview", capabilities: ["image", "video", "audio"] }] }, + video: { models: [{ provider: "google", model: "gemini-3-pro-preview", capabilities: ["image", "video", "audio"] }] } + } + } +} +``` + +## Notes +- Understanding is **best‑effort**. Errors do not block replies. +- Attachments are still passed to models even when understanding is disabled. +- Use `scope` to limit where understanding runs (e.g. only DMs). + +## Related docs +- [Configuration](/gateway/configuration) +- [Image & Media Support](/nodes/images) diff --git a/src/agents/bash-tools.exec.ts b/src/agents/bash-tools.exec.ts index d97b0e653..5ef6ccd15 100644 --- a/src/agents/bash-tools.exec.ts +++ b/src/agents/bash-tools.exec.ts @@ -254,7 +254,10 @@ export function createExecTool( let yielded = false; let yieldTimer: NodeJS.Timeout | null = null; let timeoutTimer: NodeJS.Timeout | null = null; + let timeoutFinalizeTimer: NodeJS.Timeout | null = null; let timedOut = false; + const timeoutFinalizeMs = 1000; + let rejectFn: ((err: Error) => void) | null = null; const settle = (fn: () => void) => { if (settled) return; @@ -262,6 +265,18 @@ export function createExecTool( fn(); }; + const effectiveTimeout = + typeof params.timeout === "number" ? params.timeout : defaultTimeoutSec; + const finalizeTimeout = () => { + if (session.exited) return; + markExited(session, null, "SIGKILL", "failed"); + if (settled || !rejectFn) return; + const aggregated = session.aggregated.trim(); + const reason = `Command timed out after ${effectiveTimeout} seconds`; + const message = aggregated ? `${aggregated}\n\n${reason}` : reason; + settle(() => rejectFn?.(new Error(message))); + }; + // Tool-call abort should not kill backgrounded sessions; timeouts still must. const onAbortSignal = () => { if (yielded || session.backgrounded) return; @@ -272,15 +287,17 @@ export function createExecTool( const onTimeout = () => { timedOut = true; killSession(session); + if (!timeoutFinalizeTimer) { + timeoutFinalizeTimer = setTimeout(() => { + finalizeTimeout(); + }, timeoutFinalizeMs); + } }; if (signal?.aborted) onAbortSignal(); else if (signal) { signal.addEventListener("abort", onAbortSignal, { once: true }); } - - const effectiveTimeout = - typeof params.timeout === "number" ? 
params.timeout : defaultTimeoutSec; if (effectiveTimeout > 0) { timeoutTimer = setTimeout(() => { onTimeout(); @@ -321,6 +338,7 @@ export function createExecTool( }); return new Promise>((resolve, reject) => { + rejectFn = reject; const resolveRunning = () => { settle(() => resolve({ @@ -369,6 +387,7 @@ export function createExecTool( const handleExit = (code: number | null, exitSignal: NodeJS.Signals | number | null) => { if (yieldTimer) clearTimeout(yieldTimer); if (timeoutTimer) clearTimeout(timeoutTimer); + if (timeoutFinalizeTimer) clearTimeout(timeoutFinalizeTimer); const durationMs = Date.now() - startedAt; const wasSignal = exitSignal != null; const isSuccess = code === 0 && !wasSignal && !signal?.aborted && !timedOut; @@ -421,6 +440,7 @@ export function createExecTool( child.once("error", (err) => { if (yieldTimer) clearTimeout(yieldTimer); if (timeoutTimer) clearTimeout(timeoutTimer); + if (timeoutFinalizeTimer) clearTimeout(timeoutFinalizeTimer); markExited(session, null, null, "failed"); settle(() => reject(err)); }); diff --git a/src/auto-reply/media-note.test.ts b/src/auto-reply/media-note.test.ts index 3e6dd3527..0dd403386 100644 --- a/src/auto-reply/media-note.test.ts +++ b/src/auto-reply/media-note.test.ts @@ -25,4 +25,20 @@ describe("buildInboundMediaNote", () => { ].join("\n"), ); }); + + it("skips media notes for attachments with understanding output", () => { + const note = buildInboundMediaNote({ + MediaPaths: ["/tmp/a.png", "/tmp/b.png"], + MediaUrls: ["https://example.com/a.png", "https://example.com/b.png"], + MediaUnderstanding: [ + { + kind: "audio.transcription", + attachmentIndex: 0, + text: "hello", + provider: "groq", + }, + ], + }); + expect(note).toBe("[media attached: /tmp/b.png | https://example.com/b.png]"); + }); }); diff --git a/src/auto-reply/media-note.ts b/src/auto-reply/media-note.ts index d08e7011d..aafadb999 100644 --- a/src/auto-reply/media-note.ts +++ b/src/auto-reply/media-note.ts @@ -18,6 +18,12 @@ function formatMediaAttachedLine(params: { } export function buildInboundMediaNote(ctx: MsgContext): string | undefined { + // Attachment indices follow MediaPaths/MediaUrls ordering as supplied by the channel. + const suppressed = new Set( + Array.isArray(ctx.MediaUnderstanding) + ? ctx.MediaUnderstanding.map((output) => output.attachmentIndex) + : [], + ); const pathsFromArray = Array.isArray(ctx.MediaPaths) ? ctx.MediaPaths : undefined; const paths = pathsFromArray && pathsFromArray.length > 0 @@ -36,24 +42,33 @@ export function buildInboundMediaNote(ctx: MsgContext): string | undefined { ? ctx.MediaTypes : undefined; - if (paths.length === 1) { + const entries = paths + .map((entry, index) => ({ + path: entry ?? "", + type: types?.[index] ?? ctx.MediaType, + url: urls?.[index] ?? ctx.MediaUrl, + index, + })) + .filter((entry) => !suppressed.has(entry.index)); + if (entries.length === 0) return undefined; + if (entries.length === 1) { return formatMediaAttachedLine({ - path: paths[0] ?? "", - type: types?.[0] ?? ctx.MediaType, - url: urls?.[0] ?? ctx.MediaUrl, + path: entries[0]?.path ?? 
"", + type: entries[0]?.type, + url: entries[0]?.url, }); } - const count = paths.length; + const count = entries.length; const lines: string[] = [`[media attached: ${count} files]`]; - for (const [idx, mediaPath] of paths.entries()) { + for (const [idx, entry] of entries.entries()) { lines.push( formatMediaAttachedLine({ - path: mediaPath, + path: entry.path, index: idx + 1, total: count, - type: types?.[idx], - url: urls?.[idx], + type: entry.type, + url: entry.url, }), ); } diff --git a/src/auto-reply/reply/get-reply-directives.ts b/src/auto-reply/reply/get-reply-directives.ts index f0c062d4b..27ec6144f 100644 --- a/src/auto-reply/reply/get-reply-directives.ts +++ b/src/auto-reply/reply/get-reply-directives.ts @@ -122,6 +122,7 @@ export async function resolveReplyDirectives(params: { const commandSource = sessionCtx.CommandBody ?? sessionCtx.RawBody ?? + sessionCtx.Transcript ?? sessionCtx.BodyStripped ?? sessionCtx.Body ?? ""; diff --git a/src/auto-reply/reply/get-reply-run.ts b/src/auto-reply/reply/get-reply-run.ts index ea539d341..a87877ad5 100644 --- a/src/auto-reply/reply/get-reply-run.ts +++ b/src/auto-reply/reply/get-reply-run.ts @@ -87,7 +87,6 @@ type RunPreparedReplyParams = { cap?: number; dropPolicy?: InlineDirectives["dropPolicy"]; }; - transcribedText?: string; typing: TypingController; opts?: GetReplyOptions; defaultModel: string; @@ -210,7 +209,6 @@ export async function runPreparedReply( model, perMessageQueueMode, perMessageQueueOptions, - transcribedText, typing, opts, defaultModel, @@ -325,11 +323,7 @@ export async function runPreparedReply( sessionEntry = skillResult.sessionEntry ?? sessionEntry; currentSystemSent = skillResult.systemSent; const skillsSnapshot = skillResult.skillsSnapshot; - const prefixedBody = transcribedText - ? [threadStarterNote, prefixedBodyBase, `Transcript:\n${transcribedText}`] - .filter(Boolean) - .join("\n\n") - : [threadStarterNote, prefixedBodyBase].filter(Boolean).join("\n\n"); + const prefixedBody = [threadStarterNote, prefixedBodyBase].filter(Boolean).join("\n\n"); const mediaNote = buildInboundMediaNote(ctx); const mediaReplyHint = mediaNote ? "To send an image back, add a line like: MEDIA:https://example.com/image.jpg (no spaces). Keep caption in the text body." @@ -370,11 +364,7 @@ export async function runPreparedReply( } const sessionIdFinal = sessionId ?? crypto.randomUUID(); const sessionFile = resolveSessionFilePath(sessionIdFinal, sessionEntry); - const queueBodyBase = transcribedText - ? [threadStarterNote, baseBodyFinal, `Transcript:\n${transcribedText}`] - .filter(Boolean) - .join("\n\n") - : [threadStarterNote, baseBodyFinal].filter(Boolean).join("\n\n"); + const queueBodyBase = [threadStarterNote, baseBodyFinal].filter(Boolean).join("\n\n"); const queuedBody = mediaNote ? 
[mediaNote, mediaReplyHint, queueBodyBase].filter(Boolean).join("\n").trim() : queueBodyBase; diff --git a/src/auto-reply/reply/get-reply.ts b/src/auto-reply/reply/get-reply.ts index 11b0c7e5b..9bd2cdbdf 100644 --- a/src/auto-reply/reply/get-reply.ts +++ b/src/auto-reply/reply/get-reply.ts @@ -7,12 +7,11 @@ import { resolveModelRefFromString } from "../../agents/model-selection.js"; import { resolveAgentTimeoutMs } from "../../agents/timeout.js"; import { DEFAULT_AGENT_WORKSPACE_DIR, ensureAgentWorkspace } from "../../agents/workspace.js"; import { type ClawdbotConfig, loadConfig } from "../../config/config.js"; -import { logVerbose } from "../../globals.js"; import { defaultRuntime } from "../../runtime.js"; import { resolveCommandAuthorization } from "../command-auth.js"; import type { MsgContext } from "../templating.js"; import { SILENT_REPLY_TOKEN } from "../tokens.js"; -import { hasAudioTranscriptionConfig, isAudio, transcribeInboundAudio } from "../transcription.js"; +import { applyMediaUnderstanding } from "../../media-understanding/apply.js"; import type { GetReplyOptions, ReplyPayload } from "../types.js"; import { resolveDefaultModel } from "./directive-handling.js"; import { resolveReplyDirectives } from "./get-reply-directives.js"; @@ -75,16 +74,11 @@ export async function getReplyFromConfig( }); opts?.onTypingController?.(typing); - let transcribedText: string | undefined; - if (hasAudioTranscriptionConfig(cfg) && isAudio(ctx.MediaType)) { - const transcribed = await transcribeInboundAudio(cfg, ctx, defaultRuntime); - if (transcribed?.text) { - transcribedText = transcribed.text; - ctx.Body = transcribed.text; - ctx.Transcript = transcribed.text; - logVerbose("Replaced Body with audio transcript for reply flow"); - } - } + await applyMediaUnderstanding({ + ctx, + cfg, + agentDir, + }); const commandAuthorized = ctx.CommandAuthorized ?? true; resolveCommandAuthorization({ @@ -253,7 +247,6 @@ export async function getReplyFromConfig( model, perMessageQueueMode, perMessageQueueOptions, - transcribedText, typing, opts, defaultModel, diff --git a/src/auto-reply/templating.ts b/src/auto-reply/templating.ts index 20c2ef6b8..7bf105b86 100644 --- a/src/auto-reply/templating.ts +++ b/src/auto-reply/templating.ts @@ -1,6 +1,7 @@ import type { ChannelId } from "../channels/plugins/types.js"; import type { InternalMessageChannel } from "../utils/message-channel.js"; import type { CommandArgs } from "./commands-registry.types.js"; +import type { MediaUnderstandingOutput } from "../media-understanding/types.js"; /** Valid message channels for routing. */ export type OriginatingChannelType = ChannelId | InternalMessageChannel; @@ -41,6 +42,9 @@ export type MsgContext = { /** Remote host for SCP when media lives on a different machine (e.g., clawdbot@192.168.64.3). 
*/ MediaRemoteHost?: string; Transcript?: string; + MediaUnderstanding?: MediaUnderstandingOutput[]; + Prompt?: string; + MaxChars?: number; ChatType?: string; GroupSubject?: string; GroupRoom?: string; diff --git a/src/config/config.legacy-config-detection.rejects-routing-allowfrom.test.ts b/src/config/config.legacy-config-detection.rejects-routing-allowfrom.test.ts index d0a01b55c..641a1e85b 100644 --- a/src/config/config.legacy-config-detection.rejects-routing-allowfrom.test.ts +++ b/src/config/config.legacy-config-detection.rejects-routing-allowfrom.test.ts @@ -65,7 +65,7 @@ describe("legacy config detection", () => { expect(res.config?.messages?.groupChat?.mentionPatterns).toEqual(["@clawd"]); expect(res.config?.routing?.groupChat?.mentionPatterns).toBeUndefined(); }); - it("migrates routing agentToAgent/queue/transcribeAudio to tools/messages/audio", async () => { + it("migrates routing agentToAgent/queue/transcribeAudio to tools/messages/media", async () => { vi.resetModules(); const { migrateLegacyConfig } = await import("./config.js"); const res = migrateLegacyConfig({ @@ -80,7 +80,7 @@ describe("legacy config detection", () => { }); expect(res.changes).toContain("Moved routing.agentToAgent → tools.agentToAgent."); expect(res.changes).toContain("Moved routing.queue → messages.queue."); - expect(res.changes).toContain("Moved routing.transcribeAudio → tools.audio.transcription."); + expect(res.changes).toContain("Moved routing.transcribeAudio → tools.media.audio.models."); expect(res.config?.tools?.agentToAgent).toEqual({ enabled: true, allow: ["main"], @@ -89,9 +89,16 @@ describe("legacy config detection", () => { mode: "queue", cap: 3, }); - expect(res.config?.tools?.audio?.transcription).toEqual({ - args: ["--model", "base"], - timeoutSeconds: 2, + expect(res.config?.tools?.media?.audio).toEqual({ + enabled: true, + models: [ + { + command: "whisper", + type: "cli", + args: ["--model", "base"], + timeoutSeconds: 2, + }, + ], }); expect(res.config?.routing).toBeUndefined(); }); diff --git a/src/config/legacy.migrations.part-2.ts b/src/config/legacy.migrations.part-2.ts index e48d1b1f8..c161fb5b9 100644 --- a/src/config/legacy.migrations.part-2.ts +++ b/src/config/legacy.migrations.part-2.ts @@ -330,14 +330,17 @@ export const LEGACY_CONFIG_MIGRATIONS_PART_2: LegacyConfigMigration[] = [ const mapped = mapLegacyAudioTranscription(routing.transcribeAudio); if (mapped) { const tools = ensureRecord(raw, "tools"); - const toolsAudio = ensureRecord(tools, "audio"); - if (toolsAudio.transcription === undefined) { - toolsAudio.transcription = mapped; - changes.push("Moved routing.transcribeAudio → tools.audio.transcription."); + const media = ensureRecord(tools, "media"); + const mediaAudio = ensureRecord(media, "audio"); + const models = Array.isArray(mediaAudio.models) + ? 
(mediaAudio.models as unknown[]) + : []; + if (models.length === 0) { + mediaAudio.enabled = true; + mediaAudio.models = [mapped]; + changes.push("Moved routing.transcribeAudio → tools.media.audio.models."); } else { - changes.push( - "Removed routing.transcribeAudio (tools.audio.transcription already set).", - ); + changes.push("Removed routing.transcribeAudio (tools.media.audio.models already set)."); } } else { changes.push("Removed routing.transcribeAudio (unsupported transcription CLI)."); @@ -350,12 +353,17 @@ export const LEGACY_CONFIG_MIGRATIONS_PART_2: LegacyConfigMigration[] = [ const mapped = mapLegacyAudioTranscription(audio.transcription); if (mapped) { const tools = ensureRecord(raw, "tools"); - const toolsAudio = ensureRecord(tools, "audio"); - if (toolsAudio.transcription === undefined) { - toolsAudio.transcription = mapped; - changes.push("Moved audio.transcription → tools.audio.transcription."); + const media = ensureRecord(tools, "media"); + const mediaAudio = ensureRecord(media, "audio"); + const models = Array.isArray(mediaAudio.models) + ? (mediaAudio.models as unknown[]) + : []; + if (models.length === 0) { + mediaAudio.enabled = true; + mediaAudio.models = [mapped]; + changes.push("Moved audio.transcription → tools.media.audio.models."); } else { - changes.push("Removed audio.transcription (tools.audio.transcription already set)."); + changes.push("Removed audio.transcription (tools.media.audio.models already set)."); } delete audio.transcription; if (Object.keys(audio).length === 0) delete raw.audio; diff --git a/src/config/legacy.rules.ts b/src/config/legacy.rules.ts index b04a74783..31bc48037 100644 --- a/src/config/legacy.rules.ts +++ b/src/config/legacy.rules.ts @@ -69,7 +69,7 @@ export const LEGACY_CONFIG_RULES: LegacyConfigRule[] = [ { path: ["routing", "transcribeAudio"], message: - "routing.transcribeAudio was moved; use tools.audio.transcription instead (auto-migrated on load).", + "routing.transcribeAudio was moved; use tools.media.audio.models instead (auto-migrated on load).", }, { path: ["telegram", "requireMention"], diff --git a/src/config/legacy.shared.ts b/src/config/legacy.shared.ts index 567ca308f..d2d361f00 100644 --- a/src/config/legacy.shared.ts +++ b/src/config/legacy.shared.ts @@ -56,7 +56,7 @@ export const mapLegacyAudioTranscription = (value: unknown): Record = {}; + const result: Record = { command: rawExecutable, type: "cli" }; if (args.length > 0) result.args = args; if (timeoutSeconds !== undefined) result.timeoutSeconds = timeoutSeconds; return result; diff --git a/src/config/schema.ts b/src/config/schema.ts index f3ea4211b..a7a9b60a6 100644 --- a/src/config/schema.ts +++ b/src/config/schema.ts @@ -102,8 +102,28 @@ const FIELD_LABELS: Record = { "gateway.remote.password": "Remote Gateway Password", "gateway.auth.token": "Gateway Token", "gateway.auth.password": "Gateway Password", - "tools.audio.transcription.args": "Audio Transcription Args", - "tools.audio.transcription.timeoutSeconds": "Audio Transcription Timeout (sec)", + "tools.media.image.enabled": "Enable Image Understanding", + "tools.media.image.maxBytes": "Image Understanding Max Bytes", + "tools.media.image.maxChars": "Image Understanding Max Chars", + "tools.media.image.prompt": "Image Understanding Prompt", + "tools.media.image.timeoutSeconds": "Image Understanding Timeout (sec)", + "tools.media.image.models": "Image Understanding Models", + "tools.media.image.scope": "Image Understanding Scope", + "tools.media.audio.enabled": "Enable Audio Understanding", + 
"tools.media.audio.maxBytes": "Audio Understanding Max Bytes", + "tools.media.audio.maxChars": "Audio Understanding Max Chars", + "tools.media.audio.prompt": "Audio Understanding Prompt", + "tools.media.audio.timeoutSeconds": "Audio Understanding Timeout (sec)", + "tools.media.audio.language": "Audio Understanding Language", + "tools.media.audio.models": "Audio Understanding Models", + "tools.media.audio.scope": "Audio Understanding Scope", + "tools.media.video.enabled": "Enable Video Understanding", + "tools.media.video.maxBytes": "Video Understanding Max Bytes", + "tools.media.video.maxChars": "Video Understanding Max Chars", + "tools.media.video.prompt": "Video Understanding Prompt", + "tools.media.video.timeoutSeconds": "Video Understanding Timeout (sec)", + "tools.media.video.models": "Video Understanding Models", + "tools.media.video.scope": "Video Understanding Scope", "tools.profile": "Tool Profile", "agents.list[].tools.profile": "Agent Tool Profile", "tools.byProvider": "Tool Policy by Provider", diff --git a/src/config/types.messages.ts b/src/config/types.messages.ts index 40ed2d614..691ca617a 100644 --- a/src/config/types.messages.ts +++ b/src/config/types.messages.ts @@ -47,7 +47,7 @@ export type BroadcastConfig = { }; export type AudioConfig = { - /** @deprecated Use tools.audio.transcription instead. */ + /** @deprecated Use tools.media.audio.models instead. */ transcription?: { // Optional CLI to turn inbound audio into text; templated args, must output transcript to stdout. command: string[]; diff --git a/src/config/types.tools.ts b/src/config/types.tools.ts index 5c87a3f89..80f7dab8e 100644 --- a/src/config/types.tools.ts +++ b/src/config/types.tools.ts @@ -1,4 +1,76 @@ -import type { AgentElevatedAllowFromConfig } from "./types.base.js"; +import type { AgentElevatedAllowFromConfig, SessionSendPolicyAction } from "./types.base.js"; + +export type MediaUnderstandingScopeMatch = { + channel?: string; + chatType?: "direct" | "group" | "room"; + keyPrefix?: string; +}; + +export type MediaUnderstandingScopeRule = { + action: SessionSendPolicyAction; + match?: MediaUnderstandingScopeMatch; +}; + +export type MediaUnderstandingScopeConfig = { + default?: SessionSendPolicyAction; + rules?: MediaUnderstandingScopeRule[]; +}; + +export type MediaUnderstandingCapability = "image" | "audio" | "video"; + +export type MediaUnderstandingModelConfig = { + /** provider API id (e.g. openai, google). */ + provider?: string; + /** Model id for provider-based understanding. */ + model?: string; + /** Optional capability tags for shared model lists. */ + capabilities?: MediaUnderstandingCapability[]; + /** Use a CLI command instead of provider API. */ + type?: "provider" | "cli"; + /** CLI binary (required when type=cli). */ + command?: string; + /** CLI args (template-enabled). */ + args?: string[]; + /** Optional prompt override for this model entry. */ + prompt?: string; + /** Optional max output characters for this model entry. */ + maxChars?: number; + /** Optional max bytes for this model entry. */ + maxBytes?: number; + /** Optional timeout override (seconds) for this model entry. */ + timeoutSeconds?: number; + /** Optional language hint for audio transcription. */ + language?: string; + /** Auth profile id to use for this provider. */ + profile?: string; + /** Preferred profile id if multiple are available. */ + preferredProfile?: string; +}; + +export type MediaUnderstandingConfig = { + /** Enable media understanding when models are configured. 
*/ + enabled?: boolean; + /** Optional scope gating for understanding. */ + scope?: MediaUnderstandingScopeConfig; + /** Default max bytes to send. */ + maxBytes?: number; + /** Default max output characters. */ + maxChars?: number; + /** Default prompt. */ + prompt?: string; + /** Default timeout (seconds). */ + timeoutSeconds?: number; + /** Default language hint (audio). */ + language?: string; + /** Ordered model list (fallbacks in order). */ + models?: MediaUnderstandingModelConfig[]; +}; + +export type MediaToolsConfig = { + image?: MediaUnderstandingConfig; + audio?: MediaUnderstandingConfig; + video?: MediaUnderstandingConfig; +}; export type ToolProfileId = "minimal" | "coding" | "messaging" | "full"; @@ -127,13 +199,7 @@ export type ToolsConfig = { }; }; }; - audio?: { - transcription?: { - /** CLI args (template-enabled). */ - args?: string[]; - timeoutSeconds?: number; - }; - }; + media?: MediaToolsConfig; /** Message tool configuration. */ message?: { /** diff --git a/src/config/zod-schema.agent-runtime.ts b/src/config/zod-schema.agent-runtime.ts index 5946b1b06..6032530aa 100644 --- a/src/config/zod-schema.agent-runtime.ts +++ b/src/config/zod-schema.agent-runtime.ts @@ -5,7 +5,7 @@ import { GroupChatSchema, HumanDelaySchema, IdentitySchema, - ToolsAudioTranscriptionSchema, + ToolsMediaSchema, } from "./zod-schema.core.js"; export const HeartbeatSchema = z @@ -283,11 +283,7 @@ export const ToolsSchema = z deny: z.array(z.string()).optional(), byProvider: z.record(z.string(), ToolPolicyWithProfileSchema).optional(), web: ToolsWebSchema, - audio: z - .object({ - transcription: ToolsAudioTranscriptionSchema, - }) - .optional(), + media: ToolsMediaSchema, message: z .object({ allowCrossContextSend: z.boolean().optional(), diff --git a/src/config/zod-schema.core.ts b/src/config/zod-schema.core.ts index 78327f120..330d55338 100644 --- a/src/config/zod-schema.core.ts +++ b/src/config/zod-schema.core.ts @@ -240,10 +240,68 @@ export const ExecutableTokenSchema = z .string() .refine(isSafeExecutableValue, "expected safe executable name or path"); -export const ToolsAudioTranscriptionSchema = z +export const MediaUnderstandingScopeSchema = z .object({ + default: z.union([z.literal("allow"), z.literal("deny")]).optional(), + rules: z + .array( + z.object({ + action: z.union([z.literal("allow"), z.literal("deny")]), + match: z + .object({ + channel: z.string().optional(), + chatType: z + .union([z.literal("direct"), z.literal("group"), z.literal("room")]) + .optional(), + keyPrefix: z.string().optional(), + }) + .optional(), + }), + ) + .optional(), + }) + .optional(); + +export const MediaUnderstandingCapabilitiesSchema = z + .array(z.union([z.literal("image"), z.literal("audio"), z.literal("video")])) + .optional(); + +export const MediaUnderstandingModelSchema = z + .object({ + provider: z.string().optional(), + model: z.string().optional(), + capabilities: MediaUnderstandingCapabilitiesSchema, + type: z.union([z.literal("provider"), z.literal("cli")]).optional(), + command: z.string().optional(), args: z.array(z.string()).optional(), + prompt: z.string().optional(), + maxChars: z.number().int().positive().optional(), + maxBytes: z.number().int().positive().optional(), timeoutSeconds: z.number().int().positive().optional(), + language: z.string().optional(), + profile: z.string().optional(), + preferredProfile: z.string().optional(), + }) + .optional(); + +export const ToolsMediaUnderstandingSchema = z + .object({ + enabled: z.boolean().optional(), + scope: 
MediaUnderstandingScopeSchema, + maxBytes: z.number().int().positive().optional(), + maxChars: z.number().int().positive().optional(), + prompt: z.string().optional(), + timeoutSeconds: z.number().int().positive().optional(), + language: z.string().optional(), + models: z.array(MediaUnderstandingModelSchema).optional(), + }) + .optional(); + +export const ToolsMediaSchema = z + .object({ + image: ToolsMediaUnderstandingSchema.optional(), + audio: ToolsMediaUnderstandingSchema.optional(), + video: ToolsMediaUnderstandingSchema.optional(), }) .optional(); diff --git a/src/media-understanding/apply.test.ts b/src/media-understanding/apply.test.ts new file mode 100644 index 000000000..f52685e6f --- /dev/null +++ b/src/media-understanding/apply.test.ts @@ -0,0 +1,258 @@ +import fs from "node:fs/promises"; +import os from "node:os"; +import path from "node:path"; + +import { beforeEach, describe, expect, it, vi } from "vitest"; + +import type { ClawdbotConfig } from "../config/config.js"; +import type { MsgContext } from "../auto-reply/templating.js"; +import { resolveApiKeyForProvider } from "../agents/model-auth.js"; +import { fetchRemoteMedia } from "../media/fetch.js"; + +vi.mock("../agents/model-auth.js", () => ({ + resolveApiKeyForProvider: vi.fn(async () => ({ + apiKey: "test-key", + source: "test", + })), +})); + +vi.mock("../media/fetch.js", () => ({ + fetchRemoteMedia: vi.fn(), +})); + +vi.mock("../process/exec.js", () => ({ + runExec: vi.fn(), +})); + +async function loadApply() { + return await import("./apply.js"); +} + +describe("applyMediaUnderstanding", () => { + const mockedResolveApiKey = vi.mocked(resolveApiKeyForProvider); + const mockedFetchRemoteMedia = vi.mocked(fetchRemoteMedia); + + beforeEach(() => { + mockedResolveApiKey.mockClear(); + mockedFetchRemoteMedia.mockReset(); + mockedFetchRemoteMedia.mockResolvedValue({ + buffer: Buffer.from("audio-bytes"), + contentType: "audio/ogg", + fileName: "note.ogg", + }); + }); + + it("sets Transcript and replaces Body when audio transcription succeeds", async () => { + const { applyMediaUnderstanding } = await loadApply(); + const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-media-")); + const audioPath = path.join(dir, "note.ogg"); + await fs.writeFile(audioPath, "hello"); + + const ctx: MsgContext = { + Body: "", + MediaPath: audioPath, + MediaType: "audio/ogg", + }; + const cfg: ClawdbotConfig = { + tools: { + media: { + audio: { + enabled: true, + maxBytes: 1024 * 1024, + models: [{ provider: "groq" }], + }, + }, + }, + }; + + const result = await applyMediaUnderstanding({ + ctx, + cfg, + providers: { + groq: { + id: "groq", + transcribeAudio: async () => ({ text: "transcribed text" }), + }, + }, + }); + + expect(result.appliedAudio).toBe(true); + expect(ctx.Transcript).toBe("transcribed text"); + expect(ctx.Body).toBe("[Audio]\nTranscript:\ntranscribed text"); + expect(ctx.CommandBody).toBe("transcribed text"); + expect(ctx.RawBody).toBe("transcribed text"); + }); + + it("handles URL-only attachments for audio transcription", async () => { + const { applyMediaUnderstanding } = await loadApply(); + const ctx: MsgContext = { + Body: "", + MediaUrl: "https://example.com/note.ogg", + MediaType: "audio/ogg", + ChatType: "dm", + }; + const cfg: ClawdbotConfig = { + tools: { + media: { + audio: { + enabled: true, + maxBytes: 1024 * 1024, + scope: { + default: "deny", + rules: [{ action: "allow", match: { chatType: "direct" } }], + }, + models: [{ provider: "groq" }], + }, + }, + }, + }; + + const result = await 
applyMediaUnderstanding({ + ctx, + cfg, + providers: { + groq: { + id: "groq", + transcribeAudio: async () => ({ text: "remote transcript" }), + }, + }, + }); + + expect(result.appliedAudio).toBe(true); + expect(ctx.Transcript).toBe("remote transcript"); + expect(ctx.Body).toBe("[Audio]\nTranscript:\nremote transcript"); + }); + + it("skips audio transcription when attachment exceeds maxBytes", async () => { + const { applyMediaUnderstanding } = await loadApply(); + const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-media-")); + const audioPath = path.join(dir, "large.wav"); + await fs.writeFile(audioPath, "0123456789"); + + const ctx: MsgContext = { + Body: "", + MediaPath: audioPath, + MediaType: "audio/wav", + }; + const transcribeAudio = vi.fn(async () => ({ text: "should-not-run" })); + const cfg: ClawdbotConfig = { + tools: { + media: { + audio: { + enabled: true, + maxBytes: 4, + models: [{ provider: "groq" }], + }, + }, + }, + }; + + const result = await applyMediaUnderstanding({ + ctx, + cfg, + providers: { groq: { id: "groq", transcribeAudio } }, + }); + + expect(result.appliedAudio).toBe(false); + expect(transcribeAudio).not.toHaveBeenCalled(); + expect(ctx.Body).toBe(""); + }); + + it("falls back to CLI model when provider fails", async () => { + const { applyMediaUnderstanding } = await loadApply(); + const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-media-")); + const audioPath = path.join(dir, "note.ogg"); + await fs.writeFile(audioPath, "hello"); + + const ctx: MsgContext = { + Body: "", + MediaPath: audioPath, + MediaType: "audio/ogg", + }; + const cfg: ClawdbotConfig = { + tools: { + media: { + audio: { + enabled: true, + models: [ + { provider: "groq" }, + { + type: "cli", + command: "whisper", + args: ["{{MediaPath}}"], + }, + ], + }, + }, + }, + }; + + const execModule = await import("../process/exec.js"); + vi.mocked(execModule.runExec).mockResolvedValue({ + stdout: "cli transcript\n", + stderr: "", + }); + + const result = await applyMediaUnderstanding({ + ctx, + cfg, + providers: { + groq: { + id: "groq", + transcribeAudio: async () => { + throw new Error("boom"); + }, + }, + }, + }); + + expect(result.appliedAudio).toBe(true); + expect(ctx.Transcript).toBe("cli transcript"); + expect(ctx.Body).toBe("[Audio]\nTranscript:\ncli transcript"); + }); + + it("uses CLI image understanding and preserves caption for commands", async () => { + const { applyMediaUnderstanding } = await loadApply(); + const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-media-")); + const imagePath = path.join(dir, "photo.jpg"); + await fs.writeFile(imagePath, "image-bytes"); + + const ctx: MsgContext = { + Body: " show Dom", + MediaPath: imagePath, + MediaType: "image/jpeg", + }; + const cfg: ClawdbotConfig = { + tools: { + media: { + image: { + enabled: true, + models: [ + { + type: "cli", + command: "gemini", + args: ["--file", "{{MediaPath}}", "--prompt", "{{Prompt}}"], + }, + ], + }, + }, + }, + }; + + const execModule = await import("../process/exec.js"); + vi.mocked(execModule.runExec).mockResolvedValue({ + stdout: "image description\n", + stderr: "", + }); + + const result = await applyMediaUnderstanding({ + ctx, + cfg, + }); + + expect(result.appliedImage).toBe(true); + expect(ctx.Body).toBe("[Image]\nUser text:\nshow Dom\nDescription:\nimage description"); + expect(ctx.CommandBody).toBe("show Dom"); + expect(ctx.RawBody).toBe("show Dom"); + }); +}); diff --git a/src/media-understanding/apply.ts b/src/media-understanding/apply.ts new file mode 100644 
index 000000000..558b76f57 --- /dev/null +++ b/src/media-understanding/apply.ts @@ -0,0 +1,804 @@ +import crypto from "node:crypto"; +import fs from "node:fs/promises"; +import os from "node:os"; +import path from "node:path"; +import { fileURLToPath } from "node:url"; + +import type { Api, AssistantMessage, Context, Model } from "@mariozechner/pi-ai"; +import { complete } from "@mariozechner/pi-ai"; +import { discoverAuthStorage, discoverModels } from "@mariozechner/pi-coding-agent"; + +import type { ClawdbotConfig } from "../config/config.js"; +import type { MsgContext } from "../auto-reply/templating.js"; +import { applyTemplate } from "../auto-reply/templating.js"; +import { getApiKeyForModel, resolveApiKeyForProvider } from "../agents/model-auth.js"; +import { ensureClawdbotModelsJson } from "../agents/models-config.js"; +import { minimaxUnderstandImage } from "../agents/minimax-vlm.js"; +import { logVerbose, shouldLogVerbose } from "../globals.js"; +import { fetchRemoteMedia } from "../media/fetch.js"; +import { detectMime, getFileExtension, isAudioFileName } from "../media/mime.js"; +import { runExec } from "../process/exec.js"; +import type { + MediaUnderstandingConfig, + MediaUnderstandingModelConfig, + MediaUnderstandingScopeConfig, +} from "../config/types.tools.js"; +import { extractMediaUserText, formatMediaUnderstandingBody } from "./format.js"; +import { + buildMediaUnderstandingRegistry, + getMediaUnderstandingProvider, + normalizeMediaProviderId, +} from "./providers/index.js"; +import { fetchWithTimeout } from "./providers/shared.js"; +import { normalizeMediaUnderstandingChatType, resolveMediaUnderstandingScope } from "./scope.js"; +import type { + MediaAttachment, + MediaUnderstandingOutput, + MediaUnderstandingProvider, +} from "./types.js"; +import { coerceImageAssistantText } from "../agents/tools/image-tool.helpers.js"; + +const MB = 1024 * 1024; +const DEFAULT_MAX_CHARS = 500; +const DEFAULT_MAX_CHARS_BY_CAPABILITY: Record = { + image: DEFAULT_MAX_CHARS, + audio: undefined, + video: DEFAULT_MAX_CHARS, +}; +const DEFAULT_MAX_BYTES: Record = { + image: 10 * MB, + audio: 20 * MB, + video: 50 * MB, +}; +const DEFAULT_TIMEOUT_SECONDS: Record = { + image: 60, + audio: 60, + video: 120, +}; +const DEFAULT_PROMPT: Record = { + image: "Describe the image.", + audio: "Transcribe the audio.", + video: "Describe the video.", +}; +const DEFAULT_VIDEO_MAX_BASE64_BYTES = 70 * MB; +const DEFAULT_AUDIO_MODELS: Record = { + groq: "whisper-large-v3-turbo", + openai: "whisper-1", +}; +const CLI_OUTPUT_MAX_BUFFER = 5 * MB; + +export type ApplyMediaUnderstandingResult = { + outputs: MediaUnderstandingOutput[]; + appliedImage: boolean; + appliedAudio: boolean; + appliedVideo: boolean; +}; + +type Capability = "image" | "audio" | "video"; + +type MediaBufferResult = { + buffer: Buffer; + mime?: string; + fileName: string; +}; + +type MediaPathResult = { + path: string; + cleanup?: () => Promise | void; +}; + +function normalizeAttachmentPath(raw?: string | null): string | undefined { + const value = raw?.trim(); + if (!value) return undefined; + if (value.startsWith("file://")) { + try { + return fileURLToPath(value); + } catch { + return undefined; + } + } + return value; +} + +function normalizeAttachments(ctx: MsgContext): MediaAttachment[] { + const pathsFromArray = Array.isArray(ctx.MediaPaths) ? ctx.MediaPaths : undefined; + const urlsFromArray = Array.isArray(ctx.MediaUrls) ? ctx.MediaUrls : undefined; + const typesFromArray = Array.isArray(ctx.MediaTypes) ? 
ctx.MediaTypes : undefined; + const resolveMime = (count: number, index: number) => { + const typeHint = typesFromArray?.[index]; + const trimmed = typeof typeHint === "string" ? typeHint.trim() : ""; + if (trimmed) return trimmed; + return count === 1 ? ctx.MediaType : undefined; + }; + + if (pathsFromArray && pathsFromArray.length > 0) { + const count = pathsFromArray.length; + const urls = urlsFromArray && urlsFromArray.length > 0 ? urlsFromArray : undefined; + return pathsFromArray + .map((value, index) => ({ + path: value?.trim() || undefined, + url: urls?.[index] ?? ctx.MediaUrl, + mime: resolveMime(count, index), + index, + })) + .filter((entry) => Boolean(entry.path?.trim() || entry.url?.trim())); + } + + if (urlsFromArray && urlsFromArray.length > 0) { + const count = urlsFromArray.length; + return urlsFromArray + .map((value, index) => ({ + path: undefined, + url: value?.trim() || undefined, + mime: resolveMime(count, index), + index, + })) + .filter((entry) => Boolean(entry.url?.trim())); + } + + const pathValue = ctx.MediaPath?.trim(); + const url = ctx.MediaUrl?.trim(); + if (!pathValue && !url) return []; + return [ + { + path: pathValue || undefined, + url: url || undefined, + mime: ctx.MediaType, + index: 0, + }, + ]; +} + +function isVideoAttachment(attachment: MediaAttachment): boolean { + if (attachment.mime?.startsWith("video/")) return true; + const ext = getFileExtension(attachment.path ?? attachment.url); + if (!ext) return false; + return [".mp4", ".mov", ".mkv", ".webm", ".avi", ".m4v"].includes(ext); +} + +function isAudioAttachment(attachment: MediaAttachment): boolean { + if (attachment.mime?.startsWith("audio/")) return true; + return isAudioFileName(attachment.path ?? attachment.url); +} + +function isImageAttachment(attachment: MediaAttachment): boolean { + if (attachment.mime?.startsWith("image/")) return true; + const ext = getFileExtension(attachment.path ?? attachment.url); + if (!ext) return false; + return [".png", ".jpg", ".jpeg", ".webp", ".gif", ".bmp", ".tiff", ".tif"].includes(ext); +} + +function estimateBase64Size(bytes: number): number { + return Math.ceil(bytes / 3) * 4; +} + +function resolveVideoMaxBase64Bytes(maxBytes: number): number { + const expanded = Math.floor(maxBytes * (4 / 3)); + return Math.min(expanded, DEFAULT_VIDEO_MAX_BASE64_BYTES); +} + +function resolveTimeoutMs(seconds: number | undefined, fallbackSeconds: number): number { + const value = typeof seconds === "number" && Number.isFinite(seconds) ? seconds : fallbackSeconds; + return Math.max(1000, Math.floor(value * 1000)); +} + +function resolvePrompt(capability: Capability, prompt?: string, maxChars?: number): string { + const base = prompt?.trim() || DEFAULT_PROMPT[capability]; + if (!maxChars || capability === "audio") return base; + return `${base} Respond in at most ${maxChars} characters.`; +} + +function resolveRequestUrl(input: RequestInfo | URL): string { + if (typeof input === "string") return input; + if (input instanceof URL) return input.toString(); + return input.url; +} + +function normalizeErrorMessage(err: unknown): string { + if (!err) return ""; + if (typeof err === "string") return err; + if (err instanceof Error) return err.message; + try { + return JSON.stringify(err); + } catch { + return ""; + } +} + +function resolveMaxChars(params: { + capability: Capability; + entry: MediaUnderstandingModelConfig; + cfg: ClawdbotConfig; +}): number | undefined { + const { capability, entry, cfg } = params; + const configured = entry.maxChars ?? 
cfg.tools?.media?.[capability]?.maxChars; + if (typeof configured === "number") return configured; + return DEFAULT_MAX_CHARS_BY_CAPABILITY[capability]; +} + +function trimOutput(text: string, maxChars?: number): string { + const trimmed = text.trim(); + if (!maxChars || trimmed.length <= maxChars) return trimmed; + return trimmed.slice(0, maxChars).trim(); +} + +function resolveConfigValue(primary: T | undefined, fallback: T): T { + return primary === undefined ? fallback : primary; +} + +function resolveCapabilityConfig( + cfg: ClawdbotConfig, + capability: Capability, +): MediaUnderstandingConfig | undefined { + return cfg.tools?.media?.[capability]; +} + +function resolveScopeDecision(params: { + scope?: MediaUnderstandingScopeConfig; + ctx: MsgContext; +}): "allow" | "deny" { + return resolveMediaUnderstandingScope({ + scope: params.scope, + sessionKey: params.ctx.SessionKey, + channel: params.ctx.Surface ?? params.ctx.Provider, + chatType: normalizeMediaUnderstandingChatType(params.ctx.ChatType), + }); +} + +function resolveModelEntries( + cfg: MediaUnderstandingConfig | undefined, + capability: Capability, +): MediaUnderstandingModelConfig[] { + const models = cfg?.models ?? []; + if (models.length === 0) return []; + return models.filter((entry) => { + const caps = entry.capabilities; + if (!caps || caps.length === 0) return true; + return caps.includes(capability); + }); +} + +function isMaxBytesError(err: unknown): boolean { + const message = normalizeErrorMessage(err); + if (!message) return false; + return message.includes("exceeds maxBytes") || message.includes("payload exceeds maxBytes"); +} + +async function loadAttachmentBuffer(params: { + attachment: MediaAttachment; + maxBytes: number; + timeoutMs: number; +}): Promise { + const { attachment, maxBytes, timeoutMs } = params; + const rawPath = normalizeAttachmentPath(attachment.path); + if (rawPath) { + const resolved = path.isAbsolute(rawPath) ? rawPath : path.resolve(rawPath); + try { + const stat = await fs.stat(resolved); + if (!stat.isFile()) return undefined; + if (stat.size > maxBytes) { + if (shouldLogVerbose()) { + logVerbose( + `Skipping media attachment ${attachment.index + 1}: ${stat.size} bytes exceeds ${maxBytes}`, + ); + } + return undefined; + } + const buffer = await fs.readFile(resolved); + const mime = + attachment.mime ?? + (await detectMime({ + buffer, + filePath: resolved, + })); + const fileName = path.basename(resolved) || `media-${attachment.index + 1}`; + return { buffer, mime, fileName }; + } catch (err) { + if (shouldLogVerbose()) { + logVerbose(`Failed to read attachment ${attachment.index + 1}: ${String(err)}`); + } + } + } + + const url = attachment.url?.trim(); + if (!url) return undefined; + + try { + const fetchImpl = (input: RequestInfo | URL, init?: RequestInit) => + fetchWithTimeout(resolveRequestUrl(input), init ?? {}, timeoutMs, fetch); + const fetched = await fetchRemoteMedia({ url, fetchImpl, maxBytes }); + if (fetched.buffer.length > maxBytes) { + if (shouldLogVerbose()) { + logVerbose( + `Skipping media attachment ${attachment.index + 1}: ${fetched.buffer.length} bytes exceeds ${maxBytes}`, + ); + } + return undefined; + } + const mime = + attachment.mime ?? + fetched.contentType ?? + (await detectMime({ + buffer: fetched.buffer, + filePath: fetched.fileName ?? url, + })); + const fileName = fetched.fileName ?? 
`media-${attachment.index + 1}`; + return { buffer: fetched.buffer, mime, fileName }; + } catch (err) { + if (shouldLogVerbose()) { + logVerbose(`Failed to fetch attachment ${attachment.index + 1}: ${String(err)}`); + } + } + + return undefined; +} + +async function resolveAttachmentPath(params: { + attachment: MediaAttachment; + maxBytes?: number; + timeoutMs: number; +}): Promise { + const { attachment, maxBytes, timeoutMs } = params; + const rawPath = normalizeAttachmentPath(attachment.path); + if (rawPath) { + const resolved = path.isAbsolute(rawPath) ? rawPath : path.resolve(rawPath); + try { + const stat = await fs.stat(resolved); + if (!stat.isFile()) return undefined; + if (maxBytes && stat.size > maxBytes) { + if (shouldLogVerbose()) { + logVerbose( + `Skipping media attachment ${attachment.index + 1}: ${stat.size} bytes exceeds ${maxBytes}`, + ); + } + return undefined; + } + return { path: resolved }; + } catch (err) { + if (shouldLogVerbose()) { + logVerbose(`Failed to read attachment ${attachment.index + 1}: ${String(err)}`); + } + } + } + + const url = attachment.url?.trim(); + if (!url) return undefined; + + try { + const fetchImpl = (input: RequestInfo | URL, init?: RequestInit) => + fetchWithTimeout(resolveRequestUrl(input), init ?? {}, timeoutMs, fetch); + const fetched = await fetchRemoteMedia({ url, fetchImpl, maxBytes }); + const buffer = fetched.buffer; + if (maxBytes && buffer.length > maxBytes) { + if (shouldLogVerbose()) { + logVerbose( + `Skipping media attachment ${attachment.index + 1}: ${buffer.length} bytes exceeds ${maxBytes}`, + ); + } + return undefined; + } + const extension = fetched.fileName ? path.extname(fetched.fileName) : ""; + const tmpPath = path.join( + os.tmpdir(), + `clawdbot-media-${crypto.randomUUID()}${extension || ""}`, + ); + await fs.writeFile(tmpPath, buffer); + return { + path: tmpPath, + cleanup: async () => { + await fs.unlink(tmpPath).catch(() => {}); + }, + }; + } catch (err) { + if (shouldLogVerbose()) { + logVerbose(`Failed to fetch attachment ${attachment.index + 1}: ${String(err)}`); + } + } + + return undefined; +} + +async function describeImageWithModel(params: { + cfg: ClawdbotConfig; + agentDir: string; + provider: string; + model: string; + prompt: string; + maxChars?: number; + buffer: Buffer; + mimeType: string; + profile?: string; + preferredProfile?: string; +}): Promise<{ text: string; model: string }> { + await ensureClawdbotModelsJson(params.cfg, params.agentDir); + const authStorage = discoverAuthStorage(params.agentDir); + const modelRegistry = discoverModels(authStorage, params.agentDir); + const model = modelRegistry.find(params.provider, params.model) as Model | null; + if (!model) { + throw new Error(`Unknown model: ${params.provider}/${params.model}`); + } + if (!model.input?.includes("image")) { + throw new Error(`Model does not support images: ${params.provider}/${params.model}`); + } + const apiKeyInfo = await getApiKeyForModel({ + model, + cfg: params.cfg, + agentDir: params.agentDir, + profileId: params.profile, + preferredProfile: params.preferredProfile, + }); + authStorage.setRuntimeApiKey(model.provider, apiKeyInfo.apiKey); + + const base64 = params.buffer.toString("base64"); + if (model.provider === "minimax") { + const text = await minimaxUnderstandImage({ + apiKey: apiKeyInfo.apiKey, + prompt: params.prompt, + imageDataUrl: `data:${params.mimeType};base64,${base64}`, + modelBaseUrl: model.baseUrl, + }); + return { text, model: model.id }; + } + + const context: Context = { + messages: [ + { + role: 
"user", + content: [ + { type: "text", text: params.prompt }, + { type: "image", data: base64, mimeType: params.mimeType }, + ], + timestamp: Date.now(), + }, + ], + }; + const message = (await complete(model, context, { + apiKey: apiKeyInfo.apiKey, + maxTokens: 512, + })) as AssistantMessage; + const text = coerceImageAssistantText({ + message, + provider: model.provider, + model: model.id, + }); + return { text, model: model.id }; +} + +async function runProviderEntry(params: { + capability: Capability; + entry: MediaUnderstandingModelConfig; + cfg: ClawdbotConfig; + ctx: MsgContext; + attachment: MediaAttachment; + agentDir?: string; + providerRegistry: Map; +}): Promise { + const { entry, capability, cfg, attachment } = params; + const providerIdRaw = entry.provider?.trim(); + if (!providerIdRaw) { + throw new Error(`Provider entry missing provider for ${capability}`); + } + const providerId = normalizeMediaProviderId(providerIdRaw); + const maxBytes = entry.maxBytes ?? resolveConfigValue(cfg.tools?.media?.[capability]?.maxBytes, DEFAULT_MAX_BYTES[capability]); + const maxChars = resolveMaxChars({ capability, entry, cfg }); + const timeoutMs = resolveTimeoutMs( + entry.timeoutSeconds ?? cfg.tools?.media?.[capability]?.timeoutSeconds, + DEFAULT_TIMEOUT_SECONDS[capability], + ); + const prompt = resolvePrompt( + capability, + entry.prompt ?? cfg.tools?.media?.[capability]?.prompt, + maxChars, + ); + + if (capability === "image") { + if (!params.agentDir) { + throw new Error("Image understanding requires agentDir"); + } + const modelId = entry.model?.trim(); + if (!modelId) { + throw new Error("Image understanding requires model id"); + } + const media = await loadAttachmentBuffer({ attachment, maxBytes, timeoutMs }); + if (!media) return null; + const mimeType = media.mime ?? "image/jpeg"; + const result = await describeImageWithModel({ + cfg, + agentDir: params.agentDir, + provider: providerId, + model: modelId, + prompt, + maxChars, + buffer: media.buffer, + mimeType, + profile: entry.profile, + preferredProfile: entry.preferredProfile, + }); + return { + kind: "image.description", + attachmentIndex: attachment.index, + text: trimOutput(result.text, maxChars), + provider: providerId, + model: result.model, + }; + } + + const provider = getMediaUnderstandingProvider(providerId, params.providerRegistry); + if (!provider) { + throw new Error(`Media provider not available: ${providerId}`); + } + + if (capability === "audio") { + if (!provider.transcribeAudio) { + throw new Error(`Audio transcription provider "${providerId}" not available.`); + } + const media = await loadAttachmentBuffer({ attachment, maxBytes, timeoutMs }); + if (!media) return null; + const key = await resolveApiKeyForProvider({ + provider: providerId, + cfg, + profileId: entry.profile, + preferredProfile: entry.preferredProfile, + agentDir: params.agentDir, + }); + const providerConfig = cfg.models?.providers?.[providerId]; + const model = entry.model?.trim() || DEFAULT_AUDIO_MODELS[providerId] || entry.model; + const result = await provider.transcribeAudio({ + buffer: media.buffer, + fileName: media.fileName, + mime: media.mime, + apiKey: key.apiKey, + baseUrl: providerConfig?.baseUrl, + headers: providerConfig?.headers, + model, + language: entry.language ?? cfg.tools?.media?.audio?.language, + prompt, + timeoutMs, + }); + return { + kind: "audio.transcription", + attachmentIndex: attachment.index, + text: trimOutput(result.text, maxChars), + provider: providerId, + model: result.model ?? 
model, + }; + } + + if (capability === "video") { + if (!provider.describeVideo) { + throw new Error(`Video understanding provider "${providerId}" not available.`); + } + const media = await loadAttachmentBuffer({ attachment, maxBytes, timeoutMs }); + if (!media) return null; + const estimatedBase64Bytes = estimateBase64Size(media.buffer.length); + const maxBase64Bytes = resolveVideoMaxBase64Bytes(maxBytes); + if (estimatedBase64Bytes > maxBase64Bytes) { + if (shouldLogVerbose()) { + logVerbose( + `Skipping video attachment ${attachment.index + 1}: base64 payload ${estimatedBase64Bytes} exceeds ${maxBase64Bytes}`, + ); + } + return null; + } + const key = await resolveApiKeyForProvider({ + provider: providerId, + cfg, + profileId: entry.profile, + preferredProfile: entry.preferredProfile, + agentDir: params.agentDir, + }); + const providerConfig = cfg.models?.providers?.[providerId]; + const result = await provider.describeVideo({ + buffer: media.buffer, + fileName: media.fileName, + mime: media.mime, + apiKey: key.apiKey, + baseUrl: providerConfig?.baseUrl, + headers: providerConfig?.headers, + model: entry.model, + prompt, + timeoutMs, + }); + return { + kind: "video.description", + attachmentIndex: attachment.index, + text: trimOutput(result.text, maxChars), + provider: providerId, + model: result.model ?? entry.model, + }; + } + + return null; +} + +async function runCliEntry(params: { + capability: Capability; + entry: MediaUnderstandingModelConfig; + cfg: ClawdbotConfig; + ctx: MsgContext; + attachment: MediaAttachment; +}): Promise { + const { entry, capability, cfg, ctx, attachment } = params; + const command = entry.command?.trim(); + const args = entry.args ?? []; + if (!command) { + throw new Error(`CLI entry missing command for ${capability}`); + } + const maxBytes = entry.maxBytes ?? resolveConfigValue(cfg.tools?.media?.[capability]?.maxBytes, DEFAULT_MAX_BYTES[capability]); + const maxChars = resolveMaxChars({ capability, entry, cfg }); + const timeoutMs = resolveTimeoutMs( + entry.timeoutSeconds ?? cfg.tools?.media?.[capability]?.timeoutSeconds, + DEFAULT_TIMEOUT_SECONDS[capability], + ); + const prompt = resolvePrompt( + capability, + entry.prompt ?? cfg.tools?.media?.[capability]?.prompt, + maxChars, + ); + const pathResult = await resolveAttachmentPath({ + attachment, + maxBytes, + timeoutMs, + }); + if (!pathResult) return null; + + const templCtx: MsgContext = { + ...ctx, + MediaPath: pathResult.path, + Prompt: prompt, + MaxChars: maxChars, + }; + const argv = [command, ...args].map((part, index) => + index === 0 ? part : applyTemplate(part, templCtx), + ); + if (shouldLogVerbose()) { + logVerbose(`Media understanding via CLI: ${argv.join(" ")}`); + } + try { + const { stdout } = await runExec(argv[0], argv.slice(1), { + timeoutMs, + maxBuffer: CLI_OUTPUT_MAX_BUFFER, + }); + const text = trimOutput(stdout, maxChars); + if (!text) return null; + return { + kind: capability === "audio" ? 
"audio.transcription" : `${capability}.description`, + attachmentIndex: attachment.index, + text, + provider: "cli", + model: command, + }; + } finally { + if (pathResult.cleanup) { + await pathResult.cleanup(); + } + } +} + +async function runCapability(params: { + capability: Capability; + cfg: ClawdbotConfig; + ctx: MsgContext; + attachments: MediaAttachment[]; + agentDir?: string; + providerRegistry: Map; +}): Promise { + const { capability, cfg, ctx, attachments } = params; + const config = resolveCapabilityConfig(cfg, capability); + if (!config || config.enabled === false) return null; + const entries = resolveModelEntries(config, capability); + if (entries.length === 0) return null; + + const scopeDecision = resolveScopeDecision({ scope: config.scope, ctx }); + if (scopeDecision === "deny") { + if (shouldLogVerbose()) { + logVerbose(`${capability} understanding disabled by scope policy.`); + } + return null; + } + + const attachment = attachments.find((item) => { + if (capability === "image") return isImageAttachment(item); + if (capability === "audio") return isAudioAttachment(item); + return isVideoAttachment(item); + }); + if (!attachment) return null; + + for (const entry of entries) { + try { + const entryType = entry.type ?? (entry.command ? "cli" : "provider"); + const result = + entryType === "cli" + ? await runCliEntry({ capability, entry, cfg, ctx, attachment }) + : await runProviderEntry({ + capability, + entry, + cfg, + ctx, + attachment, + agentDir: params.agentDir, + providerRegistry: params.providerRegistry, + }); + if (result) return result; + } catch (err) { + if (isMaxBytesError(err)) { + if (shouldLogVerbose()) { + logVerbose(`Skipping ${capability} model due to size: ${String(err)}`); + } + } else if (shouldLogVerbose()) { + logVerbose(`${capability} understanding failed: ${String(err)}`); + } + } + } + + return null; +} + +export async function applyMediaUnderstanding(params: { + ctx: MsgContext; + cfg: ClawdbotConfig; + agentDir?: string; + providers?: Record; +}): Promise { + const { ctx, cfg } = params; + const commandCandidates = [ctx.CommandBody, ctx.RawBody, ctx.Body]; + const originalUserText = + commandCandidates + .map((value) => extractMediaUserText(value)) + .find((value) => value && value.trim()) ?? undefined; + + const attachments = normalizeAttachments(ctx); + const providerRegistry = buildMediaUnderstandingRegistry(params.providers); + const outputs: MediaUnderstandingOutput[] = []; + + const imageOutput = await runCapability({ + capability: "image", + cfg, + ctx, + attachments, + agentDir: params.agentDir, + providerRegistry, + }); + if (imageOutput) outputs.push(imageOutput); + + const audioOutput = await runCapability({ + capability: "audio", + cfg, + ctx, + attachments, + agentDir: params.agentDir, + providerRegistry, + }); + if (audioOutput) outputs.push(audioOutput); + + const videoOutput = await runCapability({ + capability: "video", + cfg, + ctx, + attachments, + agentDir: params.agentDir, + providerRegistry, + }); + if (videoOutput) outputs.push(videoOutput); + + if (outputs.length > 0) { + ctx.Body = formatMediaUnderstandingBody({ body: ctx.Body, outputs }); + const audioResult = outputs.find((output) => output.kind === "audio.transcription"); + if (audioResult) { + ctx.Transcript = audioResult.text; + ctx.CommandBody = audioResult.text; + ctx.RawBody = audioResult.text; + } else if (originalUserText) { + ctx.CommandBody = originalUserText; + ctx.RawBody = originalUserText; + } + ctx.MediaUnderstanding = [...(ctx.MediaUnderstanding ?? 
[]), ...outputs]; + } + + return { + outputs, + appliedImage: outputs.some((output) => output.kind === "image.description"), + appliedAudio: outputs.some((output) => output.kind === "audio.transcription"), + appliedVideo: outputs.some((output) => output.kind === "video.description"), + }; +} diff --git a/src/media-understanding/format.test.ts b/src/media-understanding/format.test.ts new file mode 100644 index 000000000..172ecadf9 --- /dev/null +++ b/src/media-understanding/format.test.ts @@ -0,0 +1,91 @@ +import { describe, expect, it } from "vitest"; +import { formatMediaUnderstandingBody } from "./format.js"; + +describe("formatMediaUnderstandingBody", () => { + it("replaces placeholder body with transcript", () => { + const body = formatMediaUnderstandingBody({ + body: "", + outputs: [ + { + kind: "audio.transcription", + attachmentIndex: 0, + text: "hello world", + provider: "groq", + }, + ], + }); + expect(body).toBe("[Audio]\nTranscript:\nhello world"); + }); + + it("includes user text when body is meaningful", () => { + const body = formatMediaUnderstandingBody({ + body: "caption here", + outputs: [ + { + kind: "audio.transcription", + attachmentIndex: 0, + text: "transcribed", + provider: "groq", + }, + ], + }); + expect(body).toBe("[Audio]\nUser text:\ncaption here\nTranscript:\ntranscribed"); + }); + + it("strips leading media placeholders from user text", () => { + const body = formatMediaUnderstandingBody({ + body: " caption here", + outputs: [ + { + kind: "audio.transcription", + attachmentIndex: 0, + text: "transcribed", + provider: "groq", + }, + ], + }); + expect(body).toBe("[Audio]\nUser text:\ncaption here\nTranscript:\ntranscribed"); + }); + + it("keeps user text once when multiple outputs exist", () => { + const body = formatMediaUnderstandingBody({ + body: "caption here", + outputs: [ + { + kind: "audio.transcription", + attachmentIndex: 0, + text: "audio text", + provider: "groq", + }, + { + kind: "video.description", + attachmentIndex: 1, + text: "video text", + provider: "google", + }, + ], + }); + expect(body).toBe( + [ + "User text:\ncaption here", + "[Audio]\nTranscript:\naudio text", + "[Video]\nDescription:\nvideo text", + ].join("\n\n"), + ); + }); + + it("formats image outputs", () => { + const body = formatMediaUnderstandingBody({ + body: "", + outputs: [ + { + kind: "image.description", + attachmentIndex: 0, + text: "a cat", + provider: "openai", + }, + ], + }); + expect(body).toBe("[Image]\nDescription:\na cat"); + }); +}); diff --git a/src/media-understanding/format.ts b/src/media-understanding/format.ts new file mode 100644 index 000000000..ffa6f0145 --- /dev/null +++ b/src/media-understanding/format.ts @@ -0,0 +1,77 @@ +import type { MediaUnderstandingOutput } from "./types.js"; + +const MEDIA_PLACEHOLDER_RE = /^]+>(\s*\([^)]*\))?$/i; +const MEDIA_PLACEHOLDER_TOKEN_RE = /^]+>(\s*\([^)]*\))?\s*/i; + +export function extractMediaUserText(body?: string): string | undefined { + const trimmed = body?.trim() ?? 
""; + if (!trimmed) return undefined; + if (MEDIA_PLACEHOLDER_RE.test(trimmed)) return undefined; + const cleaned = trimmed.replace(MEDIA_PLACEHOLDER_TOKEN_RE, "").trim(); + return cleaned || undefined; +} + +function formatSection( + title: "Audio" | "Video" | "Image", + kind: "Transcript" | "Description", + text: string, + userText?: string, +): string { + const lines = [`[${title}]`]; + if (userText) { + lines.push(`User text:\n${userText}`); + } + lines.push(`${kind}:\n${text}`); + return lines.join("\n"); +} + +export function formatMediaUnderstandingBody(params: { + body?: string; + outputs: MediaUnderstandingOutput[]; +}): string { + const outputs = params.outputs.filter((output) => output.text.trim()); + if (outputs.length === 0) { + return params.body ?? ""; + } + + const userText = extractMediaUserText(params.body); + const sections: string[] = []; + if (userText && outputs.length > 1) { + sections.push(`User text:\n${userText}`); + } + + for (const output of outputs) { + if (output.kind === "audio.transcription") { + sections.push( + formatSection( + "Audio", + "Transcript", + output.text, + outputs.length === 1 ? userText : undefined, + ), + ); + continue; + } + if (output.kind === "image.description") { + sections.push( + formatSection( + "Image", + "Description", + output.text, + outputs.length === 1 ? userText : undefined, + ), + ); + continue; + } + sections.push( + formatSection( + "Video", + "Description", + output.text, + outputs.length === 1 ? userText : undefined, + ), + ); + } + + return sections.join("\n\n").trim(); +} diff --git a/src/media-understanding/index.ts b/src/media-understanding/index.ts new file mode 100644 index 000000000..6afa22a54 --- /dev/null +++ b/src/media-understanding/index.ts @@ -0,0 +1,9 @@ +export { applyMediaUnderstanding } from "./apply.js"; +export { formatMediaUnderstandingBody } from "./format.js"; +export { resolveMediaUnderstandingScope } from "./scope.js"; +export type { + MediaAttachment, + MediaUnderstandingOutput, + MediaUnderstandingProvider, + MediaUnderstandingKind, +} from "./types.js"; diff --git a/src/media-understanding/providers/google/index.ts b/src/media-understanding/providers/google/index.ts new file mode 100644 index 000000000..285195dc7 --- /dev/null +++ b/src/media-understanding/providers/google/index.ts @@ -0,0 +1,7 @@ +import type { MediaUnderstandingProvider } from "../../types.js"; +import { describeGeminiVideo } from "./video.js"; + +export const googleProvider: MediaUnderstandingProvider = { + id: "google", + describeVideo: describeGeminiVideo, +}; diff --git a/src/media-understanding/providers/google/video.test.ts b/src/media-understanding/providers/google/video.test.ts new file mode 100644 index 000000000..f94778cdd --- /dev/null +++ b/src/media-understanding/providers/google/video.test.ts @@ -0,0 +1,93 @@ +import { describe, expect, it } from "vitest"; + +import { describeGeminiVideo } from "./video.js"; + +const resolveRequestUrl = (input: RequestInfo | URL) => { + if (typeof input === "string") return input; + if (input instanceof URL) return input.toString(); + return input.url; +}; + +describe("describeGeminiVideo", () => { + it("respects case-insensitive x-goog-api-key overrides", async () => { + let seenKey: string | null = null; + const fetchFn = async (_input: RequestInfo | URL, init?: RequestInit) => { + const headers = new Headers(init?.headers); + seenKey = headers.get("x-goog-api-key"); + return new Response( + JSON.stringify({ + candidates: [{ content: { parts: [{ text: "video ok" }] } }], + }), 
+ { status: 200, headers: { "content-type": "application/json" } }, + ); + }; + + const result = await describeGeminiVideo({ + buffer: Buffer.from("video"), + fileName: "clip.mp4", + apiKey: "test-key", + timeoutMs: 1000, + headers: { "X-Goog-Api-Key": "override" }, + fetchFn, + }); + + expect(seenKey).toBe("override"); + expect(result.text).toBe("video ok"); + }); + + it("builds the expected request payload", async () => { + let seenUrl: string | null = null; + let seenInit: RequestInit | undefined; + const fetchFn = async (input: RequestInfo | URL, init?: RequestInit) => { + seenUrl = resolveRequestUrl(input); + seenInit = init; + return new Response( + JSON.stringify({ + candidates: [ + { + content: { + parts: [{ text: "first" }, { text: " second " }, { text: "" }], + }, + }, + ], + }), + { status: 200, headers: { "content-type": "application/json" } }, + ); + }; + + const result = await describeGeminiVideo({ + buffer: Buffer.from("video-bytes"), + fileName: "clip.mp4", + apiKey: "test-key", + timeoutMs: 1500, + baseUrl: "https://example.com/v1beta/", + model: "gemini-3-pro", + headers: { "X-Other": "1" }, + fetchFn, + }); + + expect(result.model).toBe("gemini-3-pro-preview"); + expect(result.text).toBe("first\nsecond"); + expect(seenUrl).toBe("https://example.com/v1beta/models/gemini-3-pro-preview:generateContent"); + expect(seenInit?.method).toBe("POST"); + expect(seenInit?.signal).toBeInstanceOf(AbortSignal); + + const headers = new Headers(seenInit?.headers); + expect(headers.get("x-goog-api-key")).toBe("test-key"); + expect(headers.get("content-type")).toBe("application/json"); + expect(headers.get("x-other")).toBe("1"); + + const bodyText = + typeof seenInit?.body === "string" + ? seenInit.body + : Buffer.isBuffer(seenInit?.body) + ? seenInit.body.toString("utf8") + : ""; + const body = JSON.parse(bodyText); + expect(body.contents?.[0]?.parts?.[0]?.text).toBe("Describe the video."); + expect(body.contents?.[0]?.parts?.[1]?.inline_data?.mime_type).toBe("video/mp4"); + expect(body.contents?.[0]?.parts?.[1]?.inline_data?.data).toBe( + Buffer.from("video-bytes").toString("base64"), + ); + }); +}); diff --git a/src/media-understanding/providers/google/video.ts b/src/media-understanding/providers/google/video.ts new file mode 100644 index 000000000..0f483b280 --- /dev/null +++ b/src/media-understanding/providers/google/video.ts @@ -0,0 +1,84 @@ +import type { VideoDescriptionRequest, VideoDescriptionResult } from "../../types.js"; +import { normalizeGoogleModelId } from "../../../agents/models-config.providers.js"; +import { fetchWithTimeout, normalizeBaseUrl, readErrorResponse } from "../shared.js"; + +export const DEFAULT_GOOGLE_VIDEO_BASE_URL = "https://generativelanguage.googleapis.com/v1beta"; +const DEFAULT_GOOGLE_VIDEO_MODEL = "gemini-3-flash-preview"; +const DEFAULT_GOOGLE_VIDEO_PROMPT = "Describe the video."; + +function resolveModel(model?: string): string { + const trimmed = model?.trim(); + if (!trimmed) return DEFAULT_GOOGLE_VIDEO_MODEL; + return normalizeGoogleModelId(trimmed); +} + +function resolvePrompt(prompt?: string): string { + const trimmed = prompt?.trim(); + return trimmed || DEFAULT_GOOGLE_VIDEO_PROMPT; +} + +export async function describeGeminiVideo( + params: VideoDescriptionRequest, +): Promise { + const fetchFn = params.fetchFn ?? 
fetch; + const baseUrl = normalizeBaseUrl(params.baseUrl, DEFAULT_GOOGLE_VIDEO_BASE_URL); + const model = resolveModel(params.model); + const url = `${baseUrl}/models/${model}:generateContent`; + + const headers = new Headers(params.headers); + if (!headers.has("content-type")) { + headers.set("content-type", "application/json"); + } + if (!headers.has("x-goog-api-key")) { + headers.set("x-goog-api-key", params.apiKey); + } + + const body = { + contents: [ + { + role: "user", + parts: [ + { text: resolvePrompt(params.prompt) }, + { + inline_data: { + mime_type: params.mime ?? "video/mp4", + data: params.buffer.toString("base64"), + }, + }, + ], + }, + ], + }; + + const res = await fetchWithTimeout( + url, + { + method: "POST", + headers, + body: JSON.stringify(body), + }, + params.timeoutMs, + fetchFn, + ); + + if (!res.ok) { + const detail = await readErrorResponse(res); + const suffix = detail ? `: ${detail}` : ""; + throw new Error(`Video description failed (HTTP ${res.status})${suffix}`); + } + + const payload = (await res.json()) as { + candidates?: Array<{ + content?: { parts?: Array<{ text?: string }> }; + }>; + }; + const parts = payload.candidates?.[0]?.content?.parts ?? []; + const text = parts + .map((part) => part?.text?.trim()) + .filter(Boolean) + .join("\n"); + if (!text) { + throw new Error("Video description response missing text"); + } + return { text, model }; +} diff --git a/src/media-understanding/providers/groq/index.ts b/src/media-understanding/providers/groq/index.ts new file mode 100644 index 000000000..451799e8e --- /dev/null +++ b/src/media-understanding/providers/groq/index.ts @@ -0,0 +1,13 @@ +import type { MediaUnderstandingProvider } from "../../types.js"; +import { transcribeOpenAiCompatibleAudio } from "../openai/audio.js"; + +const DEFAULT_GROQ_AUDIO_BASE_URL = "https://api.groq.com/openai/v1"; + +export const groqProvider: MediaUnderstandingProvider = { + id: "groq", + transcribeAudio: (req) => + transcribeOpenAiCompatibleAudio({ + ...req, + baseUrl: req.baseUrl ?? 
DEFAULT_GROQ_AUDIO_BASE_URL,
+      }),
+};
diff --git a/src/media-understanding/providers/index.ts b/src/media-understanding/providers/index.ts
new file mode 100644
index 000000000..fef5d6531
--- /dev/null
+++ b/src/media-understanding/providers/index.ts
@@ -0,0 +1,35 @@
+import { normalizeProviderId } from "../../agents/model-selection.js";
+import type { MediaUnderstandingProvider } from "../types.js";
+import { googleProvider } from "./google/index.js";
+import { groqProvider } from "./groq/index.js";
+import { openaiProvider } from "./openai/index.js";
+
+const PROVIDERS: MediaUnderstandingProvider[] = [groqProvider, openaiProvider, googleProvider];
+
+export function normalizeMediaProviderId(id: string): string {
+  const normalized = normalizeProviderId(id);
+  if (normalized === "gemini") return "google";
+  return normalized;
+}
+
+export function buildMediaUnderstandingRegistry(
+  overrides?: Record<string, MediaUnderstandingProvider>,
+): Map<string, MediaUnderstandingProvider> {
+  const registry = new Map<string, MediaUnderstandingProvider>();
+  for (const provider of PROVIDERS) {
+    registry.set(normalizeMediaProviderId(provider.id), provider);
+  }
+  if (overrides) {
+    for (const [key, provider] of Object.entries(overrides)) {
+      registry.set(normalizeMediaProviderId(key), provider);
+    }
+  }
+  return registry;
+}
+
+export function getMediaUnderstandingProvider(
+  id: string,
+  registry: Map<string, MediaUnderstandingProvider>,
+): MediaUnderstandingProvider | undefined {
+  return registry.get(normalizeMediaProviderId(id));
+}
diff --git a/src/media-understanding/providers/openai/audio.test.ts b/src/media-understanding/providers/openai/audio.test.ts
new file mode 100644
index 000000000..88c713f2a
--- /dev/null
+++ b/src/media-understanding/providers/openai/audio.test.ts
@@ -0,0 +1,86 @@
+import { describe, expect, it } from "vitest";
+
+import { transcribeOpenAiCompatibleAudio } from "./audio.js";
+
+const resolveRequestUrl = (input: RequestInfo | URL) => {
+  if (typeof input === "string") return input;
+  if (input instanceof URL) return input.toString();
+  return input.url;
+};
+
+describe("transcribeOpenAiCompatibleAudio", () => {
+  it("respects lowercase authorization header overrides", async () => {
+    let seenAuth: string | null = null;
+    const fetchFn = async (_input: RequestInfo | URL, init?: RequestInit) => {
+      const headers = new Headers(init?.headers);
+      seenAuth = headers.get("authorization");
+      return new Response(JSON.stringify({ text: "ok" }), {
+        status: 200,
+        headers: { "content-type": "application/json" },
+      });
+    };
+
+    const result = await transcribeOpenAiCompatibleAudio({
+      buffer: Buffer.from("audio"),
+      fileName: "note.mp3",
+      apiKey: "test-key",
+      timeoutMs: 1000,
+      headers: { authorization: "Bearer override" },
+      fetchFn,
+    });
+
+    expect(seenAuth).toBe("Bearer override");
+    expect(result.text).toBe("ok");
+  });
+
+  it("builds the expected request payload", async () => {
+    let seenUrl: string | null = null;
+    let seenInit: RequestInit | undefined;
+    const fetchFn = async (input: RequestInfo | URL, init?: RequestInit) => {
+      seenUrl = resolveRequestUrl(input);
+      seenInit = init;
+      return new Response(JSON.stringify({ text: "hello" }), {
+        status: 200,
+        headers: { "content-type": "application/json" },
+      });
+    };
+
+    const result = await transcribeOpenAiCompatibleAudio({
+      buffer: Buffer.from("audio-bytes"),
+      fileName: "voice.wav",
+      apiKey: "test-key",
+      timeoutMs: 1234,
+      baseUrl: "https://api.example.com/v1/",
+      model: " ",
+      language: " en ",
+      prompt: " hello ",
+      mime: "audio/wav",
+      headers: { "X-Custom": "1" },
+      fetchFn,
+    });
+
+    expect(result.model).toBe("whisper-1");
+
expect(result.text).toBe("hello"); + expect(seenUrl).toBe("https://api.example.com/v1/audio/transcriptions"); + expect(seenInit?.method).toBe("POST"); + expect(seenInit?.signal).toBeInstanceOf(AbortSignal); + + const headers = new Headers(seenInit?.headers); + expect(headers.get("authorization")).toBe("Bearer test-key"); + expect(headers.get("x-custom")).toBe("1"); + + const form = seenInit?.body as FormData; + expect(form).toBeInstanceOf(FormData); + expect(form.get("model")).toBe("whisper-1"); + expect(form.get("language")).toBe("en"); + expect(form.get("prompt")).toBe("hello"); + const file = form.get("file") as Blob | { type?: string; name?: string } | null; + expect(file).not.toBeNull(); + if (file) { + expect(file.type).toBe("audio/wav"); + if ("name" in file && typeof file.name === "string") { + expect(file.name).toBe("voice.wav"); + } + } + }); +}); diff --git a/src/media-understanding/providers/openai/audio.ts b/src/media-understanding/providers/openai/audio.ts new file mode 100644 index 000000000..65ac5735a --- /dev/null +++ b/src/media-understanding/providers/openai/audio.ts @@ -0,0 +1,61 @@ +import path from "node:path"; + +import type { AudioTranscriptionRequest, AudioTranscriptionResult } from "../../types.js"; +import { fetchWithTimeout, normalizeBaseUrl, readErrorResponse } from "../shared.js"; + +export const DEFAULT_OPENAI_AUDIO_BASE_URL = "https://api.openai.com/v1"; +const DEFAULT_OPENAI_AUDIO_MODEL = "whisper-1"; + +function resolveModel(model?: string): string { + const trimmed = model?.trim(); + return trimmed || DEFAULT_OPENAI_AUDIO_MODEL; +} + +export async function transcribeOpenAiCompatibleAudio( + params: AudioTranscriptionRequest, +): Promise { + const fetchFn = params.fetchFn ?? fetch; + const baseUrl = normalizeBaseUrl(params.baseUrl, DEFAULT_OPENAI_AUDIO_BASE_URL); + const url = `${baseUrl}/audio/transcriptions`; + + const model = resolveModel(params.model); + const form = new FormData(); + const fileName = params.fileName?.trim() || path.basename(params.fileName) || "audio"; + const bytes = new Uint8Array(params.buffer); + const blob = new Blob([bytes], { + type: params.mime ?? "application/octet-stream", + }); + form.append("file", blob, fileName); + form.append("model", model); + if (params.language?.trim()) form.append("language", params.language.trim()); + if (params.prompt?.trim()) form.append("prompt", params.prompt.trim()); + + const headers = new Headers(params.headers); + if (!headers.has("authorization")) { + headers.set("authorization", `Bearer ${params.apiKey}`); + } + + const res = await fetchWithTimeout( + url, + { + method: "POST", + headers, + body: form, + }, + params.timeoutMs, + fetchFn, + ); + + if (!res.ok) { + const detail = await readErrorResponse(res); + const suffix = detail ? 
`: ${detail}` : ""; + throw new Error(`Audio transcription failed (HTTP ${res.status})${suffix}`); + } + + const payload = (await res.json()) as { text?: string }; + const text = payload.text?.trim(); + if (!text) { + throw new Error("Audio transcription response missing text"); + } + return { text, model }; +} diff --git a/src/media-understanding/providers/openai/index.ts b/src/media-understanding/providers/openai/index.ts new file mode 100644 index 000000000..f8af49928 --- /dev/null +++ b/src/media-understanding/providers/openai/index.ts @@ -0,0 +1,7 @@ +import type { MediaUnderstandingProvider } from "../../types.js"; +import { transcribeOpenAiCompatibleAudio } from "./audio.js"; + +export const openaiProvider: MediaUnderstandingProvider = { + id: "openai", + transcribeAudio: transcribeOpenAiCompatibleAudio, +}; diff --git a/src/media-understanding/providers/shared.ts b/src/media-understanding/providers/shared.ts new file mode 100644 index 000000000..9dd36992e --- /dev/null +++ b/src/media-understanding/providers/shared.ts @@ -0,0 +1,33 @@ +const MAX_ERROR_CHARS = 300; + +export function normalizeBaseUrl(baseUrl: string | undefined, fallback: string): string { + const raw = baseUrl?.trim() || fallback; + return raw.replace(/\/+$/, ""); +} + +export async function fetchWithTimeout( + url: string, + init: RequestInit, + timeoutMs: number, + fetchFn: typeof fetch, +): Promise { + const controller = new AbortController(); + const timer = setTimeout(() => controller.abort(), Math.max(1, timeoutMs)); + try { + return await fetchFn(url, { ...init, signal: controller.signal }); + } finally { + clearTimeout(timer); + } +} + +export async function readErrorResponse(res: Response): Promise { + try { + const text = await res.text(); + const collapsed = text.replace(/\s+/g, " ").trim(); + if (!collapsed) return undefined; + if (collapsed.length <= MAX_ERROR_CHARS) return collapsed; + return `${collapsed.slice(0, MAX_ERROR_CHARS)}…`; + } catch { + return undefined; + } +} diff --git a/src/media-understanding/scope.test.ts b/src/media-understanding/scope.test.ts new file mode 100644 index 000000000..9331d8577 --- /dev/null +++ b/src/media-understanding/scope.test.ts @@ -0,0 +1,46 @@ +import { describe, expect, it } from "vitest"; +import { resolveMediaUnderstandingScope } from "./scope.js"; + +describe("resolveMediaUnderstandingScope", () => { + it("defaults to allow when scope is undefined", () => { + expect(resolveMediaUnderstandingScope({})).toBe("allow"); + }); + + it("uses first matching rule", () => { + const decision = resolveMediaUnderstandingScope({ + scope: { + default: "deny", + rules: [ + { action: "allow", match: { channel: "whatsapp" } }, + { action: "deny", match: { channel: "whatsapp", chatType: "direct" } }, + ], + }, + channel: "whatsapp", + chatType: "direct", + sessionKey: "whatsapp:direct:123", + }); + expect(decision).toBe("allow"); + }); + + it("matches keyPrefix when provided", () => { + const decision = resolveMediaUnderstandingScope({ + scope: { + default: "deny", + rules: [{ action: "allow", match: { keyPrefix: "agent:main:" } }], + }, + sessionKey: "agent:main:whatsapp:group:123", + }); + expect(decision).toBe("allow"); + }); + + it("matches keyPrefix case-insensitively", () => { + const decision = resolveMediaUnderstandingScope({ + scope: { + default: "deny", + rules: [{ action: "allow", match: { keyPrefix: "agent:main:" } }], + }, + sessionKey: "AGENT:MAIN:WHATSAPP:GROUP:123", + }); + expect(decision).toBe("allow"); + }); +}); diff --git a/src/media-understanding/scope.ts 
b/src/media-understanding/scope.ts new file mode 100644 index 000000000..55660fafb --- /dev/null +++ b/src/media-understanding/scope.ts @@ -0,0 +1,54 @@ +import type { MediaUnderstandingScopeConfig } from "../config/types.tools.js"; + +export type MediaUnderstandingScopeDecision = "allow" | "deny"; + +function normalizeDecision(value?: string | null): MediaUnderstandingScopeDecision | undefined { + const normalized = value?.trim().toLowerCase(); + if (normalized === "allow") return "allow"; + if (normalized === "deny") return "deny"; + return undefined; +} + +function normalizeMatch(value?: string | null): string | undefined { + const normalized = value?.trim().toLowerCase(); + return normalized || undefined; +} + +export function normalizeMediaUnderstandingChatType(raw?: string | null): string | undefined { + const value = raw?.trim().toLowerCase(); + if (!value) return undefined; + if (value === "dm" || value === "direct_message" || value === "private") return "direct"; + if (value === "groups") return "group"; + if (value === "channel") return "room"; + return value; +} + +export function resolveMediaUnderstandingScope(params: { + scope?: MediaUnderstandingScopeConfig; + sessionKey?: string; + channel?: string; + chatType?: string; +}): MediaUnderstandingScopeDecision { + const scope = params.scope; + if (!scope) return "allow"; + + const channel = normalizeMatch(params.channel); + const chatType = normalizeMatch(params.chatType); + const sessionKey = normalizeMatch(params.sessionKey) ?? ""; + + for (const rule of scope.rules ?? []) { + if (!rule) continue; + const action = normalizeDecision(rule.action) ?? "allow"; + const match = rule.match ?? {}; + const matchChannel = normalizeMatch(match.channel); + const matchChatType = normalizeMatch(match.chatType); + const matchPrefix = normalizeMatch(match.keyPrefix); + + if (matchChannel && matchChannel !== channel) continue; + if (matchChatType && matchChatType !== chatType) continue; + if (matchPrefix && !sessionKey.startsWith(matchPrefix)) continue; + return action; + } + + return normalizeDecision(scope.default) ?? 
"allow"; +} diff --git a/src/media-understanding/types.ts b/src/media-understanding/types.ts new file mode 100644 index 000000000..85c897275 --- /dev/null +++ b/src/media-understanding/types.ts @@ -0,0 +1,62 @@ +export type MediaUnderstandingKind = + | "audio.transcription" + | "video.description" + | "image.description"; + +export type MediaAttachment = { + path?: string; + url?: string; + mime?: string; + index: number; +}; + +export type MediaUnderstandingOutput = { + kind: MediaUnderstandingKind; + attachmentIndex: number; + text: string; + provider: string; + model?: string; +}; + +export type AudioTranscriptionRequest = { + buffer: Buffer; + fileName: string; + mime?: string; + apiKey: string; + baseUrl?: string; + headers?: Record; + model?: string; + language?: string; + prompt?: string; + timeoutMs: number; + fetchFn?: typeof fetch; +}; + +export type AudioTranscriptionResult = { + text: string; + model?: string; +}; + +export type VideoDescriptionRequest = { + buffer: Buffer; + fileName: string; + mime?: string; + apiKey: string; + baseUrl?: string; + headers?: Record; + model?: string; + prompt?: string; + timeoutMs: number; + fetchFn?: typeof fetch; +}; + +export type VideoDescriptionResult = { + text: string; + model?: string; +}; + +export type MediaUnderstandingProvider = { + id: string; + transcribeAudio?: (req: AudioTranscriptionRequest) => Promise; + describeVideo?: (req: VideoDescriptionRequest) => Promise; +}; diff --git a/src/media/fetch.test.ts b/src/media/fetch.test.ts new file mode 100644 index 000000000..46445b1bb --- /dev/null +++ b/src/media/fetch.test.ts @@ -0,0 +1,47 @@ +import { describe, expect, it } from "vitest"; + +import { fetchRemoteMedia } from "./fetch.js"; + +function makeStream(chunks: Uint8Array[]) { + return new ReadableStream({ + start(controller) { + for (const chunk of chunks) { + controller.enqueue(chunk); + } + controller.close(); + }, + }); +} + +describe("fetchRemoteMedia", () => { + it("rejects when content-length exceeds maxBytes", async () => { + const fetchImpl = async () => + new Response(makeStream([new Uint8Array([1, 2, 3, 4, 5])]), { + status: 200, + headers: { "content-length": "5" }, + }); + + await expect( + fetchRemoteMedia({ + url: "https://example.com/file.bin", + fetchImpl, + maxBytes: 4, + }), + ).rejects.toThrow("exceeds maxBytes"); + }); + + it("rejects when streamed payload exceeds maxBytes", async () => { + const fetchImpl = async () => + new Response(makeStream([new Uint8Array([1, 2, 3]), new Uint8Array([4, 5, 6])]), { + status: 200, + }); + + await expect( + fetchRemoteMedia({ + url: "https://example.com/file.bin", + fetchImpl, + maxBytes: 4, + }), + ).rejects.toThrow("exceeds maxBytes"); + }); +}); diff --git a/src/media/fetch.ts b/src/media/fetch.ts index 28312ebe5..6ee706d97 100644 --- a/src/media/fetch.ts +++ b/src/media/fetch.ts @@ -14,6 +14,7 @@ type FetchMediaOptions = { url: string; fetchImpl?: FetchLike; filePathHint?: string; + maxBytes?: number; }; function stripQuotes(value: string): string { @@ -51,7 +52,7 @@ async function readErrorBodySnippet(res: Response, maxChars = 200): Promise { - const { url, fetchImpl, filePathHint } = options; + const { url, fetchImpl, filePathHint, maxBytes } = options; const fetcher: FetchLike | undefined = fetchImpl ?? 
   if (!fetcher) {
     throw new Error("fetch is not available");
@@ -77,7 +78,19 @@ export async function fetchRemoteMedia(options: FetchMediaOptions): Promise {
+  const length = Number(res.headers.get("content-length") ?? "");
+  if (maxBytes && Number.isFinite(length)) {
+    if (length > maxBytes) {
+      throw new Error(
+        `Failed to fetch media from ${url}: content length ${length} exceeds maxBytes ${maxBytes}`,
+      );
+    }
+  }
+
+  const buffer = maxBytes
+    ? await readResponseWithLimit(res, maxBytes)
+    : Buffer.from(await res.arrayBuffer());
   let fileNameFromUrl: string | undefined;
   try {
     const parsed = new URL(url);
@@ -109,3 +122,47 @@ export async function fetchRemoteMedia(options: FetchMediaOptions): Promise {
+
+async function readResponseWithLimit(res: Response, maxBytes: number): Promise<Buffer> {
+  const body = res.body;
+  if (!body || typeof body.getReader !== "function") {
+    const fallback = Buffer.from(await res.arrayBuffer());
+    if (fallback.length > maxBytes) {
+      throw new Error(
+        `Failed to fetch media from ${res.url || "response"}: payload exceeds maxBytes ${maxBytes}`,
+      );
+    }
+    return fallback;
+  }
+
+  const reader = body.getReader();
+  const chunks: Uint8Array[] = [];
+  let total = 0;
+  try {
+    while (true) {
+      const { done, value } = await reader.read();
+      if (done) break;
+      if (value?.length) {
+        total += value.length;
+        if (total > maxBytes) {
+          try {
+            await reader.cancel();
+          } catch {}
+          throw new Error(
+            `Failed to fetch media from ${res.url || "response"}: payload exceeds maxBytes ${maxBytes}`,
+          );
+        }
+        chunks.push(value);
+      }
+    }
+  } finally {
+    try {
+      reader.releaseLock();
+    } catch {}
+  }
+
+  return Buffer.concat(
+    chunks.map((chunk) => Buffer.from(chunk)),
+    total,
+  );
+}