From 1b973f75062104348d3dde1239120bdaa8213632 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sat, 17 Jan 2026 03:52:37 +0000 Subject: [PATCH] feat: add inbound media understanding Co-authored-by: Tristan Manchester --- CHANGELOG.md | 1 + docs/gateway/configuration-examples.md | 17 +- docs/gateway/configuration.md | 56 +- docs/gateway/doctor.md | 2 +- docs/nodes/audio.md | 73 +- docs/nodes/images.md | 12 +- docs/nodes/media-understanding.md | 217 +++++ src/agents/bash-tools.exec.ts | 26 +- src/auto-reply/media-note.test.ts | 16 + src/auto-reply/media-note.ts | 33 +- src/auto-reply/reply/get-reply-directives.ts | 1 + src/auto-reply/reply/get-reply-run.ts | 14 +- src/auto-reply/reply/get-reply.ts | 19 +- src/auto-reply/templating.ts | 4 + ...etection.rejects-routing-allowfrom.test.ts | 17 +- src/config/legacy.migrations.part-2.ts | 32 +- src/config/legacy.rules.ts | 2 +- src/config/legacy.shared.ts | 2 +- src/config/schema.ts | 24 +- src/config/types.messages.ts | 2 +- src/config/types.tools.ts | 82 +- src/config/zod-schema.agent-runtime.ts | 8 +- src/config/zod-schema.core.ts | 60 +- src/media-understanding/apply.test.ts | 258 ++++++ src/media-understanding/apply.ts | 804 ++++++++++++++++++ src/media-understanding/format.test.ts | 91 ++ src/media-understanding/format.ts | 77 ++ src/media-understanding/index.ts | 9 + .../providers/google/index.ts | 7 + .../providers/google/video.test.ts | 93 ++ .../providers/google/video.ts | 84 ++ .../providers/groq/index.ts | 13 + src/media-understanding/providers/index.ts | 35 + .../providers/openai/audio.test.ts | 86 ++ .../providers/openai/audio.ts | 61 ++ .../providers/openai/index.ts | 7 + src/media-understanding/providers/shared.ts | 33 + src/media-understanding/scope.test.ts | 46 + src/media-understanding/scope.ts | 54 ++ src/media-understanding/types.ts | 62 ++ src/media/fetch.test.ts | 47 + src/media/fetch.ts | 61 +- 42 files changed, 2547 insertions(+), 101 deletions(-) create mode 100644 docs/nodes/media-understanding.md create mode 100644 src/media-understanding/apply.test.ts create mode 100644 src/media-understanding/apply.ts create mode 100644 src/media-understanding/format.test.ts create mode 100644 src/media-understanding/format.ts create mode 100644 src/media-understanding/index.ts create mode 100644 src/media-understanding/providers/google/index.ts create mode 100644 src/media-understanding/providers/google/video.test.ts create mode 100644 src/media-understanding/providers/google/video.ts create mode 100644 src/media-understanding/providers/groq/index.ts create mode 100644 src/media-understanding/providers/index.ts create mode 100644 src/media-understanding/providers/openai/audio.test.ts create mode 100644 src/media-understanding/providers/openai/audio.ts create mode 100644 src/media-understanding/providers/openai/index.ts create mode 100644 src/media-understanding/providers/shared.ts create mode 100644 src/media-understanding/scope.test.ts create mode 100644 src/media-understanding/scope.ts create mode 100644 src/media-understanding/types.ts create mode 100644 src/media/fetch.test.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index b0c9ca8f9..7123c9353 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -89,6 +89,7 @@ - Docs: add Date & Time guide and update prompt/timezone configuration docs. - Messages: debounce rapid inbound messages across channels with per-connector overrides. (#971) — thanks @juanpablodlc. - Messages: allow media-only sends (CLI/tool) and show Telegram voice recording status for voice notes. (#957) — thanks @rdev. 
+- Media: add optional inbound media understanding for image/audio/video with provider + CLI fallbacks. (#1005) — thanks @tristanmanchester. - Auth/Status: keep auth profiles sticky per session (rotate on compaction/new), surface provider usage headers in `/status` and `clawdbot models status`, and update docs. - CLI: add `--json` output for `clawdbot daemon` lifecycle/install commands. - Memory: make `node-llama-cpp` an optional dependency (avoid Node 25 install failures) and improve local-embeddings fallback/errors. diff --git a/docs/gateway/configuration-examples.md b/docs/gateway/configuration-examples.md index 0aecc034d..cf95229c3 100644 --- a/docs/gateway/configuration-examples.md +++ b/docs/gateway/configuration-examples.md @@ -124,10 +124,21 @@ Save to `~/.clawdbot/clawdbot.json` and you can DM the bot from that number. // Tooling tools: { - audio: { - transcription: { - args: ["--model", "base", "{{MediaPath}}"], + media: { + audio: { + enabled: true, + maxBytes: 20971520, + models: [ + { provider: "openai", model: "whisper-1" }, + // Optional CLI fallback (Whisper binary): + // { type: "cli", command: "whisper", args: ["--model", "base", "{{MediaPath}}"] } + ], timeoutSeconds: 120 + }, + video: { + enabled: true, + maxBytes: 52428800, + models: [{ provider: "google", model: "gemini-3-flash-preview" }] } } }, diff --git a/docs/gateway/configuration.md b/docs/gateway/configuration.md index 5b85428b2..3e2b55d6c 100644 --- a/docs/gateway/configuration.md +++ b/docs/gateway/configuration.md @@ -1769,6 +1769,58 @@ Legacy: `tools.bash` is still accepted as an alias. - `tools.web.fetch.firecrawl.maxAgeMs` (optional) - `tools.web.fetch.firecrawl.timeoutSeconds` (optional) +`tools.media` configures inbound media understanding (image/audio/video): +- `tools.media.image` / `tools.media.audio` / `tools.media.video`: + - `enabled`: opt-out switch (default true). + - `prompt`: optional prompt override (image/video append a `maxChars` hint automatically). + - `maxChars`: max output characters (default 500 for image/video; unset for audio). + - `maxBytes`: max media size to send (defaults: image 10MB, audio 20MB, video 50MB). + - `timeoutSeconds`: request timeout (defaults: image 60s, audio 60s, video 120s). + - `language`: optional audio hint. + - `scope`: optional gating (first match wins) with `match.channel`, `match.chatType`, or `match.keyPrefix`. + - `models`: ordered list of model entries; failures or oversize media fall back to the next entry. +- Each `models[]` entry: + - Provider entry (`type: "provider"` or omitted): + - `provider`: API provider id (`openai`, `anthropic`, `google`/`gemini`, `groq`, etc). + - `model`: model id override (required for image; defaults to `whisper-1`/`whisper-large-v3-turbo` for audio providers, and `gemini-3-flash-preview` for video). + - `profile` / `preferredProfile`: auth profile selection. + - CLI entry (`type: "cli"`): + - `command`: executable to run. + - `args`: templated args (supports `{{MediaPath}}`, `{{Prompt}}`, `{{MaxChars}}`, etc). + - `capabilities`: optional list (`image`, `audio`, `video`) to gate a shared entry. + - `prompt`, `maxChars`, `maxBytes`, `timeoutSeconds`, `language` can be overridden per entry. + +If no models are configured (or `enabled: false`), understanding is skipped; the model still receives the original attachments. + +Provider auth follows the standard model auth order (auth profiles, env vars like `OPENAI_API_KEY`/`GROQ_API_KEY`/`GEMINI_API_KEY`, or `models.providers.*.apiKey`). 
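A minimal image-understanding sketch (model ids are illustrative, taken from the recommendations in the media-understanding doc; the CLI fallback entry is optional and its command/args should match whatever tool you actually have installed):
```json5
{
  tools: {
    media: {
      image: {
        enabled: true,
        maxBytes: 10485760,   // documented image default (10MB)
        maxChars: 500,        // documented image default
        models: [
          // Provider entry: any image-capable model in the registry works.
          { provider: "openai", model: "gpt-5.2" },
          // Optional CLI fallback (example invocation; adjust to your tool):
          {
            type: "cli",
            command: "gemini",
            args: [
              "-m", "gemini-3-flash",
              "--allowed-tools", "read_file",
              "Read the media at {{MediaPath}} and describe it in <= {{MaxChars}} characters."
            ]
          }
        ]
      }
    }
  }
}
```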
+ +Example: +```json5 +{ + tools: { + media: { + audio: { + enabled: true, + maxBytes: 20971520, + scope: { + default: "deny", + rules: [{ action: "allow", match: { chatType: "direct" } }] + }, + models: [ + { provider: "openai", model: "whisper-1" }, + { type: "cli", command: "whisper", args: ["--model", "base", "{{MediaPath}}"] } + ] + }, + video: { + enabled: true, + maxBytes: 52428800, + models: [{ provider: "google", model: "gemini-3-flash-preview" }] + } + } + } +} +``` + `agents.defaults.subagents` configures sub-agent defaults: - `model`: default model for spawned sub-agents (string or `{ primary, fallbacks }`). If omitted, sub-agents inherit the caller’s model unless overridden per agent or per call. - `maxConcurrent`: max concurrent sub-agent runs (default 1) @@ -2848,7 +2900,7 @@ clawdbot dns setup --apply ## Template variables -Template placeholders are expanded in `tools.audio.transcription.args` (and any future templated argument fields). +Template placeholders are expanded in `tools.media.*.models[].args` (and any future templated argument fields). | Variable | Description | |----------|-------------| @@ -2864,6 +2916,8 @@ Template placeholders are expanded in `tools.audio.transcription.args` (and any | `{{MediaPath}}` | Local media path (if downloaded) | | `{{MediaType}}` | Media type (image/audio/document/…) | | `{{Transcript}}` | Audio transcript (when enabled) | +| `{{Prompt}}` | Resolved media prompt for CLI entries | +| `{{MaxChars}}` | Resolved max output chars for CLI entries | | `{{ChatType}}` | `"direct"` or `"group"` | | `{{GroupSubject}}` | Group subject (best effort) | | `{{GroupMembers}}` | Group members preview (best effort) | diff --git a/docs/gateway/doctor.md b/docs/gateway/doctor.md index 0574566b0..b3c77e848 100644 --- a/docs/gateway/doctor.md +++ b/docs/gateway/doctor.md @@ -111,7 +111,7 @@ Current migrations: - `routing.bindings` → top-level `bindings` - `routing.agents`/`routing.defaultAgentId` → `agents.list` + `agents.list[].default` - `routing.agentToAgent` → `tools.agentToAgent` -- `routing.transcribeAudio` → `tools.audio.transcription` +- `routing.transcribeAudio` → `tools.media.audio.models` - `bindings[].match.accountID` → `bindings[].match.accountId` - `identity` → `agents.list[].identity` - `agent.*` → `agents.defaults` + `tools.*` (tools/elevated/exec/sandbox/subagents) diff --git a/docs/nodes/audio.md b/docs/nodes/audio.md index 349a8f212..b6019b26e 100644 --- a/docs/nodes/audio.md +++ b/docs/nodes/audio.md @@ -3,25 +3,59 @@ summary: "How inbound audio/voice notes are downloaded, transcribed, and injecte read_when: - Changing audio transcription or media handling --- -# Audio / Voice Notes — 2025-12-05 +# Audio / Voice Notes — 2026-01-17 ## What works -- **Optional transcription**: If `tools.audio.transcription` is set in `~/.clawdbot/clawdbot.json`, Clawdbot will: - 1) Download inbound audio to a temp path when WhatsApp only provides a URL. - 2) Run the configured CLI args (templated with `{{MediaPath}}`), expecting transcript on stdout. - 3) Replace `Body` with the transcript, set `{{Transcript}}`, and prepend the original media path plus a `Transcript:` section in the command prompt so models see both. - 4) Continue through the normal auto-reply pipeline (templating, sessions, Pi command). -- **Verbose logging**: In `--verbose`, we log when transcription runs and when the transcript replaces the body. 
+- **Media understanding (audio)**: If `tools.media.audio` is enabled and has `models`, Clawdbot: + 1) Locates the first audio attachment (local path or URL) and downloads it if needed. + 2) Enforces `maxBytes` before sending to each model entry. + 3) Runs the first eligible model entry in order (provider or CLI). + 4) If it fails or skips (size/timeout), it tries the next entry. + 5) On success, it replaces `Body` with an `[Audio]` block and sets `{{Transcript}}`. +- **Command parsing**: When transcription succeeds, `CommandBody`/`RawBody` are set to the transcript so slash commands still work. +- **Verbose logging**: In `--verbose`, we log when transcription runs and when it replaces the body. -## Config example (Whisper CLI) -Requires `whisper` CLI installed: +## Config examples + +### Provider + CLI fallback (OpenAI + Whisper CLI) ```json5 { tools: { - audio: { - transcription: { - args: ["--model", "base", "{{MediaPath}}"], - timeoutSeconds: 45 + media: { + audio: { + enabled: true, + maxBytes: 20971520, + models: [ + { provider: "openai", model: "whisper-1" }, + { + type: "cli", + command: "whisper", + args: ["--model", "base", "{{MediaPath}}"], + timeoutSeconds: 45 + } + ] + } + } + } +} +``` + +### Provider-only with scope gating +```json5 +{ + tools: { + media: { + audio: { + enabled: true, + scope: { + default: "allow", + rules: [ + { action: "deny", match: { chatType: "group" } } + ] + }, + models: [ + { provider: "openai", model: "whisper-1" } + ] } } } @@ -29,12 +63,13 @@ Requires `whisper` CLI installed: ``` ## Notes & limits -- We don’t ship a transcriber; you opt in with the Whisper CLI on your PATH. -- Size guard: inbound audio must be ≤5 MB (matches the temp media store and transcript pipeline). -- Outbound caps: web send supports audio/voice up to 16 MB (sent as a voice note with `ptt: true`). -- If transcription fails, we fall back to the original body/media note; replies still go through. -- Transcript is available to templates as `{{Transcript}}`; models get both the media path and a `Transcript:` block in the prompt when using command mode. +- Provider auth follows the standard model auth order (auth profiles, env vars, `models.providers.*.apiKey`). +- Default size cap is 20MB (`tools.media.audio.maxBytes`). Oversize audio is skipped for that model and the next entry is tried. +- Default `maxChars` for audio is **unset** (full transcript). Set `tools.media.audio.maxChars` or per-entry `maxChars` to trim output. +- Transcript is available to templates as `{{Transcript}}`. +- CLI stdout is capped (5MB); keep CLI output concise. ## Gotchas +- Scope rules use first-match wins. `chatType` is normalized to `direct`, `group`, or `room`. - Ensure your CLI exits 0 and prints plain text; JSON needs to be massaged via `jq -r .text`. -- Keep timeouts reasonable (`timeoutSeconds`, default 45s) to avoid blocking the reply queue. +- Keep timeouts reasonable (`timeoutSeconds`, default 60s) to avoid blocking the reply queue. diff --git a/docs/nodes/images.md b/docs/nodes/images.md index ff3664f7a..4d163e535 100644 --- a/docs/nodes/images.md +++ b/docs/nodes/images.md @@ -38,13 +38,23 @@ The WhatsApp channel runs via **Baileys Web**. This document captures the curren - `{{MediaUrl}}` pseudo-URL for the inbound media. - `{{MediaPath}}` local temp path written before running the command. - When a per-session Docker sandbox is enabled, inbound media is copied into the sandbox workspace and `MediaPath`/`MediaUrl` are rewritten to a relative path like `media/inbound/`. 
-- Audio transcription (if configured via `tools.audio.transcription`) runs before templating and can replace `Body` with the transcript. +- Media understanding (if configured via `tools.media.*`) runs before templating and can insert `[Image]`, `[Audio]`, and `[Video]` blocks into `Body`. + - Audio sets `{{Transcript}}` and uses the transcript for command parsing so slash commands still work. + - Video and image descriptions preserve any caption text for command parsing. +- Only the first matching image/audio/video attachment is processed; remaining attachments are left untouched. ## Limits & Errors +**Outbound send caps (WhatsApp web send)** - Images: ~6 MB cap after recompression. - Audio/voice/video: 16 MB cap; documents: 100 MB cap. - Oversize or unreadable media → clear error in logs and the reply is skipped. +**Media understanding caps (transcription/description)** +- Image default: 10 MB (`tools.media.image.maxBytes`). +- Audio default: 20 MB (`tools.media.audio.maxBytes`). +- Video default: 50 MB (`tools.media.video.maxBytes`). +- Oversize media skips understanding, but replies still go through with the original body. + ## Notes for Tests - Cover send + reply flows for image/audio/document cases. - Validate recompression for images (size bound) and voice-note flag for audio. diff --git a/docs/nodes/media-understanding.md b/docs/nodes/media-understanding.md new file mode 100644 index 000000000..fe8e69a52 --- /dev/null +++ b/docs/nodes/media-understanding.md @@ -0,0 +1,217 @@ +--- +summary: "Inbound image/audio/video understanding (optional) with provider + CLI fallbacks" +read_when: + - Designing or refactoring media understanding + - Tuning inbound audio/video/image preprocessing +--- +# Media Understanding (Inbound) — 2026-01-17 + +Clawdbot can optionally **summarize inbound media** (image/audio/video) before the reply pipeline runs. This is **opt-in** and separate from the base attachment flow—if understanding is off, models still receive the original files/URLs as usual. + +## Goals +- Optional: pre‑digest inbound media into short text for faster routing + better command parsing. +- Preserve original media delivery to the model (always). +- Support **provider APIs** and **CLI fallbacks**. +- Allow multiple models with ordered fallback (error/size/timeout). + +## High‑level behavior +1) Collect inbound attachments (`MediaPaths`, `MediaUrls`, `MediaTypes`). +2) For each enabled capability (image/audio/video), pick the **first matching attachment**. +3) Choose the first eligible model entry (size + capability + auth). +4) If a model fails or the media is too large, **fall back to the next entry**. +5) On success: + - `Body` becomes `[Image]`, `[Audio]`, or `[Video]` block. + - Audio sets `{{Transcript}}` and `CommandBody`/`RawBody` for command parsing. + - Captions are preserved as `User text:` inside the block. + +If understanding fails or is disabled, **the reply flow continues** with the original body + attachments. + +## Config overview +Use **per‑capability configs** under `tools.media`. 
Each capability can define: +- defaults (`prompt`, `maxChars`, `maxBytes`, `timeoutSeconds`, `language`) +- **ordered `models` list** (fallback order) +- `scope` (optional gating by channel/chatType/session key) + +```json5 +{ + tools: { + media: { + image: { /* config */ }, + audio: { /* config */ }, + video: { /* config */ } + } + } +} +``` + +### Model entries +Each `models[]` entry can be **provider** or **CLI**: + +```json5 +{ + type: "provider", // default if omitted + provider: "openai", + model: "gpt-5.2", + prompt: "Describe the image in <= 500 chars.", + maxChars: 500, + maxBytes: 10485760, + timeoutSeconds: 60, + capabilities: ["image"], // optional, used for multi‑modal entries + profile: "vision-profile", + preferredProfile: "vision-fallback" +} +``` + +```json5 +{ + type: "cli", + command: "gemini", + args: [ + "-m", + "gemini-3-flash", + "--allowed-tools", + "read_file", + "Read the media at {{MediaPath}} and describe it in <= {{MaxChars}} characters." + ], + maxChars: 500, + maxBytes: 52428800, + timeoutSeconds: 120, + capabilities: ["video", "image"] +} +``` + +## Defaults and limits +Recommended defaults: +- `maxChars`: **500** for image/video (short, command‑friendly) +- `maxChars`: **unset** for audio (full transcript unless you set a limit) +- `maxBytes`: + - image: **10MB** + - audio: **20MB** + - video: **50MB** + +Rules: +- If media exceeds `maxBytes`, that model is skipped and the **next model is tried**. +- If the model returns more than `maxChars`, output is trimmed. +- `prompt` defaults to simple “Describe the {media}.” plus the `maxChars` guidance (image/video only). + +## Capabilities (optional) +If you set `capabilities`, the entry only runs for those media types. Suggested +defaults when you opt in: +- `openai`, `anthropic`: **image** +- `google` (Gemini API): **image + audio + video** +- CLI entries: declare the exact capabilities you support. + +If you omit `capabilities`, the entry is eligible for the list it appears in. + +## Provider support matrix (Clawdbot integrations) +| Capability | Provider integration | Notes | +|------------|----------------------|-------| +| Image | OpenAI / Anthropic / Google / others via `pi-ai` | Any image-capable model in the registry works. | +| Audio | OpenAI, Groq | Provider transcription (Whisper). | +| Video | Google (Gemini API) | Provider video understanding. | + +## Recommended providers +**Image** +- Prefer your active model if it supports images. +- Good defaults: `openai/gpt-5.2`, `anthropic/claude-opus-4-5`, `google/gemini-3-pro-preview`. + +**Audio** +- `openai/whisper-1` or `groq/whisper-large-v3-turbo`. +- CLI fallback: `whisper` binary. + +**Video** +- `google/gemini-3-flash-preview` (fast), `google/gemini-3-pro-preview` (richer). +- CLI fallback: `gemini` CLI (supports `read_file` on video/audio). + +## Config examples + +### 1) Audio + Video only (image off) +```json5 +{ + tools: { + media: { + audio: { + enabled: true, + models: [ + { provider: "openai", model: "whisper-1" }, + { + type: "cli", + command: "whisper", + args: ["--model", "base", "{{MediaPath}}"] + } + ] + }, + video: { + enabled: true, + maxChars: 500, + models: [ + { provider: "google", model: "gemini-3-flash-preview" }, + { + type: "cli", + command: "gemini", + args: [ + "-m", + "gemini-3-flash", + "--allowed-tools", + "read_file", + "Read the media at {{MediaPath}} and describe it in <= {{MaxChars}} characters." 
+ ] + } + ] + } + } + } +} +``` + +### 2) Optional image understanding +```json5 +{ + tools: { + media: { + image: { + enabled: true, + maxBytes: 10485760, + maxChars: 500, + models: [ + { provider: "openai", model: "gpt-5.2" }, + { provider: "anthropic", model: "claude-opus-4-5" }, + { + type: "cli", + command: "gemini", + args: [ + "-m", + "gemini-3-flash", + "--allowed-tools", + "read_file", + "Read the media at {{MediaPath}} and describe it in <= {{MaxChars}} characters." + ] + } + ] + } + } + } +} +``` + +### 3) Multi‑modal single entry (explicit capabilities) +```json5 +{ + tools: { + media: { + image: { models: [{ provider: "google", model: "gemini-3-pro-preview", capabilities: ["image", "video", "audio"] }] }, + audio: { models: [{ provider: "google", model: "gemini-3-pro-preview", capabilities: ["image", "video", "audio"] }] }, + video: { models: [{ provider: "google", model: "gemini-3-pro-preview", capabilities: ["image", "video", "audio"] }] } + } + } +} +``` + +## Notes +- Understanding is **best‑effort**. Errors do not block replies. +- Attachments are still passed to models even when understanding is disabled. +- Use `scope` to limit where understanding runs (e.g. only DMs). + +## Related docs +- [Configuration](/gateway/configuration) +- [Image & Media Support](/nodes/images) diff --git a/src/agents/bash-tools.exec.ts b/src/agents/bash-tools.exec.ts index d97b0e653..5ef6ccd15 100644 --- a/src/agents/bash-tools.exec.ts +++ b/src/agents/bash-tools.exec.ts @@ -254,7 +254,10 @@ export function createExecTool( let yielded = false; let yieldTimer: NodeJS.Timeout | null = null; let timeoutTimer: NodeJS.Timeout | null = null; + let timeoutFinalizeTimer: NodeJS.Timeout | null = null; let timedOut = false; + const timeoutFinalizeMs = 1000; + let rejectFn: ((err: Error) => void) | null = null; const settle = (fn: () => void) => { if (settled) return; @@ -262,6 +265,18 @@ export function createExecTool( fn(); }; + const effectiveTimeout = + typeof params.timeout === "number" ? params.timeout : defaultTimeoutSec; + const finalizeTimeout = () => { + if (session.exited) return; + markExited(session, null, "SIGKILL", "failed"); + if (settled || !rejectFn) return; + const aggregated = session.aggregated.trim(); + const reason = `Command timed out after ${effectiveTimeout} seconds`; + const message = aggregated ? `${aggregated}\n\n${reason}` : reason; + settle(() => rejectFn?.(new Error(message))); + }; + // Tool-call abort should not kill backgrounded sessions; timeouts still must. const onAbortSignal = () => { if (yielded || session.backgrounded) return; @@ -272,15 +287,17 @@ export function createExecTool( const onTimeout = () => { timedOut = true; killSession(session); + if (!timeoutFinalizeTimer) { + timeoutFinalizeTimer = setTimeout(() => { + finalizeTimeout(); + }, timeoutFinalizeMs); + } }; if (signal?.aborted) onAbortSignal(); else if (signal) { signal.addEventListener("abort", onAbortSignal, { once: true }); } - - const effectiveTimeout = - typeof params.timeout === "number" ? 
params.timeout : defaultTimeoutSec; if (effectiveTimeout > 0) { timeoutTimer = setTimeout(() => { onTimeout(); @@ -321,6 +338,7 @@ export function createExecTool( }); return new Promise>((resolve, reject) => { + rejectFn = reject; const resolveRunning = () => { settle(() => resolve({ @@ -369,6 +387,7 @@ export function createExecTool( const handleExit = (code: number | null, exitSignal: NodeJS.Signals | number | null) => { if (yieldTimer) clearTimeout(yieldTimer); if (timeoutTimer) clearTimeout(timeoutTimer); + if (timeoutFinalizeTimer) clearTimeout(timeoutFinalizeTimer); const durationMs = Date.now() - startedAt; const wasSignal = exitSignal != null; const isSuccess = code === 0 && !wasSignal && !signal?.aborted && !timedOut; @@ -421,6 +440,7 @@ export function createExecTool( child.once("error", (err) => { if (yieldTimer) clearTimeout(yieldTimer); if (timeoutTimer) clearTimeout(timeoutTimer); + if (timeoutFinalizeTimer) clearTimeout(timeoutFinalizeTimer); markExited(session, null, null, "failed"); settle(() => reject(err)); }); diff --git a/src/auto-reply/media-note.test.ts b/src/auto-reply/media-note.test.ts index 3e6dd3527..0dd403386 100644 --- a/src/auto-reply/media-note.test.ts +++ b/src/auto-reply/media-note.test.ts @@ -25,4 +25,20 @@ describe("buildInboundMediaNote", () => { ].join("\n"), ); }); + + it("skips media notes for attachments with understanding output", () => { + const note = buildInboundMediaNote({ + MediaPaths: ["/tmp/a.png", "/tmp/b.png"], + MediaUrls: ["https://example.com/a.png", "https://example.com/b.png"], + MediaUnderstanding: [ + { + kind: "audio.transcription", + attachmentIndex: 0, + text: "hello", + provider: "groq", + }, + ], + }); + expect(note).toBe("[media attached: /tmp/b.png | https://example.com/b.png]"); + }); }); diff --git a/src/auto-reply/media-note.ts b/src/auto-reply/media-note.ts index d08e7011d..aafadb999 100644 --- a/src/auto-reply/media-note.ts +++ b/src/auto-reply/media-note.ts @@ -18,6 +18,12 @@ function formatMediaAttachedLine(params: { } export function buildInboundMediaNote(ctx: MsgContext): string | undefined { + // Attachment indices follow MediaPaths/MediaUrls ordering as supplied by the channel. + const suppressed = new Set( + Array.isArray(ctx.MediaUnderstanding) + ? ctx.MediaUnderstanding.map((output) => output.attachmentIndex) + : [], + ); const pathsFromArray = Array.isArray(ctx.MediaPaths) ? ctx.MediaPaths : undefined; const paths = pathsFromArray && pathsFromArray.length > 0 @@ -36,24 +42,33 @@ export function buildInboundMediaNote(ctx: MsgContext): string | undefined { ? ctx.MediaTypes : undefined; - if (paths.length === 1) { + const entries = paths + .map((entry, index) => ({ + path: entry ?? "", + type: types?.[index] ?? ctx.MediaType, + url: urls?.[index] ?? ctx.MediaUrl, + index, + })) + .filter((entry) => !suppressed.has(entry.index)); + if (entries.length === 0) return undefined; + if (entries.length === 1) { return formatMediaAttachedLine({ - path: paths[0] ?? "", - type: types?.[0] ?? ctx.MediaType, - url: urls?.[0] ?? ctx.MediaUrl, + path: entries[0]?.path ?? 
"", + type: entries[0]?.type, + url: entries[0]?.url, }); } - const count = paths.length; + const count = entries.length; const lines: string[] = [`[media attached: ${count} files]`]; - for (const [idx, mediaPath] of paths.entries()) { + for (const [idx, entry] of entries.entries()) { lines.push( formatMediaAttachedLine({ - path: mediaPath, + path: entry.path, index: idx + 1, total: count, - type: types?.[idx], - url: urls?.[idx], + type: entry.type, + url: entry.url, }), ); } diff --git a/src/auto-reply/reply/get-reply-directives.ts b/src/auto-reply/reply/get-reply-directives.ts index f0c062d4b..27ec6144f 100644 --- a/src/auto-reply/reply/get-reply-directives.ts +++ b/src/auto-reply/reply/get-reply-directives.ts @@ -122,6 +122,7 @@ export async function resolveReplyDirectives(params: { const commandSource = sessionCtx.CommandBody ?? sessionCtx.RawBody ?? + sessionCtx.Transcript ?? sessionCtx.BodyStripped ?? sessionCtx.Body ?? ""; diff --git a/src/auto-reply/reply/get-reply-run.ts b/src/auto-reply/reply/get-reply-run.ts index ea539d341..a87877ad5 100644 --- a/src/auto-reply/reply/get-reply-run.ts +++ b/src/auto-reply/reply/get-reply-run.ts @@ -87,7 +87,6 @@ type RunPreparedReplyParams = { cap?: number; dropPolicy?: InlineDirectives["dropPolicy"]; }; - transcribedText?: string; typing: TypingController; opts?: GetReplyOptions; defaultModel: string; @@ -210,7 +209,6 @@ export async function runPreparedReply( model, perMessageQueueMode, perMessageQueueOptions, - transcribedText, typing, opts, defaultModel, @@ -325,11 +323,7 @@ export async function runPreparedReply( sessionEntry = skillResult.sessionEntry ?? sessionEntry; currentSystemSent = skillResult.systemSent; const skillsSnapshot = skillResult.skillsSnapshot; - const prefixedBody = transcribedText - ? [threadStarterNote, prefixedBodyBase, `Transcript:\n${transcribedText}`] - .filter(Boolean) - .join("\n\n") - : [threadStarterNote, prefixedBodyBase].filter(Boolean).join("\n\n"); + const prefixedBody = [threadStarterNote, prefixedBodyBase].filter(Boolean).join("\n\n"); const mediaNote = buildInboundMediaNote(ctx); const mediaReplyHint = mediaNote ? "To send an image back, add a line like: MEDIA:https://example.com/image.jpg (no spaces). Keep caption in the text body." @@ -370,11 +364,7 @@ export async function runPreparedReply( } const sessionIdFinal = sessionId ?? crypto.randomUUID(); const sessionFile = resolveSessionFilePath(sessionIdFinal, sessionEntry); - const queueBodyBase = transcribedText - ? [threadStarterNote, baseBodyFinal, `Transcript:\n${transcribedText}`] - .filter(Boolean) - .join("\n\n") - : [threadStarterNote, baseBodyFinal].filter(Boolean).join("\n\n"); + const queueBodyBase = [threadStarterNote, baseBodyFinal].filter(Boolean).join("\n\n"); const queuedBody = mediaNote ? 
[mediaNote, mediaReplyHint, queueBodyBase].filter(Boolean).join("\n").trim() : queueBodyBase; diff --git a/src/auto-reply/reply/get-reply.ts b/src/auto-reply/reply/get-reply.ts index 11b0c7e5b..9bd2cdbdf 100644 --- a/src/auto-reply/reply/get-reply.ts +++ b/src/auto-reply/reply/get-reply.ts @@ -7,12 +7,11 @@ import { resolveModelRefFromString } from "../../agents/model-selection.js"; import { resolveAgentTimeoutMs } from "../../agents/timeout.js"; import { DEFAULT_AGENT_WORKSPACE_DIR, ensureAgentWorkspace } from "../../agents/workspace.js"; import { type ClawdbotConfig, loadConfig } from "../../config/config.js"; -import { logVerbose } from "../../globals.js"; import { defaultRuntime } from "../../runtime.js"; import { resolveCommandAuthorization } from "../command-auth.js"; import type { MsgContext } from "../templating.js"; import { SILENT_REPLY_TOKEN } from "../tokens.js"; -import { hasAudioTranscriptionConfig, isAudio, transcribeInboundAudio } from "../transcription.js"; +import { applyMediaUnderstanding } from "../../media-understanding/apply.js"; import type { GetReplyOptions, ReplyPayload } from "../types.js"; import { resolveDefaultModel } from "./directive-handling.js"; import { resolveReplyDirectives } from "./get-reply-directives.js"; @@ -75,16 +74,11 @@ export async function getReplyFromConfig( }); opts?.onTypingController?.(typing); - let transcribedText: string | undefined; - if (hasAudioTranscriptionConfig(cfg) && isAudio(ctx.MediaType)) { - const transcribed = await transcribeInboundAudio(cfg, ctx, defaultRuntime); - if (transcribed?.text) { - transcribedText = transcribed.text; - ctx.Body = transcribed.text; - ctx.Transcript = transcribed.text; - logVerbose("Replaced Body with audio transcript for reply flow"); - } - } + await applyMediaUnderstanding({ + ctx, + cfg, + agentDir, + }); const commandAuthorized = ctx.CommandAuthorized ?? true; resolveCommandAuthorization({ @@ -253,7 +247,6 @@ export async function getReplyFromConfig( model, perMessageQueueMode, perMessageQueueOptions, - transcribedText, typing, opts, defaultModel, diff --git a/src/auto-reply/templating.ts b/src/auto-reply/templating.ts index 20c2ef6b8..7bf105b86 100644 --- a/src/auto-reply/templating.ts +++ b/src/auto-reply/templating.ts @@ -1,6 +1,7 @@ import type { ChannelId } from "../channels/plugins/types.js"; import type { InternalMessageChannel } from "../utils/message-channel.js"; import type { CommandArgs } from "./commands-registry.types.js"; +import type { MediaUnderstandingOutput } from "../media-understanding/types.js"; /** Valid message channels for routing. */ export type OriginatingChannelType = ChannelId | InternalMessageChannel; @@ -41,6 +42,9 @@ export type MsgContext = { /** Remote host for SCP when media lives on a different machine (e.g., clawdbot@192.168.64.3). 
*/ MediaRemoteHost?: string; Transcript?: string; + MediaUnderstanding?: MediaUnderstandingOutput[]; + Prompt?: string; + MaxChars?: number; ChatType?: string; GroupSubject?: string; GroupRoom?: string; diff --git a/src/config/config.legacy-config-detection.rejects-routing-allowfrom.test.ts b/src/config/config.legacy-config-detection.rejects-routing-allowfrom.test.ts index d0a01b55c..641a1e85b 100644 --- a/src/config/config.legacy-config-detection.rejects-routing-allowfrom.test.ts +++ b/src/config/config.legacy-config-detection.rejects-routing-allowfrom.test.ts @@ -65,7 +65,7 @@ describe("legacy config detection", () => { expect(res.config?.messages?.groupChat?.mentionPatterns).toEqual(["@clawd"]); expect(res.config?.routing?.groupChat?.mentionPatterns).toBeUndefined(); }); - it("migrates routing agentToAgent/queue/transcribeAudio to tools/messages/audio", async () => { + it("migrates routing agentToAgent/queue/transcribeAudio to tools/messages/media", async () => { vi.resetModules(); const { migrateLegacyConfig } = await import("./config.js"); const res = migrateLegacyConfig({ @@ -80,7 +80,7 @@ describe("legacy config detection", () => { }); expect(res.changes).toContain("Moved routing.agentToAgent → tools.agentToAgent."); expect(res.changes).toContain("Moved routing.queue → messages.queue."); - expect(res.changes).toContain("Moved routing.transcribeAudio → tools.audio.transcription."); + expect(res.changes).toContain("Moved routing.transcribeAudio → tools.media.audio.models."); expect(res.config?.tools?.agentToAgent).toEqual({ enabled: true, allow: ["main"], @@ -89,9 +89,16 @@ describe("legacy config detection", () => { mode: "queue", cap: 3, }); - expect(res.config?.tools?.audio?.transcription).toEqual({ - args: ["--model", "base"], - timeoutSeconds: 2, + expect(res.config?.tools?.media?.audio).toEqual({ + enabled: true, + models: [ + { + command: "whisper", + type: "cli", + args: ["--model", "base"], + timeoutSeconds: 2, + }, + ], }); expect(res.config?.routing).toBeUndefined(); }); diff --git a/src/config/legacy.migrations.part-2.ts b/src/config/legacy.migrations.part-2.ts index e48d1b1f8..c161fb5b9 100644 --- a/src/config/legacy.migrations.part-2.ts +++ b/src/config/legacy.migrations.part-2.ts @@ -330,14 +330,17 @@ export const LEGACY_CONFIG_MIGRATIONS_PART_2: LegacyConfigMigration[] = [ const mapped = mapLegacyAudioTranscription(routing.transcribeAudio); if (mapped) { const tools = ensureRecord(raw, "tools"); - const toolsAudio = ensureRecord(tools, "audio"); - if (toolsAudio.transcription === undefined) { - toolsAudio.transcription = mapped; - changes.push("Moved routing.transcribeAudio → tools.audio.transcription."); + const media = ensureRecord(tools, "media"); + const mediaAudio = ensureRecord(media, "audio"); + const models = Array.isArray(mediaAudio.models) + ? 
(mediaAudio.models as unknown[]) + : []; + if (models.length === 0) { + mediaAudio.enabled = true; + mediaAudio.models = [mapped]; + changes.push("Moved routing.transcribeAudio → tools.media.audio.models."); } else { - changes.push( - "Removed routing.transcribeAudio (tools.audio.transcription already set).", - ); + changes.push("Removed routing.transcribeAudio (tools.media.audio.models already set)."); } } else { changes.push("Removed routing.transcribeAudio (unsupported transcription CLI)."); @@ -350,12 +353,17 @@ export const LEGACY_CONFIG_MIGRATIONS_PART_2: LegacyConfigMigration[] = [ const mapped = mapLegacyAudioTranscription(audio.transcription); if (mapped) { const tools = ensureRecord(raw, "tools"); - const toolsAudio = ensureRecord(tools, "audio"); - if (toolsAudio.transcription === undefined) { - toolsAudio.transcription = mapped; - changes.push("Moved audio.transcription → tools.audio.transcription."); + const media = ensureRecord(tools, "media"); + const mediaAudio = ensureRecord(media, "audio"); + const models = Array.isArray(mediaAudio.models) + ? (mediaAudio.models as unknown[]) + : []; + if (models.length === 0) { + mediaAudio.enabled = true; + mediaAudio.models = [mapped]; + changes.push("Moved audio.transcription → tools.media.audio.models."); } else { - changes.push("Removed audio.transcription (tools.audio.transcription already set)."); + changes.push("Removed audio.transcription (tools.media.audio.models already set)."); } delete audio.transcription; if (Object.keys(audio).length === 0) delete raw.audio; diff --git a/src/config/legacy.rules.ts b/src/config/legacy.rules.ts index b04a74783..31bc48037 100644 --- a/src/config/legacy.rules.ts +++ b/src/config/legacy.rules.ts @@ -69,7 +69,7 @@ export const LEGACY_CONFIG_RULES: LegacyConfigRule[] = [ { path: ["routing", "transcribeAudio"], message: - "routing.transcribeAudio was moved; use tools.audio.transcription instead (auto-migrated on load).", + "routing.transcribeAudio was moved; use tools.media.audio.models instead (auto-migrated on load).", }, { path: ["telegram", "requireMention"], diff --git a/src/config/legacy.shared.ts b/src/config/legacy.shared.ts index 567ca308f..d2d361f00 100644 --- a/src/config/legacy.shared.ts +++ b/src/config/legacy.shared.ts @@ -56,7 +56,7 @@ export const mapLegacyAudioTranscription = (value: unknown): Record = {}; + const result: Record = { command: rawExecutable, type: "cli" }; if (args.length > 0) result.args = args; if (timeoutSeconds !== undefined) result.timeoutSeconds = timeoutSeconds; return result; diff --git a/src/config/schema.ts b/src/config/schema.ts index f3ea4211b..a7a9b60a6 100644 --- a/src/config/schema.ts +++ b/src/config/schema.ts @@ -102,8 +102,28 @@ const FIELD_LABELS: Record = { "gateway.remote.password": "Remote Gateway Password", "gateway.auth.token": "Gateway Token", "gateway.auth.password": "Gateway Password", - "tools.audio.transcription.args": "Audio Transcription Args", - "tools.audio.transcription.timeoutSeconds": "Audio Transcription Timeout (sec)", + "tools.media.image.enabled": "Enable Image Understanding", + "tools.media.image.maxBytes": "Image Understanding Max Bytes", + "tools.media.image.maxChars": "Image Understanding Max Chars", + "tools.media.image.prompt": "Image Understanding Prompt", + "tools.media.image.timeoutSeconds": "Image Understanding Timeout (sec)", + "tools.media.image.models": "Image Understanding Models", + "tools.media.image.scope": "Image Understanding Scope", + "tools.media.audio.enabled": "Enable Audio Understanding", + 
"tools.media.audio.maxBytes": "Audio Understanding Max Bytes", + "tools.media.audio.maxChars": "Audio Understanding Max Chars", + "tools.media.audio.prompt": "Audio Understanding Prompt", + "tools.media.audio.timeoutSeconds": "Audio Understanding Timeout (sec)", + "tools.media.audio.language": "Audio Understanding Language", + "tools.media.audio.models": "Audio Understanding Models", + "tools.media.audio.scope": "Audio Understanding Scope", + "tools.media.video.enabled": "Enable Video Understanding", + "tools.media.video.maxBytes": "Video Understanding Max Bytes", + "tools.media.video.maxChars": "Video Understanding Max Chars", + "tools.media.video.prompt": "Video Understanding Prompt", + "tools.media.video.timeoutSeconds": "Video Understanding Timeout (sec)", + "tools.media.video.models": "Video Understanding Models", + "tools.media.video.scope": "Video Understanding Scope", "tools.profile": "Tool Profile", "agents.list[].tools.profile": "Agent Tool Profile", "tools.byProvider": "Tool Policy by Provider", diff --git a/src/config/types.messages.ts b/src/config/types.messages.ts index 40ed2d614..691ca617a 100644 --- a/src/config/types.messages.ts +++ b/src/config/types.messages.ts @@ -47,7 +47,7 @@ export type BroadcastConfig = { }; export type AudioConfig = { - /** @deprecated Use tools.audio.transcription instead. */ + /** @deprecated Use tools.media.audio.models instead. */ transcription?: { // Optional CLI to turn inbound audio into text; templated args, must output transcript to stdout. command: string[]; diff --git a/src/config/types.tools.ts b/src/config/types.tools.ts index 5c87a3f89..80f7dab8e 100644 --- a/src/config/types.tools.ts +++ b/src/config/types.tools.ts @@ -1,4 +1,76 @@ -import type { AgentElevatedAllowFromConfig } from "./types.base.js"; +import type { AgentElevatedAllowFromConfig, SessionSendPolicyAction } from "./types.base.js"; + +export type MediaUnderstandingScopeMatch = { + channel?: string; + chatType?: "direct" | "group" | "room"; + keyPrefix?: string; +}; + +export type MediaUnderstandingScopeRule = { + action: SessionSendPolicyAction; + match?: MediaUnderstandingScopeMatch; +}; + +export type MediaUnderstandingScopeConfig = { + default?: SessionSendPolicyAction; + rules?: MediaUnderstandingScopeRule[]; +}; + +export type MediaUnderstandingCapability = "image" | "audio" | "video"; + +export type MediaUnderstandingModelConfig = { + /** provider API id (e.g. openai, google). */ + provider?: string; + /** Model id for provider-based understanding. */ + model?: string; + /** Optional capability tags for shared model lists. */ + capabilities?: MediaUnderstandingCapability[]; + /** Use a CLI command instead of provider API. */ + type?: "provider" | "cli"; + /** CLI binary (required when type=cli). */ + command?: string; + /** CLI args (template-enabled). */ + args?: string[]; + /** Optional prompt override for this model entry. */ + prompt?: string; + /** Optional max output characters for this model entry. */ + maxChars?: number; + /** Optional max bytes for this model entry. */ + maxBytes?: number; + /** Optional timeout override (seconds) for this model entry. */ + timeoutSeconds?: number; + /** Optional language hint for audio transcription. */ + language?: string; + /** Auth profile id to use for this provider. */ + profile?: string; + /** Preferred profile id if multiple are available. */ + preferredProfile?: string; +}; + +export type MediaUnderstandingConfig = { + /** Enable media understanding when models are configured. 
*/ + enabled?: boolean; + /** Optional scope gating for understanding. */ + scope?: MediaUnderstandingScopeConfig; + /** Default max bytes to send. */ + maxBytes?: number; + /** Default max output characters. */ + maxChars?: number; + /** Default prompt. */ + prompt?: string; + /** Default timeout (seconds). */ + timeoutSeconds?: number; + /** Default language hint (audio). */ + language?: string; + /** Ordered model list (fallbacks in order). */ + models?: MediaUnderstandingModelConfig[]; +}; + +export type MediaToolsConfig = { + image?: MediaUnderstandingConfig; + audio?: MediaUnderstandingConfig; + video?: MediaUnderstandingConfig; +}; export type ToolProfileId = "minimal" | "coding" | "messaging" | "full"; @@ -127,13 +199,7 @@ export type ToolsConfig = { }; }; }; - audio?: { - transcription?: { - /** CLI args (template-enabled). */ - args?: string[]; - timeoutSeconds?: number; - }; - }; + media?: MediaToolsConfig; /** Message tool configuration. */ message?: { /** diff --git a/src/config/zod-schema.agent-runtime.ts b/src/config/zod-schema.agent-runtime.ts index 5946b1b06..6032530aa 100644 --- a/src/config/zod-schema.agent-runtime.ts +++ b/src/config/zod-schema.agent-runtime.ts @@ -5,7 +5,7 @@ import { GroupChatSchema, HumanDelaySchema, IdentitySchema, - ToolsAudioTranscriptionSchema, + ToolsMediaSchema, } from "./zod-schema.core.js"; export const HeartbeatSchema = z @@ -283,11 +283,7 @@ export const ToolsSchema = z deny: z.array(z.string()).optional(), byProvider: z.record(z.string(), ToolPolicyWithProfileSchema).optional(), web: ToolsWebSchema, - audio: z - .object({ - transcription: ToolsAudioTranscriptionSchema, - }) - .optional(), + media: ToolsMediaSchema, message: z .object({ allowCrossContextSend: z.boolean().optional(), diff --git a/src/config/zod-schema.core.ts b/src/config/zod-schema.core.ts index 78327f120..330d55338 100644 --- a/src/config/zod-schema.core.ts +++ b/src/config/zod-schema.core.ts @@ -240,10 +240,68 @@ export const ExecutableTokenSchema = z .string() .refine(isSafeExecutableValue, "expected safe executable name or path"); -export const ToolsAudioTranscriptionSchema = z +export const MediaUnderstandingScopeSchema = z .object({ + default: z.union([z.literal("allow"), z.literal("deny")]).optional(), + rules: z + .array( + z.object({ + action: z.union([z.literal("allow"), z.literal("deny")]), + match: z + .object({ + channel: z.string().optional(), + chatType: z + .union([z.literal("direct"), z.literal("group"), z.literal("room")]) + .optional(), + keyPrefix: z.string().optional(), + }) + .optional(), + }), + ) + .optional(), + }) + .optional(); + +export const MediaUnderstandingCapabilitiesSchema = z + .array(z.union([z.literal("image"), z.literal("audio"), z.literal("video")])) + .optional(); + +export const MediaUnderstandingModelSchema = z + .object({ + provider: z.string().optional(), + model: z.string().optional(), + capabilities: MediaUnderstandingCapabilitiesSchema, + type: z.union([z.literal("provider"), z.literal("cli")]).optional(), + command: z.string().optional(), args: z.array(z.string()).optional(), + prompt: z.string().optional(), + maxChars: z.number().int().positive().optional(), + maxBytes: z.number().int().positive().optional(), timeoutSeconds: z.number().int().positive().optional(), + language: z.string().optional(), + profile: z.string().optional(), + preferredProfile: z.string().optional(), + }) + .optional(); + +export const ToolsMediaUnderstandingSchema = z + .object({ + enabled: z.boolean().optional(), + scope: 
MediaUnderstandingScopeSchema, + maxBytes: z.number().int().positive().optional(), + maxChars: z.number().int().positive().optional(), + prompt: z.string().optional(), + timeoutSeconds: z.number().int().positive().optional(), + language: z.string().optional(), + models: z.array(MediaUnderstandingModelSchema).optional(), + }) + .optional(); + +export const ToolsMediaSchema = z + .object({ + image: ToolsMediaUnderstandingSchema.optional(), + audio: ToolsMediaUnderstandingSchema.optional(), + video: ToolsMediaUnderstandingSchema.optional(), }) .optional(); diff --git a/src/media-understanding/apply.test.ts b/src/media-understanding/apply.test.ts new file mode 100644 index 000000000..f52685e6f --- /dev/null +++ b/src/media-understanding/apply.test.ts @@ -0,0 +1,258 @@ +import fs from "node:fs/promises"; +import os from "node:os"; +import path from "node:path"; + +import { beforeEach, describe, expect, it, vi } from "vitest"; + +import type { ClawdbotConfig } from "../config/config.js"; +import type { MsgContext } from "../auto-reply/templating.js"; +import { resolveApiKeyForProvider } from "../agents/model-auth.js"; +import { fetchRemoteMedia } from "../media/fetch.js"; + +vi.mock("../agents/model-auth.js", () => ({ + resolveApiKeyForProvider: vi.fn(async () => ({ + apiKey: "test-key", + source: "test", + })), +})); + +vi.mock("../media/fetch.js", () => ({ + fetchRemoteMedia: vi.fn(), +})); + +vi.mock("../process/exec.js", () => ({ + runExec: vi.fn(), +})); + +async function loadApply() { + return await import("./apply.js"); +} + +describe("applyMediaUnderstanding", () => { + const mockedResolveApiKey = vi.mocked(resolveApiKeyForProvider); + const mockedFetchRemoteMedia = vi.mocked(fetchRemoteMedia); + + beforeEach(() => { + mockedResolveApiKey.mockClear(); + mockedFetchRemoteMedia.mockReset(); + mockedFetchRemoteMedia.mockResolvedValue({ + buffer: Buffer.from("audio-bytes"), + contentType: "audio/ogg", + fileName: "note.ogg", + }); + }); + + it("sets Transcript and replaces Body when audio transcription succeeds", async () => { + const { applyMediaUnderstanding } = await loadApply(); + const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-media-")); + const audioPath = path.join(dir, "note.ogg"); + await fs.writeFile(audioPath, "hello"); + + const ctx: MsgContext = { + Body: "", + MediaPath: audioPath, + MediaType: "audio/ogg", + }; + const cfg: ClawdbotConfig = { + tools: { + media: { + audio: { + enabled: true, + maxBytes: 1024 * 1024, + models: [{ provider: "groq" }], + }, + }, + }, + }; + + const result = await applyMediaUnderstanding({ + ctx, + cfg, + providers: { + groq: { + id: "groq", + transcribeAudio: async () => ({ text: "transcribed text" }), + }, + }, + }); + + expect(result.appliedAudio).toBe(true); + expect(ctx.Transcript).toBe("transcribed text"); + expect(ctx.Body).toBe("[Audio]\nTranscript:\ntranscribed text"); + expect(ctx.CommandBody).toBe("transcribed text"); + expect(ctx.RawBody).toBe("transcribed text"); + }); + + it("handles URL-only attachments for audio transcription", async () => { + const { applyMediaUnderstanding } = await loadApply(); + const ctx: MsgContext = { + Body: "", + MediaUrl: "https://example.com/note.ogg", + MediaType: "audio/ogg", + ChatType: "dm", + }; + const cfg: ClawdbotConfig = { + tools: { + media: { + audio: { + enabled: true, + maxBytes: 1024 * 1024, + scope: { + default: "deny", + rules: [{ action: "allow", match: { chatType: "direct" } }], + }, + models: [{ provider: "groq" }], + }, + }, + }, + }; + + const result = await 
applyMediaUnderstanding({ + ctx, + cfg, + providers: { + groq: { + id: "groq", + transcribeAudio: async () => ({ text: "remote transcript" }), + }, + }, + }); + + expect(result.appliedAudio).toBe(true); + expect(ctx.Transcript).toBe("remote transcript"); + expect(ctx.Body).toBe("[Audio]\nTranscript:\nremote transcript"); + }); + + it("skips audio transcription when attachment exceeds maxBytes", async () => { + const { applyMediaUnderstanding } = await loadApply(); + const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-media-")); + const audioPath = path.join(dir, "large.wav"); + await fs.writeFile(audioPath, "0123456789"); + + const ctx: MsgContext = { + Body: "", + MediaPath: audioPath, + MediaType: "audio/wav", + }; + const transcribeAudio = vi.fn(async () => ({ text: "should-not-run" })); + const cfg: ClawdbotConfig = { + tools: { + media: { + audio: { + enabled: true, + maxBytes: 4, + models: [{ provider: "groq" }], + }, + }, + }, + }; + + const result = await applyMediaUnderstanding({ + ctx, + cfg, + providers: { groq: { id: "groq", transcribeAudio } }, + }); + + expect(result.appliedAudio).toBe(false); + expect(transcribeAudio).not.toHaveBeenCalled(); + expect(ctx.Body).toBe(""); + }); + + it("falls back to CLI model when provider fails", async () => { + const { applyMediaUnderstanding } = await loadApply(); + const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-media-")); + const audioPath = path.join(dir, "note.ogg"); + await fs.writeFile(audioPath, "hello"); + + const ctx: MsgContext = { + Body: "", + MediaPath: audioPath, + MediaType: "audio/ogg", + }; + const cfg: ClawdbotConfig = { + tools: { + media: { + audio: { + enabled: true, + models: [ + { provider: "groq" }, + { + type: "cli", + command: "whisper", + args: ["{{MediaPath}}"], + }, + ], + }, + }, + }, + }; + + const execModule = await import("../process/exec.js"); + vi.mocked(execModule.runExec).mockResolvedValue({ + stdout: "cli transcript\n", + stderr: "", + }); + + const result = await applyMediaUnderstanding({ + ctx, + cfg, + providers: { + groq: { + id: "groq", + transcribeAudio: async () => { + throw new Error("boom"); + }, + }, + }, + }); + + expect(result.appliedAudio).toBe(true); + expect(ctx.Transcript).toBe("cli transcript"); + expect(ctx.Body).toBe("[Audio]\nTranscript:\ncli transcript"); + }); + + it("uses CLI image understanding and preserves caption for commands", async () => { + const { applyMediaUnderstanding } = await loadApply(); + const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-media-")); + const imagePath = path.join(dir, "photo.jpg"); + await fs.writeFile(imagePath, "image-bytes"); + + const ctx: MsgContext = { + Body: " show Dom", + MediaPath: imagePath, + MediaType: "image/jpeg", + }; + const cfg: ClawdbotConfig = { + tools: { + media: { + image: { + enabled: true, + models: [ + { + type: "cli", + command: "gemini", + args: ["--file", "{{MediaPath}}", "--prompt", "{{Prompt}}"], + }, + ], + }, + }, + }, + }; + + const execModule = await import("../process/exec.js"); + vi.mocked(execModule.runExec).mockResolvedValue({ + stdout: "image description\n", + stderr: "", + }); + + const result = await applyMediaUnderstanding({ + ctx, + cfg, + }); + + expect(result.appliedImage).toBe(true); + expect(ctx.Body).toBe("[Image]\nUser text:\nshow Dom\nDescription:\nimage description"); + expect(ctx.CommandBody).toBe("show Dom"); + expect(ctx.RawBody).toBe("show Dom"); + }); +}); diff --git a/src/media-understanding/apply.ts b/src/media-understanding/apply.ts new file mode 100644 
index 000000000..558b76f57 --- /dev/null +++ b/src/media-understanding/apply.ts @@ -0,0 +1,804 @@ +import crypto from "node:crypto"; +import fs from "node:fs/promises"; +import os from "node:os"; +import path from "node:path"; +import { fileURLToPath } from "node:url"; + +import type { Api, AssistantMessage, Context, Model } from "@mariozechner/pi-ai"; +import { complete } from "@mariozechner/pi-ai"; +import { discoverAuthStorage, discoverModels } from "@mariozechner/pi-coding-agent"; + +import type { ClawdbotConfig } from "../config/config.js"; +import type { MsgContext } from "../auto-reply/templating.js"; +import { applyTemplate } from "../auto-reply/templating.js"; +import { getApiKeyForModel, resolveApiKeyForProvider } from "../agents/model-auth.js"; +import { ensureClawdbotModelsJson } from "../agents/models-config.js"; +import { minimaxUnderstandImage } from "../agents/minimax-vlm.js"; +import { logVerbose, shouldLogVerbose } from "../globals.js"; +import { fetchRemoteMedia } from "../media/fetch.js"; +import { detectMime, getFileExtension, isAudioFileName } from "../media/mime.js"; +import { runExec } from "../process/exec.js"; +import type { + MediaUnderstandingConfig, + MediaUnderstandingModelConfig, + MediaUnderstandingScopeConfig, +} from "../config/types.tools.js"; +import { extractMediaUserText, formatMediaUnderstandingBody } from "./format.js"; +import { + buildMediaUnderstandingRegistry, + getMediaUnderstandingProvider, + normalizeMediaProviderId, +} from "./providers/index.js"; +import { fetchWithTimeout } from "./providers/shared.js"; +import { normalizeMediaUnderstandingChatType, resolveMediaUnderstandingScope } from "./scope.js"; +import type { + MediaAttachment, + MediaUnderstandingOutput, + MediaUnderstandingProvider, +} from "./types.js"; +import { coerceImageAssistantText } from "../agents/tools/image-tool.helpers.js"; + +const MB = 1024 * 1024; +const DEFAULT_MAX_CHARS = 500; +const DEFAULT_MAX_CHARS_BY_CAPABILITY: Record = { + image: DEFAULT_MAX_CHARS, + audio: undefined, + video: DEFAULT_MAX_CHARS, +}; +const DEFAULT_MAX_BYTES: Record = { + image: 10 * MB, + audio: 20 * MB, + video: 50 * MB, +}; +const DEFAULT_TIMEOUT_SECONDS: Record = { + image: 60, + audio: 60, + video: 120, +}; +const DEFAULT_PROMPT: Record = { + image: "Describe the image.", + audio: "Transcribe the audio.", + video: "Describe the video.", +}; +const DEFAULT_VIDEO_MAX_BASE64_BYTES = 70 * MB; +const DEFAULT_AUDIO_MODELS: Record = { + groq: "whisper-large-v3-turbo", + openai: "whisper-1", +}; +const CLI_OUTPUT_MAX_BUFFER = 5 * MB; + +export type ApplyMediaUnderstandingResult = { + outputs: MediaUnderstandingOutput[]; + appliedImage: boolean; + appliedAudio: boolean; + appliedVideo: boolean; +}; + +type Capability = "image" | "audio" | "video"; + +type MediaBufferResult = { + buffer: Buffer; + mime?: string; + fileName: string; +}; + +type MediaPathResult = { + path: string; + cleanup?: () => Promise | void; +}; + +function normalizeAttachmentPath(raw?: string | null): string | undefined { + const value = raw?.trim(); + if (!value) return undefined; + if (value.startsWith("file://")) { + try { + return fileURLToPath(value); + } catch { + return undefined; + } + } + return value; +} + +function normalizeAttachments(ctx: MsgContext): MediaAttachment[] { + const pathsFromArray = Array.isArray(ctx.MediaPaths) ? ctx.MediaPaths : undefined; + const urlsFromArray = Array.isArray(ctx.MediaUrls) ? ctx.MediaUrls : undefined; + const typesFromArray = Array.isArray(ctx.MediaTypes) ? 
ctx.MediaTypes : undefined; + const resolveMime = (count: number, index: number) => { + const typeHint = typesFromArray?.[index]; + const trimmed = typeof typeHint === "string" ? typeHint.trim() : ""; + if (trimmed) return trimmed; + return count === 1 ? ctx.MediaType : undefined; + }; + + if (pathsFromArray && pathsFromArray.length > 0) { + const count = pathsFromArray.length; + const urls = urlsFromArray && urlsFromArray.length > 0 ? urlsFromArray : undefined; + return pathsFromArray + .map((value, index) => ({ + path: value?.trim() || undefined, + url: urls?.[index] ?? ctx.MediaUrl, + mime: resolveMime(count, index), + index, + })) + .filter((entry) => Boolean(entry.path?.trim() || entry.url?.trim())); + } + + if (urlsFromArray && urlsFromArray.length > 0) { + const count = urlsFromArray.length; + return urlsFromArray + .map((value, index) => ({ + path: undefined, + url: value?.trim() || undefined, + mime: resolveMime(count, index), + index, + })) + .filter((entry) => Boolean(entry.url?.trim())); + } + + const pathValue = ctx.MediaPath?.trim(); + const url = ctx.MediaUrl?.trim(); + if (!pathValue && !url) return []; + return [ + { + path: pathValue || undefined, + url: url || undefined, + mime: ctx.MediaType, + index: 0, + }, + ]; +} + +function isVideoAttachment(attachment: MediaAttachment): boolean { + if (attachment.mime?.startsWith("video/")) return true; + const ext = getFileExtension(attachment.path ?? attachment.url); + if (!ext) return false; + return [".mp4", ".mov", ".mkv", ".webm", ".avi", ".m4v"].includes(ext); +} + +function isAudioAttachment(attachment: MediaAttachment): boolean { + if (attachment.mime?.startsWith("audio/")) return true; + return isAudioFileName(attachment.path ?? attachment.url); +} + +function isImageAttachment(attachment: MediaAttachment): boolean { + if (attachment.mime?.startsWith("image/")) return true; + const ext = getFileExtension(attachment.path ?? attachment.url); + if (!ext) return false; + return [".png", ".jpg", ".jpeg", ".webp", ".gif", ".bmp", ".tiff", ".tif"].includes(ext); +} + +function estimateBase64Size(bytes: number): number { + return Math.ceil(bytes / 3) * 4; +} + +function resolveVideoMaxBase64Bytes(maxBytes: number): number { + const expanded = Math.floor(maxBytes * (4 / 3)); + return Math.min(expanded, DEFAULT_VIDEO_MAX_BASE64_BYTES); +} + +function resolveTimeoutMs(seconds: number | undefined, fallbackSeconds: number): number { + const value = typeof seconds === "number" && Number.isFinite(seconds) ? seconds : fallbackSeconds; + return Math.max(1000, Math.floor(value * 1000)); +} + +function resolvePrompt(capability: Capability, prompt?: string, maxChars?: number): string { + const base = prompt?.trim() || DEFAULT_PROMPT[capability]; + if (!maxChars || capability === "audio") return base; + return `${base} Respond in at most ${maxChars} characters.`; +} + +function resolveRequestUrl(input: RequestInfo | URL): string { + if (typeof input === "string") return input; + if (input instanceof URL) return input.toString(); + return input.url; +} + +function normalizeErrorMessage(err: unknown): string { + if (!err) return ""; + if (typeof err === "string") return err; + if (err instanceof Error) return err.message; + try { + return JSON.stringify(err); + } catch { + return ""; + } +} + +function resolveMaxChars(params: { + capability: Capability; + entry: MediaUnderstandingModelConfig; + cfg: ClawdbotConfig; +}): number | undefined { + const { capability, entry, cfg } = params; + const configured = entry.maxChars ?? 
cfg.tools?.media?.[capability]?.maxChars; + if (typeof configured === "number") return configured; + return DEFAULT_MAX_CHARS_BY_CAPABILITY[capability]; +} + +function trimOutput(text: string, maxChars?: number): string { + const trimmed = text.trim(); + if (!maxChars || trimmed.length <= maxChars) return trimmed; + return trimmed.slice(0, maxChars).trim(); +} + +function resolveConfigValue(primary: T | undefined, fallback: T): T { + return primary === undefined ? fallback : primary; +} + +function resolveCapabilityConfig( + cfg: ClawdbotConfig, + capability: Capability, +): MediaUnderstandingConfig | undefined { + return cfg.tools?.media?.[capability]; +} + +function resolveScopeDecision(params: { + scope?: MediaUnderstandingScopeConfig; + ctx: MsgContext; +}): "allow" | "deny" { + return resolveMediaUnderstandingScope({ + scope: params.scope, + sessionKey: params.ctx.SessionKey, + channel: params.ctx.Surface ?? params.ctx.Provider, + chatType: normalizeMediaUnderstandingChatType(params.ctx.ChatType), + }); +} + +function resolveModelEntries( + cfg: MediaUnderstandingConfig | undefined, + capability: Capability, +): MediaUnderstandingModelConfig[] { + const models = cfg?.models ?? []; + if (models.length === 0) return []; + return models.filter((entry) => { + const caps = entry.capabilities; + if (!caps || caps.length === 0) return true; + return caps.includes(capability); + }); +} + +function isMaxBytesError(err: unknown): boolean { + const message = normalizeErrorMessage(err); + if (!message) return false; + return message.includes("exceeds maxBytes") || message.includes("payload exceeds maxBytes"); +} + +async function loadAttachmentBuffer(params: { + attachment: MediaAttachment; + maxBytes: number; + timeoutMs: number; +}): Promise { + const { attachment, maxBytes, timeoutMs } = params; + const rawPath = normalizeAttachmentPath(attachment.path); + if (rawPath) { + const resolved = path.isAbsolute(rawPath) ? rawPath : path.resolve(rawPath); + try { + const stat = await fs.stat(resolved); + if (!stat.isFile()) return undefined; + if (stat.size > maxBytes) { + if (shouldLogVerbose()) { + logVerbose( + `Skipping media attachment ${attachment.index + 1}: ${stat.size} bytes exceeds ${maxBytes}`, + ); + } + return undefined; + } + const buffer = await fs.readFile(resolved); + const mime = + attachment.mime ?? + (await detectMime({ + buffer, + filePath: resolved, + })); + const fileName = path.basename(resolved) || `media-${attachment.index + 1}`; + return { buffer, mime, fileName }; + } catch (err) { + if (shouldLogVerbose()) { + logVerbose(`Failed to read attachment ${attachment.index + 1}: ${String(err)}`); + } + } + } + + const url = attachment.url?.trim(); + if (!url) return undefined; + + try { + const fetchImpl = (input: RequestInfo | URL, init?: RequestInit) => + fetchWithTimeout(resolveRequestUrl(input), init ?? {}, timeoutMs, fetch); + const fetched = await fetchRemoteMedia({ url, fetchImpl, maxBytes }); + if (fetched.buffer.length > maxBytes) { + if (shouldLogVerbose()) { + logVerbose( + `Skipping media attachment ${attachment.index + 1}: ${fetched.buffer.length} bytes exceeds ${maxBytes}`, + ); + } + return undefined; + } + const mime = + attachment.mime ?? + fetched.contentType ?? + (await detectMime({ + buffer: fetched.buffer, + filePath: fetched.fileName ?? url, + })); + const fileName = fetched.fileName ?? 
`media-${attachment.index + 1}`; + return { buffer: fetched.buffer, mime, fileName }; + } catch (err) { + if (shouldLogVerbose()) { + logVerbose(`Failed to fetch attachment ${attachment.index + 1}: ${String(err)}`); + } + } + + return undefined; +} + +async function resolveAttachmentPath(params: { + attachment: MediaAttachment; + maxBytes?: number; + timeoutMs: number; +}): Promise { + const { attachment, maxBytes, timeoutMs } = params; + const rawPath = normalizeAttachmentPath(attachment.path); + if (rawPath) { + const resolved = path.isAbsolute(rawPath) ? rawPath : path.resolve(rawPath); + try { + const stat = await fs.stat(resolved); + if (!stat.isFile()) return undefined; + if (maxBytes && stat.size > maxBytes) { + if (shouldLogVerbose()) { + logVerbose( + `Skipping media attachment ${attachment.index + 1}: ${stat.size} bytes exceeds ${maxBytes}`, + ); + } + return undefined; + } + return { path: resolved }; + } catch (err) { + if (shouldLogVerbose()) { + logVerbose(`Failed to read attachment ${attachment.index + 1}: ${String(err)}`); + } + } + } + + const url = attachment.url?.trim(); + if (!url) return undefined; + + try { + const fetchImpl = (input: RequestInfo | URL, init?: RequestInit) => + fetchWithTimeout(resolveRequestUrl(input), init ?? {}, timeoutMs, fetch); + const fetched = await fetchRemoteMedia({ url, fetchImpl, maxBytes }); + const buffer = fetched.buffer; + if (maxBytes && buffer.length > maxBytes) { + if (shouldLogVerbose()) { + logVerbose( + `Skipping media attachment ${attachment.index + 1}: ${buffer.length} bytes exceeds ${maxBytes}`, + ); + } + return undefined; + } + const extension = fetched.fileName ? path.extname(fetched.fileName) : ""; + const tmpPath = path.join( + os.tmpdir(), + `clawdbot-media-${crypto.randomUUID()}${extension || ""}`, + ); + await fs.writeFile(tmpPath, buffer); + return { + path: tmpPath, + cleanup: async () => { + await fs.unlink(tmpPath).catch(() => {}); + }, + }; + } catch (err) { + if (shouldLogVerbose()) { + logVerbose(`Failed to fetch attachment ${attachment.index + 1}: ${String(err)}`); + } + } + + return undefined; +} + +async function describeImageWithModel(params: { + cfg: ClawdbotConfig; + agentDir: string; + provider: string; + model: string; + prompt: string; + maxChars?: number; + buffer: Buffer; + mimeType: string; + profile?: string; + preferredProfile?: string; +}): Promise<{ text: string; model: string }> { + await ensureClawdbotModelsJson(params.cfg, params.agentDir); + const authStorage = discoverAuthStorage(params.agentDir); + const modelRegistry = discoverModels(authStorage, params.agentDir); + const model = modelRegistry.find(params.provider, params.model) as Model | null; + if (!model) { + throw new Error(`Unknown model: ${params.provider}/${params.model}`); + } + if (!model.input?.includes("image")) { + throw new Error(`Model does not support images: ${params.provider}/${params.model}`); + } + const apiKeyInfo = await getApiKeyForModel({ + model, + cfg: params.cfg, + agentDir: params.agentDir, + profileId: params.profile, + preferredProfile: params.preferredProfile, + }); + authStorage.setRuntimeApiKey(model.provider, apiKeyInfo.apiKey); + + const base64 = params.buffer.toString("base64"); + if (model.provider === "minimax") { + const text = await minimaxUnderstandImage({ + apiKey: apiKeyInfo.apiKey, + prompt: params.prompt, + imageDataUrl: `data:${params.mimeType};base64,${base64}`, + modelBaseUrl: model.baseUrl, + }); + return { text, model: model.id }; + } + + const context: Context = { + messages: [ + { + role: 
"user", + content: [ + { type: "text", text: params.prompt }, + { type: "image", data: base64, mimeType: params.mimeType }, + ], + timestamp: Date.now(), + }, + ], + }; + const message = (await complete(model, context, { + apiKey: apiKeyInfo.apiKey, + maxTokens: 512, + })) as AssistantMessage; + const text = coerceImageAssistantText({ + message, + provider: model.provider, + model: model.id, + }); + return { text, model: model.id }; +} + +async function runProviderEntry(params: { + capability: Capability; + entry: MediaUnderstandingModelConfig; + cfg: ClawdbotConfig; + ctx: MsgContext; + attachment: MediaAttachment; + agentDir?: string; + providerRegistry: Map; +}): Promise { + const { entry, capability, cfg, attachment } = params; + const providerIdRaw = entry.provider?.trim(); + if (!providerIdRaw) { + throw new Error(`Provider entry missing provider for ${capability}`); + } + const providerId = normalizeMediaProviderId(providerIdRaw); + const maxBytes = entry.maxBytes ?? resolveConfigValue(cfg.tools?.media?.[capability]?.maxBytes, DEFAULT_MAX_BYTES[capability]); + const maxChars = resolveMaxChars({ capability, entry, cfg }); + const timeoutMs = resolveTimeoutMs( + entry.timeoutSeconds ?? cfg.tools?.media?.[capability]?.timeoutSeconds, + DEFAULT_TIMEOUT_SECONDS[capability], + ); + const prompt = resolvePrompt( + capability, + entry.prompt ?? cfg.tools?.media?.[capability]?.prompt, + maxChars, + ); + + if (capability === "image") { + if (!params.agentDir) { + throw new Error("Image understanding requires agentDir"); + } + const modelId = entry.model?.trim(); + if (!modelId) { + throw new Error("Image understanding requires model id"); + } + const media = await loadAttachmentBuffer({ attachment, maxBytes, timeoutMs }); + if (!media) return null; + const mimeType = media.mime ?? "image/jpeg"; + const result = await describeImageWithModel({ + cfg, + agentDir: params.agentDir, + provider: providerId, + model: modelId, + prompt, + maxChars, + buffer: media.buffer, + mimeType, + profile: entry.profile, + preferredProfile: entry.preferredProfile, + }); + return { + kind: "image.description", + attachmentIndex: attachment.index, + text: trimOutput(result.text, maxChars), + provider: providerId, + model: result.model, + }; + } + + const provider = getMediaUnderstandingProvider(providerId, params.providerRegistry); + if (!provider) { + throw new Error(`Media provider not available: ${providerId}`); + } + + if (capability === "audio") { + if (!provider.transcribeAudio) { + throw new Error(`Audio transcription provider "${providerId}" not available.`); + } + const media = await loadAttachmentBuffer({ attachment, maxBytes, timeoutMs }); + if (!media) return null; + const key = await resolveApiKeyForProvider({ + provider: providerId, + cfg, + profileId: entry.profile, + preferredProfile: entry.preferredProfile, + agentDir: params.agentDir, + }); + const providerConfig = cfg.models?.providers?.[providerId]; + const model = entry.model?.trim() || DEFAULT_AUDIO_MODELS[providerId] || entry.model; + const result = await provider.transcribeAudio({ + buffer: media.buffer, + fileName: media.fileName, + mime: media.mime, + apiKey: key.apiKey, + baseUrl: providerConfig?.baseUrl, + headers: providerConfig?.headers, + model, + language: entry.language ?? cfg.tools?.media?.audio?.language, + prompt, + timeoutMs, + }); + return { + kind: "audio.transcription", + attachmentIndex: attachment.index, + text: trimOutput(result.text, maxChars), + provider: providerId, + model: result.model ?? 
model, + }; + } + + if (capability === "video") { + if (!provider.describeVideo) { + throw new Error(`Video understanding provider "${providerId}" not available.`); + } + const media = await loadAttachmentBuffer({ attachment, maxBytes, timeoutMs }); + if (!media) return null; + const estimatedBase64Bytes = estimateBase64Size(media.buffer.length); + const maxBase64Bytes = resolveVideoMaxBase64Bytes(maxBytes); + if (estimatedBase64Bytes > maxBase64Bytes) { + if (shouldLogVerbose()) { + logVerbose( + `Skipping video attachment ${attachment.index + 1}: base64 payload ${estimatedBase64Bytes} exceeds ${maxBase64Bytes}`, + ); + } + return null; + } + const key = await resolveApiKeyForProvider({ + provider: providerId, + cfg, + profileId: entry.profile, + preferredProfile: entry.preferredProfile, + agentDir: params.agentDir, + }); + const providerConfig = cfg.models?.providers?.[providerId]; + const result = await provider.describeVideo({ + buffer: media.buffer, + fileName: media.fileName, + mime: media.mime, + apiKey: key.apiKey, + baseUrl: providerConfig?.baseUrl, + headers: providerConfig?.headers, + model: entry.model, + prompt, + timeoutMs, + }); + return { + kind: "video.description", + attachmentIndex: attachment.index, + text: trimOutput(result.text, maxChars), + provider: providerId, + model: result.model ?? entry.model, + }; + } + + return null; +} + +async function runCliEntry(params: { + capability: Capability; + entry: MediaUnderstandingModelConfig; + cfg: ClawdbotConfig; + ctx: MsgContext; + attachment: MediaAttachment; +}): Promise { + const { entry, capability, cfg, ctx, attachment } = params; + const command = entry.command?.trim(); + const args = entry.args ?? []; + if (!command) { + throw new Error(`CLI entry missing command for ${capability}`); + } + const maxBytes = entry.maxBytes ?? resolveConfigValue(cfg.tools?.media?.[capability]?.maxBytes, DEFAULT_MAX_BYTES[capability]); + const maxChars = resolveMaxChars({ capability, entry, cfg }); + const timeoutMs = resolveTimeoutMs( + entry.timeoutSeconds ?? cfg.tools?.media?.[capability]?.timeoutSeconds, + DEFAULT_TIMEOUT_SECONDS[capability], + ); + const prompt = resolvePrompt( + capability, + entry.prompt ?? cfg.tools?.media?.[capability]?.prompt, + maxChars, + ); + const pathResult = await resolveAttachmentPath({ + attachment, + maxBytes, + timeoutMs, + }); + if (!pathResult) return null; + + const templCtx: MsgContext = { + ...ctx, + MediaPath: pathResult.path, + Prompt: prompt, + MaxChars: maxChars, + }; + const argv = [command, ...args].map((part, index) => + index === 0 ? part : applyTemplate(part, templCtx), + ); + if (shouldLogVerbose()) { + logVerbose(`Media understanding via CLI: ${argv.join(" ")}`); + } + try { + const { stdout } = await runExec(argv[0], argv.slice(1), { + timeoutMs, + maxBuffer: CLI_OUTPUT_MAX_BUFFER, + }); + const text = trimOutput(stdout, maxChars); + if (!text) return null; + return { + kind: capability === "audio" ? 
"audio.transcription" : `${capability}.description`, + attachmentIndex: attachment.index, + text, + provider: "cli", + model: command, + }; + } finally { + if (pathResult.cleanup) { + await pathResult.cleanup(); + } + } +} + +async function runCapability(params: { + capability: Capability; + cfg: ClawdbotConfig; + ctx: MsgContext; + attachments: MediaAttachment[]; + agentDir?: string; + providerRegistry: Map; +}): Promise { + const { capability, cfg, ctx, attachments } = params; + const config = resolveCapabilityConfig(cfg, capability); + if (!config || config.enabled === false) return null; + const entries = resolveModelEntries(config, capability); + if (entries.length === 0) return null; + + const scopeDecision = resolveScopeDecision({ scope: config.scope, ctx }); + if (scopeDecision === "deny") { + if (shouldLogVerbose()) { + logVerbose(`${capability} understanding disabled by scope policy.`); + } + return null; + } + + const attachment = attachments.find((item) => { + if (capability === "image") return isImageAttachment(item); + if (capability === "audio") return isAudioAttachment(item); + return isVideoAttachment(item); + }); + if (!attachment) return null; + + for (const entry of entries) { + try { + const entryType = entry.type ?? (entry.command ? "cli" : "provider"); + const result = + entryType === "cli" + ? await runCliEntry({ capability, entry, cfg, ctx, attachment }) + : await runProviderEntry({ + capability, + entry, + cfg, + ctx, + attachment, + agentDir: params.agentDir, + providerRegistry: params.providerRegistry, + }); + if (result) return result; + } catch (err) { + if (isMaxBytesError(err)) { + if (shouldLogVerbose()) { + logVerbose(`Skipping ${capability} model due to size: ${String(err)}`); + } + } else if (shouldLogVerbose()) { + logVerbose(`${capability} understanding failed: ${String(err)}`); + } + } + } + + return null; +} + +export async function applyMediaUnderstanding(params: { + ctx: MsgContext; + cfg: ClawdbotConfig; + agentDir?: string; + providers?: Record; +}): Promise { + const { ctx, cfg } = params; + const commandCandidates = [ctx.CommandBody, ctx.RawBody, ctx.Body]; + const originalUserText = + commandCandidates + .map((value) => extractMediaUserText(value)) + .find((value) => value && value.trim()) ?? undefined; + + const attachments = normalizeAttachments(ctx); + const providerRegistry = buildMediaUnderstandingRegistry(params.providers); + const outputs: MediaUnderstandingOutput[] = []; + + const imageOutput = await runCapability({ + capability: "image", + cfg, + ctx, + attachments, + agentDir: params.agentDir, + providerRegistry, + }); + if (imageOutput) outputs.push(imageOutput); + + const audioOutput = await runCapability({ + capability: "audio", + cfg, + ctx, + attachments, + agentDir: params.agentDir, + providerRegistry, + }); + if (audioOutput) outputs.push(audioOutput); + + const videoOutput = await runCapability({ + capability: "video", + cfg, + ctx, + attachments, + agentDir: params.agentDir, + providerRegistry, + }); + if (videoOutput) outputs.push(videoOutput); + + if (outputs.length > 0) { + ctx.Body = formatMediaUnderstandingBody({ body: ctx.Body, outputs }); + const audioResult = outputs.find((output) => output.kind === "audio.transcription"); + if (audioResult) { + ctx.Transcript = audioResult.text; + ctx.CommandBody = audioResult.text; + ctx.RawBody = audioResult.text; + } else if (originalUserText) { + ctx.CommandBody = originalUserText; + ctx.RawBody = originalUserText; + } + ctx.MediaUnderstanding = [...(ctx.MediaUnderstanding ?? 
[]), ...outputs]; + } + + return { + outputs, + appliedImage: outputs.some((output) => output.kind === "image.description"), + appliedAudio: outputs.some((output) => output.kind === "audio.transcription"), + appliedVideo: outputs.some((output) => output.kind === "video.description"), + }; +} diff --git a/src/media-understanding/format.test.ts b/src/media-understanding/format.test.ts new file mode 100644 index 000000000..172ecadf9 --- /dev/null +++ b/src/media-understanding/format.test.ts @@ -0,0 +1,91 @@ +import { describe, expect, it } from "vitest"; +import { formatMediaUnderstandingBody } from "./format.js"; + +describe("formatMediaUnderstandingBody", () => { + it("replaces placeholder body with transcript", () => { + const body = formatMediaUnderstandingBody({ + body: "", + outputs: [ + { + kind: "audio.transcription", + attachmentIndex: 0, + text: "hello world", + provider: "groq", + }, + ], + }); + expect(body).toBe("[Audio]\nTranscript:\nhello world"); + }); + + it("includes user text when body is meaningful", () => { + const body = formatMediaUnderstandingBody({ + body: "caption here", + outputs: [ + { + kind: "audio.transcription", + attachmentIndex: 0, + text: "transcribed", + provider: "groq", + }, + ], + }); + expect(body).toBe("[Audio]\nUser text:\ncaption here\nTranscript:\ntranscribed"); + }); + + it("strips leading media placeholders from user text", () => { + const body = formatMediaUnderstandingBody({ + body: " caption here", + outputs: [ + { + kind: "audio.transcription", + attachmentIndex: 0, + text: "transcribed", + provider: "groq", + }, + ], + }); + expect(body).toBe("[Audio]\nUser text:\ncaption here\nTranscript:\ntranscribed"); + }); + + it("keeps user text once when multiple outputs exist", () => { + const body = formatMediaUnderstandingBody({ + body: "caption here", + outputs: [ + { + kind: "audio.transcription", + attachmentIndex: 0, + text: "audio text", + provider: "groq", + }, + { + kind: "video.description", + attachmentIndex: 1, + text: "video text", + provider: "google", + }, + ], + }); + expect(body).toBe( + [ + "User text:\ncaption here", + "[Audio]\nTranscript:\naudio text", + "[Video]\nDescription:\nvideo text", + ].join("\n\n"), + ); + }); + + it("formats image outputs", () => { + const body = formatMediaUnderstandingBody({ + body: "", + outputs: [ + { + kind: "image.description", + attachmentIndex: 0, + text: "a cat", + provider: "openai", + }, + ], + }); + expect(body).toBe("[Image]\nDescription:\na cat"); + }); +}); diff --git a/src/media-understanding/format.ts b/src/media-understanding/format.ts new file mode 100644 index 000000000..ffa6f0145 --- /dev/null +++ b/src/media-understanding/format.ts @@ -0,0 +1,77 @@ +import type { MediaUnderstandingOutput } from "./types.js"; + +const MEDIA_PLACEHOLDER_RE = /^]+>(\s*\([^)]*\))?$/i; +const MEDIA_PLACEHOLDER_TOKEN_RE = /^]+>(\s*\([^)]*\))?\s*/i; + +export function extractMediaUserText(body?: string): string | undefined { + const trimmed = body?.trim() ?? 
""; + if (!trimmed) return undefined; + if (MEDIA_PLACEHOLDER_RE.test(trimmed)) return undefined; + const cleaned = trimmed.replace(MEDIA_PLACEHOLDER_TOKEN_RE, "").trim(); + return cleaned || undefined; +} + +function formatSection( + title: "Audio" | "Video" | "Image", + kind: "Transcript" | "Description", + text: string, + userText?: string, +): string { + const lines = [`[${title}]`]; + if (userText) { + lines.push(`User text:\n${userText}`); + } + lines.push(`${kind}:\n${text}`); + return lines.join("\n"); +} + +export function formatMediaUnderstandingBody(params: { + body?: string; + outputs: MediaUnderstandingOutput[]; +}): string { + const outputs = params.outputs.filter((output) => output.text.trim()); + if (outputs.length === 0) { + return params.body ?? ""; + } + + const userText = extractMediaUserText(params.body); + const sections: string[] = []; + if (userText && outputs.length > 1) { + sections.push(`User text:\n${userText}`); + } + + for (const output of outputs) { + if (output.kind === "audio.transcription") { + sections.push( + formatSection( + "Audio", + "Transcript", + output.text, + outputs.length === 1 ? userText : undefined, + ), + ); + continue; + } + if (output.kind === "image.description") { + sections.push( + formatSection( + "Image", + "Description", + output.text, + outputs.length === 1 ? userText : undefined, + ), + ); + continue; + } + sections.push( + formatSection( + "Video", + "Description", + output.text, + outputs.length === 1 ? userText : undefined, + ), + ); + } + + return sections.join("\n\n").trim(); +} diff --git a/src/media-understanding/index.ts b/src/media-understanding/index.ts new file mode 100644 index 000000000..6afa22a54 --- /dev/null +++ b/src/media-understanding/index.ts @@ -0,0 +1,9 @@ +export { applyMediaUnderstanding } from "./apply.js"; +export { formatMediaUnderstandingBody } from "./format.js"; +export { resolveMediaUnderstandingScope } from "./scope.js"; +export type { + MediaAttachment, + MediaUnderstandingOutput, + MediaUnderstandingProvider, + MediaUnderstandingKind, +} from "./types.js"; diff --git a/src/media-understanding/providers/google/index.ts b/src/media-understanding/providers/google/index.ts new file mode 100644 index 000000000..285195dc7 --- /dev/null +++ b/src/media-understanding/providers/google/index.ts @@ -0,0 +1,7 @@ +import type { MediaUnderstandingProvider } from "../../types.js"; +import { describeGeminiVideo } from "./video.js"; + +export const googleProvider: MediaUnderstandingProvider = { + id: "google", + describeVideo: describeGeminiVideo, +}; diff --git a/src/media-understanding/providers/google/video.test.ts b/src/media-understanding/providers/google/video.test.ts new file mode 100644 index 000000000..f94778cdd --- /dev/null +++ b/src/media-understanding/providers/google/video.test.ts @@ -0,0 +1,93 @@ +import { describe, expect, it } from "vitest"; + +import { describeGeminiVideo } from "./video.js"; + +const resolveRequestUrl = (input: RequestInfo | URL) => { + if (typeof input === "string") return input; + if (input instanceof URL) return input.toString(); + return input.url; +}; + +describe("describeGeminiVideo", () => { + it("respects case-insensitive x-goog-api-key overrides", async () => { + let seenKey: string | null = null; + const fetchFn = async (_input: RequestInfo | URL, init?: RequestInit) => { + const headers = new Headers(init?.headers); + seenKey = headers.get("x-goog-api-key"); + return new Response( + JSON.stringify({ + candidates: [{ content: { parts: [{ text: "video ok" }] } }], + }), 
+ { status: 200, headers: { "content-type": "application/json" } }, + ); + }; + + const result = await describeGeminiVideo({ + buffer: Buffer.from("video"), + fileName: "clip.mp4", + apiKey: "test-key", + timeoutMs: 1000, + headers: { "X-Goog-Api-Key": "override" }, + fetchFn, + }); + + expect(seenKey).toBe("override"); + expect(result.text).toBe("video ok"); + }); + + it("builds the expected request payload", async () => { + let seenUrl: string | null = null; + let seenInit: RequestInit | undefined; + const fetchFn = async (input: RequestInfo | URL, init?: RequestInit) => { + seenUrl = resolveRequestUrl(input); + seenInit = init; + return new Response( + JSON.stringify({ + candidates: [ + { + content: { + parts: [{ text: "first" }, { text: " second " }, { text: "" }], + }, + }, + ], + }), + { status: 200, headers: { "content-type": "application/json" } }, + ); + }; + + const result = await describeGeminiVideo({ + buffer: Buffer.from("video-bytes"), + fileName: "clip.mp4", + apiKey: "test-key", + timeoutMs: 1500, + baseUrl: "https://example.com/v1beta/", + model: "gemini-3-pro", + headers: { "X-Other": "1" }, + fetchFn, + }); + + expect(result.model).toBe("gemini-3-pro-preview"); + expect(result.text).toBe("first\nsecond"); + expect(seenUrl).toBe("https://example.com/v1beta/models/gemini-3-pro-preview:generateContent"); + expect(seenInit?.method).toBe("POST"); + expect(seenInit?.signal).toBeInstanceOf(AbortSignal); + + const headers = new Headers(seenInit?.headers); + expect(headers.get("x-goog-api-key")).toBe("test-key"); + expect(headers.get("content-type")).toBe("application/json"); + expect(headers.get("x-other")).toBe("1"); + + const bodyText = + typeof seenInit?.body === "string" + ? seenInit.body + : Buffer.isBuffer(seenInit?.body) + ? seenInit.body.toString("utf8") + : ""; + const body = JSON.parse(bodyText); + expect(body.contents?.[0]?.parts?.[0]?.text).toBe("Describe the video."); + expect(body.contents?.[0]?.parts?.[1]?.inline_data?.mime_type).toBe("video/mp4"); + expect(body.contents?.[0]?.parts?.[1]?.inline_data?.data).toBe( + Buffer.from("video-bytes").toString("base64"), + ); + }); +}); diff --git a/src/media-understanding/providers/google/video.ts b/src/media-understanding/providers/google/video.ts new file mode 100644 index 000000000..0f483b280 --- /dev/null +++ b/src/media-understanding/providers/google/video.ts @@ -0,0 +1,84 @@ +import type { VideoDescriptionRequest, VideoDescriptionResult } from "../../types.js"; +import { normalizeGoogleModelId } from "../../../agents/models-config.providers.js"; +import { fetchWithTimeout, normalizeBaseUrl, readErrorResponse } from "../shared.js"; + +export const DEFAULT_GOOGLE_VIDEO_BASE_URL = "https://generativelanguage.googleapis.com/v1beta"; +const DEFAULT_GOOGLE_VIDEO_MODEL = "gemini-3-flash-preview"; +const DEFAULT_GOOGLE_VIDEO_PROMPT = "Describe the video."; + +function resolveModel(model?: string): string { + const trimmed = model?.trim(); + if (!trimmed) return DEFAULT_GOOGLE_VIDEO_MODEL; + return normalizeGoogleModelId(trimmed); +} + +function resolvePrompt(prompt?: string): string { + const trimmed = prompt?.trim(); + return trimmed || DEFAULT_GOOGLE_VIDEO_PROMPT; +} + +export async function describeGeminiVideo( + params: VideoDescriptionRequest, +): Promise { + const fetchFn = params.fetchFn ?? 
fetch; + const baseUrl = normalizeBaseUrl(params.baseUrl, DEFAULT_GOOGLE_VIDEO_BASE_URL); + const model = resolveModel(params.model); + const url = `${baseUrl}/models/${model}:generateContent`; + + const headers = new Headers(params.headers); + if (!headers.has("content-type")) { + headers.set("content-type", "application/json"); + } + if (!headers.has("x-goog-api-key")) { + headers.set("x-goog-api-key", params.apiKey); + } + + const body = { + contents: [ + { + role: "user", + parts: [ + { text: resolvePrompt(params.prompt) }, + { + inline_data: { + mime_type: params.mime ?? "video/mp4", + data: params.buffer.toString("base64"), + }, + }, + ], + }, + ], + }; + + const res = await fetchWithTimeout( + url, + { + method: "POST", + headers, + body: JSON.stringify(body), + }, + params.timeoutMs, + fetchFn, + ); + + if (!res.ok) { + const detail = await readErrorResponse(res); + const suffix = detail ? `: ${detail}` : ""; + throw new Error(`Video description failed (HTTP ${res.status})${suffix}`); + } + + const payload = (await res.json()) as { + candidates?: Array<{ + content?: { parts?: Array<{ text?: string }> }; + }>; + }; + const parts = payload.candidates?.[0]?.content?.parts ?? []; + const text = parts + .map((part) => part?.text?.trim()) + .filter(Boolean) + .join("\n"); + if (!text) { + throw new Error("Video description response missing text"); + } + return { text, model }; +} diff --git a/src/media-understanding/providers/groq/index.ts b/src/media-understanding/providers/groq/index.ts new file mode 100644 index 000000000..451799e8e --- /dev/null +++ b/src/media-understanding/providers/groq/index.ts @@ -0,0 +1,13 @@ +import type { MediaUnderstandingProvider } from "../../types.js"; +import { transcribeOpenAiCompatibleAudio } from "../openai/audio.js"; + +const DEFAULT_GROQ_AUDIO_BASE_URL = "https://api.groq.com/openai/v1"; + +export const groqProvider: MediaUnderstandingProvider = { + id: "groq", + transcribeAudio: (req) => + transcribeOpenAiCompatibleAudio({ + ...req, + baseUrl: req.baseUrl ?? 
DEFAULT_GROQ_AUDIO_BASE_URL,
+      }),
+};
diff --git a/src/media-understanding/providers/index.ts b/src/media-understanding/providers/index.ts
new file mode 100644
index 000000000..fef5d6531
--- /dev/null
+++ b/src/media-understanding/providers/index.ts
@@ -0,0 +1,35 @@
+import { normalizeProviderId } from "../../agents/model-selection.js";
+import type { MediaUnderstandingProvider } from "../types.js";
+import { googleProvider } from "./google/index.js";
+import { groqProvider } from "./groq/index.js";
+import { openaiProvider } from "./openai/index.js";
+
+const PROVIDERS: MediaUnderstandingProvider[] = [groqProvider, openaiProvider, googleProvider];
+
+export function normalizeMediaProviderId(id: string): string {
+  const normalized = normalizeProviderId(id);
+  if (normalized === "gemini") return "google";
+  return normalized;
+}
+
+export function buildMediaUnderstandingRegistry(
+  overrides?: Record<string, MediaUnderstandingProvider>,
+): Map<string, MediaUnderstandingProvider> {
+  const registry = new Map<string, MediaUnderstandingProvider>();
+  for (const provider of PROVIDERS) {
+    registry.set(normalizeMediaProviderId(provider.id), provider);
+  }
+  if (overrides) {
+    for (const [key, provider] of Object.entries(overrides)) {
+      registry.set(normalizeMediaProviderId(key), provider);
+    }
+  }
+  return registry;
+}
+
+export function getMediaUnderstandingProvider(
+  id: string,
+  registry: Map<string, MediaUnderstandingProvider>,
+): MediaUnderstandingProvider | undefined {
+  return registry.get(normalizeMediaProviderId(id));
+}
diff --git a/src/media-understanding/providers/openai/audio.test.ts b/src/media-understanding/providers/openai/audio.test.ts
new file mode 100644
index 000000000..88c713f2a
--- /dev/null
+++ b/src/media-understanding/providers/openai/audio.test.ts
@@ -0,0 +1,86 @@
+import { describe, expect, it } from "vitest";
+
+import { transcribeOpenAiCompatibleAudio } from "./audio.js";
+
+const resolveRequestUrl = (input: RequestInfo | URL) => {
+  if (typeof input === "string") return input;
+  if (input instanceof URL) return input.toString();
+  return input.url;
+};
+
+describe("transcribeOpenAiCompatibleAudio", () => {
+  it("respects lowercase authorization header overrides", async () => {
+    let seenAuth: string | null = null;
+    const fetchFn = async (_input: RequestInfo | URL, init?: RequestInit) => {
+      const headers = new Headers(init?.headers);
+      seenAuth = headers.get("authorization");
+      return new Response(JSON.stringify({ text: "ok" }), {
+        status: 200,
+        headers: { "content-type": "application/json" },
+      });
+    };
+
+    const result = await transcribeOpenAiCompatibleAudio({
+      buffer: Buffer.from("audio"),
+      fileName: "note.mp3",
+      apiKey: "test-key",
+      timeoutMs: 1000,
+      headers: { authorization: "Bearer override" },
+      fetchFn,
+    });
+
+    expect(seenAuth).toBe("Bearer override");
+    expect(result.text).toBe("ok");
+  });
+
+  it("builds the expected request payload", async () => {
+    let seenUrl: string | null = null;
+    let seenInit: RequestInit | undefined;
+    const fetchFn = async (input: RequestInfo | URL, init?: RequestInit) => {
+      seenUrl = resolveRequestUrl(input);
+      seenInit = init;
+      return new Response(JSON.stringify({ text: "hello" }), {
+        status: 200,
+        headers: { "content-type": "application/json" },
+      });
+    };
+
+    const result = await transcribeOpenAiCompatibleAudio({
+      buffer: Buffer.from("audio-bytes"),
+      fileName: "voice.wav",
+      apiKey: "test-key",
+      timeoutMs: 1234,
+      baseUrl: "https://api.example.com/v1/",
+      model: " ",
+      language: " en ",
+      prompt: " hello ",
+      mime: "audio/wav",
+      headers: { "X-Custom": "1" },
+      fetchFn,
+    });
+
+    expect(result.model).toBe("whisper-1");
+
expect(result.text).toBe("hello"); + expect(seenUrl).toBe("https://api.example.com/v1/audio/transcriptions"); + expect(seenInit?.method).toBe("POST"); + expect(seenInit?.signal).toBeInstanceOf(AbortSignal); + + const headers = new Headers(seenInit?.headers); + expect(headers.get("authorization")).toBe("Bearer test-key"); + expect(headers.get("x-custom")).toBe("1"); + + const form = seenInit?.body as FormData; + expect(form).toBeInstanceOf(FormData); + expect(form.get("model")).toBe("whisper-1"); + expect(form.get("language")).toBe("en"); + expect(form.get("prompt")).toBe("hello"); + const file = form.get("file") as Blob | { type?: string; name?: string } | null; + expect(file).not.toBeNull(); + if (file) { + expect(file.type).toBe("audio/wav"); + if ("name" in file && typeof file.name === "string") { + expect(file.name).toBe("voice.wav"); + } + } + }); +}); diff --git a/src/media-understanding/providers/openai/audio.ts b/src/media-understanding/providers/openai/audio.ts new file mode 100644 index 000000000..65ac5735a --- /dev/null +++ b/src/media-understanding/providers/openai/audio.ts @@ -0,0 +1,61 @@ +import path from "node:path"; + +import type { AudioTranscriptionRequest, AudioTranscriptionResult } from "../../types.js"; +import { fetchWithTimeout, normalizeBaseUrl, readErrorResponse } from "../shared.js"; + +export const DEFAULT_OPENAI_AUDIO_BASE_URL = "https://api.openai.com/v1"; +const DEFAULT_OPENAI_AUDIO_MODEL = "whisper-1"; + +function resolveModel(model?: string): string { + const trimmed = model?.trim(); + return trimmed || DEFAULT_OPENAI_AUDIO_MODEL; +} + +export async function transcribeOpenAiCompatibleAudio( + params: AudioTranscriptionRequest, +): Promise { + const fetchFn = params.fetchFn ?? fetch; + const baseUrl = normalizeBaseUrl(params.baseUrl, DEFAULT_OPENAI_AUDIO_BASE_URL); + const url = `${baseUrl}/audio/transcriptions`; + + const model = resolveModel(params.model); + const form = new FormData(); + const fileName = params.fileName?.trim() || path.basename(params.fileName) || "audio"; + const bytes = new Uint8Array(params.buffer); + const blob = new Blob([bytes], { + type: params.mime ?? "application/octet-stream", + }); + form.append("file", blob, fileName); + form.append("model", model); + if (params.language?.trim()) form.append("language", params.language.trim()); + if (params.prompt?.trim()) form.append("prompt", params.prompt.trim()); + + const headers = new Headers(params.headers); + if (!headers.has("authorization")) { + headers.set("authorization", `Bearer ${params.apiKey}`); + } + + const res = await fetchWithTimeout( + url, + { + method: "POST", + headers, + body: form, + }, + params.timeoutMs, + fetchFn, + ); + + if (!res.ok) { + const detail = await readErrorResponse(res); + const suffix = detail ? 
`: ${detail}` : ""; + throw new Error(`Audio transcription failed (HTTP ${res.status})${suffix}`); + } + + const payload = (await res.json()) as { text?: string }; + const text = payload.text?.trim(); + if (!text) { + throw new Error("Audio transcription response missing text"); + } + return { text, model }; +} diff --git a/src/media-understanding/providers/openai/index.ts b/src/media-understanding/providers/openai/index.ts new file mode 100644 index 000000000..f8af49928 --- /dev/null +++ b/src/media-understanding/providers/openai/index.ts @@ -0,0 +1,7 @@ +import type { MediaUnderstandingProvider } from "../../types.js"; +import { transcribeOpenAiCompatibleAudio } from "./audio.js"; + +export const openaiProvider: MediaUnderstandingProvider = { + id: "openai", + transcribeAudio: transcribeOpenAiCompatibleAudio, +}; diff --git a/src/media-understanding/providers/shared.ts b/src/media-understanding/providers/shared.ts new file mode 100644 index 000000000..9dd36992e --- /dev/null +++ b/src/media-understanding/providers/shared.ts @@ -0,0 +1,33 @@ +const MAX_ERROR_CHARS = 300; + +export function normalizeBaseUrl(baseUrl: string | undefined, fallback: string): string { + const raw = baseUrl?.trim() || fallback; + return raw.replace(/\/+$/, ""); +} + +export async function fetchWithTimeout( + url: string, + init: RequestInit, + timeoutMs: number, + fetchFn: typeof fetch, +): Promise { + const controller = new AbortController(); + const timer = setTimeout(() => controller.abort(), Math.max(1, timeoutMs)); + try { + return await fetchFn(url, { ...init, signal: controller.signal }); + } finally { + clearTimeout(timer); + } +} + +export async function readErrorResponse(res: Response): Promise { + try { + const text = await res.text(); + const collapsed = text.replace(/\s+/g, " ").trim(); + if (!collapsed) return undefined; + if (collapsed.length <= MAX_ERROR_CHARS) return collapsed; + return `${collapsed.slice(0, MAX_ERROR_CHARS)}…`; + } catch { + return undefined; + } +} diff --git a/src/media-understanding/scope.test.ts b/src/media-understanding/scope.test.ts new file mode 100644 index 000000000..9331d8577 --- /dev/null +++ b/src/media-understanding/scope.test.ts @@ -0,0 +1,46 @@ +import { describe, expect, it } from "vitest"; +import { resolveMediaUnderstandingScope } from "./scope.js"; + +describe("resolveMediaUnderstandingScope", () => { + it("defaults to allow when scope is undefined", () => { + expect(resolveMediaUnderstandingScope({})).toBe("allow"); + }); + + it("uses first matching rule", () => { + const decision = resolveMediaUnderstandingScope({ + scope: { + default: "deny", + rules: [ + { action: "allow", match: { channel: "whatsapp" } }, + { action: "deny", match: { channel: "whatsapp", chatType: "direct" } }, + ], + }, + channel: "whatsapp", + chatType: "direct", + sessionKey: "whatsapp:direct:123", + }); + expect(decision).toBe("allow"); + }); + + it("matches keyPrefix when provided", () => { + const decision = resolveMediaUnderstandingScope({ + scope: { + default: "deny", + rules: [{ action: "allow", match: { keyPrefix: "agent:main:" } }], + }, + sessionKey: "agent:main:whatsapp:group:123", + }); + expect(decision).toBe("allow"); + }); + + it("matches keyPrefix case-insensitively", () => { + const decision = resolveMediaUnderstandingScope({ + scope: { + default: "deny", + rules: [{ action: "allow", match: { keyPrefix: "agent:main:" } }], + }, + sessionKey: "AGENT:MAIN:WHATSAPP:GROUP:123", + }); + expect(decision).toBe("allow"); + }); +}); diff --git a/src/media-understanding/scope.ts 
b/src/media-understanding/scope.ts new file mode 100644 index 000000000..55660fafb --- /dev/null +++ b/src/media-understanding/scope.ts @@ -0,0 +1,54 @@ +import type { MediaUnderstandingScopeConfig } from "../config/types.tools.js"; + +export type MediaUnderstandingScopeDecision = "allow" | "deny"; + +function normalizeDecision(value?: string | null): MediaUnderstandingScopeDecision | undefined { + const normalized = value?.trim().toLowerCase(); + if (normalized === "allow") return "allow"; + if (normalized === "deny") return "deny"; + return undefined; +} + +function normalizeMatch(value?: string | null): string | undefined { + const normalized = value?.trim().toLowerCase(); + return normalized || undefined; +} + +export function normalizeMediaUnderstandingChatType(raw?: string | null): string | undefined { + const value = raw?.trim().toLowerCase(); + if (!value) return undefined; + if (value === "dm" || value === "direct_message" || value === "private") return "direct"; + if (value === "groups") return "group"; + if (value === "channel") return "room"; + return value; +} + +export function resolveMediaUnderstandingScope(params: { + scope?: MediaUnderstandingScopeConfig; + sessionKey?: string; + channel?: string; + chatType?: string; +}): MediaUnderstandingScopeDecision { + const scope = params.scope; + if (!scope) return "allow"; + + const channel = normalizeMatch(params.channel); + const chatType = normalizeMatch(params.chatType); + const sessionKey = normalizeMatch(params.sessionKey) ?? ""; + + for (const rule of scope.rules ?? []) { + if (!rule) continue; + const action = normalizeDecision(rule.action) ?? "allow"; + const match = rule.match ?? {}; + const matchChannel = normalizeMatch(match.channel); + const matchChatType = normalizeMatch(match.chatType); + const matchPrefix = normalizeMatch(match.keyPrefix); + + if (matchChannel && matchChannel !== channel) continue; + if (matchChatType && matchChatType !== chatType) continue; + if (matchPrefix && !sessionKey.startsWith(matchPrefix)) continue; + return action; + } + + return normalizeDecision(scope.default) ?? 
"allow"; +} diff --git a/src/media-understanding/types.ts b/src/media-understanding/types.ts new file mode 100644 index 000000000..85c897275 --- /dev/null +++ b/src/media-understanding/types.ts @@ -0,0 +1,62 @@ +export type MediaUnderstandingKind = + | "audio.transcription" + | "video.description" + | "image.description"; + +export type MediaAttachment = { + path?: string; + url?: string; + mime?: string; + index: number; +}; + +export type MediaUnderstandingOutput = { + kind: MediaUnderstandingKind; + attachmentIndex: number; + text: string; + provider: string; + model?: string; +}; + +export type AudioTranscriptionRequest = { + buffer: Buffer; + fileName: string; + mime?: string; + apiKey: string; + baseUrl?: string; + headers?: Record; + model?: string; + language?: string; + prompt?: string; + timeoutMs: number; + fetchFn?: typeof fetch; +}; + +export type AudioTranscriptionResult = { + text: string; + model?: string; +}; + +export type VideoDescriptionRequest = { + buffer: Buffer; + fileName: string; + mime?: string; + apiKey: string; + baseUrl?: string; + headers?: Record; + model?: string; + prompt?: string; + timeoutMs: number; + fetchFn?: typeof fetch; +}; + +export type VideoDescriptionResult = { + text: string; + model?: string; +}; + +export type MediaUnderstandingProvider = { + id: string; + transcribeAudio?: (req: AudioTranscriptionRequest) => Promise; + describeVideo?: (req: VideoDescriptionRequest) => Promise; +}; diff --git a/src/media/fetch.test.ts b/src/media/fetch.test.ts new file mode 100644 index 000000000..46445b1bb --- /dev/null +++ b/src/media/fetch.test.ts @@ -0,0 +1,47 @@ +import { describe, expect, it } from "vitest"; + +import { fetchRemoteMedia } from "./fetch.js"; + +function makeStream(chunks: Uint8Array[]) { + return new ReadableStream({ + start(controller) { + for (const chunk of chunks) { + controller.enqueue(chunk); + } + controller.close(); + }, + }); +} + +describe("fetchRemoteMedia", () => { + it("rejects when content-length exceeds maxBytes", async () => { + const fetchImpl = async () => + new Response(makeStream([new Uint8Array([1, 2, 3, 4, 5])]), { + status: 200, + headers: { "content-length": "5" }, + }); + + await expect( + fetchRemoteMedia({ + url: "https://example.com/file.bin", + fetchImpl, + maxBytes: 4, + }), + ).rejects.toThrow("exceeds maxBytes"); + }); + + it("rejects when streamed payload exceeds maxBytes", async () => { + const fetchImpl = async () => + new Response(makeStream([new Uint8Array([1, 2, 3]), new Uint8Array([4, 5, 6])]), { + status: 200, + }); + + await expect( + fetchRemoteMedia({ + url: "https://example.com/file.bin", + fetchImpl, + maxBytes: 4, + }), + ).rejects.toThrow("exceeds maxBytes"); + }); +}); diff --git a/src/media/fetch.ts b/src/media/fetch.ts index 28312ebe5..6ee706d97 100644 --- a/src/media/fetch.ts +++ b/src/media/fetch.ts @@ -14,6 +14,7 @@ type FetchMediaOptions = { url: string; fetchImpl?: FetchLike; filePathHint?: string; + maxBytes?: number; }; function stripQuotes(value: string): string { @@ -51,7 +52,7 @@ async function readErrorBodySnippet(res: Response, maxChars = 200): Promise { - const { url, fetchImpl, filePathHint } = options; + const { url, fetchImpl, filePathHint, maxBytes } = options; const fetcher: FetchLike | undefined = fetchImpl ?? 
   if (!fetcher) {
     throw new Error("fetch is not available");
@@ -77,7 +78,19 @@ export async function fetchRemoteMedia(options: FetchMediaOptions): Promise {
+  const length = Number(res.headers.get("content-length") ?? "");
+  if (maxBytes && Number.isFinite(length)) {
+    if (length > maxBytes) {
+      throw new Error(
+        `Failed to fetch media from ${url}: content length ${length} exceeds maxBytes ${maxBytes}`,
+      );
+    }
+  }
+
+  const buffer = maxBytes
+    ? await readResponseWithLimit(res, maxBytes)
+    : Buffer.from(await res.arrayBuffer());
   let fileNameFromUrl: string | undefined;
   try {
     const parsed = new URL(url);
@@ -109,3 +122,47 @@ export async function fetchRemoteMedia(options: FetchMediaOptions): Promise {
+
+async function readResponseWithLimit(res: Response, maxBytes: number): Promise<Buffer> {
+  const body = res.body;
+  if (!body || typeof body.getReader !== "function") {
+    const fallback = Buffer.from(await res.arrayBuffer());
+    if (fallback.length > maxBytes) {
+      throw new Error(
+        `Failed to fetch media from ${res.url || "response"}: payload exceeds maxBytes ${maxBytes}`,
+      );
+    }
+    return fallback;
+  }
+
+  const reader = body.getReader();
+  const chunks: Uint8Array[] = [];
+  let total = 0;
+  try {
+    while (true) {
+      const { done, value } = await reader.read();
+      if (done) break;
+      if (value?.length) {
+        total += value.length;
+        if (total > maxBytes) {
+          try {
+            await reader.cancel();
+          } catch {}
+          throw new Error(
+            `Failed to fetch media from ${res.url || "response"}: payload exceeds maxBytes ${maxBytes}`,
+          );
+        }
+        chunks.push(value);
+      }
+    }
+  } finally {
+    try {
+      reader.releaseLock();
+    } catch {}
+  }
+
+  return Buffer.concat(
+    chunks.map((chunk) => Buffer.from(chunk)),
+    total,
+  );
+}