diff --git a/docs/gateway/configuration.md b/docs/gateway/configuration.md index 3e2b55d6c..076c30704 100644 --- a/docs/gateway/configuration.md +++ b/docs/gateway/configuration.md @@ -1770,13 +1770,16 @@ Legacy: `tools.bash` is still accepted as an alias. - `tools.web.fetch.firecrawl.timeoutSeconds` (optional) `tools.media` configures inbound media understanding (image/audio/video): +- `tools.media.models`: shared model list (capability-tagged; used after per-cap lists). +- `tools.media.concurrency`: max concurrent capability runs (default 2). - `tools.media.image` / `tools.media.audio` / `tools.media.video`: - - `enabled`: opt-out switch (default true). + - `enabled`: opt-out switch (default true when models are configured). - `prompt`: optional prompt override (image/video append a `maxChars` hint automatically). - `maxChars`: max output characters (default 500 for image/video; unset for audio). - `maxBytes`: max media size to send (defaults: image 10MB, audio 20MB, video 50MB). - `timeoutSeconds`: request timeout (defaults: image 60s, audio 60s, video 120s). - `language`: optional audio hint. + - `attachments`: attachment policy (`mode`, `maxAttachments`, `prefer`). - `scope`: optional gating (first match wins) with `match.channel`, `match.chatType`, or `match.keyPrefix`. - `models`: ordered list of model entries; failures or oversize media fall back to the next entry. - Each `models[]` entry: @@ -1787,7 +1790,7 @@ Legacy: `tools.bash` is still accepted as an alias. - CLI entry (`type: "cli"`): - `command`: executable to run. - `args`: templated args (supports `{{MediaPath}}`, `{{Prompt}}`, `{{MaxChars}}`, etc). - - `capabilities`: optional list (`image`, `audio`, `video`) to gate a shared entry. + - `capabilities`: optional list (`image`, `audio`, `video`) to gate a shared entry. Defaults when omitted: `openai`/`anthropic`/`minimax` → image, `google` → image+audio+video, `groq` → audio. - `prompt`, `maxChars`, `maxBytes`, `timeoutSeconds`, `language` can be overridden per entry. If no models are configured (or `enabled: false`), understanding is skipped; the model still receives the original attachments. @@ -2900,7 +2903,7 @@ clawdbot dns setup --apply ## Template variables -Template placeholders are expanded in `tools.media.*.models[].args` (and any future templated argument fields). +Template placeholders are expanded in `tools.media.*.models[].args` and `tools.media.models[].args` (and any future templated argument fields). | Variable | Description | |----------|-------------| diff --git a/docs/nodes/audio.md b/docs/nodes/audio.md index b6019b26e..ba68b35e9 100644 --- a/docs/nodes/audio.md +++ b/docs/nodes/audio.md @@ -6,7 +6,7 @@ read_when: # Audio / Voice Notes — 2026-01-17 ## What works -- **Media understanding (audio)**: If `tools.media.audio` is enabled and has `models`, Clawdbot: +- **Media understanding (audio)**: If `tools.media.audio` is enabled (or a shared `tools.media.models` entry supports audio), Clawdbot: 1) Locates the first audio attachment (local path or URL) and downloads it if needed. 2) Enforces `maxBytes` before sending to each model entry. 3) Runs the first eligible model entry in order (provider or CLI). @@ -66,6 +66,7 @@ read_when: - Provider auth follows the standard model auth order (auth profiles, env vars, `models.providers.*.apiKey`). - Default size cap is 20MB (`tools.media.audio.maxBytes`). Oversize audio is skipped for that model and the next entry is tried. - Default `maxChars` for audio is **unset** (full transcript). 
Set `tools.media.audio.maxChars` or per-entry `maxChars` to trim output.
+- Use `tools.media.audio.attachments` to process multiple voice notes (`mode: "all"` + `maxAttachments`).
 - Transcript is available to templates as `{{Transcript}}`.
 - CLI stdout is capped (5MB); keep CLI output concise.
diff --git a/docs/nodes/images.md b/docs/nodes/images.md
index 4d163e535..bb9188738 100644
--- a/docs/nodes/images.md
+++ b/docs/nodes/images.md
@@ -38,10 +38,10 @@ The WhatsApp channel runs via **Baileys Web**. This document captures the curren
 - `{{MediaUrl}}` pseudo-URL for the inbound media.
 - `{{MediaPath}}` local temp path written before running the command.
 - When a per-session Docker sandbox is enabled, inbound media is copied into the sandbox workspace and `MediaPath`/`MediaUrl` are rewritten to a relative path like `media/inbound/`.
-- Media understanding (if configured via `tools.media.*`) runs before templating and can insert `[Image]`, `[Audio]`, and `[Video]` blocks into `Body`.
+- Media understanding (if configured via `tools.media.*` or shared `tools.media.models`) runs before templating and can insert `[Image]`, `[Audio]`, and `[Video]` blocks into `Body`.
   - Audio sets `{{Transcript}}` and uses the transcript for command parsing so slash commands still work.
   - Video and image descriptions preserve any caption text for command parsing.
-- Only the first matching image/audio/video attachment is processed; remaining attachments are left untouched.
+- By default only the first matching image/audio/video attachment is processed; set `tools.media.<capability>.attachments` to process multiple attachments.
 
 ## Limits & Errors
 **Outbound send caps (WhatsApp web send)**
diff --git a/docs/nodes/media-understanding.md b/docs/nodes/media-understanding.md
index fe8e69a52..12f4c2317 100644
--- a/docs/nodes/media-understanding.md
+++ b/docs/nodes/media-understanding.md
@@ -16,7 +16,7 @@ Clawdbot can optionally **summarize inbound media** (image/audio/video) before t
 ## High‑level behavior
 1) Collect inbound attachments (`MediaPaths`, `MediaUrls`, `MediaTypes`).
-2) For each enabled capability (image/audio/video), pick the **first matching attachment**.
+2) For each enabled capability (image/audio/video), select attachments per policy (default: **first**).
 3) Choose the first eligible model entry (size + capability + auth).
 4) If a model fails or the media is too large, **fall back to the next entry**.
 5) On success:
@@ -27,18 +27,23 @@ Clawdbot can optionally **summarize inbound media** (image/audio/video) before t
 If understanding fails or is disabled, **the reply flow continues** with the original body + attachments.
 
 ## Config overview
-Use **per‑capability configs** under `tools.media`. Each capability can define:
-- defaults (`prompt`, `maxChars`, `maxBytes`, `timeoutSeconds`, `language`)
-- **ordered `models` list** (fallback order)
-- `scope` (optional gating by channel/chatType/session key)
+`tools.media` supports **shared models** plus per‑capability overrides:
+- `tools.media.models`: shared model list (use `capabilities` to gate).
+- `tools.media.image` / `tools.media.audio` / `tools.media.video`:
+  - defaults (`prompt`, `maxChars`, `maxBytes`, `timeoutSeconds`, `language`)
+  - optional **per‑capability `models` list** (preferred before shared models)
+  - `attachments` policy (`mode`, `maxAttachments`, `prefer`)
+  - `scope` (optional gating by channel/chatType/session key)
+- `tools.media.concurrency`: max concurrent capability runs (default **2**).
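+
+For example, a minimal sketch of a shared multi-modal entry plus an image-only override (model ids are illustrative, borrowed from the examples below; per‑capability `models` are tried before the shared list):
+
+```json5
+{
+  tools: {
+    media: {
+      concurrency: 2,
+      models: [
+        { provider: "google", model: "gemini-3-flash-preview", capabilities: ["image", "audio", "video"] }
+      ],
+      image: {
+        models: [{ provider: "openai", model: "gpt-5.2" }]
+      }
+    }
+  }
+}
+```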
```json5 { tools: { media: { - image: { /* config */ }, - audio: { /* config */ }, - video: { /* config */ } + models: [ /* shared list */ ], + image: { /* optional overrides */ }, + audio: { /* optional overrides */ }, + video: { /* optional overrides */ } } } } @@ -95,12 +100,13 @@ Rules: - `prompt` defaults to simple “Describe the {media}.” plus the `maxChars` guidance (image/video only). ## Capabilities (optional) -If you set `capabilities`, the entry only runs for those media types. Suggested -defaults when you opt in: -- `openai`, `anthropic`: **image** +If you set `capabilities`, the entry only runs for those media types. For shared +lists, Clawdbot can infer defaults: +- `openai`, `anthropic`, `minimax`: **image** - `google` (Gemini API): **image + audio + video** -- CLI entries: declare the exact capabilities you support. +- `groq`: **audio** +For CLI entries, **set `capabilities` explicitly** to avoid surprising matches. If you omit `capabilities`, the entry is eligible for the list it appears in. ## Provider support matrix (Clawdbot integrations) @@ -123,9 +129,49 @@ If you omit `capabilities`, the entry is eligible for the list it appears in. - `google/gemini-3-flash-preview` (fast), `google/gemini-3-pro-preview` (richer). - CLI fallback: `gemini` CLI (supports `read_file` on video/audio). +## Attachment policy +Per‑capability `attachments` controls which attachments are processed: +- `mode`: `first` (default) or `all` +- `maxAttachments`: cap the number processed (default **1**) +- `prefer`: `first`, `last`, `path`, `url` + +When `mode: "all"`, outputs are labeled `[Image 1/2]`, `[Audio 2/2]`, etc. + ## Config examples -### 1) Audio + Video only (image off) +### 1) Shared models list + overrides +```json5 +{ + tools: { + media: { + models: [ + { provider: "openai", model: "gpt-5.2", capabilities: ["image"] }, + { provider: "google", model: "gemini-3-flash-preview", capabilities: ["image", "audio", "video"] }, + { + type: "cli", + command: "gemini", + args: [ + "-m", + "gemini-3-flash", + "--allowed-tools", + "read_file", + "Read the media at {{MediaPath}} and describe it in <= {{MaxChars}} characters." + ], + capabilities: ["image", "video"] + } + ], + audio: { + attachments: { mode: "all", maxAttachments: 2 } + }, + video: { + maxChars: 500 + } + } + } +} +``` + +### 2) Audio + Video only (image off) ```json5 { tools: { @@ -164,7 +210,7 @@ If you omit `capabilities`, the entry is eligible for the list it appears in. } ``` -### 2) Optional image understanding +### 3) Optional image understanding ```json5 { tools: { @@ -194,7 +240,7 @@ If you omit `capabilities`, the entry is eligible for the list it appears in. 
 }
 ```
 
-### 3) Multi‑modal single entry (explicit capabilities)
+### 4) Multi‑modal single entry (explicit capabilities)
 ```json5
 {
   tools: {
diff --git a/src/config/schema.ts b/src/config/schema.ts
index a7a9b60a6..60bb21961 100644
--- a/src/config/schema.ts
+++ b/src/config/schema.ts
@@ -107,14 +107,18 @@ const FIELD_LABELS: Record<string, string> = {
   "tools.media.image.maxChars": "Image Understanding Max Chars",
   "tools.media.image.prompt": "Image Understanding Prompt",
   "tools.media.image.timeoutSeconds": "Image Understanding Timeout (sec)",
+  "tools.media.image.attachments": "Image Understanding Attachment Policy",
   "tools.media.image.models": "Image Understanding Models",
   "tools.media.image.scope": "Image Understanding Scope",
+  "tools.media.models": "Media Understanding Shared Models",
+  "tools.media.concurrency": "Media Understanding Concurrency",
   "tools.media.audio.enabled": "Enable Audio Understanding",
   "tools.media.audio.maxBytes": "Audio Understanding Max Bytes",
   "tools.media.audio.maxChars": "Audio Understanding Max Chars",
   "tools.media.audio.prompt": "Audio Understanding Prompt",
   "tools.media.audio.timeoutSeconds": "Audio Understanding Timeout (sec)",
   "tools.media.audio.language": "Audio Understanding Language",
+  "tools.media.audio.attachments": "Audio Understanding Attachment Policy",
   "tools.media.audio.models": "Audio Understanding Models",
   "tools.media.audio.scope": "Audio Understanding Scope",
   "tools.media.video.enabled": "Enable Video Understanding",
@@ -122,6 +126,7 @@ const FIELD_LABELS: Record<string, string> = {
   "tools.media.video.maxChars": "Video Understanding Max Chars",
   "tools.media.video.prompt": "Video Understanding Prompt",
   "tools.media.video.timeoutSeconds": "Video Understanding Timeout (sec)",
+  "tools.media.video.attachments": "Video Understanding Attachment Policy",
   "tools.media.video.models": "Video Understanding Models",
   "tools.media.video.scope": "Video Understanding Scope",
   "tools.profile": "Tool Profile",
diff --git a/src/config/types.tools.ts b/src/config/types.tools.ts
index f749c1814..f3e9736e7 100644
--- a/src/config/types.tools.ts
+++ b/src/config/types.tools.ts
@@ -18,6 +18,15 @@ export type MediaUnderstandingScopeConfig = {
 
 export type MediaUnderstandingCapability = "image" | "audio" | "video";
 
+export type MediaUnderstandingAttachmentsConfig = {
+  /** Select the first matching attachment or process multiple. */
+  mode?: "first" | "all";
+  /** Max number of attachments to process (default: 1). */
+  maxAttachments?: number;
+  /** Attachment ordering preference. */
+  prefer?: "first" | "last" | "path" | "url";
+};
+
 export type MediaUnderstandingModelConfig = {
   /** provider API id (e.g. openai, google). */
   provider?: string;
@@ -62,11 +71,17 @@ export type MediaUnderstandingConfig = {
   timeoutSeconds?: number;
   /** Default language hint (audio). */
   language?: string;
+  /** Attachment selection policy. */
+  attachments?: MediaUnderstandingAttachmentsConfig;
   /** Ordered model list (fallbacks in order). */
   models?: MediaUnderstandingModelConfig[];
 };
 
 export type MediaToolsConfig = {
+  /** Shared model list applied across image/audio/video. */
+  models?: MediaUnderstandingModelConfig[];
+  /** Max concurrent media understanding runs.
*/ + concurrency?: number; image?: MediaUnderstandingConfig; audio?: MediaUnderstandingConfig; video?: MediaUnderstandingConfig; diff --git a/src/config/zod-schema.core.ts b/src/config/zod-schema.core.ts index 1af60b7a5..c8479ef37 100644 --- a/src/config/zod-schema.core.ts +++ b/src/config/zod-schema.core.ts @@ -271,6 +271,14 @@ export const MediaUnderstandingCapabilitiesSchema = z .array(z.union([z.literal("image"), z.literal("audio"), z.literal("video")])) .optional(); +export const MediaUnderstandingAttachmentsSchema = z + .object({ + mode: z.union([z.literal("first"), z.literal("all")]).optional(), + maxAttachments: z.number().int().positive().optional(), + prefer: z.union([z.literal("first"), z.literal("last"), z.literal("path"), z.literal("url")]).optional(), + }) + .optional(); + export const MediaUnderstandingModelSchema = z .object({ provider: z.string().optional(), @@ -298,12 +306,15 @@ export const ToolsMediaUnderstandingSchema = z prompt: z.string().optional(), timeoutSeconds: z.number().int().positive().optional(), language: z.string().optional(), + attachments: MediaUnderstandingAttachmentsSchema, models: z.array(MediaUnderstandingModelSchema).optional(), }) .optional(); export const ToolsMediaSchema = z .object({ + models: z.array(MediaUnderstandingModelSchema).optional(), + concurrency: z.number().int().positive().optional(), image: ToolsMediaUnderstandingSchema.optional(), audio: ToolsMediaUnderstandingSchema.optional(), video: ToolsMediaUnderstandingSchema.optional(), diff --git a/src/media-understanding/apply.test.ts b/src/media-understanding/apply.test.ts index f52685e6f..b753cac31 100644 --- a/src/media-understanding/apply.test.ts +++ b/src/media-understanding/apply.test.ts @@ -255,4 +255,90 @@ describe("applyMediaUnderstanding", () => { expect(ctx.CommandBody).toBe("show Dom"); expect(ctx.RawBody).toBe("show Dom"); }); + + it("uses shared media models list when capability config is missing", async () => { + const { applyMediaUnderstanding } = await loadApply(); + const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-media-")); + const imagePath = path.join(dir, "shared.jpg"); + await fs.writeFile(imagePath, "image-bytes"); + + const ctx: MsgContext = { + Body: "", + MediaPath: imagePath, + MediaType: "image/jpeg", + }; + const cfg: ClawdbotConfig = { + tools: { + media: { + models: [ + { + type: "cli", + command: "gemini", + args: ["--allowed-tools", "read_file", "{{MediaPath}}"], + capabilities: ["image"], + }, + ], + }, + }, + }; + + const execModule = await import("../process/exec.js"); + vi.mocked(execModule.runExec).mockResolvedValue({ + stdout: "shared description\n", + stderr: "", + }); + + const result = await applyMediaUnderstanding({ + ctx, + cfg, + }); + + expect(result.appliedImage).toBe(true); + expect(ctx.Body).toBe("[Image]\nDescription:\nshared description"); + }); + + it("handles multiple audio attachments when attachment mode is all", async () => { + const { applyMediaUnderstanding } = await loadApply(); + const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-media-")); + const audioPathA = path.join(dir, "note-a.ogg"); + const audioPathB = path.join(dir, "note-b.ogg"); + await fs.writeFile(audioPathA, "hello"); + await fs.writeFile(audioPathB, "world"); + + const ctx: MsgContext = { + Body: "", + MediaPaths: [audioPathA, audioPathB], + MediaTypes: ["audio/ogg", "audio/ogg"], + }; + const cfg: ClawdbotConfig = { + tools: { + media: { + audio: { + enabled: true, + attachments: { mode: "all", maxAttachments: 2 }, + models: [{ provider: 
"groq" }], + }, + }, + }, + }; + + const result = await applyMediaUnderstanding({ + ctx, + cfg, + providers: { + groq: { + id: "groq", + transcribeAudio: async (req) => ({ text: req.fileName }), + }, + }, + }); + + expect(result.appliedAudio).toBe(true); + expect(ctx.Transcript).toBe("Audio 1:\nnote-a.ogg\n\nAudio 2:\nnote-b.ogg"); + expect(ctx.Body).toBe( + ["[Audio 1/2]\nTranscript:\nnote-a.ogg", "[Audio 2/2]\nTranscript:\nnote-b.ogg"].join( + "\n\n", + ), + ); + }); }); diff --git a/src/media-understanding/apply.ts b/src/media-understanding/apply.ts index 558b76f57..1b05348a6 100644 --- a/src/media-understanding/apply.ts +++ b/src/media-understanding/apply.ts @@ -1,71 +1,53 @@ -import crypto from "node:crypto"; -import fs from "node:fs/promises"; -import os from "node:os"; -import path from "node:path"; -import { fileURLToPath } from "node:url"; - -import type { Api, AssistantMessage, Context, Model } from "@mariozechner/pi-ai"; -import { complete } from "@mariozechner/pi-ai"; -import { discoverAuthStorage, discoverModels } from "@mariozechner/pi-coding-agent"; - import type { ClawdbotConfig } from "../config/config.js"; import type { MsgContext } from "../auto-reply/templating.js"; import { applyTemplate } from "../auto-reply/templating.js"; -import { getApiKeyForModel, resolveApiKeyForProvider } from "../agents/model-auth.js"; -import { ensureClawdbotModelsJson } from "../agents/models-config.js"; -import { minimaxUnderstandImage } from "../agents/minimax-vlm.js"; +import { resolveApiKeyForProvider } from "../agents/model-auth.js"; import { logVerbose, shouldLogVerbose } from "../globals.js"; -import { fetchRemoteMedia } from "../media/fetch.js"; -import { detectMime, getFileExtension, isAudioFileName } from "../media/mime.js"; import { runExec } from "../process/exec.js"; import type { MediaUnderstandingConfig, MediaUnderstandingModelConfig, - MediaUnderstandingScopeConfig, } from "../config/types.tools.js"; -import { extractMediaUserText, formatMediaUnderstandingBody } from "./format.js"; +import { + MediaAttachmentCache, + normalizeAttachments, + selectAttachments, +} from "./attachments.js"; +import { + CLI_OUTPUT_MAX_BUFFER, + DEFAULT_AUDIO_MODELS, + DEFAULT_TIMEOUT_SECONDS, +} from "./defaults.js"; +import { isMediaUnderstandingSkipError, MediaUnderstandingSkipError } from "./errors.js"; +import { + extractMediaUserText, + formatAudioTranscripts, + formatMediaUnderstandingBody, +} from "./format.js"; import { buildMediaUnderstandingRegistry, getMediaUnderstandingProvider, normalizeMediaProviderId, } from "./providers/index.js"; -import { fetchWithTimeout } from "./providers/shared.js"; -import { normalizeMediaUnderstandingChatType, resolveMediaUnderstandingScope } from "./scope.js"; +import { describeImageWithModel } from "./providers/image.js"; +import { + resolveCapabilityConfig, + resolveCapabilityEnabled, + resolveConcurrency, + resolveMaxBytes, + resolveMaxChars, + resolveModelEntries, + resolvePrompt, + resolveScopeDecision, + resolveTimeoutMs, +} from "./resolve.js"; import type { - MediaAttachment, + MediaUnderstandingCapability, MediaUnderstandingOutput, MediaUnderstandingProvider, } from "./types.js"; -import { coerceImageAssistantText } from "../agents/tools/image-tool.helpers.js"; - -const MB = 1024 * 1024; -const DEFAULT_MAX_CHARS = 500; -const DEFAULT_MAX_CHARS_BY_CAPABILITY: Record = { - image: DEFAULT_MAX_CHARS, - audio: undefined, - video: DEFAULT_MAX_CHARS, -}; -const DEFAULT_MAX_BYTES: Record = { - image: 10 * MB, - audio: 20 * MB, - video: 50 * MB, -}; -const 
DEFAULT_TIMEOUT_SECONDS: Record = { - image: 60, - audio: 60, - video: 120, -}; -const DEFAULT_PROMPT: Record = { - image: "Describe the image.", - audio: "Transcribe the audio.", - video: "Describe the video.", -}; -const DEFAULT_VIDEO_MAX_BASE64_BYTES = 70 * MB; -const DEFAULT_AUDIO_MODELS: Record = { - groq: "whisper-large-v3-turbo", - openai: "whisper-1", -}; -const CLI_OUTPUT_MAX_BUFFER = 5 * MB; +import { runWithConcurrency } from "./concurrency.js"; +import { estimateBase64Size, resolveVideoMaxBase64Bytes } from "./video.js"; export type ApplyMediaUnderstandingResult = { outputs: MediaUnderstandingOutput[]; @@ -74,147 +56,7 @@ export type ApplyMediaUnderstandingResult = { appliedVideo: boolean; }; -type Capability = "image" | "audio" | "video"; - -type MediaBufferResult = { - buffer: Buffer; - mime?: string; - fileName: string; -}; - -type MediaPathResult = { - path: string; - cleanup?: () => Promise | void; -}; - -function normalizeAttachmentPath(raw?: string | null): string | undefined { - const value = raw?.trim(); - if (!value) return undefined; - if (value.startsWith("file://")) { - try { - return fileURLToPath(value); - } catch { - return undefined; - } - } - return value; -} - -function normalizeAttachments(ctx: MsgContext): MediaAttachment[] { - const pathsFromArray = Array.isArray(ctx.MediaPaths) ? ctx.MediaPaths : undefined; - const urlsFromArray = Array.isArray(ctx.MediaUrls) ? ctx.MediaUrls : undefined; - const typesFromArray = Array.isArray(ctx.MediaTypes) ? ctx.MediaTypes : undefined; - const resolveMime = (count: number, index: number) => { - const typeHint = typesFromArray?.[index]; - const trimmed = typeof typeHint === "string" ? typeHint.trim() : ""; - if (trimmed) return trimmed; - return count === 1 ? ctx.MediaType : undefined; - }; - - if (pathsFromArray && pathsFromArray.length > 0) { - const count = pathsFromArray.length; - const urls = urlsFromArray && urlsFromArray.length > 0 ? urlsFromArray : undefined; - return pathsFromArray - .map((value, index) => ({ - path: value?.trim() || undefined, - url: urls?.[index] ?? ctx.MediaUrl, - mime: resolveMime(count, index), - index, - })) - .filter((entry) => Boolean(entry.path?.trim() || entry.url?.trim())); - } - - if (urlsFromArray && urlsFromArray.length > 0) { - const count = urlsFromArray.length; - return urlsFromArray - .map((value, index) => ({ - path: undefined, - url: value?.trim() || undefined, - mime: resolveMime(count, index), - index, - })) - .filter((entry) => Boolean(entry.url?.trim())); - } - - const pathValue = ctx.MediaPath?.trim(); - const url = ctx.MediaUrl?.trim(); - if (!pathValue && !url) return []; - return [ - { - path: pathValue || undefined, - url: url || undefined, - mime: ctx.MediaType, - index: 0, - }, - ]; -} - -function isVideoAttachment(attachment: MediaAttachment): boolean { - if (attachment.mime?.startsWith("video/")) return true; - const ext = getFileExtension(attachment.path ?? attachment.url); - if (!ext) return false; - return [".mp4", ".mov", ".mkv", ".webm", ".avi", ".m4v"].includes(ext); -} - -function isAudioAttachment(attachment: MediaAttachment): boolean { - if (attachment.mime?.startsWith("audio/")) return true; - return isAudioFileName(attachment.path ?? attachment.url); -} - -function isImageAttachment(attachment: MediaAttachment): boolean { - if (attachment.mime?.startsWith("image/")) return true; - const ext = getFileExtension(attachment.path ?? 
attachment.url); - if (!ext) return false; - return [".png", ".jpg", ".jpeg", ".webp", ".gif", ".bmp", ".tiff", ".tif"].includes(ext); -} - -function estimateBase64Size(bytes: number): number { - return Math.ceil(bytes / 3) * 4; -} - -function resolveVideoMaxBase64Bytes(maxBytes: number): number { - const expanded = Math.floor(maxBytes * (4 / 3)); - return Math.min(expanded, DEFAULT_VIDEO_MAX_BASE64_BYTES); -} - -function resolveTimeoutMs(seconds: number | undefined, fallbackSeconds: number): number { - const value = typeof seconds === "number" && Number.isFinite(seconds) ? seconds : fallbackSeconds; - return Math.max(1000, Math.floor(value * 1000)); -} - -function resolvePrompt(capability: Capability, prompt?: string, maxChars?: number): string { - const base = prompt?.trim() || DEFAULT_PROMPT[capability]; - if (!maxChars || capability === "audio") return base; - return `${base} Respond in at most ${maxChars} characters.`; -} - -function resolveRequestUrl(input: RequestInfo | URL): string { - if (typeof input === "string") return input; - if (input instanceof URL) return input.toString(); - return input.url; -} - -function normalizeErrorMessage(err: unknown): string { - if (!err) return ""; - if (typeof err === "string") return err; - if (err instanceof Error) return err.message; - try { - return JSON.stringify(err); - } catch { - return ""; - } -} - -function resolveMaxChars(params: { - capability: Capability; - entry: MediaUnderstandingModelConfig; - cfg: ClawdbotConfig; -}): number | undefined { - const { capability, entry, cfg } = params; - const configured = entry.maxChars ?? cfg.tools?.media?.[capability]?.maxChars; - if (typeof configured === "number") return configured; - return DEFAULT_MAX_CHARS_BY_CAPABILITY[capability]; -} +const CAPABILITY_ORDER: MediaUnderstandingCapability[] = ["image", "audio", "video"]; function trimOutput(text: string, maxChars?: number): string { const trimmed = text.trim(); @@ -222,272 +64,32 @@ function trimOutput(text: string, maxChars?: number): string { return trimmed.slice(0, maxChars).trim(); } -function resolveConfigValue(primary: T | undefined, fallback: T): T { - return primary === undefined ? fallback : primary; -} - -function resolveCapabilityConfig( - cfg: ClawdbotConfig, - capability: Capability, -): MediaUnderstandingConfig | undefined { - return cfg.tools?.media?.[capability]; -} - -function resolveScopeDecision(params: { - scope?: MediaUnderstandingScopeConfig; - ctx: MsgContext; -}): "allow" | "deny" { - return resolveMediaUnderstandingScope({ - scope: params.scope, - sessionKey: params.ctx.SessionKey, - channel: params.ctx.Surface ?? params.ctx.Provider, - chatType: normalizeMediaUnderstandingChatType(params.ctx.ChatType), - }); -} - -function resolveModelEntries( - cfg: MediaUnderstandingConfig | undefined, - capability: Capability, -): MediaUnderstandingModelConfig[] { - const models = cfg?.models ?? 
[]; - if (models.length === 0) return []; - return models.filter((entry) => { - const caps = entry.capabilities; - if (!caps || caps.length === 0) return true; - return caps.includes(capability); - }); -} - -function isMaxBytesError(err: unknown): boolean { - const message = normalizeErrorMessage(err); - if (!message) return false; - return message.includes("exceeds maxBytes") || message.includes("payload exceeds maxBytes"); -} - -async function loadAttachmentBuffer(params: { - attachment: MediaAttachment; - maxBytes: number; - timeoutMs: number; -}): Promise { - const { attachment, maxBytes, timeoutMs } = params; - const rawPath = normalizeAttachmentPath(attachment.path); - if (rawPath) { - const resolved = path.isAbsolute(rawPath) ? rawPath : path.resolve(rawPath); - try { - const stat = await fs.stat(resolved); - if (!stat.isFile()) return undefined; - if (stat.size > maxBytes) { - if (shouldLogVerbose()) { - logVerbose( - `Skipping media attachment ${attachment.index + 1}: ${stat.size} bytes exceeds ${maxBytes}`, - ); - } - return undefined; - } - const buffer = await fs.readFile(resolved); - const mime = - attachment.mime ?? - (await detectMime({ - buffer, - filePath: resolved, - })); - const fileName = path.basename(resolved) || `media-${attachment.index + 1}`; - return { buffer, mime, fileName }; - } catch (err) { - if (shouldLogVerbose()) { - logVerbose(`Failed to read attachment ${attachment.index + 1}: ${String(err)}`); - } - } - } - - const url = attachment.url?.trim(); - if (!url) return undefined; - - try { - const fetchImpl = (input: RequestInfo | URL, init?: RequestInit) => - fetchWithTimeout(resolveRequestUrl(input), init ?? {}, timeoutMs, fetch); - const fetched = await fetchRemoteMedia({ url, fetchImpl, maxBytes }); - if (fetched.buffer.length > maxBytes) { - if (shouldLogVerbose()) { - logVerbose( - `Skipping media attachment ${attachment.index + 1}: ${fetched.buffer.length} bytes exceeds ${maxBytes}`, - ); - } - return undefined; - } - const mime = - attachment.mime ?? - fetched.contentType ?? - (await detectMime({ - buffer: fetched.buffer, - filePath: fetched.fileName ?? url, - })); - const fileName = fetched.fileName ?? `media-${attachment.index + 1}`; - return { buffer: fetched.buffer, mime, fileName }; - } catch (err) { - if (shouldLogVerbose()) { - logVerbose(`Failed to fetch attachment ${attachment.index + 1}: ${String(err)}`); - } - } - - return undefined; -} - -async function resolveAttachmentPath(params: { - attachment: MediaAttachment; - maxBytes?: number; - timeoutMs: number; -}): Promise { - const { attachment, maxBytes, timeoutMs } = params; - const rawPath = normalizeAttachmentPath(attachment.path); - if (rawPath) { - const resolved = path.isAbsolute(rawPath) ? rawPath : path.resolve(rawPath); - try { - const stat = await fs.stat(resolved); - if (!stat.isFile()) return undefined; - if (maxBytes && stat.size > maxBytes) { - if (shouldLogVerbose()) { - logVerbose( - `Skipping media attachment ${attachment.index + 1}: ${stat.size} bytes exceeds ${maxBytes}`, - ); - } - return undefined; - } - return { path: resolved }; - } catch (err) { - if (shouldLogVerbose()) { - logVerbose(`Failed to read attachment ${attachment.index + 1}: ${String(err)}`); - } - } - } - - const url = attachment.url?.trim(); - if (!url) return undefined; - - try { - const fetchImpl = (input: RequestInfo | URL, init?: RequestInit) => - fetchWithTimeout(resolveRequestUrl(input), init ?? 
{}, timeoutMs, fetch);
-    const fetched = await fetchRemoteMedia({ url, fetchImpl, maxBytes });
-    const buffer = fetched.buffer;
-    if (maxBytes && buffer.length > maxBytes) {
-      if (shouldLogVerbose()) {
-        logVerbose(
-          `Skipping media attachment ${attachment.index + 1}: ${buffer.length} bytes exceeds ${maxBytes}`,
-        );
-      }
-      return undefined;
-    }
-    const extension = fetched.fileName ? path.extname(fetched.fileName) : "";
-    const tmpPath = path.join(
-      os.tmpdir(),
-      `clawdbot-media-${crypto.randomUUID()}${extension || ""}`,
-    );
-    await fs.writeFile(tmpPath, buffer);
-    return {
-      path: tmpPath,
-      cleanup: async () => {
-        await fs.unlink(tmpPath).catch(() => {});
-      },
-    };
-  } catch (err) {
-    if (shouldLogVerbose()) {
-      logVerbose(`Failed to fetch attachment ${attachment.index + 1}: ${String(err)}`);
-    }
-  }
-
-  return undefined;
-}
-
-async function describeImageWithModel(params: {
-  cfg: ClawdbotConfig;
-  agentDir: string;
-  provider: string;
-  model: string;
-  prompt: string;
-  maxChars?: number;
-  buffer: Buffer;
-  mimeType: string;
-  profile?: string;
-  preferredProfile?: string;
-}): Promise<{ text: string; model: string }> {
-  await ensureClawdbotModelsJson(params.cfg, params.agentDir);
-  const authStorage = discoverAuthStorage(params.agentDir);
-  const modelRegistry = discoverModels(authStorage, params.agentDir);
-  const model = modelRegistry.find(params.provider, params.model) as Model | null;
-  if (!model) {
-    throw new Error(`Unknown model: ${params.provider}/${params.model}`);
-  }
-  if (!model.input?.includes("image")) {
-    throw new Error(`Model does not support images: ${params.provider}/${params.model}`);
-  }
-  const apiKeyInfo = await getApiKeyForModel({
-    model,
-    cfg: params.cfg,
-    agentDir: params.agentDir,
-    profileId: params.profile,
-    preferredProfile: params.preferredProfile,
-  });
-  authStorage.setRuntimeApiKey(model.provider, apiKeyInfo.apiKey);
-
-  const base64 = params.buffer.toString("base64");
-  if (model.provider === "minimax") {
-    const text = await minimaxUnderstandImage({
-      apiKey: apiKeyInfo.apiKey,
-      prompt: params.prompt,
-      imageDataUrl: `data:${params.mimeType};base64,${base64}`,
-      modelBaseUrl: model.baseUrl,
-    });
-    return { text, model: model.id };
-  }
-
-  const context: Context = {
-    messages: [
-      {
-        role: "user",
-        content: [
-          { type: "text", text: params.prompt },
-          { type: "image", data: base64, mimeType: params.mimeType },
-        ],
-        timestamp: Date.now(),
-      },
-    ],
-  };
-  const message = (await complete(model, context, {
-    apiKey: apiKeyInfo.apiKey,
-    maxTokens: 512,
-  })) as AssistantMessage;
-  const text = coerceImageAssistantText({
-    message,
-    provider: model.provider,
-    model: model.id,
-  });
-  return { text, model: model.id };
-}
-
 async function runProviderEntry(params: {
-  capability: Capability;
+  capability: MediaUnderstandingCapability;
   entry: MediaUnderstandingModelConfig;
   cfg: ClawdbotConfig;
   ctx: MsgContext;
-  attachment: MediaAttachment;
+  attachmentIndex: number;
+  cache: MediaAttachmentCache;
   agentDir?: string;
   providerRegistry: Map<string, MediaUnderstandingProvider>;
+  config?: MediaUnderstandingConfig;
 }): Promise<MediaUnderstandingOutput | null> {
-  const { entry, capability, cfg, attachment } = params;
+  const { entry, capability, cfg } = params;
   const providerIdRaw = entry.provider?.trim();
   if (!providerIdRaw) {
     throw new Error(`Provider entry missing provider for ${capability}`);
   }
   const providerId = normalizeMediaProviderId(providerIdRaw);
-  const maxBytes = entry.maxBytes ??
resolveConfigValue(cfg.tools?.media?.[capability]?.maxBytes, DEFAULT_MAX_BYTES[capability]); - const maxChars = resolveMaxChars({ capability, entry, cfg }); + const maxBytes = resolveMaxBytes({ capability, entry, cfg, config: params.config }); + const maxChars = resolveMaxChars({ capability, entry, cfg, config: params.config }); const timeoutMs = resolveTimeoutMs( - entry.timeoutSeconds ?? cfg.tools?.media?.[capability]?.timeoutSeconds, + entry.timeoutSeconds ?? params.config?.timeoutSeconds ?? cfg.tools?.media?.[capability]?.timeoutSeconds, DEFAULT_TIMEOUT_SECONDS[capability], ); const prompt = resolvePrompt( capability, - entry.prompt ?? cfg.tools?.media?.[capability]?.prompt, + entry.prompt ?? params.config?.prompt ?? cfg.tools?.media?.[capability]?.prompt, maxChars, ); @@ -499,27 +101,45 @@ async function runProviderEntry(params: { if (!modelId) { throw new Error("Image understanding requires model id"); } - const media = await loadAttachmentBuffer({ attachment, maxBytes, timeoutMs }); - if (!media) return null; - const mimeType = media.mime ?? "image/jpeg"; - const result = await describeImageWithModel({ - cfg, - agentDir: params.agentDir, - provider: providerId, - model: modelId, - prompt, - maxChars, - buffer: media.buffer, - mimeType, - profile: entry.profile, - preferredProfile: entry.preferredProfile, + const media = await params.cache.getBuffer({ + attachmentIndex: params.attachmentIndex, + maxBytes, + timeoutMs, }); + const provider = getMediaUnderstandingProvider(providerId, params.providerRegistry); + const result = provider?.describeImage + ? await provider.describeImage({ + buffer: media.buffer, + fileName: media.fileName, + mime: media.mime, + model: modelId, + provider: providerId, + prompt, + timeoutMs, + profile: entry.profile, + preferredProfile: entry.preferredProfile, + agentDir: params.agentDir, + cfg: params.cfg, + }) + : await describeImageWithModel({ + buffer: media.buffer, + fileName: media.fileName, + mime: media.mime, + model: modelId, + provider: providerId, + prompt, + timeoutMs, + profile: entry.profile, + preferredProfile: entry.preferredProfile, + agentDir: params.agentDir, + cfg: params.cfg, + }); return { kind: "image.description", - attachmentIndex: attachment.index, + attachmentIndex: params.attachmentIndex, text: trimOutput(result.text, maxChars), provider: providerId, - model: result.model, + model: result.model ?? modelId, }; } @@ -532,8 +152,11 @@ async function runProviderEntry(params: { if (!provider.transcribeAudio) { throw new Error(`Audio transcription provider "${providerId}" not available.`); } - const media = await loadAttachmentBuffer({ attachment, maxBytes, timeoutMs }); - if (!media) return null; + const media = await params.cache.getBuffer({ + attachmentIndex: params.attachmentIndex, + maxBytes, + timeoutMs, + }); const key = await resolveApiKeyForProvider({ provider: providerId, cfg, @@ -551,96 +174,94 @@ async function runProviderEntry(params: { baseUrl: providerConfig?.baseUrl, headers: providerConfig?.headers, model, - language: entry.language ?? cfg.tools?.media?.audio?.language, + language: entry.language ?? params.config?.language ?? cfg.tools?.media?.audio?.language, prompt, timeoutMs, }); return { kind: "audio.transcription", - attachmentIndex: attachment.index, + attachmentIndex: params.attachmentIndex, text: trimOutput(result.text, maxChars), provider: providerId, model: result.model ?? 
model,
   };
 }
 
-  if (capability === "video") {
-    if (!provider.describeVideo) {
-      throw new Error(`Video understanding provider "${providerId}" not available.`);
-    }
-    const media = await loadAttachmentBuffer({ attachment, maxBytes, timeoutMs });
-    if (!media) return null;
-    const estimatedBase64Bytes = estimateBase64Size(media.buffer.length);
-    const maxBase64Bytes = resolveVideoMaxBase64Bytes(maxBytes);
-    if (estimatedBase64Bytes > maxBase64Bytes) {
-      if (shouldLogVerbose()) {
-        logVerbose(
-          `Skipping video attachment ${attachment.index + 1}: base64 payload ${estimatedBase64Bytes} exceeds ${maxBase64Bytes}`,
-        );
-      }
-      return null;
-    }
-    const key = await resolveApiKeyForProvider({
-      provider: providerId,
-      cfg,
-      profileId: entry.profile,
-      preferredProfile: entry.preferredProfile,
-      agentDir: params.agentDir,
-    });
-    const providerConfig = cfg.models?.providers?.[providerId];
-    const result = await provider.describeVideo({
-      buffer: media.buffer,
-      fileName: media.fileName,
-      mime: media.mime,
-      apiKey: key.apiKey,
-      baseUrl: providerConfig?.baseUrl,
-      headers: providerConfig?.headers,
-      model: entry.model,
-      prompt,
-      timeoutMs,
-    });
-    return {
-      kind: "video.description",
-      attachmentIndex: attachment.index,
-      text: trimOutput(result.text, maxChars),
-      provider: providerId,
-      model: result.model ?? entry.model,
-    };
+  if (!provider.describeVideo) {
+    throw new Error(`Video understanding provider "${providerId}" not available.`);
   }
-
-  return null;
+  const media = await params.cache.getBuffer({
+    attachmentIndex: params.attachmentIndex,
+    maxBytes,
+    timeoutMs,
+  });
+  const estimatedBase64Bytes = estimateBase64Size(media.size);
+  const maxBase64Bytes = resolveVideoMaxBase64Bytes(maxBytes);
+  if (estimatedBase64Bytes > maxBase64Bytes) {
+    throw new MediaUnderstandingSkipError(
+      "maxBytes",
+      `Video attachment ${params.attachmentIndex + 1} base64 payload ${estimatedBase64Bytes} exceeds ${maxBase64Bytes}`,
+    );
+  }
+  const key = await resolveApiKeyForProvider({
+    provider: providerId,
+    cfg,
+    profileId: entry.profile,
+    preferredProfile: entry.preferredProfile,
+    agentDir: params.agentDir,
+  });
+  const providerConfig = cfg.models?.providers?.[providerId];
+  const result = await provider.describeVideo({
+    buffer: media.buffer,
+    fileName: media.fileName,
+    mime: media.mime,
+    apiKey: key.apiKey,
+    baseUrl: providerConfig?.baseUrl,
+    headers: providerConfig?.headers,
+    model: entry.model,
+    prompt,
+    timeoutMs,
+  });
+  return {
+    kind: "video.description",
+    attachmentIndex: params.attachmentIndex,
+    text: trimOutput(result.text, maxChars),
+    provider: providerId,
+    model: result.model ?? entry.model,
+  };
 }
 
 async function runCliEntry(params: {
-  capability: Capability;
+  capability: MediaUnderstandingCapability;
   entry: MediaUnderstandingModelConfig;
   cfg: ClawdbotConfig;
   ctx: MsgContext;
-  attachment: MediaAttachment;
+  attachmentIndex: number;
+  cache: MediaAttachmentCache;
+  config?: MediaUnderstandingConfig;
 }): Promise<MediaUnderstandingOutput | null> {
-  const { entry, capability, cfg, ctx, attachment } = params;
+  const { entry, capability, cfg, ctx } = params;
   const command = entry.command?.trim();
   const args = entry.args ?? [];
   if (!command) {
     throw new Error(`CLI entry missing command for ${capability}`);
   }
-  const maxBytes = entry.maxBytes ??
-    resolveConfigValue(cfg.tools?.media?.[capability]?.maxBytes, DEFAULT_MAX_BYTES[capability]);
-  const maxChars = resolveMaxChars({ capability, entry, cfg });
+  const maxBytes = resolveMaxBytes({ capability, entry, cfg, config: params.config });
+  const maxChars = resolveMaxChars({ capability, entry, cfg, config: params.config });
   const timeoutMs = resolveTimeoutMs(
-    entry.timeoutSeconds ?? cfg.tools?.media?.[capability]?.timeoutSeconds,
+    entry.timeoutSeconds ?? params.config?.timeoutSeconds ?? cfg.tools?.media?.[capability]?.timeoutSeconds,
     DEFAULT_TIMEOUT_SECONDS[capability],
   );
   const prompt = resolvePrompt(
     capability,
-    entry.prompt ?? cfg.tools?.media?.[capability]?.prompt,
+    entry.prompt ?? params.config?.prompt ?? cfg.tools?.media?.[capability]?.prompt,
     maxChars,
   );
 
-  const pathResult = await resolveAttachmentPath({
-    attachment,
+  const pathResult = await params.cache.getPath({
+    attachmentIndex: params.attachmentIndex,
     maxBytes,
     timeoutMs,
   });
-  if (!pathResult) return null;
 
   const templCtx: MsgContext = {
     ...ctx,
@@ -654,78 +275,67 @@
   if (shouldLogVerbose()) {
     logVerbose(`Media understanding via CLI: ${argv.join(" ")}`);
   }
-  try {
-    const { stdout } = await runExec(argv[0], argv.slice(1), {
-      timeoutMs,
-      maxBuffer: CLI_OUTPUT_MAX_BUFFER,
-    });
-    const text = trimOutput(stdout, maxChars);
-    if (!text) return null;
-    return {
-      kind: capability === "audio" ? "audio.transcription" : `${capability}.description`,
-      attachmentIndex: attachment.index,
-      text,
-      provider: "cli",
-      model: command,
-    };
-  } finally {
-    if (pathResult.cleanup) {
-      await pathResult.cleanup();
-    }
-  }
+  const { stdout } = await runExec(argv[0], argv.slice(1), {
+    timeoutMs,
+    maxBuffer: CLI_OUTPUT_MAX_BUFFER,
+  });
+  const text = trimOutput(stdout, maxChars);
+  if (!text) return null;
+  return {
+    kind: capability === "audio" ? "audio.transcription" : `${capability}.description`,
+    attachmentIndex: params.attachmentIndex,
+    text,
+    provider: "cli",
+    model: command,
+  };
 }
 
-async function runCapability(params: {
-  capability: Capability;
+async function runAttachmentEntries(params: {
+  capability: MediaUnderstandingCapability;
   cfg: ClawdbotConfig;
   ctx: MsgContext;
-  attachments: MediaAttachment[];
+  attachmentIndex: number;
   agentDir?: string;
   providerRegistry: Map<string, MediaUnderstandingProvider>;
+  cache: MediaAttachmentCache;
+  entries: MediaUnderstandingModelConfig[];
+  config?: MediaUnderstandingConfig;
 }): Promise<MediaUnderstandingOutput | null> {
-  const { capability, cfg, ctx, attachments } = params;
-  const config = resolveCapabilityConfig(cfg, capability);
-  if (!config || config.enabled === false) return null;
-  const entries = resolveModelEntries(config, capability);
-  if (entries.length === 0) return null;
-
-  const scopeDecision = resolveScopeDecision({ scope: config.scope, ctx });
-  if (scopeDecision === "deny") {
-    if (shouldLogVerbose()) {
-      logVerbose(`${capability} understanding disabled by scope policy.`);
-    }
-    return null;
-  }
-
-  const attachment = attachments.find((item) => {
-    if (capability === "image") return isImageAttachment(item);
-    if (capability === "audio") return isAudioAttachment(item);
-    return isVideoAttachment(item);
-  });
-  if (!attachment) return null;
-
+  const { entries, capability } = params;
   for (const entry of entries) {
     try {
       const entryType = entry.type ?? (entry.command ? "cli" : "provider");
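+      // Entries without an explicit `type` are treated as CLI when they define a
+      // `command`, otherwise as provider API entries.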
       const result =
         entryType === "cli"
-          ? await runCliEntry({ capability, entry, cfg, ctx, attachment })
+          ? await runCliEntry({
+              capability,
+              entry,
+              cfg: params.cfg,
+              ctx: params.ctx,
+              attachmentIndex: params.attachmentIndex,
+              cache: params.cache,
+              config: params.config,
+            })
           : await runProviderEntry({
               capability,
               entry,
-              cfg,
-              ctx,
-              attachment,
+              cfg: params.cfg,
+              ctx: params.ctx,
+              attachmentIndex: params.attachmentIndex,
+              cache: params.cache,
               agentDir: params.agentDir,
               providerRegistry: params.providerRegistry,
+              config: params.config,
             });
       if (result) return result;
     } catch (err) {
-      if (isMaxBytesError(err)) {
+      if (isMediaUnderstandingSkipError(err)) {
         if (shouldLogVerbose()) {
-          logVerbose(`Skipping ${capability} model due to size: ${String(err)}`);
+          logVerbose(`Skipping ${capability} model due to ${err.reason}: ${err.message}`);
         }
-      } else if (shouldLogVerbose()) {
+        continue;
+      }
+      if (shouldLogVerbose()) {
         logVerbose(`${capability} understanding failed: ${String(err)}`);
       }
     }
@@ -734,6 +344,49 @@
   }
 
   return null;
 }
 
+async function runCapability(params: {
+  capability: MediaUnderstandingCapability;
+  cfg: ClawdbotConfig;
+  ctx: MsgContext;
+  attachments: MediaAttachmentCache;
+  attachmentIds: number[];
+  agentDir?: string;
+  providerRegistry: Map<string, MediaUnderstandingProvider>;
+  config?: MediaUnderstandingConfig;
+}): Promise<MediaUnderstandingOutput[]> {
+  const { capability, cfg, ctx } = params;
+  const config = params.config ?? resolveCapabilityConfig(cfg, capability);
+  if (!resolveCapabilityEnabled({ cfg, config })) return [];
+
+  const entries = resolveModelEntries({ cfg, capability, config });
+  if (entries.length === 0) return [];
+
+  const scopeDecision = resolveScopeDecision({ scope: config?.scope, ctx });
+  if (scopeDecision === "deny") {
+    if (shouldLogVerbose()) {
+      logVerbose(`${capability} understanding disabled by scope policy.`);
+    }
+    return [];
+  }
+
+  const outputs: MediaUnderstandingOutput[] = [];
+  for (const attachmentIndex of params.attachmentIds) {
+    const output = await runAttachmentEntries({
+      capability,
+      cfg,
+      ctx,
+      attachmentIndex,
+      agentDir: params.agentDir,
+      providerRegistry: params.providerRegistry,
+      cache: params.attachments,
+      entries,
+      config,
+    });
+    if (output) outputs.push(output);
+  }
+  return outputs;
+}
+
 export async function applyMediaUnderstanding(params: {
   ctx: MsgContext;
   cfg: ClawdbotConfig;
@@ -749,56 +402,62 @@
   const attachments = normalizeAttachments(ctx);
   const providerRegistry = buildMediaUnderstandingRegistry(params.providers);
 
-  const outputs: MediaUnderstandingOutput[] = [];
+  const cache = new MediaAttachmentCache(attachments);
 
-  const imageOutput = await runCapability({
-    capability: "image",
-    cfg,
-    ctx,
-    attachments,
-    agentDir: params.agentDir,
-    providerRegistry,
-  });
-  if (imageOutput) outputs.push(imageOutput);
+  try {
+    const tasks = CAPABILITY_ORDER.map((capability) => async () => {
+      const config = resolveCapabilityConfig(cfg, capability);
+      const attachmentPolicy = config?.attachments;
+      const selected = selectAttachments({
+        capability,
+        attachments,
+        policy: attachmentPolicy,
+      });
+      if (selected.length === 0) return [] as MediaUnderstandingOutput[];
+      return await runCapability({
+        capability,
+        cfg,
+        ctx,
+        attachments: cache,
+        attachmentIds: selected.map((item) => item.index),
+        agentDir: params.agentDir,
+        providerRegistry,
+        config,
+      });
+    });
 
-  const audioOutput = await runCapability({
-    capability: "audio",
-    cfg,
-    ctx,
-    attachments,
-    agentDir: params.agentDir,
-    providerRegistry,
-  });
-  if (audioOutput) outputs.push(audioOutput);
-
-  const
videoOutput = await runCapability({ - capability: "video", - cfg, - ctx, - attachments, - agentDir: params.agentDir, - providerRegistry, - }); - if (videoOutput) outputs.push(videoOutput); - - if (outputs.length > 0) { - ctx.Body = formatMediaUnderstandingBody({ body: ctx.Body, outputs }); - const audioResult = outputs.find((output) => output.kind === "audio.transcription"); - if (audioResult) { - ctx.Transcript = audioResult.text; - ctx.CommandBody = audioResult.text; - ctx.RawBody = audioResult.text; - } else if (originalUserText) { - ctx.CommandBody = originalUserText; - ctx.RawBody = originalUserText; + const results = await runWithConcurrency(tasks, resolveConcurrency(cfg)); + const outputs: MediaUnderstandingOutput[] = []; + for (const [index] of CAPABILITY_ORDER.entries()) { + const entries = results[index] ?? []; + if (!Array.isArray(entries)) continue; + for (const entry of entries) { + outputs.push(entry); + } } - ctx.MediaUnderstanding = [...(ctx.MediaUnderstanding ?? []), ...outputs]; - } - return { - outputs, - appliedImage: outputs.some((output) => output.kind === "image.description"), - appliedAudio: outputs.some((output) => output.kind === "audio.transcription"), - appliedVideo: outputs.some((output) => output.kind === "video.description"), - }; + if (outputs.length > 0) { + ctx.Body = formatMediaUnderstandingBody({ body: ctx.Body, outputs }); + const audioOutputs = outputs.filter((output) => output.kind === "audio.transcription"); + if (audioOutputs.length > 0) { + const transcript = formatAudioTranscripts(audioOutputs); + ctx.Transcript = transcript; + ctx.CommandBody = transcript; + ctx.RawBody = transcript; + } else if (originalUserText) { + ctx.CommandBody = originalUserText; + ctx.RawBody = originalUserText; + } + ctx.MediaUnderstanding = [...(ctx.MediaUnderstanding ?? 
[]), ...outputs];
+    }
+
+    return {
+      outputs,
+      appliedImage: outputs.some((output) => output.kind === "image.description"),
+      appliedAudio: outputs.some((output) => output.kind === "audio.transcription"),
+      appliedVideo: outputs.some((output) => output.kind === "video.description"),
+    };
+  } finally {
+    await cache.cleanup();
+  }
 }
diff --git a/src/media-understanding/attachments.ts b/src/media-understanding/attachments.ts
new file mode 100644
index 000000000..4f1df9df2
--- /dev/null
+++ b/src/media-understanding/attachments.ts
@@ -0,0 +1,386 @@
+import crypto from "node:crypto";
+import fs from "node:fs/promises";
+import os from "node:os";
+import path from "node:path";
+import { fileURLToPath } from "node:url";
+
+import type { MsgContext } from "../auto-reply/templating.js";
+import type { MediaUnderstandingAttachmentsConfig } from "../config/types.tools.js";
+import { fetchRemoteMedia, MediaFetchError } from "../media/fetch.js";
+import { detectMime, getFileExtension, isAudioFileName } from "../media/mime.js";
+import { logVerbose, shouldLogVerbose } from "../globals.js";
+import { fetchWithTimeout } from "./providers/shared.js";
+import type { MediaAttachment, MediaUnderstandingCapability } from "./types.js";
+import { MediaUnderstandingSkipError } from "./errors.js";
+
+type MediaBufferResult = {
+  buffer: Buffer;
+  mime?: string;
+  fileName: string;
+  size: number;
+};
+
+type MediaPathResult = {
+  path: string;
+  cleanup?: () => Promise<void> | void;
+};
+
+type AttachmentCacheEntry = {
+  attachment: MediaAttachment;
+  resolvedPath?: string;
+  statSize?: number;
+  buffer?: Buffer;
+  bufferMime?: string;
+  bufferFileName?: string;
+  tempPath?: string;
+  tempCleanup?: () => Promise<void>;
+};
+
+const DEFAULT_MAX_ATTACHMENTS = 1;
+
+function normalizeAttachmentPath(raw?: string | null): string | undefined {
+  const value = raw?.trim();
+  if (!value) return undefined;
+  if (value.startsWith("file://")) {
+    try {
+      return fileURLToPath(value);
+    } catch {
+      return undefined;
+    }
+  }
+  return value;
+}
+
+export function normalizeAttachments(ctx: MsgContext): MediaAttachment[] {
+  const pathsFromArray = Array.isArray(ctx.MediaPaths) ? ctx.MediaPaths : undefined;
+  const urlsFromArray = Array.isArray(ctx.MediaUrls) ? ctx.MediaUrls : undefined;
+  const typesFromArray = Array.isArray(ctx.MediaTypes) ? ctx.MediaTypes : undefined;
+  const resolveMime = (count: number, index: number) => {
+    const typeHint = typesFromArray?.[index];
+    const trimmed = typeof typeHint === "string" ? typeHint.trim() : "";
+    if (trimmed) return trimmed;
+    return count === 1 ? ctx.MediaType : undefined;
+  };
+
+  if (pathsFromArray && pathsFromArray.length > 0) {
+    const count = pathsFromArray.length;
+    const urls = urlsFromArray && urlsFromArray.length > 0 ? urlsFromArray : undefined;
+    return pathsFromArray
+      .map((value, index) => ({
+        path: value?.trim() || undefined,
+        url: urls?.[index] ??
ctx.MediaUrl, + mime: resolveMime(count, index), + index, + })) + .filter((entry) => Boolean(entry.path?.trim() || entry.url?.trim())); + } + + if (urlsFromArray && urlsFromArray.length > 0) { + const count = urlsFromArray.length; + return urlsFromArray + .map((value, index) => ({ + path: undefined, + url: value?.trim() || undefined, + mime: resolveMime(count, index), + index, + })) + .filter((entry) => Boolean(entry.url?.trim())); + } + + const pathValue = ctx.MediaPath?.trim(); + const url = ctx.MediaUrl?.trim(); + if (!pathValue && !url) return []; + return [ + { + path: pathValue || undefined, + url: url || undefined, + mime: ctx.MediaType, + index: 0, + }, + ]; +} + +export function isVideoAttachment(attachment: MediaAttachment): boolean { + if (attachment.mime?.startsWith("video/")) return true; + const ext = getFileExtension(attachment.path ?? attachment.url); + if (!ext) return false; + return [".mp4", ".mov", ".mkv", ".webm", ".avi", ".m4v"].includes(ext); +} + +export function isAudioAttachment(attachment: MediaAttachment): boolean { + if (attachment.mime?.startsWith("audio/")) return true; + return isAudioFileName(attachment.path ?? attachment.url); +} + +export function isImageAttachment(attachment: MediaAttachment): boolean { + if (attachment.mime?.startsWith("image/")) return true; + const ext = getFileExtension(attachment.path ?? attachment.url); + if (!ext) return false; + return [".png", ".jpg", ".jpeg", ".webp", ".gif", ".bmp", ".tiff", ".tif"].includes(ext); +} + +function isAbortError(err: unknown): boolean { + if (!err) return false; + if (err instanceof Error && err.name === "AbortError") return true; + return false; +} + +function resolveRequestUrl(input: RequestInfo | URL): string { + if (typeof input === "string") return input; + if (input instanceof URL) return input.toString(); + return input.url; +} + +function orderAttachments( + attachments: MediaAttachment[], + prefer?: MediaUnderstandingAttachmentsConfig["prefer"], +): MediaAttachment[] { + if (!prefer || prefer === "first") return attachments; + if (prefer === "last") return [...attachments].reverse(); + if (prefer === "path") { + const withPath = attachments.filter((item) => item.path); + const withoutPath = attachments.filter((item) => !item.path); + return [...withPath, ...withoutPath]; + } + if (prefer === "url") { + const withUrl = attachments.filter((item) => item.url); + const withoutUrl = attachments.filter((item) => !item.url); + return [...withUrl, ...withoutUrl]; + } + return attachments; +} + +export function selectAttachments(params: { + capability: MediaUnderstandingCapability; + attachments: MediaAttachment[]; + policy?: MediaUnderstandingAttachmentsConfig; +}): MediaAttachment[] { + const { capability, attachments, policy } = params; + const matches = attachments.filter((item) => { + if (capability === "image") return isImageAttachment(item); + if (capability === "audio") return isAudioAttachment(item); + return isVideoAttachment(item); + }); + if (matches.length === 0) return []; + + const ordered = orderAttachments(matches, policy?.prefer); + const mode = policy?.mode ?? "first"; + const maxAttachments = policy?.maxAttachments ?? 
DEFAULT_MAX_ATTACHMENTS;
+  if (mode === "all") {
+    return ordered.slice(0, Math.max(1, maxAttachments));
+  }
+  return ordered.slice(0, 1);
+}
+
+export class MediaAttachmentCache {
+  private readonly entries = new Map<number, AttachmentCacheEntry>();
+  private readonly attachments: MediaAttachment[];
+
+  constructor(attachments: MediaAttachment[]) {
+    this.attachments = attachments;
+    for (const attachment of attachments) {
+      this.entries.set(attachment.index, { attachment });
+    }
+  }
+
+  async getBuffer(params: {
+    attachmentIndex: number;
+    maxBytes: number;
+    timeoutMs: number;
+  }): Promise<MediaBufferResult> {
+    const entry = await this.ensureEntry(params.attachmentIndex);
+    if (entry.buffer) {
+      if (entry.buffer.length > params.maxBytes) {
+        throw new MediaUnderstandingSkipError(
+          "maxBytes",
+          `Attachment ${params.attachmentIndex + 1} exceeds maxBytes ${params.maxBytes}`,
+        );
+      }
+      return {
+        buffer: entry.buffer,
+        mime: entry.bufferMime,
+        fileName: entry.bufferFileName ?? `media-${params.attachmentIndex + 1}`,
+        size: entry.buffer.length,
+      };
+    }
+
+    if (entry.resolvedPath) {
+      const size = await this.ensureLocalStat(entry);
+      if (entry.resolvedPath) {
+        if (size !== undefined && size > params.maxBytes) {
+          throw new MediaUnderstandingSkipError(
+            "maxBytes",
+            `Attachment ${params.attachmentIndex + 1} exceeds maxBytes ${params.maxBytes}`,
+          );
+        }
+        const buffer = await fs.readFile(entry.resolvedPath);
+        entry.buffer = buffer;
+        entry.bufferMime =
+          entry.bufferMime ??
+          entry.attachment.mime ??
+          (await detectMime({
+            buffer,
+            filePath: entry.resolvedPath,
+          }));
+        entry.bufferFileName =
+          path.basename(entry.resolvedPath) || `media-${params.attachmentIndex + 1}`;
+        return {
+          buffer,
+          mime: entry.bufferMime,
+          fileName: entry.bufferFileName,
+          size: buffer.length,
+        };
+      }
+    }
+
+    const url = entry.attachment.url?.trim();
+    if (!url) {
+      throw new MediaUnderstandingSkipError(
+        "empty",
+        `Attachment ${params.attachmentIndex + 1} has no path or URL.`,
+      );
+    }
+
+    try {
+      const fetchImpl = (input: RequestInfo | URL, init?: RequestInit) =>
+        fetchWithTimeout(resolveRequestUrl(input), init ?? {}, params.timeoutMs, fetch);
+      const fetched = await fetchRemoteMedia({ url, fetchImpl, maxBytes: params.maxBytes });
+      entry.buffer = fetched.buffer;
+      entry.bufferMime =
+        entry.attachment.mime ??
+        fetched.contentType ??
+        (await detectMime({
+          buffer: fetched.buffer,
+          filePath: fetched.fileName ?? url,
+        }));
+      entry.bufferFileName = fetched.fileName ?? `media-${params.attachmentIndex + 1}`;
+      return {
+        buffer: fetched.buffer,
+        mime: entry.bufferMime,
+        fileName: entry.bufferFileName,
+        size: fetched.buffer.length,
+      };
+    } catch (err) {
+      if (err instanceof MediaFetchError && err.code === "max_bytes") {
+        throw new MediaUnderstandingSkipError(
+          "maxBytes",
+          `Attachment ${params.attachmentIndex + 1} exceeds maxBytes ${params.maxBytes}`,
+        );
+      }
+      if (isAbortError(err)) {
+        throw new MediaUnderstandingSkipError(
+          "timeout",
+          `Attachment ${params.attachmentIndex + 1} timed out while fetching.`,
+        );
+      }
+      throw err;
+    }
+  }
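+
+  /**
+   * Like getBuffer(), but guarantees a local file path; remote media is
+   * written to a temp file that cleanup() removes afterwards.
+   */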
+  async getPath(params: {
+    attachmentIndex: number;
+    maxBytes?: number;
+    timeoutMs: number;
+  }): Promise<{ path: string; cleanup?: () => Promise<void> | void }> {
+    const entry = await this.ensureEntry(params.attachmentIndex);
+    if (entry.resolvedPath) {
+      if (params.maxBytes) {
+        const size = await this.ensureLocalStat(entry);
+        if (entry.resolvedPath) {
+          if (size !== undefined && size > params.maxBytes) {
+            throw new MediaUnderstandingSkipError(
+              "maxBytes",
+              `Attachment ${params.attachmentIndex + 1} exceeds maxBytes ${params.maxBytes}`,
+            );
+          }
+        }
+      }
+      if (entry.resolvedPath) {
+        return { path: entry.resolvedPath };
+      }
+    }
+
+    if (entry.tempPath) {
+      if (params.maxBytes && entry.buffer && entry.buffer.length > params.maxBytes) {
+        throw new MediaUnderstandingSkipError(
+          "maxBytes",
+          `Attachment ${params.attachmentIndex + 1} exceeds maxBytes ${params.maxBytes}`,
+        );
+      }
+      return { path: entry.tempPath, cleanup: entry.tempCleanup };
+    }
+
+    const maxBytes = params.maxBytes ?? Number.POSITIVE_INFINITY;
+    const bufferResult = await this.getBuffer({
+      attachmentIndex: params.attachmentIndex,
+      maxBytes,
+      timeoutMs: params.timeoutMs,
+    });
+    const extension = path.extname(bufferResult.fileName || "") || "";
+    const tmpPath = path.join(
+      os.tmpdir(),
+      `clawdbot-media-${crypto.randomUUID()}${extension}`,
+    );
+    await fs.writeFile(tmpPath, bufferResult.buffer);
+    entry.tempPath = tmpPath;
+    entry.tempCleanup = async () => {
+      await fs.unlink(tmpPath).catch(() => {});
+    };
+    return { path: tmpPath, cleanup: entry.tempCleanup };
+  }
+
+  async cleanup(): Promise<void> {
+    const cleanups: Array<Promise<void> | void> = [];
+    for (const entry of this.entries.values()) {
+      if (entry.tempCleanup) {
+        cleanups.push(Promise.resolve(entry.tempCleanup()));
+        entry.tempCleanup = undefined;
+      }
+    }
+    await Promise.all(cleanups);
+  }
+
+  private async ensureEntry(attachmentIndex: number): Promise<AttachmentCacheEntry> {
+    const existing = this.entries.get(attachmentIndex);
+    if (existing) {
+      if (!existing.resolvedPath) {
+        existing.resolvedPath = this.resolveLocalPath(existing.attachment);
+      }
+      return existing;
+    }
+    const attachment =
+      this.attachments.find((item) => item.index === attachmentIndex) ?? { index: attachmentIndex };
+    const entry: AttachmentCacheEntry = {
+      attachment,
+      resolvedPath: this.resolveLocalPath(attachment),
+    };
+    this.entries.set(attachmentIndex, entry);
+    return entry;
+  }
+
+  private resolveLocalPath(attachment: MediaAttachment): string | undefined {
+    const rawPath = normalizeAttachmentPath(attachment.path);
+    if (!rawPath) return undefined;
+    return path.isAbsolute(rawPath) ? rawPath : path.resolve(rawPath);
+  }
+
+  private async ensureLocalStat(entry: AttachmentCacheEntry): Promise<number | undefined> {
+    if (!entry.resolvedPath) return undefined;
+    if (entry.statSize !== undefined) return entry.statSize;
+    try {
+      const stat = await fs.stat(entry.resolvedPath);
+      if (!stat.isFile()) {
+        entry.resolvedPath = undefined;
+        return undefined;
+      }
+      entry.statSize = stat.size;
+      return stat.size;
+    } catch (err) {
+      entry.resolvedPath = undefined;
+      if (shouldLogVerbose()) {
+        logVerbose(`Failed to read attachment ${entry.attachment.index + 1}: ${String(err)}`);
+      }
+      return undefined;
+    }
+  }
+}
diff --git a/src/media-understanding/concurrency.ts b/src/media-understanding/concurrency.ts
new file mode 100644
index 000000000..8ccba85f4
--- /dev/null
+++ b/src/media-understanding/concurrency.ts
@@ -0,0 +1,29 @@
+import { logVerbose, shouldLogVerbose } from "../globals.js";
+
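+// Minimal usage sketch (hypothetical capability runners; the limit normally
+// comes from resolveConcurrency(cfg), default 2):
+//
+//   const results = await runWithConcurrency(
+//     [() => runImage(), () => runAudio(), () => runVideo()],
+//     2,
+//   );
+//
+// Results keep task order; a task that throws leaves its slot undefined and is
+// only logged in verbose mode.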
+export async function runWithConcurrency<T>(
+  tasks: Array<() => Promise<T>>,
+  limit: number,
+): Promise<T[]> {
+  if (tasks.length === 0) return [];
+  const resolvedLimit = Math.max(1, Math.min(limit, tasks.length));
+  const results: T[] = Array.from({ length: tasks.length });
+  let next = 0;
+
+  const workers = Array.from({ length: resolvedLimit }, async () => {
+    while (true) {
+      const index = next;
+      next += 1;
+      if (index >= tasks.length) return;
+      try {
+        results[index] = await tasks[index]();
+      } catch (err) {
+        if (shouldLogVerbose()) {
+          logVerbose(`Media understanding task failed: ${String(err)}`);
+        }
+      }
+    }
+  });
+
+  await Promise.allSettled(workers);
+  return results;
+}
diff --git a/src/media-understanding/defaults.ts b/src/media-understanding/defaults.ts
new file mode 100644
index 000000000..92ce8835c
--- /dev/null
+++ b/src/media-understanding/defaults.ts
@@ -0,0 +1,35 @@
+import type { MediaUnderstandingCapability } from "./types.js";
+
+const MB = 1024 * 1024;
+
+export const DEFAULT_MAX_CHARS = 500;
+export const DEFAULT_MAX_CHARS_BY_CAPABILITY: Record<
+  MediaUnderstandingCapability,
+  number | undefined
+> = {
+  image: DEFAULT_MAX_CHARS,
+  audio: undefined,
+  video: DEFAULT_MAX_CHARS,
+};
+export const DEFAULT_MAX_BYTES: Record<MediaUnderstandingCapability, number> = {
+  image: 10 * MB,
+  audio: 20 * MB,
+  video: 50 * MB,
+};
+export const DEFAULT_TIMEOUT_SECONDS: Record<MediaUnderstandingCapability, number> = {
+  image: 60,
+  audio: 60,
+  video: 120,
+};
+export const DEFAULT_PROMPT: Record<MediaUnderstandingCapability, string> = {
+  image: "Describe the image.",
+  audio: "Transcribe the audio.",
+  video: "Describe the video.",
+};
+export const DEFAULT_VIDEO_MAX_BASE64_BYTES = 70 * MB;
+export const DEFAULT_AUDIO_MODELS: Record<string, string> = {
+  groq: "whisper-large-v3-turbo",
+  openai: "whisper-1",
+};
+export const CLI_OUTPUT_MAX_BUFFER = 5 * MB;
+export const DEFAULT_MEDIA_CONCURRENCY = 2;
diff --git a/src/media-understanding/errors.ts b/src/media-understanding/errors.ts
new file mode 100644
index 000000000..738670b1d
--- /dev/null
+++ b/src/media-understanding/errors.ts
@@ -0,0 +1,17 @@
+export type MediaUnderstandingSkipReason = "maxBytes" | "timeout" | "unsupported" | "empty";
+
+export class MediaUnderstandingSkipError extends Error {
+  readonly reason: MediaUnderstandingSkipReason;
+
+  constructor(reason: MediaUnderstandingSkipReason, message: string) {
+    super(message);
+    this.reason = reason;
+    this.name = "MediaUnderstandingSkipError";
+  }
+}
+
+export function isMediaUnderstandingSkipError(
+  err: unknown,
+): err is MediaUnderstandingSkipError {
+  return err instanceof MediaUnderstandingSkipError;
+}
diff --git a/src/media-understanding/format.ts b/src/media-understanding/format.ts
index ffa6f0145..f99cd8d3e 100644
--- a/src/media-understanding/format.ts
+++ b/src/media-understanding/format.ts
@@ -12,7 +12,7 @@ export function extractMediaUserText(body?: string): string | undefined {
 }
 
 function formatSection(
-  title: "Audio" | "Video" | "Image",
+  title: string,
   kind: "Transcript" | "Description",
   text: string,
   userText?: string,
@@ -40,11 +40,21 @@ export function formatMediaUnderstandingBody(params: {
     sections.push(`User text:\n${userText}`);
   }
 
+  const counts = new Map<string, number>();
   for (const output of outputs) {
+    counts.set(output.kind, (counts.get(output.kind) ?? 0) + 1);
+  }
+  const seen = new Map<string, number>();
+
+  for (const output of outputs) {
+    const count = counts.get(output.kind) ?? 1;
+    const next = (seen.get(output.kind) ?? 0) + 1;
+    seen.set(output.kind, next);
+    const suffix = count > 1 ? ` ${next}/${count}` : "";
     if (output.kind === "audio.transcription") {
       sections.push(
         formatSection(
-          "Audio",
+          `Audio${suffix}`,
           "Transcript",
           output.text,
           outputs.length === 1 ? userText : undefined,
@@ -55,7 +65,7 @@ export function formatMediaUnderstandingBody(params: {
     if (output.kind === "image.description") {
       sections.push(
         formatSection(
-          "Image",
+          `Image${suffix}`,
           "Description",
           output.text,
           outputs.length === 1 ? userText : undefined,
@@ -65,7 +75,7 @@ export function formatMediaUnderstandingBody(params: {
     }
     sections.push(
       formatSection(
-        "Video",
+        `Video${suffix}`,
        "Description",
         output.text,
         outputs.length === 1 ? userText : undefined,
@@ -75,3 +85,10 @@
 
   return sections.join("\n\n").trim();
 }
+
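+// Joins several transcripts into one block (assumption: this is what backs
+// {{Transcript}} when the audio attachments policy uses mode: "all"); a single
+// transcript passes through unchanged, e.g. two voice notes yield:
+//   Audio 1:\n<first>\n\nAudio 2:\n<second>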
+export function formatAudioTranscripts(outputs: MediaUnderstandingOutput[]): string {
+  if (outputs.length === 1) return outputs[0].text;
+  return outputs
+    .map((output, index) => `Audio ${index + 1}:\n${output.text}`)
+    .join("\n\n");
+}
diff --git a/src/media-understanding/providers/anthropic/index.ts b/src/media-understanding/providers/anthropic/index.ts
new file mode 100644
index 000000000..3f9fc584c
--- /dev/null
+++ b/src/media-understanding/providers/anthropic/index.ts
@@ -0,0 +1,7 @@
+import type { MediaUnderstandingProvider } from "../../types.js";
+import { describeImageWithModel } from "../image.js";
+
+export const anthropicProvider: MediaUnderstandingProvider = {
+  id: "anthropic",
+  describeImage: describeImageWithModel,
+};
diff --git a/src/media-understanding/providers/google/index.ts b/src/media-understanding/providers/google/index.ts
index 285195dc7..d0f8bae3b 100644
--- a/src/media-understanding/providers/google/index.ts
+++ b/src/media-understanding/providers/google/index.ts
@@ -1,7 +1,9 @@
 import type { MediaUnderstandingProvider } from "../../types.js";
+import { describeImageWithModel } from "../image.js";
 import { describeGeminiVideo } from "./video.js";
 
 export const googleProvider: MediaUnderstandingProvider = {
   id: "google",
+  describeImage: describeImageWithModel,
   describeVideo: describeGeminiVideo,
 };
diff --git a/src/media-understanding/providers/image.ts b/src/media-understanding/providers/image.ts
new file mode 100644
index 000000000..bd056253a
--- /dev/null
+++ b/src/media-understanding/providers/image.ts
@@ -0,0 +1,66 @@
+import type { Api, AssistantMessage, Context, Model } from "@mariozechner/pi-ai";
+import { complete } from "@mariozechner/pi-ai";
+import { discoverAuthStorage, discoverModels } from "@mariozechner/pi-coding-agent";
+
+import { getApiKeyForModel } from "../../agents/model-auth.js";
+import { ensureClawdbotModelsJson } from "../../agents/models-config.js";
+import { minimaxUnderstandImage } from "../../agents/minimax-vlm.js";
+import { coerceImageAssistantText } from "../../agents/tools/image-tool.helpers.js";
+import type { ImageDescriptionRequest, ImageDescriptionResult } from "../types.js";
+
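+// Shared provider path for image description: resolve the model from the
+// registry, verify it accepts image input, resolve auth, then send the prompt
+// plus base64-encoded image. MiniMax goes through its dedicated VLM endpoint;
+// all other providers use the generic pi-ai `complete` call.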
"../../agents/minimax-vlm.js"; +import { coerceImageAssistantText } from "../../agents/tools/image-tool.helpers.js"; +import type { ImageDescriptionRequest, ImageDescriptionResult } from "../types.js"; + +export async function describeImageWithModel( + params: ImageDescriptionRequest, +): Promise { + await ensureClawdbotModelsJson(params.cfg, params.agentDir); + const authStorage = discoverAuthStorage(params.agentDir); + const modelRegistry = discoverModels(authStorage, params.agentDir); + const model = modelRegistry.find(params.provider, params.model) as Model | null; + if (!model) { + throw new Error(`Unknown model: ${params.provider}/${params.model}`); + } + if (!model.input?.includes("image")) { + throw new Error(`Model does not support images: ${params.provider}/${params.model}`); + } + const apiKeyInfo = await getApiKeyForModel({ + model, + cfg: params.cfg, + agentDir: params.agentDir, + profileId: params.profile, + preferredProfile: params.preferredProfile, + }); + authStorage.setRuntimeApiKey(model.provider, apiKeyInfo.apiKey); + + const base64 = params.buffer.toString("base64"); + if (model.provider === "minimax") { + const text = await minimaxUnderstandImage({ + apiKey: apiKeyInfo.apiKey, + prompt: params.prompt ?? "Describe the image.", + imageDataUrl: `data:${params.mime ?? "image/jpeg"};base64,${base64}`, + modelBaseUrl: model.baseUrl, + }); + return { text, model: model.id }; + } + + const context: Context = { + messages: [ + { + role: "user", + content: [ + { type: "text", text: params.prompt ?? "Describe the image." }, + { type: "image", data: base64, mimeType: params.mime ?? "image/jpeg" }, + ], + timestamp: Date.now(), + }, + ], + }; + const message = (await complete(model, context, { + apiKey: apiKeyInfo.apiKey, + maxTokens: params.maxTokens ?? 
+export function resolveTimeoutMs(seconds: number | undefined, fallbackSeconds: number): number {
+  const value =
+    typeof seconds === "number" && Number.isFinite(seconds) ? seconds : fallbackSeconds;
+  return Math.max(1000, Math.floor(value * 1000));
+}
+
+export function resolvePrompt(
+  capability: MediaUnderstandingCapability,
+  prompt?: string,
+  maxChars?: number,
+): string {
+  const base = prompt?.trim() || DEFAULT_PROMPT[capability];
+  if (!maxChars || capability === "audio") return base;
+  return `${base} Respond in at most ${maxChars} characters.`;
+}
+
+export function resolveMaxChars(params: {
+  capability: MediaUnderstandingCapability;
+  entry: MediaUnderstandingModelConfig;
+  cfg: ClawdbotConfig;
+  config?: MediaUnderstandingConfig;
+}): number | undefined {
+  const { capability, entry, cfg } = params;
+  const configured =
+    entry.maxChars ?? params.config?.maxChars ?? cfg.tools?.media?.[capability]?.maxChars;
+  if (typeof configured === "number") return configured;
+  return DEFAULT_MAX_CHARS_BY_CAPABILITY[capability];
+}
+
+export function resolveMaxBytes(params: {
+  capability: MediaUnderstandingCapability;
+  entry: MediaUnderstandingModelConfig;
+  cfg: ClawdbotConfig;
+  config?: MediaUnderstandingConfig;
+}): number {
+  const configured =
+    params.entry.maxBytes ??
+    params.config?.maxBytes ??
+    params.cfg.tools?.media?.[params.capability]?.maxBytes;
+  if (typeof configured === "number") return configured;
+  return DEFAULT_MAX_BYTES[params.capability];
+}
+
+export function resolveCapabilityConfig(
+  cfg: ClawdbotConfig,
+  capability: MediaUnderstandingCapability,
+): MediaUnderstandingConfig | undefined {
+  return cfg.tools?.media?.[capability];
+}
+
+export function resolveScopeDecision(params: {
+  scope?: MediaUnderstandingScopeConfig;
+  ctx: MsgContext;
+}): "allow" | "deny" {
+  return resolveMediaUnderstandingScope({
+    scope: params.scope,
+    sessionKey: params.ctx.SessionKey,
+    channel: params.ctx.Surface ?? params.ctx.Provider,
+    chatType: normalizeMediaUnderstandingChatType(params.ctx.ChatType),
+  });
+}
+
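+// Capability inference for entries without an explicit `capabilities` list,
+// matching the documented defaults: CLI entries are assumed to handle all
+// three capabilities; openai/anthropic/minimax → image,
+// google → image+audio+video, groq → audio.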
+function inferCapabilities(
+  entry: MediaUnderstandingModelConfig,
+): MediaUnderstandingCapability[] | undefined {
+  if ((entry.type ?? (entry.command ? "cli" : "provider")) === "cli") {
+    return ["image", "audio", "video"];
+  }
+  const provider = normalizeMediaProviderId(entry.provider ?? "");
+  if (!provider) return undefined;
+  if (provider === "openai" || provider === "anthropic" || provider === "minimax") {
+    return ["image"];
+  }
+  if (provider === "google") {
+    return ["image", "audio", "video"];
+  }
+  if (provider === "groq") {
+    return ["audio"];
+  }
+  return undefined;
+}
+
+export function resolveModelEntries(params: {
+  cfg: ClawdbotConfig;
+  capability: MediaUnderstandingCapability;
+  config?: MediaUnderstandingConfig;
+}): MediaUnderstandingModelConfig[] {
+  const { cfg, capability, config } = params;
+  const sharedModels = cfg.tools?.media?.models ?? [];
+  const entries = [
+    ...(config?.models ?? []).map((entry) => ({ entry, source: "capability" as const })),
+    ...sharedModels.map((entry) => ({ entry, source: "shared" as const })),
+  ];
+  if (entries.length === 0) return [];
+
+  return entries
+    .filter(({ entry, source }) => {
+      const caps =
+        entry.capabilities && entry.capabilities.length > 0
+          ? entry.capabilities
+          : source === "shared"
+            ? inferCapabilities(entry)
+            : undefined;
+      if (!caps || caps.length === 0) {
+        if (source === "shared") {
+          if (shouldLogVerbose()) {
+            logVerbose(
+              `Skipping shared media model without capabilities: ${entry.provider ?? entry.command ?? "unknown"}`,
+            );
+          }
+          return false;
+        }
+        return true;
+      }
+      return caps.includes(capability);
+    })
+    .map(({ entry }) => entry);
+}
+
+export function resolveConcurrency(cfg: ClawdbotConfig): number {
+  const configured = cfg.tools?.media?.concurrency;
+  if (typeof configured === "number" && Number.isFinite(configured) && configured > 0) {
+    return Math.floor(configured);
+  }
+  return DEFAULT_MEDIA_CONCURRENCY;
+}
+
+export function resolveCapabilityEnabled(params: {
+  cfg: ClawdbotConfig;
+  config?: MediaUnderstandingConfig;
+}): boolean {
+  if (params.config?.enabled === false) return false;
+  const sharedModels = params.cfg.tools?.media?.models ?? [];
+  const hasModels = (params.config?.models?.length ?? 0) > 0 || sharedModels.length > 0;
+  if (!hasModels) return false;
+  return true;
+}
diff --git a/src/media-understanding/types.ts b/src/media-understanding/types.ts
index 85c897275..a74c79757 100644
--- a/src/media-understanding/types.ts
+++ b/src/media-understanding/types.ts
@@ -3,6 +3,8 @@ export type MediaUnderstandingKind =
   | "video.description"
   | "image.description";
 
+export type MediaUnderstandingCapability = "image" | "audio" | "video";
+
 export type MediaAttachment = {
   path?: string;
   url?: string;
@@ -55,8 +57,29 @@ export type VideoDescriptionResult = {
   model?: string;
 };
 
+export type ImageDescriptionRequest = {
+  buffer: Buffer;
+  fileName: string;
+  mime?: string;
+  model: string;
+  provider: string;
+  prompt?: string;
+  maxTokens?: number;
+  timeoutMs: number;
+  profile?: string;
+  preferredProfile?: string;
+  agentDir: string;
+  cfg: import("../config/config.js").ClawdbotConfig;
+};
+
+export type ImageDescriptionResult = {
+  text: string;
+  model?: string;
+};
+
 export type MediaUnderstandingProvider = {
   id: string;
   transcribeAudio?: (req: AudioTranscriptionRequest) => Promise<AudioTranscriptionResult>;
   describeVideo?: (req: VideoDescriptionRequest) => Promise<VideoDescriptionResult>;
+  describeImage?: (req: ImageDescriptionRequest) => Promise<ImageDescriptionResult>;
 };
diff --git a/src/media-understanding/video.ts b/src/media-understanding/video.ts
new file mode 100644
index 000000000..00773f40c
--- /dev/null
+++ b/src/media-understanding/video.ts
@@ -0,0 +1,10 @@
+import { DEFAULT_VIDEO_MAX_BASE64_BYTES } from "./defaults.js";
+
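+// Base64 inflates payloads by 4/3 (every 3 input bytes become 4 output chars),
+// so a 30MB video encodes to roughly 40MB. resolveVideoMaxBase64Bytes widens
+// the configured byte cap by that factor but clamps to the
+// DEFAULT_VIDEO_MAX_BASE64_BYTES ceiling (70MB).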
"unknown"}`, + ); + } + return false; + } + return true; + } + return caps.includes(capability); + }) + .map(({ entry }) => entry); +} + +export function resolveConcurrency(cfg: ClawdbotConfig): number { + const configured = cfg.tools?.media?.concurrency; + if (typeof configured === "number" && Number.isFinite(configured) && configured > 0) { + return Math.floor(configured); + } + return DEFAULT_MEDIA_CONCURRENCY; +} + +export function resolveCapabilityEnabled(params: { + cfg: ClawdbotConfig; + config?: MediaUnderstandingConfig; +}): boolean { + if (params.config?.enabled === false) return false; + const sharedModels = params.cfg.tools?.media?.models ?? []; + const hasModels = (params.config?.models?.length ?? 0) > 0 || sharedModels.length > 0; + if (!hasModels) return false; + return true; +} diff --git a/src/media-understanding/types.ts b/src/media-understanding/types.ts index 85c897275..a74c79757 100644 --- a/src/media-understanding/types.ts +++ b/src/media-understanding/types.ts @@ -3,6 +3,8 @@ export type MediaUnderstandingKind = | "video.description" | "image.description"; +export type MediaUnderstandingCapability = "image" | "audio" | "video"; + export type MediaAttachment = { path?: string; url?: string; @@ -55,8 +57,29 @@ export type VideoDescriptionResult = { model?: string; }; +export type ImageDescriptionRequest = { + buffer: Buffer; + fileName: string; + mime?: string; + model: string; + provider: string; + prompt?: string; + maxTokens?: number; + timeoutMs: number; + profile?: string; + preferredProfile?: string; + agentDir: string; + cfg: import("../config/config.js").ClawdbotConfig; +}; + +export type ImageDescriptionResult = { + text: string; + model?: string; +}; + export type MediaUnderstandingProvider = { id: string; transcribeAudio?: (req: AudioTranscriptionRequest) => Promise; describeVideo?: (req: VideoDescriptionRequest) => Promise; + describeImage?: (req: ImageDescriptionRequest) => Promise; }; diff --git a/src/media-understanding/video.ts b/src/media-understanding/video.ts new file mode 100644 index 000000000..00773f40c --- /dev/null +++ b/src/media-understanding/video.ts @@ -0,0 +1,10 @@ +import { DEFAULT_VIDEO_MAX_BASE64_BYTES } from "./defaults.js"; + +export function estimateBase64Size(bytes: number): number { + return Math.ceil(bytes / 3) * 4; +} + +export function resolveVideoMaxBase64Bytes(maxBytes: number): number { + const expanded = Math.floor(maxBytes * (4 / 3)); + return Math.min(expanded, DEFAULT_VIDEO_MAX_BASE64_BYTES); +} diff --git a/src/media/fetch.ts b/src/media/fetch.ts index 6ee706d97..727ab7a5d 100644 --- a/src/media/fetch.ts +++ b/src/media/fetch.ts @@ -8,6 +8,18 @@ type FetchMediaResult = { fileName?: string; }; +export type MediaFetchErrorCode = "max_bytes" | "http_error" | "fetch_failed"; + +export class MediaFetchError extends Error { + readonly code: MediaFetchErrorCode; + + constructor(code: MediaFetchErrorCode, message: string) { + super(message); + this.code = code; + this.name = "MediaFetchError"; + } +} + export type FetchLike = (input: RequestInfo | URL, init?: RequestInit) => Promise; type FetchMediaOptions = { @@ -62,7 +74,7 @@ export async function fetchRemoteMedia(options: FetchMediaOptions): Promise maxBytes) { - throw new Error( + throw new MediaFetchError( + "max_bytes", `Failed to fetch media from ${url}: content length ${length} exceeds maxBytes ${maxBytes}`, ); } @@ -128,7 +144,8 @@ async function readResponseWithLimit(res: Response, maxBytes: number): Promise maxBytes) { - throw new Error( + throw new MediaFetchError( + 
"max_bytes", `Failed to fetch media from ${res.url || "response"}: payload exceeds maxBytes ${maxBytes}`, ); } @@ -148,7 +165,8 @@ async function readResponseWithLimit(res: Response, maxBytes: number): Promise