refactor: unify media understanding pipeline

This commit is contained in:
Peter Steinberger
2026-01-17 04:38:20 +00:00
parent 49ecbd8fea
commit fcb7c9ff65
24 changed files with 1250 additions and 643 deletions

View File

@@ -1770,13 +1770,16 @@ Legacy: `tools.bash` is still accepted as an alias.
- `tools.web.fetch.firecrawl.timeoutSeconds` (optional)
`tools.media` configures inbound media understanding (image/audio/video):
- `tools.media.models`: shared model list (capability-tagged; used after per-cap lists).
- `tools.media.concurrency`: max concurrent capability runs (default 2).
- `tools.media.image` / `tools.media.audio` / `tools.media.video`:
  - `enabled`: opt-out switch (default true when models are configured).
  - `prompt`: optional prompt override (image/video append a `maxChars` hint automatically).
  - `maxChars`: max output characters (default 500 for image/video; unset for audio).
  - `maxBytes`: max media size to send (defaults: image 10MB, audio 20MB, video 50MB).
  - `timeoutSeconds`: request timeout (defaults: image 60s, audio 60s, video 120s).
  - `language`: optional audio hint.
  - `attachments`: attachment policy (`mode`, `maxAttachments`, `prefer`).
  - `scope`: optional gating (first match wins) with `match.channel`, `match.chatType`, or `match.keyPrefix`.
  - `models`: ordered list of model entries; failures or oversize media fall back to the next entry.
- Each `models[]` entry:
@@ -1787,7 +1790,7 @@ Legacy: `tools.bash` is still accepted as an alias.
- CLI entry (`type: "cli"`):
  - `command`: executable to run.
  - `args`: templated args (supports `{{MediaPath}}`, `{{Prompt}}`, `{{MaxChars}}`, etc).
- `capabilities`: optional list (`image`, `audio`, `video`) to gate a shared entry. Defaults when omitted: `openai`/`anthropic`/`minimax` → image, `google` → image+audio+video, `groq` → audio.
- `prompt`, `maxChars`, `maxBytes`, `timeoutSeconds`, `language` can be overridden per entry.

If no models are configured (or `enabled: false`), understanding is skipped; the model still receives the original attachments.
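Putting the options above together, a minimal config sketch (the model id is illustrative; field names are as documented above):

```json5
{
  tools: {
    media: {
      concurrency: 2,
      models: [
        // capability-tagged shared entry; google defaults to image+audio+video
        { provider: "google", model: "gemini-3-flash-preview" }
      ],
      audio: {
        language: "en",
        attachments: { mode: "all", maxAttachments: 2 }
      },
      video: { maxChars: 500 }
    }
  }
}
```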
@@ -2900,7 +2903,7 @@ clawdbot dns setup --apply
## Template variables
Template placeholders are expanded in `tools.media.*.models[].args` and `tools.media.models[].args` (and any future templated argument fields).
| Variable | Description |
|----------|-------------|

View File

@@ -6,7 +6,7 @@ read_when:
# Audio / Voice Notes — 2026-01-17
## What works
- **Media understanding (audio)**: If `tools.media.audio` is enabled (or a shared `tools.media.models` entry supports audio), Clawdbot:
  1) Locates the first audio attachment (local path or URL) and downloads it if needed.
  2) Enforces `maxBytes` before sending to each model entry.
  3) Runs the first eligible model entry in order (provider or CLI).
@@ -66,6 +66,7 @@ read_when:
- Provider auth follows the standard model auth order (auth profiles, env vars, `models.providers.*.apiKey`).
- Default size cap is 20MB (`tools.media.audio.maxBytes`). Oversize audio is skipped for that model and the next entry is tried.
- Default `maxChars` for audio is **unset** (full transcript). Set `tools.media.audio.maxChars` or per-entry `maxChars` to trim output.
- Use `tools.media.audio.attachments` to process multiple voice notes (`mode: "all"` + `maxAttachments`).
- Transcript is available to templates as `{{Transcript}}`.
- CLI stdout is capped (5MB); keep CLI output concise.
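A config sketch combining the defaults above (the `groq` provider entry mirrors the audio default; the `maxChars` value is illustrative):

```json5
{
  tools: {
    media: {
      audio: {
        maxBytes: 20971520,                               // 20MB default, shown explicitly
        maxChars: 2000,                                   // trim long transcripts (unset by default)
        attachments: { mode: "all", maxAttachments: 2 },  // process up to two voice notes
        models: [{ provider: "groq" }]
      }
    }
  }
}
```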

View File

@@ -38,10 +38,10 @@ The WhatsApp channel runs via **Baileys Web**. This document captures the curren
- `{{MediaUrl}}` pseudo-URL for the inbound media.
- `{{MediaPath}}` local temp path written before running the command.
- When a per-session Docker sandbox is enabled, inbound media is copied into the sandbox workspace and `MediaPath`/`MediaUrl` are rewritten to a relative path like `media/inbound/<filename>`.
- Media understanding (if configured via `tools.media.*` or shared `tools.media.models`) runs before templating and can insert `[Image]`, `[Audio]`, and `[Video]` blocks into `Body`.
- Audio sets `{{Transcript}}` and uses the transcript for command parsing so slash commands still work.
- Video and image descriptions preserve any caption text for command parsing.
- By default only the first matching image/audio/video attachment is processed; set `tools.media.<cap>.attachments` to process multiple attachments.
## Limits & Errors
**Outbound send caps (WhatsApp web send)**

View File

@@ -16,7 +16,7 @@ Clawdbot can optionally **summarize inbound media** (image/audio/video) before t
## High-level behavior
1) Collect inbound attachments (`MediaPaths`, `MediaUrls`, `MediaTypes`).
2) For each enabled capability (image/audio/video), select attachments per policy (default: **first**).
3) Choose the first eligible model entry (size + capability + auth).
4) If a model fails or the media is too large, **fall back to the next entry**.
5) On success:
@@ -27,18 +27,23 @@ Clawdbot can optionally **summarize inbound media** (image/audio/video) before t
If understanding fails or is disabled, **the reply flow continues** with the original body + attachments.
## Config overview
`tools.media` supports **shared models** plus per-capability overrides:
- `tools.media.models`: shared model list (use `capabilities` to gate).
- `tools.media.image` / `tools.media.audio` / `tools.media.video`:
  - defaults (`prompt`, `maxChars`, `maxBytes`, `timeoutSeconds`, `language`)
  - optional **per-capability `models` list** (preferred before shared models)
  - `attachments` policy (`mode`, `maxAttachments`, `prefer`)
  - `scope` (optional gating by channel/chatType/session key)
- `tools.media.concurrency`: max concurrent capability runs (default **2**).
```json5
{
  tools: {
    media: {
      models: [ /* shared list */ ],
      image: { /* optional overrides */ },
      audio: { /* optional overrides */ },
      video: { /* optional overrides */ }
    }
  }
}
```
@@ -95,12 +100,13 @@ Rules:
- `prompt` defaults to a simple “Describe the {media}.” plus the `maxChars` guidance (image/video only).
## Capabilities (optional)
If you set `capabilities`, the entry only runs for those media types. For shared lists, Clawdbot can infer defaults:
- `openai`, `anthropic`, `minimax`: **image**
- `google` (Gemini API): **image + audio + video**
- `groq`: **audio**
For CLI entries, **set `capabilities` explicitly** to avoid surprising matches.
If you omit `capabilities`, the entry is eligible for the list it appears in.
## Provider support matrix (Clawdbot integrations)
@@ -123,9 +129,49 @@ If you omit `capabilities`, the entry is eligible for the list it appears in.
- `google/gemini-3-flash-preview` (fast), `google/gemini-3-pro-preview` (richer).
- CLI fallback: `gemini` CLI (supports `read_file` on video/audio).
## Attachment policy
Per-capability `attachments` controls which attachments are processed:
- `mode`: `first` (default) or `all`
- `maxAttachments`: cap the number processed (default **1**)
- `prefer`: `first`, `last`, `path`, `url`
When `mode: "all"`, outputs are labeled `[Image 1/2]`, `[Audio 2/2]`, etc.
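For example, a per-capability policy using the fields above might look like:

```json5
{
  tools: {
    media: {
      image: {
        // process up to three images, preferring local paths over URLs
        attachments: { mode: "all", maxAttachments: 3, prefer: "path" }
      }
    }
  }
}
```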
## Config examples
### 1) Shared models list + overrides
```json5
{
  tools: {
    media: {
      models: [
        { provider: "openai", model: "gpt-5.2", capabilities: ["image"] },
        { provider: "google", model: "gemini-3-flash-preview", capabilities: ["image", "audio", "video"] },
        {
          type: "cli",
          command: "gemini",
          args: [
            "-m",
            "gemini-3-flash",
            "--allowed-tools",
            "read_file",
            "Read the media at {{MediaPath}} and describe it in <= {{MaxChars}} characters."
          ],
          capabilities: ["image", "video"]
        }
      ],
      audio: {
        attachments: { mode: "all", maxAttachments: 2 }
      },
      video: {
        maxChars: 500
      }
    }
  }
}
```
### 2) Audio + Video only (image off)
```json5
{
  tools: {
@@ -164,7 +210,7 @@ If you omit `capabilities`, the entry is eligible for the list it appears in.
}
```
### 3) Optional image understanding
```json5
{
  tools: {
@@ -194,7 +240,7 @@ If you omit `capabilities`, the entry is eligible for the list it appears in.
}
```
### 4) Multimodal single entry (explicit capabilities)
```json5
{
  tools: {

View File

@@ -107,14 +107,18 @@ const FIELD_LABELS: Record<string, string> = {
  "tools.media.image.maxChars": "Image Understanding Max Chars",
  "tools.media.image.prompt": "Image Understanding Prompt",
  "tools.media.image.timeoutSeconds": "Image Understanding Timeout (sec)",
  "tools.media.image.attachments": "Image Understanding Attachment Policy",
  "tools.media.image.models": "Image Understanding Models",
  "tools.media.image.scope": "Image Understanding Scope",
  "tools.media.models": "Media Understanding Shared Models",
  "tools.media.concurrency": "Media Understanding Concurrency",
  "tools.media.audio.enabled": "Enable Audio Understanding",
  "tools.media.audio.maxBytes": "Audio Understanding Max Bytes",
  "tools.media.audio.maxChars": "Audio Understanding Max Chars",
  "tools.media.audio.prompt": "Audio Understanding Prompt",
  "tools.media.audio.timeoutSeconds": "Audio Understanding Timeout (sec)",
  "tools.media.audio.language": "Audio Understanding Language",
  "tools.media.audio.attachments": "Audio Understanding Attachment Policy",
  "tools.media.audio.models": "Audio Understanding Models",
  "tools.media.audio.scope": "Audio Understanding Scope",
  "tools.media.video.enabled": "Enable Video Understanding",
@@ -122,6 +126,7 @@ const FIELD_LABELS: Record<string, string> = {
  "tools.media.video.maxChars": "Video Understanding Max Chars",
  "tools.media.video.prompt": "Video Understanding Prompt",
  "tools.media.video.timeoutSeconds": "Video Understanding Timeout (sec)",
  "tools.media.video.attachments": "Video Understanding Attachment Policy",
  "tools.media.video.models": "Video Understanding Models",
  "tools.media.video.scope": "Video Understanding Scope",
  "tools.profile": "Tool Profile",

View File

@@ -18,6 +18,15 @@ export type MediaUnderstandingScopeConfig = {
export type MediaUnderstandingCapability = "image" | "audio" | "video";

export type MediaUnderstandingAttachmentsConfig = {
  /** Select the first matching attachment or process multiple. */
  mode?: "first" | "all";
  /** Max number of attachments to process (default: 1). */
  maxAttachments?: number;
  /** Attachment ordering preference. */
  prefer?: "first" | "last" | "path" | "url";
};

export type MediaUnderstandingModelConfig = {
  /** provider API id (e.g. openai, google). */
  provider?: string;
@@ -62,11 +71,17 @@ export type MediaUnderstandingConfig = {
  timeoutSeconds?: number;
  /** Default language hint (audio). */
  language?: string;
  /** Attachment selection policy. */
  attachments?: MediaUnderstandingAttachmentsConfig;
  /** Ordered model list (fallbacks in order). */
  models?: MediaUnderstandingModelConfig[];
};
export type MediaToolsConfig = {
  /** Shared model list applied across image/audio/video. */
  models?: MediaUnderstandingModelConfig[];
  /** Max concurrent media understanding runs. */
  concurrency?: number;
  image?: MediaUnderstandingConfig;
  audio?: MediaUnderstandingConfig;
  video?: MediaUnderstandingConfig;
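A config literal that type-checks against the shapes added in this diff can be sketched as follows; the types are copied (trimmed to the relevant fields) from the diff so the snippet is self-contained, and the model id is illustrative:

```typescript
// Trimmed copies of the types from this diff.
type MediaUnderstandingAttachmentsConfig = {
  mode?: "first" | "all";
  maxAttachments?: number;
  prefer?: "first" | "last" | "path" | "url";
};
type MediaUnderstandingModelConfig = { provider?: string; model?: string };
type MediaUnderstandingConfig = {
  attachments?: MediaUnderstandingAttachmentsConfig;
  models?: MediaUnderstandingModelConfig[];
};
type MediaToolsConfig = {
  models?: MediaUnderstandingModelConfig[];
  concurrency?: number;
  image?: MediaUnderstandingConfig;
  audio?: MediaUnderstandingConfig;
  video?: MediaUnderstandingConfig;
};

// Shared models at the top level; per-capability attachment policy below.
const media: MediaToolsConfig = {
  concurrency: 2,
  models: [{ provider: "google", model: "gemini-3-flash-preview" }],
  audio: { attachments: { mode: "all", maxAttachments: 2 } },
};
console.log(media.audio?.attachments?.mode); // "all"
```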

View File

@@ -271,6 +271,14 @@ export const MediaUnderstandingCapabilitiesSchema = z
  .array(z.union([z.literal("image"), z.literal("audio"), z.literal("video")]))
  .optional();

export const MediaUnderstandingAttachmentsSchema = z
  .object({
    mode: z.union([z.literal("first"), z.literal("all")]).optional(),
    maxAttachments: z.number().int().positive().optional(),
    prefer: z.union([z.literal("first"), z.literal("last"), z.literal("path"), z.literal("url")]).optional(),
  })
  .optional();

export const MediaUnderstandingModelSchema = z
  .object({
    provider: z.string().optional(),
@@ -298,12 +306,15 @@ export const ToolsMediaUnderstandingSchema = z
    prompt: z.string().optional(),
    timeoutSeconds: z.number().int().positive().optional(),
    language: z.string().optional(),
    attachments: MediaUnderstandingAttachmentsSchema,
    models: z.array(MediaUnderstandingModelSchema).optional(),
  })
  .optional();

export const ToolsMediaSchema = z
  .object({
    models: z.array(MediaUnderstandingModelSchema).optional(),
    concurrency: z.number().int().positive().optional(),
    image: ToolsMediaUnderstandingSchema.optional(),
    audio: ToolsMediaUnderstandingSchema.optional(),
    video: ToolsMediaUnderstandingSchema.optional(),

View File

@@ -255,4 +255,90 @@ describe("applyMediaUnderstanding", () => {
    expect(ctx.CommandBody).toBe("show Dom");
    expect(ctx.RawBody).toBe("show Dom");
  });

  it("uses shared media models list when capability config is missing", async () => {
    const { applyMediaUnderstanding } = await loadApply();
    const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-media-"));
    const imagePath = path.join(dir, "shared.jpg");
    await fs.writeFile(imagePath, "image-bytes");
    const ctx: MsgContext = {
      Body: "<media:image>",
      MediaPath: imagePath,
      MediaType: "image/jpeg",
    };
    const cfg: ClawdbotConfig = {
      tools: {
        media: {
          models: [
            {
              type: "cli",
              command: "gemini",
              args: ["--allowed-tools", "read_file", "{{MediaPath}}"],
              capabilities: ["image"],
            },
          ],
        },
      },
    };
    const execModule = await import("../process/exec.js");
    vi.mocked(execModule.runExec).mockResolvedValue({
      stdout: "shared description\n",
      stderr: "",
    });
    const result = await applyMediaUnderstanding({
      ctx,
      cfg,
    });
    expect(result.appliedImage).toBe(true);
    expect(ctx.Body).toBe("[Image]\nDescription:\nshared description");
  });

  it("handles multiple audio attachments when attachment mode is all", async () => {
    const { applyMediaUnderstanding } = await loadApply();
    const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-media-"));
    const audioPathA = path.join(dir, "note-a.ogg");
    const audioPathB = path.join(dir, "note-b.ogg");
    await fs.writeFile(audioPathA, "hello");
    await fs.writeFile(audioPathB, "world");
    const ctx: MsgContext = {
      Body: "<media:audio>",
      MediaPaths: [audioPathA, audioPathB],
      MediaTypes: ["audio/ogg", "audio/ogg"],
    };
    const cfg: ClawdbotConfig = {
      tools: {
        media: {
          audio: {
            enabled: true,
            attachments: { mode: "all", maxAttachments: 2 },
            models: [{ provider: "groq" }],
          },
        },
      },
    };
    const result = await applyMediaUnderstanding({
      ctx,
      cfg,
      providers: {
        groq: {
          id: "groq",
          transcribeAudio: async (req) => ({ text: req.fileName }),
        },
      },
    });
    expect(result.appliedAudio).toBe(true);
    expect(ctx.Transcript).toBe("Audio 1:\nnote-a.ogg\n\nAudio 2:\nnote-b.ogg");
    expect(ctx.Body).toBe(
      ["[Audio 1/2]\nTranscript:\nnote-a.ogg", "[Audio 2/2]\nTranscript:\nnote-b.ogg"].join(
        "\n\n",
      ),
    );
  });
});

File diff suppressed because it is too large

View File

@@ -0,0 +1,386 @@
import crypto from "node:crypto";
import fs from "node:fs/promises";
import os from "node:os";
import path from "node:path";
import { fileURLToPath } from "node:url";
import type { MsgContext } from "../auto-reply/templating.js";
import type { MediaUnderstandingAttachmentsConfig } from "../config/types.tools.js";
import { fetchRemoteMedia, MediaFetchError } from "../media/fetch.js";
import { detectMime, getFileExtension, isAudioFileName } from "../media/mime.js";
import { logVerbose, shouldLogVerbose } from "../globals.js";
import { fetchWithTimeout } from "./providers/shared.js";
import type { MediaAttachment, MediaUnderstandingCapability } from "./types.js";
import { MediaUnderstandingSkipError } from "./errors.js";
type MediaBufferResult = {
  buffer: Buffer;
  mime?: string;
  fileName: string;
  size: number;
};

type MediaPathResult = {
  path: string;
  cleanup?: () => Promise<void> | void;
};

type AttachmentCacheEntry = {
  attachment: MediaAttachment;
  resolvedPath?: string;
  statSize?: number;
  buffer?: Buffer;
  bufferMime?: string;
  bufferFileName?: string;
  tempPath?: string;
  tempCleanup?: () => Promise<void>;
};

const DEFAULT_MAX_ATTACHMENTS = 1;

function normalizeAttachmentPath(raw?: string | null): string | undefined {
  const value = raw?.trim();
  if (!value) return undefined;
  if (value.startsWith("file://")) {
    try {
      return fileURLToPath(value);
    } catch {
      return undefined;
    }
  }
  return value;
}
export function normalizeAttachments(ctx: MsgContext): MediaAttachment[] {
  const pathsFromArray = Array.isArray(ctx.MediaPaths) ? ctx.MediaPaths : undefined;
  const urlsFromArray = Array.isArray(ctx.MediaUrls) ? ctx.MediaUrls : undefined;
  const typesFromArray = Array.isArray(ctx.MediaTypes) ? ctx.MediaTypes : undefined;
  const resolveMime = (count: number, index: number) => {
    const typeHint = typesFromArray?.[index];
    const trimmed = typeof typeHint === "string" ? typeHint.trim() : "";
    if (trimmed) return trimmed;
    return count === 1 ? ctx.MediaType : undefined;
  };
  if (pathsFromArray && pathsFromArray.length > 0) {
    const count = pathsFromArray.length;
    const urls = urlsFromArray && urlsFromArray.length > 0 ? urlsFromArray : undefined;
    return pathsFromArray
      .map((value, index) => ({
        path: value?.trim() || undefined,
        url: urls?.[index] ?? ctx.MediaUrl,
        mime: resolveMime(count, index),
        index,
      }))
      .filter((entry) => Boolean(entry.path?.trim() || entry.url?.trim()));
  }
  if (urlsFromArray && urlsFromArray.length > 0) {
    const count = urlsFromArray.length;
    return urlsFromArray
      .map((value, index) => ({
        path: undefined,
        url: value?.trim() || undefined,
        mime: resolveMime(count, index),
        index,
      }))
      .filter((entry) => Boolean(entry.url?.trim()));
  }
  const pathValue = ctx.MediaPath?.trim();
  const url = ctx.MediaUrl?.trim();
  if (!pathValue && !url) return [];
  return [
    {
      path: pathValue || undefined,
      url: url || undefined,
      mime: ctx.MediaType,
      index: 0,
    },
  ];
}
export function isVideoAttachment(attachment: MediaAttachment): boolean {
  if (attachment.mime?.startsWith("video/")) return true;
  const ext = getFileExtension(attachment.path ?? attachment.url);
  if (!ext) return false;
  return [".mp4", ".mov", ".mkv", ".webm", ".avi", ".m4v"].includes(ext);
}

export function isAudioAttachment(attachment: MediaAttachment): boolean {
  if (attachment.mime?.startsWith("audio/")) return true;
  return isAudioFileName(attachment.path ?? attachment.url);
}

export function isImageAttachment(attachment: MediaAttachment): boolean {
  if (attachment.mime?.startsWith("image/")) return true;
  const ext = getFileExtension(attachment.path ?? attachment.url);
  if (!ext) return false;
  return [".png", ".jpg", ".jpeg", ".webp", ".gif", ".bmp", ".tiff", ".tif"].includes(ext);
}

function isAbortError(err: unknown): boolean {
  if (!err) return false;
  if (err instanceof Error && err.name === "AbortError") return true;
  return false;
}

function resolveRequestUrl(input: RequestInfo | URL): string {
  if (typeof input === "string") return input;
  if (input instanceof URL) return input.toString();
  return input.url;
}
function orderAttachments(
  attachments: MediaAttachment[],
  prefer?: MediaUnderstandingAttachmentsConfig["prefer"],
): MediaAttachment[] {
  if (!prefer || prefer === "first") return attachments;
  if (prefer === "last") return [...attachments].reverse();
  if (prefer === "path") {
    const withPath = attachments.filter((item) => item.path);
    const withoutPath = attachments.filter((item) => !item.path);
    return [...withPath, ...withoutPath];
  }
  if (prefer === "url") {
    const withUrl = attachments.filter((item) => item.url);
    const withoutUrl = attachments.filter((item) => !item.url);
    return [...withUrl, ...withoutUrl];
  }
  return attachments;
}

export function selectAttachments(params: {
  capability: MediaUnderstandingCapability;
  attachments: MediaAttachment[];
  policy?: MediaUnderstandingAttachmentsConfig;
}): MediaAttachment[] {
  const { capability, attachments, policy } = params;
  const matches = attachments.filter((item) => {
    if (capability === "image") return isImageAttachment(item);
    if (capability === "audio") return isAudioAttachment(item);
    return isVideoAttachment(item);
  });
  if (matches.length === 0) return [];
  const ordered = orderAttachments(matches, policy?.prefer);
  const mode = policy?.mode ?? "first";
  const maxAttachments = policy?.maxAttachments ?? DEFAULT_MAX_ATTACHMENTS;
  if (mode === "all") {
    return ordered.slice(0, Math.max(1, maxAttachments));
  }
  return ordered.slice(0, 1);
}
export class MediaAttachmentCache {
private readonly entries = new Map<number, AttachmentCacheEntry>();
private readonly attachments: MediaAttachment[];
constructor(attachments: MediaAttachment[]) {
this.attachments = attachments;
for (const attachment of attachments) {
this.entries.set(attachment.index, { attachment });
}
}
async getBuffer(params: {
attachmentIndex: number;
maxBytes: number;
timeoutMs: number;
}): Promise<MediaBufferResult> {
const entry = await this.ensureEntry(params.attachmentIndex);
if (entry.buffer) {
if (entry.buffer.length > params.maxBytes) {
throw new MediaUnderstandingSkipError(
"maxBytes",
`Attachment ${params.attachmentIndex + 1} exceeds maxBytes ${params.maxBytes}`,
);
}
return {
buffer: entry.buffer,
mime: entry.bufferMime,
fileName: entry.bufferFileName ?? `media-${params.attachmentIndex + 1}`,
size: entry.buffer.length,
};
}
if (entry.resolvedPath) {
const size = await this.ensureLocalStat(entry);
if (entry.resolvedPath) {
if (size !== undefined && size > params.maxBytes) {
throw new MediaUnderstandingSkipError(
"maxBytes",
`Attachment ${params.attachmentIndex + 1} exceeds maxBytes ${params.maxBytes}`,
);
}
const buffer = await fs.readFile(entry.resolvedPath);
entry.buffer = buffer;
entry.bufferMime =
entry.bufferMime ??
entry.attachment.mime ??
(await detectMime({
buffer,
filePath: entry.resolvedPath,
}));
entry.bufferFileName =
path.basename(entry.resolvedPath) || `media-${params.attachmentIndex + 1}`;
return {
buffer,
mime: entry.bufferMime,
fileName: entry.bufferFileName,
size: buffer.length,
};
}
}
const url = entry.attachment.url?.trim();
if (!url) {
throw new MediaUnderstandingSkipError(
"empty",
`Attachment ${params.attachmentIndex + 1} has no path or URL.`,
);
}
try {
const fetchImpl = (input: RequestInfo | URL, init?: RequestInit) =>
fetchWithTimeout(resolveRequestUrl(input), init ?? {}, params.timeoutMs, fetch);
const fetched = await fetchRemoteMedia({ url, fetchImpl, maxBytes: params.maxBytes });
entry.buffer = fetched.buffer;
entry.bufferMime =
entry.attachment.mime ??
fetched.contentType ??
(await detectMime({
buffer: fetched.buffer,
filePath: fetched.fileName ?? url,
}));
entry.bufferFileName = fetched.fileName ?? `media-${params.attachmentIndex + 1}`;
return {
buffer: fetched.buffer,
mime: entry.bufferMime,
fileName: entry.bufferFileName,
size: fetched.buffer.length,
};
} catch (err) {
if (err instanceof MediaFetchError && err.code === "max_bytes") {
throw new MediaUnderstandingSkipError(
"maxBytes",
`Attachment ${params.attachmentIndex + 1} exceeds maxBytes ${params.maxBytes}`,
);
}
if (isAbortError(err)) {
throw new MediaUnderstandingSkipError(
"timeout",
`Attachment ${params.attachmentIndex + 1} timed out while fetching.`,
);
}
throw err;
}
}
async getPath(params: {
attachmentIndex: number;
maxBytes?: number;
timeoutMs: number;
}): Promise<MediaPathResult> {
const entry = await this.ensureEntry(params.attachmentIndex);
if (entry.resolvedPath) {
if (params.maxBytes) {
const size = await this.ensureLocalStat(entry);
if (entry.resolvedPath) {
if (size !== undefined && size > params.maxBytes) {
throw new MediaUnderstandingSkipError(
"maxBytes",
`Attachment ${params.attachmentIndex + 1} exceeds maxBytes ${params.maxBytes}`,
);
}
}
}
if (entry.resolvedPath) {
return { path: entry.resolvedPath };
}
}
if (entry.tempPath) {
if (params.maxBytes && entry.buffer && entry.buffer.length > params.maxBytes) {
throw new MediaUnderstandingSkipError(
"maxBytes",
`Attachment ${params.attachmentIndex + 1} exceeds maxBytes ${params.maxBytes}`,
);
}
return { path: entry.tempPath, cleanup: entry.tempCleanup };
}
const maxBytes = params.maxBytes ?? Number.POSITIVE_INFINITY;
const bufferResult = await this.getBuffer({
attachmentIndex: params.attachmentIndex,
maxBytes,
timeoutMs: params.timeoutMs,
});
const extension = path.extname(bufferResult.fileName || "") || "";
const tmpPath = path.join(
os.tmpdir(),
`clawdbot-media-${crypto.randomUUID()}${extension}`,
);
await fs.writeFile(tmpPath, bufferResult.buffer);
entry.tempPath = tmpPath;
entry.tempCleanup = async () => {
await fs.unlink(tmpPath).catch(() => {});
};
return { path: tmpPath, cleanup: entry.tempCleanup };
}
async cleanup(): Promise<void> {
const cleanups: Array<Promise<void> | void> = [];
for (const entry of this.entries.values()) {
if (entry.tempCleanup) {
cleanups.push(Promise.resolve(entry.tempCleanup()));
entry.tempCleanup = undefined;
}
}
await Promise.all(cleanups);
}
private async ensureEntry(attachmentIndex: number): Promise<AttachmentCacheEntry> {
const existing = this.entries.get(attachmentIndex);
if (existing) {
if (!existing.resolvedPath) {
existing.resolvedPath = this.resolveLocalPath(existing.attachment);
}
return existing;
}
const attachment =
this.attachments.find((item) => item.index === attachmentIndex) ?? { index: attachmentIndex };
const entry: AttachmentCacheEntry = {
attachment,
resolvedPath: this.resolveLocalPath(attachment),
};
this.entries.set(attachmentIndex, entry);
return entry;
}
private resolveLocalPath(attachment: MediaAttachment): string | undefined {
const rawPath = normalizeAttachmentPath(attachment.path);
if (!rawPath) return undefined;
return path.isAbsolute(rawPath) ? rawPath : path.resolve(rawPath);
}
private async ensureLocalStat(entry: AttachmentCacheEntry): Promise<number | undefined> {
if (!entry.resolvedPath) return undefined;
if (entry.statSize !== undefined) return entry.statSize;
try {
const stat = await fs.stat(entry.resolvedPath);
if (!stat.isFile()) {
entry.resolvedPath = undefined;
return undefined;
}
entry.statSize = stat.size;
return stat.size;
} catch (err) {
entry.resolvedPath = undefined;
if (shouldLogVerbose()) {
logVerbose(`Failed to read attachment ${entry.attachment.index + 1}: ${String(err)}`);
}
return undefined;
}
}
}
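The cache above writes remote media to a temp file once and registers an idempotent cleanup callback. A minimal self-contained sketch of that pattern (the `writeTempMedia` name is hypothetical, not part of the codebase):

```typescript
import crypto from "node:crypto";
import fs from "node:fs/promises";
import os from "node:os";
import path from "node:path";

// Sketch of the temp-file pattern used by the attachment cache:
// write the buffer once, hand back the path plus an idempotent cleanup.
async function writeTempMedia(
  buffer: Buffer,
  extension: string,
): Promise<{ path: string; cleanup: () => Promise<void> }> {
  const tmpPath = path.join(os.tmpdir(), `media-${crypto.randomUUID()}${extension}`);
  await fs.writeFile(tmpPath, buffer);
  return {
    path: tmpPath,
    // Swallow unlink errors so calling cleanup twice is safe,
    // matching the `.catch(() => {})` in tempCleanup above.
    cleanup: async () => {
      await fs.unlink(tmpPath).catch(() => {});
    },
  };
}
```

The swallowed unlink error is what lets `cleanup()` iterate every entry without one missing file aborting the rest.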

View File

@@ -0,0 +1,29 @@
import { logVerbose, shouldLogVerbose } from "../globals.js";
export async function runWithConcurrency<T>(
tasks: Array<() => Promise<T>>,
limit: number,
): Promise<T[]> {
if (tasks.length === 0) return [];
const resolvedLimit = Math.max(1, Math.min(limit, tasks.length));
const results: T[] = Array.from({ length: tasks.length });
let next = 0;
const workers = Array.from({ length: resolvedLimit }, async () => {
while (true) {
const index = next;
next += 1;
if (index >= tasks.length) return;
try {
results[index] = await tasks[index]();
} catch (err) {
if (shouldLogVerbose()) {
logVerbose(`Media understanding task failed: ${String(err)}`);
}
}
}
});
await Promise.allSettled(workers);
return results;
}
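A usage sketch of the concurrency contract above. The simplified runner below mirrors `runWithConcurrency` (order-preserving results, at most `limit` tasks in flight) but, unlike the helper, lets failures propagate instead of logging and swallowing them:

```typescript
// Simplified, order-preserving runner: at most `limit` tasks in flight.
async function runLimited<T>(tasks: Array<() => Promise<T>>, limit: number): Promise<T[]> {
  if (tasks.length === 0) return [];
  const results = new Array<T>(tasks.length);
  let next = 0;
  const workers = Array.from({ length: Math.max(1, Math.min(limit, tasks.length)) }, async () => {
    while (next < tasks.length) {
      // Synchronous take-and-increment, so no two workers claim one index.
      const index = next;
      next += 1;
      results[index] = await tasks[index]();
    }
  });
  await Promise.all(workers);
  return results;
}

// Three tasks, at most two in flight; results keep input order.
runLimited([async () => 1, async () => 2, async () => 3], 2).then((r) => {
  console.log(r.join(",")); // "1,2,3"
});
```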

View File

@@ -0,0 +1,35 @@
import type { MediaUnderstandingCapability } from "./types.js";
const MB = 1024 * 1024;
export const DEFAULT_MAX_CHARS = 500;
export const DEFAULT_MAX_CHARS_BY_CAPABILITY: Record<
MediaUnderstandingCapability,
number | undefined
> = {
image: DEFAULT_MAX_CHARS,
audio: undefined,
video: DEFAULT_MAX_CHARS,
};
export const DEFAULT_MAX_BYTES: Record<MediaUnderstandingCapability, number> = {
image: 10 * MB,
audio: 20 * MB,
video: 50 * MB,
};
export const DEFAULT_TIMEOUT_SECONDS: Record<MediaUnderstandingCapability, number> = {
image: 60,
audio: 60,
video: 120,
};
export const DEFAULT_PROMPT: Record<MediaUnderstandingCapability, string> = {
image: "Describe the image.",
audio: "Transcribe the audio.",
video: "Describe the video.",
};
export const DEFAULT_VIDEO_MAX_BASE64_BYTES = 70 * MB;
export const DEFAULT_AUDIO_MODELS: Record<string, string> = {
groq: "whisper-large-v3-turbo",
openai: "whisper-1",
};
export const CLI_OUTPUT_MAX_BUFFER = 5 * MB;
export const DEFAULT_MEDIA_CONCURRENCY = 2;

View File

@@ -0,0 +1,17 @@
export type MediaUnderstandingSkipReason = "maxBytes" | "timeout" | "unsupported" | "empty";
export class MediaUnderstandingSkipError extends Error {
readonly reason: MediaUnderstandingSkipReason;
constructor(reason: MediaUnderstandingSkipReason, message: string) {
super(message);
this.reason = reason;
this.name = "MediaUnderstandingSkipError";
}
}
export function isMediaUnderstandingSkipError(
err: unknown,
): err is MediaUnderstandingSkipError {
return err instanceof MediaUnderstandingSkipError;
}
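The typed `reason` lets callers branch on why media was skipped instead of string-matching messages. A minimal re-declaration (sketch; `SkipError` and `describeFailure` are illustrative names, not exports):

```typescript
// Sketch of the skip-error shape, showing a caller branching on `reason`.
type SkipReason = "maxBytes" | "timeout" | "unsupported" | "empty";

class SkipError extends Error {
  constructor(
    readonly reason: SkipReason,
    message: string,
  ) {
    super(message);
    this.name = "SkipError";
  }
}

function describeFailure(err: unknown): string {
  if (err instanceof SkipError) return `skipped (${err.reason})`;
  return `failed: ${String(err)}`;
}

console.log(describeFailure(new SkipError("maxBytes", "too large"))); // "skipped (maxBytes)"
```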

View File

@@ -12,7 +12,7 @@ export function extractMediaUserText(body?: string): string | undefined {
 }
 function formatSection(
-  title: "Audio" | "Video" | "Image",
+  title: string,
   kind: "Transcript" | "Description",
   text: string,
   userText?: string,
@@ -40,11 +40,21 @@ export function formatMediaUnderstandingBody(params: {
     sections.push(`User text:\n${userText}`);
   }
+  const counts = new Map<MediaUnderstandingOutput["kind"], number>();
+  for (const output of outputs) {
+    counts.set(output.kind, (counts.get(output.kind) ?? 0) + 1);
+  }
+  const seen = new Map<MediaUnderstandingOutput["kind"], number>();
   for (const output of outputs) {
+    const count = counts.get(output.kind) ?? 1;
+    const next = (seen.get(output.kind) ?? 0) + 1;
+    seen.set(output.kind, next);
+    const suffix = count > 1 ? ` ${next}/${count}` : "";
     if (output.kind === "audio.transcription") {
       sections.push(
         formatSection(
-          "Audio",
+          `Audio${suffix}`,
           "Transcript",
           output.text,
           outputs.length === 1 ? userText : undefined,
@@ -55,7 +65,7 @@ export function formatMediaUnderstandingBody(params: {
     if (output.kind === "image.description") {
       sections.push(
         formatSection(
-          "Image",
+          `Image${suffix}`,
           "Description",
           output.text,
           outputs.length === 1 ? userText : undefined,
@@ -65,7 +75,7 @@ export function formatMediaUnderstandingBody(params: {
   }
   sections.push(
     formatSection(
-      "Video",
+      `Video${suffix}`,
       "Description",
       output.text,
       outputs.length === 1 ? userText : undefined,
@@ -75,3 +85,10 @@ export function formatMediaUnderstandingBody(params: {
   return sections.join("\n\n").trim();
 }
+export function formatAudioTranscripts(outputs: MediaUnderstandingOutput[]): string {
+  if (outputs.length === 1) return outputs[0].text;
+  return outputs
+    .map((output, index) => `Audio ${index + 1}:\n${output.text}`)
+    .join("\n\n");
+}
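The section-numbering rule above (a " n/total" suffix only when the same kind repeats) can be captured in a small helper. Sketch; `sectionTitle` is an illustrative name, not part of the module:

```typescript
// Suffix rule: number sections only when the same kind appears more than once.
function sectionTitle(base: string, index: number, total: number): string {
  return total > 1 ? `${base} ${index + 1}/${total}` : base;
}

console.log(sectionTitle("Image", 0, 1)); // "Image"
console.log(sectionTitle("Image", 1, 3)); // "Image 2/3"
```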

View File

@@ -0,0 +1,7 @@
import type { MediaUnderstandingProvider } from "../../types.js";
import { describeImageWithModel } from "../image.js";
export const anthropicProvider: MediaUnderstandingProvider = {
id: "anthropic",
describeImage: describeImageWithModel,
};

View File

@@ -1,7 +1,9 @@
 import type { MediaUnderstandingProvider } from "../../types.js";
+import { describeImageWithModel } from "../image.js";
 import { describeGeminiVideo } from "./video.js";
 export const googleProvider: MediaUnderstandingProvider = {
   id: "google",
+  describeImage: describeImageWithModel,
   describeVideo: describeGeminiVideo,
 };

View File

@@ -0,0 +1,66 @@
import type { Api, AssistantMessage, Context, Model } from "@mariozechner/pi-ai";
import { complete } from "@mariozechner/pi-ai";
import { discoverAuthStorage, discoverModels } from "@mariozechner/pi-coding-agent";
import { getApiKeyForModel } from "../../agents/model-auth.js";
import { ensureClawdbotModelsJson } from "../../agents/models-config.js";
import { minimaxUnderstandImage } from "../../agents/minimax-vlm.js";
import { coerceImageAssistantText } from "../../agents/tools/image-tool.helpers.js";
import type { ImageDescriptionRequest, ImageDescriptionResult } from "../types.js";
export async function describeImageWithModel(
params: ImageDescriptionRequest,
): Promise<ImageDescriptionResult> {
await ensureClawdbotModelsJson(params.cfg, params.agentDir);
const authStorage = discoverAuthStorage(params.agentDir);
const modelRegistry = discoverModels(authStorage, params.agentDir);
const model = modelRegistry.find(params.provider, params.model) as Model<Api> | null;
if (!model) {
throw new Error(`Unknown model: ${params.provider}/${params.model}`);
}
if (!model.input?.includes("image")) {
throw new Error(`Model does not support images: ${params.provider}/${params.model}`);
}
const apiKeyInfo = await getApiKeyForModel({
model,
cfg: params.cfg,
agentDir: params.agentDir,
profileId: params.profile,
preferredProfile: params.preferredProfile,
});
authStorage.setRuntimeApiKey(model.provider, apiKeyInfo.apiKey);
const base64 = params.buffer.toString("base64");
if (model.provider === "minimax") {
const text = await minimaxUnderstandImage({
apiKey: apiKeyInfo.apiKey,
prompt: params.prompt ?? "Describe the image.",
imageDataUrl: `data:${params.mime ?? "image/jpeg"};base64,${base64}`,
modelBaseUrl: model.baseUrl,
});
return { text, model: model.id };
}
const context: Context = {
messages: [
{
role: "user",
content: [
{ type: "text", text: params.prompt ?? "Describe the image." },
{ type: "image", data: base64, mimeType: params.mime ?? "image/jpeg" },
],
timestamp: Date.now(),
},
],
};
const message = (await complete(model, context, {
apiKey: apiKeyInfo.apiKey,
maxTokens: params.maxTokens ?? 512,
})) as AssistantMessage;
const text = coerceImageAssistantText({
message,
provider: model.provider,
model: model.id,
});
return { text, model: model.id };
}

View File

@@ -1,10 +1,18 @@
 import { normalizeProviderId } from "../../agents/model-selection.js";
 import type { MediaUnderstandingProvider } from "../types.js";
+import { anthropicProvider } from "./anthropic/index.js";
 import { googleProvider } from "./google/index.js";
 import { groqProvider } from "./groq/index.js";
+import { minimaxProvider } from "./minimax/index.js";
 import { openaiProvider } from "./openai/index.js";
-const PROVIDERS: MediaUnderstandingProvider[] = [groqProvider, openaiProvider, googleProvider];
+const PROVIDERS: MediaUnderstandingProvider[] = [
+  groqProvider,
+  openaiProvider,
+  googleProvider,
+  anthropicProvider,
+  minimaxProvider,
+];
 export function normalizeMediaProviderId(id: string): string {
   const normalized = normalizeProviderId(id);

View File

@@ -0,0 +1,7 @@
import type { MediaUnderstandingProvider } from "../../types.js";
import { describeImageWithModel } from "../image.js";
export const minimaxProvider: MediaUnderstandingProvider = {
id: "minimax",
describeImage: describeImageWithModel,
};

View File

@@ -1,7 +1,9 @@
 import type { MediaUnderstandingProvider } from "../../types.js";
+import { describeImageWithModel } from "../image.js";
 import { transcribeOpenAiCompatibleAudio } from "./audio.js";
 export const openaiProvider: MediaUnderstandingProvider = {
   id: "openai",
+  describeImage: describeImageWithModel,
   transcribeAudio: transcribeOpenAiCompatibleAudio,
 };

View File

@@ -0,0 +1,154 @@
import type { ClawdbotConfig } from "../config/config.js";
import type { MsgContext } from "../auto-reply/templating.js";
import type {
MediaUnderstandingConfig,
MediaUnderstandingModelConfig,
MediaUnderstandingScopeConfig,
} from "../config/types.tools.js";
import { logVerbose, shouldLogVerbose } from "../globals.js";
import {
DEFAULT_MAX_BYTES,
DEFAULT_MAX_CHARS_BY_CAPABILITY,
DEFAULT_MEDIA_CONCURRENCY,
DEFAULT_PROMPT,
} from "./defaults.js";
import { normalizeMediaProviderId } from "./providers/index.js";
import { normalizeMediaUnderstandingChatType, resolveMediaUnderstandingScope } from "./scope.js";
import type { MediaUnderstandingCapability } from "./types.js";
export function resolveTimeoutMs(seconds: number | undefined, fallbackSeconds: number): number {
const value = typeof seconds === "number" && Number.isFinite(seconds) ? seconds : fallbackSeconds;
return Math.max(1000, Math.floor(value * 1000));
}
export function resolvePrompt(
capability: MediaUnderstandingCapability,
prompt?: string,
maxChars?: number,
): string {
const base = prompt?.trim() || DEFAULT_PROMPT[capability];
if (!maxChars || capability === "audio") return base;
return `${base} Respond in at most ${maxChars} characters.`;
}
export function resolveMaxChars(params: {
capability: MediaUnderstandingCapability;
entry: MediaUnderstandingModelConfig;
cfg: ClawdbotConfig;
config?: MediaUnderstandingConfig;
}): number | undefined {
const { capability, entry, cfg } = params;
const configured =
entry.maxChars ?? params.config?.maxChars ?? cfg.tools?.media?.[capability]?.maxChars;
if (typeof configured === "number") return configured;
return DEFAULT_MAX_CHARS_BY_CAPABILITY[capability];
}
export function resolveMaxBytes(params: {
capability: MediaUnderstandingCapability;
entry: MediaUnderstandingModelConfig;
cfg: ClawdbotConfig;
config?: MediaUnderstandingConfig;
}): number {
const configured =
params.entry.maxBytes ??
params.config?.maxBytes ??
params.cfg.tools?.media?.[params.capability]?.maxBytes;
if (typeof configured === "number") return configured;
return DEFAULT_MAX_BYTES[params.capability];
}
export function resolveCapabilityConfig(
cfg: ClawdbotConfig,
capability: MediaUnderstandingCapability,
): MediaUnderstandingConfig | undefined {
return cfg.tools?.media?.[capability];
}
export function resolveScopeDecision(params: {
scope?: MediaUnderstandingScopeConfig;
ctx: MsgContext;
}): "allow" | "deny" {
return resolveMediaUnderstandingScope({
scope: params.scope,
sessionKey: params.ctx.SessionKey,
channel: params.ctx.Surface ?? params.ctx.Provider,
chatType: normalizeMediaUnderstandingChatType(params.ctx.ChatType),
});
}
function inferCapabilities(
entry: MediaUnderstandingModelConfig,
): MediaUnderstandingCapability[] | undefined {
if ((entry.type ?? (entry.command ? "cli" : "provider")) === "cli") {
return ["image", "audio", "video"];
}
const provider = normalizeMediaProviderId(entry.provider ?? "");
if (!provider) return undefined;
if (provider === "openai" || provider === "anthropic" || provider === "minimax") {
return ["image"];
}
if (provider === "google") {
return ["image", "audio", "video"];
}
if (provider === "groq") {
return ["audio"];
}
return undefined;
}
export function resolveModelEntries(params: {
cfg: ClawdbotConfig;
capability: MediaUnderstandingCapability;
config?: MediaUnderstandingConfig;
}): MediaUnderstandingModelConfig[] {
const { cfg, capability, config } = params;
const sharedModels = cfg.tools?.media?.models ?? [];
const entries = [
...(config?.models ?? []).map((entry) => ({ entry, source: "capability" as const })),
...sharedModels.map((entry) => ({ entry, source: "shared" as const })),
];
if (entries.length === 0) return [];
return entries
.filter(({ entry, source }) => {
const caps =
entry.capabilities && entry.capabilities.length > 0
? entry.capabilities
: source === "shared"
? inferCapabilities(entry)
: undefined;
if (!caps || caps.length === 0) {
if (source === "shared") {
if (shouldLogVerbose()) {
logVerbose(
`Skipping shared media model without capabilities: ${entry.provider ?? entry.command ?? "unknown"}`,
);
}
return false;
}
return true;
}
return caps.includes(capability);
})
.map(({ entry }) => entry);
}
export function resolveConcurrency(cfg: ClawdbotConfig): number {
const configured = cfg.tools?.media?.concurrency;
if (typeof configured === "number" && Number.isFinite(configured) && configured > 0) {
return Math.floor(configured);
}
return DEFAULT_MEDIA_CONCURRENCY;
}
export function resolveCapabilityEnabled(params: {
cfg: ClawdbotConfig;
config?: MediaUnderstandingConfig;
}): boolean {
if (params.config?.enabled === false) return false;
const sharedModels = params.cfg.tools?.media?.models ?? [];
const hasModels = (params.config?.models?.length ?? 0) > 0 || sharedModels.length > 0;
if (!hasModels) return false;
return true;
}
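The clamp in `resolveTimeoutMs` is easy to verify in isolation; the snippet below copies the function verbatim from the resolvers above:

```typescript
// Verbatim copy of resolveTimeoutMs: non-finite input falls back,
// and the result is floored to ms but never below 1000.
function resolveTimeoutMs(seconds: number | undefined, fallbackSeconds: number): number {
  const value = typeof seconds === "number" && Number.isFinite(seconds) ? seconds : fallbackSeconds;
  return Math.max(1000, Math.floor(value * 1000));
}

console.log(resolveTimeoutMs(undefined, 60)); // 60000
console.log(resolveTimeoutMs(0.2, 60)); // 1000 (floor(200) clamped up)
```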

View File

@@ -3,6 +3,8 @@ export type MediaUnderstandingKind =
   | "video.description"
   | "image.description";
+export type MediaUnderstandingCapability = "image" | "audio" | "video";
 export type MediaAttachment = {
   path?: string;
   url?: string;
@@ -55,8 +57,29 @@ export type VideoDescriptionResult = {
   model?: string;
 };
+export type ImageDescriptionRequest = {
+  buffer: Buffer;
+  fileName: string;
+  mime?: string;
+  model: string;
+  provider: string;
+  prompt?: string;
+  maxTokens?: number;
+  timeoutMs: number;
+  profile?: string;
+  preferredProfile?: string;
+  agentDir: string;
+  cfg: import("../config/config.js").ClawdbotConfig;
+};
+export type ImageDescriptionResult = {
+  text: string;
+  model?: string;
+};
 export type MediaUnderstandingProvider = {
   id: string;
   transcribeAudio?: (req: AudioTranscriptionRequest) => Promise<AudioTranscriptionResult>;
   describeVideo?: (req: VideoDescriptionRequest) => Promise<VideoDescriptionResult>;
+  describeImage?: (req: ImageDescriptionRequest) => Promise<ImageDescriptionResult>;
 };

View File

@@ -0,0 +1,10 @@
import { DEFAULT_VIDEO_MAX_BASE64_BYTES } from "./defaults.js";
export function estimateBase64Size(bytes: number): number {
return Math.ceil(bytes / 3) * 4;
}
export function resolveVideoMaxBase64Bytes(maxBytes: number): number {
const expanded = Math.floor(maxBytes * (4 / 3));
return Math.min(expanded, DEFAULT_VIDEO_MAX_BASE64_BYTES);
}
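`estimateBase64Size` tracks the standard base64 expansion (every 3 raw bytes become a 4-character group, rounding up). A quick cross-check against `Buffer`:

```typescript
// Verbatim copy of estimateBase64Size: ceil(bytes / 3) 4-char groups.
function estimateBase64Size(bytes: number): number {
  return Math.ceil(bytes / 3) * 4;
}

// 10 bytes -> 4 groups -> 16 chars, padding included.
console.log(estimateBase64Size(10)); // 16
console.log(Buffer.alloc(10).toString("base64").length); // 16
```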

View File

@@ -8,6 +8,18 @@ type FetchMediaResult = {
   fileName?: string;
 };
+export type MediaFetchErrorCode = "max_bytes" | "http_error" | "fetch_failed";
+export class MediaFetchError extends Error {
+  readonly code: MediaFetchErrorCode;
+  constructor(code: MediaFetchErrorCode, message: string) {
+    super(message);
+    this.code = code;
+    this.name = "MediaFetchError";
+  }
+}
 export type FetchLike = (input: RequestInfo | URL, init?: RequestInit) => Promise<Response>;
 type FetchMediaOptions = {
@@ -62,7 +74,7 @@ export async function fetchRemoteMedia(options: FetchMediaOptions): Promise<Fetc
   try {
     res = await fetcher(url);
   } catch (err) {
-    throw new Error(`Failed to fetch media from ${url}: ${String(err)}`);
+    throw new MediaFetchError("fetch_failed", `Failed to fetch media from ${url}: ${String(err)}`);
   }
   if (!res.ok) {
@@ -75,14 +87,18 @@ export async function fetchRemoteMedia(options: FetchMediaOptions): Promise<Fetc
     const snippet = await readErrorBodySnippet(res);
     if (snippet) detail += `; body: ${snippet}`;
   }
-  throw new Error(`Failed to fetch media from ${url}${redirected}: ${detail}`);
+  throw new MediaFetchError(
+    "http_error",
+    `Failed to fetch media from ${url}${redirected}: ${detail}`,
+  );
 }
 const contentLength = res.headers.get("content-length");
 if (maxBytes && contentLength) {
   const length = Number(contentLength);
   if (Number.isFinite(length) && length > maxBytes) {
-    throw new Error(
+    throw new MediaFetchError(
+      "max_bytes",
       `Failed to fetch media from ${url}: content length ${length} exceeds maxBytes ${maxBytes}`,
     );
   }
@@ -128,7 +144,8 @@ async function readResponseWithLimit(res: Response, maxBytes: number): Promise<B
   if (!body || typeof body.getReader !== "function") {
     const fallback = Buffer.from(await res.arrayBuffer());
     if (fallback.length > maxBytes) {
-      throw new Error(
+      throw new MediaFetchError(
+        "max_bytes",
         `Failed to fetch media from ${res.url || "response"}: payload exceeds maxBytes ${maxBytes}`,
       );
     }
@@ -148,7 +165,8 @@ async function readResponseWithLimit(res: Response, maxBytes: number): Promise<B
   try {
     await reader.cancel();
   } catch {}
-  throw new Error(
+  throw new MediaFetchError(
+    "max_bytes",
     `Failed to fetch media from ${res.url || "response"}: payload exceeds maxBytes ${maxBytes}`,
   );
 }