feat: add inbound media understanding

Co-authored-by: Tristan Manchester <tmanchester96@gmail.com>
This commit is contained in:
Peter Steinberger
2026-01-17 03:52:37 +00:00
parent 4b749f1b8f
commit 1b973f7506
42 changed files with 2547 additions and 101 deletions

View File

@@ -254,7 +254,10 @@ export function createExecTool(
let yielded = false;
let yieldTimer: NodeJS.Timeout | null = null;
let timeoutTimer: NodeJS.Timeout | null = null;
let timeoutFinalizeTimer: NodeJS.Timeout | null = null;
let timedOut = false;
const timeoutFinalizeMs = 1000;
let rejectFn: ((err: Error) => void) | null = null;
const settle = (fn: () => void) => {
if (settled) return;
@@ -262,6 +265,18 @@ export function createExecTool(
fn();
};
const effectiveTimeout =
typeof params.timeout === "number" ? params.timeout : defaultTimeoutSec;
const finalizeTimeout = () => {
if (session.exited) return;
markExited(session, null, "SIGKILL", "failed");
if (settled || !rejectFn) return;
const aggregated = session.aggregated.trim();
const reason = `Command timed out after ${effectiveTimeout} seconds`;
const message = aggregated ? `${aggregated}\n\n${reason}` : reason;
settle(() => rejectFn?.(new Error(message)));
};
// Tool-call abort should not kill backgrounded sessions; timeouts still must.
const onAbortSignal = () => {
if (yielded || session.backgrounded) return;
@@ -272,15 +287,17 @@ export function createExecTool(
const onTimeout = () => {
timedOut = true;
killSession(session);
if (!timeoutFinalizeTimer) {
timeoutFinalizeTimer = setTimeout(() => {
finalizeTimeout();
}, timeoutFinalizeMs);
}
};
if (signal?.aborted) onAbortSignal();
else if (signal) {
signal.addEventListener("abort", onAbortSignal, { once: true });
}
const effectiveTimeout =
typeof params.timeout === "number" ? params.timeout : defaultTimeoutSec;
if (effectiveTimeout > 0) {
timeoutTimer = setTimeout(() => {
onTimeout();
@@ -321,6 +338,7 @@ export function createExecTool(
});
return new Promise<AgentToolResult<ExecToolDetails>>((resolve, reject) => {
rejectFn = reject;
const resolveRunning = () => {
settle(() =>
resolve({
@@ -369,6 +387,7 @@ export function createExecTool(
const handleExit = (code: number | null, exitSignal: NodeJS.Signals | number | null) => {
if (yieldTimer) clearTimeout(yieldTimer);
if (timeoutTimer) clearTimeout(timeoutTimer);
if (timeoutFinalizeTimer) clearTimeout(timeoutFinalizeTimer);
const durationMs = Date.now() - startedAt;
const wasSignal = exitSignal != null;
const isSuccess = code === 0 && !wasSignal && !signal?.aborted && !timedOut;
@@ -421,6 +440,7 @@ export function createExecTool(
child.once("error", (err) => {
if (yieldTimer) clearTimeout(yieldTimer);
if (timeoutTimer) clearTimeout(timeoutTimer);
if (timeoutFinalizeTimer) clearTimeout(timeoutFinalizeTimer);
markExited(session, null, null, "failed");
settle(() => reject(err));
});

View File

@@ -25,4 +25,20 @@ describe("buildInboundMediaNote", () => {
].join("\n"),
);
});
it("skips media notes for attachments with understanding output", () => {
const note = buildInboundMediaNote({
MediaPaths: ["/tmp/a.png", "/tmp/b.png"],
MediaUrls: ["https://example.com/a.png", "https://example.com/b.png"],
MediaUnderstanding: [
{
kind: "audio.transcription",
attachmentIndex: 0,
text: "hello",
provider: "groq",
},
],
});
expect(note).toBe("[media attached: /tmp/b.png | https://example.com/b.png]");
});
});

View File

@@ -18,6 +18,12 @@ function formatMediaAttachedLine(params: {
}
export function buildInboundMediaNote(ctx: MsgContext): string | undefined {
// Attachment indices follow MediaPaths/MediaUrls ordering as supplied by the channel.
const suppressed = new Set(
Array.isArray(ctx.MediaUnderstanding)
? ctx.MediaUnderstanding.map((output) => output.attachmentIndex)
: [],
);
const pathsFromArray = Array.isArray(ctx.MediaPaths) ? ctx.MediaPaths : undefined;
const paths =
pathsFromArray && pathsFromArray.length > 0
@@ -36,24 +42,33 @@ export function buildInboundMediaNote(ctx: MsgContext): string | undefined {
? ctx.MediaTypes
: undefined;
if (paths.length === 1) {
const entries = paths
.map((entry, index) => ({
path: entry ?? "",
type: types?.[index] ?? ctx.MediaType,
url: urls?.[index] ?? ctx.MediaUrl,
index,
}))
.filter((entry) => !suppressed.has(entry.index));
if (entries.length === 0) return undefined;
if (entries.length === 1) {
return formatMediaAttachedLine({
path: paths[0] ?? "",
type: types?.[0] ?? ctx.MediaType,
url: urls?.[0] ?? ctx.MediaUrl,
path: entries[0]?.path ?? "",
type: entries[0]?.type,
url: entries[0]?.url,
});
}
const count = paths.length;
const count = entries.length;
const lines: string[] = [`[media attached: ${count} files]`];
for (const [idx, mediaPath] of paths.entries()) {
for (const [idx, entry] of entries.entries()) {
lines.push(
formatMediaAttachedLine({
path: mediaPath,
path: entry.path,
index: idx + 1,
total: count,
type: types?.[idx],
url: urls?.[idx],
type: entry.type,
url: entry.url,
}),
);
}

View File

@@ -122,6 +122,7 @@ export async function resolveReplyDirectives(params: {
const commandSource =
sessionCtx.CommandBody ??
sessionCtx.RawBody ??
sessionCtx.Transcript ??
sessionCtx.BodyStripped ??
sessionCtx.Body ??
"";

View File

@@ -87,7 +87,6 @@ type RunPreparedReplyParams = {
cap?: number;
dropPolicy?: InlineDirectives["dropPolicy"];
};
transcribedText?: string;
typing: TypingController;
opts?: GetReplyOptions;
defaultModel: string;
@@ -210,7 +209,6 @@ export async function runPreparedReply(
model,
perMessageQueueMode,
perMessageQueueOptions,
transcribedText,
typing,
opts,
defaultModel,
@@ -325,11 +323,7 @@ export async function runPreparedReply(
sessionEntry = skillResult.sessionEntry ?? sessionEntry;
currentSystemSent = skillResult.systemSent;
const skillsSnapshot = skillResult.skillsSnapshot;
const prefixedBody = transcribedText
? [threadStarterNote, prefixedBodyBase, `Transcript:\n${transcribedText}`]
.filter(Boolean)
.join("\n\n")
: [threadStarterNote, prefixedBodyBase].filter(Boolean).join("\n\n");
const prefixedBody = [threadStarterNote, prefixedBodyBase].filter(Boolean).join("\n\n");
const mediaNote = buildInboundMediaNote(ctx);
const mediaReplyHint = mediaNote
? "To send an image back, add a line like: MEDIA:https://example.com/image.jpg (no spaces). Keep caption in the text body."
@@ -370,11 +364,7 @@ export async function runPreparedReply(
}
const sessionIdFinal = sessionId ?? crypto.randomUUID();
const sessionFile = resolveSessionFilePath(sessionIdFinal, sessionEntry);
const queueBodyBase = transcribedText
? [threadStarterNote, baseBodyFinal, `Transcript:\n${transcribedText}`]
.filter(Boolean)
.join("\n\n")
: [threadStarterNote, baseBodyFinal].filter(Boolean).join("\n\n");
const queueBodyBase = [threadStarterNote, baseBodyFinal].filter(Boolean).join("\n\n");
const queuedBody = mediaNote
? [mediaNote, mediaReplyHint, queueBodyBase].filter(Boolean).join("\n").trim()
: queueBodyBase;

View File

@@ -7,12 +7,11 @@ import { resolveModelRefFromString } from "../../agents/model-selection.js";
import { resolveAgentTimeoutMs } from "../../agents/timeout.js";
import { DEFAULT_AGENT_WORKSPACE_DIR, ensureAgentWorkspace } from "../../agents/workspace.js";
import { type ClawdbotConfig, loadConfig } from "../../config/config.js";
import { logVerbose } from "../../globals.js";
import { defaultRuntime } from "../../runtime.js";
import { resolveCommandAuthorization } from "../command-auth.js";
import type { MsgContext } from "../templating.js";
import { SILENT_REPLY_TOKEN } from "../tokens.js";
import { hasAudioTranscriptionConfig, isAudio, transcribeInboundAudio } from "../transcription.js";
import { applyMediaUnderstanding } from "../../media-understanding/apply.js";
import type { GetReplyOptions, ReplyPayload } from "../types.js";
import { resolveDefaultModel } from "./directive-handling.js";
import { resolveReplyDirectives } from "./get-reply-directives.js";
@@ -75,16 +74,11 @@ export async function getReplyFromConfig(
});
opts?.onTypingController?.(typing);
let transcribedText: string | undefined;
if (hasAudioTranscriptionConfig(cfg) && isAudio(ctx.MediaType)) {
const transcribed = await transcribeInboundAudio(cfg, ctx, defaultRuntime);
if (transcribed?.text) {
transcribedText = transcribed.text;
ctx.Body = transcribed.text;
ctx.Transcript = transcribed.text;
logVerbose("Replaced Body with audio transcript for reply flow");
}
}
await applyMediaUnderstanding({
ctx,
cfg,
agentDir,
});
const commandAuthorized = ctx.CommandAuthorized ?? true;
resolveCommandAuthorization({
@@ -253,7 +247,6 @@ export async function getReplyFromConfig(
model,
perMessageQueueMode,
perMessageQueueOptions,
transcribedText,
typing,
opts,
defaultModel,

View File

@@ -1,6 +1,7 @@
import type { ChannelId } from "../channels/plugins/types.js";
import type { InternalMessageChannel } from "../utils/message-channel.js";
import type { CommandArgs } from "./commands-registry.types.js";
import type { MediaUnderstandingOutput } from "../media-understanding/types.js";
/** Valid message channels for routing. */
export type OriginatingChannelType = ChannelId | InternalMessageChannel;
@@ -41,6 +42,9 @@ export type MsgContext = {
/** Remote host for SCP when media lives on a different machine (e.g., clawdbot@192.168.64.3). */
MediaRemoteHost?: string;
Transcript?: string;
MediaUnderstanding?: MediaUnderstandingOutput[];
Prompt?: string;
MaxChars?: number;
ChatType?: string;
GroupSubject?: string;
GroupRoom?: string;

View File

@@ -65,7 +65,7 @@ describe("legacy config detection", () => {
expect(res.config?.messages?.groupChat?.mentionPatterns).toEqual(["@clawd"]);
expect(res.config?.routing?.groupChat?.mentionPatterns).toBeUndefined();
});
it("migrates routing agentToAgent/queue/transcribeAudio to tools/messages/audio", async () => {
it("migrates routing agentToAgent/queue/transcribeAudio to tools/messages/media", async () => {
vi.resetModules();
const { migrateLegacyConfig } = await import("./config.js");
const res = migrateLegacyConfig({
@@ -80,7 +80,7 @@ describe("legacy config detection", () => {
});
expect(res.changes).toContain("Moved routing.agentToAgent → tools.agentToAgent.");
expect(res.changes).toContain("Moved routing.queue → messages.queue.");
expect(res.changes).toContain("Moved routing.transcribeAudio → tools.audio.transcription.");
expect(res.changes).toContain("Moved routing.transcribeAudio → tools.media.audio.models.");
expect(res.config?.tools?.agentToAgent).toEqual({
enabled: true,
allow: ["main"],
@@ -89,9 +89,16 @@ describe("legacy config detection", () => {
mode: "queue",
cap: 3,
});
expect(res.config?.tools?.audio?.transcription).toEqual({
args: ["--model", "base"],
timeoutSeconds: 2,
expect(res.config?.tools?.media?.audio).toEqual({
enabled: true,
models: [
{
command: "whisper",
type: "cli",
args: ["--model", "base"],
timeoutSeconds: 2,
},
],
});
expect(res.config?.routing).toBeUndefined();
});

View File

@@ -330,14 +330,17 @@ export const LEGACY_CONFIG_MIGRATIONS_PART_2: LegacyConfigMigration[] = [
const mapped = mapLegacyAudioTranscription(routing.transcribeAudio);
if (mapped) {
const tools = ensureRecord(raw, "tools");
const toolsAudio = ensureRecord(tools, "audio");
if (toolsAudio.transcription === undefined) {
toolsAudio.transcription = mapped;
changes.push("Moved routing.transcribeAudio → tools.audio.transcription.");
const media = ensureRecord(tools, "media");
const mediaAudio = ensureRecord(media, "audio");
const models = Array.isArray(mediaAudio.models)
? (mediaAudio.models as unknown[])
: [];
if (models.length === 0) {
mediaAudio.enabled = true;
mediaAudio.models = [mapped];
changes.push("Moved routing.transcribeAudio → tools.media.audio.models.");
} else {
changes.push(
"Removed routing.transcribeAudio (tools.audio.transcription already set).",
);
changes.push("Removed routing.transcribeAudio (tools.media.audio.models already set).");
}
} else {
changes.push("Removed routing.transcribeAudio (unsupported transcription CLI).");
@@ -350,12 +353,17 @@ export const LEGACY_CONFIG_MIGRATIONS_PART_2: LegacyConfigMigration[] = [
const mapped = mapLegacyAudioTranscription(audio.transcription);
if (mapped) {
const tools = ensureRecord(raw, "tools");
const toolsAudio = ensureRecord(tools, "audio");
if (toolsAudio.transcription === undefined) {
toolsAudio.transcription = mapped;
changes.push("Moved audio.transcription → tools.audio.transcription.");
const media = ensureRecord(tools, "media");
const mediaAudio = ensureRecord(media, "audio");
const models = Array.isArray(mediaAudio.models)
? (mediaAudio.models as unknown[])
: [];
if (models.length === 0) {
mediaAudio.enabled = true;
mediaAudio.models = [mapped];
changes.push("Moved audio.transcription → tools.media.audio.models.");
} else {
changes.push("Removed audio.transcription (tools.audio.transcription already set).");
changes.push("Removed audio.transcription (tools.media.audio.models already set).");
}
delete audio.transcription;
if (Object.keys(audio).length === 0) delete raw.audio;

View File

@@ -69,7 +69,7 @@ export const LEGACY_CONFIG_RULES: LegacyConfigRule[] = [
{
path: ["routing", "transcribeAudio"],
message:
"routing.transcribeAudio was moved; use tools.audio.transcription instead (auto-migrated on load).",
"routing.transcribeAudio was moved; use tools.media.audio.models instead (auto-migrated on load).",
},
{
path: ["telegram", "requireMention"],

View File

@@ -56,7 +56,7 @@ export const mapLegacyAudioTranscription = (value: unknown): Record<string, unkn
const timeoutSeconds =
typeof transcriber?.timeoutSeconds === "number" ? transcriber?.timeoutSeconds : undefined;
const result: Record<string, unknown> = {};
const result: Record<string, unknown> = { command: rawExecutable, type: "cli" };
if (args.length > 0) result.args = args;
if (timeoutSeconds !== undefined) result.timeoutSeconds = timeoutSeconds;
return result;

View File

@@ -102,8 +102,28 @@ const FIELD_LABELS: Record<string, string> = {
"gateway.remote.password": "Remote Gateway Password",
"gateway.auth.token": "Gateway Token",
"gateway.auth.password": "Gateway Password",
"tools.audio.transcription.args": "Audio Transcription Args",
"tools.audio.transcription.timeoutSeconds": "Audio Transcription Timeout (sec)",
"tools.media.image.enabled": "Enable Image Understanding",
"tools.media.image.maxBytes": "Image Understanding Max Bytes",
"tools.media.image.maxChars": "Image Understanding Max Chars",
"tools.media.image.prompt": "Image Understanding Prompt",
"tools.media.image.timeoutSeconds": "Image Understanding Timeout (sec)",
"tools.media.image.models": "Image Understanding Models",
"tools.media.image.scope": "Image Understanding Scope",
"tools.media.audio.enabled": "Enable Audio Understanding",
"tools.media.audio.maxBytes": "Audio Understanding Max Bytes",
"tools.media.audio.maxChars": "Audio Understanding Max Chars",
"tools.media.audio.prompt": "Audio Understanding Prompt",
"tools.media.audio.timeoutSeconds": "Audio Understanding Timeout (sec)",
"tools.media.audio.language": "Audio Understanding Language",
"tools.media.audio.models": "Audio Understanding Models",
"tools.media.audio.scope": "Audio Understanding Scope",
"tools.media.video.enabled": "Enable Video Understanding",
"tools.media.video.maxBytes": "Video Understanding Max Bytes",
"tools.media.video.maxChars": "Video Understanding Max Chars",
"tools.media.video.prompt": "Video Understanding Prompt",
"tools.media.video.timeoutSeconds": "Video Understanding Timeout (sec)",
"tools.media.video.models": "Video Understanding Models",
"tools.media.video.scope": "Video Understanding Scope",
"tools.profile": "Tool Profile",
"agents.list[].tools.profile": "Agent Tool Profile",
"tools.byProvider": "Tool Policy by Provider",

View File

@@ -47,7 +47,7 @@ export type BroadcastConfig = {
};
export type AudioConfig = {
/** @deprecated Use tools.audio.transcription instead. */
/** @deprecated Use tools.media.audio.models instead. */
transcription?: {
// Optional CLI to turn inbound audio into text; templated args, must output transcript to stdout.
command: string[];

View File

@@ -1,4 +1,76 @@
import type { AgentElevatedAllowFromConfig } from "./types.base.js";
import type { AgentElevatedAllowFromConfig, SessionSendPolicyAction } from "./types.base.js";
/** Narrowing criteria for a scope rule; an absent field matches anything. */
export type MediaUnderstandingScopeMatch = {
  /** Channel id the rule applies to (e.g. a messaging channel name). */
  channel?: string;
  /** Chat shape the rule applies to. */
  chatType?: "direct" | "group" | "room";
  /** Matches keys that start with this prefix — presumably session keys; confirm against scope resolution. */
  keyPrefix?: string;
};

/** One allow/deny rule for gating media understanding. */
export type MediaUnderstandingScopeRule = {
  /** Whether matching messages are allowed or denied understanding. */
  action: SessionSendPolicyAction;
  /** Criteria for the rule; omit to match every message. */
  match?: MediaUnderstandingScopeMatch;
};

/** Scope gating config: `default` presumably applies when no rule matches — confirm in scope.ts. */
export type MediaUnderstandingScopeConfig = {
  default?: SessionSendPolicyAction;
  rules?: MediaUnderstandingScopeRule[];
};

/** Media kinds a model entry can handle. */
export type MediaUnderstandingCapability = "image" | "audio" | "video";

/** A single understanding model entry; either a provider/model pair or a CLI command. */
export type MediaUnderstandingModelConfig = {
  /** provider API id (e.g. openai, google). */
  provider?: string;
  /** Model id for provider-based understanding. */
  model?: string;
  /** Optional capability tags for shared model lists. */
  capabilities?: MediaUnderstandingCapability[];
  /** Use a CLI command instead of provider API. */
  type?: "provider" | "cli";
  /** CLI binary (required when type=cli). */
  command?: string;
  /** CLI args (template-enabled). */
  args?: string[];
  /** Optional prompt override for this model entry. */
  prompt?: string;
  /** Optional max output characters for this model entry. */
  maxChars?: number;
  /** Optional max bytes for this model entry. */
  maxBytes?: number;
  /** Optional timeout override (seconds) for this model entry. */
  timeoutSeconds?: number;
  /** Optional language hint for audio transcription. */
  language?: string;
  /** Auth profile id to use for this provider. */
  profile?: string;
  /** Preferred profile id if multiple are available. */
  preferredProfile?: string;
};

/** Per-capability understanding config; entry-level fields in `models` override these defaults. */
export type MediaUnderstandingConfig = {
  /** Enable media understanding when models are configured. */
  enabled?: boolean;
  /** Optional scope gating for understanding. */
  scope?: MediaUnderstandingScopeConfig;
  /** Default max bytes to send. */
  maxBytes?: number;
  /** Default max output characters. */
  maxChars?: number;
  /** Default prompt. */
  prompt?: string;
  /** Default timeout (seconds). */
  timeoutSeconds?: number;
  /** Default language hint (audio). */
  language?: string;
  /** Ordered model list (fallbacks in order). */
  models?: MediaUnderstandingModelConfig[];
};

/** tools.media — one understanding config per media kind. */
export type MediaToolsConfig = {
  image?: MediaUnderstandingConfig;
  audio?: MediaUnderstandingConfig;
  video?: MediaUnderstandingConfig;
};
export type ToolProfileId = "minimal" | "coding" | "messaging" | "full";
@@ -127,13 +199,7 @@ export type ToolsConfig = {
};
};
};
audio?: {
transcription?: {
/** CLI args (template-enabled). */
args?: string[];
timeoutSeconds?: number;
};
};
media?: MediaToolsConfig;
/** Message tool configuration. */
message?: {
/**

View File

@@ -5,7 +5,7 @@ import {
GroupChatSchema,
HumanDelaySchema,
IdentitySchema,
ToolsAudioTranscriptionSchema,
ToolsMediaSchema,
} from "./zod-schema.core.js";
export const HeartbeatSchema = z
@@ -283,11 +283,7 @@ export const ToolsSchema = z
deny: z.array(z.string()).optional(),
byProvider: z.record(z.string(), ToolPolicyWithProfileSchema).optional(),
web: ToolsWebSchema,
audio: z
.object({
transcription: ToolsAudioTranscriptionSchema,
})
.optional(),
media: ToolsMediaSchema,
message: z
.object({
allowCrossContextSend: z.boolean().optional(),

View File

@@ -240,10 +240,68 @@ export const ExecutableTokenSchema = z
.string()
.refine(isSafeExecutableValue, "expected safe executable name or path");
export const ToolsAudioTranscriptionSchema = z
/** allow/deny scope gating for media understanding (mirrors MediaUnderstandingScopeConfig). */
export const MediaUnderstandingScopeSchema = z
  .object({
    default: z.union([z.literal("allow"), z.literal("deny")]).optional(),
    rules: z
      .array(
        z.object({
          action: z.union([z.literal("allow"), z.literal("deny")]),
          match: z
            .object({
              channel: z.string().optional(),
              chatType: z
                .union([z.literal("direct"), z.literal("group"), z.literal("room")])
                .optional(),
              keyPrefix: z.string().optional(),
            })
            .optional(),
        }),
      )
      .optional(),
  })
  .optional();

/** Capability tags for shared model lists. */
export const MediaUnderstandingCapabilitiesSchema = z
  .array(z.union([z.literal("image"), z.literal("audio"), z.literal("video")]))
  .optional();

// Non-optional base object so `models` arrays cannot contain `undefined`
// entries — z.array(<optional schema>) would accept [undefined], which
// mismatches MediaUnderstandingModelConfig[].
const MediaUnderstandingModelObjectSchema = z.object({
  provider: z.string().optional(),
  model: z.string().optional(),
  capabilities: MediaUnderstandingCapabilitiesSchema,
  type: z.union([z.literal("provider"), z.literal("cli")]).optional(),
  command: z.string().optional(),
  args: z.array(z.string()).optional(),
  prompt: z.string().optional(),
  maxChars: z.number().int().positive().optional(),
  maxBytes: z.number().int().positive().optional(),
  timeoutSeconds: z.number().int().positive().optional(),
  language: z.string().optional(),
  profile: z.string().optional(),
  preferredProfile: z.string().optional(),
});

/** One model entry; kept optional for backward compatibility at field positions. */
export const MediaUnderstandingModelSchema = MediaUnderstandingModelObjectSchema.optional();

/** Per-capability understanding config (image/audio/video share this shape). */
export const ToolsMediaUnderstandingSchema = z
  .object({
    enabled: z.boolean().optional(),
    scope: MediaUnderstandingScopeSchema,
    maxBytes: z.number().int().positive().optional(),
    maxChars: z.number().int().positive().optional(),
    prompt: z.string().optional(),
    timeoutSeconds: z.number().int().positive().optional(),
    language: z.string().optional(),
    models: z.array(MediaUnderstandingModelObjectSchema).optional(),
  })
  .optional();

/** tools.media — image/audio/video understanding settings. */
export const ToolsMediaSchema = z
  .object({
    // ToolsMediaUnderstandingSchema is already optional; no extra .optional() needed.
    image: ToolsMediaUnderstandingSchema,
    audio: ToolsMediaUnderstandingSchema,
    video: ToolsMediaUnderstandingSchema,
  })
  .optional();

View File

@@ -0,0 +1,258 @@
import fs from "node:fs/promises";
import os from "node:os";
import path from "node:path";
import { beforeEach, describe, expect, it, vi } from "vitest";
import type { ClawdbotConfig } from "../config/config.js";
import type { MsgContext } from "../auto-reply/templating.js";
import { resolveApiKeyForProvider } from "../agents/model-auth.js";
import { fetchRemoteMedia } from "../media/fetch.js";
vi.mock("../agents/model-auth.js", () => ({
resolveApiKeyForProvider: vi.fn(async () => ({
apiKey: "test-key",
source: "test",
})),
}));
vi.mock("../media/fetch.js", () => ({
fetchRemoteMedia: vi.fn(),
}));
vi.mock("../process/exec.js", () => ({
runExec: vi.fn(),
}));
// Load the module under test dynamically, after the vi.mock declarations
// above — presumably so the mocks are in effect before apply.js resolves
// its imports; confirm against vitest mock-hoisting behavior.
async function loadApply() {
  return await import("./apply.js");
}
describe("applyMediaUnderstanding", () => {
const mockedResolveApiKey = vi.mocked(resolveApiKeyForProvider);
const mockedFetchRemoteMedia = vi.mocked(fetchRemoteMedia);
beforeEach(() => {
mockedResolveApiKey.mockClear();
mockedFetchRemoteMedia.mockReset();
mockedFetchRemoteMedia.mockResolvedValue({
buffer: Buffer.from("audio-bytes"),
contentType: "audio/ogg",
fileName: "note.ogg",
});
});
it("sets Transcript and replaces Body when audio transcription succeeds", async () => {
const { applyMediaUnderstanding } = await loadApply();
const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-media-"));
const audioPath = path.join(dir, "note.ogg");
await fs.writeFile(audioPath, "hello");
const ctx: MsgContext = {
Body: "<media:audio>",
MediaPath: audioPath,
MediaType: "audio/ogg",
};
const cfg: ClawdbotConfig = {
tools: {
media: {
audio: {
enabled: true,
maxBytes: 1024 * 1024,
models: [{ provider: "groq" }],
},
},
},
};
const result = await applyMediaUnderstanding({
ctx,
cfg,
providers: {
groq: {
id: "groq",
transcribeAudio: async () => ({ text: "transcribed text" }),
},
},
});
expect(result.appliedAudio).toBe(true);
expect(ctx.Transcript).toBe("transcribed text");
expect(ctx.Body).toBe("[Audio]\nTranscript:\ntranscribed text");
expect(ctx.CommandBody).toBe("transcribed text");
expect(ctx.RawBody).toBe("transcribed text");
});
it("handles URL-only attachments for audio transcription", async () => {
const { applyMediaUnderstanding } = await loadApply();
const ctx: MsgContext = {
Body: "<media:audio>",
MediaUrl: "https://example.com/note.ogg",
MediaType: "audio/ogg",
ChatType: "dm",
};
const cfg: ClawdbotConfig = {
tools: {
media: {
audio: {
enabled: true,
maxBytes: 1024 * 1024,
scope: {
default: "deny",
rules: [{ action: "allow", match: { chatType: "direct" } }],
},
models: [{ provider: "groq" }],
},
},
},
};
const result = await applyMediaUnderstanding({
ctx,
cfg,
providers: {
groq: {
id: "groq",
transcribeAudio: async () => ({ text: "remote transcript" }),
},
},
});
expect(result.appliedAudio).toBe(true);
expect(ctx.Transcript).toBe("remote transcript");
expect(ctx.Body).toBe("[Audio]\nTranscript:\nremote transcript");
});
it("skips audio transcription when attachment exceeds maxBytes", async () => {
const { applyMediaUnderstanding } = await loadApply();
const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-media-"));
const audioPath = path.join(dir, "large.wav");
await fs.writeFile(audioPath, "0123456789");
const ctx: MsgContext = {
Body: "<media:audio>",
MediaPath: audioPath,
MediaType: "audio/wav",
};
const transcribeAudio = vi.fn(async () => ({ text: "should-not-run" }));
const cfg: ClawdbotConfig = {
tools: {
media: {
audio: {
enabled: true,
maxBytes: 4,
models: [{ provider: "groq" }],
},
},
},
};
const result = await applyMediaUnderstanding({
ctx,
cfg,
providers: { groq: { id: "groq", transcribeAudio } },
});
expect(result.appliedAudio).toBe(false);
expect(transcribeAudio).not.toHaveBeenCalled();
expect(ctx.Body).toBe("<media:audio>");
});
it("falls back to CLI model when provider fails", async () => {
const { applyMediaUnderstanding } = await loadApply();
const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-media-"));
const audioPath = path.join(dir, "note.ogg");
await fs.writeFile(audioPath, "hello");
const ctx: MsgContext = {
Body: "<media:audio>",
MediaPath: audioPath,
MediaType: "audio/ogg",
};
const cfg: ClawdbotConfig = {
tools: {
media: {
audio: {
enabled: true,
models: [
{ provider: "groq" },
{
type: "cli",
command: "whisper",
args: ["{{MediaPath}}"],
},
],
},
},
},
};
const execModule = await import("../process/exec.js");
vi.mocked(execModule.runExec).mockResolvedValue({
stdout: "cli transcript\n",
stderr: "",
});
const result = await applyMediaUnderstanding({
ctx,
cfg,
providers: {
groq: {
id: "groq",
transcribeAudio: async () => {
throw new Error("boom");
},
},
},
});
expect(result.appliedAudio).toBe(true);
expect(ctx.Transcript).toBe("cli transcript");
expect(ctx.Body).toBe("[Audio]\nTranscript:\ncli transcript");
});
it("uses CLI image understanding and preserves caption for commands", async () => {
const { applyMediaUnderstanding } = await loadApply();
const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-media-"));
const imagePath = path.join(dir, "photo.jpg");
await fs.writeFile(imagePath, "image-bytes");
const ctx: MsgContext = {
Body: "<media:image> show Dom",
MediaPath: imagePath,
MediaType: "image/jpeg",
};
const cfg: ClawdbotConfig = {
tools: {
media: {
image: {
enabled: true,
models: [
{
type: "cli",
command: "gemini",
args: ["--file", "{{MediaPath}}", "--prompt", "{{Prompt}}"],
},
],
},
},
},
};
const execModule = await import("../process/exec.js");
vi.mocked(execModule.runExec).mockResolvedValue({
stdout: "image description\n",
stderr: "",
});
const result = await applyMediaUnderstanding({
ctx,
cfg,
});
expect(result.appliedImage).toBe(true);
expect(ctx.Body).toBe("[Image]\nUser text:\nshow Dom\nDescription:\nimage description");
expect(ctx.CommandBody).toBe("show Dom");
expect(ctx.RawBody).toBe("show Dom");
});
});

View File

@@ -0,0 +1,804 @@
import crypto from "node:crypto";
import fs from "node:fs/promises";
import os from "node:os";
import path from "node:path";
import { fileURLToPath } from "node:url";
import type { Api, AssistantMessage, Context, Model } from "@mariozechner/pi-ai";
import { complete } from "@mariozechner/pi-ai";
import { discoverAuthStorage, discoverModels } from "@mariozechner/pi-coding-agent";
import type { ClawdbotConfig } from "../config/config.js";
import type { MsgContext } from "../auto-reply/templating.js";
import { applyTemplate } from "../auto-reply/templating.js";
import { getApiKeyForModel, resolveApiKeyForProvider } from "../agents/model-auth.js";
import { ensureClawdbotModelsJson } from "../agents/models-config.js";
import { minimaxUnderstandImage } from "../agents/minimax-vlm.js";
import { logVerbose, shouldLogVerbose } from "../globals.js";
import { fetchRemoteMedia } from "../media/fetch.js";
import { detectMime, getFileExtension, isAudioFileName } from "../media/mime.js";
import { runExec } from "../process/exec.js";
import type {
MediaUnderstandingConfig,
MediaUnderstandingModelConfig,
MediaUnderstandingScopeConfig,
} from "../config/types.tools.js";
import { extractMediaUserText, formatMediaUnderstandingBody } from "./format.js";
import {
buildMediaUnderstandingRegistry,
getMediaUnderstandingProvider,
normalizeMediaProviderId,
} from "./providers/index.js";
import { fetchWithTimeout } from "./providers/shared.js";
import { normalizeMediaUnderstandingChatType, resolveMediaUnderstandingScope } from "./scope.js";
import type {
MediaAttachment,
MediaUnderstandingOutput,
MediaUnderstandingProvider,
} from "./types.js";
import { coerceImageAssistantText } from "../agents/tools/image-tool.helpers.js";
const MB = 1024 * 1024;
/** Default cap on understanding output length (chars). */
const DEFAULT_MAX_CHARS = 500;
// Audio is deliberately uncapped by default; image/video descriptions get the default cap.
const DEFAULT_MAX_CHARS_BY_CAPABILITY: Record<Capability, number | undefined> = {
  image: DEFAULT_MAX_CHARS,
  audio: undefined,
  video: DEFAULT_MAX_CHARS,
};
/** Default per-capability attachment size limits (bytes). */
const DEFAULT_MAX_BYTES: Record<Capability, number> = {
  image: 10 * MB,
  audio: 20 * MB,
  video: 50 * MB,
};
/** Default per-capability timeouts (seconds). */
const DEFAULT_TIMEOUT_SECONDS: Record<Capability, number> = {
  image: 60,
  audio: 60,
  video: 120,
};
/** Default prompts when neither the config nor the model entry sets one. */
const DEFAULT_PROMPT: Record<Capability, string> = {
  image: "Describe the image.",
  audio: "Transcribe the audio.",
  video: "Describe the video.",
};
/** Hard ceiling on the base64-encoded video payload size. */
const DEFAULT_VIDEO_MAX_BASE64_BYTES = 70 * MB;
// Fallback transcription model per provider — presumably used when a model
// entry names only a provider; confirm at the audio call site.
const DEFAULT_AUDIO_MODELS: Record<string, string> = {
  groq: "whisper-large-v3-turbo",
  openai: "whisper-1",
};
// NOTE(review): name suggests a cap on buffered CLI output; confirm where runExec is invoked.
const CLI_OUTPUT_MAX_BUFFER = 5 * MB;

/** Aggregate result of running media understanding over a message context. */
export type ApplyMediaUnderstandingResult = {
  outputs: MediaUnderstandingOutput[];
  appliedImage: boolean;
  appliedAudio: boolean;
  appliedVideo: boolean;
};

type Capability = "image" | "audio" | "video";

/** In-memory media payload plus best-known metadata. */
type MediaBufferResult = {
  buffer: Buffer;
  mime?: string;
  fileName: string;
};

/** Filesystem location of media, with optional temp-file cleanup. */
type MediaPathResult = {
  path: string;
  cleanup?: () => Promise<void> | void;
};
/**
 * Trim an attachment path and convert file:// URLs to plain filesystem
 * paths. Returns undefined for empty input or an unparseable file URL.
 */
function normalizeAttachmentPath(raw?: string | null): string | undefined {
  const trimmed = (raw ?? "").trim();
  if (trimmed.length === 0) return undefined;
  if (!trimmed.startsWith("file://")) return trimmed;
  try {
    return fileURLToPath(trimmed);
  } catch {
    return undefined;
  }
}
/**
 * Flatten the various MsgContext media fields (array-valued or scalar)
 * into an ordered attachment list. Indices follow MediaPaths/MediaUrls
 * ordering as supplied by the channel; entries with neither a usable
 * path nor URL are dropped.
 */
function normalizeAttachments(ctx: MsgContext): MediaAttachment[] {
  const mediaPaths = Array.isArray(ctx.MediaPaths) ? ctx.MediaPaths : undefined;
  const mediaUrls = Array.isArray(ctx.MediaUrls) ? ctx.MediaUrls : undefined;
  const mediaTypes = Array.isArray(ctx.MediaTypes) ? ctx.MediaTypes : undefined;
  // Per-index mime: an explicit non-blank type hint wins; a single
  // attachment may fall back to the scalar MediaType.
  const mimeAt = (total: number, index: number): string | undefined => {
    const hint = mediaTypes?.[index];
    const normalized = typeof hint === "string" ? hint.trim() : "";
    if (normalized) return normalized;
    return total === 1 ? ctx.MediaType : undefined;
  };
  if (mediaPaths && mediaPaths.length > 0) {
    const total = mediaPaths.length;
    const fallbackUrls = mediaUrls && mediaUrls.length > 0 ? mediaUrls : undefined;
    const entries: MediaAttachment[] = mediaPaths.map((value, index) => ({
      path: value?.trim() || undefined,
      url: fallbackUrls?.[index] ?? ctx.MediaUrl,
      mime: mimeAt(total, index),
      index,
    }));
    return entries.filter((entry) => Boolean(entry.path?.trim() || entry.url?.trim()));
  }
  if (mediaUrls && mediaUrls.length > 0) {
    const total = mediaUrls.length;
    const entries: MediaAttachment[] = mediaUrls.map((value, index) => ({
      path: undefined,
      url: value?.trim() || undefined,
      mime: mimeAt(total, index),
      index,
    }));
    return entries.filter((entry) => Boolean(entry.url?.trim()));
  }
  // Scalar fallback: a single MediaPath/MediaUrl pair.
  const singlePath = ctx.MediaPath?.trim();
  const singleUrl = ctx.MediaUrl?.trim();
  if (!singlePath && !singleUrl) return [];
  return [
    {
      path: singlePath || undefined,
      url: singleUrl || undefined,
      mime: ctx.MediaType,
      index: 0,
    },
  ];
}
/** True when the attachment looks like video, by mime prefix or file extension. */
function isVideoAttachment(attachment: MediaAttachment): boolean {
  if (attachment.mime?.startsWith("video/")) return true;
  const extension = getFileExtension(attachment.path ?? attachment.url);
  const videoExtensions = [".mp4", ".mov", ".mkv", ".webm", ".avi", ".m4v"];
  return extension ? videoExtensions.includes(extension) : false;
}
/** True when the MIME hint or the file-name heuristic marks this attachment as audio. */
function isAudioAttachment(attachment: MediaAttachment): boolean {
  return attachment.mime?.startsWith("audio/")
    ? true
    : isAudioFileName(attachment.path ?? attachment.url);
}
/** True when the MIME hint or the file extension marks this attachment as an image. */
function isImageAttachment(attachment: MediaAttachment): boolean {
  if (attachment.mime?.startsWith("image/")) return true;
  const ext = getFileExtension(attachment.path ?? attachment.url);
  if (!ext) return false;
  const imageExtensions = new Set([
    ".png",
    ".jpg",
    ".jpeg",
    ".webp",
    ".gif",
    ".bmp",
    ".tiff",
    ".tif",
  ]);
  return imageExtensions.has(ext);
}
/** Size of `bytes` raw bytes after base64 encoding: 4 output chars per 3-byte group. */
function estimateBase64Size(bytes: number): number {
  const groups = Math.ceil(bytes / 3);
  return groups * 4;
}
// Budget for a video payload AFTER base64 expansion: the raw maxBytes budget
// inflated by 4/3 (the base64 overhead), clamped to the hard global ceiling.
function resolveVideoMaxBase64Bytes(maxBytes: number): number {
  const expanded = Math.floor(maxBytes * (4 / 3));
  return Math.min(expanded, DEFAULT_VIDEO_MAX_BASE64_BYTES);
}
/** Convert a configured timeout in seconds to milliseconds, clamped to at least 1s. */
function resolveTimeoutMs(seconds: number | undefined, fallbackSeconds: number): number {
  const chosen =
    typeof seconds === "number" && Number.isFinite(seconds) ? seconds : fallbackSeconds;
  return Math.max(1000, Math.floor(chosen * 1000));
}
/** Pick the effective prompt, appending a length cap except for audio (transcripts are not capped). */
function resolvePrompt(capability: Capability, prompt?: string, maxChars?: number): string {
  const base = prompt?.trim() || DEFAULT_PROMPT[capability];
  if (capability === "audio" || !maxChars) return base;
  return `${base} Respond in at most ${maxChars} characters.`;
}
/** Normalize any fetch() input (string, URL, or Request) into its URL string. */
function resolveRequestUrl(input: RequestInfo | URL): string {
  if (input instanceof URL) return input.toString();
  return typeof input === "string" ? input : input.url;
}
/** Best-effort conversion of an arbitrary thrown value into a loggable string. */
function normalizeErrorMessage(err: unknown): string {
  if (!err) return "";
  if (err instanceof Error) return err.message;
  if (typeof err === "string") return err;
  try {
    return JSON.stringify(err);
  } catch {
    // Circular or otherwise unserializable values yield an empty message.
    return "";
  }
}
/** Resolve the output character budget: entry override, then capability config, then default. */
function resolveMaxChars(params: {
  capability: Capability;
  entry: MediaUnderstandingModelConfig;
  cfg: ClawdbotConfig;
}): number | undefined {
  const { capability, entry, cfg } = params;
  const fromConfig = entry.maxChars ?? cfg.tools?.media?.[capability]?.maxChars;
  return typeof fromConfig === "number"
    ? fromConfig
    : DEFAULT_MAX_CHARS_BY_CAPABILITY[capability];
}
/** Trim whitespace and, when a positive budget is given, cut to at most maxChars. */
function trimOutput(text: string, maxChars?: number): string {
  const trimmed = text.trim();
  if (maxChars && trimmed.length > maxChars) {
    return trimmed.slice(0, maxChars).trim();
  }
  return trimmed;
}
/** Return `primary` when explicitly set; otherwise fall back to the default. */
function resolveConfigValue<T>(primary: T | undefined, fallback: T): T {
  if (primary === undefined) return fallback;
  return primary;
}
// Per-capability media-understanding config lookup (tools.media.image/.audio/.video).
function resolveCapabilityConfig(
  cfg: ClawdbotConfig,
  capability: Capability,
): MediaUnderstandingConfig | undefined {
  return cfg.tools?.media?.[capability];
}
// Evaluate the configured scope policy for this message. The channel is the
// message's surface when present, falling back to the provider id.
function resolveScopeDecision(params: {
  scope?: MediaUnderstandingScopeConfig;
  ctx: MsgContext;
}): "allow" | "deny" {
  return resolveMediaUnderstandingScope({
    scope: params.scope,
    sessionKey: params.ctx.SessionKey,
    channel: params.ctx.Surface ?? params.ctx.Provider,
    chatType: normalizeMediaUnderstandingChatType(params.ctx.ChatType),
  });
}
/** Filter the configured model entries down to those usable for `capability`. */
function resolveModelEntries(
  cfg: MediaUnderstandingConfig | undefined,
  capability: Capability,
): MediaUnderstandingModelConfig[] {
  const models = cfg?.models;
  if (!models || models.length === 0) return [];
  // An entry with no capability list is treated as supporting every capability.
  return models.filter((entry) => {
    const caps = entry.capabilities;
    return !caps || caps.length === 0 || caps.includes(capability);
  });
}
/** Detect the size-limit errors raised by the media fetch path so callers can skip, not fail. */
function isMaxBytesError(err: unknown): boolean {
  const message = normalizeErrorMessage(err);
  return Boolean(
    message &&
      (message.includes("exceeds maxBytes") || message.includes("payload exceeds maxBytes")),
  );
}
/**
 * Materialize an attachment as an in-memory buffer plus MIME/file-name info.
 *
 * A local path is tried first (relative paths resolve against the CWD); any
 * stat/read failure falls through to the remote URL when one is present.
 * Attachments over `maxBytes` are skipped — the function returns undefined
 * rather than truncating. MIME preference: explicit attachment hint, then the
 * remote content-type, then sniffing via detectMime.
 */
async function loadAttachmentBuffer(params: {
  attachment: MediaAttachment;
  maxBytes: number;
  timeoutMs: number;
}): Promise<MediaBufferResult | undefined> {
  const { attachment, maxBytes, timeoutMs } = params;
  const rawPath = normalizeAttachmentPath(attachment.path);
  if (rawPath) {
    const resolved = path.isAbsolute(rawPath) ? rawPath : path.resolve(rawPath);
    try {
      const stat = await fs.stat(resolved);
      if (!stat.isFile()) return undefined;
      if (stat.size > maxBytes) {
        if (shouldLogVerbose()) {
          logVerbose(
            `Skipping media attachment ${attachment.index + 1}: ${stat.size} bytes exceeds ${maxBytes}`,
          );
        }
        return undefined;
      }
      const buffer = await fs.readFile(resolved);
      const mime =
        attachment.mime ??
        (await detectMime({
          buffer,
          filePath: resolved,
        }));
      const fileName = path.basename(resolved) || `media-${attachment.index + 1}`;
      return { buffer, mime, fileName };
    } catch (err) {
      // Local read failed; fall through and try the URL instead.
      if (shouldLogVerbose()) {
        logVerbose(`Failed to read attachment ${attachment.index + 1}: ${String(err)}`);
      }
    }
  }
  const url = attachment.url?.trim();
  if (!url) return undefined;
  try {
    const fetchImpl = (input: RequestInfo | URL, init?: RequestInit) =>
      fetchWithTimeout(resolveRequestUrl(input), init ?? {}, timeoutMs, fetch);
    const fetched = await fetchRemoteMedia({ url, fetchImpl, maxBytes });
    // fetchRemoteMedia is given maxBytes too; this re-check is defensive.
    if (fetched.buffer.length > maxBytes) {
      if (shouldLogVerbose()) {
        logVerbose(
          `Skipping media attachment ${attachment.index + 1}: ${fetched.buffer.length} bytes exceeds ${maxBytes}`,
        );
      }
      return undefined;
    }
    const mime =
      attachment.mime ??
      fetched.contentType ??
      (await detectMime({
        buffer: fetched.buffer,
        filePath: fetched.fileName ?? url,
      }));
    const fileName = fetched.fileName ?? `media-${attachment.index + 1}`;
    return { buffer: fetched.buffer, mime, fileName };
  } catch (err) {
    if (shouldLogVerbose()) {
      logVerbose(`Failed to fetch attachment ${attachment.index + 1}: ${String(err)}`);
    }
  }
  return undefined;
}
/**
 * Resolve an attachment to a local filesystem path (for CLI-based handlers).
 *
 * Local paths are used in place after an optional maxBytes check; remote media
 * is downloaded into os.tmpdir() and the result carries a `cleanup` callback
 * that the caller MUST invoke to delete the temp file. Returns undefined when
 * the attachment is missing, unreadable, or too large.
 */
async function resolveAttachmentPath(params: {
  attachment: MediaAttachment;
  maxBytes?: number;
  timeoutMs: number;
}): Promise<MediaPathResult | undefined> {
  const { attachment, maxBytes, timeoutMs } = params;
  const rawPath = normalizeAttachmentPath(attachment.path);
  if (rawPath) {
    const resolved = path.isAbsolute(rawPath) ? rawPath : path.resolve(rawPath);
    try {
      const stat = await fs.stat(resolved);
      if (!stat.isFile()) return undefined;
      if (maxBytes && stat.size > maxBytes) {
        if (shouldLogVerbose()) {
          logVerbose(
            `Skipping media attachment ${attachment.index + 1}: ${stat.size} bytes exceeds ${maxBytes}`,
          );
        }
        return undefined;
      }
      // Local file: no cleanup callback, the caller does not own this path.
      return { path: resolved };
    } catch (err) {
      if (shouldLogVerbose()) {
        logVerbose(`Failed to read attachment ${attachment.index + 1}: ${String(err)}`);
      }
    }
  }
  const url = attachment.url?.trim();
  if (!url) return undefined;
  try {
    const fetchImpl = (input: RequestInfo | URL, init?: RequestInit) =>
      fetchWithTimeout(resolveRequestUrl(input), init ?? {}, timeoutMs, fetch);
    const fetched = await fetchRemoteMedia({ url, fetchImpl, maxBytes });
    const buffer = fetched.buffer;
    if (maxBytes && buffer.length > maxBytes) {
      if (shouldLogVerbose()) {
        logVerbose(
          `Skipping media attachment ${attachment.index + 1}: ${buffer.length} bytes exceeds ${maxBytes}`,
        );
      }
      return undefined;
    }
    // Preserve the remote extension (when known) so downstream tools can sniff by suffix.
    const extension = fetched.fileName ? path.extname(fetched.fileName) : "";
    const tmpPath = path.join(
      os.tmpdir(),
      `clawdbot-media-${crypto.randomUUID()}${extension || ""}`,
    );
    await fs.writeFile(tmpPath, buffer);
    return {
      path: tmpPath,
      cleanup: async () => {
        await fs.unlink(tmpPath).catch(() => {});
      },
    };
  } catch (err) {
    if (shouldLogVerbose()) {
      logVerbose(`Failed to fetch attachment ${attachment.index + 1}: ${String(err)}`);
    }
  }
  return undefined;
}
/**
 * Describe an image with a chat model from the main model registry.
 *
 * Validates that the model advertises image input, resolves an API key for it,
 * then sends prompt + base64 image. MiniMax goes through its dedicated
 * image-understanding endpoint; every other provider goes through complete()
 * with a 512-token response cap.
 *
 * @throws when the model is unknown or does not accept image input.
 */
async function describeImageWithModel(params: {
  cfg: ClawdbotConfig;
  agentDir: string;
  provider: string;
  model: string;
  prompt: string;
  maxChars?: number;
  buffer: Buffer;
  mimeType: string;
  profile?: string;
  preferredProfile?: string;
}): Promise<{ text: string; model: string }> {
  // Make sure the models.json registry exists before discovery runs.
  await ensureClawdbotModelsJson(params.cfg, params.agentDir);
  const authStorage = discoverAuthStorage(params.agentDir);
  const modelRegistry = discoverModels(authStorage, params.agentDir);
  const model = modelRegistry.find(params.provider, params.model) as Model<Api> | null;
  if (!model) {
    throw new Error(`Unknown model: ${params.provider}/${params.model}`);
  }
  if (!model.input?.includes("image")) {
    throw new Error(`Model does not support images: ${params.provider}/${params.model}`);
  }
  const apiKeyInfo = await getApiKeyForModel({
    model,
    cfg: params.cfg,
    agentDir: params.agentDir,
    profileId: params.profile,
    preferredProfile: params.preferredProfile,
  });
  authStorage.setRuntimeApiKey(model.provider, apiKeyInfo.apiKey);
  const base64 = params.buffer.toString("base64");
  if (model.provider === "minimax") {
    const text = await minimaxUnderstandImage({
      apiKey: apiKeyInfo.apiKey,
      prompt: params.prompt,
      imageDataUrl: `data:${params.mimeType};base64,${base64}`,
      modelBaseUrl: model.baseUrl,
    });
    return { text, model: model.id };
  }
  const context: Context = {
    messages: [
      {
        role: "user",
        content: [
          { type: "text", text: params.prompt },
          { type: "image", data: base64, mimeType: params.mimeType },
        ],
        timestamp: Date.now(),
      },
    ],
  };
  const message = (await complete(model, context, {
    apiKey: apiKeyInfo.apiKey,
    maxTokens: 512,
  })) as AssistantMessage;
  const text = coerceImageAssistantText({
    message,
    provider: model.provider,
    model: model.id,
  });
  return { text, model: model.id };
}
/**
 * Run a single configured provider entry against one attachment.
 *
 * Size/length/timeout/prompt settings resolve entry-level values first, then
 * the capability-level config, then built-in defaults. Dispatch by capability:
 * - image: requires agentDir and an explicit model id; served by the main
 *   model registry (describeImageWithModel) rather than the media registry.
 * - audio/video: looked up in the media provider registry; video payloads are
 *   additionally capped by their base64-expanded size.
 * Returns null when the attachment is skipped (unreadable or over budget).
 *
 * @throws when the entry is malformed or the provider/capability is unavailable.
 */
async function runProviderEntry(params: {
  capability: Capability;
  entry: MediaUnderstandingModelConfig;
  cfg: ClawdbotConfig;
  ctx: MsgContext;
  attachment: MediaAttachment;
  agentDir?: string;
  providerRegistry: Map<string, MediaUnderstandingProvider>;
}): Promise<MediaUnderstandingOutput | null> {
  const { entry, capability, cfg, attachment } = params;
  const providerIdRaw = entry.provider?.trim();
  if (!providerIdRaw) {
    throw new Error(`Provider entry missing provider for ${capability}`);
  }
  const providerId = normalizeMediaProviderId(providerIdRaw);
  const maxBytes =
    entry.maxBytes ??
    resolveConfigValue(cfg.tools?.media?.[capability]?.maxBytes, DEFAULT_MAX_BYTES[capability]);
  const maxChars = resolveMaxChars({ capability, entry, cfg });
  const timeoutMs = resolveTimeoutMs(
    entry.timeoutSeconds ?? cfg.tools?.media?.[capability]?.timeoutSeconds,
    DEFAULT_TIMEOUT_SECONDS[capability],
  );
  const prompt = resolvePrompt(
    capability,
    entry.prompt ?? cfg.tools?.media?.[capability]?.prompt,
    maxChars,
  );
  if (capability === "image") {
    if (!params.agentDir) {
      throw new Error("Image understanding requires agentDir");
    }
    const modelId = entry.model?.trim();
    if (!modelId) {
      throw new Error("Image understanding requires model id");
    }
    const media = await loadAttachmentBuffer({ attachment, maxBytes, timeoutMs });
    if (!media) return null;
    const mimeType = media.mime ?? "image/jpeg";
    const result = await describeImageWithModel({
      cfg,
      agentDir: params.agentDir,
      provider: providerId,
      model: modelId,
      prompt,
      maxChars,
      buffer: media.buffer,
      mimeType,
      profile: entry.profile,
      preferredProfile: entry.preferredProfile,
    });
    return {
      kind: "image.description",
      attachmentIndex: attachment.index,
      text: trimOutput(result.text, maxChars),
      provider: providerId,
      model: result.model,
    };
  }
  const provider = getMediaUnderstandingProvider(providerId, params.providerRegistry);
  if (!provider) {
    throw new Error(`Media provider not available: ${providerId}`);
  }
  if (capability === "audio") {
    if (!provider.transcribeAudio) {
      throw new Error(`Audio transcription provider "${providerId}" not available.`);
    }
    const media = await loadAttachmentBuffer({ attachment, maxBytes, timeoutMs });
    if (!media) return null;
    const key = await resolveApiKeyForProvider({
      provider: providerId,
      cfg,
      profileId: entry.profile,
      preferredProfile: entry.preferredProfile,
      agentDir: params.agentDir,
    });
    const providerConfig = cfg.models?.providers?.[providerId];
    // No explicit model: fall back to the provider's default audio model.
    const model = entry.model?.trim() || DEFAULT_AUDIO_MODELS[providerId] || entry.model;
    const result = await provider.transcribeAudio({
      buffer: media.buffer,
      fileName: media.fileName,
      mime: media.mime,
      apiKey: key.apiKey,
      baseUrl: providerConfig?.baseUrl,
      headers: providerConfig?.headers,
      model,
      language: entry.language ?? cfg.tools?.media?.audio?.language,
      prompt,
      timeoutMs,
    });
    return {
      kind: "audio.transcription",
      attachmentIndex: attachment.index,
      text: trimOutput(result.text, maxChars),
      provider: providerId,
      model: result.model ?? model,
    };
  }
  if (capability === "video") {
    if (!provider.describeVideo) {
      throw new Error(`Video understanding provider "${providerId}" not available.`);
    }
    const media = await loadAttachmentBuffer({ attachment, maxBytes, timeoutMs });
    if (!media) return null;
    // Videos are sent inline as base64, so enforce the post-expansion budget too.
    const estimatedBase64Bytes = estimateBase64Size(media.buffer.length);
    const maxBase64Bytes = resolveVideoMaxBase64Bytes(maxBytes);
    if (estimatedBase64Bytes > maxBase64Bytes) {
      if (shouldLogVerbose()) {
        logVerbose(
          `Skipping video attachment ${attachment.index + 1}: base64 payload ${estimatedBase64Bytes} exceeds ${maxBase64Bytes}`,
        );
      }
      return null;
    }
    const key = await resolveApiKeyForProvider({
      provider: providerId,
      cfg,
      profileId: entry.profile,
      preferredProfile: entry.preferredProfile,
      agentDir: params.agentDir,
    });
    const providerConfig = cfg.models?.providers?.[providerId];
    const result = await provider.describeVideo({
      buffer: media.buffer,
      fileName: media.fileName,
      mime: media.mime,
      apiKey: key.apiKey,
      baseUrl: providerConfig?.baseUrl,
      headers: providerConfig?.headers,
      model: entry.model,
      prompt,
      timeoutMs,
    });
    return {
      kind: "video.description",
      attachmentIndex: attachment.index,
      text: trimOutput(result.text, maxChars),
      provider: providerId,
      model: result.model ?? entry.model,
    };
  }
  return null;
}
/**
 * Run a user-configured CLI command against one attachment.
 *
 * The attachment is resolved to a local path (remote media is downloaded to a
 * temp file and removed afterwards via the cleanup callback). Every argv
 * element except the command itself is template-expanded against the message
 * context extended with MediaPath, Prompt, and MaxChars. Trimmed/capped
 * stdout becomes the output text; empty output yields null.
 *
 * @throws when the entry has no command, or when the command itself fails.
 */
async function runCliEntry(params: {
  capability: Capability;
  entry: MediaUnderstandingModelConfig;
  cfg: ClawdbotConfig;
  ctx: MsgContext;
  attachment: MediaAttachment;
}): Promise<MediaUnderstandingOutput | null> {
  const { entry, capability, cfg, ctx, attachment } = params;
  const command = entry.command?.trim();
  const args = entry.args ?? [];
  if (!command) {
    throw new Error(`CLI entry missing command for ${capability}`);
  }
  const maxBytes =
    entry.maxBytes ??
    resolveConfigValue(cfg.tools?.media?.[capability]?.maxBytes, DEFAULT_MAX_BYTES[capability]);
  const maxChars = resolveMaxChars({ capability, entry, cfg });
  const timeoutMs = resolveTimeoutMs(
    entry.timeoutSeconds ?? cfg.tools?.media?.[capability]?.timeoutSeconds,
    DEFAULT_TIMEOUT_SECONDS[capability],
  );
  const prompt = resolvePrompt(
    capability,
    entry.prompt ?? cfg.tools?.media?.[capability]?.prompt,
    maxChars,
  );
  const pathResult = await resolveAttachmentPath({
    attachment,
    maxBytes,
    timeoutMs,
  });
  if (!pathResult) return null;
  const templCtx: MsgContext = {
    ...ctx,
    MediaPath: pathResult.path,
    Prompt: prompt,
    MaxChars: maxChars,
  };
  // index 0 is the command itself and is never template-expanded.
  const argv = [command, ...args].map((part, index) =>
    index === 0 ? part : applyTemplate(part, templCtx),
  );
  if (shouldLogVerbose()) {
    logVerbose(`Media understanding via CLI: ${argv.join(" ")}`);
  }
  try {
    const { stdout } = await runExec(argv[0], argv.slice(1), {
      timeoutMs,
      maxBuffer: CLI_OUTPUT_MAX_BUFFER,
    });
    const text = trimOutput(stdout, maxChars);
    if (!text) return null;
    return {
      kind: capability === "audio" ? "audio.transcription" : `${capability}.description`,
      attachmentIndex: attachment.index,
      text,
      provider: "cli",
      model: command,
    };
  } finally {
    // Always remove the temp file for downloaded media, even when the CLI fails.
    if (pathResult.cleanup) {
      await pathResult.cleanup();
    }
  }
}
/**
 * Run one capability (image/audio/video) over the message's attachments.
 *
 * Honors the enabled flag and scope policy, picks the FIRST attachment that
 * matches the capability, and tries configured entries in order until one
 * produces output. Per-entry failures are logged (verbose only) and do not
 * abort the remaining entries; size errors get a dedicated log message.
 */
async function runCapability(params: {
  capability: Capability;
  cfg: ClawdbotConfig;
  ctx: MsgContext;
  attachments: MediaAttachment[];
  agentDir?: string;
  providerRegistry: Map<string, MediaUnderstandingProvider>;
}): Promise<MediaUnderstandingOutput | null> {
  const { capability, cfg, ctx, attachments } = params;
  const config = resolveCapabilityConfig(cfg, capability);
  if (!config || config.enabled === false) return null;
  const entries = resolveModelEntries(config, capability);
  if (entries.length === 0) return null;
  const scopeDecision = resolveScopeDecision({ scope: config.scope, ctx });
  if (scopeDecision === "deny") {
    if (shouldLogVerbose()) {
      logVerbose(`${capability} understanding disabled by scope policy.`);
    }
    return null;
  }
  const attachment = attachments.find((item) => {
    if (capability === "image") return isImageAttachment(item);
    if (capability === "audio") return isAudioAttachment(item);
    return isVideoAttachment(item);
  });
  if (!attachment) return null;
  for (const entry of entries) {
    try {
      // An entry with a command but no explicit type is treated as a CLI entry.
      const entryType = entry.type ?? (entry.command ? "cli" : "provider");
      const result =
        entryType === "cli"
          ? await runCliEntry({ capability, entry, cfg, ctx, attachment })
          : await runProviderEntry({
              capability,
              entry,
              cfg,
              ctx,
              attachment,
              agentDir: params.agentDir,
              providerRegistry: params.providerRegistry,
            });
      if (result) return result;
    } catch (err) {
      if (isMaxBytesError(err)) {
        if (shouldLogVerbose()) {
          logVerbose(`Skipping ${capability} model due to size: ${String(err)}`);
        }
      } else if (shouldLogVerbose()) {
        logVerbose(`${capability} understanding failed: ${String(err)}`);
      }
    }
  }
  return null;
}
/**
 * Entry point: enrich an inbound message with media understanding.
 *
 * Runs the image, audio, and video capabilities in turn (at most one output
 * each) and rewrites ctx.Body with the formatted results. An audio
 * transcription additionally replaces Transcript/CommandBody/RawBody so
 * downstream command parsing sees the spoken text; otherwise the user's
 * original text (with media placeholders stripped) is restored there.
 * Mutates `ctx` in place and also returns a summary of what was applied.
 */
export async function applyMediaUnderstanding(params: {
  ctx: MsgContext;
  cfg: ClawdbotConfig;
  agentDir?: string;
  providers?: Record<string, MediaUnderstandingProvider>;
}): Promise<ApplyMediaUnderstandingResult> {
  const { ctx, cfg } = params;
  // Capture the human-written text before Body is rewritten below.
  const commandCandidates = [ctx.CommandBody, ctx.RawBody, ctx.Body];
  const originalUserText =
    commandCandidates
      .map((value) => extractMediaUserText(value))
      .find((value) => value && value.trim()) ?? undefined;
  const attachments = normalizeAttachments(ctx);
  const providerRegistry = buildMediaUnderstandingRegistry(params.providers);
  const outputs: MediaUnderstandingOutput[] = [];
  const imageOutput = await runCapability({
    capability: "image",
    cfg,
    ctx,
    attachments,
    agentDir: params.agentDir,
    providerRegistry,
  });
  if (imageOutput) outputs.push(imageOutput);
  const audioOutput = await runCapability({
    capability: "audio",
    cfg,
    ctx,
    attachments,
    agentDir: params.agentDir,
    providerRegistry,
  });
  if (audioOutput) outputs.push(audioOutput);
  const videoOutput = await runCapability({
    capability: "video",
    cfg,
    ctx,
    attachments,
    agentDir: params.agentDir,
    providerRegistry,
  });
  if (videoOutput) outputs.push(videoOutput);
  if (outputs.length > 0) {
    ctx.Body = formatMediaUnderstandingBody({ body: ctx.Body, outputs });
    const audioResult = outputs.find((output) => output.kind === "audio.transcription");
    if (audioResult) {
      ctx.Transcript = audioResult.text;
      ctx.CommandBody = audioResult.text;
      ctx.RawBody = audioResult.text;
    } else if (originalUserText) {
      ctx.CommandBody = originalUserText;
      ctx.RawBody = originalUserText;
    }
    ctx.MediaUnderstanding = [...(ctx.MediaUnderstanding ?? []), ...outputs];
  }
  return {
    outputs,
    appliedImage: outputs.some((output) => output.kind === "image.description"),
    appliedAudio: outputs.some((output) => output.kind === "audio.transcription"),
    appliedVideo: outputs.some((output) => output.kind === "video.description"),
  };
}

View File

@@ -0,0 +1,91 @@
import { describe, expect, it } from "vitest";
import { formatMediaUnderstandingBody } from "./format.js";
// Vitest suite for formatMediaUnderstandingBody: pins the exact rendered body
// (section headers, "User text:"/"Transcript:"/"Description:" labels, and
// newline layout) for placeholder-only, captioned, and multi-output inputs.
describe("formatMediaUnderstandingBody", () => {
  it("replaces placeholder body with transcript", () => {
    const body = formatMediaUnderstandingBody({
      body: "<media:audio>",
      outputs: [
        {
          kind: "audio.transcription",
          attachmentIndex: 0,
          text: "hello world",
          provider: "groq",
        },
      ],
    });
    expect(body).toBe("[Audio]\nTranscript:\nhello world");
  });
  it("includes user text when body is meaningful", () => {
    const body = formatMediaUnderstandingBody({
      body: "caption here",
      outputs: [
        {
          kind: "audio.transcription",
          attachmentIndex: 0,
          text: "transcribed",
          provider: "groq",
        },
      ],
    });
    expect(body).toBe("[Audio]\nUser text:\ncaption here\nTranscript:\ntranscribed");
  });
  it("strips leading media placeholders from user text", () => {
    const body = formatMediaUnderstandingBody({
      body: "<media:audio> caption here",
      outputs: [
        {
          kind: "audio.transcription",
          attachmentIndex: 0,
          text: "transcribed",
          provider: "groq",
        },
      ],
    });
    expect(body).toBe("[Audio]\nUser text:\ncaption here\nTranscript:\ntranscribed");
  });
  it("keeps user text once when multiple outputs exist", () => {
    const body = formatMediaUnderstandingBody({
      body: "caption here",
      outputs: [
        {
          kind: "audio.transcription",
          attachmentIndex: 0,
          text: "audio text",
          provider: "groq",
        },
        {
          kind: "video.description",
          attachmentIndex: 1,
          text: "video text",
          provider: "google",
        },
      ],
    });
    // With more than one output the user text becomes its own leading section.
    expect(body).toBe(
      [
        "User text:\ncaption here",
        "[Audio]\nTranscript:\naudio text",
        "[Video]\nDescription:\nvideo text",
      ].join("\n\n"),
    );
  });
  it("formats image outputs", () => {
    const body = formatMediaUnderstandingBody({
      body: "<media:image>",
      outputs: [
        {
          kind: "image.description",
          attachmentIndex: 0,
          text: "a cat",
          provider: "openai",
        },
      ],
    });
    expect(body).toBe("[Image]\nDescription:\na cat");
  });
});

View File

@@ -0,0 +1,77 @@
import type { MediaUnderstandingOutput } from "./types.js";
// Matches a body that is ONLY a media placeholder, e.g. "<media:audio>" or
// "<media:image> (caption)".
const MEDIA_PLACEHOLDER_RE = /^<media:[^>]+>(\s*\([^)]*\))?$/i;
// Matches a LEADING media placeholder so it can be stripped from mixed bodies.
const MEDIA_PLACEHOLDER_TOKEN_RE = /^<media:[^>]+>(\s*\([^)]*\))?\s*/i;

/** Extract the human-written text from a message body, dropping media placeholders. */
export function extractMediaUserText(body?: string): string | undefined {
  const trimmed = body?.trim() ?? "";
  if (!trimmed || MEDIA_PLACEHOLDER_RE.test(trimmed)) return undefined;
  const withoutPlaceholder = trimmed.replace(MEDIA_PLACEHOLDER_TOKEN_RE, "").trim();
  return withoutPlaceholder || undefined;
}
/** Render one output section: "[Title]" header, optional user text, then the payload. */
function formatSection(
  title: "Audio" | "Video" | "Image",
  kind: "Transcript" | "Description",
  text: string,
  userText?: string,
): string {
  const userPart = userText ? `User text:\n${userText}\n` : "";
  return `[${title}]\n${userPart}${kind}:\n${text}`;
}
/**
 * Render media-understanding outputs into a message body. Outputs with empty
 * text are dropped; with none left, the original body is returned untouched.
 * The user's own text is preserved: inline inside the single section when
 * there is one output, or as its own leading section when there are several.
 */
export function formatMediaUnderstandingBody(params: {
  body?: string;
  outputs: MediaUnderstandingOutput[];
}): string {
  const usable = params.outputs.filter((output) => output.text.trim());
  if (usable.length === 0) return params.body ?? "";
  const userText = extractMediaUserText(params.body);
  const inlineUserText = usable.length === 1 ? userText : undefined;
  const sections: string[] = [];
  if (userText && usable.length > 1) {
    sections.push(`User text:\n${userText}`);
  }
  for (const output of usable) {
    switch (output.kind) {
      case "audio.transcription":
        sections.push(formatSection("Audio", "Transcript", output.text, inlineUserText));
        break;
      case "image.description":
        sections.push(formatSection("Image", "Description", output.text, inlineUserText));
        break;
      default:
        // Any remaining kind is a video description.
        sections.push(formatSection("Video", "Description", output.text, inlineUserText));
        break;
    }
  }
  return sections.join("\n\n").trim();
}

View File

@@ -0,0 +1,9 @@
export { applyMediaUnderstanding } from "./apply.js";
export { formatMediaUnderstandingBody } from "./format.js";
export { resolveMediaUnderstandingScope } from "./scope.js";
export type {
MediaAttachment,
MediaUnderstandingOutput,
MediaUnderstandingProvider,
MediaUnderstandingKind,
} from "./types.js";

View File

@@ -0,0 +1,7 @@
import type { MediaUnderstandingProvider } from "../../types.js";
import { describeGeminiVideo } from "./video.js";
// Google entry for the media-understanding registry; currently video-only.
export const googleProvider: MediaUnderstandingProvider = {
  id: "google",
  describeVideo: describeGeminiVideo,
};

View File

@@ -0,0 +1,93 @@
import { describe, expect, it } from "vitest";
import { describeGeminiVideo } from "./video.js";
// Local copy of the URL-normalizing helper so assertions can record fetch targets.
const resolveRequestUrl = (input: RequestInfo | URL) => {
  if (typeof input === "string") return input;
  if (input instanceof URL) return input.toString();
  return input.url;
};
// Vitest suite for describeGeminiVideo: verifies header override behavior and
// the exact generateContent request (URL, method, headers, inline_data body).
describe("describeGeminiVideo", () => {
  it("respects case-insensitive x-goog-api-key overrides", async () => {
    let seenKey: string | null = null;
    const fetchFn = async (_input: RequestInfo | URL, init?: RequestInit) => {
      const headers = new Headers(init?.headers);
      seenKey = headers.get("x-goog-api-key");
      return new Response(
        JSON.stringify({
          candidates: [{ content: { parts: [{ text: "video ok" }] } }],
        }),
        { status: 200, headers: { "content-type": "application/json" } },
      );
    };
    const result = await describeGeminiVideo({
      buffer: Buffer.from("video"),
      fileName: "clip.mp4",
      apiKey: "test-key",
      timeoutMs: 1000,
      headers: { "X-Goog-Api-Key": "override" },
      fetchFn,
    });
    expect(seenKey).toBe("override");
    expect(result.text).toBe("video ok");
  });
  it("builds the expected request payload", async () => {
    let seenUrl: string | null = null;
    let seenInit: RequestInit | undefined;
    const fetchFn = async (input: RequestInfo | URL, init?: RequestInit) => {
      seenUrl = resolveRequestUrl(input);
      seenInit = init;
      return new Response(
        JSON.stringify({
          candidates: [
            {
              content: {
                parts: [{ text: "first" }, { text: " second " }, { text: "" }],
              },
            },
          ],
        }),
        { status: 200, headers: { "content-type": "application/json" } },
      );
    };
    const result = await describeGeminiVideo({
      buffer: Buffer.from("video-bytes"),
      fileName: "clip.mp4",
      apiKey: "test-key",
      timeoutMs: 1500,
      baseUrl: "https://example.com/v1beta/",
      model: "gemini-3-pro",
      headers: { "X-Other": "1" },
      fetchFn,
    });
    expect(result.model).toBe("gemini-3-pro-preview");
    // Empty parts are dropped and the rest are trimmed and newline-joined.
    expect(result.text).toBe("first\nsecond");
    expect(seenUrl).toBe("https://example.com/v1beta/models/gemini-3-pro-preview:generateContent");
    expect(seenInit?.method).toBe("POST");
    expect(seenInit?.signal).toBeInstanceOf(AbortSignal);
    const headers = new Headers(seenInit?.headers);
    expect(headers.get("x-goog-api-key")).toBe("test-key");
    expect(headers.get("content-type")).toBe("application/json");
    expect(headers.get("x-other")).toBe("1");
    const bodyText =
      typeof seenInit?.body === "string"
        ? seenInit.body
        : Buffer.isBuffer(seenInit?.body)
          ? seenInit.body.toString("utf8")
          : "";
    const body = JSON.parse(bodyText);
    expect(body.contents?.[0]?.parts?.[0]?.text).toBe("Describe the video.");
    expect(body.contents?.[0]?.parts?.[1]?.inline_data?.mime_type).toBe("video/mp4");
    expect(body.contents?.[0]?.parts?.[1]?.inline_data?.data).toBe(
      Buffer.from("video-bytes").toString("base64"),
    );
  });
});

View File

@@ -0,0 +1,84 @@
import type { VideoDescriptionRequest, VideoDescriptionResult } from "../../types.js";
import { normalizeGoogleModelId } from "../../../agents/models-config.providers.js";
import { fetchWithTimeout, normalizeBaseUrl, readErrorResponse } from "../shared.js";
export const DEFAULT_GOOGLE_VIDEO_BASE_URL = "https://generativelanguage.googleapis.com/v1beta";
const DEFAULT_GOOGLE_VIDEO_MODEL = "gemini-3-flash-preview";
const DEFAULT_GOOGLE_VIDEO_PROMPT = "Describe the video.";

/** Pick the Gemini model id, normalizing aliases; blank input selects the default. */
function resolveModel(model?: string): string {
  const requested = model?.trim();
  return requested ? normalizeGoogleModelId(requested) : DEFAULT_GOOGLE_VIDEO_MODEL;
}
/** Use the caller's prompt when it has content; otherwise the default video prompt. */
function resolvePrompt(prompt?: string): string {
  return prompt?.trim() || DEFAULT_GOOGLE_VIDEO_PROMPT;
}
/**
 * Describe a video clip via the Gemini `generateContent` REST endpoint.
 *
 * The clip is sent inline as base64 (no Files API upload), so callers are
 * responsible for keeping payloads within inline-data limits. Caller headers
 * are case-insensitive and may override both content-type and x-goog-api-key.
 *
 * @throws on non-2xx responses or when no candidate text comes back.
 */
export async function describeGeminiVideo(
  params: VideoDescriptionRequest,
): Promise<VideoDescriptionResult> {
  const fetchFn = params.fetchFn ?? fetch;
  const baseUrl = normalizeBaseUrl(params.baseUrl, DEFAULT_GOOGLE_VIDEO_BASE_URL);
  const model = resolveModel(params.model);
  const url = `${baseUrl}/models/${model}:generateContent`;
  const headers = new Headers(params.headers);
  if (!headers.has("content-type")) {
    headers.set("content-type", "application/json");
  }
  if (!headers.has("x-goog-api-key")) {
    headers.set("x-goog-api-key", params.apiKey);
  }
  const body = {
    contents: [
      {
        role: "user",
        parts: [
          { text: resolvePrompt(params.prompt) },
          {
            inline_data: {
              mime_type: params.mime ?? "video/mp4",
              data: params.buffer.toString("base64"),
            },
          },
        ],
      },
    ],
  };
  const res = await fetchWithTimeout(
    url,
    {
      method: "POST",
      headers,
      body: JSON.stringify(body),
    },
    params.timeoutMs,
    fetchFn,
  );
  if (!res.ok) {
    const detail = await readErrorResponse(res);
    const suffix = detail ? `: ${detail}` : "";
    throw new Error(`Video description failed (HTTP ${res.status})${suffix}`);
  }
  const payload = (await res.json()) as {
    candidates?: Array<{
      content?: { parts?: Array<{ text?: string }> };
    }>;
  };
  // Only the first candidate is consulted; its parts are trimmed, empties
  // dropped, and the rest newline-joined.
  const parts = payload.candidates?.[0]?.content?.parts ?? [];
  const text = parts
    .map((part) => part?.text?.trim())
    .filter(Boolean)
    .join("\n");
  if (!text) {
    throw new Error("Video description response missing text");
  }
  return { text, model };
}

View File

@@ -0,0 +1,13 @@
import type { MediaUnderstandingProvider } from "../../types.js";
import { transcribeOpenAiCompatibleAudio } from "../openai/audio.js";
// Groq exposes an OpenAI-compatible audio API, so reuse the OpenAI
// transcription client, defaulting the base URL to Groq's endpoint when the
// request does not specify one.
const DEFAULT_GROQ_AUDIO_BASE_URL = "https://api.groq.com/openai/v1";
export const groqProvider: MediaUnderstandingProvider = {
  id: "groq",
  transcribeAudio: (req) =>
    transcribeOpenAiCompatibleAudio({
      ...req,
      baseUrl: req.baseUrl ?? DEFAULT_GROQ_AUDIO_BASE_URL,
    }),
};

View File

@@ -0,0 +1,35 @@
import { normalizeProviderId } from "../../agents/model-selection.js";
import type { MediaUnderstandingProvider } from "../types.js";
import { googleProvider } from "./google/index.js";
import { groqProvider } from "./groq/index.js";
import { openaiProvider } from "./openai/index.js";
const PROVIDERS: MediaUnderstandingProvider[] = [groqProvider, openaiProvider, googleProvider];
/** Normalize a provider id, folding the "gemini" alias into "google". */
export function normalizeMediaProviderId(id: string): string {
  const normalized = normalizeProviderId(id);
  return normalized === "gemini" ? "google" : normalized;
}
/**
 * Build the provider registry keyed by normalized provider id. Built-in
 * providers register first; overrides (e.g. from tests) replace them.
 */
export function buildMediaUnderstandingRegistry(
  overrides?: Record<string, MediaUnderstandingProvider>,
): Map<string, MediaUnderstandingProvider> {
  const registry = new Map<string, MediaUnderstandingProvider>();
  const register = (key: string, provider: MediaUnderstandingProvider) => {
    registry.set(normalizeMediaProviderId(key), provider);
  };
  for (const provider of PROVIDERS) {
    register(provider.id, provider);
  }
  for (const [key, provider] of Object.entries(overrides ?? {})) {
    register(key, provider);
  }
  return registry;
}
/** Look up a provider by id, applying the same normalization used at registration. */
export function getMediaUnderstandingProvider(
  id: string,
  registry: Map<string, MediaUnderstandingProvider>,
): MediaUnderstandingProvider | undefined {
  const key = normalizeMediaProviderId(id);
  return registry.get(key);
}

View File

@@ -0,0 +1,86 @@
import { describe, expect, it } from "vitest";
import { transcribeOpenAiCompatibleAudio } from "./audio.js";
// Local copy of the URL-normalizing helper so assertions can record fetch targets.
const resolveRequestUrl = (input: RequestInfo | URL) => {
  if (typeof input === "string") return input;
  if (input instanceof URL) return input.toString();
  return input.url;
};
// Vitest suite for transcribeOpenAiCompatibleAudio: verifies auth-header
// override behavior and the exact multipart request (URL, method, headers,
// form fields, and uploaded file metadata).
describe("transcribeOpenAiCompatibleAudio", () => {
  it("respects lowercase authorization header overrides", async () => {
    let seenAuth: string | null = null;
    const fetchFn = async (_input: RequestInfo | URL, init?: RequestInit) => {
      const headers = new Headers(init?.headers);
      seenAuth = headers.get("authorization");
      return new Response(JSON.stringify({ text: "ok" }), {
        status: 200,
        headers: { "content-type": "application/json" },
      });
    };
    const result = await transcribeOpenAiCompatibleAudio({
      buffer: Buffer.from("audio"),
      fileName: "note.mp3",
      apiKey: "test-key",
      timeoutMs: 1000,
      headers: { authorization: "Bearer override" },
      fetchFn,
    });
    expect(seenAuth).toBe("Bearer override");
    expect(result.text).toBe("ok");
  });
  it("builds the expected request payload", async () => {
    let seenUrl: string | null = null;
    let seenInit: RequestInit | undefined;
    const fetchFn = async (input: RequestInfo | URL, init?: RequestInit) => {
      seenUrl = resolveRequestUrl(input);
      seenInit = init;
      return new Response(JSON.stringify({ text: "hello" }), {
        status: 200,
        headers: { "content-type": "application/json" },
      });
    };
    // Blank model falls back to whisper-1; language/prompt are trimmed.
    const result = await transcribeOpenAiCompatibleAudio({
      buffer: Buffer.from("audio-bytes"),
      fileName: "voice.wav",
      apiKey: "test-key",
      timeoutMs: 1234,
      baseUrl: "https://api.example.com/v1/",
      model: " ",
      language: " en ",
      prompt: " hello ",
      mime: "audio/wav",
      headers: { "X-Custom": "1" },
      fetchFn,
    });
    expect(result.model).toBe("whisper-1");
    expect(result.text).toBe("hello");
    expect(seenUrl).toBe("https://api.example.com/v1/audio/transcriptions");
    expect(seenInit?.method).toBe("POST");
    expect(seenInit?.signal).toBeInstanceOf(AbortSignal);
    const headers = new Headers(seenInit?.headers);
    expect(headers.get("authorization")).toBe("Bearer test-key");
    expect(headers.get("x-custom")).toBe("1");
    const form = seenInit?.body as FormData;
    expect(form).toBeInstanceOf(FormData);
    expect(form.get("model")).toBe("whisper-1");
    expect(form.get("language")).toBe("en");
    expect(form.get("prompt")).toBe("hello");
    const file = form.get("file") as Blob | { type?: string; name?: string } | null;
    expect(file).not.toBeNull();
    if (file) {
      expect(file.type).toBe("audio/wav");
      if ("name" in file && typeof file.name === "string") {
        expect(file.name).toBe("voice.wav");
      }
    }
  });
});

View File

@@ -0,0 +1,61 @@
import path from "node:path";
import type { AudioTranscriptionRequest, AudioTranscriptionResult } from "../../types.js";
import { fetchWithTimeout, normalizeBaseUrl, readErrorResponse } from "../shared.js";
// Defaults used when the caller does not override base URL / model.
export const DEFAULT_OPENAI_AUDIO_BASE_URL = "https://api.openai.com/v1";
const DEFAULT_OPENAI_AUDIO_MODEL = "whisper-1";

/** Returns the caller's model (trimmed) when non-blank, else the default. */
function resolveModel(model?: string): string {
  const requested = model?.trim() ?? "";
  return requested.length > 0 ? requested : DEFAULT_OPENAI_AUDIO_MODEL;
}
/**
 * Transcribes an audio buffer via an OpenAI-compatible
 * `/audio/transcriptions` endpoint (Whisper-style multipart upload).
 *
 * Defaults: base URL https://api.openai.com/v1, model whisper-1 (via
 * resolveModel). Optional language/prompt fields are trimmed and omitted when
 * blank. Caller-supplied headers win: an Authorization header is only added
 * when the caller did not provide one.
 *
 * @throws Error when the HTTP response is not OK, or the JSON payload has no
 *   non-blank `text` field.
 */
export async function transcribeOpenAiCompatibleAudio(
  params: AudioTranscriptionRequest,
): Promise<AudioTranscriptionResult> {
  const fetchFn = params.fetchFn ?? fetch;
  const baseUrl = normalizeBaseUrl(params.baseUrl, DEFAULT_OPENAI_AUDIO_BASE_URL);
  const url = `${baseUrl}/audio/transcriptions`;
  const model = resolveModel(params.model);

  const form = new FormData();
  // Fix: the original only applied path.basename() when the trimmed name was
  // already empty, so directory components were never stripped and a
  // whitespace-only name (basename("  ") === "  ") skipped the fallback.
  // Strip directories so local paths are not leaked to the API, and fall back
  // to "audio" for empty/whitespace-only names.
  const fileName = path.basename(params.fileName.trim()) || "audio";
  const bytes = new Uint8Array(params.buffer);
  const blob = new Blob([bytes], {
    type: params.mime ?? "application/octet-stream",
  });
  form.append("file", blob, fileName);
  form.append("model", model);
  // Trim once; omit blank optional fields entirely.
  const language = params.language?.trim();
  if (language) form.append("language", language);
  const prompt = params.prompt?.trim();
  if (prompt) form.append("prompt", prompt);

  const headers = new Headers(params.headers);
  // Headers lookup is case-insensitive, so any caller-provided Authorization
  // (whatever its casing) suppresses the default bearer token.
  if (!headers.has("authorization")) {
    headers.set("authorization", `Bearer ${params.apiKey}`);
  }

  const res = await fetchWithTimeout(
    url,
    { method: "POST", headers, body: form },
    params.timeoutMs,
    fetchFn,
  );
  if (!res.ok) {
    const detail = await readErrorResponse(res);
    const suffix = detail ? `: ${detail}` : "";
    throw new Error(`Audio transcription failed (HTTP ${res.status})${suffix}`);
  }
  const payload = (await res.json()) as { text?: string };
  const text = payload.text?.trim();
  if (!text) {
    throw new Error("Audio transcription response missing text");
  }
  return { text, model };
}

View File

@@ -0,0 +1,7 @@
import type { MediaUnderstandingProvider } from "../../types.js";
import { transcribeOpenAiCompatibleAudio } from "./audio.js";
/**
 * OpenAI-backed media-understanding provider.
 * Currently wires up audio transcription only; no describeVideo hook is set.
 */
export const openaiProvider: MediaUnderstandingProvider = {
  id: "openai",
  transcribeAudio: transcribeOpenAiCompatibleAudio,
};

View File

@@ -0,0 +1,33 @@
// Cap on how much of an error body is surfaced in diagnostic messages.
const MAX_ERROR_CHARS = 300;

/** Uses the fallback when baseUrl is blank, then strips trailing slashes. */
export function normalizeBaseUrl(baseUrl: string | undefined, fallback: string): string {
  const candidate = baseUrl?.trim();
  const chosen = candidate ? candidate : fallback;
  return chosen.replace(/\/+$/, "");
}
/**
 * Runs fetchFn with an AbortSignal that fires after timeoutMs (minimum 1ms).
 * The timer is always cleared, whether the request resolves or rejects.
 */
export async function fetchWithTimeout(
  url: string,
  init: RequestInit,
  timeoutMs: number,
  fetchFn: typeof fetch,
): Promise<Response> {
  const abort = new AbortController();
  const deadline = setTimeout(() => abort.abort(), Math.max(1, timeoutMs));
  try {
    return await fetchFn(url, { ...init, signal: abort.signal });
  } finally {
    clearTimeout(deadline);
  }
}
/**
 * Best-effort read of an error response body for diagnostics.
 * Collapses whitespace and caps the result at MAX_ERROR_CHARS; returns
 * undefined when the body is empty or unreadable.
 */
export async function readErrorResponse(res: Response): Promise<string | undefined> {
  try {
    const raw = await res.text();
    const collapsed = raw.replace(/\s+/g, " ").trim();
    if (!collapsed) return undefined;
    // NOTE(review): truncation appends no ellipsis marker — confirm intended.
    return collapsed.length > MAX_ERROR_CHARS
      ? collapsed.slice(0, MAX_ERROR_CHARS)
      : collapsed;
  } catch {
    return undefined;
  }
}

View File

@@ -0,0 +1,46 @@
import { describe, expect, it } from "vitest";
import { resolveMediaUnderstandingScope } from "./scope.js";
describe("resolveMediaUnderstandingScope", () => {
  it("defaults to allow when scope is undefined", () => {
    expect(resolveMediaUnderstandingScope({})).toBe("allow");
  });

  it("uses first matching rule", () => {
    // Both rules match this session, so rule order decides and "allow" wins.
    const result = resolveMediaUnderstandingScope({
      scope: {
        default: "deny",
        rules: [
          { action: "allow", match: { channel: "whatsapp" } },
          { action: "deny", match: { channel: "whatsapp", chatType: "direct" } },
        ],
      },
      channel: "whatsapp",
      chatType: "direct",
      sessionKey: "whatsapp:direct:123",
    });
    expect(result).toBe("allow");
  });

  it("matches keyPrefix when provided", () => {
    const result = resolveMediaUnderstandingScope({
      scope: {
        default: "deny",
        rules: [{ action: "allow", match: { keyPrefix: "agent:main:" } }],
      },
      sessionKey: "agent:main:whatsapp:group:123",
    });
    expect(result).toBe("allow");
  });

  it("matches keyPrefix case-insensitively", () => {
    // Session keys are lower-cased before the prefix comparison.
    const result = resolveMediaUnderstandingScope({
      scope: {
        default: "deny",
        rules: [{ action: "allow", match: { keyPrefix: "agent:main:" } }],
      },
      sessionKey: "AGENT:MAIN:WHATSAPP:GROUP:123",
    });
    expect(result).toBe("allow");
  });
});

View File

@@ -0,0 +1,54 @@
import type { MediaUnderstandingScopeConfig } from "../config/types.tools.js";
export type MediaUnderstandingScopeDecision = "allow" | "deny";

/** Parses a raw decision string; anything other than allow/deny is undefined. */
function normalizeDecision(value?: string | null): MediaUnderstandingScopeDecision | undefined {
  switch (value?.trim().toLowerCase()) {
    case "allow":
      return "allow";
    case "deny":
      return "deny";
    default:
      return undefined;
  }
}
/** Lower-cases and trims a match value; blank values become undefined. */
function normalizeMatch(value?: string | null): string | undefined {
  const cleaned = value == null ? "" : value.trim().toLowerCase();
  return cleaned.length > 0 ? cleaned : undefined;
}
/**
 * Canonicalizes chat-type labels from different channels: dm/direct_message/
 * private -> "direct", groups -> "group", channel -> "room". Unknown values
 * pass through lower-cased; blank input yields undefined.
 */
export function normalizeMediaUnderstandingChatType(raw?: string | null): string | undefined {
  const value = raw?.trim().toLowerCase();
  if (!value) return undefined;
  const aliases: Record<string, string> = {
    dm: "direct",
    direct_message: "direct",
    private: "direct",
    groups: "group",
    channel: "room",
  };
  return aliases[value] ?? value;
}
/**
 * Evaluates scope rules top-to-bottom and returns the first matching rule's
 * action, falling back to the scope default, then to "allow".
 * Matching is case-insensitive; absent match fields act as wildcards, and an
 * unparseable rule action counts as "allow".
 */
export function resolveMediaUnderstandingScope(params: {
  scope?: MediaUnderstandingScopeConfig;
  sessionKey?: string;
  channel?: string;
  chatType?: string;
}): MediaUnderstandingScopeDecision {
  const { scope } = params;
  if (!scope) return "allow";
  const channel = normalizeMatch(params.channel);
  const chatType = normalizeMatch(params.chatType);
  const sessionKey = normalizeMatch(params.sessionKey) ?? "";
  for (const rule of scope.rules ?? []) {
    if (!rule) continue;
    const match = rule.match ?? {};
    const wantChannel = normalizeMatch(match.channel);
    const wantChatType = normalizeMatch(match.chatType);
    const wantPrefix = normalizeMatch(match.keyPrefix);
    const channelOk = !wantChannel || wantChannel === channel;
    const chatTypeOk = !wantChatType || wantChatType === chatType;
    const prefixOk = !wantPrefix || sessionKey.startsWith(wantPrefix);
    if (channelOk && chatTypeOk && prefixOk) {
      return normalizeDecision(rule.action) ?? "allow";
    }
  }
  return normalizeDecision(scope.default) ?? "allow";
}

View File

@@ -0,0 +1,62 @@
/** The kinds of inbound-media analysis this module can produce. */
export type MediaUnderstandingKind =
  | "audio.transcription"
  | "video.description"
  | "image.description";

/** An inbound attachment to analyze, addressed by message position. */
// NOTE(review): nothing here enforces that at least one of path/url is set —
// confirm callers guarantee it.
export type MediaAttachment = {
  path?: string;
  url?: string;
  mime?: string;
  // Position of this attachment within the originating message.
  index: number;
};

/** The result of analyzing a single attachment. */
export type MediaUnderstandingOutput = {
  kind: MediaUnderstandingKind;
  // Index of the MediaAttachment this text was derived from.
  attachmentIndex: number;
  // Transcript or description text.
  text: string;
  // Provider id that produced the output (e.g. "openai").
  provider: string;
  // Model actually used, when the provider reports one.
  model?: string;
};

/** Request payload for a provider's transcribeAudio hook. */
export type AudioTranscriptionRequest = {
  // Raw audio bytes to upload.
  buffer: Buffer;
  // File name sent with the multipart upload.
  fileName: string;
  mime?: string;
  apiKey: string;
  // Endpoint base; providers fall back to their own default when blank.
  baseUrl?: string;
  // Extra request headers; a caller-supplied Authorization header takes
  // precedence over the default bearer token.
  headers?: Record<string, string>;
  // Model override; blank values fall back to the provider default.
  model?: string;
  // Language hint forwarded to the provider (trimmed; omitted when blank).
  language?: string;
  // Prompt forwarded to the provider (trimmed; omitted when blank).
  prompt?: string;
  // Upper bound for the HTTP request, in milliseconds.
  timeoutMs: number;
  // Injectable fetch implementation (used by tests).
  fetchFn?: typeof fetch;
};

/** Transcript plus the model that produced it (when known). */
export type AudioTranscriptionResult = {
  text: string;
  model?: string;
};

/** Request payload for a provider's describeVideo hook. */
export type VideoDescriptionRequest = {
  // Raw video bytes to upload.
  buffer: Buffer;
  fileName: string;
  mime?: string;
  apiKey: string;
  baseUrl?: string;
  headers?: Record<string, string>;
  model?: string;
  prompt?: string;
  // Upper bound for the HTTP request, in milliseconds.
  timeoutMs: number;
  fetchFn?: typeof fetch;
};

/** Description text plus the model that produced it (when known). */
export type VideoDescriptionResult = {
  text: string;
  model?: string;
};

/** A pluggable backend; hooks are optional per capability. */
// NOTE(review): MediaUnderstandingKind includes "image.description" but no
// describeImage hook exists here yet — confirm whether that is pending work.
export type MediaUnderstandingProvider = {
  id: string;
  transcribeAudio?: (req: AudioTranscriptionRequest) => Promise<AudioTranscriptionResult>;
  describeVideo?: (req: VideoDescriptionRequest) => Promise<VideoDescriptionResult>;
};

47
src/media/fetch.test.ts Normal file
View File

@@ -0,0 +1,47 @@
import { describe, expect, it } from "vitest";
import { fetchRemoteMedia } from "./fetch.js";
// Builds a one-shot ReadableStream that emits the given chunks then closes.
function makeStream(chunks: Uint8Array[]) {
  return new ReadableStream<Uint8Array>({
    start(controller) {
      chunks.forEach((chunk) => controller.enqueue(chunk));
      controller.close();
    },
  });
}
describe("fetchRemoteMedia", () => {
  it("rejects when content-length exceeds maxBytes", async () => {
    // The header already advertises 5 bytes, so the limit check can fire
    // before any streaming happens.
    const fetchImpl = async () =>
      new Response(makeStream([new Uint8Array([1, 2, 3, 4, 5])]), {
        status: 200,
        headers: { "content-length": "5" },
      });
    await expect(
      fetchRemoteMedia({ url: "https://example.com/file.bin", fetchImpl, maxBytes: 4 }),
    ).rejects.toThrow("exceeds maxBytes");
  });

  it("rejects when streamed payload exceeds maxBytes", async () => {
    // No content-length header: the limit must be enforced chunk-by-chunk
    // while reading the body.
    const fetchImpl = async () =>
      new Response(makeStream([new Uint8Array([1, 2, 3]), new Uint8Array([4, 5, 6])]), {
        status: 200,
      });
    await expect(
      fetchRemoteMedia({ url: "https://example.com/file.bin", fetchImpl, maxBytes: 4 }),
    ).rejects.toThrow("exceeds maxBytes");
  });
});

View File

@@ -14,6 +14,7 @@ type FetchMediaOptions = {
url: string;
fetchImpl?: FetchLike;
filePathHint?: string;
maxBytes?: number;
};
function stripQuotes(value: string): string {
@@ -51,7 +52,7 @@ async function readErrorBodySnippet(res: Response, maxChars = 200): Promise<stri
}
export async function fetchRemoteMedia(options: FetchMediaOptions): Promise<FetchMediaResult> {
const { url, fetchImpl, filePathHint } = options;
const { url, fetchImpl, filePathHint, maxBytes } = options;
const fetcher: FetchLike | undefined = fetchImpl ?? globalThis.fetch;
if (!fetcher) {
throw new Error("fetch is not available");
@@ -77,7 +78,19 @@ export async function fetchRemoteMedia(options: FetchMediaOptions): Promise<Fetc
throw new Error(`Failed to fetch media from ${url}${redirected}: ${detail}`);
}
const buffer = Buffer.from(await res.arrayBuffer());
const contentLength = res.headers.get("content-length");
if (maxBytes && contentLength) {
const length = Number(contentLength);
if (Number.isFinite(length) && length > maxBytes) {
throw new Error(
`Failed to fetch media from ${url}: content length ${length} exceeds maxBytes ${maxBytes}`,
);
}
}
const buffer = maxBytes
? await readResponseWithLimit(res, maxBytes)
: Buffer.from(await res.arrayBuffer());
let fileNameFromUrl: string | undefined;
try {
const parsed = new URL(url);
@@ -109,3 +122,47 @@ export async function fetchRemoteMedia(options: FetchMediaOptions): Promise<Fetc
fileName,
};
}
/**
 * Reads a Response body while enforcing a hard byte limit.
 * Streams chunk-by-chunk when a reader is available so oversized payloads are
 * aborted early (the reader is cancelled); otherwise buffers the whole body
 * and checks the limit after the fact.
 * @throws Error when the payload exceeds maxBytes.
 */
async function readResponseWithLimit(res: Response, maxBytes: number): Promise<Buffer> {
  const source = res.body;
  if (!source || typeof source.getReader !== "function") {
    // No streaming reader (e.g. non-standard fetch polyfill): buffer fully.
    const whole = Buffer.from(await res.arrayBuffer());
    if (whole.length > maxBytes) {
      throw new Error(
        `Failed to fetch media from ${res.url || "response"}: payload exceeds maxBytes ${maxBytes}`,
      );
    }
    return whole;
  }
  const reader = source.getReader();
  const parts: Buffer[] = [];
  let received = 0;
  try {
    for (;;) {
      const { done, value } = await reader.read();
      if (done) break;
      if (!value?.length) continue;
      received += value.length;
      if (received > maxBytes) {
        // Stop the transfer before throwing; cancel failures are non-fatal.
        try {
          await reader.cancel();
        } catch {}
        throw new Error(
          `Failed to fetch media from ${res.url || "response"}: payload exceeds maxBytes ${maxBytes}`,
        );
      }
      parts.push(Buffer.from(value));
    }
  } finally {
    try {
      reader.releaseLock();
    } catch {}
  }
  return Buffer.concat(parts, received);
}