diff --git a/src/agents/model-catalog.ts b/src/agents/model-catalog.ts
index 3a3db7a28..5463542cb 100644
--- a/src/agents/model-catalog.ts
+++ b/src/agents/model-catalog.ts
@@ -8,6 +8,7 @@ export type ModelCatalogEntry = {
   provider: string;
   contextWindow?: number;
   reasoning?: boolean;
+  input?: Array<"text" | "image">;
 };
 
 type DiscoveredModel = {
@@ -16,6 +17,7 @@ type DiscoveredModel = {
   provider: string;
   contextWindow?: number;
   reasoning?: boolean;
+  input?: Array<"text" | "image">;
 };
 
 type PiSdkModule = typeof import("@mariozechner/pi-coding-agent");
@@ -80,7 +82,10 @@ export async function loadModelCatalog(params?: {
             ? entry.contextWindow
             : undefined;
         const reasoning = typeof entry?.reasoning === "boolean" ? entry.reasoning : undefined;
-        models.push({ id, name, provider, contextWindow, reasoning });
+        const input = Array.isArray(entry?.input)
+          ? (entry.input as Array<"text" | "image">)
+          : undefined;
+        models.push({ id, name, provider, contextWindow, reasoning, input });
       }
 
       if (models.length === 0) {
@@ -105,3 +110,27 @@ export async function loadModelCatalog(params?: {
 
   return modelCatalogPromise;
 }
+
+/**
+ * Return true only when `entry` is defined and its `input` list includes "image"; an undefined entry or a missing `input` yields false.
+ */
+export function modelSupportsVision(entry: ModelCatalogEntry | undefined): boolean {
+  return entry?.input?.includes("image") ?? false;
+}
+
+/**
+ * Return the first catalog entry whose provider and id match the arguments case-insensitively (arguments are also trimmed; entry fields are not), or undefined if none match.
+ */
+export function findModelInCatalog(
+  catalog: ModelCatalogEntry[],
+  provider: string,
+  modelId: string,
+): ModelCatalogEntry | undefined {
+  const normalizedProvider = provider.toLowerCase().trim();
+  const normalizedModelId = modelId.toLowerCase().trim();
+  return catalog.find(
+    (entry) =>
+      entry.provider.toLowerCase() === normalizedProvider &&
+      entry.id.toLowerCase() === normalizedModelId,
+  );
+}
diff --git a/src/media-understanding/runner.ts b/src/media-understanding/runner.ts
index 2e9bccb08..0bff2513e 100644
--- a/src/media-understanding/runner.ts
+++ b/src/media-understanding/runner.ts
@@ -4,6 +4,11 @@ import os from "node:os";
 import path from "node:path";
 
 import type { ClawdbotConfig } from "../config/config.js";
+import {
+  findModelInCatalog,
+  loadModelCatalog,
+  modelSupportsVision,
+} from "../agents/model-catalog.js";
 import type { MsgContext } from "../auto-reply/templating.js";
 import { applyTemplate } from "../auto-reply/templating.js";
 import { requireApiKey, resolveApiKeyForProvider } from "../agents/model-auth.js";
@@ -986,6 +991,26 @@ export async function runCapability(params: {
     };
   }
 
+  // Skip image understanding when the primary model supports vision natively.
+  // The image will be injected directly into the model context instead.
+  if (capability === "image" && params.activeModel?.provider) {
+    const catalog = await loadModelCatalog({ config: cfg });
+    const entry = findModelInCatalog(
+      catalog,
+      params.activeModel.provider,
+      params.activeModel.model ?? "",
+    );
+    if (modelSupportsVision(entry)) {
+      if (shouldLogVerbose()) {
+        logVerbose("Skipping image understanding: primary model supports vision natively");
+      }
+      return {
+        outputs: [],
+        decision: { capability, outcome: "skipped", attachments: [] },
+      };
+    }
+  }
+
   const attachmentPolicy = config?.attachments;
   const selected = selectAttachments({
     capability,