From c69947dff82b0714706c611e733cf8629e1a14fe Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sun, 18 Jan 2026 14:49:11 +0000 Subject: [PATCH] feat: auto-enable audio understanding when keys exist --- CHANGELOG.md | 13 +- docs/nodes/media-understanding.md | 23 ++++ .../runner.auto-audio.test.ts | 114 ++++++++++++++++++ src/media-understanding/runner.ts | 37 +++++- 4 files changed, 177 insertions(+), 10 deletions(-) create mode 100644 src/media-understanding/runner.auto-audio.test.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index 68beaaba8..b4d391fd3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,18 +10,15 @@ Docs: https://docs.clawd.bot - Swabble: use the tagged Commander Swift package release. - CLI: add `clawdbot acp client` interactive ACP harness for debugging. - Plugins: route command detection/text chunking helpers through the plugin runtime and drop runtime exports from the SDK. -- Memory: add native Gemini embeddings provider for memory search. (#1151) +- Memory: add native Gemini embeddings provider for memory search. (#1151) — thanks @gumadeiras. +- Media: auto-enable audio understanding when provider keys are configured (OpenAI/Groq/Deepgram). +- Docs: add API usage + costs overview. https://docs.clawd.bot/reference/api-usage-costs ### Fixes - Auth profiles: keep auto-pinned preference while allowing rotation on failover; user pins stay locked. (#1138) — thanks @cheeeee. - macOS: avoid touching launchd in Remote over SSH so quitting the app no longer disables the remote gateway. (#1105) -- Memory: index atomically so failed reindex preserves the previous memory database. (#1151) -- Memory: avoid sqlite-vec unique constraint failures when reindexing duplicate chunk ids. (#1151) - -## 2026.1.18-5 - -### Changes -- Dependencies: update core + plugin deps (grammy, vitest, openai, Microsoft agents hosting, etc.). +- Memory: index atomically so failed reindex preserves the previous memory database. (#1151) — thanks @gumadeiras. 
+- Memory: avoid sqlite-vec unique constraint failures when reindexing duplicate chunk ids. (#1151) — thanks @gumadeiras. ## 2026.1.18-3 diff --git a/docs/nodes/media-understanding.md b/docs/nodes/media-understanding.md index 6d640cd53..e0daa1497 100644 --- a/docs/nodes/media-understanding.md +++ b/docs/nodes/media-understanding.md @@ -104,6 +104,29 @@ Rules: - If `.enabled: true` but no models are configured, Clawdbot tries the **active reply model** when its provider supports the capability. +### Auto-enable audio (when keys exist) +If `tools.media.audio.enabled` is **not** set to `false` and you have any supported +audio provider keys configured, Clawdbot will **auto-enable audio transcription** +even when you haven’t listed models explicitly. + +Providers checked (in order): +1) OpenAI +2) Groq +3) Deepgram + +To disable this behavior, set: +```json5 +{ + tools: { + media: { + audio: { + enabled: false + } + } + } +} +``` + ## Capabilities (optional) If you set `capabilities`, the entry only runs for those media types. 
For shared lists, Clawdbot can infer defaults: diff --git a/src/media-understanding/runner.auto-audio.test.ts b/src/media-understanding/runner.auto-audio.test.ts new file mode 100644 index 000000000..68d9b665e --- /dev/null +++ b/src/media-understanding/runner.auto-audio.test.ts @@ -0,0 +1,114 @@ +import fs from "node:fs/promises"; +import os from "node:os"; +import path from "node:path"; + +import { describe, expect, it } from "vitest"; + +import type { ClawdbotConfig } from "../config/config.js"; +import type { MsgContext } from "../auto-reply/templating.js"; +import { + buildProviderRegistry, + createMediaAttachmentCache, + normalizeMediaAttachments, + runCapability, +} from "./runner.js"; + +describe("runCapability auto audio entries", () => { + it("uses provider keys to auto-enable audio transcription", async () => { + const tmpPath = path.join(os.tmpdir(), `clawdbot-auto-audio-${Date.now()}.wav`); + await fs.writeFile(tmpPath, Buffer.from("RIFF")); + const ctx: MsgContext = { MediaPath: tmpPath, MediaType: "audio/wav" }; + const media = normalizeMediaAttachments(ctx); + const cache = createMediaAttachmentCache(media); + + let seenModel: string | undefined; + const providerRegistry = buildProviderRegistry({ + openai: { + id: "openai", + capabilities: ["audio"], + transcribeAudio: async (req) => { + seenModel = req.model; + return { text: "ok", model: req.model }; + }, + }, + }); + + const cfg = { + models: { + providers: { + openai: { + apiKey: "test-key", + models: [], + }, + }, + }, + } as unknown as ClawdbotConfig; + + try { + const result = await runCapability({ + capability: "audio", + cfg, + ctx, + attachments: cache, + media, + providerRegistry, + }); + expect(result.outputs[0]?.text).toBe("ok"); + expect(seenModel).toBe("whisper-1"); + expect(result.decision.outcome).toBe("success"); + } finally { + await cache.cleanup(); + await fs.unlink(tmpPath).catch(() => {}); + } + }); + + it("skips auto audio when disabled", async () => { + const tmpPath = 
path.join(os.tmpdir(), `clawdbot-auto-audio-${Date.now()}.wav`); + await fs.writeFile(tmpPath, Buffer.from("RIFF")); + const ctx: MsgContext = { MediaPath: tmpPath, MediaType: "audio/wav" }; + const media = normalizeMediaAttachments(ctx); + const cache = createMediaAttachmentCache(media); + + const providerRegistry = buildProviderRegistry({ + openai: { + id: "openai", + capabilities: ["audio"], + transcribeAudio: async () => ({ text: "ok", model: "whisper-1" }), + }, + }); + + const cfg = { + models: { + providers: { + openai: { + apiKey: "test-key", + models: [], + }, + }, + }, + tools: { + media: { + audio: { + enabled: false, + }, + }, + }, + } as unknown as ClawdbotConfig; + + try { + const result = await runCapability({ + capability: "audio", + cfg, + ctx, + attachments: cache, + media, + providerRegistry, + }); + expect(result.outputs).toHaveLength(0); + expect(result.decision.outcome).toBe("disabled"); + } finally { + await cache.cleanup(); + await fs.unlink(tmpPath).catch(() => {}); + } + }); +}); diff --git a/src/media-understanding/runner.ts b/src/media-understanding/runner.ts index 89720b796..f1e648120 100644 --- a/src/media-understanding/runner.ts +++ b/src/media-understanding/runner.ts @@ -39,6 +39,8 @@ import { import { describeImageWithModel } from "./providers/image.js"; import { estimateBase64Size, resolveVideoMaxBase64Bytes } from "./video.js"; +const AUTO_AUDIO_PROVIDERS = ["openai", "groq", "deepgram"] as const; + export type ActiveMediaModel = { provider: string; model?: string; @@ -65,6 +67,29 @@ export function createMediaAttachmentCache(attachments: MediaAttachment[]): Medi return new MediaAttachmentCache(attachments); } +async function resolveAutoAudioEntries(params: { + cfg: ClawdbotConfig; + agentDir?: string; + providerRegistry: ProviderRegistry; +}): Promise<MediaUnderstandingModelConfig[]> { + const entries: MediaUnderstandingModelConfig[] = []; + for (const providerId of AUTO_AUDIO_PROVIDERS) { + const provider = getMediaUnderstandingProvider(providerId, 
params.providerRegistry); + if (!provider?.transcribeAudio) continue; + try { + await resolveApiKeyForProvider({ + provider: providerId, + cfg: params.cfg, + agentDir: params.agentDir, + }); + entries.push({ type: "provider", provider: providerId }); + } catch { + continue; + } + } + return entries; +} + function trimOutput(text: string, maxChars?: number): string { const trimmed = text.trim(); if (!maxChars || trimmed.length <= maxChars) return trimmed; @@ -561,7 +586,15 @@ export async function runCapability(params: { providerRegistry: params.providerRegistry, activeModel: params.activeModel, }); - if (entries.length === 0) { + let resolvedEntries = entries; + if (resolvedEntries.length === 0 && capability === "audio" && config?.enabled !== false) { + resolvedEntries = await resolveAutoAudioEntries({ + cfg, + agentDir: params.agentDir, + providerRegistry: params.providerRegistry, + }); + } + if (resolvedEntries.length === 0) { return { outputs: [], decision: { @@ -583,7 +616,7 @@ export async function runCapability(params: { agentDir: params.agentDir, providerRegistry: params.providerRegistry, cache: params.attachments, - entries, + entries: resolvedEntries, config, }); if (output) outputs.push(output);