From c69947dff82b0714706c611e733cf8629e1a14fe Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sun, 18 Jan 2026 14:49:11 +0000 Subject: [PATCH] feat: auto-enable audio understanding when keys exist --- CHANGELOG.md | 13 +- docs/nodes/media-understanding.md | 23 ++++ .../runner.auto-audio.test.ts | 114 ++++++++++++++++++ src/media-understanding/runner.ts | 37 +++++- 4 files changed, 177 insertions(+), 10 deletions(-) create mode 100644 src/media-understanding/runner.auto-audio.test.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index 68beaaba8..b4d391fd3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,18 +10,15 @@ Docs: https://docs.clawd.bot - Swabble: use the tagged Commander Swift package release. - CLI: add `clawdbot acp client` interactive ACP harness for debugging. - Plugins: route command detection/text chunking helpers through the plugin runtime and drop runtime exports from the SDK. -- Memory: add native Gemini embeddings provider for memory search. (#1151) +- Memory: add native Gemini embeddings provider for memory search. (#1151) — thanks @gumadeiras. +- Media: auto-enable audio understanding when provider keys are configured (OpenAI/Groq/Deepgram). +- Docs: add API usage + costs overview. https://docs.clawd.bot/reference/api-usage-costs ### Fixes - Auth profiles: keep auto-pinned preference while allowing rotation on failover; user pins stay locked. (#1138) — thanks @cheeeee. - macOS: avoid touching launchd in Remote over SSH so quitting the app no longer disables the remote gateway. (#1105) -- Memory: index atomically so failed reindex preserves the previous memory database. (#1151) -- Memory: avoid sqlite-vec unique constraint failures when reindexing duplicate chunk ids. (#1151) - -## 2026.1.18-5 - -### Changes -- Dependencies: update core + plugin deps (grammy, vitest, openai, Microsoft agents hosting, etc.). +- Memory: index atomically so failed reindex preserves the previous memory database. (#1151) — thanks @gumadeiras. 
+- Memory: avoid sqlite-vec unique constraint failures when reindexing duplicate chunk ids. (#1151) — thanks @gumadeiras. ## 2026.1.18-3 diff --git a/docs/nodes/media-understanding.md b/docs/nodes/media-understanding.md index 6d640cd53..e0daa1497 100644 --- a/docs/nodes/media-understanding.md +++ b/docs/nodes/media-understanding.md @@ -104,6 +104,29 @@ Rules: - If `.enabled: true` but no models are configured, Clawdbot tries the **active reply model** when its provider supports the capability. +### Auto-enable audio (when keys exist) +If `tools.media.audio.enabled` is **not** set to `false` and you have any supported +audio provider keys configured, Clawdbot will **auto-enable audio transcription** +even when you haven’t listed models explicitly. + +Providers checked (in order): +1) OpenAI +2) Groq +3) Deepgram + +To disable this behavior, set: +```json5 +{ + tools: { + media: { + audio: { + enabled: false + } + } + } +} +``` + ## Capabilities (optional) If you set `capabilities`, the entry only runs for those media types. 
For shared lists, Clawdbot can infer defaults: diff --git a/src/media-understanding/runner.auto-audio.test.ts b/src/media-understanding/runner.auto-audio.test.ts new file mode 100644 index 000000000..68d9b665e --- /dev/null +++ b/src/media-understanding/runner.auto-audio.test.ts @@ -0,0 +1,114 @@ +import fs from "node:fs/promises"; +import os from "node:os"; +import path from "node:path"; + +import { describe, expect, it } from "vitest"; + +import type { ClawdbotConfig } from "../config/config.js"; +import type { MsgContext } from "../auto-reply/templating.js"; +import { + buildProviderRegistry, + createMediaAttachmentCache, + normalizeMediaAttachments, + runCapability, +} from "./runner.js"; + +describe("runCapability auto audio entries", () => { + it("uses provider keys to auto-enable audio transcription", async () => { + const tmpPath = path.join(os.tmpdir(), `clawdbot-auto-audio-${Date.now()}.wav`); + await fs.writeFile(tmpPath, Buffer.from("RIFF")); + const ctx: MsgContext = { MediaPath: tmpPath, MediaType: "audio/wav" }; + const media = normalizeMediaAttachments(ctx); + const cache = createMediaAttachmentCache(media); + + let seenModel: string | undefined; + const providerRegistry = buildProviderRegistry({ + openai: { + id: "openai", + capabilities: ["audio"], + transcribeAudio: async (req) => { + seenModel = req.model; + return { text: "ok", model: req.model }; + }, + }, + }); + + const cfg = { + models: { + providers: { + openai: { + apiKey: "test-key", + models: [], + }, + }, + }, + } as unknown as ClawdbotConfig; + + try { + const result = await runCapability({ + capability: "audio", + cfg, + ctx, + attachments: cache, + media, + providerRegistry, + }); + expect(result.outputs[0]?.text).toBe("ok"); + expect(seenModel).toBe("whisper-1"); + expect(result.decision.outcome).toBe("success"); + } finally { + await cache.cleanup(); + await fs.unlink(tmpPath).catch(() => {}); + } + }); + + it("skips auto audio when disabled", async () => { + const tmpPath = 
path.join(os.tmpdir(), `clawdbot-auto-audio-${Date.now()}.wav`); + await fs.writeFile(tmpPath, Buffer.from("RIFF")); + const ctx: MsgContext = { MediaPath: tmpPath, MediaType: "audio/wav" }; + const media = normalizeMediaAttachments(ctx); + const cache = createMediaAttachmentCache(media); + + const providerRegistry = buildProviderRegistry({ + openai: { + id: "openai", + capabilities: ["audio"], + transcribeAudio: async () => ({ text: "ok", model: "whisper-1" }), + }, + }); + + const cfg = { + models: { + providers: { + openai: { + apiKey: "test-key", + models: [], + }, + }, + }, + tools: { + media: { + audio: { + enabled: false, + }, + }, + }, + } as unknown as ClawdbotConfig; + + try { + const result = await runCapability({ + capability: "audio", + cfg, + ctx, + attachments: cache, + media, + providerRegistry, + }); + expect(result.outputs).toHaveLength(0); + expect(result.decision.outcome).toBe("disabled"); + } finally { + await cache.cleanup(); + await fs.unlink(tmpPath).catch(() => {}); + } + }); +}); diff --git a/src/media-understanding/runner.ts b/src/media-understanding/runner.ts index 89720b796..f1e648120 100644 --- a/src/media-understanding/runner.ts +++ b/src/media-understanding/runner.ts @@ -39,6 +39,8 @@ import { import { describeImageWithModel } from "./providers/image.js"; import { estimateBase64Size, resolveVideoMaxBase64Bytes } from "./video.js"; +const AUTO_AUDIO_PROVIDERS = ["openai", "groq", "deepgram"] as const; + export type ActiveMediaModel = { provider: string; model?: string; @@ -65,6 +67,29 @@ export function createMediaAttachmentCache(attachments: MediaAttachment[]): Medi return new MediaAttachmentCache(attachments); } +async function resolveAutoAudioEntries(params: { + cfg: ClawdbotConfig; + agentDir?: string; + providerRegistry: ProviderRegistry; +}): Promise<MediaUnderstandingModelConfig[]> { + const entries: MediaUnderstandingModelConfig[] = []; + for (const providerId of AUTO_AUDIO_PROVIDERS) { + const provider = getMediaUnderstandingProvider(providerId, 
params.providerRegistry); + if (!provider?.transcribeAudio) continue; + try { + await resolveApiKeyForProvider({ + provider: providerId, + cfg: params.cfg, + agentDir: params.agentDir, + }); + entries.push({ type: "provider", provider: providerId }); + } catch { + continue; + } + } + return entries; +} + function trimOutput(text: string, maxChars?: number): string { const trimmed = text.trim(); if (!maxChars || trimmed.length <= maxChars) return trimmed; @@ -561,7 +586,15 @@ export async function runCapability(params: { providerRegistry: params.providerRegistry, activeModel: params.activeModel, }); - if (entries.length === 0) { + let resolvedEntries = entries; + if (resolvedEntries.length === 0 && capability === "audio" && config?.enabled !== false) { + resolvedEntries = await resolveAutoAudioEntries({ + cfg, + agentDir: params.agentDir, + providerRegistry: params.providerRegistry, + }); + } + if (resolvedEntries.length === 0) { return { outputs: [], decision: { @@ -583,7 +616,7 @@ export async function runCapability(params: { agentDir: params.agentDir, providerRegistry: params.providerRegistry, cache: params.attachments, - entries, + entries: resolvedEntries, config, }); if (output) outputs.push(output);