feat: auto-enable audio understanding when keys exist
This commit is contained in:
13
CHANGELOG.md
13
CHANGELOG.md
@@ -10,18 +10,15 @@ Docs: https://docs.clawd.bot
|
|||||||
- Swabble: use the tagged Commander Swift package release.
|
- Swabble: use the tagged Commander Swift package release.
|
||||||
- CLI: add `clawdbot acp client` interactive ACP harness for debugging.
|
- CLI: add `clawdbot acp client` interactive ACP harness for debugging.
|
||||||
- Plugins: route command detection/text chunking helpers through the plugin runtime and drop runtime exports from the SDK.
|
- Plugins: route command detection/text chunking helpers through the plugin runtime and drop runtime exports from the SDK.
|
||||||
- Memory: add native Gemini embeddings provider for memory search. (#1151)
|
- Memory: add native Gemini embeddings provider for memory search. (#1151) — thanks @gumadeiras.
|
||||||
|
- Media: auto-enable audio understanding when provider keys are configured (OpenAI/Groq/Deepgram).
|
||||||
|
- Docs: add API usage + costs overview. https://docs.clawd.bot/reference/api-usage-costs
|
||||||
|
|
||||||
### Fixes
|
### Fixes
|
||||||
- Auth profiles: keep auto-pinned preference while allowing rotation on failover; user pins stay locked. (#1138) — thanks @cheeeee.
|
- Auth profiles: keep auto-pinned preference while allowing rotation on failover; user pins stay locked. (#1138) — thanks @cheeeee.
|
||||||
- macOS: avoid touching launchd in Remote over SSH so quitting the app no longer disables the remote gateway. (#1105)
|
- macOS: avoid touching launchd in Remote over SSH so quitting the app no longer disables the remote gateway. (#1105)
|
||||||
- Memory: index atomically so failed reindex preserves the previous memory database. (#1151)
|
- Memory: index atomically so failed reindex preserves the previous memory database. (#1151) — thanks @gumadeiras.
|
||||||
- Memory: avoid sqlite-vec unique constraint failures when reindexing duplicate chunk ids. (#1151)
|
- Memory: avoid sqlite-vec unique constraint failures when reindexing duplicate chunk ids. (#1151) — thanks @gumadeiras.
|
||||||
|
|
||||||
## 2026.1.18-5
|
|
||||||
|
|
||||||
### Changes
|
|
||||||
- Dependencies: update core + plugin deps (grammy, vitest, openai, Microsoft agents hosting, etc.).
|
|
||||||
|
|
||||||
## 2026.1.18-3
|
## 2026.1.18-3
|
||||||
|
|
||||||
|
|||||||
@@ -104,6 +104,29 @@ Rules:
|
|||||||
- If `<capability>.enabled: true` but no models are configured, Clawdbot tries the
|
- If `<capability>.enabled: true` but no models are configured, Clawdbot tries the
|
||||||
**active reply model** when its provider supports the capability.
|
**active reply model** when its provider supports the capability.
|
||||||
|
|
||||||
|
### Auto-enable audio (when keys exist)
|
||||||
|
If `tools.media.audio.enabled` is **not** set to `false` and you have any supported
|
||||||
|
audio provider keys configured, Clawdbot will **auto-enable audio transcription**
|
||||||
|
even when you haven’t listed models explicitly.
|
||||||
|
|
||||||
|
Providers checked (in order):
|
||||||
|
1) OpenAI
|
||||||
|
2) Groq
|
||||||
|
3) Deepgram
|
||||||
|
|
||||||
|
To disable this behavior, set:
|
||||||
|
```json5
|
||||||
|
{
|
||||||
|
tools: {
|
||||||
|
media: {
|
||||||
|
audio: {
|
||||||
|
enabled: false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
## Capabilities (optional)
|
## Capabilities (optional)
|
||||||
If you set `capabilities`, the entry only runs for those media types. For shared
|
If you set `capabilities`, the entry only runs for those media types. For shared
|
||||||
lists, Clawdbot can infer defaults:
|
lists, Clawdbot can infer defaults:
|
||||||
|
|||||||
114
src/media-understanding/runner.auto-audio.test.ts
Normal file
114
src/media-understanding/runner.auto-audio.test.ts
Normal file
@@ -0,0 +1,114 @@
|
|||||||
|
import fs from "node:fs/promises";
|
||||||
|
import os from "node:os";
|
||||||
|
import path from "node:path";
|
||||||
|
|
||||||
|
import { describe, expect, it } from "vitest";
|
||||||
|
|
||||||
|
import type { ClawdbotConfig } from "../config/config.js";
|
||||||
|
import type { MsgContext } from "../auto-reply/templating.js";
|
||||||
|
import {
|
||||||
|
buildProviderRegistry,
|
||||||
|
createMediaAttachmentCache,
|
||||||
|
normalizeMediaAttachments,
|
||||||
|
runCapability,
|
||||||
|
} from "./runner.js";
|
||||||
|
|
||||||
|
/**
 * Covers the audio path of `runCapability` when the config lists no audio
 * models and only an OpenAI API key is present: once with audio left at its
 * default, once with `tools.media.audio.enabled` set to `false`.
 */
describe("runCapability auto audio entries", () => {
  /**
   * Writes a tiny placeholder `.wav` file into the OS temp directory and wraps
   * it in a message context, normalized attachments and an attachment cache.
   */
  const createAudioFixture = async () => {
    const tmpPath = path.join(os.tmpdir(), `clawdbot-auto-audio-${Date.now()}.wav`);
    await fs.writeFile(tmpPath, Buffer.from("RIFF"));
    const ctx: MsgContext = { MediaPath: tmpPath, MediaType: "audio/wav" };
    const media = normalizeMediaAttachments(ctx);
    const cache = createMediaAttachmentCache(media);
    return { tmpPath, ctx, media, cache };
  };

  /** Fresh `models` section holding only an OpenAI API key and an empty model list. */
  const modelsWithOpenAiKey = () => ({
    providers: {
      openai: {
        apiKey: "test-key",
        models: [],
      },
    },
  });

  it("uses provider keys to auto-enable audio transcription", async () => {
    const { tmpPath, ctx, media, cache } = await createAudioFixture();

    // Records the model passed to the stub provider so it can be asserted below.
    let observedModel: string | undefined;
    const providerRegistry = buildProviderRegistry({
      openai: {
        id: "openai",
        capabilities: ["audio"],
        transcribeAudio: async (req) => {
          observedModel = req.model;
          return { text: "ok", model: req.model };
        },
      },
    });

    // No `tools.media.audio` section at all: only the provider key is configured.
    const cfg = { models: modelsWithOpenAiKey() } as unknown as ClawdbotConfig;

    try {
      const result = await runCapability({
        capability: "audio",
        cfg,
        ctx,
        attachments: cache,
        media,
        providerRegistry,
      });
      expect(result.outputs[0]?.text).toBe("ok");
      expect(observedModel).toBe("whisper-1");
      expect(result.decision.outcome).toBe("success");
    } finally {
      await cache.cleanup();
      // Best-effort removal; the file may already be gone.
      await fs.unlink(tmpPath).catch(() => {});
    }
  });

  it("skips auto audio when disabled", async () => {
    const { tmpPath, ctx, media, cache } = await createAudioFixture();

    const providerRegistry = buildProviderRegistry({
      openai: {
        id: "openai",
        capabilities: ["audio"],
        transcribeAudio: async () => ({ text: "ok", model: "whisper-1" }),
      },
    });

    // Same provider key as above, but audio is explicitly switched off.
    const cfg = {
      models: modelsWithOpenAiKey(),
      tools: {
        media: {
          audio: {
            enabled: false,
          },
        },
      },
    } as unknown as ClawdbotConfig;

    try {
      const result = await runCapability({
        capability: "audio",
        cfg,
        ctx,
        attachments: cache,
        media,
        providerRegistry,
      });
      expect(result.outputs).toHaveLength(0);
      expect(result.decision.outcome).toBe("disabled");
    } finally {
      await cache.cleanup();
      // Best-effort removal; the file may already be gone.
      await fs.unlink(tmpPath).catch(() => {});
    }
  });
});
|
||||||
@@ -39,6 +39,8 @@ import {
|
|||||||
import { describeImageWithModel } from "./providers/image.js";
|
import { describeImageWithModel } from "./providers/image.js";
|
||||||
import { estimateBase64Size, resolveVideoMaxBase64Bytes } from "./video.js";
|
import { estimateBase64Size, resolveVideoMaxBase64Bytes } from "./video.js";
|
||||||
|
|
||||||
|
/**
 * Provider ids that `resolveAutoAudioEntries` iterates over, in this order,
 * when building automatic audio entries.
 */
const AUTO_AUDIO_PROVIDERS = [
  "openai",
  "groq",
  "deepgram",
] as const;
|
||||||
|
|
||||||
export type ActiveMediaModel = {
|
export type ActiveMediaModel = {
|
||||||
provider: string;
|
provider: string;
|
||||||
model?: string;
|
model?: string;
|
||||||
@@ -65,6 +67,29 @@ export function createMediaAttachmentCache(attachments: MediaAttachment[]): Medi
|
|||||||
return new MediaAttachmentCache(attachments);
|
return new MediaAttachmentCache(attachments);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Builds audio entries from the providers in `AUTO_AUDIO_PROVIDERS`.
 *
 * Providers are visited one at a time, in list order. A provider contributes a
 * `{ type: "provider", provider }` entry when the registry lookup yields a
 * provider with a `transcribeAudio` member and `resolveApiKeyForProvider`
 * completes without throwing. Any error from key resolution is swallowed and
 * the provider is skipped.
 *
 * @param params.cfg - Config forwarded to API-key resolution.
 * @param params.agentDir - Optional agent directory forwarded to API-key resolution.
 * @param params.providerRegistry - Registry used to look up each provider.
 * @returns Entries in `AUTO_AUDIO_PROVIDERS` order; empty when no provider qualifies.
 */
async function resolveAutoAudioEntries(params: {
  cfg: ClawdbotConfig;
  agentDir?: string;
  providerRegistry: ProviderRegistry;
}): Promise<MediaUnderstandingModelConfig[]> {
  const { cfg, agentDir, providerRegistry } = params;
  const autoEntries: MediaUnderstandingModelConfig[] = [];

  for (const candidateId of AUTO_AUDIO_PROVIDERS) {
    const registered = getMediaUnderstandingProvider(candidateId, providerRegistry);
    if (registered?.transcribeAudio) {
      let keyResolved = false;
      try {
        await resolveApiKeyForProvider({ provider: candidateId, cfg, agentDir });
        keyResolved = true;
      } catch {
        // Key resolution failed for this provider; leave it out of the result.
      }
      if (keyResolved) {
        autoEntries.push({ type: "provider", provider: candidateId });
      }
    }
  }

  return autoEntries;
}
|
||||||
|
|
||||||
function trimOutput(text: string, maxChars?: number): string {
|
function trimOutput(text: string, maxChars?: number): string {
|
||||||
const trimmed = text.trim();
|
const trimmed = text.trim();
|
||||||
if (!maxChars || trimmed.length <= maxChars) return trimmed;
|
if (!maxChars || trimmed.length <= maxChars) return trimmed;
|
||||||
@@ -561,7 +586,15 @@ export async function runCapability(params: {
|
|||||||
providerRegistry: params.providerRegistry,
|
providerRegistry: params.providerRegistry,
|
||||||
activeModel: params.activeModel,
|
activeModel: params.activeModel,
|
||||||
});
|
});
|
||||||
if (entries.length === 0) {
|
let resolvedEntries = entries;
|
||||||
|
if (resolvedEntries.length === 0 && capability === "audio" && config?.enabled !== false) {
|
||||||
|
resolvedEntries = await resolveAutoAudioEntries({
|
||||||
|
cfg,
|
||||||
|
agentDir: params.agentDir,
|
||||||
|
providerRegistry: params.providerRegistry,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
if (resolvedEntries.length === 0) {
|
||||||
return {
|
return {
|
||||||
outputs: [],
|
outputs: [],
|
||||||
decision: {
|
decision: {
|
||||||
@@ -583,7 +616,7 @@ export async function runCapability(params: {
|
|||||||
agentDir: params.agentDir,
|
agentDir: params.agentDir,
|
||||||
providerRegistry: params.providerRegistry,
|
providerRegistry: params.providerRegistry,
|
||||||
cache: params.attachments,
|
cache: params.attachments,
|
||||||
entries,
|
entries: resolvedEntries,
|
||||||
config,
|
config,
|
||||||
});
|
});
|
||||||
if (output) outputs.push(output);
|
if (output) outputs.push(output);
|
||||||
|
|||||||
Reference in New Issue
Block a user