feat: auto-enable audio understanding when keys exist
This commit is contained in:
114
src/media-understanding/runner.auto-audio.test.ts
Normal file
114
src/media-understanding/runner.auto-audio.test.ts
Normal file
@@ -0,0 +1,114 @@
|
||||
import fs from "node:fs/promises";
|
||||
import os from "node:os";
|
||||
import path from "node:path";
|
||||
|
||||
import { describe, expect, it } from "vitest";
|
||||
|
||||
import type { ClawdbotConfig } from "../config/config.js";
|
||||
import type { MsgContext } from "../auto-reply/templating.js";
|
||||
import {
|
||||
buildProviderRegistry,
|
||||
createMediaAttachmentCache,
|
||||
normalizeMediaAttachments,
|
||||
runCapability,
|
||||
} from "./runner.js";
|
||||
|
||||
describe("runCapability auto audio entries", () => {
|
||||
it("uses provider keys to auto-enable audio transcription", async () => {
|
||||
const tmpPath = path.join(os.tmpdir(), `clawdbot-auto-audio-${Date.now()}.wav`);
|
||||
await fs.writeFile(tmpPath, Buffer.from("RIFF"));
|
||||
const ctx: MsgContext = { MediaPath: tmpPath, MediaType: "audio/wav" };
|
||||
const media = normalizeMediaAttachments(ctx);
|
||||
const cache = createMediaAttachmentCache(media);
|
||||
|
||||
let seenModel: string | undefined;
|
||||
const providerRegistry = buildProviderRegistry({
|
||||
openai: {
|
||||
id: "openai",
|
||||
capabilities: ["audio"],
|
||||
transcribeAudio: async (req) => {
|
||||
seenModel = req.model;
|
||||
return { text: "ok", model: req.model };
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
const cfg = {
|
||||
models: {
|
||||
providers: {
|
||||
openai: {
|
||||
apiKey: "test-key",
|
||||
models: [],
|
||||
},
|
||||
},
|
||||
},
|
||||
} as unknown as ClawdbotConfig;
|
||||
|
||||
try {
|
||||
const result = await runCapability({
|
||||
capability: "audio",
|
||||
cfg,
|
||||
ctx,
|
||||
attachments: cache,
|
||||
media,
|
||||
providerRegistry,
|
||||
});
|
||||
expect(result.outputs[0]?.text).toBe("ok");
|
||||
expect(seenModel).toBe("whisper-1");
|
||||
expect(result.decision.outcome).toBe("success");
|
||||
} finally {
|
||||
await cache.cleanup();
|
||||
await fs.unlink(tmpPath).catch(() => {});
|
||||
}
|
||||
});
|
||||
|
||||
it("skips auto audio when disabled", async () => {
|
||||
const tmpPath = path.join(os.tmpdir(), `clawdbot-auto-audio-${Date.now()}.wav`);
|
||||
await fs.writeFile(tmpPath, Buffer.from("RIFF"));
|
||||
const ctx: MsgContext = { MediaPath: tmpPath, MediaType: "audio/wav" };
|
||||
const media = normalizeMediaAttachments(ctx);
|
||||
const cache = createMediaAttachmentCache(media);
|
||||
|
||||
const providerRegistry = buildProviderRegistry({
|
||||
openai: {
|
||||
id: "openai",
|
||||
capabilities: ["audio"],
|
||||
transcribeAudio: async () => ({ text: "ok", model: "whisper-1" }),
|
||||
},
|
||||
});
|
||||
|
||||
const cfg = {
|
||||
models: {
|
||||
providers: {
|
||||
openai: {
|
||||
apiKey: "test-key",
|
||||
models: [],
|
||||
},
|
||||
},
|
||||
},
|
||||
tools: {
|
||||
media: {
|
||||
audio: {
|
||||
enabled: false,
|
||||
},
|
||||
},
|
||||
},
|
||||
} as unknown as ClawdbotConfig;
|
||||
|
||||
try {
|
||||
const result = await runCapability({
|
||||
capability: "audio",
|
||||
cfg,
|
||||
ctx,
|
||||
attachments: cache,
|
||||
media,
|
||||
providerRegistry,
|
||||
});
|
||||
expect(result.outputs).toHaveLength(0);
|
||||
expect(result.decision.outcome).toBe("disabled");
|
||||
} finally {
|
||||
await cache.cleanup();
|
||||
await fs.unlink(tmpPath).catch(() => {});
|
||||
}
|
||||
});
|
||||
});
|
||||
@@ -39,6 +39,8 @@ import {
|
||||
import { describeImageWithModel } from "./providers/image.js";
|
||||
import { estimateBase64Size, resolveVideoMaxBase64Bytes } from "./video.js";
|
||||
|
||||
// Providers probed, in this priority order, when auto-enabling audio
// transcription: a provider is only used if it exposes transcribeAudio
// and an API key for it can be resolved (see resolveAutoAudioEntries).
const AUTO_AUDIO_PROVIDERS = ["openai", "groq", "deepgram"] as const;
|
||||
|
||||
export type ActiveMediaModel = {
|
||||
provider: string;
|
||||
model?: string;
|
||||
@@ -65,6 +67,29 @@ export function createMediaAttachmentCache(attachments: MediaAttachment[]): Medi
|
||||
return new MediaAttachmentCache(attachments);
|
||||
}
|
||||
|
||||
async function resolveAutoAudioEntries(params: {
|
||||
cfg: ClawdbotConfig;
|
||||
agentDir?: string;
|
||||
providerRegistry: ProviderRegistry;
|
||||
}): Promise<MediaUnderstandingModelConfig[]> {
|
||||
const entries: MediaUnderstandingModelConfig[] = [];
|
||||
for (const providerId of AUTO_AUDIO_PROVIDERS) {
|
||||
const provider = getMediaUnderstandingProvider(providerId, params.providerRegistry);
|
||||
if (!provider?.transcribeAudio) continue;
|
||||
try {
|
||||
await resolveApiKeyForProvider({
|
||||
provider: providerId,
|
||||
cfg: params.cfg,
|
||||
agentDir: params.agentDir,
|
||||
});
|
||||
entries.push({ type: "provider", provider: providerId });
|
||||
} catch {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
return entries;
|
||||
}
|
||||
|
||||
function trimOutput(text: string, maxChars?: number): string {
|
||||
const trimmed = text.trim();
|
||||
if (!maxChars || trimmed.length <= maxChars) return trimmed;
|
||||
@@ -561,7 +586,15 @@ export async function runCapability(params: {
|
||||
providerRegistry: params.providerRegistry,
|
||||
activeModel: params.activeModel,
|
||||
});
|
||||
if (entries.length === 0) {
|
||||
let resolvedEntries = entries;
|
||||
if (resolvedEntries.length === 0 && capability === "audio" && config?.enabled !== false) {
|
||||
resolvedEntries = await resolveAutoAudioEntries({
|
||||
cfg,
|
||||
agentDir: params.agentDir,
|
||||
providerRegistry: params.providerRegistry,
|
||||
});
|
||||
}
|
||||
if (resolvedEntries.length === 0) {
|
||||
return {
|
||||
outputs: [],
|
||||
decision: {
|
||||
@@ -583,7 +616,7 @@ export async function runCapability(params: {
|
||||
agentDir: params.agentDir,
|
||||
providerRegistry: params.providerRegistry,
|
||||
cache: params.attachments,
|
||||
entries,
|
||||
entries: resolvedEntries,
|
||||
config,
|
||||
});
|
||||
if (output) outputs.push(output);
|
||||
|
||||
Reference in New Issue
Block a user