feat: auto-enable audio understanding when keys exist

2026-01-18 14:49:11 +00:00
parent 173bce34b0
commit c69947dff8
4 changed files with 177 additions and 10 deletions
--- a/src/media-understanding/runner.auto-audio.test.ts
+++ b/src/media-understanding/runner.auto-audio.test.ts
@@ -0,0 +1,114 @@
+import fs from "node:fs/promises";
+import os from "node:os";
+import path from "node:path";
+
+import { describe, expect, it } from "vitest";
+
+import type { ClawdbotConfig } from "../config/config.js";
+import type { MsgContext } from "../auto-reply/templating.js";
+import {
+  buildProviderRegistry,
+  createMediaAttachmentCache,
+  normalizeMediaAttachments,
+  runCapability,
+} from "./runner.js";
+
+describe("runCapability auto audio entries", () => { // audio capability runs where cfg lists no audio model entries
+  it("uses provider keys to auto-enable audio transcription", async () => { // cfg below holds only an openai apiKey
+    const tmpPath = path.join(os.tmpdir(), `clawdbot-auto-audio-${Date.now()}.wav`); // timestamped name under the OS temp dir
+    await fs.writeFile(tmpPath, Buffer.from("RIFF")); // placeholder payload: only the four ASCII bytes "RIFF"
+    const ctx: MsgContext = { MediaPath: tmpPath, MediaType: "audio/wav" };
+    const media = normalizeMediaAttachments(ctx);
+    const cache = createMediaAttachmentCache(media);
+
+    let seenModel: string | undefined; // set by the stub provider below when it is invoked
+    const providerRegistry = buildProviderRegistry({
+      openai: {
+        id: "openai",
+        capabilities: ["audio"],
+        transcribeAudio: async (req) => { // stub: records req.model and echoes it back with fixed text
+          seenModel = req.model;
+          return { text: "ok", model: req.model };
+        },
+      },
+    });
+
+    const cfg = {
+      models: {
+        providers: {
+          openai: {
+            apiKey: "test-key",
+            models: [], // no models are listed for the provider
+          },
+        },
+      },
+    } as unknown as ClawdbotConfig; // partial fixture; the double cast skips type checking of the remaining config shape
+
+    try {
+      const result = await runCapability({
+        capability: "audio",
+        cfg,
+        ctx,
+        attachments: cache,
+        media,
+        providerRegistry,
+      });
+      expect(result.outputs[0]?.text).toBe("ok"); // text returned by the stub provider
+      expect(seenModel).toBe("whisper-1"); // no model is named in cfg or ctx, so this comes from the runner (presumably its openai audio default)
+      expect(result.decision.outcome).toBe("success");
+    } finally { // cleanup runs whether or not the assertions above throw
+      await cache.cleanup();
+      await fs.unlink(tmpPath).catch(() => {}); // best-effort removal: unlink errors are ignored
+    }
+  });
+
+  it("skips auto audio when disabled", async () => { // same provider key as the first test, but audio is explicitly turned off
+    const tmpPath = path.join(os.tmpdir(), `clawdbot-auto-audio-${Date.now()}.wav`); // timestamped name under the OS temp dir
+    await fs.writeFile(tmpPath, Buffer.from("RIFF")); // placeholder payload: only the four ASCII bytes "RIFF"
+    const ctx: MsgContext = { MediaPath: tmpPath, MediaType: "audio/wav" };
+    const media = normalizeMediaAttachments(ctx);
+    const cache = createMediaAttachmentCache(media);
+
+    const providerRegistry = buildProviderRegistry({
+      openai: {
+        id: "openai",
+        capabilities: ["audio"],
+        transcribeAudio: async () => ({ text: "ok", model: "whisper-1" }), // fixed-result stub; the test checks outputs/outcome, not whether this was called
+      },
+    });
+
+    const cfg = {
+      models: {
+        providers: {
+          openai: {
+            apiKey: "test-key",
+            models: [],
+          },
+        },
+      },
+      tools: {
+        media: {
+          audio: {
+            enabled: false, // explicit opt-out; the only difference from the first test's cfg
+          },
+        },
+      },
+    } as unknown as ClawdbotConfig; // partial fixture; the double cast skips type checking of the remaining config shape
+
+    try {
+      const result = await runCapability({
+        capability: "audio",
+        cfg,
+        ctx,
+        attachments: cache,
+        media,
+        providerRegistry,
+      });
+      expect(result.outputs).toHaveLength(0); // nothing is produced for the attachment
+      expect(result.decision.outcome).toBe("disabled");
+    } finally { // cleanup runs whether or not the assertions above throw
+      await cache.cleanup();
+      await fs.unlink(tmpPath).catch(() => {}); // best-effort removal: unlink errors are ignored
+    }
+  });
+});