refactor: tune media understanding

2026-01-17 06:44:12 +00:00
parent 3dc4a96330
commit 5a1ff5b9e7
6 changed files with 285 additions and 36 deletions
--- a/src/media-understanding/apply.test.ts
+++ b/src/media-understanding/apply.test.ts
@@ -85,6 +85,50 @@ describe("applyMediaUnderstanding", () => {
    expect(ctx.BodyForCommands).toBe("transcribed text");
  });

+  it("keeps caption for command parsing when audio has user text", async () => {
+    const { applyMediaUnderstanding } = await loadApply();
+    const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-media-"));
+    const audioPath = path.join(dir, "note.ogg");
+    await fs.writeFile(audioPath, "hello");
+
+    const ctx: MsgContext = {
+      Body: "<media:audio> /capture status",
+      MediaPath: audioPath,
+      MediaType: "audio/ogg",
+    };
+    const cfg: ClawdbotConfig = {
+      tools: {
+        media: {
+          audio: {
+            enabled: true,
+            maxBytes: 1024 * 1024,
+            models: [{ provider: "groq" }],
+          },
+        },
+      },
+    };
+
+    const result = await applyMediaUnderstanding({
+      ctx,
+      cfg,
+      providers: {
+        groq: {
+          id: "groq",
+          transcribeAudio: async () => ({ text: "transcribed text" }),
+        },
+      },
+    });
+
+    expect(result.appliedAudio).toBe(true);
+    expect(ctx.Transcript).toBe("transcribed text");
+    expect(ctx.Body).toBe(
+      "[Audio]\nUser text:\n/capture status\nTranscript:\ntranscribed text",
+    );
+    expect(ctx.CommandBody).toBe("/capture status");
+    expect(ctx.RawBody).toBe("/capture status");
+    expect(ctx.BodyForCommands).toBe("/capture status");
+  });
+
  it("handles URL-only attachments for audio transcription", async () => {
    const { applyMediaUnderstanding } = await loadApply();
    const ctx: MsgContext = {
@@ -301,6 +345,43 @@ describe("applyMediaUnderstanding", () => {
    expect(ctx.Body).toBe("[Image]\nDescription:\nshared description");
  });

+  it("uses active model when enabled and models are missing", async () => {
+    const { applyMediaUnderstanding } = await loadApply();
+    const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-media-"));
+    const audioPath = path.join(dir, "fallback.ogg");
+    await fs.writeFile(audioPath, "hello");
+
+    const ctx: MsgContext = {
+      Body: "<media:audio>",
+      MediaPath: audioPath,
+      MediaType: "audio/ogg",
+    };
+    const cfg: ClawdbotConfig = {
+      tools: {
+        media: {
+          audio: {
+            enabled: true,
+          },
+        },
+      },
+    };
+
+    const result = await applyMediaUnderstanding({
+      ctx,
+      cfg,
+      activeModel: { provider: "groq", model: "whisper-large-v3" },
+      providers: {
+        groq: {
+          id: "groq",
+          transcribeAudio: async () => ({ text: "fallback transcript" }),
+        },
+      },
+    });
+
+    expect(result.appliedAudio).toBe(true);
+    expect(ctx.Transcript).toBe("fallback transcript");
+  });
+
  it("handles multiple audio attachments when attachment mode is all", async () => {
    const { applyMediaUnderstanding } = await loadApply();
    const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-media-"));