refactor: tune media understanding

This commit is contained in:
Peter Steinberger
2026-01-17 06:44:12 +00:00
parent 3dc4a96330
commit 5a1ff5b9e7
6 changed files with 285 additions and 36 deletions

View File

@@ -85,6 +85,50 @@ describe("applyMediaUnderstanding", () => {
expect(ctx.BodyForCommands).toBe("transcribed text");
});
// Audio attachment whose body carries user text after the media placeholder.
// The assertions below expect the body to be rewritten into an "[Audio]"
// block containing both the user text and the transcript, while every
// command-facing field holds only the user's caption.
it("keeps caption for command parsing when audio has user text", async () => {
  const { applyMediaUnderstanding } = await loadApply();

  // Stage a small file on disk to stand in for the audio attachment.
  const workDir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-media-"));
  const notePath = path.join(workDir, "note.ogg");
  await fs.writeFile(notePath, "hello");

  const message: MsgContext = {
    Body: "<media:audio> /capture status",
    MediaPath: notePath,
    MediaType: "audio/ogg",
  };

  const oneMiB = 1024 * 1024;
  const config: ClawdbotConfig = {
    tools: {
      media: {
        audio: { enabled: true, maxBytes: oneMiB, models: [{ provider: "groq" }] },
      },
    },
  };

  // Stub provider: always returns the same transcript.
  const transcribeAudio = async () => ({ text: "transcribed text" });

  const outcome = await applyMediaUnderstanding({
    ctx: message,
    cfg: config,
    providers: {
      groq: { id: "groq", transcribeAudio },
    },
  });

  expect(outcome.appliedAudio).toBe(true);
  expect(message.Transcript).toBe("transcribed text");
  expect(message.Body).toBe(
    "[Audio]\nUser text:\n/capture status\nTranscript:\ntranscribed text",
  );

  // The caption, not the transcript, is what the command fields must carry.
  const caption = "/capture status";
  expect(message.CommandBody).toBe(caption);
  expect(message.RawBody).toBe(caption);
  expect(message.BodyForCommands).toBe(caption);
});
it("handles URL-only attachments for audio transcription", async () => {
const { applyMediaUnderstanding } = await loadApply();
const ctx: MsgContext = {
@@ -301,6 +345,43 @@ describe("applyMediaUnderstanding", () => {
expect(ctx.Body).toBe("[Image]\nDescription:\nshared description");
});
// Audio understanding is enabled but the config lists no models; the call
// supplies an `activeModel` instead. The assertions expect transcription to
// be applied using the stubbed "groq" provider.
it("uses active model when enabled and models are missing", async () => {
  const { applyMediaUnderstanding } = await loadApply();

  // Stage a small file on disk to stand in for the audio attachment.
  const workDir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-media-"));
  const clipPath = path.join(workDir, "fallback.ogg");
  await fs.writeFile(clipPath, "hello");

  const message: MsgContext = {
    Body: "<media:audio>",
    MediaPath: clipPath,
    MediaType: "audio/ogg",
  };

  // Deliberately no `models` entry under `audio`.
  const config: ClawdbotConfig = {
    tools: { media: { audio: { enabled: true } } },
  };

  // Stub provider: always returns the same transcript.
  const transcribeAudio = async () => ({ text: "fallback transcript" });

  const outcome = await applyMediaUnderstanding({
    ctx: message,
    cfg: config,
    activeModel: { provider: "groq", model: "whisper-large-v3" },
    providers: {
      groq: { id: "groq", transcribeAudio },
    },
  });

  expect(outcome.appliedAudio).toBe(true);
  expect(message.Transcript).toBe("fallback transcript");
});
it("handles multiple audio attachments when attachment mode is all", async () => {
const { applyMediaUnderstanding } = await loadApply();
const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-media-"));