refactor: normalize media attachment selection

This commit is contained in:
Peter Steinberger
2026-01-17 07:38:06 +00:00
parent 68c7d577a4
commit 6d969fe58e
2 changed files with 80 additions and 11 deletions

View File

@@ -424,4 +424,64 @@ describe("applyMediaUnderstanding", () => {
["[Audio 1/2]\nTranscript:\nnote-a.ogg", "[Audio 2/2]\nTranscript:\nnote-b.ogg"].join("\n\n"),
);
});
// Verifies that when one message carries an image, an audio clip and a video,
// the rewritten Body lists the sections in image -> audio -> video order and
// the transcript-derived fields come from the audio result.
it("orders mixed media outputs as image, audio, video", async () => {
  const { applyMediaUnderstanding } = await loadApply();
  // Scratch directory holding one placeholder file per media kind; it is also
  // handed to applyMediaUnderstanding as agentDir.
  const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-media-"));
  try {
    const imagePath = path.join(dir, "photo.jpg");
    const audioPath = path.join(dir, "note.ogg");
    const videoPath = path.join(dir, "clip.mp4");
    await fs.writeFile(imagePath, "image-bytes");
    await fs.writeFile(audioPath, "audio-bytes");
    await fs.writeFile(videoPath, "video-bytes");

    const ctx: MsgContext = {
      Body: "<media:mixed>",
      MediaPaths: [imagePath, audioPath, videoPath],
      MediaTypes: ["image/jpeg", "audio/ogg", "video/mp4"],
    };
    const cfg: ClawdbotConfig = {
      tools: {
        media: {
          image: { enabled: true, models: [{ provider: "openai", model: "gpt-5.2" }] },
          audio: { enabled: true, models: [{ provider: "groq" }] },
          video: { enabled: true, models: [{ provider: "google", model: "gemini-3" }] },
        },
      },
    };

    // Each stub provider implements only the capability configured for it above.
    const result = await applyMediaUnderstanding({
      ctx,
      cfg,
      agentDir: dir,
      providers: {
        openai: {
          id: "openai",
          describeImage: async () => ({ text: "image ok" }),
        },
        groq: {
          id: "groq",
          transcribeAudio: async () => ({ text: "audio ok" }),
        },
        google: {
          id: "google",
          describeVideo: async () => ({ text: "video ok" }),
        },
      },
    });

    expect(result.appliedImage).toBe(true);
    expect(result.appliedAudio).toBe(true);
    expect(result.appliedVideo).toBe(true);
    expect(ctx.Body).toBe(
      [
        "[Image]\nDescription:\nimage ok",
        "[Audio]\nTranscript:\naudio ok",
        "[Video]\nDescription:\nvideo ok",
      ].join("\n\n"),
    );
    expect(ctx.Transcript).toBe("audio ok");
    expect(ctx.CommandBody).toBe("audio ok");
    expect(ctx.BodyForCommands).toBe("audio ok");
  } finally {
    // Remove the scratch directory even when an assertion above throws, so
    // repeated runs do not accumulate temp directories.
    await fs.rm(dir, { recursive: true, force: true });
  }
});
});

View File

@@ -7,7 +7,7 @@ import { fileURLToPath } from "node:url";
import type { MsgContext } from "../auto-reply/templating.js";
import type { MediaUnderstandingAttachmentsConfig } from "../config/types.tools.js";
import { fetchRemoteMedia, MediaFetchError } from "../media/fetch.js";
import { detectMime, getFileExtension, isAudioFileName } from "../media/mime.js";
import { detectMime, getFileExtension, isAudioFileName, kindFromMime } from "../media/mime.js";
import { logVerbose, shouldLogVerbose } from "../globals.js";
import { fetchWithTimeout } from "./providers/shared.js";
import type { MediaAttachment, MediaUnderstandingCapability } from "./types.js";
@@ -100,23 +100,32 @@ export function normalizeAttachments(ctx: MsgContext): MediaAttachment[] {
];
}
/**
 * Classifies an attachment as image, audio, video, or unknown.
 *
 * The MIME type is consulted first (via `kindFromMime`). When it does not
 * yield one of the three media kinds, the file extension of `path` (or `url`
 * when `path` is absent) is used as a fallback.
 *
 * @param attachment - The attachment whose `mime`, `path` and `url` are inspected.
 * @returns The resolved kind, or `"unknown"` when neither the MIME type nor the
 *   extension identifies a supported media kind.
 */
export function resolveAttachmentKind(
  attachment: MediaAttachment,
): "image" | "audio" | "video" | "unknown" {
  const kind = kindFromMime(attachment.mime);
  // Accept only the kinds this function is declared to return; anything else
  // falls through to extension sniffing.
  // NOTE(review): kindFromMime is defined elsewhere — confirm its return union.
  if (kind === "image" || kind === "audio" || kind === "video") return kind;

  const source = attachment.path ?? attachment.url;
  const ext = getFileExtension(source);
  if (!ext) return "unknown";

  // Video extensions are checked before audio, then image.
  const videoExtensions = [".mp4", ".mov", ".mkv", ".webm", ".avi", ".m4v"];
  if (videoExtensions.includes(ext)) return "video";
  if (isAudioFileName(source)) return "audio";
  const imageExtensions = [".png", ".jpg", ".jpeg", ".webp", ".gif", ".bmp", ".tiff", ".tif"];
  if (imageExtensions.includes(ext)) return "image";
  return "unknown";
}
/**
 * Reports whether the attachment resolves to the `"video"` kind.
 *
 * @param attachment - The attachment to classify.
 * @returns `true` when `resolveAttachmentKind` yields `"video"`.
 */
export function isVideoAttachment(attachment: MediaAttachment): boolean {
  const resolvedKind = resolveAttachmentKind(attachment);
  return resolvedKind === "video";
}
/**
 * Reports whether the attachment resolves to the `"audio"` kind.
 *
 * Delegates to `resolveAttachmentKind` so audio detection follows the same
 * rules as the video and image predicates.
 *
 * @param attachment - The attachment to classify.
 * @returns `true` when `resolveAttachmentKind` yields `"audio"`.
 */
export function isAudioAttachment(attachment: MediaAttachment): boolean {
  return resolveAttachmentKind(attachment) === "audio";
}
/**
 * Reports whether the attachment resolves to the `"image"` kind.
 *
 * Delegates to `resolveAttachmentKind` so image detection follows the same
 * rules as the video and audio predicates.
 *
 * @param attachment - The attachment to classify.
 * @returns `true` when `resolveAttachmentKind` yields `"image"`.
 */
export function isImageAttachment(attachment: MediaAttachment): boolean {
  return resolveAttachmentKind(attachment) === "image";
}
function isAbortError(err: unknown): boolean {