diff --git a/src/media-understanding/apply.test.ts b/src/media-understanding/apply.test.ts index d2ce7141e..9967f9209 100644 --- a/src/media-understanding/apply.test.ts +++ b/src/media-understanding/apply.test.ts @@ -424,4 +424,64 @@ describe("applyMediaUnderstanding", () => { ["[Audio 1/2]\nTranscript:\nnote-a.ogg", "[Audio 2/2]\nTranscript:\nnote-b.ogg"].join("\n\n"), ); }); + + it("orders mixed media outputs as image, audio, video", async () => { + const { applyMediaUnderstanding } = await loadApply(); + const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-media-")); + const imagePath = path.join(dir, "photo.jpg"); + const audioPath = path.join(dir, "note.ogg"); + const videoPath = path.join(dir, "clip.mp4"); + await fs.writeFile(imagePath, "image-bytes"); + await fs.writeFile(audioPath, "audio-bytes"); + await fs.writeFile(videoPath, "video-bytes"); + + const ctx: MsgContext = { + Body: "", + MediaPaths: [imagePath, audioPath, videoPath], + MediaTypes: ["image/jpeg", "audio/ogg", "video/mp4"], + }; + const cfg: ClawdbotConfig = { + tools: { + media: { + image: { enabled: true, models: [{ provider: "openai", model: "gpt-5.2" }] }, + audio: { enabled: true, models: [{ provider: "groq" }] }, + video: { enabled: true, models: [{ provider: "google", model: "gemini-3" }] }, + }, + }, + }; + + const result = await applyMediaUnderstanding({ + ctx, + cfg, + agentDir: dir, + providers: { + openai: { + id: "openai", + describeImage: async () => ({ text: "image ok" }), + }, + groq: { + id: "groq", + transcribeAudio: async () => ({ text: "audio ok" }), + }, + google: { + id: "google", + describeVideo: async () => ({ text: "video ok" }), + }, + }, + }); + + expect(result.appliedImage).toBe(true); + expect(result.appliedAudio).toBe(true); + expect(result.appliedVideo).toBe(true); + expect(ctx.Body).toBe( + [ + "[Image]\nDescription:\nimage ok", + "[Audio]\nTranscript:\naudio ok", + "[Video]\nDescription:\nvideo ok", + ].join("\n\n"), + ); + expect(ctx.Transcript).toBe("audio ok"); + expect(ctx.CommandBody).toBe("audio ok"); + expect(ctx.BodyForCommands).toBe("audio ok"); + }); }); diff --git a/src/media-understanding/attachments.ts b/src/media-understanding/attachments.ts index fcc761b70..8250464a7 100644 --- a/src/media-understanding/attachments.ts +++ b/src/media-understanding/attachments.ts @@ -7,7 +7,7 @@ import { fileURLToPath } from "node:url"; import type { MsgContext } from "../auto-reply/templating.js"; import type { MediaUnderstandingAttachmentsConfig } from "../config/types.tools.js"; import { fetchRemoteMedia, MediaFetchError } from "../media/fetch.js"; -import { detectMime, getFileExtension, isAudioFileName } from "../media/mime.js"; +import { detectMime, getFileExtension, isAudioFileName, kindFromMime } from "../media/mime.js"; import { logVerbose, shouldLogVerbose } from "../globals.js"; import { fetchWithTimeout } from "./providers/shared.js"; import type { MediaAttachment, MediaUnderstandingCapability } from "./types.js"; @@ -100,23 +100,32 @@ export function normalizeAttachments(ctx: MsgContext): MediaAttachment[] { ]; } -export function isVideoAttachment(attachment: MediaAttachment): boolean { - if (attachment.mime?.startsWith("video/")) return true; +export function resolveAttachmentKind( + attachment: MediaAttachment, +): "image" | "audio" | "video" | "unknown" { + const kind = kindFromMime(attachment.mime); + if (kind !== "unknown") return kind; + const ext = getFileExtension(attachment.path ?? attachment.url); - if (!ext) return false; - return [".mp4", ".mov", ".mkv", ".webm", ".avi", ".m4v"].includes(ext); + if (!ext) return "unknown"; + if ([".mp4", ".mov", ".mkv", ".webm", ".avi", ".m4v"].includes(ext)) return "video"; + if (isAudioFileName(attachment.path ?? attachment.url)) return "audio"; + if ([".png", ".jpg", ".jpeg", ".webp", ".gif", ".bmp", ".tiff", ".tif"].includes(ext)) { + return "image"; + } + return "unknown"; +} + +export function isVideoAttachment(attachment: MediaAttachment): boolean { + return resolveAttachmentKind(attachment) === "video"; } export function isAudioAttachment(attachment: MediaAttachment): boolean { - if (attachment.mime?.startsWith("audio/")) return true; - return isAudioFileName(attachment.path ?? attachment.url); + return resolveAttachmentKind(attachment) === "audio"; } export function isImageAttachment(attachment: MediaAttachment): boolean { - if (attachment.mime?.startsWith("image/")) return true; - const ext = getFileExtension(attachment.path ?? attachment.url); - if (!ext) return false; - return [".png", ".jpg", ".jpeg", ".webp", ".gif", ".bmp", ".tiff", ".tif"].includes(ext); + return resolveAttachmentKind(attachment) === "image"; } function isAbortError(err: unknown): boolean {