From e0425ad3e18515dc0929c6c244f87820aeb4cb1e Mon Sep 17 00:00:00 2001
From: Peter Steinberger
Date: Tue, 25 Nov 2025 23:21:35 +0100
Subject: [PATCH] feat: support audio/video/doc media caps and transcript context

---
Review notes (everything between "---" and the first "diff --git" is ignored
when the patch is applied):
- loadWebMedia: the "could not be reduced below" error now reports `cap`.
  `maxBytes` becomes optional in this patch, so the old expression printed
  "NaNMB" when the argument was omitted and fails strict null checks.
- src/provider-web.ts: import `type MediaKind`; loadWebMedia's new return type
  references it.
- Local-path MIME table: restored the type arguments on `Record`
  (assumed `<string, string | undefined>`; TODO confirm against the author's
  tree).
- Line breaks and indentation were reconstructed from a whitespace-collapsed
  copy (2-space indent assumed); apply with `git apply --ignore-whitespace`.
  Angle-bracketed text lost in that copy (author e-mail, possibly string
  contents such as the test's `Body`) was NOT restored; verify before using
  `git am`.

 src/auto-reply/claude.ts |   2 +-
 src/index.core.test.ts   |  28 +++++------
 src/media/constants.ts   |  31 ++++++++++++
 src/provider-web.test.ts |  12 ++++-
 src/provider-web.ts      | 106 ++++++++++++++++++++++++++++++++++-----
 5 files changed, 148 insertions(+), 31 deletions(-)
 create mode 100644 src/media/constants.ts

diff --git a/src/auto-reply/claude.ts b/src/auto-reply/claude.ts
index a7baf97ae..6cbb754c5 100644
--- a/src/auto-reply/claude.ts
+++ b/src/auto-reply/claude.ts
@@ -4,7 +4,7 @@ import { z } from "zod";
 // Preferred binary name for Claude CLI invocations.
 export const CLAUDE_BIN = "claude";
 export const CLAUDE_IDENTITY_PREFIX =
-  "You are Clawd (Claude) running on the user's Mac via warelay. Your scratchpad is /Users/steipete/clawd; this is your folder and you can add what you like in markdown files and/or images. You don't need to be concise, but WhatsApp replies must stay under ~1500 characters.";
+  "You are Clawd (Claude) running on the user's Mac via warelay. Your scratchpad is /Users/steipete/clawd; this is your folder and you can add what you like in markdown files and/or images. You don't need to be concise, but WhatsApp replies must stay under ~1500 characters. Media you can send: images ≤6MB, audio/video ≤16MB, documents ≤100MB. The prompt may include a media path and an optional Transcript: section—use them when present.";
 
 function extractClaudeText(payload: unknown): string | undefined {
   // Best-effort walker to find the primary text field in Claude JSON outputs.
diff --git a/src/index.core.test.ts b/src/index.core.test.ts
index 102a84ba6..2d6a68772 100644
--- a/src/index.core.test.ts
+++ b/src/index.core.test.ts
@@ -143,11 +143,11 @@ describe("config and templating", () => {
       killed: false,
     });
 
-    const result = await index.getReplyFromConfig(
-      {
-        Body: "",
-        From: "+1",
-        To: "+2",
+  const result = await index.getReplyFromConfig(
+    {
+      Body: "",
+      From: "+1",
+      To: "+2",
         MediaPath: "/tmp/voice.ogg",
         MediaType: "audio/ogg",
       },
@@ -156,15 +156,15 @@ describe("config and templating", () => {
       commandRunner,
     );
 
-    expect(runExec).toHaveBeenCalled();
-    expect(commandRunner).toHaveBeenCalled();
-    const argv = commandRunner.mock.calls[0][0];
-    const prompt = argv[argv.length - 1] as string;
-    expect(prompt).toContain("/tmp/voice.ogg");
-    expect(prompt).toContain("Transcript:");
-    expect(prompt).toContain("voice transcript");
-    expect(result?.text).toBeUndefined();
-  });
+  expect(runExec).toHaveBeenCalled();
+  expect(commandRunner).toHaveBeenCalled();
+  const argv = commandRunner.mock.calls[0][0];
+  const prompt = argv[argv.length - 1] as string;
+  expect(prompt).toContain("/tmp/voice.ogg");
+  expect(prompt).toContain("Transcript:");
+  expect(prompt).toContain("voice transcript");
+  expect(result?.text).toBe("ok");
+});
 
   it("getReplyFromConfig skips transcription when not configured", async () => {
     const cfg = {
diff --git a/src/media/constants.ts b/src/media/constants.ts
new file mode 100644
index 000000000..27aff351f
--- /dev/null
+++ b/src/media/constants.ts
@@ -0,0 +1,31 @@
+export const MAX_IMAGE_BYTES = 6 * 1024 * 1024; // 6MB
+export const MAX_AUDIO_BYTES = 16 * 1024 * 1024; // 16MB
+export const MAX_VIDEO_BYTES = 16 * 1024 * 1024; // 16MB
+export const MAX_DOCUMENT_BYTES = 100 * 1024 * 1024; // 100MB
+
+export type MediaKind = "image" | "audio" | "video" | "document" | "unknown";
+
+export function mediaKindFromMime(mime?: string | null): MediaKind {
+  if (!mime) return "unknown";
+  if (mime.startsWith("image/")) return "image";
+  if (mime.startsWith("audio/")) return "audio";
+  if (mime.startsWith("video/")) return "video";
+  if (mime === "application/pdf") return "document";
+  if (mime.startsWith("application/")) return "document";
+  return "unknown";
+}
+
+export function maxBytesForKind(kind: MediaKind): number {
+  switch (kind) {
+    case "image":
+      return MAX_IMAGE_BYTES;
+    case "audio":
+      return MAX_AUDIO_BYTES;
+    case "video":
+      return MAX_VIDEO_BYTES;
+    case "document":
+      return MAX_DOCUMENT_BYTES;
+    default:
+      return MAX_DOCUMENT_BYTES;
+  }
+}
diff --git a/src/provider-web.test.ts b/src/provider-web.test.ts
index 24163cb73..c06b309b0 100644
--- a/src/provider-web.test.ts
+++ b/src/provider-web.test.ts
@@ -719,8 +719,16 @@ describe("provider-web", () => {
       sendMedia,
     });
 
-    expect(sendMedia).not.toHaveBeenCalled();
-    expect(reply).toHaveBeenCalledWith("hi");
+    expect(sendMedia).toHaveBeenCalledTimes(1);
+    const payload = sendMedia.mock.calls[0][0] as {
+      document?: Buffer;
+      caption?: string;
+      fileName?: string;
+    };
+    expect(payload.document).toBeInstanceOf(Buffer);
+    expect(payload.fileName).toBe("file.pdf");
+    expect(payload.caption).toBe("hi");
+    expect(reply).not.toHaveBeenCalled();
 
     fetchMock.mockRestore();
   });
diff --git a/src/provider-web.ts b/src/provider-web.ts
index 5e5efd0e9..e00e68e14 100644
--- a/src/provider-web.ts
+++ b/src/provider-web.ts
@@ -21,6 +21,7 @@ import { loadConfig } from "./config/config.js";
 import { danger, info, isVerbose, logVerbose, success } from "./globals.js";
 import { logInfo } from "./logger.js";
 import { getChildLogger } from "./logging.js";
+import { type MediaKind, maxBytesForKind, mediaKindFromMime } from "./media/constants.js";
 import { saveMediaBuffer } from "./media/store.js";
 import { defaultRuntime, type RuntimeEnv } from "./runtime.js";
 import type { Provider } from "./utils.js";
@@ -485,12 +486,39 @@ export async function monitorWebProvider(
               logVerbose(
                 `Web auto-reply media size: ${(media.buffer.length / (1024 * 1024)).toFixed(2)}MB`,
               );
+              logVerbose(
+                `Web auto-reply media source: ${replyResult.mediaUrl} (kind ${media.kind})`,
+              );
+            }
+            if (media.kind === "image") {
+              await msg.sendMedia({
+                image: media.buffer,
+                caption: replyResult.text || undefined,
+                mimetype: media.contentType,
+              });
+            } else if (media.kind === "audio") {
+              await msg.sendMedia({
+                audio: media.buffer,
+                ptt: true,
+                mimetype: media.contentType,
+                caption: replyResult.text || undefined,
+              } as AnyMessageContent);
+            } else if (media.kind === "video") {
+              await msg.sendMedia({
+                video: media.buffer,
+                caption: replyResult.text || undefined,
+                mimetype: media.contentType,
+              });
+            } else {
+              const fileName =
+                replyResult.mediaUrl.split("/").pop() ?? "file";
+              await msg.sendMedia({
+                document: media.buffer,
+                fileName,
+                caption: replyResult.text || undefined,
+                mimetype: media.contentType,
+              } as AnyMessageContent);
             }
-            await msg.sendMedia({
-              image: media.buffer,
-              caption: replyResult.text || undefined,
-              mimetype: media.contentType,
-            });
             logInfo(
               `✅ Sent web media reply to ${msg.from} (${(media.buffer.length / (1024 * 1024)).toFixed(2)}MB)`,
               runtime,
@@ -502,6 +530,7 @@ export async function monitorWebProvider(
                 text: replyResult.text ?? null,
                 mediaUrl: replyResult.mediaUrl,
                 mediaSizeBytes: media.buffer.length,
+                mediaKind: media.kind,
                 durationMs: Date.now() - replyStarted,
               },
               "auto-reply sent (media)",
@@ -727,22 +756,21 @@ async function downloadInboundMedia(
 
 async function loadWebMedia(
   mediaUrl: string,
-  maxBytes: number = DEFAULT_WEB_MEDIA_BYTES,
-): Promise<{ buffer: Buffer; contentType?: string }> {
-  // Hard cap to avoid Anthropic/WhatsApp 5MB image limit that triggers API 400s.
+  maxBytes?: number,
+): Promise<{ buffer: Buffer; contentType?: string; kind: MediaKind }> {
   if (mediaUrl.startsWith("file://")) {
     mediaUrl = mediaUrl.replace("file://", "");
   }
 
-  const optimizeAndClamp = async (buffer: Buffer) => {
+  const optimizeAndClampImage = async (buffer: Buffer, cap: number) => {
     const originalSize = buffer.length;
-    const optimized = await optimizeImageToJpeg(buffer, maxBytes);
+    const optimized = await optimizeImageToJpeg(buffer, cap);
     if (optimized.optimizedSize < originalSize && isVerbose()) {
       logVerbose(
         `Optimized media from ${(originalSize / (1024 * 1024)).toFixed(2)}MB to ${(optimized.optimizedSize / (1024 * 1024)).toFixed(2)}MB (side≤${optimized.resizeSide}px, q=${optimized.quality})`,
       );
     }
-    if (optimized.buffer.length > maxBytes) {
+    if (optimized.buffer.length > cap) {
       throw new Error(
-        `Media could not be reduced below ${(maxBytes / (1024 * 1024)).toFixed(0)}MB (got ${(
+        `Media could not be reduced below ${(cap / (1024 * 1024)).toFixed(0)}MB (got ${(
           optimized.buffer.length / (1024 * 1024)
@@ -752,6 +780,7 @@ async function loadWebMedia(
     return {
       buffer: optimized.buffer,
       contentType: "image/jpeg",
+      kind: "image" as const,
     };
   };
 
@@ -761,11 +790,60 @@ async function loadWebMedia(
       throw new Error(`Failed to fetch media: HTTP ${res.status}`);
     }
     const array = Buffer.from(await res.arrayBuffer());
-    return optimizeAndClamp(array);
+    const contentType = res.headers.get("content-type");
+    const kind = mediaKindFromMime(contentType);
+    const cap = Math.min(
+      maxBytes ?? maxBytesForKind(kind),
+      maxBytesForKind(kind),
+    );
+    if (kind === "image") {
+      return optimizeAndClampImage(array, cap);
+    }
+    if (array.length > cap) {
+      throw new Error(
+        `Media exceeds ${(cap / (1024 * 1024)).toFixed(0)}MB limit (got ${(
+          array.length / (1024 * 1024)
+        ).toFixed(2)}MB)`,
+      );
+    }
+    return { buffer: array, contentType: contentType ?? undefined, kind };
   }
   // Local path
   const data = await fs.readFile(mediaUrl);
-  return optimizeAndClamp(data);
+  const ext = path.extname(mediaUrl);
+  const mime =
+    (ext &&
+      (
+        {
+          ".jpg": "image/jpeg",
+          ".jpeg": "image/jpeg",
+          ".png": "image/png",
+          ".webp": "image/webp",
+          ".gif": "image/gif",
+          ".ogg": "audio/ogg",
+          ".opus": "audio/ogg",
+          ".mp3": "audio/mpeg",
+          ".mp4": "video/mp4",
+          ".pdf": "application/pdf",
+        } as Record<string, string | undefined>
+      )[ext.toLowerCase()]) ??
+    undefined;
+  const kind = mediaKindFromMime(mime);
+  const cap = Math.min(
+    maxBytes ?? maxBytesForKind(kind),
+    maxBytesForKind(kind),
+  );
+  if (kind === "image") {
+    return optimizeAndClampImage(data, cap);
+  }
+  if (data.length > cap) {
+    throw new Error(
+      `Media exceeds ${(cap / (1024 * 1024)).toFixed(0)}MB limit (got ${(
+        data.length / (1024 * 1024)
+      ).toFixed(2)}MB)`,
+    );
+  }
+  return { buffer: data, contentType: mime, kind };
 }
 
 function getStatusCode(err: unknown) {