feat: support audio/video/doc media caps and transcript context

This commit is contained in:
Peter Steinberger
2025-11-25 23:21:35 +01:00
parent 5dced02a20
commit e0425ad3e1
5 changed files with 147 additions and 30 deletions

View File

@@ -4,7 +4,7 @@ import { z } from "zod";
// Preferred binary name for Claude CLI invocations.
export const CLAUDE_BIN = "claude";
export const CLAUDE_IDENTITY_PREFIX =
"You are Clawd (Claude) running on the user's Mac via warelay. Your scratchpad is /Users/steipete/clawd; this is your folder and you can add what you like in markdown files and/or images. You don't need to be concise, but WhatsApp replies must stay under ~1500 characters.";
"You are Clawd (Claude) running on the user's Mac via warelay. Your scratchpad is /Users/steipete/clawd; this is your folder and you can add what you like in markdown files and/or images. You don't need to be concise, but WhatsApp replies must stay under ~1500 characters. Media you can send: images ≤6MB, audio/video ≤16MB, documents ≤100MB. The prompt may include a media path and an optional Transcript: section—use them when present.";
function extractClaudeText(payload: unknown): string | undefined {
// Best-effort walker to find the primary text field in Claude JSON outputs.

View File

@@ -143,11 +143,11 @@ describe("config and templating", () => {
killed: false,
});
const result = await index.getReplyFromConfig(
{
Body: "<media:audio>",
From: "+1",
To: "+2",
const result = await index.getReplyFromConfig(
{
Body: "<media:audio>",
From: "+1",
To: "+2",
MediaPath: "/tmp/voice.ogg",
MediaType: "audio/ogg",
},
@@ -156,15 +156,15 @@ describe("config and templating", () => {
commandRunner,
);
expect(runExec).toHaveBeenCalled();
expect(commandRunner).toHaveBeenCalled();
const argv = commandRunner.mock.calls[0][0];
const prompt = argv[argv.length - 1] as string;
expect(prompt).toContain("/tmp/voice.ogg");
expect(prompt).toContain("Transcript:");
expect(prompt).toContain("voice transcript");
expect(result?.text).toBeUndefined();
});
expect(runExec).toHaveBeenCalled();
expect(commandRunner).toHaveBeenCalled();
const argv = commandRunner.mock.calls[0][0];
const prompt = argv[argv.length - 1] as string;
expect(prompt).toContain("/tmp/voice.ogg");
expect(prompt).toContain("Transcript:");
expect(prompt).toContain("voice transcript");
expect(result?.text).toBe("ok");
});
it("getReplyFromConfig skips transcription when not configured", async () => {
const cfg = {

31
src/media/constants.ts Normal file
View File

@@ -0,0 +1,31 @@
/** Bytes per megabyte (binary, 1024 * 1024); every cap below is a multiple of it. */
const BYTES_PER_MB = 1024 * 1024;

/** Upper bound for an image payload, in bytes (6MB). */
export const MAX_IMAGE_BYTES = 6 * BYTES_PER_MB;

/** Upper bound for an audio payload, in bytes (16MB). */
export const MAX_AUDIO_BYTES = 16 * BYTES_PER_MB;

/** Upper bound for a video payload, in bytes (16MB). */
export const MAX_VIDEO_BYTES = 16 * BYTES_PER_MB;

/** Upper bound for a document payload, in bytes (100MB). */
export const MAX_DOCUMENT_BYTES = 100 * BYTES_PER_MB;
/** Coarse media category used to pick a size cap and a send strategy. */
export type MediaKind = "image" | "audio" | "video" | "document" | "unknown";

/**
 * Classifies a MIME type (or a raw Content-Type header value) into a
 * {@link MediaKind}.
 *
 * Matching is done on the top-level type prefix, so header parameters such as
 * `; codecs=opus` or `; charset=utf-8` do not affect the result. The input is
 * trimmed and lower-cased first because media types are case-insensitive and
 * header values may carry surrounding whitespace.
 *
 * @param mime - MIME type or Content-Type value; may be missing.
 * @returns `"image"`, `"audio"` or `"video"` for the matching top-level type,
 *   `"document"` for any `application/*` type (including `application/pdf`),
 *   and `"unknown"` for everything else, including missing or blank input.
 */
export function mediaKindFromMime(mime?: string | null): MediaKind {
  const normalized = mime?.trim().toLowerCase();
  if (!normalized) return "unknown";
  if (normalized.startsWith("image/")) return "image";
  if (normalized.startsWith("audio/")) return "audio";
  if (normalized.startsWith("video/")) return "video";
  // Every application/* type, PDF included, is treated as a generic document.
  if (normalized.startsWith("application/")) return "document";
  return "unknown";
}
/**
 * Looks up the byte-size cap that applies to a given media kind.
 *
 * @param kind - Media category to look up.
 * @returns The image, audio or video cap for those kinds; the document cap for
 *   `"document"`, `"unknown"`, and any other value.
 */
export function maxBytesForKind(kind: MediaKind): number {
  if (kind === "image") return MAX_IMAGE_BYTES;
  if (kind === "audio") return MAX_AUDIO_BYTES;
  if (kind === "video") return MAX_VIDEO_BYTES;
  // "document" and every unclassified kind share the document cap.
  return MAX_DOCUMENT_BYTES;
}

View File

@@ -719,8 +719,16 @@ describe("provider-web", () => {
sendMedia,
});
expect(sendMedia).not.toHaveBeenCalled();
expect(reply).toHaveBeenCalledWith("hi");
expect(sendMedia).toHaveBeenCalledTimes(1);
const payload = sendMedia.mock.calls[0][0] as {
document?: Buffer;
caption?: string;
fileName?: string;
};
expect(payload.document).toBeInstanceOf(Buffer);
expect(payload.fileName).toBe("file.pdf");
expect(payload.caption).toBe("hi");
expect(reply).not.toHaveBeenCalled();
fetchMock.mockRestore();
});

View File

@@ -21,6 +21,7 @@ import { loadConfig } from "./config/config.js";
import { danger, info, isVerbose, logVerbose, success } from "./globals.js";
import { logInfo } from "./logger.js";
import { getChildLogger } from "./logging.js";
import { maxBytesForKind, mediaKindFromMime } from "./media/constants.js";
import { saveMediaBuffer } from "./media/store.js";
import { defaultRuntime, type RuntimeEnv } from "./runtime.js";
import type { Provider } from "./utils.js";
@@ -485,12 +486,39 @@ export async function monitorWebProvider(
logVerbose(
`Web auto-reply media size: ${(media.buffer.length / (1024 * 1024)).toFixed(2)}MB`,
);
logVerbose(
`Web auto-reply media source: ${replyResult.mediaUrl} (kind ${media.kind})`,
);
}
if (media.kind === "image") {
await msg.sendMedia({
image: media.buffer,
caption: replyResult.text || undefined,
mimetype: media.contentType,
});
} else if (media.kind === "audio") {
await msg.sendMedia({
audio: media.buffer,
ptt: true,
mimetype: media.contentType,
caption: replyResult.text || undefined,
} as AnyMessageContent);
} else if (media.kind === "video") {
await msg.sendMedia({
video: media.buffer,
caption: replyResult.text || undefined,
mimetype: media.contentType,
});
} else {
const fileName =
replyResult.mediaUrl.split("/").pop() ?? "file";
await msg.sendMedia({
document: media.buffer,
fileName,
caption: replyResult.text || undefined,
mimetype: media.contentType,
} as AnyMessageContent);
}
await msg.sendMedia({
image: media.buffer,
caption: replyResult.text || undefined,
mimetype: media.contentType,
});
logInfo(
`✅ Sent web media reply to ${msg.from} (${(media.buffer.length / (1024 * 1024)).toFixed(2)}MB)`,
runtime,
@@ -502,6 +530,7 @@ export async function monitorWebProvider(
text: replyResult.text ?? null,
mediaUrl: replyResult.mediaUrl,
mediaSizeBytes: media.buffer.length,
mediaKind: media.kind,
durationMs: Date.now() - replyStarted,
},
"auto-reply sent (media)",
@@ -727,22 +756,21 @@ async function downloadInboundMedia(
async function loadWebMedia(
mediaUrl: string,
maxBytes: number = DEFAULT_WEB_MEDIA_BYTES,
): Promise<{ buffer: Buffer; contentType?: string }> {
// Hard cap to avoid Anthropic/WhatsApp 5MB image limit that triggers API 400s.
maxBytes?: number,
): Promise<{ buffer: Buffer; contentType?: string; kind: MediaKind }> {
if (mediaUrl.startsWith("file://")) {
mediaUrl = mediaUrl.replace("file://", "");
}
const optimizeAndClamp = async (buffer: Buffer) => {
const optimizeAndClampImage = async (buffer: Buffer, cap: number) => {
const originalSize = buffer.length;
const optimized = await optimizeImageToJpeg(buffer, maxBytes);
const optimized = await optimizeImageToJpeg(buffer, cap);
if (optimized.optimizedSize < originalSize && isVerbose()) {
logVerbose(
`Optimized media from ${(originalSize / (1024 * 1024)).toFixed(2)}MB to ${(optimized.optimizedSize / (1024 * 1024)).toFixed(2)}MB (side≤${optimized.resizeSide}px, q=${optimized.quality})`,
);
}
if (optimized.buffer.length > maxBytes) {
if (optimized.buffer.length > cap) {
throw new Error(
`Media could not be reduced below ${(maxBytes / (1024 * 1024)).toFixed(0)}MB (got ${(
optimized.buffer.length / (1024 * 1024)
@@ -752,6 +780,7 @@ async function loadWebMedia(
return {
buffer: optimized.buffer,
contentType: "image/jpeg",
kind: "image" as const,
};
};
@@ -761,11 +790,60 @@ async function loadWebMedia(
throw new Error(`Failed to fetch media: HTTP ${res.status}`);
}
const array = Buffer.from(await res.arrayBuffer());
return optimizeAndClamp(array);
const contentType = res.headers.get("content-type");
const kind = mediaKindFromMime(contentType);
const cap = Math.min(
maxBytes ?? maxBytesForKind(kind),
maxBytesForKind(kind),
);
if (kind === "image") {
return optimizeAndClampImage(array, cap);
}
if (array.length > cap) {
throw new Error(
`Media exceeds ${(cap / (1024 * 1024)).toFixed(0)}MB limit (got ${(
array.length / (1024 * 1024)
).toFixed(2)}MB)`,
);
}
return { buffer: array, contentType: contentType ?? undefined, kind };
}
// Local path
const data = await fs.readFile(mediaUrl);
return optimizeAndClamp(data);
const ext = path.extname(mediaUrl);
const mime =
(ext &&
(
{
".jpg": "image/jpeg",
".jpeg": "image/jpeg",
".png": "image/png",
".webp": "image/webp",
".gif": "image/gif",
".ogg": "audio/ogg",
".opus": "audio/ogg",
".mp3": "audio/mpeg",
".mp4": "video/mp4",
".pdf": "application/pdf",
} as Record<string, string | undefined>
)[ext.toLowerCase()]) ??
undefined;
const kind = mediaKindFromMime(mime);
const cap = Math.min(
maxBytes ?? maxBytesForKind(kind),
maxBytesForKind(kind),
);
if (kind === "image") {
return optimizeAndClampImage(data, cap);
}
if (data.length > cap) {
throw new Error(
`Media exceeds ${(cap / (1024 * 1024)).toFixed(0)}MB limit (got ${(
data.length / (1024 * 1024)
).toFixed(2)}MB)`,
);
}
return { buffer: data, contentType: mime, kind };
}
function getStatusCode(err: unknown) {