feat: support audio/video/doc media caps and transcript context
This commit is contained in:
@@ -4,7 +4,7 @@ import { z } from "zod";
|
||||
// Preferred binary name for Claude CLI invocations.
|
||||
export const CLAUDE_BIN = "claude";
|
||||
export const CLAUDE_IDENTITY_PREFIX =
|
||||
"You are Clawd (Claude) running on the user's Mac via warelay. Your scratchpad is /Users/steipete/clawd; this is your folder and you can add what you like in markdown files and/or images. You don't need to be concise, but WhatsApp replies must stay under ~1500 characters.";
|
||||
"You are Clawd (Claude) running on the user's Mac via warelay. Your scratchpad is /Users/steipete/clawd; this is your folder and you can add what you like in markdown files and/or images. You don't need to be concise, but WhatsApp replies must stay under ~1500 characters. Media you can send: images ≤6MB, audio/video ≤16MB, documents ≤100MB. The prompt may include a media path and an optional Transcript: section—use them when present.";
|
||||
|
||||
function extractClaudeText(payload: unknown): string | undefined {
|
||||
// Best-effort walker to find the primary text field in Claude JSON outputs.
|
||||
|
||||
@@ -143,11 +143,11 @@ describe("config and templating", () => {
|
||||
killed: false,
|
||||
});
|
||||
|
||||
const result = await index.getReplyFromConfig(
|
||||
{
|
||||
Body: "<media:audio>",
|
||||
From: "+1",
|
||||
To: "+2",
|
||||
const result = await index.getReplyFromConfig(
|
||||
{
|
||||
Body: "<media:audio>",
|
||||
From: "+1",
|
||||
To: "+2",
|
||||
MediaPath: "/tmp/voice.ogg",
|
||||
MediaType: "audio/ogg",
|
||||
},
|
||||
@@ -156,15 +156,15 @@ describe("config and templating", () => {
|
||||
commandRunner,
|
||||
);
|
||||
|
||||
expect(runExec).toHaveBeenCalled();
|
||||
expect(commandRunner).toHaveBeenCalled();
|
||||
const argv = commandRunner.mock.calls[0][0];
|
||||
const prompt = argv[argv.length - 1] as string;
|
||||
expect(prompt).toContain("/tmp/voice.ogg");
|
||||
expect(prompt).toContain("Transcript:");
|
||||
expect(prompt).toContain("voice transcript");
|
||||
expect(result?.text).toBeUndefined();
|
||||
});
|
||||
expect(runExec).toHaveBeenCalled();
|
||||
expect(commandRunner).toHaveBeenCalled();
|
||||
const argv = commandRunner.mock.calls[0][0];
|
||||
const prompt = argv[argv.length - 1] as string;
|
||||
expect(prompt).toContain("/tmp/voice.ogg");
|
||||
expect(prompt).toContain("Transcript:");
|
||||
expect(prompt).toContain("voice transcript");
|
||||
expect(result?.text).toBe("ok");
|
||||
});
|
||||
|
||||
it("getReplyFromConfig skips transcription when not configured", async () => {
|
||||
const cfg = {
|
||||
|
||||
31
src/media/constants.ts
Normal file
31
src/media/constants.ts
Normal file
@@ -0,0 +1,31 @@
|
||||
/** Size cap for image media, in bytes (6 MiB). */
export const MAX_IMAGE_BYTES = 6 * 1024 * 1024;

/** Size cap for audio media, in bytes (16 MiB). */
export const MAX_AUDIO_BYTES = 16 * 1024 * 1024;

/** Size cap for video media, in bytes (16 MiB). */
export const MAX_VIDEO_BYTES = 16 * 1024 * 1024;

/** Size cap for document media, in bytes (100 MiB). */
export const MAX_DOCUMENT_BYTES = 100 * 1024 * 1024;
|
||||
|
||||
/** Coarse media category derived from a MIME type. */
export type MediaKind = "image" | "audio" | "video" | "document" | "unknown";

/**
 * Classify a MIME type string into a coarse {@link MediaKind}.
 *
 * Matching is done on the top-level type prefix, so values that carry
 * parameters (e.g. `audio/ogg; codecs=opus`) are classified by their prefix.
 *
 * @param mime - MIME type string; may be `undefined`, `null`, or empty.
 * @returns `"image"`, `"audio"`, or `"video"` for the matching top-level
 *   types, `"document"` for any `application/*` type (including
 *   `application/pdf`), and `"unknown"` for everything else or missing input.
 */
export function mediaKindFromMime(mime?: string | null): MediaKind {
  if (!mime) return "unknown";

  // MIME type names are case-insensitive, and header-derived values may carry
  // surrounding whitespace; normalise before prefix matching.
  const normalized = mime.trim().toLowerCase();

  if (normalized.startsWith("image/")) return "image";
  if (normalized.startsWith("audio/")) return "audio";
  if (normalized.startsWith("video/")) return "video";
  // Every application/* subtype, application/pdf included, is a document.
  if (normalized.startsWith("application/")) return "document";
  return "unknown";
}
|
||||
|
||||
/**
 * Look up the size cap, in bytes, for a given media kind.
 *
 * @param kind - Media category to look up.
 * @returns The matching `MAX_*_BYTES` constant. `"unknown"` (and any other
 *   value not listed explicitly) falls back to `MAX_DOCUMENT_BYTES`.
 */
export function maxBytesForKind(kind: MediaKind): number {
  switch (kind) {
    case "image":
      return MAX_IMAGE_BYTES;
    case "audio":
      return MAX_AUDIO_BYTES;
    case "video":
      return MAX_VIDEO_BYTES;
    case "document":
    default:
      // Kinds without a dedicated cap share the document cap.
      return MAX_DOCUMENT_BYTES;
  }
}
|
||||
@@ -719,8 +719,16 @@ describe("provider-web", () => {
|
||||
sendMedia,
|
||||
});
|
||||
|
||||
expect(sendMedia).not.toHaveBeenCalled();
|
||||
expect(reply).toHaveBeenCalledWith("hi");
|
||||
expect(sendMedia).toHaveBeenCalledTimes(1);
|
||||
const payload = sendMedia.mock.calls[0][0] as {
|
||||
document?: Buffer;
|
||||
caption?: string;
|
||||
fileName?: string;
|
||||
};
|
||||
expect(payload.document).toBeInstanceOf(Buffer);
|
||||
expect(payload.fileName).toBe("file.pdf");
|
||||
expect(payload.caption).toBe("hi");
|
||||
expect(reply).not.toHaveBeenCalled();
|
||||
|
||||
fetchMock.mockRestore();
|
||||
});
|
||||
|
||||
@@ -21,6 +21,7 @@ import { loadConfig } from "./config/config.js";
|
||||
import { danger, info, isVerbose, logVerbose, success } from "./globals.js";
|
||||
import { logInfo } from "./logger.js";
|
||||
import { getChildLogger } from "./logging.js";
|
||||
import { maxBytesForKind, mediaKindFromMime } from "./media/constants.js";
|
||||
import { saveMediaBuffer } from "./media/store.js";
|
||||
import { defaultRuntime, type RuntimeEnv } from "./runtime.js";
|
||||
import type { Provider } from "./utils.js";
|
||||
@@ -485,12 +486,39 @@ export async function monitorWebProvider(
|
||||
logVerbose(
|
||||
`Web auto-reply media size: ${(media.buffer.length / (1024 * 1024)).toFixed(2)}MB`,
|
||||
);
|
||||
logVerbose(
|
||||
`Web auto-reply media source: ${replyResult.mediaUrl} (kind ${media.kind})`,
|
||||
);
|
||||
}
|
||||
if (media.kind === "image") {
|
||||
await msg.sendMedia({
|
||||
image: media.buffer,
|
||||
caption: replyResult.text || undefined,
|
||||
mimetype: media.contentType,
|
||||
});
|
||||
} else if (media.kind === "audio") {
|
||||
await msg.sendMedia({
|
||||
audio: media.buffer,
|
||||
ptt: true,
|
||||
mimetype: media.contentType,
|
||||
caption: replyResult.text || undefined,
|
||||
} as AnyMessageContent);
|
||||
} else if (media.kind === "video") {
|
||||
await msg.sendMedia({
|
||||
video: media.buffer,
|
||||
caption: replyResult.text || undefined,
|
||||
mimetype: media.contentType,
|
||||
});
|
||||
} else {
|
||||
const fileName =
|
||||
replyResult.mediaUrl.split("/").pop() ?? "file";
|
||||
await msg.sendMedia({
|
||||
document: media.buffer,
|
||||
fileName,
|
||||
caption: replyResult.text || undefined,
|
||||
mimetype: media.contentType,
|
||||
} as AnyMessageContent);
|
||||
}
|
||||
await msg.sendMedia({
|
||||
image: media.buffer,
|
||||
caption: replyResult.text || undefined,
|
||||
mimetype: media.contentType,
|
||||
});
|
||||
logInfo(
|
||||
`✅ Sent web media reply to ${msg.from} (${(media.buffer.length / (1024 * 1024)).toFixed(2)}MB)`,
|
||||
runtime,
|
||||
@@ -502,6 +530,7 @@ export async function monitorWebProvider(
|
||||
text: replyResult.text ?? null,
|
||||
mediaUrl: replyResult.mediaUrl,
|
||||
mediaSizeBytes: media.buffer.length,
|
||||
mediaKind: media.kind,
|
||||
durationMs: Date.now() - replyStarted,
|
||||
},
|
||||
"auto-reply sent (media)",
|
||||
@@ -727,22 +756,21 @@ async function downloadInboundMedia(
|
||||
|
||||
async function loadWebMedia(
|
||||
mediaUrl: string,
|
||||
maxBytes: number = DEFAULT_WEB_MEDIA_BYTES,
|
||||
): Promise<{ buffer: Buffer; contentType?: string }> {
|
||||
// Hard cap to avoid Anthropic/WhatsApp 5MB image limit that triggers API 400s.
|
||||
maxBytes?: number,
|
||||
): Promise<{ buffer: Buffer; contentType?: string; kind: MediaKind }> {
|
||||
if (mediaUrl.startsWith("file://")) {
|
||||
mediaUrl = mediaUrl.replace("file://", "");
|
||||
}
|
||||
|
||||
const optimizeAndClamp = async (buffer: Buffer) => {
|
||||
const optimizeAndClampImage = async (buffer: Buffer, cap: number) => {
|
||||
const originalSize = buffer.length;
|
||||
const optimized = await optimizeImageToJpeg(buffer, maxBytes);
|
||||
const optimized = await optimizeImageToJpeg(buffer, cap);
|
||||
if (optimized.optimizedSize < originalSize && isVerbose()) {
|
||||
logVerbose(
|
||||
`Optimized media from ${(originalSize / (1024 * 1024)).toFixed(2)}MB to ${(optimized.optimizedSize / (1024 * 1024)).toFixed(2)}MB (side≤${optimized.resizeSide}px, q=${optimized.quality})`,
|
||||
);
|
||||
}
|
||||
if (optimized.buffer.length > maxBytes) {
|
||||
if (optimized.buffer.length > cap) {
|
||||
throw new Error(
|
||||
`Media could not be reduced below ${(maxBytes / (1024 * 1024)).toFixed(0)}MB (got ${(
|
||||
optimized.buffer.length / (1024 * 1024)
|
||||
@@ -752,6 +780,7 @@ async function loadWebMedia(
|
||||
return {
|
||||
buffer: optimized.buffer,
|
||||
contentType: "image/jpeg",
|
||||
kind: "image" as const,
|
||||
};
|
||||
};
|
||||
|
||||
@@ -761,11 +790,60 @@ async function loadWebMedia(
|
||||
throw new Error(`Failed to fetch media: HTTP ${res.status}`);
|
||||
}
|
||||
const array = Buffer.from(await res.arrayBuffer());
|
||||
return optimizeAndClamp(array);
|
||||
const contentType = res.headers.get("content-type");
|
||||
const kind = mediaKindFromMime(contentType);
|
||||
const cap = Math.min(
|
||||
maxBytes ?? maxBytesForKind(kind),
|
||||
maxBytesForKind(kind),
|
||||
);
|
||||
if (kind === "image") {
|
||||
return optimizeAndClampImage(array, cap);
|
||||
}
|
||||
if (array.length > cap) {
|
||||
throw new Error(
|
||||
`Media exceeds ${(cap / (1024 * 1024)).toFixed(0)}MB limit (got ${(
|
||||
array.length / (1024 * 1024)
|
||||
).toFixed(2)}MB)`,
|
||||
);
|
||||
}
|
||||
return { buffer: array, contentType: contentType ?? undefined, kind };
|
||||
}
|
||||
// Local path
|
||||
const data = await fs.readFile(mediaUrl);
|
||||
return optimizeAndClamp(data);
|
||||
const ext = path.extname(mediaUrl);
|
||||
const mime =
|
||||
(ext &&
|
||||
(
|
||||
{
|
||||
".jpg": "image/jpeg",
|
||||
".jpeg": "image/jpeg",
|
||||
".png": "image/png",
|
||||
".webp": "image/webp",
|
||||
".gif": "image/gif",
|
||||
".ogg": "audio/ogg",
|
||||
".opus": "audio/ogg",
|
||||
".mp3": "audio/mpeg",
|
||||
".mp4": "video/mp4",
|
||||
".pdf": "application/pdf",
|
||||
} as Record<string, string | undefined>
|
||||
)[ext.toLowerCase()]) ??
|
||||
undefined;
|
||||
const kind = mediaKindFromMime(mime);
|
||||
const cap = Math.min(
|
||||
maxBytes ?? maxBytesForKind(kind),
|
||||
maxBytesForKind(kind),
|
||||
);
|
||||
if (kind === "image") {
|
||||
return optimizeAndClampImage(data, cap);
|
||||
}
|
||||
if (data.length > cap) {
|
||||
throw new Error(
|
||||
`Media exceeds ${(cap / (1024 * 1024)).toFixed(0)}MB limit (got ${(
|
||||
data.length / (1024 * 1024)
|
||||
).toFixed(2)}MB)`,
|
||||
);
|
||||
}
|
||||
return { buffer: data, contentType: mime, kind };
|
||||
}
|
||||
|
||||
function getStatusCode(err: unknown) {
|
||||
|
||||
Reference in New Issue
Block a user