feat: transcribe audio and surface transcript to prompts
This commit is contained in:
@@ -110,12 +110,15 @@ export async function getReplyFromConfig(
|
||||
started = true;
|
||||
await opts?.onReplyStart?.();
|
||||
};
|
||||
let transcribedText: string | undefined;
|
||||
|
||||
// Optional audio transcription before templating/session handling.
|
||||
if (cfg.inbound?.transcribeAudio && isAudio(ctx.MediaType)) {
|
||||
const transcribed = await transcribeInboundAudio(cfg, ctx, defaultRuntime);
|
||||
if (transcribed?.text) {
|
||||
transcribedText = transcribed.text;
|
||||
ctx.Body = transcribed.text;
|
||||
ctx.Transcript = transcribed.text;
|
||||
logVerbose("Replaced Body with audio transcript for reply flow");
|
||||
}
|
||||
}
|
||||
@@ -193,9 +196,15 @@ export async function getReplyFromConfig(
|
||||
const bodyPrefix = reply?.bodyPrefix
|
||||
? applyTemplate(reply.bodyPrefix, sessionCtx)
|
||||
: "";
|
||||
const prefixedBody = bodyPrefix
|
||||
const prefixedBodyBase = bodyPrefix
|
||||
? `${bodyPrefix}${sessionCtx.BodyStripped ?? sessionCtx.Body ?? ""}`
|
||||
: (sessionCtx.BodyStripped ?? sessionCtx.Body);
|
||||
const prefixedBody =
|
||||
transcribedText && reply?.mode === "command"
|
||||
? [prefixedBodyBase, `Transcript:\n${transcribedText}`]
|
||||
.filter(Boolean)
|
||||
.join("\n\n")
|
||||
: prefixedBodyBase;
|
||||
const mediaNote = ctx.MediaPath?.length
|
||||
? `[media attached: ${ctx.MediaPath}${ctx.MediaType ? ` (${ctx.MediaType})` : ""}${ctx.MediaUrl ? ` | ${ctx.MediaUrl}` : ""}]`
|
||||
: undefined;
|
||||
@@ -543,7 +552,7 @@ export async function autoReplyIfConfigured(
|
||||
}
|
||||
|
||||
/**
 * Reports whether a MIME type string denotes audio content.
 *
 * @param mediaType - MIME type to inspect; may be omitted, `null`, or empty.
 * @returns `true` only when `mediaType` is a non-empty string that starts
 *   with "audio"; `false` for every other input.
 */
function isAudio(mediaType?: string | null): boolean {
  // Optional chaining short-circuits to `undefined` for null/undefined input,
  // which Boolean() maps to false; an empty string simply fails startsWith.
  return Boolean(mediaType?.startsWith("audio"));
}
|
||||
|
||||
async function transcribeInboundAudio(
|
||||
@@ -596,9 +605,7 @@ async function transcribeInboundAudio(
|
||||
return undefined;
|
||||
} finally {
|
||||
if (tmpPath) {
|
||||
void fs
|
||||
.unlink(tmpPath)
|
||||
.catch(() => {});
|
||||
void fs.unlink(tmpPath).catch(() => {});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6,6 +6,7 @@ export type MsgContext = {
|
||||
MediaPath?: string;
|
||||
MediaUrl?: string;
|
||||
MediaType?: string;
|
||||
Transcript?: string;
|
||||
};
|
||||
|
||||
export type TemplateContext = MsgContext & {
|
||||
|
||||
@@ -5,8 +5,8 @@ import path from "node:path";
|
||||
import type { MessageInstance } from "twilio/lib/rest/api/v2010/account/message.js";
|
||||
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
|
||||
import { createMockTwilio } from "../test/mocks/twilio.js";
|
||||
import { withWhatsAppPrefix } from "./utils.js";
|
||||
import * as exec from "./process/exec.js";
|
||||
import { withWhatsAppPrefix } from "./utils.js";
|
||||
|
||||
// Twilio mock factory shared across tests
|
||||
vi.mock("twilio", () => {
|
||||
@@ -125,8 +125,8 @@ describe("config and templating", () => {
|
||||
command: ["echo", "voice transcript"],
|
||||
},
|
||||
reply: {
|
||||
mode: "text" as const,
|
||||
text: "{{Body}}",
|
||||
mode: "command" as const,
|
||||
command: ["echo", "{{Body}}"],
|
||||
},
|
||||
},
|
||||
};
|
||||
@@ -135,6 +135,13 @@ describe("config and templating", () => {
|
||||
stdout: "voice transcript\n",
|
||||
stderr: "",
|
||||
});
|
||||
const commandRunner = vi.fn().mockResolvedValue({
|
||||
stdout: "ok",
|
||||
stderr: "",
|
||||
code: 0,
|
||||
signal: null,
|
||||
killed: false,
|
||||
});
|
||||
|
||||
const result = await index.getReplyFromConfig(
|
||||
{
|
||||
@@ -146,11 +153,17 @@ describe("config and templating", () => {
|
||||
},
|
||||
undefined,
|
||||
cfg,
|
||||
commandRunner,
|
||||
);
|
||||
|
||||
expect(runExec).toHaveBeenCalled();
|
||||
expect(result?.text).toContain("voice transcript");
|
||||
expect(result?.text).toContain("/tmp/voice.ogg");
|
||||
expect(commandRunner).toHaveBeenCalled();
|
||||
const argv = commandRunner.mock.calls[0][0];
|
||||
const prompt = argv[argv.length - 1] as string;
|
||||
expect(prompt).toContain("/tmp/voice.ogg");
|
||||
expect(prompt).toContain("Transcript:");
|
||||
expect(prompt).toContain("voice transcript");
|
||||
expect(result?.text).toBeUndefined();
|
||||
});
|
||||
|
||||
it("getReplyFromConfig skips transcription when not configured", async () => {
|
||||
|
||||
@@ -505,104 +505,105 @@ describe("provider-web", () => {
|
||||
"compresses common formats to jpeg under the cap",
|
||||
{ timeout: 15_000 },
|
||||
async () => {
|
||||
const formats = [
|
||||
{
|
||||
name: "png",
|
||||
mime: "image/png",
|
||||
make: (buf: Buffer, opts: { width: number; height: number }) =>
|
||||
sharp(buf, {
|
||||
raw: { width: opts.width, height: opts.height, channels: 3 },
|
||||
})
|
||||
.png({ compressionLevel: 0 })
|
||||
.toBuffer(),
|
||||
},
|
||||
{
|
||||
name: "jpeg",
|
||||
mime: "image/jpeg",
|
||||
make: (buf: Buffer, opts: { width: number; height: number }) =>
|
||||
sharp(buf, {
|
||||
raw: { width: opts.width, height: opts.height, channels: 3 },
|
||||
})
|
||||
.jpeg({ quality: 100, chromaSubsampling: "4:4:4" })
|
||||
.toBuffer(),
|
||||
},
|
||||
{
|
||||
name: "webp",
|
||||
mime: "image/webp",
|
||||
make: (buf: Buffer, opts: { width: number; height: number }) =>
|
||||
sharp(buf, {
|
||||
raw: { width: opts.width, height: opts.height, channels: 3 },
|
||||
})
|
||||
.webp({ quality: 100 })
|
||||
.toBuffer(),
|
||||
},
|
||||
] as const;
|
||||
const formats = [
|
||||
{
|
||||
name: "png",
|
||||
mime: "image/png",
|
||||
make: (buf: Buffer, opts: { width: number; height: number }) =>
|
||||
sharp(buf, {
|
||||
raw: { width: opts.width, height: opts.height, channels: 3 },
|
||||
})
|
||||
.png({ compressionLevel: 0 })
|
||||
.toBuffer(),
|
||||
},
|
||||
{
|
||||
name: "jpeg",
|
||||
mime: "image/jpeg",
|
||||
make: (buf: Buffer, opts: { width: number; height: number }) =>
|
||||
sharp(buf, {
|
||||
raw: { width: opts.width, height: opts.height, channels: 3 },
|
||||
})
|
||||
.jpeg({ quality: 100, chromaSubsampling: "4:4:4" })
|
||||
.toBuffer(),
|
||||
},
|
||||
{
|
||||
name: "webp",
|
||||
mime: "image/webp",
|
||||
make: (buf: Buffer, opts: { width: number; height: number }) =>
|
||||
sharp(buf, {
|
||||
raw: { width: opts.width, height: opts.height, channels: 3 },
|
||||
})
|
||||
.webp({ quality: 100 })
|
||||
.toBuffer(),
|
||||
},
|
||||
] as const;
|
||||
|
||||
for (const fmt of formats) {
|
||||
// Force a small cap to ensure compression is exercised for every format.
|
||||
loadConfigMock = () => ({ inbound: { reply: { mediaMaxMb: 1 } } });
|
||||
const sendMedia = vi.fn();
|
||||
const reply = vi.fn().mockResolvedValue(undefined);
|
||||
const sendComposing = vi.fn();
|
||||
const resolver = vi.fn().mockResolvedValue({
|
||||
text: "hi",
|
||||
mediaUrl: `https://example.com/big.${fmt.name}`,
|
||||
});
|
||||
for (const fmt of formats) {
|
||||
// Force a small cap to ensure compression is exercised for every format.
|
||||
loadConfigMock = () => ({ inbound: { reply: { mediaMaxMb: 1 } } });
|
||||
const sendMedia = vi.fn();
|
||||
const reply = vi.fn().mockResolvedValue(undefined);
|
||||
const sendComposing = vi.fn();
|
||||
const resolver = vi.fn().mockResolvedValue({
|
||||
text: "hi",
|
||||
mediaUrl: `https://example.com/big.${fmt.name}`,
|
||||
});
|
||||
|
||||
let capturedOnMessage:
|
||||
| ((
|
||||
let capturedOnMessage:
|
||||
| ((
|
||||
msg: import("./provider-web.js").WebInboundMessage,
|
||||
) => Promise<void>)
|
||||
| undefined;
|
||||
const listenerFactory = async (opts: {
|
||||
onMessage: (
|
||||
msg: import("./provider-web.js").WebInboundMessage,
|
||||
) => Promise<void>)
|
||||
| undefined;
|
||||
const listenerFactory = async (opts: {
|
||||
onMessage: (
|
||||
msg: import("./provider-web.js").WebInboundMessage,
|
||||
) => Promise<void>;
|
||||
}) => {
|
||||
capturedOnMessage = opts.onMessage;
|
||||
return { close: vi.fn() };
|
||||
};
|
||||
) => Promise<void>;
|
||||
}) => {
|
||||
capturedOnMessage = opts.onMessage;
|
||||
return { close: vi.fn() };
|
||||
};
|
||||
|
||||
const width = 2000;
|
||||
const height = 2000;
|
||||
const raw = crypto.randomBytes(width * height * 3);
|
||||
const big = await fmt.make(raw, { width, height });
|
||||
expect(big.length).toBeGreaterThan(1 * 1024 * 1024);
|
||||
const width = 2000;
|
||||
const height = 2000;
|
||||
const raw = crypto.randomBytes(width * height * 3);
|
||||
const big = await fmt.make(raw, { width, height });
|
||||
expect(big.length).toBeGreaterThan(1 * 1024 * 1024);
|
||||
|
||||
const fetchMock = vi.spyOn(globalThis, "fetch").mockResolvedValue({
|
||||
ok: true,
|
||||
body: true,
|
||||
arrayBuffer: async () =>
|
||||
big.buffer.slice(big.byteOffset, big.byteOffset + big.byteLength),
|
||||
headers: { get: () => fmt.mime },
|
||||
status: 200,
|
||||
} as Response);
|
||||
const fetchMock = vi.spyOn(globalThis, "fetch").mockResolvedValue({
|
||||
ok: true,
|
||||
body: true,
|
||||
arrayBuffer: async () =>
|
||||
big.buffer.slice(big.byteOffset, big.byteOffset + big.byteLength),
|
||||
headers: { get: () => fmt.mime },
|
||||
status: 200,
|
||||
} as Response);
|
||||
|
||||
await monitorWebProvider(false, listenerFactory, false, resolver);
|
||||
expect(capturedOnMessage).toBeDefined();
|
||||
await monitorWebProvider(false, listenerFactory, false, resolver);
|
||||
expect(capturedOnMessage).toBeDefined();
|
||||
|
||||
await capturedOnMessage?.({
|
||||
body: "hello",
|
||||
from: "+1",
|
||||
to: "+2",
|
||||
id: `msg-${fmt.name}`,
|
||||
sendComposing,
|
||||
reply,
|
||||
sendMedia,
|
||||
});
|
||||
await capturedOnMessage?.({
|
||||
body: "hello",
|
||||
from: "+1",
|
||||
to: "+2",
|
||||
id: `msg-${fmt.name}`,
|
||||
sendComposing,
|
||||
reply,
|
||||
sendMedia,
|
||||
});
|
||||
|
||||
expect(sendMedia).toHaveBeenCalledTimes(1);
|
||||
const payload = sendMedia.mock.calls[0][0] as {
|
||||
image: Buffer;
|
||||
mimetype?: string;
|
||||
};
|
||||
expect(payload.image.length).toBeLessThanOrEqual(1 * 1024 * 1024);
|
||||
expect(payload.mimetype).toBe("image/jpeg");
|
||||
expect(reply).not.toHaveBeenCalled();
|
||||
expect(sendMedia).toHaveBeenCalledTimes(1);
|
||||
const payload = sendMedia.mock.calls[0][0] as {
|
||||
image: Buffer;
|
||||
mimetype?: string;
|
||||
};
|
||||
expect(payload.image.length).toBeLessThanOrEqual(1 * 1024 * 1024);
|
||||
expect(payload.mimetype).toBe("image/jpeg");
|
||||
expect(reply).not.toHaveBeenCalled();
|
||||
|
||||
fetchMock.mockRestore();
|
||||
}
|
||||
});
|
||||
fetchMock.mockRestore();
|
||||
}
|
||||
},
|
||||
);
|
||||
|
||||
it("honors mediaMaxMb from config", async () => {
|
||||
loadConfigMock = () => ({ inbound: { reply: { mediaMaxMb: 1 } } });
|
||||
|
||||
Reference in New Issue
Block a user