feat: transcribe audio and surface transcript to prompts

This commit is contained in:
Peter Steinberger
2025-11-25 23:13:22 +01:00
parent 7d0ae151e8
commit e642f128ae
6 changed files with 169 additions and 100 deletions

47
docs/audio.md Normal file
View File

@@ -0,0 +1,47 @@
# Audio / Voice Notes — 2025-11-25
## What works
- **Optional transcription**: If `inbound.transcribeAudio.command` is set in `~/.warelay/warelay.json`, warelay will:
1) Download inbound audio (Web or Twilio) to a temp path if only a URL is present.
2) Run the configured CLI (templated with `{{MediaPath}}`), expecting transcript on stdout.
3) Replace `Body` with the transcript, set `{{Transcript}}`, and prepend the original media path plus a `Transcript:` section in the command prompt so models see both.
4) Continue through the normal auto-reply pipeline (templating, sessions, Claude/command).
- **Verbose logging**: In `--verbose`, we log when transcription runs and when the transcript replaces the body.
## Config example (OpenAI Whisper CLI)
Requires `OPENAI_API_KEY` in env and `openai` CLI installed:
```json5
{
inbound: {
transcribeAudio: {
command: [
"openai",
"api",
"audio.transcriptions.create",
"-m",
"whisper-1",
"-f",
"{{MediaPath}}",
"--response-format",
"text"
],
timeoutSeconds: 45
},
reply: {
mode: "command",
command: ["claude", "{{Body}}"]
}
}
}
```
## Notes & limits
- We don't ship a transcriber; you opt in with any CLI that prints text to stdout (Whisper cloud, whisper.cpp, vosk, Deepgram, etc.).
- Size guard: inbound audio must be ≤5MB (same as other media).
- If transcription fails, we fall back to the original body/media note; replies still go through.
- Transcript is available to templates as `{{Transcript}}`; models get both the media path and a `Transcript:` block in the prompt when using command mode.
## Gotchas
- Ensure your CLI exits 0 and prints plain text; JSON output needs to be massaged into plain text, e.g. via `jq -r .text`.
- Keep timeouts reasonable (`timeoutSeconds`, default 45s) to avoid blocking the reply queue.
- Twilio media paths are hosted URLs, while Web media paths are local. When only a URL is available, the temp download fetches Twilio media over HTTPS; Web media is written to a temp file.

View File

@@ -57,7 +57,7 @@ This document defines how `warelay` should handle sending and replying with imag
- `{{MediaUrl}}` original URL (Twilio) or pseudo-URL (web).
- `{{MediaPath}}` local temp path written before running the command.
- Size guard: only download if ≤5MB; else skip and log.
- Audio/voice notes: if you set `inbound.transcribeAudio.command`, warelay will run that CLI (templated with `{{MediaPath}}`) and replace `Body` with the transcript before continuing the reply flow; verbose logs indicate when transcription runs.
- Audio/voice notes: if you set `inbound.transcribeAudio.command`, warelay will run that CLI (templated with `{{MediaPath}}`) and replace `Body` with the transcript before continuing the reply flow; verbose logs indicate when transcription runs. The command prompt includes the original media path plus a `Transcript:` section so the model sees both.
## Errors & Messaging
- Local path with twilio + Funnel disabled → error: “Twilio media needs a public URL; start `warelay webhook --ingress tailscale` or pass an https:// URL.”

View File

@@ -110,12 +110,15 @@ export async function getReplyFromConfig(
started = true;
await opts?.onReplyStart?.();
};
let transcribedText: string | undefined;
// Optional audio transcription before templating/session handling.
if (cfg.inbound?.transcribeAudio && isAudio(ctx.MediaType)) {
const transcribed = await transcribeInboundAudio(cfg, ctx, defaultRuntime);
if (transcribed?.text) {
transcribedText = transcribed.text;
ctx.Body = transcribed.text;
ctx.Transcript = transcribed.text;
logVerbose("Replaced Body with audio transcript for reply flow");
}
}
@@ -193,9 +196,15 @@ export async function getReplyFromConfig(
const bodyPrefix = reply?.bodyPrefix
? applyTemplate(reply.bodyPrefix, sessionCtx)
: "";
const prefixedBody = bodyPrefix
const prefixedBodyBase = bodyPrefix
? `${bodyPrefix}${sessionCtx.BodyStripped ?? sessionCtx.Body ?? ""}`
: (sessionCtx.BodyStripped ?? sessionCtx.Body);
const prefixedBody =
transcribedText && reply?.mode === "command"
? [prefixedBodyBase, `Transcript:\n${transcribedText}`]
.filter(Boolean)
.join("\n\n")
: prefixedBodyBase;
const mediaNote = ctx.MediaPath?.length
? `[media attached: ${ctx.MediaPath}${ctx.MediaType ? ` (${ctx.MediaType})` : ""}${ctx.MediaUrl ? ` | ${ctx.MediaUrl}` : ""}]`
: undefined;
@@ -543,7 +552,7 @@ export async function autoReplyIfConfigured(
}
function isAudio(mediaType?: string | null) {
return Boolean(mediaType && mediaType.startsWith("audio"));
return Boolean(mediaType?.startsWith("audio"));
}
async function transcribeInboundAudio(
@@ -596,9 +605,7 @@ async function transcribeInboundAudio(
return undefined;
} finally {
if (tmpPath) {
void fs
.unlink(tmpPath)
.catch(() => {});
void fs.unlink(tmpPath).catch(() => {});
}
}
}

View File

@@ -6,6 +6,7 @@ export type MsgContext = {
MediaPath?: string;
MediaUrl?: string;
MediaType?: string;
Transcript?: string;
};
export type TemplateContext = MsgContext & {

View File

@@ -5,8 +5,8 @@ import path from "node:path";
import type { MessageInstance } from "twilio/lib/rest/api/v2010/account/message.js";
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
import { createMockTwilio } from "../test/mocks/twilio.js";
import { withWhatsAppPrefix } from "./utils.js";
import * as exec from "./process/exec.js";
import { withWhatsAppPrefix } from "./utils.js";
// Twilio mock factory shared across tests
vi.mock("twilio", () => {
@@ -125,8 +125,8 @@ describe("config and templating", () => {
command: ["echo", "voice transcript"],
},
reply: {
mode: "text" as const,
text: "{{Body}}",
mode: "command" as const,
command: ["echo", "{{Body}}"],
},
},
};
@@ -135,6 +135,13 @@ describe("config and templating", () => {
stdout: "voice transcript\n",
stderr: "",
});
const commandRunner = vi.fn().mockResolvedValue({
stdout: "ok",
stderr: "",
code: 0,
signal: null,
killed: false,
});
const result = await index.getReplyFromConfig(
{
@@ -146,11 +153,17 @@ describe("config and templating", () => {
},
undefined,
cfg,
commandRunner,
);
expect(runExec).toHaveBeenCalled();
expect(result?.text).toContain("voice transcript");
expect(result?.text).toContain("/tmp/voice.ogg");
expect(commandRunner).toHaveBeenCalled();
const argv = commandRunner.mock.calls[0][0];
const prompt = argv[argv.length - 1] as string;
expect(prompt).toContain("/tmp/voice.ogg");
expect(prompt).toContain("Transcript:");
expect(prompt).toContain("voice transcript");
expect(result?.text).toBeUndefined();
});
it("getReplyFromConfig skips transcription when not configured", async () => {

View File

@@ -505,104 +505,105 @@ describe("provider-web", () => {
"compresses common formats to jpeg under the cap",
{ timeout: 15_000 },
async () => {
const formats = [
{
name: "png",
mime: "image/png",
make: (buf: Buffer, opts: { width: number; height: number }) =>
sharp(buf, {
raw: { width: opts.width, height: opts.height, channels: 3 },
})
.png({ compressionLevel: 0 })
.toBuffer(),
},
{
name: "jpeg",
mime: "image/jpeg",
make: (buf: Buffer, opts: { width: number; height: number }) =>
sharp(buf, {
raw: { width: opts.width, height: opts.height, channels: 3 },
})
.jpeg({ quality: 100, chromaSubsampling: "4:4:4" })
.toBuffer(),
},
{
name: "webp",
mime: "image/webp",
make: (buf: Buffer, opts: { width: number; height: number }) =>
sharp(buf, {
raw: { width: opts.width, height: opts.height, channels: 3 },
})
.webp({ quality: 100 })
.toBuffer(),
},
] as const;
const formats = [
{
name: "png",
mime: "image/png",
make: (buf: Buffer, opts: { width: number; height: number }) =>
sharp(buf, {
raw: { width: opts.width, height: opts.height, channels: 3 },
})
.png({ compressionLevel: 0 })
.toBuffer(),
},
{
name: "jpeg",
mime: "image/jpeg",
make: (buf: Buffer, opts: { width: number; height: number }) =>
sharp(buf, {
raw: { width: opts.width, height: opts.height, channels: 3 },
})
.jpeg({ quality: 100, chromaSubsampling: "4:4:4" })
.toBuffer(),
},
{
name: "webp",
mime: "image/webp",
make: (buf: Buffer, opts: { width: number; height: number }) =>
sharp(buf, {
raw: { width: opts.width, height: opts.height, channels: 3 },
})
.webp({ quality: 100 })
.toBuffer(),
},
] as const;
for (const fmt of formats) {
// Force a small cap to ensure compression is exercised for every format.
loadConfigMock = () => ({ inbound: { reply: { mediaMaxMb: 1 } } });
const sendMedia = vi.fn();
const reply = vi.fn().mockResolvedValue(undefined);
const sendComposing = vi.fn();
const resolver = vi.fn().mockResolvedValue({
text: "hi",
mediaUrl: `https://example.com/big.${fmt.name}`,
});
for (const fmt of formats) {
// Force a small cap to ensure compression is exercised for every format.
loadConfigMock = () => ({ inbound: { reply: { mediaMaxMb: 1 } } });
const sendMedia = vi.fn();
const reply = vi.fn().mockResolvedValue(undefined);
const sendComposing = vi.fn();
const resolver = vi.fn().mockResolvedValue({
text: "hi",
mediaUrl: `https://example.com/big.${fmt.name}`,
});
let capturedOnMessage:
| ((
let capturedOnMessage:
| ((
msg: import("./provider-web.js").WebInboundMessage,
) => Promise<void>)
| undefined;
const listenerFactory = async (opts: {
onMessage: (
msg: import("./provider-web.js").WebInboundMessage,
) => Promise<void>)
| undefined;
const listenerFactory = async (opts: {
onMessage: (
msg: import("./provider-web.js").WebInboundMessage,
) => Promise<void>;
}) => {
capturedOnMessage = opts.onMessage;
return { close: vi.fn() };
};
) => Promise<void>;
}) => {
capturedOnMessage = opts.onMessage;
return { close: vi.fn() };
};
const width = 2000;
const height = 2000;
const raw = crypto.randomBytes(width * height * 3);
const big = await fmt.make(raw, { width, height });
expect(big.length).toBeGreaterThan(1 * 1024 * 1024);
const width = 2000;
const height = 2000;
const raw = crypto.randomBytes(width * height * 3);
const big = await fmt.make(raw, { width, height });
expect(big.length).toBeGreaterThan(1 * 1024 * 1024);
const fetchMock = vi.spyOn(globalThis, "fetch").mockResolvedValue({
ok: true,
body: true,
arrayBuffer: async () =>
big.buffer.slice(big.byteOffset, big.byteOffset + big.byteLength),
headers: { get: () => fmt.mime },
status: 200,
} as Response);
const fetchMock = vi.spyOn(globalThis, "fetch").mockResolvedValue({
ok: true,
body: true,
arrayBuffer: async () =>
big.buffer.slice(big.byteOffset, big.byteOffset + big.byteLength),
headers: { get: () => fmt.mime },
status: 200,
} as Response);
await monitorWebProvider(false, listenerFactory, false, resolver);
expect(capturedOnMessage).toBeDefined();
await monitorWebProvider(false, listenerFactory, false, resolver);
expect(capturedOnMessage).toBeDefined();
await capturedOnMessage?.({
body: "hello",
from: "+1",
to: "+2",
id: `msg-${fmt.name}`,
sendComposing,
reply,
sendMedia,
});
await capturedOnMessage?.({
body: "hello",
from: "+1",
to: "+2",
id: `msg-${fmt.name}`,
sendComposing,
reply,
sendMedia,
});
expect(sendMedia).toHaveBeenCalledTimes(1);
const payload = sendMedia.mock.calls[0][0] as {
image: Buffer;
mimetype?: string;
};
expect(payload.image.length).toBeLessThanOrEqual(1 * 1024 * 1024);
expect(payload.mimetype).toBe("image/jpeg");
expect(reply).not.toHaveBeenCalled();
expect(sendMedia).toHaveBeenCalledTimes(1);
const payload = sendMedia.mock.calls[0][0] as {
image: Buffer;
mimetype?: string;
};
expect(payload.image.length).toBeLessThanOrEqual(1 * 1024 * 1024);
expect(payload.mimetype).toBe("image/jpeg");
expect(reply).not.toHaveBeenCalled();
fetchMock.mockRestore();
}
});
fetchMock.mockRestore();
}
},
);
it("honors mediaMaxMb from config", async () => {
loadConfigMock = () => ({ inbound: { reply: { mediaMaxMb: 1 } } });