feat: transcribe audio and surface transcript to prompts

This commit is contained in:
Peter Steinberger
2025-11-25 23:13:22 +01:00
parent 7d0ae151e8
commit e642f128ae
6 changed files with 169 additions and 100 deletions

47
docs/audio.md Normal file
View File

@@ -0,0 +1,47 @@
# Audio / Voice Notes — 2025-11-25
## What works
- **Optional transcription**: If `inbound.transcribeAudio.command` is set in `~/.warelay/warelay.json`, warelay will:
1) Download inbound audio (Web or Twilio) to a temp path if only a URL is present.
2) Run the configured CLI (templated with `{{MediaPath}}`), expecting transcript on stdout.
3) Replace `Body` with the transcript, set `{{Transcript}}`, and prepend the original media path plus a `Transcript:` section in the command prompt so models see both.
4) Continue through the normal auto-reply pipeline (templating, sessions, Claude/command).
- **Verbose logging**: In `--verbose`, we log when transcription runs and when the transcript replaces the body.
## Config example (OpenAI Whisper CLI)
Requires `OPENAI_API_KEY` in env and `openai` CLI installed:
```json5
{
inbound: {
transcribeAudio: {
command: [
"openai",
"api",
"audio.transcriptions.create",
"-m",
"whisper-1",
"-f",
"{{MediaPath}}",
"--response-format",
"text"
],
timeoutSeconds: 45
},
reply: {
mode: "command",
command: ["claude", "{{Body}}"]
}
}
}
```
## Notes & limits
- We don't ship a transcriber; you opt in with any CLI that prints text to stdout (Whisper cloud, whisper.cpp, vosk, Deepgram, etc.).
- Size guard: inbound audio must be ≤5MB (same as other media).
- If transcription fails, we fall back to the original body/media note; replies still go through.
- Transcript is available to templates as `{{Transcript}}`; models get both the media path and a `Transcript:` block in the prompt when using command mode.
## Gotchas
- Ensure your CLI exits 0 and prints plain text; JSON output needs to be massaged into plain text, e.g. via `jq -r .text`.
- Keep timeouts reasonable (`timeoutSeconds`, default 45s) to avoid blocking the reply queue.
- Twilio media paths are hosted URLs, while Web media paths are local. When only a URL is available, the temp download fetches Twilio media over HTTPS; Web media is written to a temp file.

View File

@@ -57,7 +57,7 @@ This document defines how `warelay` should handle sending and replying with imag
- `{{MediaUrl}}` original URL (Twilio) or pseudo-URL (web).
- `{{MediaPath}}` local temp path written before running the command.
- Size guard: only download if ≤5MB; else skip and log.
- Audio/voice notes: if you set `inbound.transcribeAudio.command`, warelay will run that CLI (templated with `{{MediaPath}}`) and replace `Body` with the transcript before continuing the reply flow; verbose logs indicate when transcription runs.
- Audio/voice notes: if you set `inbound.transcribeAudio.command`, warelay will run that CLI (templated with `{{MediaPath}}`) and replace `Body` with the transcript before continuing the reply flow; verbose logs indicate when transcription runs. The command prompt includes the original media path plus a `Transcript:` section so the model sees both.
## Errors & Messaging
- Local path with twilio + Funnel disabled → error: “Twilio media needs a public URL; start `warelay webhook --ingress tailscale` or pass an https:// URL.”

View File

@@ -110,12 +110,15 @@ export async function getReplyFromConfig(
started = true;
await opts?.onReplyStart?.();
};
let transcribedText: string | undefined;
// Optional audio transcription before templating/session handling.
if (cfg.inbound?.transcribeAudio && isAudio(ctx.MediaType)) {
const transcribed = await transcribeInboundAudio(cfg, ctx, defaultRuntime);
if (transcribed?.text) {
transcribedText = transcribed.text;
ctx.Body = transcribed.text;
ctx.Transcript = transcribed.text;
logVerbose("Replaced Body with audio transcript for reply flow");
}
}
@@ -193,9 +196,15 @@ export async function getReplyFromConfig(
const bodyPrefix = reply?.bodyPrefix
? applyTemplate(reply.bodyPrefix, sessionCtx)
: "";
const prefixedBody = bodyPrefix
const prefixedBodyBase = bodyPrefix
? `${bodyPrefix}${sessionCtx.BodyStripped ?? sessionCtx.Body ?? ""}`
: (sessionCtx.BodyStripped ?? sessionCtx.Body);
const prefixedBody =
transcribedText && reply?.mode === "command"
? [prefixedBodyBase, `Transcript:\n${transcribedText}`]
.filter(Boolean)
.join("\n\n")
: prefixedBodyBase;
const mediaNote = ctx.MediaPath?.length
? `[media attached: ${ctx.MediaPath}${ctx.MediaType ? ` (${ctx.MediaType})` : ""}${ctx.MediaUrl ? ` | ${ctx.MediaUrl}` : ""}]`
: undefined;
@@ -543,7 +552,7 @@ export async function autoReplyIfConfigured(
}
function isAudio(mediaType?: string | null) {
return Boolean(mediaType && mediaType.startsWith("audio"));
return Boolean(mediaType?.startsWith("audio"));
}
async function transcribeInboundAudio(
@@ -596,9 +605,7 @@ async function transcribeInboundAudio(
return undefined;
} finally {
if (tmpPath) {
void fs
.unlink(tmpPath)
.catch(() => {});
void fs.unlink(tmpPath).catch(() => {});
}
}
}

View File

@@ -6,6 +6,7 @@ export type MsgContext = {
MediaPath?: string;
MediaUrl?: string;
MediaType?: string;
Transcript?: string;
};
export type TemplateContext = MsgContext & {

View File

@@ -5,8 +5,8 @@ import path from "node:path";
import type { MessageInstance } from "twilio/lib/rest/api/v2010/account/message.js";
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
import { createMockTwilio } from "../test/mocks/twilio.js";
import { withWhatsAppPrefix } from "./utils.js";
import * as exec from "./process/exec.js";
import { withWhatsAppPrefix } from "./utils.js";
// Twilio mock factory shared across tests
vi.mock("twilio", () => {
@@ -125,8 +125,8 @@ describe("config and templating", () => {
command: ["echo", "voice transcript"],
},
reply: {
mode: "text" as const,
text: "{{Body}}",
mode: "command" as const,
command: ["echo", "{{Body}}"],
},
},
};
@@ -135,6 +135,13 @@ describe("config and templating", () => {
stdout: "voice transcript\n",
stderr: "",
});
const commandRunner = vi.fn().mockResolvedValue({
stdout: "ok",
stderr: "",
code: 0,
signal: null,
killed: false,
});
const result = await index.getReplyFromConfig(
{
@@ -146,11 +153,17 @@ describe("config and templating", () => {
},
undefined,
cfg,
commandRunner,
);
expect(runExec).toHaveBeenCalled();
expect(result?.text).toContain("voice transcript");
expect(result?.text).toContain("/tmp/voice.ogg");
expect(commandRunner).toHaveBeenCalled();
const argv = commandRunner.mock.calls[0][0];
const prompt = argv[argv.length - 1] as string;
expect(prompt).toContain("/tmp/voice.ogg");
expect(prompt).toContain("Transcript:");
expect(prompt).toContain("voice transcript");
expect(result?.text).toBeUndefined();
});
it("getReplyFromConfig skips transcription when not configured", async () => {

View File

@@ -505,104 +505,105 @@ describe("provider-web", () => {
"compresses common formats to jpeg under the cap",
{ timeout: 15_000 },
async () => {
const formats = [
{
name: "png",
mime: "image/png",
make: (buf: Buffer, opts: { width: number; height: number }) =>
sharp(buf, {
raw: { width: opts.width, height: opts.height, channels: 3 },
})
.png({ compressionLevel: 0 })
.toBuffer(),
},
{
name: "jpeg",
mime: "image/jpeg",
make: (buf: Buffer, opts: { width: number; height: number }) =>
sharp(buf, {
raw: { width: opts.width, height: opts.height, channels: 3 },
})
.jpeg({ quality: 100, chromaSubsampling: "4:4:4" })
.toBuffer(),
},
{
name: "webp",
mime: "image/webp",
make: (buf: Buffer, opts: { width: number; height: number }) =>
sharp(buf, {
raw: { width: opts.width, height: opts.height, channels: 3 },
})
.webp({ quality: 100 })
.toBuffer(),
},
] as const;
const formats = [
{
name: "png",
mime: "image/png",
make: (buf: Buffer, opts: { width: number; height: number }) =>
sharp(buf, {
raw: { width: opts.width, height: opts.height, channels: 3 },
})
.png({ compressionLevel: 0 })
.toBuffer(),
},
{
name: "jpeg",
mime: "image/jpeg",
make: (buf: Buffer, opts: { width: number; height: number }) =>
sharp(buf, {
raw: { width: opts.width, height: opts.height, channels: 3 },
})
.jpeg({ quality: 100, chromaSubsampling: "4:4:4" })
.toBuffer(),
},
{
name: "webp",
mime: "image/webp",
make: (buf: Buffer, opts: { width: number; height: number }) =>
sharp(buf, {
raw: { width: opts.width, height: opts.height, channels: 3 },
})
.webp({ quality: 100 })
.toBuffer(),
},
] as const;
for (const fmt of formats) {
// Force a small cap to ensure compression is exercised for every format.
loadConfigMock = () => ({ inbound: { reply: { mediaMaxMb: 1 } } });
const sendMedia = vi.fn();
const reply = vi.fn().mockResolvedValue(undefined);
const sendComposing = vi.fn();
const resolver = vi.fn().mockResolvedValue({
text: "hi",
mediaUrl: `https://example.com/big.${fmt.name}`,
});
for (const fmt of formats) {
// Force a small cap to ensure compression is exercised for every format.
loadConfigMock = () => ({ inbound: { reply: { mediaMaxMb: 1 } } });
const sendMedia = vi.fn();
const reply = vi.fn().mockResolvedValue(undefined);
const sendComposing = vi.fn();
const resolver = vi.fn().mockResolvedValue({
text: "hi",
mediaUrl: `https://example.com/big.${fmt.name}`,
});
let capturedOnMessage:
| ((
let capturedOnMessage:
| ((
msg: import("./provider-web.js").WebInboundMessage,
) => Promise<void>)
| undefined;
const listenerFactory = async (opts: {
onMessage: (
msg: import("./provider-web.js").WebInboundMessage,
) => Promise<void>)
| undefined;
const listenerFactory = async (opts: {
onMessage: (
msg: import("./provider-web.js").WebInboundMessage,
) => Promise<void>;
}) => {
capturedOnMessage = opts.onMessage;
return { close: vi.fn() };
};
) => Promise<void>;
}) => {
capturedOnMessage = opts.onMessage;
return { close: vi.fn() };
};
const width = 2000;
const height = 2000;
const raw = crypto.randomBytes(width * height * 3);
const big = await fmt.make(raw, { width, height });
expect(big.length).toBeGreaterThan(1 * 1024 * 1024);
const width = 2000;
const height = 2000;
const raw = crypto.randomBytes(width * height * 3);
const big = await fmt.make(raw, { width, height });
expect(big.length).toBeGreaterThan(1 * 1024 * 1024);
const fetchMock = vi.spyOn(globalThis, "fetch").mockResolvedValue({
ok: true,
body: true,
arrayBuffer: async () =>
big.buffer.slice(big.byteOffset, big.byteOffset + big.byteLength),
headers: { get: () => fmt.mime },
status: 200,
} as Response);
const fetchMock = vi.spyOn(globalThis, "fetch").mockResolvedValue({
ok: true,
body: true,
arrayBuffer: async () =>
big.buffer.slice(big.byteOffset, big.byteOffset + big.byteLength),
headers: { get: () => fmt.mime },
status: 200,
} as Response);
await monitorWebProvider(false, listenerFactory, false, resolver);
expect(capturedOnMessage).toBeDefined();
await monitorWebProvider(false, listenerFactory, false, resolver);
expect(capturedOnMessage).toBeDefined();
await capturedOnMessage?.({
body: "hello",
from: "+1",
to: "+2",
id: `msg-${fmt.name}`,
sendComposing,
reply,
sendMedia,
});
await capturedOnMessage?.({
body: "hello",
from: "+1",
to: "+2",
id: `msg-${fmt.name}`,
sendComposing,
reply,
sendMedia,
});
expect(sendMedia).toHaveBeenCalledTimes(1);
const payload = sendMedia.mock.calls[0][0] as {
image: Buffer;
mimetype?: string;
};
expect(payload.image.length).toBeLessThanOrEqual(1 * 1024 * 1024);
expect(payload.mimetype).toBe("image/jpeg");
expect(reply).not.toHaveBeenCalled();
expect(sendMedia).toHaveBeenCalledTimes(1);
const payload = sendMedia.mock.calls[0][0] as {
image: Buffer;
mimetype?: string;
};
expect(payload.image.length).toBeLessThanOrEqual(1 * 1024 * 1024);
expect(payload.mimetype).toBe("image/jpeg");
expect(reply).not.toHaveBeenCalled();
fetchMock.mockRestore();
}
});
fetchMock.mockRestore();
}
},
);
it("honors mediaMaxMb from config", async () => {
loadConfigMock = () => ({ inbound: { reply: { mediaMaxMb: 1 } } });