feat: optional audio transcription via CLI

This commit is contained in:
Peter Steinberger
2025-11-25 23:06:54 +01:00
parent f945e284e1
commit 7d0ae151e8
7 changed files with 215 additions and 14 deletions

View File

@@ -4,6 +4,7 @@
### Pending
- Web auto-replies now resize/recompress media and honor `inbound.reply.mediaMaxMb` in `~/.warelay/warelay.json` (default 5MB) to avoid provider/API limits.
- Optional voice-note transcription: set `inbound.transcribeAudio.command` (e.g., OpenAI Whisper CLI) to turn inbound audio into text before templating/Claude; verbose logs surface when transcription runs.
## 1.0.4 — 2025-11-25

View File

@@ -51,6 +51,32 @@ Install from npm (global): `npm install -g warelay` (Node 22+). Then choose **on
- Web: `warelay send --provider web --media ./pic.jpg --message "Hi"` (local path or URL; no hosting needed).
- Auto-replies can attach `mediaUrl` in `~/.warelay/warelay.json` (used alongside `text` when present). Web auto-replies now auto-resize/recompress images and cap size by config: set `inbound.reply.mediaMaxMb` (default 5) to control the post-compression limit; images are resized (max side 2048px) and JPEG-compressed to fit.
### Voice notes (optional transcription)
- If you set `inbound.transcribeAudio.command`, warelay will run that CLI when inbound audio arrives (e.g., WhatsApp voice notes) and replace the Body with the transcript before templating/Claude.
- Example using OpenAI Whisper CLI (requires `OPENAI_API_KEY`):
```json5
{
inbound: {
transcribeAudio: {
command: [
"openai",
"api",
"audio.transcriptions.create",
"-m",
"whisper-1",
"-f",
"{{MediaPath}}",
"--response-format",
"text"
],
timeoutSeconds: 45
},
reply: { mode: "command", command: ["claude", "{{Body}}"] }
}
}
```
- Works for Web and Twilio providers; verbose mode logs when transcription runs. If transcription fails, the original Body is used.
## Providers
- **Twilio (default):** needs `.env` creds + WhatsApp-enabled number; supports delivery tracking, polling, webhooks, and auto-reply typing indicators.
- **Web (`--provider web`):** uses your personal WhatsApp via Baileys; supports send/receive + auto-reply, but no delivery-status wait; cache lives in `~/.warelay/credentials/` (rerun `login` if logged out).

View File

@@ -59,6 +59,7 @@ Notes on this configuration:
- Host local paths for Twilio using the media server/Tailscale Funnel.
- Send buffers directly for the Web provider.
- Inbound media is downloaded (≤5MB) and exposed to your templates as `{{MediaPath}}`, `{{MediaUrl}}`, and `{{MediaType}}`. You can mention this in your prompt if you want Claude to reason about the attachment. Outbound media from Claude (via `MEDIA:`) is resized/recompressed on the Web provider path; control the cap with `inbound.reply.mediaMaxMb` (default 5).
- Voice notes: set `inbound.transcribeAudio.command` to run a CLI that emits the transcript to stdout (e.g., OpenAI Whisper: `openai api audio.transcriptions.create -m whisper-1 -f {{MediaPath}} --response-format text`). If it succeeds, warelay replaces `Body` with the transcript before invoking Claude.
## Testing the setup
1. Start a relay (auto-selects Web when logged in, otherwise Twilio polling):

View File

@@ -57,6 +57,7 @@ This document defines how `warelay` should handle sending and replying with imag
- `{{MediaUrl}}` original URL (Twilio) or pseudo-URL (web).
- `{{MediaPath}}` local temp path written before running the command.
- Size guard: only download if ≤5MB; else skip and log.
- Audio/voice notes: if you set `inbound.transcribeAudio.command`, warelay will run that CLI (templated with `{{MediaPath}}`) and replace `Body` with the transcript before continuing the reply flow; verbose logs indicate when transcription runs.
## Errors & Messaging
- Local path with twilio + Funnel disabled → error: “Twilio media needs a public URL; start `warelay webhook --ingress tailscale` or pass an https:// URL.”

View File

@@ -1,4 +1,6 @@
import crypto from "node:crypto";
import fs from "node:fs/promises";
import os from "node:os";
import path from "node:path";
import type { MessageInstance } from "twilio/lib/rest/api/v2010/account/message.js";
@@ -16,7 +18,7 @@ import { logError } from "../logger.js";
import { ensureMediaHosted } from "../media/host.js";
import { splitMediaFromOutput } from "../media/parse.js";
import { enqueueCommand } from "../process/command-queue.js";
import { runCommandWithTimeout } from "../process/exec.js";
import { runCommandWithTimeout, runExec } from "../process/exec.js";
import { defaultRuntime, type RuntimeEnv } from "../runtime.js";
import type { TwilioRequester } from "../twilio/types.js";
import { sendTypingIndicator } from "../twilio/typing.js";
@@ -109,6 +111,15 @@ export async function getReplyFromConfig(
await opts?.onReplyStart?.();
};
// Optional audio transcription before templating/session handling.
if (cfg.inbound?.transcribeAudio && isAudio(ctx.MediaType)) {
const transcribed = await transcribeInboundAudio(cfg, ctx, defaultRuntime);
if (transcribed?.text) {
ctx.Body = transcribed.text;
logVerbose("Replaced Body with audio transcript for reply flow");
}
}
// Optional session handling (conversation reuse + /new resets)
const sessionCfg = reply?.session;
const resetTriggers = sessionCfg?.resetTriggers?.length
@@ -166,6 +177,18 @@ export async function getReplyFromConfig(
IsNewSession: isNewSession ? "true" : "false",
};
// Optional allowlist by origin number (E.164 without whatsapp: prefix)
const allowFrom = cfg.inbound?.allowFrom;
if (Array.isArray(allowFrom) && allowFrom.length > 0) {
const from = (ctx.From ?? "").replace(/^whatsapp:/, "");
if (!allowFrom.includes(from)) {
logVerbose(
`Skipping auto-reply: sender ${from || "<unknown>"} not in allowFrom list`,
);
return undefined;
}
}
// Optional prefix injected before Body for templating/command prompts.
const bodyPrefix = reply?.bodyPrefix
? applyTemplate(reply.bodyPrefix, sessionCtx)
@@ -192,18 +215,6 @@ export async function getReplyFromConfig(
Body: commandBody,
BodyStripped: commandBody,
};
// Optional allowlist by origin number (E.164 without whatsapp: prefix)
const allowFrom = cfg.inbound?.allowFrom;
if (Array.isArray(allowFrom) && allowFrom.length > 0) {
const from = (ctx.From ?? "").replace(/^whatsapp:/, "");
if (!allowFrom.includes(from)) {
logVerbose(
`Skipping auto-reply: sender ${from || "<unknown>"} not in allowFrom list`,
);
return undefined;
}
}
if (!reply) {
logVerbose("No inbound.reply configured; skipping auto-reply");
return undefined;
@@ -431,13 +442,38 @@ export async function autoReplyIfConfigured(
To: message.to ?? undefined,
MessageSid: message.sid,
};
const cfg = configOverride ?? loadConfig();
// Attach media hints for transcription/templates if present on Twilio payloads.
const mediaUrl = (message as { mediaUrl?: string }).mediaUrl;
if (mediaUrl) ctx.MediaUrl = mediaUrl;
// Optional audio transcription before building reply.
if (cfg.inbound?.transcribeAudio && message.media?.length) {
const media = message.media[0];
const contentType = (media as { contentType?: string }).contentType;
if (contentType?.startsWith("audio")) {
const transcribed = await transcribeInboundAudio(
cfg,
{
mediaUrl: mediaUrl ?? undefined,
contentType,
},
runtime,
);
if (transcribed?.text) {
ctx.Body = transcribed.text;
ctx.MediaType = contentType;
logVerbose("Replaced Body with audio transcript for reply flow");
}
}
}
const replyResult = await getReplyFromConfig(
ctx,
{
onReplyStart: () => sendTypingIndicator(client, runtime, message.sid),
},
configOverride,
cfg,
);
if (!replyResult || (!replyResult.text && !replyResult.mediaUrl)) return;
@@ -505,3 +541,64 @@ export async function autoReplyIfConfigured(
}
}
}
// True when the given MIME type denotes an audio payload (e.g. "audio/ogg").
function isAudio(mediaType?: string | null) {
  return (mediaType ?? "").startsWith("audio");
}
/**
 * Runs the configured `inbound.transcribeAudio.command` against the inbound
 * audio and returns the transcript, or `undefined` when transcription is not
 * configured, not possible, or fails (callers then keep the original Body).
 *
 * The command argv is templated with the message context, including a
 * `{{MediaPath}}` pointing at a local file: either `ctx.MediaPath` as-is, or
 * a temp file downloaded from `ctx.MediaUrl` (cleaned up in `finally`).
 *
 * Errors never propagate — they are reported via `runtime.error` and
 * swallowed so the reply flow continues with the untranscribed Body.
 */
async function transcribeInboundAudio(
  cfg: WarelayConfig,
  ctx: MsgContext,
  runtime: RuntimeEnv,
): Promise<{ text: string } | undefined> {
  const transcriber = cfg.inbound?.transcribeAudio;
  if (!transcriber?.command?.length) return undefined;
  // Clamp to at least 1s so a misconfigured timeoutSeconds can't make the CLI unrunnable.
  const timeoutMs = Math.max((transcriber.timeoutSeconds ?? 45) * 1000, 1_000);
  // Mirror the repo-wide inbound media size guard (≤5MB) so remote audio
  // can't balloon temp files or the transcription CLI's input.
  const maxDownloadBytes = 5 * 1024 * 1024;
  let tmpPath: string | undefined;
  let mediaPath = ctx.MediaPath;
  try {
    if (!mediaPath && ctx.MediaUrl) {
      // Bound the download by the same timeout as the CLI so a stalled
      // remote server can't hang the reply flow indefinitely.
      const res = await fetch(ctx.MediaUrl, {
        signal: AbortSignal.timeout(timeoutMs),
      });
      if (!res.ok) throw new Error(`HTTP ${res.status}`);
      const arrayBuf = await res.arrayBuffer();
      const buffer = Buffer.from(arrayBuf);
      if (buffer.length > maxDownloadBytes) {
        logVerbose(
          `Skipping transcription: audio is ${(buffer.length / (1024 * 1024)).toFixed(2)}MB (cap 5MB)`,
        );
        return undefined;
      }
      tmpPath = path.join(
        os.tmpdir(),
        `warelay-audio-${crypto.randomUUID()}.ogg`,
      );
      await fs.writeFile(tmpPath, buffer);
      mediaPath = tmpPath;
      if (isVerbose()) {
        logVerbose(
          `Downloaded audio for transcription (${(buffer.length / (1024 * 1024)).toFixed(2)}MB) -> ${tmpPath}`,
        );
      }
    }
    // No local file to hand the CLI — nothing to transcribe.
    if (!mediaPath) return undefined;
    const templCtx: MsgContext = { ...ctx, MediaPath: mediaPath };
    const argv = transcriber.command.map((part) =>
      applyTemplate(part, templCtx),
    );
    if (isVerbose()) {
      logVerbose(`Transcribing audio via command: ${argv.join(" ")}`);
    }
    const { stdout } = await runExec(argv[0], argv.slice(1), {
      timeoutMs,
      maxBuffer: 5 * 1024 * 1024,
    });
    const text = stdout.trim();
    // Empty transcript counts as a miss so callers keep the original Body.
    if (!text) return undefined;
    return { text };
  } catch (err) {
    runtime.error?.(`Audio transcription failed: ${String(err)}`);
    return undefined;
  } finally {
    // Best-effort cleanup of the temp download; unlink failures are harmless.
    if (tmpPath) {
      void fs
        .unlink(tmpPath)
        .catch(() => {});
    }
  }
}

View File

@@ -28,6 +28,11 @@ export type WarelayConfig = {
logging?: LoggingConfig;
inbound?: {
allowFrom?: string[]; // E.164 numbers allowed to trigger auto-reply (without whatsapp:)
transcribeAudio?: {
// Optional CLI to turn inbound audio into text; templated args, must output transcript to stdout.
command: string[];
timeoutSeconds?: number;
};
reply?: {
mode: ReplyMode;
text?: string; // for mode=text, can contain {{Body}}
@@ -107,6 +112,12 @@ const WarelaySchema = z.object({
inbound: z
.object({
allowFrom: z.array(z.string()).optional(),
transcribeAudio: z
.object({
command: z.array(z.string()),
timeoutSeconds: z.number().int().positive().optional(),
})
.optional(),
reply: ReplySchema.optional(),
})
.optional(),

View File

@@ -6,6 +6,7 @@ import type { MessageInstance } from "twilio/lib/rest/api/v2010/account/message.
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
import { createMockTwilio } from "../test/mocks/twilio.js";
import { withWhatsAppPrefix } from "./utils.js";
import * as exec from "./process/exec.js";
// Twilio mock factory shared across tests
vi.mock("twilio", () => {
@@ -117,6 +118,69 @@ describe("config and templating", () => {
expect(result?.text).toContain("http://example.com/a.jpg");
});
it("getReplyFromConfig runs audio transcription command when configured", async () => {
  // The spy stands in for the external transcription CLI.
  const execSpy = vi.spyOn(exec, "runExec").mockResolvedValue({
    stdout: "voice transcript\n",
    stderr: "",
  });
  const ctx = {
    Body: "<media:audio>",
    From: "+1",
    To: "+2",
    MediaPath: "/tmp/voice.ogg",
    MediaType: "audio/ogg",
  };
  const result = await index.getReplyFromConfig(ctx, undefined, {
    inbound: {
      transcribeAudio: { command: ["echo", "voice transcript"] },
      reply: { mode: "text" as const, text: "{{Body}}" },
    },
  });
  // Transcription ran and its output replaced the Body in the templated reply.
  expect(execSpy).toHaveBeenCalled();
  expect(result?.text).toContain("voice transcript");
  expect(result?.text).toContain("/tmp/voice.ogg");
});
it("getReplyFromConfig skips transcription when not configured", async () => {
  // Without inbound.transcribeAudio the exec path must never be taken.
  const execSpy = vi.spyOn(exec, "runExec");
  const result = await index.getReplyFromConfig(
    {
      Body: "<media:audio>",
      From: "+1",
      To: "+2",
      MediaPath: "/tmp/voice.ogg",
      MediaType: "audio/ogg",
    },
    undefined,
    {
      inbound: {
        reply: { mode: "text" as const, text: "{{Body}}" },
      },
    },
  );
  // Original Body survives untouched.
  expect(execSpy).not.toHaveBeenCalled();
  expect(result?.text).toContain("/tmp/voice.ogg");
  expect(result?.text).toContain("<media:audio>");
});
it("getReplyFromConfig extracts media URL from command stdout", async () => {
const runSpy = vi.spyOn(index, "runCommandWithTimeout").mockResolvedValue({
stdout: "hello\nMEDIA: https://example.com/img.jpg\n",