From 7d0ae151e85df880f2b42e16d2dd329f43d836fb Mon Sep 17 00:00:00 2001
From: Peter Steinberger
Date: Tue, 25 Nov 2025 23:06:54 +0100
Subject: [PATCH] feat: optional audio transcription via CLI

---
 CHANGELOG.md            |   1 +
 README.md               |  26 ++++++++++
 docs/claude-config.md   |   1 +
 docs/images.md          |   1 +
 src/auto-reply/reply.ts | 125 +++++++++++++++++++++++++++++++++++-----
 src/config/config.ts    |  11 ++++
 src/index.core.test.ts  |  64 ++++++++++++++++++++
 7 files changed, 215 insertions(+), 14 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 907637cb9..ac58f1921 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,7 @@
 ### Pending
 
 - Web auto-replies now resize/recompress media and honor `inbound.reply.mediaMaxMb` in `~/.warelay/warelay.json` (default 5 MB) to avoid provider/API limits.
+- Optional voice-note transcription: set `inbound.transcribeAudio.command` (e.g., the OpenAI Whisper CLI) to turn inbound audio into text before templating/Claude; verbose logs show when transcription runs.
 
 ## 1.0.4 — 2025-11-25
 
diff --git a/README.md b/README.md
index 82dc28aa4..7bf8e9d65 100644
--- a/README.md
+++ b/README.md
@@ -51,6 +51,32 @@ Install from npm (global): `npm install -g warelay` (Node 22+). Then choose **on
 - Web: `warelay send --provider web --media ./pic.jpg --message "Hi"` (local path or URL; no hosting needed).
 - Auto-replies can attach `mediaUrl` in `~/.warelay/warelay.json` (used alongside `text` when present). Web auto-replies now auto-resize/recompress images and cap size by config: set `inbound.reply.mediaMaxMb` (default 5) to control the post-compression limit; images are resized (max side 2048px) and JPEG-compressed to fit.
 
+### Voice notes (optional transcription)
+- If you set `inbound.transcribeAudio.command`, warelay runs that CLI whenever inbound audio arrives (e.g., a WhatsApp voice note) and replaces `Body` with the transcript before templating/Claude.
+- Example using the OpenAI Whisper CLI (requires `OPENAI_API_KEY`):
+  ```json5
+  {
+    inbound: {
+      transcribeAudio: {
+        command: [
+          "openai",
+          "api",
+          "audio.transcriptions.create",
+          "-m",
+          "whisper-1",
+          "-f",
+          "{{MediaPath}}",
+          "--response-format",
+          "text"
+        ],
+        timeoutSeconds: 45
+      },
+      reply: { mode: "command", command: ["claude", "{{Body}}"] }
+    }
+  }
+  ```
+- Works for both the Web and Twilio providers; verbose mode logs when transcription runs. If transcription fails, the original `Body` is kept.
+
 ## Providers
 - **Twilio (default):** needs `.env` creds + WhatsApp-enabled number; supports delivery tracking, polling, webhooks, and auto-reply typing indicators.
 - **Web (`--provider web`):** uses your personal WhatsApp via Baileys; supports send/receive + auto-reply, but no delivery-status wait; cache lives in `~/.warelay/credentials/` (rerun `login` if logged out).
diff --git a/docs/claude-config.md b/docs/claude-config.md
index cc2346cd7..52069a5e7 100644
--- a/docs/claude-config.md
+++ b/docs/claude-config.md
@@ -59,6 +59,7 @@ Notes on this configuration:
 - Host local paths for Twilio using the media server/Tailscale Funnel.
 - Send buffers directly for the Web provider.
 - Inbound media is downloaded (≤5 MB) and exposed to your templates as `{{MediaPath}}`, `{{MediaUrl}}`, and `{{MediaType}}`. You can mention this in your prompt if you want Claude to reason about the attachment. Outbound media from Claude (via `MEDIA:`) is resized/recompressed on the Web provider path; control the cap with `inbound.reply.mediaMaxMb` (default 5).
+- Voice notes: set `inbound.transcribeAudio.command` to run a CLI that prints the transcript to stdout (e.g., OpenAI Whisper: `openai api audio.transcriptions.create -m whisper-1 -f {{MediaPath}} --response-format text`). If it succeeds, warelay replaces `Body` with the transcript before invoking Claude.
 
 ## Testing the setup
 1. Start a relay (auto-selects Web when logged in, otherwise Twilio polling):
diff --git a/docs/images.md b/docs/images.md
index 5c1ecf398..315e12db3 100644
--- a/docs/images.md
+++ b/docs/images.md
@@ -57,6 +57,7 @@ This document defines how `warelay` should handle sending and replying with imag
 - `{{MediaUrl}}` original URL (Twilio) or pseudo-URL (web).
 - `{{MediaPath}}` local temp path written before running the command.
 - Size guard: only download if ≤5 MB; else skip and log.
+- Audio/voice notes: if you set `inbound.transcribeAudio.command`, warelay runs that CLI (templated with `{{MediaPath}}`) and replaces `Body` with the transcript before continuing the reply flow; verbose logs indicate when transcription runs.
 
 ## Errors & Messaging
 - Local path with twilio + Funnel disabled → error: “Twilio media needs a public URL; start `warelay webhook --ingress tailscale` or pass an https:// URL.”
diff --git a/src/auto-reply/reply.ts b/src/auto-reply/reply.ts
index dcd36e282..77da9a838 100644
--- a/src/auto-reply/reply.ts
+++ b/src/auto-reply/reply.ts
@@ -1,4 +1,6 @@
 import crypto from "node:crypto";
+import fs from "node:fs/promises";
+import os from "node:os";
 import path from "node:path";
 import type { MessageInstance } from "twilio/lib/rest/api/v2010/account/message.js";
 
@@ -16,7 +18,7 @@ import { logError } from "../logger.js";
 import { ensureMediaHosted } from "../media/host.js";
 import { splitMediaFromOutput } from "../media/parse.js";
 import { enqueueCommand } from "../process/command-queue.js";
-import { runCommandWithTimeout } from "../process/exec.js";
+import { runCommandWithTimeout, runExec } from "../process/exec.js";
 import { defaultRuntime, type RuntimeEnv } from "../runtime.js";
 import type { TwilioRequester } from "../twilio/types.js";
 import { sendTypingIndicator } from "../twilio/typing.js";
@@ -109,6 +111,15 @@ export async function getReplyFromConfig(
     await opts?.onReplyStart?.();
   };
 
+  // Optional audio transcription before templating/session handling; skipped when Body already holds text (the Twilio path pre-transcribes), so audio is not transcribed twice.
+  if (cfg.inbound?.transcribeAudio && isAudio(ctx.MediaType) && !ctx.Body) {
+    const transcribed = await transcribeInboundAudio(cfg, ctx, defaultRuntime);
+    if (transcribed?.text) {
+      ctx.Body = transcribed.text;
+      logVerbose("Replaced Body with audio transcript for reply flow");
+    }
+  }
+
   // Optional session handling (conversation reuse + /new resets)
   const sessionCfg = reply?.session;
   const resetTriggers = sessionCfg?.resetTriggers?.length
@@ -166,6 +177,18 @@ export async function getReplyFromConfig(
     IsNewSession: isNewSession ? "true" : "false",
   };
 
+  // Optional allowlist by origin number (E.164 without whatsapp: prefix)
+  const allowFrom = cfg.inbound?.allowFrom;
+  if (Array.isArray(allowFrom) && allowFrom.length > 0) {
+    const from = (ctx.From ?? "").replace(/^whatsapp:/, "");
+    if (!allowFrom.includes(from)) {
+      logVerbose(
+        `Skipping auto-reply: sender ${from || "<unknown>"} not in allowFrom list`,
+      );
+      return undefined;
+    }
+  }
+
   // Optional prefix injected before Body for templating/command prompts.
   const bodyPrefix = reply?.bodyPrefix
     ? applyTemplate(reply.bodyPrefix, sessionCtx)
@@ -192,18 +215,6 @@ export async function getReplyFromConfig(
     Body: commandBody,
     BodyStripped: commandBody,
   };
-
-  // Optional allowlist by origin number (E.164 without whatsapp: prefix)
-  const allowFrom = cfg.inbound?.allowFrom;
-  if (Array.isArray(allowFrom) && allowFrom.length > 0) {
-    const from = (ctx.From ?? "").replace(/^whatsapp:/, "");
-    if (!allowFrom.includes(from)) {
-      logVerbose(
-        `Skipping auto-reply: sender ${from || "<unknown>"} not in allowFrom list`,
-      );
-      return undefined;
-    }
-  }
   if (!reply) {
     logVerbose("No inbound.reply configured; skipping auto-reply");
     return undefined;
   }
@@ -431,13 +442,38 @@ export async function autoReplyIfConfigured(
     To: message.to ?? undefined,
     MessageSid: message.sid,
   };
+  const cfg = configOverride ?? loadConfig();
+
+  // Attach media hints for transcription/templates if present on Twilio payloads.
+  const mediaUrl = (message as { mediaUrl?: string }).mediaUrl;
+  if (mediaUrl) ctx.MediaUrl = mediaUrl;
+
+  // Optional audio transcription before building reply.
+  if (cfg.inbound?.transcribeAudio && message.media?.length) {
+    const media = message.media[0];
+    const contentType = (media as { contentType?: string }).contentType;
+    if (contentType?.startsWith("audio")) {
+      const transcribed = await transcribeInboundAudio(
+        cfg,
+        {
+          MediaUrl: mediaUrl ?? undefined,
+          MediaType: contentType,
+        },
+        runtime,
+      );
+      if (transcribed?.text) {
+        ctx.Body = transcribed.text;
+        ctx.MediaType = contentType;
+        logVerbose("Replaced Body with audio transcript for reply flow");
+      }
+    }
+  }
 
   const replyResult = await getReplyFromConfig(
     ctx,
     {
       onReplyStart: () => sendTypingIndicator(client, runtime, message.sid),
     },
-    configOverride,
+    cfg,
   );
   if (!replyResult || (!replyResult.text && !replyResult.mediaUrl)) return;
@@ -505,3 +541,64 @@ export async function autoReplyIfConfigured(
     }
   }
 }
+
+function isAudio(mediaType?: string | null) {
+  return Boolean(mediaType && mediaType.startsWith("audio"));
+}
+
+async function transcribeInboundAudio(
+  cfg: WarelayConfig,
+  ctx: MsgContext,
+  runtime: RuntimeEnv,
+): Promise<{ text: string } | undefined> {
+  const transcriber = cfg.inbound?.transcribeAudio;
+  if (!transcriber?.command?.length) return undefined;
+
+  const timeoutMs = Math.max((transcriber.timeoutSeconds ?? 45) * 1000, 1_000);
+  let tmpPath: string | undefined;
+  let mediaPath = ctx.MediaPath;
+  try {
+    if (!mediaPath && ctx.MediaUrl) {
+      const res = await fetch(ctx.MediaUrl);
+      if (!res.ok) throw new Error(`HTTP ${res.status}`);
+      const arrayBuf = await res.arrayBuffer();
+      const buffer = Buffer.from(arrayBuf);
+      tmpPath = path.join(
+        os.tmpdir(),
+        `warelay-audio-${crypto.randomUUID()}.ogg`,
+      );
+      await fs.writeFile(tmpPath, buffer);
+      mediaPath = tmpPath;
+      if (isVerbose()) {
+        logVerbose(
+          `Downloaded audio for transcription (${(buffer.length / (1024 * 1024)).toFixed(2)}MB) -> ${tmpPath}`,
+        );
+      }
+    }
+    if (!mediaPath) return undefined;
+
+    const templCtx: MsgContext = { ...ctx, MediaPath: mediaPath };
+    const argv = transcriber.command.map((part) =>
+      applyTemplate(part, templCtx),
+    );
+    if (isVerbose()) {
+      logVerbose(`Transcribing audio via command: ${argv.join(" ")}`);
+    }
+    const { stdout } = await runExec(argv[0], argv.slice(1), {
+      timeoutMs,
+      maxBuffer: 5 * 1024 * 1024,
+    });
+    const text = stdout.trim();
+    if (!text) return undefined;
+    return { text };
+  } catch (err) {
+    runtime.error?.(`Audio transcription failed: ${String(err)}`);
+    return undefined;
+  } finally {
+    if (tmpPath) {
+      void fs
+        .unlink(tmpPath)
+        .catch(() => {});
+    }
+  }
+}
diff --git a/src/config/config.ts b/src/config/config.ts
index e6b36adfa..8e19d63f4 100644
--- a/src/config/config.ts
+++ b/src/config/config.ts
@@ -28,6 +28,11 @@ export type WarelayConfig = {
   logging?: LoggingConfig;
   inbound?: {
     allowFrom?: string[]; // E.164 numbers allowed to trigger auto-reply (without whatsapp:)
+    transcribeAudio?: {
+      // Optional CLI to turn inbound audio into text; templated args, must output transcript to stdout.
+      command: string[];
+      timeoutSeconds?: number;
+    };
     reply?: {
       mode: ReplyMode;
       text?: string; // for mode=text, can contain {{Body}}
@@ -107,6 +112,12 @@ const WarelaySchema = z.object({
   inbound: z
     .object({
       allowFrom: z.array(z.string()).optional(),
+      transcribeAudio: z
+        .object({
+          command: z.array(z.string()),
+          timeoutSeconds: z.number().int().positive().optional(),
+        })
+        .optional(),
       reply: ReplySchema.optional(),
     })
     .optional(),
diff --git a/src/index.core.test.ts b/src/index.core.test.ts
index 1e134cf5d..c570e0dcc 100644
--- a/src/index.core.test.ts
+++ b/src/index.core.test.ts
@@ -6,6 +6,7 @@ import type { MessageInstance } from "twilio/lib/rest/api/v2010/account/message.
 import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
 import { createMockTwilio } from "../test/mocks/twilio.js";
 import { withWhatsAppPrefix } from "./utils.js";
+import * as exec from "./process/exec.js";
 
 // Twilio mock factory shared across tests
 vi.mock("twilio", () => {
@@ -117,6 +118,69 @@ describe("config and templating", () => {
     expect(result?.text).toContain("http://example.com/a.jpg");
   });
 
+  it("getReplyFromConfig runs audio transcription command when configured", async () => {
+    const cfg = {
+      inbound: {
+        transcribeAudio: {
+          command: ["echo", "voice transcript"],
+        },
+        reply: {
+          mode: "text" as const,
+          text: "{{Body}} {{MediaPath}}",
+        },
+      },
+    };
+
+    const runExec = vi.spyOn(exec, "runExec").mockResolvedValue({
+      stdout: "voice transcript\n",
+      stderr: "",
+    });
+
+    const result = await index.getReplyFromConfig(
+      {
+        Body: "",
+        From: "+1",
+        To: "+2",
+        MediaPath: "/tmp/voice.ogg",
+        MediaType: "audio/ogg",
+      },
+      undefined,
+      cfg,
+    );
+
+    expect(runExec).toHaveBeenCalled();
+    expect(result?.text).toContain("voice transcript");
+    expect(result?.text).toContain("/tmp/voice.ogg");
+  });
+
+  it("getReplyFromConfig skips transcription when not configured", async () => {
+    const cfg = {
+      inbound: {
+        reply: {
+          mode: "text" as const,
+          text: "{{Body}} {{MediaPath}}",
+        },
+      },
+    };
+
+    const runExec = vi.spyOn(exec, "runExec");
+    const result = await index.getReplyFromConfig(
+      {
+        Body: "",
+        From: "+1",
+        To: "+2",
+        MediaPath: "/tmp/voice.ogg",
+        MediaType: "audio/ogg",
+      },
+      undefined,
+      cfg,
+    );
+
+    expect(runExec).not.toHaveBeenCalled();
+    expect(result?.text).toContain("/tmp/voice.ogg");
+    expect(result?.text).not.toContain("voice transcript");
+  });
+
   it("getReplyFromConfig extracts media URL from command stdout", async () => {
     const runSpy = vi.spyOn(index, "runCommandWithTimeout").mockResolvedValue({
       stdout: "hello\nMEDIA: https://example.com/img.jpg\n",
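
Note for integrators: the contract this patch expects from `inbound.transcribeAudio.command` is simply "templated argv in, transcript on stdout". Below is a minimal standalone sketch of that contract using only `node:child_process`; the `runTranscriber` name is illustrative (not part of the patch), while the 45 s timeout and 5 MB buffer mirror the defaults used in `transcribeInboundAudio` above.

```ts
import { execFile } from "node:child_process";
import { promisify } from "node:util";

const execFileAsync = promisify(execFile);

// Sketch of the transcription contract: templated argv in, transcript on stdout.
async function runTranscriber(
  command: string[], // e.g. ["echo", "{{MediaPath}}"], or the openai CLI from the README
  mediaPath: string, // local audio file substituted for {{MediaPath}}
  timeoutMs = 45_000, // mirrors the timeoutSeconds default above
): Promise<string | undefined> {
  if (command.length === 0) return undefined;
  // Substitute the template placeholder in every argv element.
  const argv = command.map((part) => part.replaceAll("{{MediaPath}}", mediaPath));
  const { stdout } = await execFileAsync(argv[0], argv.slice(1), {
    timeout: timeoutMs,
    maxBuffer: 5 * 1024 * 1024, // same 5 MB stdout cap as the patch
  });
  const text = stdout.trim();
  return text.length > 0 ? text : undefined; // empty output: caller keeps the original Body
}
```

Any CLI that satisfies this shape (argv templating, transcript on stdout, nonzero exit on failure) can be dropped into `inbound.transcribeAudio.command` without further changes.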