diff --git a/docs/audio.md b/docs/audio.md new file mode 100644 index 000000000..c92e93cf7 --- /dev/null +++ b/docs/audio.md @@ -0,0 +1,47 @@ +# Audio / Voice Notes — 2025-11-25 + +## What works +- **Optional transcription**: If `inbound.transcribeAudio.command` is set in `~/.warelay/warelay.json`, warelay will: + 1) Download inbound audio (Web or Twilio) to a temp path if only a URL is present. + 2) Run the configured CLI (templated with `{{MediaPath}}`), expecting transcript on stdout. + 3) Replace `Body` with the transcript, set `{{Transcript}}`, and prepend the original media path plus a `Transcript:` section in the command prompt so models see both. + 4) Continue through the normal auto-reply pipeline (templating, sessions, Claude/command). +- **Verbose logging**: In `--verbose`, we log when transcription runs and when the transcript replaces the body. + +## Config example (OpenAI Whisper CLI) +Requires `OPENAI_API_KEY` in env and `openai` CLI installed: +```json5 +{ + inbound: { + transcribeAudio: { + command: [ + "openai", + "api", + "audio.transcriptions.create", + "-m", + "whisper-1", + "-f", + "{{MediaPath}}", + "--response-format", + "text" + ], + timeoutSeconds: 45 + }, + reply: { + mode: "command", + command: ["claude", "{{Body}}"] + } + } +} +``` + +## Notes & limits +- We don’t ship a transcriber; you opt in with any CLI that prints text to stdout (Whisper cloud, whisper.cpp, vosk, Deepgram, etc.). +- Size guard: inbound audio must be ≤5 MB (same as other media). +- If transcription fails, we fall back to the original body/media note; replies still go through. +- Transcript is available to templates as `{{Transcript}}`; models get both the media path and a `Transcript:` block in the prompt when using command mode. + +## Gotchas +- Ensure your CLI exits 0 and prints plain text; JSON needs to be massaged via `jq -r .text`. +- Keep timeouts reasonable (`timeoutSeconds`, default 45s) to avoid blocking the reply queue. 
+- Twilio paths are hosted URLs; Web paths are local. The temp download uses HTTPS for Twilio and a temp file for Web-only media. diff --git a/docs/images.md b/docs/images.md index 315e12db3..6a622cb30 100644 --- a/docs/images.md +++ b/docs/images.md @@ -57,7 +57,7 @@ This document defines how `warelay` should handle sending and replying with imag - `{{MediaUrl}}` original URL (Twilio) or pseudo-URL (web). - `{{MediaPath}}` local temp path written before running the command. - Size guard: only download if ≤5 MB; else skip and log. -- Audio/voice notes: if you set `inbound.transcribeAudio.command`, warelay will run that CLI (templated with `{{MediaPath}}`) and replace `Body` with the transcript before continuing the reply flow; verbose logs indicate when transcription runs. +- Audio/voice notes: if you set `inbound.transcribeAudio.command`, warelay will run that CLI (templated with `{{MediaPath}}`) and replace `Body` with the transcript before continuing the reply flow; verbose logs indicate when transcription runs. The command prompt includes the original media path plus a `Transcript:` section so the model sees both. ## Errors & Messaging - Local path with twilio + Funnel disabled → error: “Twilio media needs a public URL; start `warelay webhook --ingress tailscale` or pass an https:// URL.” diff --git a/src/auto-reply/reply.ts b/src/auto-reply/reply.ts index 77da9a838..264fc8b6c 100644 --- a/src/auto-reply/reply.ts +++ b/src/auto-reply/reply.ts @@ -110,12 +110,15 @@ export async function getReplyFromConfig( started = true; await opts?.onReplyStart?.(); }; + let transcribedText: string | undefined; // Optional audio transcription before templating/session handling. 
if (cfg.inbound?.transcribeAudio && isAudio(ctx.MediaType)) { const transcribed = await transcribeInboundAudio(cfg, ctx, defaultRuntime); if (transcribed?.text) { + transcribedText = transcribed.text; ctx.Body = transcribed.text; + ctx.Transcript = transcribed.text; logVerbose("Replaced Body with audio transcript for reply flow"); } } @@ -193,9 +196,15 @@ export async function getReplyFromConfig( const bodyPrefix = reply?.bodyPrefix ? applyTemplate(reply.bodyPrefix, sessionCtx) : ""; - const prefixedBody = bodyPrefix + const prefixedBodyBase = bodyPrefix ? `${bodyPrefix}${sessionCtx.BodyStripped ?? sessionCtx.Body ?? ""}` : (sessionCtx.BodyStripped ?? sessionCtx.Body); + const prefixedBody = + transcribedText && reply?.mode === "command" + ? [prefixedBodyBase, `Transcript:\n${transcribedText}`] + .filter(Boolean) + .join("\n\n") + : prefixedBodyBase; const mediaNote = ctx.MediaPath?.length ? `[media attached: ${ctx.MediaPath}${ctx.MediaType ? ` (${ctx.MediaType})` : ""}${ctx.MediaUrl ? ` | ${ctx.MediaUrl}` : ""}]` : undefined; @@ -543,7 +552,7 @@ export async function autoReplyIfConfigured( } function isAudio(mediaType?: string | null) { - return Boolean(mediaType && mediaType.startsWith("audio")); + return Boolean(mediaType?.startsWith("audio")); } async function transcribeInboundAudio( @@ -596,9 +605,7 @@ async function transcribeInboundAudio( return undefined; } finally { if (tmpPath) { - void fs - .unlink(tmpPath) - .catch(() => {}); + void fs.unlink(tmpPath).catch(() => {}); } } } diff --git a/src/auto-reply/templating.ts b/src/auto-reply/templating.ts index b2f5b6f42..3aa244ea8 100644 --- a/src/auto-reply/templating.ts +++ b/src/auto-reply/templating.ts @@ -6,6 +6,7 @@ export type MsgContext = { MediaPath?: string; MediaUrl?: string; MediaType?: string; + Transcript?: string; }; export type TemplateContext = MsgContext & { diff --git a/src/index.core.test.ts b/src/index.core.test.ts index c570e0dcc..102a84ba6 100644 --- a/src/index.core.test.ts +++ 
b/src/index.core.test.ts @@ -5,8 +5,8 @@ import path from "node:path"; import type { MessageInstance } from "twilio/lib/rest/api/v2010/account/message.js"; import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; import { createMockTwilio } from "../test/mocks/twilio.js"; -import { withWhatsAppPrefix } from "./utils.js"; import * as exec from "./process/exec.js"; +import { withWhatsAppPrefix } from "./utils.js"; // Twilio mock factory shared across tests vi.mock("twilio", () => { @@ -125,8 +125,8 @@ describe("config and templating", () => { command: ["echo", "voice transcript"], }, reply: { - mode: "text" as const, - text: "{{Body}}", + mode: "command" as const, + command: ["echo", "{{Body}}"], }, }, }; @@ -135,6 +135,13 @@ describe("config and templating", () => { stdout: "voice transcript\n", stderr: "", }); + const commandRunner = vi.fn().mockResolvedValue({ + stdout: "ok", + stderr: "", + code: 0, + signal: null, + killed: false, + }); const result = await index.getReplyFromConfig( { @@ -146,11 +153,17 @@ describe("config and templating", () => { }, undefined, cfg, + commandRunner, ); expect(runExec).toHaveBeenCalled(); - expect(result?.text).toContain("voice transcript"); - expect(result?.text).toContain("/tmp/voice.ogg"); + expect(commandRunner).toHaveBeenCalled(); + const argv = commandRunner.mock.calls[0][0]; + const prompt = argv[argv.length - 1] as string; + expect(prompt).toContain("/tmp/voice.ogg"); + expect(prompt).toContain("Transcript:"); + expect(prompt).toContain("voice transcript"); + expect(result?.text).toBeUndefined(); }); it("getReplyFromConfig skips transcription when not configured", async () => { diff --git a/src/provider-web.test.ts b/src/provider-web.test.ts index 626312642..24163cb73 100644 --- a/src/provider-web.test.ts +++ b/src/provider-web.test.ts @@ -505,104 +505,105 @@ describe("provider-web", () => { "compresses common formats to jpeg under the cap", { timeout: 15_000 }, async () => { - const formats = [ - { - 
name: "png", - mime: "image/png", - make: (buf: Buffer, opts: { width: number; height: number }) => - sharp(buf, { - raw: { width: opts.width, height: opts.height, channels: 3 }, - }) - .png({ compressionLevel: 0 }) - .toBuffer(), - }, - { - name: "jpeg", - mime: "image/jpeg", - make: (buf: Buffer, opts: { width: number; height: number }) => - sharp(buf, { - raw: { width: opts.width, height: opts.height, channels: 3 }, - }) - .jpeg({ quality: 100, chromaSubsampling: "4:4:4" }) - .toBuffer(), - }, - { - name: "webp", - mime: "image/webp", - make: (buf: Buffer, opts: { width: number; height: number }) => - sharp(buf, { - raw: { width: opts.width, height: opts.height, channels: 3 }, - }) - .webp({ quality: 100 }) - .toBuffer(), - }, - ] as const; + const formats = [ + { + name: "png", + mime: "image/png", + make: (buf: Buffer, opts: { width: number; height: number }) => + sharp(buf, { + raw: { width: opts.width, height: opts.height, channels: 3 }, + }) + .png({ compressionLevel: 0 }) + .toBuffer(), + }, + { + name: "jpeg", + mime: "image/jpeg", + make: (buf: Buffer, opts: { width: number; height: number }) => + sharp(buf, { + raw: { width: opts.width, height: opts.height, channels: 3 }, + }) + .jpeg({ quality: 100, chromaSubsampling: "4:4:4" }) + .toBuffer(), + }, + { + name: "webp", + mime: "image/webp", + make: (buf: Buffer, opts: { width: number; height: number }) => + sharp(buf, { + raw: { width: opts.width, height: opts.height, channels: 3 }, + }) + .webp({ quality: 100 }) + .toBuffer(), + }, + ] as const; - for (const fmt of formats) { - // Force a small cap to ensure compression is exercised for every format. 
- loadConfigMock = () => ({ inbound: { reply: { mediaMaxMb: 1 } } }); - const sendMedia = vi.fn(); - const reply = vi.fn().mockResolvedValue(undefined); - const sendComposing = vi.fn(); - const resolver = vi.fn().mockResolvedValue({ - text: "hi", - mediaUrl: `https://example.com/big.${fmt.name}`, - }); + for (const fmt of formats) { + // Force a small cap to ensure compression is exercised for every format. + loadConfigMock = () => ({ inbound: { reply: { mediaMaxMb: 1 } } }); + const sendMedia = vi.fn(); + const reply = vi.fn().mockResolvedValue(undefined); + const sendComposing = vi.fn(); + const resolver = vi.fn().mockResolvedValue({ + text: "hi", + mediaUrl: `https://example.com/big.${fmt.name}`, + }); - let capturedOnMessage: - | (( + let capturedOnMessage: + | (( + msg: import("./provider-web.js").WebInboundMessage, + ) => Promise<void>) + | undefined; + const listenerFactory = async (opts: { + onMessage: ( msg: import("./provider-web.js").WebInboundMessage, - ) => Promise<void>) - | undefined; - const listenerFactory = async (opts: { - onMessage: ( - msg: import("./provider-web.js").WebInboundMessage, - ) => Promise<void>; - }) => { - capturedOnMessage = opts.onMessage; - return { close: vi.fn() }; - }; + ) => Promise<void>; + }) => { + capturedOnMessage = opts.onMessage; + return { close: vi.fn() }; + }; - const width = 2000; - const height = 2000; - const raw = crypto.randomBytes(width * height * 3); - const big = await fmt.make(raw, { width, height }); - expect(big.length).toBeGreaterThan(1 * 1024 * 1024); + const width = 2000; + const height = 2000; + const raw = crypto.randomBytes(width * height * 3); + const big = await fmt.make(raw, { width, height }); + expect(big.length).toBeGreaterThan(1 * 1024 * 1024); - const fetchMock = vi.spyOn(globalThis, "fetch").mockResolvedValue({ - ok: true, - body: true, - arrayBuffer: async () => - big.buffer.slice(big.byteOffset, big.byteOffset + big.byteLength), - headers: { get: () => fmt.mime }, - status: 200, - } as Response); + const 
fetchMock = vi.spyOn(globalThis, "fetch").mockResolvedValue({ + ok: true, + body: true, + arrayBuffer: async () => + big.buffer.slice(big.byteOffset, big.byteOffset + big.byteLength), + headers: { get: () => fmt.mime }, + status: 200, + } as Response); - await monitorWebProvider(false, listenerFactory, false, resolver); - expect(capturedOnMessage).toBeDefined(); + await monitorWebProvider(false, listenerFactory, false, resolver); + expect(capturedOnMessage).toBeDefined(); - await capturedOnMessage?.({ - body: "hello", - from: "+1", - to: "+2", - id: `msg-${fmt.name}`, - sendComposing, - reply, - sendMedia, - }); + await capturedOnMessage?.({ + body: "hello", + from: "+1", + to: "+2", + id: `msg-${fmt.name}`, + sendComposing, + reply, + sendMedia, + }); - expect(sendMedia).toHaveBeenCalledTimes(1); - const payload = sendMedia.mock.calls[0][0] as { - image: Buffer; - mimetype?: string; - }; - expect(payload.image.length).toBeLessThanOrEqual(1 * 1024 * 1024); - expect(payload.mimetype).toBe("image/jpeg"); - expect(reply).not.toHaveBeenCalled(); + expect(sendMedia).toHaveBeenCalledTimes(1); + const payload = sendMedia.mock.calls[0][0] as { + image: Buffer; + mimetype?: string; + }; + expect(payload.image.length).toBeLessThanOrEqual(1 * 1024 * 1024); + expect(payload.mimetype).toBe("image/jpeg"); + expect(reply).not.toHaveBeenCalled(); - fetchMock.mockRestore(); - } - }); + fetchMock.mockRestore(); + } + }, + ); it("honors mediaMaxMb from config", async () => { loadConfigMock = () => ({ inbound: { reply: { mediaMaxMb: 1 } } });