feat: optional audio transcription via CLI
This commit is contained in:
@@ -4,6 +4,7 @@
|
||||
|
||||
### Pending
|
||||
- Web auto-replies now resize/recompress media and honor `inbound.reply.mediaMaxMb` in `~/.warelay/warelay.json` (default 5 MB) to avoid provider/API limits.
|
||||
- Optional voice-note transcription: set `inbound.transcribeAudio.command` (e.g., OpenAI Whisper CLI) to turn inbound audio into text before templating/Claude; verbose logs surface when transcription runs.
|
||||
|
||||
## 1.0.4 — 2025-11-25
|
||||
|
||||
|
||||
26
README.md
26
README.md
@@ -51,6 +51,32 @@ Install from npm (global): `npm install -g warelay` (Node 22+). Then choose **on
|
||||
- Web: `warelay send --provider web --media ./pic.jpg --message "Hi"` (local path or URL; no hosting needed).
|
||||
- Auto-replies can attach `mediaUrl` in `~/.warelay/warelay.json` (used alongside `text` when present). Web auto-replies now auto-resize/recompress images and cap size by config: set `inbound.reply.mediaMaxMb` (default 5) to control the post-compression limit; images are resized (max side 2048px) and JPEG-compressed to fit.
|
||||
|
||||
### Voice notes (optional transcription)
|
||||
- If you set `inbound.transcribeAudio.command`, warelay will run that CLI when inbound audio arrives (e.g., WhatsApp voice notes) and replace the Body with the transcript before templating/Claude.
|
||||
- Example using OpenAI Whisper CLI (requires `OPENAI_API_KEY`):
|
||||
```json5
|
||||
{
|
||||
inbound: {
|
||||
transcribeAudio: {
|
||||
command: [
|
||||
"openai",
|
||||
"api",
|
||||
"audio.transcriptions.create",
|
||||
"-m",
|
||||
"whisper-1",
|
||||
"-f",
|
||||
"{{MediaPath}}",
|
||||
"--response-format",
|
||||
"text"
|
||||
],
|
||||
timeoutSeconds: 45
|
||||
},
|
||||
reply: { mode: "command", command: ["claude", "{{Body}}"] }
|
||||
}
|
||||
}
|
||||
```
|
||||
- Works for Web and Twilio providers; verbose mode logs when transcription runs. If transcription fails, the original Body is used.
|
||||
|
||||
## Providers
|
||||
- **Twilio (default):** needs `.env` creds + WhatsApp-enabled number; supports delivery tracking, polling, webhooks, and auto-reply typing indicators.
|
||||
- **Web (`--provider web`):** uses your personal WhatsApp via Baileys; supports send/receive + auto-reply, but no delivery-status wait; cache lives in `~/.warelay/credentials/` (rerun `login` if logged out).
|
||||
|
||||
@@ -59,6 +59,7 @@ Notes on this configuration:
|
||||
- Host local paths for Twilio using the media server/Tailscale Funnel.
|
||||
- Send buffers directly for the Web provider.
|
||||
- Inbound media is downloaded (≤5 MB) and exposed to your templates as `{{MediaPath}}`, `{{MediaUrl}}`, and `{{MediaType}}`. You can mention this in your prompt if you want Claude to reason about the attachment. Outbound media from Claude (via `MEDIA:`) is resized/recompressed on the Web provider path; control the cap with `inbound.reply.mediaMaxMb` (default 5).
|
||||
- Voice notes: set `inbound.transcribeAudio.command` to run a CLI that emits the transcript to stdout (e.g., OpenAI Whisper: `openai api audio.transcriptions.create -m whisper-1 -f {{MediaPath}} --response-format text`). If it succeeds, warelay replaces `Body` with the transcript before invoking Claude; if the command fails or prints nothing, the original `Body` is kept.
|
||||
|
||||
## Testing the setup
|
||||
1. Start a relay (auto-selects Web when logged in, otherwise Twilio polling):
|
||||
|
||||
@@ -57,6 +57,7 @@ This document defines how `warelay` should handle sending and replying with imag
|
||||
- `{{MediaUrl}}` original URL (Twilio) or pseudo-URL (web).
|
||||
- `{{MediaPath}}` local temp path written before running the command.
|
||||
- Size guard: only download if ≤5 MB; else skip and log.
|
||||
- Audio/voice notes: if you set `inbound.transcribeAudio.command`, warelay will run that CLI (templated with `{{MediaPath}}`) and replace `Body` with the transcript before continuing the reply flow; verbose logs indicate when transcription runs.
|
||||
|
||||
## Errors & Messaging
|
||||
- Local path with twilio + Funnel disabled → error: “Twilio media needs a public URL; start `warelay webhook --ingress tailscale` or pass an https:// URL.”
|
||||
|
||||
@@ -1,4 +1,6 @@
|
||||
import crypto from "node:crypto";
|
||||
import fs from "node:fs/promises";
|
||||
import os from "node:os";
|
||||
import path from "node:path";
|
||||
|
||||
import type { MessageInstance } from "twilio/lib/rest/api/v2010/account/message.js";
|
||||
@@ -16,7 +18,7 @@ import { logError } from "../logger.js";
|
||||
import { ensureMediaHosted } from "../media/host.js";
|
||||
import { splitMediaFromOutput } from "../media/parse.js";
|
||||
import { enqueueCommand } from "../process/command-queue.js";
|
||||
import { runCommandWithTimeout } from "../process/exec.js";
|
||||
import { runCommandWithTimeout, runExec } from "../process/exec.js";
|
||||
import { defaultRuntime, type RuntimeEnv } from "../runtime.js";
|
||||
import type { TwilioRequester } from "../twilio/types.js";
|
||||
import { sendTypingIndicator } from "../twilio/typing.js";
|
||||
@@ -109,6 +111,15 @@ export async function getReplyFromConfig(
|
||||
await opts?.onReplyStart?.();
|
||||
};
|
||||
|
||||
// Optional audio transcription before templating/session handling.
|
||||
if (cfg.inbound?.transcribeAudio && isAudio(ctx.MediaType)) {
|
||||
const transcribed = await transcribeInboundAudio(cfg, ctx, defaultRuntime);
|
||||
if (transcribed?.text) {
|
||||
ctx.Body = transcribed.text;
|
||||
logVerbose("Replaced Body with audio transcript for reply flow");
|
||||
}
|
||||
}
|
||||
|
||||
// Optional session handling (conversation reuse + /new resets)
|
||||
const sessionCfg = reply?.session;
|
||||
const resetTriggers = sessionCfg?.resetTriggers?.length
|
||||
@@ -166,6 +177,18 @@ export async function getReplyFromConfig(
|
||||
IsNewSession: isNewSession ? "true" : "false",
|
||||
};
|
||||
|
||||
// Optional allowlist by origin number (E.164 without whatsapp: prefix)
|
||||
const allowFrom = cfg.inbound?.allowFrom;
|
||||
if (Array.isArray(allowFrom) && allowFrom.length > 0) {
|
||||
const from = (ctx.From ?? "").replace(/^whatsapp:/, "");
|
||||
if (!allowFrom.includes(from)) {
|
||||
logVerbose(
|
||||
`Skipping auto-reply: sender ${from || "<unknown>"} not in allowFrom list`,
|
||||
);
|
||||
return undefined;
|
||||
}
|
||||
}
|
||||
|
||||
// Optional prefix injected before Body for templating/command prompts.
|
||||
const bodyPrefix = reply?.bodyPrefix
|
||||
? applyTemplate(reply.bodyPrefix, sessionCtx)
|
||||
@@ -192,18 +215,6 @@ export async function getReplyFromConfig(
|
||||
Body: commandBody,
|
||||
BodyStripped: commandBody,
|
||||
};
|
||||
|
||||
// Optional allowlist by origin number (E.164 without whatsapp: prefix)
|
||||
const allowFrom = cfg.inbound?.allowFrom;
|
||||
if (Array.isArray(allowFrom) && allowFrom.length > 0) {
|
||||
const from = (ctx.From ?? "").replace(/^whatsapp:/, "");
|
||||
if (!allowFrom.includes(from)) {
|
||||
logVerbose(
|
||||
`Skipping auto-reply: sender ${from || "<unknown>"} not in allowFrom list`,
|
||||
);
|
||||
return undefined;
|
||||
}
|
||||
}
|
||||
if (!reply) {
|
||||
logVerbose("No inbound.reply configured; skipping auto-reply");
|
||||
return undefined;
|
||||
@@ -431,13 +442,38 @@ export async function autoReplyIfConfigured(
|
||||
To: message.to ?? undefined,
|
||||
MessageSid: message.sid,
|
||||
};
|
||||
const cfg = configOverride ?? loadConfig();
|
||||
// Attach media hints for transcription/templates if present on Twilio payloads.
|
||||
const mediaUrl = (message as { mediaUrl?: string }).mediaUrl;
|
||||
if (mediaUrl) ctx.MediaUrl = mediaUrl;
|
||||
|
||||
// Optional audio transcription before building reply.
|
||||
if (cfg.inbound?.transcribeAudio && message.media?.length) {
|
||||
const media = message.media[0];
|
||||
const contentType = (media as { contentType?: string }).contentType;
|
||||
if (contentType?.startsWith("audio")) {
|
||||
const transcribed = await transcribeInboundAudio(
|
||||
cfg,
|
||||
{
|
||||
mediaUrl: mediaUrl ?? undefined,
|
||||
contentType,
|
||||
},
|
||||
runtime,
|
||||
);
|
||||
if (transcribed?.text) {
|
||||
ctx.Body = transcribed.text;
|
||||
ctx.MediaType = contentType;
|
||||
logVerbose("Replaced Body with audio transcript for reply flow");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const replyResult = await getReplyFromConfig(
|
||||
ctx,
|
||||
{
|
||||
onReplyStart: () => sendTypingIndicator(client, runtime, message.sid),
|
||||
},
|
||||
configOverride,
|
||||
cfg,
|
||||
);
|
||||
if (!replyResult || (!replyResult.text && !replyResult.mediaUrl)) return;
|
||||
|
||||
@@ -505,3 +541,64 @@ export async function autoReplyIfConfigured(
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Reports whether a media type string denotes audio.
 *
 * @param mediaType - MIME type of the inbound media; may be absent or null.
 * @returns `true` only when a non-empty string starting with "audio" is given.
 */
function isAudio(mediaType?: string | null): boolean {
  // Absent, null, and empty values are never audio.
  if (typeof mediaType !== "string" || mediaType.length === 0) {
    return false;
  }
  return mediaType.startsWith("audio");
}
|
||||
|
||||
/**
 * File extensions for common audio MIME types, so a downloaded temp file is
 * named after its actual content. Unknown types fall back to ".ogg".
 */
const AUDIO_EXTENSION_BY_MIME: ReadonlyMap<string, string> = new Map([
  ["audio/ogg", ".ogg"],
  ["audio/opus", ".ogg"],
  ["audio/mpeg", ".mp3"],
  ["audio/mp3", ".mp3"],
  ["audio/mp4", ".m4a"],
  ["audio/x-m4a", ".m4a"],
  ["audio/wav", ".wav"],
  ["audio/x-wav", ".wav"],
  ["audio/webm", ".webm"],
  ["audio/flac", ".flac"],
]);

/** Picks a temp-file extension from a MIME type, ignoring parameters and case. */
function audioExtensionFor(contentType?: string | null): string {
  const mime = (contentType ?? "").split(";", 1)[0]?.trim().toLowerCase() ?? "";
  return AUDIO_EXTENSION_BY_MIME.get(mime) ?? ".ogg";
}

/**
 * Runs the configured transcription CLI against inbound audio and returns the
 * trimmed stdout as the transcript.
 *
 * The audio file is taken from `ctx.MediaPath`; when that is missing and
 * `ctx.MediaUrl` is set, the media is downloaded to a temp file first (and
 * removed again afterwards). Each element of the configured command is passed
 * through `applyTemplate` with `MediaPath` pointing at the audio file.
 *
 * This is best-effort: any failure (download error, download timeout, command
 * failure) is reported through `runtime.error` and yields `undefined`.
 *
 * @param cfg - Loaded config; `cfg.inbound.transcribeAudio` supplies the
 *   command and optional `timeoutSeconds` (default 45, minimum 1 second).
 * @param ctx - Message context providing `MediaPath` and/or `MediaUrl`.
 * @param runtime - Runtime whose optional `error` hook receives failures.
 * @returns `{ text }` with the non-empty transcript, or `undefined` when no
 *   command is configured, no media is available, the output is empty, or
 *   anything fails.
 */
async function transcribeInboundAudio(
  cfg: WarelayConfig,
  ctx: MsgContext,
  runtime: RuntimeEnv,
): Promise<{ text: string } | undefined> {
  const transcriber = cfg.inbound?.transcribeAudio;
  if (!transcriber?.command?.length) return undefined;

  const timeoutMs = Math.max((transcriber.timeoutSeconds ?? 45) * 1000, 1_000);
  let tmpPath: string | undefined;
  let mediaPath = ctx.MediaPath;
  try {
    if (!mediaPath && ctx.MediaUrl) {
      // Bound the download with the same budget as the CLI so a stalled
      // media host cannot hang the reply flow indefinitely.
      const res = await fetch(ctx.MediaUrl, {
        signal: AbortSignal.timeout(timeoutMs),
      });
      if (!res.ok) throw new Error(`HTTP ${res.status}`);
      const buffer = Buffer.from(await res.arrayBuffer());
      // Name the temp file after the real audio type; the extension is the
      // only format hint a CLI gets from a path.
      const extension = audioExtensionFor(
        ctx.MediaType || res.headers.get("content-type"),
      );
      tmpPath = path.join(
        os.tmpdir(),
        `warelay-audio-${crypto.randomUUID()}${extension}`,
      );
      await fs.writeFile(tmpPath, buffer);
      mediaPath = tmpPath;
      if (isVerbose()) {
        logVerbose(
          `Downloaded audio for transcription (${(buffer.length / (1024 * 1024)).toFixed(2)}MB) -> ${tmpPath}`,
        );
      }
    }
    if (!mediaPath) return undefined;

    const templCtx: MsgContext = { ...ctx, MediaPath: mediaPath };
    const argv = transcriber.command.map((part) =>
      applyTemplate(part, templCtx),
    );
    const [executable, ...args] = argv;
    if (!executable) {
      // An empty (or templated-to-empty) first element cannot be executed.
      runtime.error?.(
        "Audio transcription failed: command resolved to an empty executable",
      );
      return undefined;
    }
    if (isVerbose()) {
      logVerbose(`Transcribing audio via command: ${argv.join(" ")}`);
    }
    const { stdout } = await runExec(executable, args, {
      timeoutMs,
      maxBuffer: 5 * 1024 * 1024,
    });
    const text = stdout.trim();
    if (!text) return undefined;
    return { text };
  } catch (err) {
    runtime.error?.(`Audio transcription failed: ${String(err)}`);
    return undefined;
  } finally {
    if (tmpPath) {
      // Best-effort cleanup of the downloaded temp file; errors are ignored.
      void fs.unlink(tmpPath).catch(() => {});
    }
  }
}
|
||||
|
||||
@@ -28,6 +28,11 @@ export type WarelayConfig = {
|
||||
logging?: LoggingConfig;
|
||||
inbound?: {
|
||||
allowFrom?: string[]; // E.164 numbers allowed to trigger auto-reply (without whatsapp:)
|
||||
transcribeAudio?: {
|
||||
// Optional CLI to turn inbound audio into text; templated args, must output transcript to stdout.
|
||||
command: string[];
|
||||
timeoutSeconds?: number;
|
||||
};
|
||||
reply?: {
|
||||
mode: ReplyMode;
|
||||
text?: string; // for mode=text, can contain {{Body}}
|
||||
@@ -107,6 +112,12 @@ const WarelaySchema = z.object({
|
||||
inbound: z
|
||||
.object({
|
||||
allowFrom: z.array(z.string()).optional(),
|
||||
transcribeAudio: z
|
||||
.object({
|
||||
command: z.array(z.string()),
|
||||
timeoutSeconds: z.number().int().positive().optional(),
|
||||
})
|
||||
.optional(),
|
||||
reply: ReplySchema.optional(),
|
||||
})
|
||||
.optional(),
|
||||
|
||||
@@ -6,6 +6,7 @@ import type { MessageInstance } from "twilio/lib/rest/api/v2010/account/message.
|
||||
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
|
||||
import { createMockTwilio } from "../test/mocks/twilio.js";
|
||||
import { withWhatsAppPrefix } from "./utils.js";
|
||||
import * as exec from "./process/exec.js";
|
||||
|
||||
// Twilio mock factory shared across tests
|
||||
vi.mock("twilio", () => {
|
||||
@@ -117,6 +118,69 @@ describe("config and templating", () => {
|
||||
expect(result?.text).toContain("http://example.com/a.jpg");
|
||||
});
|
||||
|
||||
it("getReplyFromConfig runs audio transcription command when configured", async () => {
  // Stub the exec helper so no real CLI is spawned; it reports a transcript.
  const execSpy = vi.spyOn(exec, "runExec").mockResolvedValue({
    stdout: "voice transcript\n",
    stderr: "",
  });

  // Text-mode reply that echoes Body, with a transcription command configured.
  const config = {
    inbound: {
      transcribeAudio: {
        command: ["echo", "voice transcript"],
      },
      reply: {
        mode: "text" as const,
        text: "{{Body}}",
      },
    },
  };

  // Inbound voice note that already has a local media path.
  const inboundAudio = {
    Body: "<media:audio>",
    From: "+1",
    To: "+2",
    MediaPath: "/tmp/voice.ogg",
    MediaType: "audio/ogg",
  };

  const reply = await index.getReplyFromConfig(inboundAudio, undefined, config);

  // The transcriber ran, and the reply carries both transcript and media path.
  expect(execSpy).toHaveBeenCalled();
  expect(reply?.text).toContain("voice transcript");
  expect(reply?.text).toContain("/tmp/voice.ogg");
});
|
||||
|
||||
it("getReplyFromConfig skips transcription when not configured", async () => {
  // Observe the exec helper without changing its behavior.
  const execSpy = vi.spyOn(exec, "runExec");

  // Text-mode reply that echoes Body; no transcribeAudio section at all.
  const config = {
    inbound: {
      reply: {
        mode: "text" as const,
        text: "{{Body}}",
      },
    },
  };

  // Same inbound voice note shape as the transcription-enabled case.
  const inboundAudio = {
    Body: "<media:audio>",
    From: "+1",
    To: "+2",
    MediaPath: "/tmp/voice.ogg",
    MediaType: "audio/ogg",
  };

  const reply = await index.getReplyFromConfig(inboundAudio, undefined, config);

  // No command was run, and the original placeholder Body is still present.
  expect(execSpy).not.toHaveBeenCalled();
  expect(reply?.text).toContain("/tmp/voice.ogg");
  expect(reply?.text).toContain("<media:audio>");
});
|
||||
|
||||
it("getReplyFromConfig extracts media URL from command stdout", async () => {
|
||||
const runSpy = vi.spyOn(index, "runCommandWithTimeout").mockResolvedValue({
|
||||
stdout: "hello\nMEDIA: https://example.com/img.jpg\n",
|
||||
|
||||
Reference in New Issue
Block a user