feat: transcribe audio and surface transcript to prompts
This commit is contained in:
47
docs/audio.md
Normal file
47
docs/audio.md
Normal file
@@ -0,0 +1,47 @@
|
|||||||
|
# Audio / Voice Notes — 2025-11-25
|
||||||
|
|
||||||
|
## What works
|
||||||
|
- **Optional transcription**: If `inbound.transcribeAudio.command` is set in `~/.warelay/warelay.json`, warelay will:
|
||||||
|
1) Download inbound audio (Web or Twilio) to a temp path if only a URL is present.
|
||||||
|
2) Run the configured CLI (templated with `{{MediaPath}}`), expecting transcript on stdout.
|
||||||
|
3) Replace `Body` with the transcript, expose it to templates as `{{Transcript}}`, and (in command mode) include the original media path plus a `Transcript:` section in the command prompt so models see both.
|
||||||
|
4) Continue through the normal auto-reply pipeline (templating, sessions, Claude/command).
|
||||||
|
- **Verbose logging**: With `--verbose`, we log when transcription runs and when the transcript replaces the body.
|
||||||
|
|
||||||
|
## Config example (OpenAI Whisper CLI)
|
||||||
|
Requires `OPENAI_API_KEY` in the environment and the `openai` CLI to be installed:
|
||||||
|
```json5
|
||||||
|
{
|
||||||
|
inbound: {
|
||||||
|
transcribeAudio: {
|
||||||
|
command: [
|
||||||
|
"openai",
|
||||||
|
"api",
|
||||||
|
"audio.transcriptions.create",
|
||||||
|
"-m",
|
||||||
|
"whisper-1",
|
||||||
|
"-f",
|
||||||
|
"{{MediaPath}}",
|
||||||
|
"--response-format",
|
||||||
|
"text"
|
||||||
|
],
|
||||||
|
timeoutSeconds: 45
|
||||||
|
},
|
||||||
|
reply: {
|
||||||
|
mode: "command",
|
||||||
|
command: ["claude", "{{Body}}"]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Notes & limits
|
||||||
|
- We don’t ship a transcriber; you opt in with any CLI that prints text to stdout (Whisper cloud, whisper.cpp, vosk, Deepgram, etc.).
|
||||||
|
- Size guard: inbound audio must be ≤5 MB (same as other media).
|
||||||
|
- If transcription fails, we fall back to the original body/media note; replies still go through.
|
||||||
|
- Transcript is available to templates as `{{Transcript}}`; models get both the media path and a `Transcript:` block in the prompt when using command mode.
|
||||||
|
|
||||||
|
## Gotchas
|
||||||
|
- Ensure your CLI exits 0 and prints plain text to stdout; if it emits JSON, pipe the output through something like `jq -r .text` so only the transcript text is printed.
|
||||||
|
- Keep timeouts reasonable (`timeoutSeconds`, default 45s) to avoid blocking the reply queue.
|
||||||
|
- Twilio media paths are hosted URLs; Web media paths are local files. Twilio audio is fetched over HTTPS into a temp file, while Web-only media uses its local temp file.
|
||||||
@@ -57,7 +57,7 @@ This document defines how `warelay` should handle sending and replying with imag
|
|||||||
- `{{MediaUrl}}` original URL (Twilio) or pseudo-URL (web).
|
- `{{MediaUrl}}` original URL (Twilio) or pseudo-URL (web).
|
||||||
- `{{MediaPath}}` local temp path written before running the command.
|
- `{{MediaPath}}` local temp path written before running the command.
|
||||||
- Size guard: only download if ≤5 MB; else skip and log.
|
- Size guard: only download if ≤5 MB; else skip and log.
|
||||||
- Audio/voice notes: if you set `inbound.transcribeAudio.command`, warelay will run that CLI (templated with `{{MediaPath}}`) and replace `Body` with the transcript before continuing the reply flow; verbose logs indicate when transcription runs.
|
- Audio/voice notes: if you set `inbound.transcribeAudio.command`, warelay will run that CLI (templated with `{{MediaPath}}`) and replace `Body` with the transcript before continuing the reply flow; verbose logs indicate when transcription runs. The command prompt includes the original media path plus a `Transcript:` section so the model sees both.
|
||||||
|
|
||||||
## Errors & Messaging
|
## Errors & Messaging
|
||||||
- Local path with twilio + Funnel disabled → error: “Twilio media needs a public URL; start `warelay webhook --ingress tailscale` or pass an https:// URL.”
|
- Local path with twilio + Funnel disabled → error: “Twilio media needs a public URL; start `warelay webhook --ingress tailscale` or pass an https:// URL.”
|
||||||
|
|||||||
@@ -110,12 +110,15 @@ export async function getReplyFromConfig(
|
|||||||
started = true;
|
started = true;
|
||||||
await opts?.onReplyStart?.();
|
await opts?.onReplyStart?.();
|
||||||
};
|
};
|
||||||
|
let transcribedText: string | undefined;
|
||||||
|
|
||||||
// Optional audio transcription before templating/session handling.
|
// Optional audio transcription before templating/session handling.
|
||||||
if (cfg.inbound?.transcribeAudio && isAudio(ctx.MediaType)) {
|
if (cfg.inbound?.transcribeAudio && isAudio(ctx.MediaType)) {
|
||||||
const transcribed = await transcribeInboundAudio(cfg, ctx, defaultRuntime);
|
const transcribed = await transcribeInboundAudio(cfg, ctx, defaultRuntime);
|
||||||
if (transcribed?.text) {
|
if (transcribed?.text) {
|
||||||
|
transcribedText = transcribed.text;
|
||||||
ctx.Body = transcribed.text;
|
ctx.Body = transcribed.text;
|
||||||
|
ctx.Transcript = transcribed.text;
|
||||||
logVerbose("Replaced Body with audio transcript for reply flow");
|
logVerbose("Replaced Body with audio transcript for reply flow");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -193,9 +196,15 @@ export async function getReplyFromConfig(
|
|||||||
const bodyPrefix = reply?.bodyPrefix
|
const bodyPrefix = reply?.bodyPrefix
|
||||||
? applyTemplate(reply.bodyPrefix, sessionCtx)
|
? applyTemplate(reply.bodyPrefix, sessionCtx)
|
||||||
: "";
|
: "";
|
||||||
const prefixedBody = bodyPrefix
|
const prefixedBodyBase = bodyPrefix
|
||||||
? `${bodyPrefix}${sessionCtx.BodyStripped ?? sessionCtx.Body ?? ""}`
|
? `${bodyPrefix}${sessionCtx.BodyStripped ?? sessionCtx.Body ?? ""}`
|
||||||
: (sessionCtx.BodyStripped ?? sessionCtx.Body);
|
: (sessionCtx.BodyStripped ?? sessionCtx.Body);
|
||||||
|
const prefixedBody =
|
||||||
|
transcribedText && reply?.mode === "command"
|
||||||
|
? [prefixedBodyBase, `Transcript:\n${transcribedText}`]
|
||||||
|
.filter(Boolean)
|
||||||
|
.join("\n\n")
|
||||||
|
: prefixedBodyBase;
|
||||||
const mediaNote = ctx.MediaPath?.length
|
const mediaNote = ctx.MediaPath?.length
|
||||||
? `[media attached: ${ctx.MediaPath}${ctx.MediaType ? ` (${ctx.MediaType})` : ""}${ctx.MediaUrl ? ` | ${ctx.MediaUrl}` : ""}]`
|
? `[media attached: ${ctx.MediaPath}${ctx.MediaType ? ` (${ctx.MediaType})` : ""}${ctx.MediaUrl ? ` | ${ctx.MediaUrl}` : ""}]`
|
||||||
: undefined;
|
: undefined;
|
||||||
@@ -543,7 +552,7 @@ export async function autoReplyIfConfigured(
|
|||||||
}
|
}
|
||||||
|
|
||||||
function isAudio(mediaType?: string | null) {
|
function isAudio(mediaType?: string | null) {
|
||||||
return Boolean(mediaType && mediaType.startsWith("audio"));
|
return Boolean(mediaType?.startsWith("audio"));
|
||||||
}
|
}
|
||||||
|
|
||||||
async function transcribeInboundAudio(
|
async function transcribeInboundAudio(
|
||||||
@@ -596,9 +605,7 @@ async function transcribeInboundAudio(
|
|||||||
return undefined;
|
return undefined;
|
||||||
} finally {
|
} finally {
|
||||||
if (tmpPath) {
|
if (tmpPath) {
|
||||||
void fs
|
void fs.unlink(tmpPath).catch(() => {});
|
||||||
.unlink(tmpPath)
|
|
||||||
.catch(() => {});
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ export type MsgContext = {
|
|||||||
MediaPath?: string;
|
MediaPath?: string;
|
||||||
MediaUrl?: string;
|
MediaUrl?: string;
|
||||||
MediaType?: string;
|
MediaType?: string;
|
||||||
|
Transcript?: string;
|
||||||
};
|
};
|
||||||
|
|
||||||
export type TemplateContext = MsgContext & {
|
export type TemplateContext = MsgContext & {
|
||||||
|
|||||||
@@ -5,8 +5,8 @@ import path from "node:path";
|
|||||||
import type { MessageInstance } from "twilio/lib/rest/api/v2010/account/message.js";
|
import type { MessageInstance } from "twilio/lib/rest/api/v2010/account/message.js";
|
||||||
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
|
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
|
||||||
import { createMockTwilio } from "../test/mocks/twilio.js";
|
import { createMockTwilio } from "../test/mocks/twilio.js";
|
||||||
import { withWhatsAppPrefix } from "./utils.js";
|
|
||||||
import * as exec from "./process/exec.js";
|
import * as exec from "./process/exec.js";
|
||||||
|
import { withWhatsAppPrefix } from "./utils.js";
|
||||||
|
|
||||||
// Twilio mock factory shared across tests
|
// Twilio mock factory shared across tests
|
||||||
vi.mock("twilio", () => {
|
vi.mock("twilio", () => {
|
||||||
@@ -125,8 +125,8 @@ describe("config and templating", () => {
|
|||||||
command: ["echo", "voice transcript"],
|
command: ["echo", "voice transcript"],
|
||||||
},
|
},
|
||||||
reply: {
|
reply: {
|
||||||
mode: "text" as const,
|
mode: "command" as const,
|
||||||
text: "{{Body}}",
|
command: ["echo", "{{Body}}"],
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
@@ -135,6 +135,13 @@ describe("config and templating", () => {
|
|||||||
stdout: "voice transcript\n",
|
stdout: "voice transcript\n",
|
||||||
stderr: "",
|
stderr: "",
|
||||||
});
|
});
|
||||||
|
const commandRunner = vi.fn().mockResolvedValue({
|
||||||
|
stdout: "ok",
|
||||||
|
stderr: "",
|
||||||
|
code: 0,
|
||||||
|
signal: null,
|
||||||
|
killed: false,
|
||||||
|
});
|
||||||
|
|
||||||
const result = await index.getReplyFromConfig(
|
const result = await index.getReplyFromConfig(
|
||||||
{
|
{
|
||||||
@@ -146,11 +153,17 @@ describe("config and templating", () => {
|
|||||||
},
|
},
|
||||||
undefined,
|
undefined,
|
||||||
cfg,
|
cfg,
|
||||||
|
commandRunner,
|
||||||
);
|
);
|
||||||
|
|
||||||
expect(runExec).toHaveBeenCalled();
|
expect(runExec).toHaveBeenCalled();
|
||||||
expect(result?.text).toContain("voice transcript");
|
expect(commandRunner).toHaveBeenCalled();
|
||||||
expect(result?.text).toContain("/tmp/voice.ogg");
|
const argv = commandRunner.mock.calls[0][0];
|
||||||
|
const prompt = argv[argv.length - 1] as string;
|
||||||
|
expect(prompt).toContain("/tmp/voice.ogg");
|
||||||
|
expect(prompt).toContain("Transcript:");
|
||||||
|
expect(prompt).toContain("voice transcript");
|
||||||
|
expect(result?.text).toBeUndefined();
|
||||||
});
|
});
|
||||||
|
|
||||||
it("getReplyFromConfig skips transcription when not configured", async () => {
|
it("getReplyFromConfig skips transcription when not configured", async () => {
|
||||||
|
|||||||
@@ -602,7 +602,8 @@ describe("provider-web", () => {
|
|||||||
|
|
||||||
fetchMock.mockRestore();
|
fetchMock.mockRestore();
|
||||||
}
|
}
|
||||||
});
|
},
|
||||||
|
);
|
||||||
|
|
||||||
it("honors mediaMaxMb from config", async () => {
|
it("honors mediaMaxMb from config", async () => {
|
||||||
loadConfigMock = () => ({ inbound: { reply: { mediaMaxMb: 1 } } });
|
loadConfigMock = () => ({ inbound: { reply: { mediaMaxMb: 1 } } });
|
||||||
|
|||||||
Reference in New Issue
Block a user