feat: add Deepgram audio transcription
Co-authored-by: Safzan Pirani <safzanpirani@users.noreply.github.com>
This commit is contained in:
@@ -62,8 +62,24 @@ read_when:
|
|||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Provider-only (Deepgram)
|
||||||
|
```json5
|
||||||
|
{
|
||||||
|
tools: {
|
||||||
|
media: {
|
||||||
|
audio: {
|
||||||
|
enabled: true,
|
||||||
|
models: [{ provider: "deepgram", model: "nova-3" }]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
## Notes & limits
|
## Notes & limits
|
||||||
- Provider auth follows the standard model auth order (auth profiles, env vars, `models.providers.*.apiKey`).
|
- Provider auth follows the standard model auth order (auth profiles, env vars, `models.providers.*.apiKey`).
|
||||||
|
- Deepgram picks up `DEEPGRAM_API_KEY` when `provider: "deepgram"` is used.
|
||||||
|
- Deepgram setup details: [Deepgram (audio transcription)](/providers/deepgram).
|
||||||
- Default size cap is 20MB (`tools.media.audio.maxBytes`). Oversize audio is skipped for that model and the next entry is tried.
|
- Default size cap is 20MB (`tools.media.audio.maxBytes`). Oversize audio is skipped for that model and the next entry is tried.
|
||||||
- Default `maxChars` for audio is **unset** (full transcript). Set `tools.media.audio.maxChars` or per-entry `maxChars` to trim output.
|
- Default `maxChars` for audio is **unset** (full transcript). Set `tools.media.audio.maxChars` or per-entry `maxChars` to trim output.
|
||||||
- Use `tools.media.audio.attachments` to process multiple voice notes (`mode: "all"` + `maxAttachments`).
|
- Use `tools.media.audio.attachments` to process multiple voice notes (`mode: "all"` + `maxAttachments`).
|
||||||
|
|||||||
@@ -108,6 +108,7 @@ lists, Clawdbot can infer defaults:
|
|||||||
- `openai`, `anthropic`, `minimax`: **image**
|
- `openai`, `anthropic`, `minimax`: **image**
|
||||||
- `google` (Gemini API): **image + audio + video**
|
- `google` (Gemini API): **image + audio + video**
|
||||||
- `groq`: **audio**
|
- `groq`: **audio**
|
||||||
|
- `deepgram`: **audio**
|
||||||
|
|
||||||
For CLI entries, **set `capabilities` explicitly** to avoid surprising matches.
|
For CLI entries, **set `capabilities` explicitly** to avoid surprising matches.
|
||||||
If you omit `capabilities`, the entry is eligible for the list it appears in.
|
If you omit `capabilities`, the entry is eligible for the list it appears in.
|
||||||
@@ -116,7 +117,7 @@ If you omit `capabilities`, the entry is eligible for the list it appears in.
|
|||||||
| Capability | Provider integration | Notes |
|
| Capability | Provider integration | Notes |
|
||||||
|------------|----------------------|-------|
|
|------------|----------------------|-------|
|
||||||
| Image | OpenAI / Anthropic / Google / others via `pi-ai` | Any image-capable model in the registry works. |
|
| Image | OpenAI / Anthropic / Google / others via `pi-ai` | Any image-capable model in the registry works. |
|
||||||
| Audio | OpenAI, Groq | Provider transcription (Whisper). |
|
| Audio | OpenAI, Groq, Deepgram | Provider transcription (Whisper/Deepgram). |
|
||||||
| Video | Google (Gemini API) | Provider video understanding. |
|
| Video | Google (Gemini API) | Provider video understanding. |
|
||||||
|
|
||||||
## Recommended providers
|
## Recommended providers
|
||||||
@@ -125,8 +126,9 @@ If you omit `capabilities`, the entry is eligible for the list it appears in.
|
|||||||
- Good defaults: `openai/gpt-5.2`, `anthropic/claude-opus-4-5`, `google/gemini-3-pro-preview`.
|
- Good defaults: `openai/gpt-5.2`, `anthropic/claude-opus-4-5`, `google/gemini-3-pro-preview`.
|
||||||
|
|
||||||
**Audio**
|
**Audio**
|
||||||
- `openai/whisper-1` or `groq/whisper-large-v3-turbo`.
|
- `openai/whisper-1`, `groq/whisper-large-v3-turbo`, or `deepgram/nova-3`.
|
||||||
- CLI fallback: `whisper` binary.
|
- CLI fallback: `whisper` binary.
|
||||||
|
- Deepgram setup: [Deepgram (audio transcription)](/providers/deepgram).
|
||||||
|
|
||||||
**Video**
|
**Video**
|
||||||
- `google/gemini-3-flash-preview` (fast), `google/gemini-3-pro-preview` (richer).
|
- `google/gemini-3-flash-preview` (fast), `google/gemini-3-pro-preview` (richer).
|
||||||
|
|||||||
64
docs/providers/deepgram.md
Normal file
64
docs/providers/deepgram.md
Normal file
@@ -0,0 +1,64 @@
|
|||||||
|
---
|
||||||
|
summary: "Deepgram transcription for inbound voice notes"
|
||||||
|
read_when:
|
||||||
|
- You want Deepgram speech-to-text for audio attachments
|
||||||
|
- You need a quick Deepgram config example
|
||||||
|
---
|
||||||
|
# Deepgram (Audio Transcription)
|
||||||
|
|
||||||
|
Deepgram is a speech-to-text API. In Clawdbot it is used for **inbound audio/voice note
|
||||||
|
transcription** via `tools.media.audio`.
|
||||||
|
|
||||||
|
When enabled, Clawdbot uploads the audio file to Deepgram and injects the transcript
|
||||||
|
into the reply pipeline (`{{Transcript}}` + `[Audio]` block). This is **not streaming**;
|
||||||
|
it uses the pre-recorded transcription endpoint.
|
||||||
|
|
||||||
|
Website: https://deepgram.com
|
||||||
|
Docs: https://developers.deepgram.com
|
||||||
|
|
||||||
|
## Quick start
|
||||||
|
|
||||||
|
1) Set your API key:
|
||||||
|
```
|
||||||
|
DEEPGRAM_API_KEY=dg_...
|
||||||
|
```
|
||||||
|
|
||||||
|
2) Enable the provider:
|
||||||
|
```json5
|
||||||
|
{
|
||||||
|
tools: {
|
||||||
|
media: {
|
||||||
|
audio: {
|
||||||
|
enabled: true,
|
||||||
|
models: [{ provider: "deepgram", model: "nova-3" }]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Options
|
||||||
|
|
||||||
|
- `model`: Deepgram model id (default: `nova-3`)
|
||||||
|
- `language`: language hint (optional)
|
||||||
|
|
||||||
|
Example with language:
|
||||||
|
```json5
|
||||||
|
{
|
||||||
|
tools: {
|
||||||
|
media: {
|
||||||
|
audio: {
|
||||||
|
enabled: true,
|
||||||
|
models: [
|
||||||
|
{ provider: "deepgram", model: "nova-3", language: "en" }
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
|
||||||
|
- Authentication follows the standard provider auth order; `DEEPGRAM_API_KEY` is the simplest path.
|
||||||
|
- Output follows the same audio rules as other providers (size caps, timeouts, transcript injection).
|
||||||
@@ -34,5 +34,9 @@ Looking for chat channel docs (WhatsApp/Telegram/Discord/Slack/etc.)? See [Chann
|
|||||||
- [GLM models](/providers/glm)
|
- [GLM models](/providers/glm)
|
||||||
- [MiniMax](/providers/minimax)
|
- [MiniMax](/providers/minimax)
|
||||||
|
|
||||||
|
## Transcription providers
|
||||||
|
|
||||||
|
- [Deepgram (audio transcription)](/providers/deepgram)
|
||||||
|
|
||||||
For the full provider catalog (xAI, Groq, Mistral, etc.) and advanced configuration,
|
For the full provider catalog (xAI, Groq, Mistral, etc.) and advanced configuration,
|
||||||
see [Model providers](/concepts/model-providers).
|
see [Model providers](/concepts/model-providers).
|
||||||
|
|||||||
@@ -290,6 +290,11 @@ Live tests discover credentials the same way the CLI does. Practical implication
|
|||||||
|
|
||||||
If you want to rely on env keys (e.g. exported in your `~/.profile`), run local tests after `source ~/.profile`, or use the Docker runners below (they can mount `~/.profile` into the container).
|
If you want to rely on env keys (e.g. exported in your `~/.profile`), run local tests after `source ~/.profile`, or use the Docker runners below (they can mount `~/.profile` into the container).
|
||||||
|
|
||||||
|
## Deepgram live (audio transcription)
|
||||||
|
|
||||||
|
- Test: `src/media-understanding/providers/deepgram/audio.live.test.ts`
|
||||||
|
- Enable: `DEEPGRAM_API_KEY=... DEEPGRAM_LIVE_TEST=1 pnpm test:live src/media-understanding/providers/deepgram/audio.live.test.ts`
|
||||||
|
|
||||||
## Docker runners (optional “works in Linux” checks)
|
## Docker runners (optional “works in Linux” checks)
|
||||||
|
|
||||||
These run `pnpm test:live` inside the repo Docker image, mounting your local config dir and workspace (and sourcing `~/.profile` if mounted):
|
These run `pnpm test:live` inside the repo Docker image, mounting your local config dir and workspace (and sourcing `~/.profile` if mounted):
|
||||||
|
|||||||
@@ -151,6 +151,7 @@ export function resolveEnvApiKey(provider: string): EnvApiKeyResult | null {
|
|||||||
openai: "OPENAI_API_KEY",
|
openai: "OPENAI_API_KEY",
|
||||||
google: "GEMINI_API_KEY",
|
google: "GEMINI_API_KEY",
|
||||||
groq: "GROQ_API_KEY",
|
groq: "GROQ_API_KEY",
|
||||||
|
deepgram: "DEEPGRAM_API_KEY",
|
||||||
cerebras: "CEREBRAS_API_KEY",
|
cerebras: "CEREBRAS_API_KEY",
|
||||||
xai: "XAI_API_KEY",
|
xai: "XAI_API_KEY",
|
||||||
openrouter: "OPENROUTER_API_KEY",
|
openrouter: "OPENROUTER_API_KEY",
|
||||||
|
|||||||
@@ -30,6 +30,7 @@ export const DEFAULT_VIDEO_MAX_BASE64_BYTES = 70 * MB;
|
|||||||
/**
 * Default audio transcription model id per provider id.
 *
 * Keys are provider ids, values are the model used when an audio entry names
 * a provider without an explicit model.
 */
export const DEFAULT_AUDIO_MODELS: Record<string, string> = {
  groq: "whisper-large-v3-turbo",
  openai: "whisper-1",
  deepgram: "nova-3",
};
|
||||||
export const CLI_OUTPUT_MAX_BUFFER = 5 * MB;
|
export const CLI_OUTPUT_MAX_BUFFER = 5 * MB;
|
||||||
export const DEFAULT_MEDIA_CONCURRENCY = 2;
|
export const DEFAULT_MEDIA_CONCURRENCY = 2;
|
||||||
|
|||||||
@@ -0,0 +1,51 @@
|
|||||||
|
import { describe, expect, it } from "vitest";
|
||||||
|
|
||||||
|
import { transcribeDeepgramAudio } from "./audio.js";
|
||||||
|
|
||||||
|
// Live-test configuration, all read from the environment.

/** Deepgram API key; empty when unset or whitespace-only (which disables the suite). */
const DEEPGRAM_KEY = process.env.DEEPGRAM_API_KEY?.trim() ?? "";
/** Model id under test; falls back to "nova-3" when unset or blank. */
const DEEPGRAM_MODEL = process.env.DEEPGRAM_MODEL?.trim() || "nova-3";
/** Optional API base URL override, passed through to the transcriber as-is. */
const DEEPGRAM_BASE_URL = process.env.DEEPGRAM_BASE_URL?.trim();
/** Audio sample to download and transcribe. */
const SAMPLE_URL =
  process.env.DEEPGRAM_SAMPLE_URL?.trim() ||
  "https://static.deepgram.com/examples/Bueller-Life-moves-pretty-fast.wav";
/** Any one of these opt-in flags set to "1" enables live tests. */
const LIVE =
  process.env.DEEPGRAM_LIVE_TEST === "1" ||
  process.env.LIVE === "1" ||
  process.env.CLAWDBOT_LIVE_TEST === "1";

// Run the suite only when live tests are opted in AND a key is available;
// otherwise register it as skipped so the run still reports it.
const describeLive = LIVE && DEEPGRAM_KEY ? describe : describe.skip;
|
||||||
|
|
||||||
|
/**
 * Downloads `url` and returns the response body as a Buffer.
 *
 * @param url - Location of the sample to download.
 * @param timeoutMs - Abort the request after this many milliseconds
 *   (clamped to at least 1 ms).
 * @returns The full response body.
 * @throws Error when the response status is not OK; the fetch rejection
 *   propagates when the request is aborted or fails.
 */
async function fetchSampleBuffer(url: string, timeoutMs: number): Promise<Buffer> {
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), Math.max(1, timeoutMs));
  try {
    const res = await fetch(url, { signal: controller.signal });
    if (!res.ok) {
      throw new Error(`Sample download failed (HTTP ${res.status})`);
    }
    const data = await res.arrayBuffer();
    return Buffer.from(data);
  } finally {
    // Always clear the timer so a finished download does not keep the
    // process alive until the timeout fires.
    clearTimeout(timer);
  }
}
|
||||||
|
|
||||||
|
// Live smoke test: downloads a public sample and sends it to the real API.
describeLive("deepgram live", () => {
  it(
    "transcribes sample audio",
    async () => {
      const buffer = await fetchSampleBuffer(SAMPLE_URL, 15000);
      const result = await transcribeDeepgramAudio({
        buffer,
        fileName: "sample.wav",
        mime: "audio/wav",
        apiKey: DEEPGRAM_KEY,
        model: DEEPGRAM_MODEL,
        baseUrl: DEEPGRAM_BASE_URL,
        timeoutMs: 20000,
      });
      // Only assert that some text came back; exact wording may vary by model.
      expect(result.text.trim().length).toBeGreaterThan(0);
    },
    // Must exceed the 15s download + 20s transcription budgets above, so a
    // slow step fails with its own error instead of a generic test timeout.
    40000,
  );
});
|
||||||
83
src/media-understanding/providers/deepgram/audio.test.ts
Normal file
83
src/media-understanding/providers/deepgram/audio.test.ts
Normal file
@@ -0,0 +1,83 @@
|
|||||||
|
import { describe, expect, it } from "vitest";
|
||||||
|
|
||||||
|
import { transcribeDeepgramAudio } from "./audio.js";
|
||||||
|
|
||||||
|
/**
 * Extracts the URL string from any value accepted as the first `fetch` argument.
 *
 * @param input - A URL string, a `URL`, or a `Request`.
 * @returns The string as-is, the serialized `URL`, or the request's `url`.
 */
const resolveRequestUrl = (input: RequestInfo | URL) => {
  if (typeof input === "string") return input;
  if (input instanceof URL) return input.toString();
  return input.url;
};
|
||||||
|
|
||||||
|
// Unit tests: a stub fetch captures the outgoing request, so nothing hits the network.
describe("transcribeDeepgramAudio", () => {
  it("respects lowercase authorization header overrides", async () => {
    let seenAuth: string | null = null;
    const fetchFn = async (_input: RequestInfo | URL, init?: RequestInit) => {
      const headers = new Headers(init?.headers);
      seenAuth = headers.get("authorization");
      return new Response(
        JSON.stringify({
          results: { channels: [{ alternatives: [{ transcript: "ok" }] }] },
        }),
        {
          status: 200,
          headers: { "content-type": "application/json" },
        },
      );
    };

    const result = await transcribeDeepgramAudio({
      buffer: Buffer.from("audio"),
      fileName: "note.mp3",
      apiKey: "test-key",
      timeoutMs: 1000,
      headers: { authorization: "Token override" },
      fetchFn,
    });

    // The caller-supplied header must win over the one derived from apiKey.
    expect(seenAuth).toBe("Token override");
    expect(result.text).toBe("ok");
  });

  it("builds the expected request payload", async () => {
    let seenUrl: string | null = null;
    let seenInit: RequestInit | undefined;
    const fetchFn = async (input: RequestInfo | URL, init?: RequestInit) => {
      seenUrl = resolveRequestUrl(input);
      seenInit = init;
      return new Response(
        JSON.stringify({
          results: { channels: [{ alternatives: [{ transcript: "hello" }] }] },
        }),
        {
          status: 200,
          headers: { "content-type": "application/json" },
        },
      );
    };

    const result = await transcribeDeepgramAudio({
      buffer: Buffer.from("audio-bytes"),
      fileName: "voice.wav",
      apiKey: "test-key",
      timeoutMs: 1234,
      baseUrl: "https://api.example.com/v1/",
      // Blank model and padded language exercise defaulting and trimming.
      model: " ",
      language: " en ",
      mime: "audio/wav",
      headers: { "X-Custom": "1" },
      fetchFn,
    });

    expect(result.model).toBe("nova-3");
    expect(result.text).toBe("hello");
    expect(seenUrl).toBe("https://api.example.com/v1/listen?model=nova-3&language=en");
    expect(seenInit?.method).toBe("POST");
    expect(seenInit?.signal).toBeInstanceOf(AbortSignal);

    const headers = new Headers(seenInit?.headers);
    expect(headers.get("authorization")).toBe("Token test-key");
    expect(headers.get("x-custom")).toBe("1");
    expect(headers.get("content-type")).toBe("audio/wav");
    expect(Buffer.isBuffer(seenInit?.body)).toBe(true);
  });
});
|
||||||
64
src/media-understanding/providers/deepgram/audio.ts
Normal file
64
src/media-understanding/providers/deepgram/audio.ts
Normal file
@@ -0,0 +1,64 @@
|
|||||||
|
import type { AudioTranscriptionRequest, AudioTranscriptionResult } from "../../types.js";
|
||||||
|
import { fetchWithTimeout, normalizeBaseUrl, readErrorResponse } from "../shared.js";
|
||||||
|
|
||||||
|
/** Base URL used when the request does not supply one. */
export const DEFAULT_DEEPGRAM_AUDIO_BASE_URL = "https://api.deepgram.com/v1";
/** Model id used when the request's model is missing or blank. */
export const DEFAULT_DEEPGRAM_AUDIO_MODEL = "nova-3";

/**
 * Picks the model id to send to Deepgram.
 *
 * @param model - Caller-supplied model id, possibly undefined or padded.
 * @returns The trimmed id, or the default model when it is missing or blank.
 */
function resolveModel(model?: string): string {
  const trimmed = model?.trim();
  // `||` (not `??`) on purpose: an empty string after trimming must also fall back.
  return trimmed || DEFAULT_DEEPGRAM_AUDIO_MODEL;
}
|
||||||
|
|
||||||
|
/**
 * The subset of the transcription response body that this module reads.
 * Every level is optional so a malformed or partial payload is handled by
 * optional chaining rather than by a thrown TypeError.
 */
type DeepgramTranscriptResponse = {
  results?: {
    channels?: Array<{
      alternatives?: Array<{
        transcript?: string;
      }>;
    }>;
  };
};
|
||||||
|
|
||||||
|
/**
 * Transcribes an audio buffer by POSTing it to the `listen` endpoint under
 * the configured base URL.
 *
 * The model (defaulted when blank) and the optional language hint are sent as
 * query parameters; the raw buffer is the request body. Caller-supplied
 * headers take precedence: `authorization` and `content-type` are only filled
 * in when the caller has not already set them.
 *
 * @param params - Audio bytes plus API key, timeout and optional overrides
 *   (base URL, model, language, MIME type, headers, fetch implementation).
 * @returns The trimmed transcript of the first alternative on the first
 *   channel, together with the model id that was requested.
 * @throws Error when the response status is not OK, or when the response
 *   carries no non-empty transcript.
 */
export async function transcribeDeepgramAudio(
  params: AudioTranscriptionRequest,
): Promise<AudioTranscriptionResult> {
  const fetchFn = params.fetchFn ?? fetch;
  // NOTE(review): normalizeBaseUrl is defined elsewhere; the unit test expects
  // a trailing slash on the base URL not to produce "//listen" — confirm there.
  const baseUrl = normalizeBaseUrl(params.baseUrl, DEFAULT_DEEPGRAM_AUDIO_BASE_URL);
  const model = resolveModel(params.model);

  const url = new URL(`${baseUrl}/listen`);
  url.searchParams.set("model", model);
  // Trim once; a blank hint is omitted from the query entirely.
  const language = params.language?.trim();
  if (language) url.searchParams.set("language", language);

  // Headers lookups are case-insensitive, so an override spelled
  // "authorization" or "Authorization" is respected either way.
  const headers = new Headers(params.headers);
  if (!headers.has("authorization")) {
    headers.set("authorization", `Token ${params.apiKey}`);
  }
  if (!headers.has("content-type")) {
    headers.set("content-type", params.mime ?? "application/octet-stream");
  }

  const res = await fetchWithTimeout(
    url.toString(),
    {
      method: "POST",
      headers,
      body: params.buffer,
    },
    params.timeoutMs,
    fetchFn,
  );

  if (!res.ok) {
    const detail = await readErrorResponse(res);
    const suffix = detail ? `: ${detail}` : "";
    throw new Error(`Audio transcription failed (HTTP ${res.status})${suffix}`);
  }

  const payload = (await res.json()) as DeepgramTranscriptResponse;
  const transcript = payload.results?.channels?.[0]?.alternatives?.[0]?.transcript?.trim();
  // An empty or whitespace-only transcript is treated the same as a missing one.
  if (!transcript) {
    throw new Error("Audio transcription response missing transcript");
  }
  return { text: transcript, model };
}
|
||||||
8
src/media-understanding/providers/deepgram/index.ts
Normal file
8
src/media-understanding/providers/deepgram/index.ts
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
import type { MediaUnderstandingProvider } from "../../types.js";
|
||||||
|
import { transcribeDeepgramAudio } from "./audio.js";
|
||||||
|
|
||||||
|
/**
 * Media-understanding provider entry for Deepgram.
 * It declares only the audio capability and delegates transcription to
 * `transcribeDeepgramAudio`.
 */
export const deepgramProvider: MediaUnderstandingProvider = {
  id: "deepgram",
  capabilities: ["audio"],
  transcribeAudio: transcribeDeepgramAudio,
};
|
||||||
@@ -1,6 +1,7 @@
|
|||||||
import { normalizeProviderId } from "../../agents/model-selection.js";
|
import { normalizeProviderId } from "../../agents/model-selection.js";
|
||||||
import type { MediaUnderstandingProvider } from "../types.js";
|
import type { MediaUnderstandingProvider } from "../types.js";
|
||||||
import { anthropicProvider } from "./anthropic/index.js";
|
import { anthropicProvider } from "./anthropic/index.js";
|
||||||
|
import { deepgramProvider } from "./deepgram/index.js";
|
||||||
import { googleProvider } from "./google/index.js";
|
import { googleProvider } from "./google/index.js";
|
||||||
import { groqProvider } from "./groq/index.js";
|
import { groqProvider } from "./groq/index.js";
|
||||||
import { minimaxProvider } from "./minimax/index.js";
|
import { minimaxProvider } from "./minimax/index.js";
|
||||||
@@ -12,6 +13,7 @@ const PROVIDERS: MediaUnderstandingProvider[] = [
|
|||||||
googleProvider,
|
googleProvider,
|
||||||
anthropicProvider,
|
anthropicProvider,
|
||||||
minimaxProvider,
|
minimaxProvider,
|
||||||
|
deepgramProvider,
|
||||||
];
|
];
|
||||||
|
|
||||||
export function normalizeMediaProviderId(id: string): string {
|
export function normalizeMediaProviderId(id: string): string {
|
||||||
|
|||||||
Reference in New Issue
Block a user