diff --git a/docs/nodes/audio.md b/docs/nodes/audio.md index ba68b35e9..ce51c9fab 100644 --- a/docs/nodes/audio.md +++ b/docs/nodes/audio.md @@ -62,8 +62,24 @@ read_when: } ``` +### Provider-only (Deepgram) +```json5 +{ + tools: { + media: { + audio: { + enabled: true, + models: [{ provider: "deepgram", model: "nova-3" }] + } + } + } +} +``` + ## Notes & limits - Provider auth follows the standard model auth order (auth profiles, env vars, `models.providers.*.apiKey`). +- Deepgram picks up `DEEPGRAM_API_KEY` when `provider: "deepgram"` is used. +- Deepgram setup details: [Deepgram (audio transcription)](/providers/deepgram). - Default size cap is 20MB (`tools.media.audio.maxBytes`). Oversize audio is skipped for that model and the next entry is tried. - Default `maxChars` for audio is **unset** (full transcript). Set `tools.media.audio.maxChars` or per-entry `maxChars` to trim output. - Use `tools.media.audio.attachments` to process multiple voice notes (`mode: "all"` + `maxAttachments`). diff --git a/docs/nodes/media-understanding.md b/docs/nodes/media-understanding.md index 38b2931e3..57b532dc0 100644 --- a/docs/nodes/media-understanding.md +++ b/docs/nodes/media-understanding.md @@ -108,6 +108,7 @@ lists, Clawdbot can infer defaults: - `openai`, `anthropic`, `minimax`: **image** - `google` (Gemini API): **image + audio + video** - `groq`: **audio** +- `deepgram`: **audio** For CLI entries, **set `capabilities` explicitly** to avoid surprising matches. If you omit `capabilities`, the entry is eligible for the list it appears in. @@ -116,7 +117,7 @@ If you omit `capabilities`, the entry is eligible for the list it appears in. | Capability | Provider integration | Notes | |------------|----------------------|-------| | Image | OpenAI / Anthropic / Google / others via `pi-ai` | Any image-capable model in the registry works. | -| Audio | OpenAI, Groq | Provider transcription (Whisper). | +| Audio | OpenAI, Groq, Deepgram | Provider transcription (Whisper/Deepgram). | | Video | Google (Gemini API) | Provider video understanding. | ## Recommended providers @@ -125,8 +126,9 @@ If you omit `capabilities`, the entry is eligible for the list it appears in. - Good defaults: `openai/gpt-5.2`, `anthropic/claude-opus-4-5`, `google/gemini-3-pro-preview`. **Audio** -- `openai/whisper-1` or `groq/whisper-large-v3-turbo`. +- `openai/whisper-1`, `groq/whisper-large-v3-turbo`, or `deepgram/nova-3`. - CLI fallback: `whisper` binary. +- Deepgram setup: [Deepgram (audio transcription)](/providers/deepgram). **Video** - `google/gemini-3-flash-preview` (fast), `google/gemini-3-pro-preview` (richer). diff --git a/docs/providers/deepgram.md b/docs/providers/deepgram.md new file mode 100644 index 000000000..3e01bcb13 --- /dev/null +++ b/docs/providers/deepgram.md @@ -0,0 +1,64 @@ +--- +summary: "Deepgram transcription for inbound voice notes" +read_when: + - You want Deepgram speech-to-text for audio attachments + - You need a quick Deepgram config example +--- +# Deepgram (Audio Transcription) + +Deepgram is a speech-to-text API. In Clawdbot it is used for **inbound audio/voice note +transcription** via `tools.media.audio`. + +When enabled, Clawdbot uploads the audio file to Deepgram and injects the transcript +into the reply pipeline (`{{Transcript}}` + `[Audio]` block). This is **not streaming**; +it uses the pre-recorded transcription endpoint. + +Website: https://deepgram.com +Docs: https://developers.deepgram.com + +## Quick start + +1) Set your API key: +``` +DEEPGRAM_API_KEY=dg_... 
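+# (dg_... is a placeholder; substitute your real key. Any mechanism that
+# exports the variable works, e.g. your shell profile or a .env file.)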
+```
+
+2) Enable the provider:
+```json5
+{
+  tools: {
+    media: {
+      audio: {
+        enabled: true,
+        models: [{ provider: "deepgram", model: "nova-3" }]
+      }
+    }
+  }
+}
+```
+
+## Options
+
+- `model`: Deepgram model id (default: `nova-3`)
+- `language`: language hint (optional)
+
+Example with language:
+```json5
+{
+  tools: {
+    media: {
+      audio: {
+        enabled: true,
+        models: [
+          { provider: "deepgram", model: "nova-3", language: "en" }
+        ]
+      }
+    }
+  }
+}
+```
+
+## Notes
+
+- Authentication follows the standard provider auth order; `DEEPGRAM_API_KEY` is the simplest path.
+- Output follows the same audio rules as other providers (size caps, timeouts, transcript injection).
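+
+Entries in `models` are tried in order, so Deepgram can sit in front of another
+transcription provider as a fallback chain. A minimal sketch (the `openai` entry
+assumes that provider is configured too):
+
+```json5
+{
+  tools: {
+    media: {
+      audio: {
+        enabled: true,
+        models: [
+          { provider: "deepgram", model: "nova-3" },
+          { provider: "openai", model: "whisper-1" }
+        ]
+      }
+    }
+  }
+}
+```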
""; +const DEEPGRAM_MODEL = process.env.DEEPGRAM_MODEL?.trim() || "nova-3"; +const DEEPGRAM_BASE_URL = process.env.DEEPGRAM_BASE_URL?.trim(); +const SAMPLE_URL = + process.env.DEEPGRAM_SAMPLE_URL?.trim() || + "https://static.deepgram.com/examples/Bueller-Life-moves-pretty-fast.wav"; +const LIVE = + process.env.DEEPGRAM_LIVE_TEST === "1" || + process.env.LIVE === "1" || + process.env.CLAWDBOT_LIVE_TEST === "1"; + +const describeLive = LIVE && DEEPGRAM_KEY ? describe : describe.skip; + +async function fetchSampleBuffer(url: string, timeoutMs: number): Promise { + const controller = new AbortController(); + const timer = setTimeout(() => controller.abort(), Math.max(1, timeoutMs)); + try { + const res = await fetch(url, { signal: controller.signal }); + if (!res.ok) { + throw new Error(`Sample download failed (HTTP ${res.status})`); + } + const data = await res.arrayBuffer(); + return Buffer.from(data); + } finally { + clearTimeout(timer); + } +} + +describeLive("deepgram live", () => { + it( + "transcribes sample audio", + async () => { + const buffer = await fetchSampleBuffer(SAMPLE_URL, 15000); + const result = await transcribeDeepgramAudio({ + buffer, + fileName: "sample.wav", + mime: "audio/wav", + apiKey: DEEPGRAM_KEY, + model: DEEPGRAM_MODEL, + baseUrl: DEEPGRAM_BASE_URL, + timeoutMs: 20000, + }); + expect(result.text.trim().length).toBeGreaterThan(0); + }, + 30000, + ); +}); diff --git a/src/media-understanding/providers/deepgram/audio.test.ts b/src/media-understanding/providers/deepgram/audio.test.ts new file mode 100644 index 000000000..a4b50dae5 --- /dev/null +++ b/src/media-understanding/providers/deepgram/audio.test.ts @@ -0,0 +1,83 @@ +import { describe, expect, it } from "vitest"; + +import { transcribeDeepgramAudio } from "./audio.js"; + +const resolveRequestUrl = (input: RequestInfo | URL) => { + if (typeof input === "string") return input; + if (input instanceof URL) return input.toString(); + return input.url; +}; + +describe("transcribeDeepgramAudio", () => { + it("respects lowercase authorization header overrides", async () => { + let seenAuth: string | null = null; + const fetchFn = async (_input: RequestInfo | URL, init?: RequestInit) => { + const headers = new Headers(init?.headers); + seenAuth = headers.get("authorization"); + return new Response( + JSON.stringify({ + results: { channels: [{ alternatives: [{ transcript: "ok" }] }] }, + }), + { + status: 200, + headers: { "content-type": "application/json" }, + }, + ); + }; + + const result = await transcribeDeepgramAudio({ + buffer: Buffer.from("audio"), + fileName: "note.mp3", + apiKey: "test-key", + timeoutMs: 1000, + headers: { authorization: "Token override" }, + fetchFn, + }); + + expect(seenAuth).toBe("Token override"); + expect(result.text).toBe("ok"); + }); + + it("builds the expected request payload", async () => { + let seenUrl: string | null = null; + let seenInit: RequestInit | undefined; + const fetchFn = async (input: RequestInfo | URL, init?: RequestInit) => { + seenUrl = resolveRequestUrl(input); + seenInit = init; + return new Response( + JSON.stringify({ + results: { channels: [{ alternatives: [{ transcript: "hello" }] }] }, + }), + { + status: 200, + headers: { "content-type": "application/json" }, + }, + ); + }; + + const result = await transcribeDeepgramAudio({ + buffer: Buffer.from("audio-bytes"), + fileName: "voice.wav", + apiKey: "test-key", + timeoutMs: 1234, + baseUrl: "https://api.example.com/v1/", + model: " ", + language: " en ", + mime: "audio/wav", + headers: { "X-Custom": "1" }, + 
+
 ## Docker runners (optional “works in Linux” checks)
 
 These run `pnpm test:live` inside the repo Docker image, mounting your local config dir and workspace (and sourcing `~/.profile` if mounted):
diff --git a/src/agents/model-auth.ts b/src/agents/model-auth.ts
index c93d1a5c8..32257de9a 100644
--- a/src/agents/model-auth.ts
+++ b/src/agents/model-auth.ts
@@ -151,6 +151,7 @@ export function resolveEnvApiKey(provider: string): EnvApiKeyResult | null {
     openai: "OPENAI_API_KEY",
     google: "GEMINI_API_KEY",
     groq: "GROQ_API_KEY",
+    deepgram: "DEEPGRAM_API_KEY",
     cerebras: "CEREBRAS_API_KEY",
     xai: "XAI_API_KEY",
     openrouter: "OPENROUTER_API_KEY",
diff --git a/src/media-understanding/defaults.ts b/src/media-understanding/defaults.ts
index 92ce8835c..570f738e7 100644
--- a/src/media-understanding/defaults.ts
+++ b/src/media-understanding/defaults.ts
@@ -30,6 +30,7 @@ export const DEFAULT_VIDEO_MAX_BASE64_BYTES = 70 * MB;
 export const DEFAULT_AUDIO_MODELS: Record<string, string> = {
   groq: "whisper-large-v3-turbo",
   openai: "whisper-1",
+  deepgram: "nova-3",
 };
 export const CLI_OUTPUT_MAX_BUFFER = 5 * MB;
 export const DEFAULT_MEDIA_CONCURRENCY = 2;
diff --git a/src/media-understanding/providers/deepgram/audio.live.test.ts b/src/media-understanding/providers/deepgram/audio.live.test.ts
new file mode 100644
index 000000000..cf2223732
--- /dev/null
+++ b/src/media-understanding/providers/deepgram/audio.live.test.ts
@@ -0,0 +1,51 @@
+import { describe, expect, it } from "vitest";
+
+import { transcribeDeepgramAudio } from "./audio.js";
+
+const DEEPGRAM_KEY = process.env.DEEPGRAM_API_KEY ?? "";
+const DEEPGRAM_MODEL = process.env.DEEPGRAM_MODEL?.trim() || "nova-3";
+const DEEPGRAM_BASE_URL = process.env.DEEPGRAM_BASE_URL?.trim();
+const SAMPLE_URL =
+  process.env.DEEPGRAM_SAMPLE_URL?.trim() ||
+  "https://static.deepgram.com/examples/Bueller-Life-moves-pretty-fast.wav";
+const LIVE =
+  process.env.DEEPGRAM_LIVE_TEST === "1" ||
+  process.env.LIVE === "1" ||
+  process.env.CLAWDBOT_LIVE_TEST === "1";
+
+const describeLive = LIVE && DEEPGRAM_KEY ? describe : describe.skip;
+
+async function fetchSampleBuffer(url: string, timeoutMs: number): Promise<Buffer> {
+  const controller = new AbortController();
+  const timer = setTimeout(() => controller.abort(), Math.max(1, timeoutMs));
+  try {
+    const res = await fetch(url, { signal: controller.signal });
+    if (!res.ok) {
+      throw new Error(`Sample download failed (HTTP ${res.status})`);
+    }
+    const data = await res.arrayBuffer();
+    return Buffer.from(data);
+  } finally {
+    clearTimeout(timer);
+  }
+}
+
+describeLive("deepgram live", () => {
+  it(
+    "transcribes sample audio",
+    async () => {
+      const buffer = await fetchSampleBuffer(SAMPLE_URL, 15000);
+      const result = await transcribeDeepgramAudio({
+        buffer,
+        fileName: "sample.wav",
+        mime: "audio/wav",
+        apiKey: DEEPGRAM_KEY,
+        model: DEEPGRAM_MODEL,
+        baseUrl: DEEPGRAM_BASE_URL,
+        timeoutMs: 20000,
+      });
+      expect(result.text.trim().length).toBeGreaterThan(0);
+    },
+    30000,
+  );
+});
diff --git a/src/media-understanding/providers/deepgram/audio.test.ts b/src/media-understanding/providers/deepgram/audio.test.ts
new file mode 100644
index 000000000..a4b50dae5
--- /dev/null
+++ b/src/media-understanding/providers/deepgram/audio.test.ts
@@ -0,0 +1,83 @@
+import { describe, expect, it } from "vitest";
+
+import { transcribeDeepgramAudio } from "./audio.js";
+
+const resolveRequestUrl = (input: RequestInfo | URL) => {
+  if (typeof input === "string") return input;
+  if (input instanceof URL) return input.toString();
+  return input.url;
+};
+
+describe("transcribeDeepgramAudio", () => {
+  it("respects lowercase authorization header overrides", async () => {
+    let seenAuth: string | null = null;
+    const fetchFn = async (_input: RequestInfo | URL, init?: RequestInit) => {
+      const headers = new Headers(init?.headers);
+      seenAuth = headers.get("authorization");
+      return new Response(
+        JSON.stringify({
+          results: { channels: [{ alternatives: [{ transcript: "ok" }] }] },
+        }),
+        {
+          status: 200,
+          headers: { "content-type": "application/json" },
+        },
+      );
+    };
+
+    const result = await transcribeDeepgramAudio({
+      buffer: Buffer.from("audio"),
+      fileName: "note.mp3",
+      apiKey: "test-key",
+      timeoutMs: 1000,
+      headers: { authorization: "Token override" },
+      fetchFn,
+    });
+
+    expect(seenAuth).toBe("Token override");
+    expect(result.text).toBe("ok");
+  });
+
+  it("builds the expected request payload", async () => {
+    let seenUrl: string | null = null;
+    let seenInit: RequestInit | undefined;
+    const fetchFn = async (input: RequestInfo | URL, init?: RequestInit) => {
+      seenUrl = resolveRequestUrl(input);
+      seenInit = init;
+      return new Response(
+        JSON.stringify({
+          results: { channels: [{ alternatives: [{ transcript: "hello" }] }] },
+        }),
+        {
+          status: 200,
+          headers: { "content-type": "application/json" },
+        },
+      );
+    };
+
+    const result = await transcribeDeepgramAudio({
+      buffer: Buffer.from("audio-bytes"),
+      fileName: "voice.wav",
+      apiKey: "test-key",
+      timeoutMs: 1234,
+      baseUrl: "https://api.example.com/v1/",
+      model: " ",
+      language: " en ",
+      mime: "audio/wav",
+      headers: { "X-Custom": "1" },
+      fetchFn,
+    });
+
+    expect(result.model).toBe("nova-3");
+    expect(result.text).toBe("hello");
+    expect(seenUrl).toBe("https://api.example.com/v1/listen?model=nova-3&language=en");
+    expect(seenInit?.method).toBe("POST");
+    expect(seenInit?.signal).toBeInstanceOf(AbortSignal);
+
+    const headers = new Headers(seenInit?.headers);
+    expect(headers.get("authorization")).toBe("Token test-key");
+    expect(headers.get("x-custom")).toBe("1");
+    expect(headers.get("content-type")).toBe("audio/wav");
+    expect(Buffer.isBuffer(seenInit?.body)).toBe(true);
+  });
+});
diff --git a/src/media-understanding/providers/deepgram/audio.ts b/src/media-understanding/providers/deepgram/audio.ts
new file mode 100644
index 000000000..0e9ce1245
--- /dev/null
+++ b/src/media-understanding/providers/deepgram/audio.ts
@@ -0,0 +1,64 @@
+import type { AudioTranscriptionRequest, AudioTranscriptionResult } from "../../types.js";
+import { fetchWithTimeout, normalizeBaseUrl, readErrorResponse } from "../shared.js";
+
+export const DEFAULT_DEEPGRAM_AUDIO_BASE_URL = "https://api.deepgram.com/v1";
+export const DEFAULT_DEEPGRAM_AUDIO_MODEL = "nova-3";
+
+function resolveModel(model?: string): string {
+  const trimmed = model?.trim();
+  return trimmed || DEFAULT_DEEPGRAM_AUDIO_MODEL;
+}
+
+type DeepgramTranscriptResponse = {
+  results?: {
+    channels?: Array<{
+      alternatives?: Array<{
+        transcript?: string;
+      }>;
+    }>;
+  };
+};
+
+export async function transcribeDeepgramAudio(
+  params: AudioTranscriptionRequest,
+): Promise<AudioTranscriptionResult> {
+  const fetchFn = params.fetchFn ?? fetch;
+  const baseUrl = normalizeBaseUrl(params.baseUrl, DEFAULT_DEEPGRAM_AUDIO_BASE_URL);
+  const model = resolveModel(params.model);
+
+  const url = new URL(`${baseUrl}/listen`);
+  url.searchParams.set("model", model);
+  if (params.language?.trim()) url.searchParams.set("language", params.language.trim());
+
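+  // Deepgram expects `Authorization: Token <key>` rather than a Bearer scheme;
+  // caller-supplied headers take precedence, so tests and per-call overrides
+  // can swap out the default auth or content-type.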
+  const headers = new Headers(params.headers);
+  if (!headers.has("authorization")) {
+    headers.set("authorization", `Token ${params.apiKey}`);
+  }
+  if (!headers.has("content-type")) {
+    headers.set("content-type", params.mime ?? "application/octet-stream");
+  }
+
+  const res = await fetchWithTimeout(
+    url.toString(),
+    {
+      method: "POST",
+      headers,
+      body: params.buffer,
+    },
+    params.timeoutMs,
+    fetchFn,
+  );
+
+  if (!res.ok) {
+    const detail = await readErrorResponse(res);
+    const suffix = detail ? `: ${detail}` : "";
+    throw new Error(`Audio transcription failed (HTTP ${res.status})${suffix}`);
+  }
+
+  const payload = (await res.json()) as DeepgramTranscriptResponse;
+  const transcript = payload.results?.channels?.[0]?.alternatives?.[0]?.transcript?.trim();
+  if (!transcript) {
+    throw new Error("Audio transcription response missing transcript");
+  }
+  return { text: transcript, model };
+}
diff --git a/src/media-understanding/providers/deepgram/index.ts b/src/media-understanding/providers/deepgram/index.ts
new file mode 100644
index 000000000..662561a95
--- /dev/null
+++ b/src/media-understanding/providers/deepgram/index.ts
@@ -0,0 +1,8 @@
+import type { MediaUnderstandingProvider } from "../../types.js";
+import { transcribeDeepgramAudio } from "./audio.js";
+
+export const deepgramProvider: MediaUnderstandingProvider = {
+  id: "deepgram",
+  capabilities: ["audio"],
+  transcribeAudio: transcribeDeepgramAudio,
+};
diff --git a/src/media-understanding/providers/index.ts b/src/media-understanding/providers/index.ts
index 9c560c8e7..a20ba92fb 100644
--- a/src/media-understanding/providers/index.ts
+++ b/src/media-understanding/providers/index.ts
@@ -1,6 +1,7 @@
 import { normalizeProviderId } from "../../agents/model-selection.js";
 import type { MediaUnderstandingProvider } from "../types.js";
 import { anthropicProvider } from "./anthropic/index.js";
+import { deepgramProvider } from "./deepgram/index.js";
 import { googleProvider } from "./google/index.js";
 import { groqProvider } from "./groq/index.js";
 import { minimaxProvider } from "./minimax/index.js";
@@ -12,6 +13,7 @@ const PROVIDERS: MediaUnderstandingProvider[] = [
   googleProvider,
   anthropicProvider,
   minimaxProvider,
+  deepgramProvider,
 ];
 
 export function normalizeMediaProviderId(id: string): string {