feat: add Deepgram audio transcription

Co-authored-by: Safzan Pirani <safzanpirani@users.noreply.github.com>
2026-01-17 08:46:40 +00:00
parent 869ef0c5ba
commit e637bbdfb5
12 changed files with 303 additions and 2 deletions
--- a/docs/nodes/audio.md
+++ b/docs/nodes/audio.md
@@ -62,8 +62,24 @@ read_when:
 }
 ```
 ### Provider-only (Deepgram)
 ```json5
 {
  tools: {
    media: {
      audio: {
        enabled: true,
        models: [{ provider: "deepgram", model: "nova-3" }]
      }
    }
  }
 }
 ```
 ## Notes & limits
 - Provider auth follows the standard model auth order (auth profiles, env vars, `models.providers.*.apiKey`).
 - Deepgram picks up `DEEPGRAM_API_KEY` when `provider: "deepgram"` is used.
 - Deepgram setup details: [Deepgram (audio transcription)](/providers/deepgram).
 - Default size cap is 20MB (`tools.media.audio.maxBytes`). Oversize audio is skipped for that model and the next entry is tried.
 - Default `maxChars` for audio is **unset** (full transcript). Set `tools.media.audio.maxChars` or per-entry `maxChars` to trim output.
 - Use `tools.media.audio.attachments` to process multiple voice notes (`mode: "all"` + `maxAttachments`).
--- a/docs/nodes/media-understanding.md
+++ b/docs/nodes/media-understanding.md
@@ -108,6 +108,7 @@ lists, Clawdbot can infer defaults:
 - `openai`, `anthropic`, `minimax`: **image**
 - `google` (Gemini API): **image + audio + video**
 - `groq`: **audio**
 - `deepgram`: **audio**
 For CLI entries, **set `capabilities` explicitly** to avoid surprising matches.
 If you omit `capabilities`, the entry is eligible for the list it appears in.
@@ -116,7 +117,7 @@ If you omit `capabilities`, the entry is eligible for the list it appears in.
 | Capability | Provider integration | Notes |
 |------------|----------------------|-------|
 | Image | OpenAI / Anthropic / Google / others via `pi-ai` | Any image-capable model in the registry works. |
-| Audio | OpenAI, Groq | Provider transcription (Whisper). |
+| Audio | OpenAI, Groq, Deepgram | Provider transcription (Whisper/Deepgram). |
 | Video | Google (Gemini API) | Provider video understanding. |
 ## Recommended providers
@@ -125,8 +126,9 @@ If you omit `capabilities`, the entry is eligible for the list it appears in.
 - Good defaults: `openai/gpt-5.2`, `anthropic/claude-opus-4-5`, `google/gemini-3-pro-preview`.
 **Audio**
- `openai/whisper-1` or `groq/whisper-large-v3-turbo`.
+- `openai/whisper-1`, `groq/whisper-large-v3-turbo`, or `deepgram/nova-3`.
 - CLI fallback: `whisper` binary.
 - Deepgram setup: [Deepgram (audio transcription)](/providers/deepgram).
 **Video**
 - `google/gemini-3-flash-preview` (fast), `google/gemini-3-pro-preview` (richer).
--- a/docs/providers/deepgram.md
+++ b/docs/providers/deepgram.md
@@ -0,0 +1,64 @@
 ---
 summary: "Deepgram transcription for inbound voice notes"
 read_when:
  - You want Deepgram speech-to-text for audio attachments
  - You need a quick Deepgram config example
 ---
 # Deepgram (Audio Transcription)
 Deepgram is a speech-to-text API. In Clawdbot it is used for **inbound audio/voice note
 transcription** via `tools.media.audio`.
 When enabled, Clawdbot uploads the audio file to Deepgram and injects the transcript
 into the reply pipeline (`{{Transcript}}` + `[Audio]` block). This is **not streaming**;
 it uses the pre-recorded transcription endpoint.
 Website: https://deepgram.com  
 Docs: https://developers.deepgram.com
 ## Quick start
 1) Set your API key:
 ```
 DEEPGRAM_API_KEY=dg_...
 ```
 2) Enable the provider:
 ```json5
 {
  tools: {
    media: {
      audio: {
        enabled: true,
        models: [{ provider: "deepgram", model: "nova-3" }]
      }
    }
  }
 }
 ```
 ## Options
 - `model`: Deepgram model id (default: `nova-3`)
 - `language`: optional language hint (for example `en`); when set, it is sent to Deepgram as the `language` query parameter
 Example with language:
 ```json5
 {
  tools: {
    media: {
      audio: {
        enabled: true,
        models: [
          { provider: "deepgram", model: "nova-3", language: "en" }
        ]
      }
    }
  }
 }
 ```
 ## Notes
 - Authentication follows the standard provider auth order; `DEEPGRAM_API_KEY` is the simplest path.
 - Output follows the same audio rules as other providers (size caps, timeouts, transcript injection).
--- a/docs/providers/index.md
+++ b/docs/providers/index.md
@@ -34,5 +34,9 @@ Looking for chat channel docs (WhatsApp/Telegram/Discord/Slack/etc.)? See [Chann
 - [GLM models](/providers/glm)
 - [MiniMax](/providers/minimax)
 ## Transcription providers
 - [Deepgram (audio transcription)](/providers/deepgram)
 For the full provider catalog (xAI, Groq, Mistral, etc.) and advanced configuration,
 see [Model providers](/concepts/model-providers).
--- a/docs/testing.md
+++ b/docs/testing.md
@@ -290,6 +290,11 @@ Live tests discover credentials the same way the CLI does. Practical implication
 If you want to rely on env keys (e.g. exported in your `~/.profile`), run local tests after `source ~/.profile`, or use the Docker runners below (they can mount `~/.profile` into the container).
 ## Deepgram live (audio transcription)
 - Test: `src/media-understanding/providers/deepgram/audio.live.test.ts`
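 - Enable: `DEEPGRAM_API_KEY=... DEEPGRAM_LIVE_TEST=1 pnpm test:live src/media-understanding/providers/deepgram/audio.live.test.ts`
 - `LIVE=1` or `CLAWDBOT_LIVE_TEST=1` also enable the test; it is skipped when `DEEPGRAM_API_KEY` is empty. Optional overrides: `DEEPGRAM_MODEL`, `DEEPGRAM_BASE_URL`, `DEEPGRAM_SAMPLE_URL`.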
 ## Docker runners (optional “works in Linux” checks)
 These run `pnpm test:live` inside the repo Docker image, mounting your local config dir and workspace (and sourcing `~/.profile` if mounted):
--- a/src/agents/model-auth.ts
+++ b/src/agents/model-auth.ts
@@ -151,6 +151,7 @@ export function resolveEnvApiKey(provider: string): EnvApiKeyResult | null {
    openai: "OPENAI_API_KEY",
    google: "GEMINI_API_KEY",
    groq: "GROQ_API_KEY",
    deepgram: "DEEPGRAM_API_KEY",
    cerebras: "CEREBRAS_API_KEY",
    xai: "XAI_API_KEY",
    openrouter: "OPENROUTER_API_KEY",
--- a/src/media-understanding/defaults.ts
+++ b/src/media-understanding/defaults.ts
@@ -30,6 +30,7 @@ export const DEFAULT_VIDEO_MAX_BASE64_BYTES = 70 * MB;
 // Audio model id per provider key (groq, openai, deepgram).
 // NOTE(review): presumably consulted when an audio entry names a provider but no model — confirm at the call site.
 export const DEFAULT_AUDIO_MODELS: Record<string, string> = {
  groq: "whisper-large-v3-turbo",
  openai: "whisper-1",
  deepgram: "nova-3",
 };
 // 5 * MB (MB is declared elsewhere in this file); presumably caps buffered CLI output — confirm at the call site.
 export const CLI_OUTPUT_MAX_BUFFER = 5 * MB;
 // Presumably the default number of media tasks processed in parallel — confirm at the call site.
 export const DEFAULT_MEDIA_CONCURRENCY = 2;
--- a/src/media-understanding/providers/deepgram/audio.live.test.ts
+++ b/src/media-understanding/providers/deepgram/audio.live.test.ts
@@ -0,0 +1,51 @@
 import { describe, expect, it } from "vitest";
 import { transcribeDeepgramAudio } from "./audio.js";
 // Reads an environment variable and strips surrounding whitespace (undefined when unset).
 const readTrimmedEnv = (name: string): string | undefined => process.env[name]?.trim();
 // API key is taken verbatim; an unset key becomes the empty string.
 const DEEPGRAM_KEY = process.env.DEEPGRAM_API_KEY ?? "";
 // Blank or unset overrides fall back to the built-in defaults.
 const DEEPGRAM_MODEL = readTrimmedEnv("DEEPGRAM_MODEL") || "nova-3";
 const DEEPGRAM_BASE_URL = readTrimmedEnv("DEEPGRAM_BASE_URL");
 const SAMPLE_URL =
  readTrimmedEnv("DEEPGRAM_SAMPLE_URL") ||
  "https://static.deepgram.com/examples/Bueller-Life-moves-pretty-fast.wav";
 // Any one of these flags set to exactly "1" switches the live suite on.
 const LIVE = ["DEEPGRAM_LIVE_TEST", "LIVE", "CLAWDBOT_LIVE_TEST"].some(
  (flag) => process.env[flag] === "1",
 );
 // Register the suite normally only when a live flag is set AND an API key is present; otherwise register it as skipped.
 const describeLive = LIVE && DEEPGRAM_KEY ? describe : describe.skip;
 /**
  * Downloads `url` and returns the response body as a Buffer.
  *
  * The request is aborted once `timeoutMs` elapses (clamped to at least 1 ms).
  * The timer is always cleared, whether the download succeeds or fails.
  *
  * @throws Error with the HTTP status when the response is not OK.
  */
 async function fetchSampleBuffer(url: string, timeoutMs: number): Promise<Buffer> {
  const abort = new AbortController();
  const deadline = setTimeout(() => abort.abort(), Math.max(1, timeoutMs));
  try {
    const response = await fetch(url, { signal: abort.signal });
    if (response.ok) {
      return Buffer.from(await response.arrayBuffer());
    }
    throw new Error(`Sample download failed (HTTP ${response.status})`);
  } finally {
    clearTimeout(deadline);
  }
 }
 describeLive("deepgram live", () => {
  // Time budgets: sample download, Deepgram request, and the overall test timeout.
  const downloadTimeoutMs = 15000;
  const requestTimeoutMs = 20000;
  const testTimeoutMs = 30000;
  // Fetches the sample clip, sends it to Deepgram, and expects a non-blank transcript.
  const transcribeSample = async () => {
    const buffer = await fetchSampleBuffer(SAMPLE_URL, downloadTimeoutMs);
    const { text } = await transcribeDeepgramAudio({
      buffer,
      fileName: "sample.wav",
      mime: "audio/wav",
      apiKey: DEEPGRAM_KEY,
      model: DEEPGRAM_MODEL,
      baseUrl: DEEPGRAM_BASE_URL,
      timeoutMs: requestTimeoutMs,
    });
    expect(text.trim().length).toBeGreaterThan(0);
  };
  it("transcribes sample audio", transcribeSample, testTimeoutMs);
 });
--- a/src/media-understanding/providers/deepgram/audio.test.ts
+++ b/src/media-understanding/providers/deepgram/audio.test.ts
@@ -0,0 +1,83 @@
 import { describe, expect, it } from "vitest";
 import { transcribeDeepgramAudio } from "./audio.js";
 // Normalizes the first argument of a fetch call (string, URL, or Request) to a URL string.
 const resolveRequestUrl = (input: RequestInfo | URL): string => {
  if (input instanceof URL) {
    return input.toString();
  }
  return typeof input === "string" ? input : input.url;
 };
 describe("transcribeDeepgramAudio", () => {
  // Canned 200 JSON response carrying `transcript` as the first alternative of the first channel.
  const transcriptResponse = (transcript: string): Response =>
    new Response(
      JSON.stringify({
        results: { channels: [{ alternatives: [{ transcript }] }] },
      }),
      {
        status: 200,
        headers: { "content-type": "application/json" },
      },
    );
  // A caller-supplied `authorization` header must win over the generated `Token <apiKey>` value.
  it("respects lowercase authorization header overrides", async () => {
    let capturedAuth: string | null = null;
    const fetchFn = async (_input: RequestInfo | URL, init?: RequestInit) => {
      capturedAuth = new Headers(init?.headers).get("authorization");
      return transcriptResponse("ok");
    };
    const result = await transcribeDeepgramAudio({
      buffer: Buffer.from("audio"),
      fileName: "note.mp3",
      apiKey: "test-key",
      timeoutMs: 1000,
      headers: { authorization: "Token override" },
      fetchFn,
    });
    expect(capturedAuth).toBe("Token override");
    expect(result.text).toBe("ok");
  });
  // Checks URL construction, default-model fallback, language trimming, headers, and body type.
  it("builds the expected request payload", async () => {
    let requestedUrl: string | null = null;
    let requestInit: RequestInit | undefined;
    const fetchFn = async (input: RequestInfo | URL, init?: RequestInit) => {
      requestedUrl = resolveRequestUrl(input);
      requestInit = init;
      return transcriptResponse("hello");
    };
    const result = await transcribeDeepgramAudio({
      buffer: Buffer.from("audio-bytes"),
      fileName: "voice.wav",
      apiKey: "test-key",
      timeoutMs: 1234,
      baseUrl: "https://api.example.com/v1/",
      model: " ",
      language: " en ",
      mime: "audio/wav",
      headers: { "X-Custom": "1" },
      fetchFn,
    });
    // A blank model falls back to the default id.
    expect(result.model).toBe("nova-3");
    expect(result.text).toBe("hello");
    expect(requestedUrl).toBe("https://api.example.com/v1/listen?model=nova-3&language=en");
    expect(requestInit?.method).toBe("POST");
    expect(requestInit?.signal).toBeInstanceOf(AbortSignal);
    const sentHeaders = new Headers(requestInit?.headers);
    expect(sentHeaders.get("authorization")).toBe("Token test-key");
    expect(sentHeaders.get("x-custom")).toBe("1");
    expect(sentHeaders.get("content-type")).toBe("audio/wav");
    expect(Buffer.isBuffer(requestInit?.body)).toBe(true);
  });
 });
--- a/src/media-understanding/providers/deepgram/audio.ts
+++ b/src/media-understanding/providers/deepgram/audio.ts
@@ -0,0 +1,64 @@
 import type { AudioTranscriptionRequest, AudioTranscriptionResult } from "../../types.js";
 import { fetchWithTimeout, normalizeBaseUrl, readErrorResponse } from "../shared.js";
 /** Base URL used when the caller does not supply `baseUrl`. */
 export const DEFAULT_DEEPGRAM_AUDIO_BASE_URL = "https://api.deepgram.com/v1";
 /** Model id used when the caller supplies no model, or only whitespace. */
 export const DEFAULT_DEEPGRAM_AUDIO_MODEL = "nova-3";
 /** Returns the trimmed model id, or the default when it is missing or blank. */
 function resolveModel(model?: string): string {
  const candidate = (model ?? "").trim();
  return candidate.length > 0 ? candidate : DEFAULT_DEEPGRAM_AUDIO_MODEL;
 }
 /**
  * The part of the Deepgram response body that this module reads.
  * Every level is optional, so readers must tolerate missing fields; only the
  * `transcript` of the first alternative of the first channel is consumed here.
  */
 type DeepgramTranscriptResponse = {
  results?: {
    channels?: Array<{
      alternatives?: Array<{
        // Transcribed text for this alternative.
        transcript?: string;
      }>;
    }>;
  };
 };
 /**
  * Transcribes one audio buffer by POSTing its raw bytes to `<baseUrl>/listen`.
  *
  * Caller-supplied `params.headers` take precedence: `authorization`
  * (`Token <apiKey>`) and `content-type` (`params.mime`, else
  * `application/octet-stream`) are only filled in when absent.
  *
  * @param params - Request options. A blank `model` falls back to the default;
  *   `language` is added to the query string only when non-blank after trimming.
  * @returns The trimmed transcript of the first alternative of the first channel,
  *   together with the model id that was requested.
  * @throws Error when the response is not OK (message includes the HTTP status and
  *   any error detail), or when the response carries no non-empty transcript.
  */
 export async function transcribeDeepgramAudio(
  params: AudioTranscriptionRequest,
 ): Promise<AudioTranscriptionResult> {
  const fetchImpl = params.fetchFn ?? fetch;
  const root = normalizeBaseUrl(params.baseUrl, DEFAULT_DEEPGRAM_AUDIO_BASE_URL);
  const model = resolveModel(params.model);
  const endpoint = new URL(`${root}/listen`);
  endpoint.searchParams.set("model", model);
  const language = params.language?.trim();
  if (language) {
    endpoint.searchParams.set("language", language);
  }
  // Defaults are applied only where the caller has not already set the header.
  const requestHeaders = new Headers(params.headers);
  const fallbackHeaders: Array<[string, string]> = [
    ["authorization", `Token ${params.apiKey}`],
    ["content-type", params.mime ?? "application/octet-stream"],
  ];
  for (const [name, value] of fallbackHeaders) {
    if (!requestHeaders.has(name)) {
      requestHeaders.set(name, value);
    }
  }
  const response = await fetchWithTimeout(
    endpoint.toString(),
    { method: "POST", headers: requestHeaders, body: params.buffer },
    params.timeoutMs,
    fetchImpl,
  );
  if (!response.ok) {
    const detail = await readErrorResponse(response);
    throw new Error(
      `Audio transcription failed (HTTP ${response.status})${detail ? `: ${detail}` : ""}`,
    );
  }
  const payload = (await response.json()) as DeepgramTranscriptResponse;
  const firstAlternative = payload.results?.channels?.[0]?.alternatives?.[0];
  const text = firstAlternative?.transcript?.trim();
  // A missing or blank transcript is treated as a failure.
  if (!text) {
    throw new Error("Audio transcription response missing transcript");
  }
  return { text, model };
 }
--- a/src/media-understanding/providers/deepgram/index.ts
+++ b/src/media-understanding/providers/deepgram/index.ts
@@ -0,0 +1,8 @@
 import type { MediaUnderstandingProvider } from "../../types.js";
 import { transcribeDeepgramAudio } from "./audio.js";
 /**
  * Media-understanding provider entry for Deepgram.
  * Declares the `audio` capability only; transcription is delegated to
  * `transcribeDeepgramAudio`.
  */
 export const deepgramProvider: MediaUnderstandingProvider = {
  id: "deepgram",
  capabilities: ["audio"],
  transcribeAudio: transcribeDeepgramAudio,
 };
--- a/src/media-understanding/providers/index.ts
+++ b/src/media-understanding/providers/index.ts
@@ -1,6 +1,7 @@
 import { normalizeProviderId } from "../../agents/model-selection.js";
 import type { MediaUnderstandingProvider } from "../types.js";
 import { anthropicProvider } from "./anthropic/index.js";
 import { deepgramProvider } from "./deepgram/index.js";
 import { googleProvider } from "./google/index.js";
 import { groqProvider } from "./groq/index.js";
 import { minimaxProvider } from "./minimax/index.js";
@@ -12,6 +13,7 @@ const PROVIDERS: MediaUnderstandingProvider[] = [
  googleProvider,
  anthropicProvider,
  minimaxProvider,
  deepgramProvider,
 ];
 export function normalizeMediaProviderId(id: string): string {