feat: add Deepgram audio transcription

Co-authored-by: Safzan Pirani <safzanpirani@users.noreply.github.com>
2026-01-17 08:46:40 +00:00
parent 869ef0c5ba
commit e637bbdfb5
12 changed files with 303 additions and 2 deletions
--- a/src/agents/model-auth.ts
+++ b/src/agents/model-auth.ts
@@ -151,6 +151,7 @@ export function resolveEnvApiKey(provider: string): EnvApiKeyResult | null {
    openai: "OPENAI_API_KEY",
    google: "GEMINI_API_KEY",
    groq: "GROQ_API_KEY",
+    deepgram: "DEEPGRAM_API_KEY",
    cerebras: "CEREBRAS_API_KEY",
    xai: "XAI_API_KEY",
    openrouter: "OPENROUTER_API_KEY",
--- a/src/media-understanding/defaults.ts
+++ b/src/media-understanding/defaults.ts
@@ -30,6 +30,7 @@ export const DEFAULT_VIDEO_MAX_BASE64_BYTES = 70 * MB;
 export const DEFAULT_AUDIO_MODELS: Record<string, string> = {
  groq: "whisper-large-v3-turbo",
  openai: "whisper-1",
+  deepgram: "nova-3",
 };
 export const CLI_OUTPUT_MAX_BUFFER = 5 * MB;
 export const DEFAULT_MEDIA_CONCURRENCY = 2;
--- a/src/media-understanding/providers/deepgram/audio.live.test.ts
+++ b/src/media-understanding/providers/deepgram/audio.live.test.ts
@@ -0,0 +1,51 @@
+import { describe, expect, it } from "vitest";
+
+import { transcribeDeepgramAudio } from "./audio.js";
+
+// Configuration for the live Deepgram test, read from the environment.
+// Every value is trimmed so stray whitespace (for example from a .env file)
+// is treated the same as an unset variable.
+const DEEPGRAM_KEY = process.env.DEEPGRAM_API_KEY?.trim() || "";
+const DEEPGRAM_MODEL = process.env.DEEPGRAM_MODEL?.trim() || "nova-3";
+// A blank override collapses to undefined so the provider default applies.
+const DEEPGRAM_BASE_URL = process.env.DEEPGRAM_BASE_URL?.trim() || undefined;
+const SAMPLE_URL =
+  process.env.DEEPGRAM_SAMPLE_URL?.trim() ||
+  "https://static.deepgram.com/examples/Bueller-Life-moves-pretty-fast.wav";
+// Any one of these flags set to "1" opts in to live (network) tests.
+const LIVE =
+  process.env.DEEPGRAM_LIVE_TEST === "1" ||
+  process.env.LIVE === "1" ||
+  process.env.CLAWDBOT_LIVE_TEST === "1";

+// Run the suite only when a live flag is set AND a non-blank API key exists;
+// otherwise register it as skipped.
+const describeLive = LIVE && DEEPGRAM_KEY ? describe : describe.skip;
+
+/**
+ * Downloads the audio sample at `url` and returns its bytes as a Buffer.
+ *
+ * The request is aborted once `timeoutMs` (clamped to at least 1 ms) elapses.
+ * Throws when the server answers with a non-OK status.
+ */
+async function fetchSampleBuffer(url: string, timeoutMs: number): Promise<Buffer> {
+  const abort = new AbortController();
+  const abortAfterMs = Math.max(1, timeoutMs);
+  const timeoutHandle = setTimeout(() => abort.abort(), abortAfterMs);
+  try {
+    const response = await fetch(url, { signal: abort.signal });
+    if (response.ok) {
+      return Buffer.from(await response.arrayBuffer());
+    }
+    throw new Error(`Sample download failed (HTTP ${response.status})`);
+  } finally {
+    // Always release the timer so it cannot fire after the download settles.
+    clearTimeout(timeoutHandle);
+  }
+}
+
+// Live smoke test: downloads the sample clip and sends it to Deepgram.
+// Registered as skipped unless `describeLive` resolved to `describe`.
+describeLive("deepgram live", () => {
+  it(
+    "transcribes sample audio",
+    async () => {
+      const sample = await fetchSampleBuffer(SAMPLE_URL, 15000);
+      const { text } = await transcribeDeepgramAudio({
+        buffer: sample,
+        fileName: "sample.wav",
+        mime: "audio/wav",
+        apiKey: DEEPGRAM_KEY,
+        model: DEEPGRAM_MODEL,
+        baseUrl: DEEPGRAM_BASE_URL,
+        timeoutMs: 20000,
+      });
+      // Only require that some text came back; exact wording is not pinned.
+      const transcriptLength = text.trim().length;
+      expect(transcriptLength).toBeGreaterThan(0);
+    },
+    30000,
+  );
+});
--- a/src/media-understanding/providers/deepgram/audio.test.ts
+++ b/src/media-understanding/providers/deepgram/audio.test.ts
@@ -0,0 +1,83 @@
+import { describe, expect, it } from "vitest";
+
+import { transcribeDeepgramAudio } from "./audio.js";
+
+// Normalizes any `fetch` input (string, URL or Request) to its URL string.
+const resolveRequestUrl = (input: RequestInfo | URL): string =>
+  typeof input === "string"
+    ? input
+    : input instanceof URL
+      ? input.toString()
+      : input.url;
+
+describe("transcribeDeepgramAudio", () => {
+  // Builds the minimal successful JSON reply carrying `transcript` in the
+  // first alternative of the first channel.
+  const okResponse = (transcript: string): Response =>
+    new Response(
+      JSON.stringify({
+        results: { channels: [{ alternatives: [{ transcript }] }] },
+      }),
+      {
+        status: 200,
+        headers: { "content-type": "application/json" },
+      },
+    );

+  it("respects lowercase authorization header overrides", async () => {
+    let capturedAuth: string | null = null;
+    // Stub fetch: record the authorization header that was actually sent.
+    const fetchFn = async (_input: RequestInfo | URL, init?: RequestInit) => {
+      capturedAuth = new Headers(init?.headers).get("authorization");
+      return okResponse("ok");
+    };

+    const { text } = await transcribeDeepgramAudio({
+      buffer: Buffer.from("audio"),
+      fileName: "note.mp3",
+      apiKey: "test-key",
+      timeoutMs: 1000,
+      headers: { authorization: "Token override" },
+      fetchFn,
+    });

+    expect(capturedAuth).toBe("Token override");
+    expect(text).toBe("ok");
+  });

+  it("builds the expected request payload", async () => {
+    let capturedUrl: string | null = null;
+    let capturedInit: RequestInit | undefined;
+    // Stub fetch: record the URL and init so the request can be inspected.
+    const fetchFn = async (input: RequestInfo | URL, init?: RequestInit) => {
+      capturedUrl = resolveRequestUrl(input);
+      capturedInit = init;
+      return okResponse("hello");
+    };

+    const outcome = await transcribeDeepgramAudio({
+      buffer: Buffer.from("audio-bytes"),
+      fileName: "voice.wav",
+      apiKey: "test-key",
+      timeoutMs: 1234,
+      baseUrl: "https://api.example.com/v1/",
+      model: " ",
+      language: " en ",
+      mime: "audio/wav",
+      headers: { "X-Custom": "1" },
+      fetchFn,
+    });

+    // A blank model falls back to the default; the language is trimmed.
+    expect(outcome.model).toBe("nova-3");
+    expect(outcome.text).toBe("hello");
+    expect(capturedUrl).toBe("https://api.example.com/v1/listen?model=nova-3&language=en");
+    expect(capturedInit?.method).toBe("POST");
+    expect(capturedInit?.signal).toBeInstanceOf(AbortSignal);

+    const sentHeaders = new Headers(capturedInit?.headers);
+    expect(sentHeaders.get("authorization")).toBe("Token test-key");
+    expect(sentHeaders.get("x-custom")).toBe("1");
+    expect(sentHeaders.get("content-type")).toBe("audio/wav");
+    expect(Buffer.isBuffer(capturedInit?.body)).toBe(true);
+  });
+});
--- a/src/media-understanding/providers/deepgram/audio.ts
+++ b/src/media-understanding/providers/deepgram/audio.ts
@@ -0,0 +1,64 @@
+import type { AudioTranscriptionRequest, AudioTranscriptionResult } from "../../types.js";
+import { fetchWithTimeout, normalizeBaseUrl, readErrorResponse } from "../shared.js";
+
+/** Fallback API root handed to `normalizeBaseUrl` when resolving the request's `baseUrl`. */
+export const DEFAULT_DEEPGRAM_AUDIO_BASE_URL = "https://api.deepgram.com/v1";
+/** Model name used when the request's `model` is missing or blank. */
+export const DEFAULT_DEEPGRAM_AUDIO_MODEL = "nova-3";
+
+/**
+ * Picks the model name to request.
+ *
+ * @param model - Optional caller-supplied model; surrounding whitespace is ignored.
+ * @returns The trimmed model, or `DEFAULT_DEEPGRAM_AUDIO_MODEL` when it is missing or blank.
+ */
+function resolveModel(model?: string): string {
+  const candidate = (model ?? "").trim();
+  return candidate.length > 0 ? candidate : DEFAULT_DEEPGRAM_AUDIO_MODEL;
+}
+
+/**
+ * Shape of the response fields that `transcribeDeepgramAudio` reads.
+ * Every level is optional, so lookups go through optional chaining.
+ */
+type DeepgramTranscriptResponse = {
+  results?: {
+    channels?: Array<{
+      alternatives?: Array<{
+        // Text of this alternative; returned (trimmed) as the result `text`.
+        transcript?: string;
+      }>;
+    }>;
+  };
+};
+
+/**
+ * Transcribes an audio buffer by POSTing it to the `/listen` endpoint.
+ *
+ * The raw bytes in `params.buffer` are sent as the request body; the model and
+ * the optional language travel as query parameters. Headers supplied in
+ * `params.headers` take precedence over the generated `authorization` and
+ * `content-type` headers.
+ *
+ * @param params - Audio bytes plus request options (API key, model, language,
+ *   MIME type, base URL, extra headers, timeout and an optional `fetch` override).
+ * @returns The trimmed transcript of the first alternative of the first
+ *   channel, together with the model name that was requested.
+ * @throws Error when the HTTP response is not OK, or when the response body
+ *   carries no non-empty transcript.
+ */
+export async function transcribeDeepgramAudio(
+  params: AudioTranscriptionRequest,
+): Promise<AudioTranscriptionResult> {
+  const fetchFn = params.fetchFn ?? fetch;
+  const baseUrl = normalizeBaseUrl(params.baseUrl, DEFAULT_DEEPGRAM_AUDIO_BASE_URL);
+  const model = resolveModel(params.model);
+  const language = params.language?.trim();
+  // Treat a blank MIME type like a missing one, mirroring how blank `model`
+  // and `language` values are handled, so an empty Content-Type is never sent.
+  const contentType = params.mime?.trim() || "application/octet-stream";

+  const url = new URL(`${baseUrl}/listen`);
+  url.searchParams.set("model", model);
+  if (language) url.searchParams.set("language", language);

+  // `Headers` lookups are case-insensitive, so caller overrides are honoured
+  // regardless of the casing they were supplied in.
+  const headers = new Headers(params.headers);
+  if (!headers.has("authorization")) {
+    headers.set("authorization", `Token ${params.apiKey}`);
+  }
+  if (!headers.has("content-type")) {
+    headers.set("content-type", contentType);
+  }

+  const res = await fetchWithTimeout(
+    url.toString(),
+    {
+      method: "POST",
+      headers,
+      body: params.buffer,
+    },
+    params.timeoutMs,
+    fetchFn,
+  );

+  if (!res.ok) {
+    const detail = await readErrorResponse(res);
+    const suffix = detail ? `: ${detail}` : "";
+    throw new Error(`Audio transcription failed (HTTP ${res.status})${suffix}`);
+  }

+  // A body of JSON `null` parses successfully; guard the root as well so it
+  // surfaces as the "missing transcript" error instead of a TypeError.
+  const payload = (await res.json()) as DeepgramTranscriptResponse | null;
+  const transcript = payload?.results?.channels?.[0]?.alternatives?.[0]?.transcript?.trim();
+  if (!transcript) {
+    throw new Error("Audio transcription response missing transcript");
+  }
+  return { text: transcript, model };
+}
--- a/src/media-understanding/providers/deepgram/index.ts
+++ b/src/media-understanding/providers/deepgram/index.ts
@@ -0,0 +1,8 @@
+import type { MediaUnderstandingProvider } from "../../types.js";
+import { transcribeDeepgramAudio } from "./audio.js";
+
+/**
+ * Media-understanding provider entry for Deepgram.
+ * Declares only the "audio" capability and delegates transcription to
+ * `transcribeDeepgramAudio`.
+ */
+export const deepgramProvider: MediaUnderstandingProvider = {
+  id: "deepgram",
+  capabilities: ["audio"],
+  transcribeAudio: transcribeDeepgramAudio,
+};
--- a/src/media-understanding/providers/index.ts
+++ b/src/media-understanding/providers/index.ts
@@ -1,6 +1,7 @@
 import { normalizeProviderId } from "../../agents/model-selection.js";
 import type { MediaUnderstandingProvider } from "../types.js";
 import { anthropicProvider } from "./anthropic/index.js";
+import { deepgramProvider } from "./deepgram/index.js";
 import { googleProvider } from "./google/index.js";
 import { groqProvider } from "./groq/index.js";
 import { minimaxProvider } from "./minimax/index.js";
@@ -12,6 +13,7 @@ const PROVIDERS: MediaUnderstandingProvider[] = [
  googleProvider,
  anthropicProvider,
  minimaxProvider,
+  deepgramProvider,
 ];

 export function normalizeMediaProviderId(id: string): string {