feat: add Deepgram audio transcription

Co-authored-by: Safzan Pirani <safzanpirani@users.noreply.github.com>
This commit is contained in:
Peter Steinberger
2026-01-17 08:46:40 +00:00
parent 869ef0c5ba
commit e637bbdfb5
12 changed files with 303 additions and 2 deletions

View File

@@ -151,6 +151,7 @@ export function resolveEnvApiKey(provider: string): EnvApiKeyResult | null {
openai: "OPENAI_API_KEY",
google: "GEMINI_API_KEY",
groq: "GROQ_API_KEY",
deepgram: "DEEPGRAM_API_KEY",
cerebras: "CEREBRAS_API_KEY",
xai: "XAI_API_KEY",
openrouter: "OPENROUTER_API_KEY",

View File

@@ -30,6 +30,7 @@ export const DEFAULT_VIDEO_MAX_BASE64_BYTES = 70 * MB;
/**
 * Default audio model name for each audio provider key.
 * Keys are lowercase provider ids ("groq", "openai", "deepgram").
 */
export const DEFAULT_AUDIO_MODELS: Record<string, string> = {
groq: "whisper-large-v3-turbo",
openai: "whisper-1",
deepgram: "nova-3",
};
// Five times MB (MB is declared outside this view).
// NOTE(review): presumably the output buffer cap for spawned CLI tools — confirm at the call site.
export const CLI_OUTPUT_MAX_BUFFER = 5 * MB;
// NOTE(review): presumably the default number of media tasks run in parallel — confirm at the call site.
export const DEFAULT_MEDIA_CONCURRENCY = 2;

View File

@@ -0,0 +1,51 @@
import { describe, expect, it } from "vitest";
import { transcribeDeepgramAudio } from "./audio.js";
/** Reads an environment variable and trims it; yields undefined when the variable is unset. */
const readTrimmedEnv = (name: string): string | undefined => process.env[name]?.trim();

// API key for the live run; an empty string means "not configured".
const DEEPGRAM_KEY = process.env.DEEPGRAM_API_KEY ?? "";
// Model, base URL and sample clip can each be overridden through the environment.
const DEEPGRAM_MODEL = readTrimmedEnv("DEEPGRAM_MODEL") || "nova-3";
const DEEPGRAM_BASE_URL = readTrimmedEnv("DEEPGRAM_BASE_URL");
const SAMPLE_URL =
  readTrimmedEnv("DEEPGRAM_SAMPLE_URL") ||
  "https://static.deepgram.com/examples/Bueller-Life-moves-pretty-fast.wav";
// Any one of these flags set to exactly "1" opts in to live tests.
const LIVE = ["DEEPGRAM_LIVE_TEST", "LIVE", "CLAWDBOT_LIVE_TEST"].some(
  (flag) => process.env[flag] === "1",
);
// The suite only runs when live mode is on AND a key is present; otherwise it is skipped.
const describeLive = DEEPGRAM_KEY && LIVE ? describe : describe.skip;
/**
 * Downloads `url` and returns the response body as a Buffer.
 *
 * The request is aborted once `timeoutMs` elapses (clamped to at least 1 ms).
 *
 * @param url - Location of the sample audio file.
 * @param timeoutMs - Abort deadline in milliseconds.
 * @returns The downloaded bytes.
 * @throws Error when the response status is not OK; the fetch rejection is
 *   propagated when the request is aborted or fails.
 */
async function fetchSampleBuffer(url: string, timeoutMs: number): Promise<Buffer> {
  const abort = new AbortController();
  const deadline = setTimeout(() => {
    abort.abort();
  }, Math.max(1, timeoutMs));
  try {
    const response = await fetch(url, { signal: abort.signal });
    if (response.ok) {
      return Buffer.from(await response.arrayBuffer());
    }
    throw new Error(`Sample download failed (HTTP ${response.status})`);
  } finally {
    // Always release the timer so it cannot fire after the download settles.
    clearTimeout(deadline);
  }
}
// Live smoke test: downloads the sample clip and sends it through transcribeDeepgramAudio.
describeLive("deepgram live", () => {
  it(
    "transcribes sample audio",
    async () => {
      const sample = await fetchSampleBuffer(SAMPLE_URL, 15000);
      const { text } = await transcribeDeepgramAudio({
        buffer: sample,
        fileName: "sample.wav",
        mime: "audio/wav",
        apiKey: DEEPGRAM_KEY,
        model: DEEPGRAM_MODEL,
        baseUrl: DEEPGRAM_BASE_URL,
        timeoutMs: 20000,
      });
      // Only require a non-blank transcript; the exact wording is not asserted.
      expect(text.trim().length).toBeGreaterThan(0);
    },
    // Per-test timeout in ms; larger than the 15 s download and 20 s request limits individually.
    30000,
  );
});

View File

@@ -0,0 +1,83 @@
import { describe, expect, it } from "vitest";
import { transcribeDeepgramAudio } from "./audio.js";
/**
 * Normalizes the first argument of a fetch call to a URL string.
 * Strings pass through, URL objects are serialized, and Request objects yield their `url`.
 */
const resolveRequestUrl = (input: RequestInfo | URL): string =>
  typeof input === "string" ? input : input instanceof URL ? input.href : input.url;
// Unit tests for transcribeDeepgramAudio; a stub fetchFn stands in for the network.
describe("transcribeDeepgramAudio", () => {
  /** Builds a 200 JSON response whose first channel/alternative carries `transcript`. */
  const transcriptResponse = (transcript: string): Response =>
    new Response(
      JSON.stringify({
        results: { channels: [{ alternatives: [{ transcript }] }] },
      }),
      {
        status: 200,
        headers: { "content-type": "application/json" },
      },
    );

  it("respects lowercase authorization header overrides", async () => {
    let seenAuth: string | null = null;
    const fetchFn = async (_input: RequestInfo | URL, init?: RequestInit) => {
      // Record the authorization value that actually reaches the transport.
      seenAuth = new Headers(init?.headers).get("authorization");
      return transcriptResponse("ok");
    };

    const { text } = await transcribeDeepgramAudio({
      buffer: Buffer.from("audio"),
      fileName: "note.mp3",
      apiKey: "test-key",
      timeoutMs: 1000,
      headers: { authorization: "Token override" },
      fetchFn,
    });

    // The caller-supplied header must win over the apiKey-derived one.
    expect(seenAuth).toBe("Token override");
    expect(text).toBe("ok");
  });

  it("builds the expected request payload", async () => {
    const captured: { url: string | null; init?: RequestInit } = { url: null };
    const fetchFn = async (input: RequestInfo | URL, init?: RequestInit) => {
      captured.url = resolveRequestUrl(input);
      captured.init = init;
      return transcriptResponse("hello");
    };

    const outcome = await transcribeDeepgramAudio({
      buffer: Buffer.from("audio-bytes"),
      fileName: "voice.wav",
      apiKey: "test-key",
      timeoutMs: 1234,
      baseUrl: "https://api.example.com/v1/",
      model: " ",
      language: " en ",
      mime: "audio/wav",
      headers: { "X-Custom": "1" },
      fetchFn,
    });

    // A blank model falls back to the default; language is trimmed; the trailing slash is dropped.
    expect(outcome.model).toBe("nova-3");
    expect(outcome.text).toBe("hello");
    expect(captured.url).toBe("https://api.example.com/v1/listen?model=nova-3&language=en");
    expect(captured.init?.method).toBe("POST");
    expect(captured.init?.signal).toBeInstanceOf(AbortSignal);

    const sentHeaders = new Headers(captured.init?.headers);
    expect(sentHeaders.get("authorization")).toBe("Token test-key");
    expect(sentHeaders.get("x-custom")).toBe("1");
    expect(sentHeaders.get("content-type")).toBe("audio/wav");
    expect(Buffer.isBuffer(captured.init?.body)).toBe(true);
  });
});

View File

@@ -0,0 +1,64 @@
import type { AudioTranscriptionRequest, AudioTranscriptionResult } from "../../types.js";
import { fetchWithTimeout, normalizeBaseUrl, readErrorResponse } from "../shared.js";
/** Fallback base URL handed to normalizeBaseUrl when building the request URL. */
export const DEFAULT_DEEPGRAM_AUDIO_BASE_URL = "https://api.deepgram.com/v1";
/** Model name used when the request's model is missing or blank. */
export const DEFAULT_DEEPGRAM_AUDIO_MODEL = "nova-3";

/**
 * Picks the model name to send to Deepgram.
 *
 * @param model - Optional caller-supplied model name; surrounding whitespace is ignored.
 * @returns The trimmed model, or DEFAULT_DEEPGRAM_AUDIO_MODEL when it is absent or blank.
 */
function resolveModel(model?: string): string {
  const candidate = (model ?? "").trim();
  return candidate.length > 0 ? candidate : DEFAULT_DEEPGRAM_AUDIO_MODEL;
}
/** A single transcription candidate within a channel. */
type DeepgramAlternative = {
  transcript?: string;
};

/** One audio channel and its transcription candidates. */
type DeepgramChannel = {
  alternatives?: DeepgramAlternative[];
};

/**
 * The portion of the Deepgram response body that this module reads.
 * Every level is optional, so a partial or unexpected payload still type-checks
 * and must be probed with optional chaining.
 */
type DeepgramTranscriptResponse = {
  results?: {
    channels?: DeepgramChannel[];
  };
};
/**
 * Sends an audio buffer to the Deepgram `listen` endpoint and returns the transcript.
 *
 * The raw bytes are POSTed as the request body. The model (and the language, when
 * non-blank) travel as query parameters. Caller-supplied `authorization` and
 * `content-type` headers take precedence over the defaults derived from
 * `apiKey` and `mime`.
 *
 * @param params - Transcription request (buffer, credentials, optional overrides).
 * @returns The trimmed transcript of the first alternative of the first channel,
 *   together with the model name that was requested.
 * @throws Error when the HTTP response is not OK, or when the response carries no
 *   non-empty transcript.
 */
export async function transcribeDeepgramAudio(
  params: AudioTranscriptionRequest,
): Promise<AudioTranscriptionResult> {
  const transport = params.fetchFn ?? fetch;
  const root = normalizeBaseUrl(params.baseUrl, DEFAULT_DEEPGRAM_AUDIO_BASE_URL);
  const model = resolveModel(params.model);

  const endpoint = new URL(`${root}/listen`);
  endpoint.searchParams.set("model", model);
  const language = params.language?.trim();
  if (language) {
    endpoint.searchParams.set("language", language);
  }

  // Start from the caller's headers and only fill in what they left out.
  const requestHeaders = new Headers(params.headers);
  if (!requestHeaders.has("authorization")) {
    requestHeaders.set("authorization", `Token ${params.apiKey}`);
  }
  if (!requestHeaders.has("content-type")) {
    requestHeaders.set("content-type", params.mime ?? "application/octet-stream");
  }

  const response = await fetchWithTimeout(
    endpoint.toString(),
    { method: "POST", headers: requestHeaders, body: params.buffer },
    params.timeoutMs,
    transport,
  );

  if (!response.ok) {
    const detail = await readErrorResponse(response);
    const base = `Audio transcription failed (HTTP ${response.status})`;
    throw new Error(detail ? `${base}: ${detail}` : base);
  }

  const payload = (await response.json()) as DeepgramTranscriptResponse;
  // Only the top alternative of the first channel is used.
  const topAlternative = payload.results?.channels?.[0]?.alternatives?.[0];
  const text = topAlternative?.transcript?.trim();
  if (!text) {
    throw new Error("Audio transcription response missing transcript");
  }
  return { text, model };
}

View File

@@ -0,0 +1,8 @@
import type { MediaUnderstandingProvider } from "../../types.js";
import { transcribeDeepgramAudio } from "./audio.js";
/**
 * Media-understanding provider entry for Deepgram.
 * Declares only the "audio" capability and uses transcribeDeepgramAudio
 * as its audio transcription implementation.
 */
export const deepgramProvider: MediaUnderstandingProvider = {
id: "deepgram",
capabilities: ["audio"],
transcribeAudio: transcribeDeepgramAudio,
};

View File

@@ -1,6 +1,7 @@
import { normalizeProviderId } from "../../agents/model-selection.js";
import type { MediaUnderstandingProvider } from "../types.js";
import { anthropicProvider } from "./anthropic/index.js";
import { deepgramProvider } from "./deepgram/index.js";
import { googleProvider } from "./google/index.js";
import { groqProvider } from "./groq/index.js";
import { minimaxProvider } from "./minimax/index.js";
@@ -12,6 +13,7 @@ const PROVIDERS: MediaUnderstandingProvider[] = [
googleProvider,
anthropicProvider,
minimaxProvider,
deepgramProvider,
];
export function normalizeMediaProviderId(id: string): string {