From ae6792522de58c6de07a8f7276aa91057a04eddc Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sat, 17 Jan 2026 08:50:28 +0000 Subject: [PATCH] feat: add deepgram audio options --- docs/nodes/audio.md | 1 + docs/nodes/media-understanding.md | 2 + docs/providers/deepgram.md | 23 +++++++++++ src/config/types.tools.ts | 20 ++++++++++ src/config/zod-schema.core.ts | 14 +++++++ .../providers/deepgram/audio.test.ts | 8 +++- .../providers/deepgram/audio.ts | 6 +++ src/media-understanding/runner.ts | 38 ++++++++++++++++++- src/media-understanding/types.ts | 1 + 9 files changed, 110 insertions(+), 3 deletions(-) diff --git a/docs/nodes/audio.md b/docs/nodes/audio.md index ce51c9fab..96a0ff8fe 100644 --- a/docs/nodes/audio.md +++ b/docs/nodes/audio.md @@ -80,6 +80,7 @@ read_when: - Provider auth follows the standard model auth order (auth profiles, env vars, `models.providers.*.apiKey`). - Deepgram picks up `DEEPGRAM_API_KEY` when `provider: "deepgram"` is used. - Deepgram setup details: [Deepgram (audio transcription)](/providers/deepgram). +- Audio providers can override `baseUrl`/`headers` via `tools.media.audio`. - Default size cap is 20MB (`tools.media.audio.maxBytes`). Oversize audio is skipped for that model and the next entry is tried. - Default `maxChars` for audio is **unset** (full transcript). Set `tools.media.audio.maxChars` or per-entry `maxChars` to trim output. - Use `tools.media.audio.attachments` to process multiple voice notes (`mode: "all"` + `maxAttachments`). diff --git a/docs/nodes/media-understanding.md b/docs/nodes/media-understanding.md index 57b532dc0..9c0e08a66 100644 --- a/docs/nodes/media-understanding.md +++ b/docs/nodes/media-understanding.md @@ -32,6 +32,8 @@ If understanding fails or is disabled, **the reply flow continues** with the ori - `tools.media.models`: shared model list (use `capabilities` to gate). 
- `tools.media.image` / `tools.media.audio` / `tools.media.video`: - defaults (`prompt`, `maxChars`, `maxBytes`, `timeoutSeconds`, `language`) + - provider overrides (`baseUrl`, `headers`) + - Deepgram audio options (`deepgram` in `tools.media.audio`) - optional **per‑capability `models` list** (preferred before shared models) - `attachments` policy (`mode`, `maxAttachments`, `prefer`) - `scope` (optional gating by channel/chatType/session key) diff --git a/docs/providers/deepgram.md b/docs/providers/deepgram.md index 3e01bcb13..d34a880dc 100644 --- a/docs/providers/deepgram.md +++ b/docs/providers/deepgram.md @@ -41,6 +41,9 @@ DEEPGRAM_API_KEY=dg_... - `model`: Deepgram model id (default: `nova-3`) - `language`: language hint (optional) +- `tools.media.audio.deepgram.detectLanguage`: enable language detection (optional) +- `tools.media.audio.deepgram.punctuate`: enable punctuation (optional) +- `tools.media.audio.deepgram.smartFormat`: enable smart formatting (optional) Example with language: ```json5 @@ -58,7 +61,27 @@ Example with language: } ``` +Example with Deepgram options: +```json5 +{ + tools: { + media: { + audio: { + enabled: true, + deepgram: { + detectLanguage: true, + punctuate: true, + smartFormat: true + }, + models: [{ provider: "deepgram", model: "nova-3" }] + } + } + } +} +``` + ## Notes - Authentication follows the standard provider auth order; `DEEPGRAM_API_KEY` is the simplest path. +- Override endpoints or headers with `tools.media.audio.baseUrl` and `tools.media.audio.headers` when using a proxy. - Output follows the same audio rules as other providers (size caps, timeouts, transcript injection). diff --git a/src/config/types.tools.ts b/src/config/types.tools.ts index 2b19bab2d..22983b270 100644 --- a/src/config/types.tools.ts +++ b/src/config/types.tools.ts @@ -51,6 +51,16 @@ export type MediaUnderstandingModelConfig = { timeoutSeconds?: number; /** Optional language hint for audio transcription. 
*/ language?: string; + /** Optional Deepgram transcription options (audio only). */ + deepgram?: { + detectLanguage?: boolean; + punctuate?: boolean; + smartFormat?: boolean; + }; + /** Optional base URL override for provider requests. */ + baseUrl?: string; + /** Optional headers merged into provider requests. */ + headers?: Record<string, string>; /** Auth profile id to use for this provider. */ profile?: string; /** Preferred profile id if multiple are available. */ @@ -72,6 +82,16 @@ export type MediaUnderstandingConfig = { timeoutSeconds?: number; /** Default language hint (audio). */ language?: string; + /** Optional Deepgram transcription options (audio only). */ + deepgram?: { + detectLanguage?: boolean; + punctuate?: boolean; + smartFormat?: boolean; + }; + /** Optional base URL override for provider requests. */ + baseUrl?: string; + /** Optional headers merged into provider requests. */ + headers?: Record<string, string>; /** Attachment selection policy. */ attachments?: MediaUnderstandingAttachmentsConfig; /** Ordered model list (fallbacks in order). 
*/ diff --git a/src/config/zod-schema.core.ts b/src/config/zod-schema.core.ts index 1922670be..47239d210 100644 --- a/src/config/zod-schema.core.ts +++ b/src/config/zod-schema.core.ts @@ -276,6 +276,14 @@ export const MediaUnderstandingAttachmentsSchema = z }) .optional(); +const DeepgramAudioSchema = z + .object({ + detectLanguage: z.boolean().optional(), + punctuate: z.boolean().optional(), + smartFormat: z.boolean().optional(), + }) + .optional(); + export const MediaUnderstandingModelSchema = z .object({ provider: z.string().optional(), @@ -289,6 +297,9 @@ export const MediaUnderstandingModelSchema = z maxBytes: z.number().int().positive().optional(), timeoutSeconds: z.number().int().positive().optional(), language: z.string().optional(), + deepgram: DeepgramAudioSchema, + baseUrl: z.string().optional(), + headers: z.record(z.string(), z.string()).optional(), profile: z.string().optional(), preferredProfile: z.string().optional(), }) @@ -303,6 +314,9 @@ export const ToolsMediaUnderstandingSchema = z prompt: z.string().optional(), timeoutSeconds: z.number().int().positive().optional(), language: z.string().optional(), + deepgram: DeepgramAudioSchema, + baseUrl: z.string().optional(), + headers: z.record(z.string(), z.string()).optional(), attachments: MediaUnderstandingAttachmentsSchema, models: z.array(MediaUnderstandingModelSchema).optional(), }) diff --git a/src/media-understanding/providers/deepgram/audio.test.ts b/src/media-understanding/providers/deepgram/audio.test.ts index a4b50dae5..737649cbe 100644 --- a/src/media-understanding/providers/deepgram/audio.test.ts +++ b/src/media-understanding/providers/deepgram/audio.test.ts @@ -65,12 +65,18 @@ describe("transcribeDeepgramAudio", () => { language: " en ", mime: "audio/wav", headers: { "X-Custom": "1" }, + query: { + punctuate: false, + smart_format: true, + }, fetchFn, }); expect(result.model).toBe("nova-3"); expect(result.text).toBe("hello"); - 
expect(seenUrl).toBe("https://api.example.com/v1/listen?model=nova-3&language=en"); + expect(seenUrl).toBe( + "https://api.example.com/v1/listen?model=nova-3&language=en&punctuate=false&smart_format=true", + ); expect(seenInit?.method).toBe("POST"); expect(seenInit?.signal).toBeInstanceOf(AbortSignal); diff --git a/src/media-understanding/providers/deepgram/audio.ts b/src/media-understanding/providers/deepgram/audio.ts index 0e9ce1245..a35d49e7e 100644 --- a/src/media-understanding/providers/deepgram/audio.ts +++ b/src/media-understanding/providers/deepgram/audio.ts @@ -29,6 +29,12 @@ export async function transcribeDeepgramAudio( const url = new URL(`${baseUrl}/listen`); url.searchParams.set("model", model); if (params.language?.trim()) url.searchParams.set("language", params.language.trim()); + if (params.query) { + for (const [key, value] of Object.entries(params.query)) { + if (value === undefined) continue; + url.searchParams.set(key, String(value)); + } + } const headers = new Headers(params.headers); if (!headers.has("authorization")) { diff --git a/src/media-understanding/runner.ts b/src/media-understanding/runner.ts index 10a976e59..2ccd0aa58 100644 --- a/src/media-understanding/runner.ts +++ b/src/media-understanding/runner.ts @@ -71,6 +71,25 @@ function trimOutput(text: string, maxChars?: number): string { return trimmed.slice(0, maxChars).trim(); } +function buildDeepgramQuery(options?: { + detectLanguage?: boolean; + punctuate?: boolean; + smartFormat?: boolean; +}): Record<string, string | number | boolean> | undefined { + if (!options) return undefined; + const query: Record<string, string | number | boolean> = {}; + if (typeof options.detectLanguage === "boolean") { + query.detect_language = options.detectLanguage; + } + if (typeof options.punctuate === "boolean") { + query.punctuate = options.punctuate; + } + if (typeof options.smartFormat === "boolean") { + query.smart_format = options.smartFormat; + } + return Object.keys(query).length > 0 ? 
query : undefined; +} + function buildModelDecision(params: { entry: MediaUnderstandingModelConfig; entryType: "provider" | "cli"; @@ -220,17 +239,32 @@ async function runProviderEntry(params: { agentDir: params.agentDir, }); const providerConfig = cfg.models?.providers?.[providerId]; + const baseUrl = entry.baseUrl ?? params.config?.baseUrl ?? providerConfig?.baseUrl; + const mergedHeaders = { + ...(providerConfig?.headers ?? {}), + ...(params.config?.headers ?? {}), + ...(entry.headers ?? {}), + }; + const headers = Object.keys(mergedHeaders).length > 0 ? mergedHeaders : undefined; + const deepgramQuery = + providerId === "deepgram" + ? buildDeepgramQuery({ + ...params.config?.deepgram, + ...entry.deepgram, + }) + : undefined; const model = entry.model?.trim() || DEFAULT_AUDIO_MODELS[providerId] || entry.model; const result = await provider.transcribeAudio({ buffer: media.buffer, fileName: media.fileName, mime: media.mime, apiKey: key.apiKey, - baseUrl: providerConfig?.baseUrl, - headers: providerConfig?.headers, + baseUrl, + headers, model, language: entry.language ?? params.config?.language ?? cfg.tools?.media?.audio?.language, prompt, + query: deepgramQuery, timeoutMs, }); return { diff --git a/src/media-understanding/types.ts b/src/media-understanding/types.ts index c0aa11c40..81173c405 100644 --- a/src/media-understanding/types.ts +++ b/src/media-understanding/types.ts @@ -57,6 +57,7 @@ export type AudioTranscriptionRequest = { model?: string; language?: string; prompt?: string; + query?: Record<string, string | number | boolean | undefined>; timeoutMs: number; fetchFn?: typeof fetch; };