From d66bc65ca6b9782b9f0ef6da5de0b2abbb9604c5 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sat, 17 Jan 2026 09:12:19 +0000 Subject: [PATCH] refactor: unify media provider options --- docs/nodes/audio.md | 2 +- docs/nodes/media-understanding.md | 4 +- docs/providers/deepgram.md | 16 +-- src/config/types.tools.ts | 8 +- src/config/zod-schema.core.ts | 7 ++ .../runner.deepgram.test.ts | 112 ++++++++++++++++++ src/media-understanding/runner.ts | 84 ++++++++++--- 7 files changed, 204 insertions(+), 29 deletions(-) create mode 100644 src/media-understanding/runner.deepgram.test.ts diff --git a/docs/nodes/audio.md b/docs/nodes/audio.md index 96a0ff8fe..4d011b3fd 100644 --- a/docs/nodes/audio.md +++ b/docs/nodes/audio.md @@ -80,7 +80,7 @@ read_when: - Provider auth follows the standard model auth order (auth profiles, env vars, `models.providers.*.apiKey`). - Deepgram picks up `DEEPGRAM_API_KEY` when `provider: "deepgram"` is used. - Deepgram setup details: [Deepgram (audio transcription)](/providers/deepgram). -- Audio providers can override `baseUrl`/`headers` via `tools.media.audio`. +- Audio providers can override `baseUrl`, `headers`, and `providerOptions` via `tools.media.audio`. - Default size cap is 20MB (`tools.media.audio.maxBytes`). Oversize audio is skipped for that model and the next entry is tried. - Default `maxChars` for audio is **unset** (full transcript). Set `tools.media.audio.maxChars` or per-entry `maxChars` to trim output. - Use `tools.media.audio.attachments` to process multiple voice notes (`mode: "all"` + `maxAttachments`). diff --git a/docs/nodes/media-understanding.md b/docs/nodes/media-understanding.md index 9c0e08a66..6d640cd53 100644 --- a/docs/nodes/media-understanding.md +++ b/docs/nodes/media-understanding.md @@ -32,8 +32,8 @@ If understanding fails or is disabled, **the reply flow continues** with the ori - `tools.media.models`: shared model list (use `capabilities` to gate). 
- `tools.media.image` / `tools.media.audio` / `tools.media.video`: - defaults (`prompt`, `maxChars`, `maxBytes`, `timeoutSeconds`, `language`) - - provider overrides (`baseUrl`, `headers`) - - Deepgram audio options (`deepgram` in `tools.media.audio`) + - provider overrides (`baseUrl`, `headers`, `providerOptions`) + - Deepgram audio options via `tools.media.audio.providerOptions.deepgram` - optional **per‑capability `models` list** (preferred before shared models) - `attachments` policy (`mode`, `maxAttachments`, `prefer`) - `scope` (optional gating by channel/chatType/session key) diff --git a/docs/providers/deepgram.md b/docs/providers/deepgram.md index d34a880dc..133bc132c 100644 --- a/docs/providers/deepgram.md +++ b/docs/providers/deepgram.md @@ -41,9 +41,9 @@ DEEPGRAM_API_KEY=dg_... - `model`: Deepgram model id (default: `nova-3`) - `language`: language hint (optional) -- `tools.media.audio.deepgram.detectLanguage`: enable language detection (optional) -- `tools.media.audio.deepgram.punctuate`: enable punctuation (optional) -- `tools.media.audio.deepgram.smartFormat`: enable smart formatting (optional) +- `tools.media.audio.providerOptions.deepgram.detect_language`: enable language detection (optional) +- `tools.media.audio.providerOptions.deepgram.punctuate`: enable punctuation (optional) +- `tools.media.audio.providerOptions.deepgram.smart_format`: enable smart formatting (optional) Example with language: ```json5 @@ -68,10 +68,12 @@ Example with Deepgram options: media: { audio: { enabled: true, - deepgram: { - detectLanguage: true, - punctuate: true, - smartFormat: true + providerOptions: { + deepgram: { + detect_language: true, + punctuate: true, + smart_format: true + } }, models: [{ provider: "deepgram", model: "nova-3" }] } diff --git a/src/config/types.tools.ts b/src/config/types.tools.ts index 22983b270..bbcf17ff1 100644 --- a/src/config/types.tools.ts +++ b/src/config/types.tools.ts @@ -51,7 +51,9 @@ export type MediaUnderstandingModelConfig = { 
timeoutSeconds?: number; /** Optional language hint for audio transcription. */ language?: string; - /** Optional Deepgram transcription options (audio only). */ + /** Optional provider-specific query params (merged into requests). */ + providerOptions?: Record<string, Record<string, string | number | boolean>>; + /** @deprecated Use providerOptions.deepgram instead. */ deepgram?: { detectLanguage?: boolean; punctuate?: boolean; @@ -82,7 +84,9 @@ export type MediaUnderstandingConfig = { timeoutSeconds?: number; /** Default language hint (audio). */ language?: string; - /** Optional Deepgram transcription options (audio only). */ + /** Optional provider-specific query params (merged into requests). */ + providerOptions?: Record<string, Record<string, string | number | boolean>>; + /** @deprecated Use providerOptions.deepgram instead. */ deepgram?: { detectLanguage?: boolean; punctuate?: boolean; diff --git a/src/config/zod-schema.core.ts b/src/config/zod-schema.core.ts index 47239d210..0e7bf0cbc 100644 --- a/src/config/zod-schema.core.ts +++ b/src/config/zod-schema.core.ts @@ -284,6 +284,11 @@ const DeepgramAudioSchema = z }) .optional(); +const ProviderOptionValueSchema = z.union([z.string(), z.number(), z.boolean()]); +const ProviderOptionsSchema = z + .record(z.string(), z.record(z.string(), ProviderOptionValueSchema)) + .optional(); + export const MediaUnderstandingModelSchema = z .object({ provider: z.string().optional(), @@ -297,6 +302,7 @@ export const MediaUnderstandingModelSchema = z maxBytes: z.number().int().positive().optional(), timeoutSeconds: z.number().int().positive().optional(), language: z.string().optional(), + providerOptions: ProviderOptionsSchema, deepgram: DeepgramAudioSchema, baseUrl: z.string().optional(), headers: z.record(z.string(), z.string()).optional(), @@ -314,6 +320,7 @@ export const ToolsMediaUnderstandingSchema = z prompt: z.string().optional(), timeoutSeconds: z.number().int().positive().optional(), language: z.string().optional(), + providerOptions: ProviderOptionsSchema, deepgram: DeepgramAudioSchema, baseUrl: 
z.string().optional(), headers: z.record(z.string(), z.string()).optional(), diff --git a/src/media-understanding/runner.deepgram.test.ts b/src/media-understanding/runner.deepgram.test.ts new file mode 100644 index 000000000..2a6cc5de7 --- /dev/null +++ b/src/media-understanding/runner.deepgram.test.ts @@ -0,0 +1,112 @@ +import fs from "node:fs/promises"; +import os from "node:os"; +import path from "node:path"; + +import { describe, expect, it } from "vitest"; + +import type { ClawdbotConfig } from "../config/config.js"; +import type { MsgContext } from "../auto-reply/templating.js"; +import { + buildProviderRegistry, + createMediaAttachmentCache, + normalizeMediaAttachments, + runCapability, +} from "./runner.js"; + +describe("runCapability deepgram provider options", () => { + it("merges provider options, headers, and baseUrl overrides", async () => { + const tmpPath = path.join(os.tmpdir(), `clawdbot-deepgram-${Date.now()}.wav`); + await fs.writeFile(tmpPath, Buffer.from("RIFF")); + const ctx: MsgContext = { MediaPath: tmpPath, MediaType: "audio/wav" }; + const media = normalizeMediaAttachments(ctx); + const cache = createMediaAttachmentCache(media); + + let seenQuery: Record<string, string | number | boolean> | undefined; + let seenBaseUrl: string | undefined; + let seenHeaders: Record<string, string> | undefined; + + const providerRegistry = buildProviderRegistry({ + deepgram: { + id: "deepgram", + capabilities: ["audio"], + transcribeAudio: async (req) => { + seenQuery = req.query; + seenBaseUrl = req.baseUrl; + seenHeaders = req.headers; + return { text: "ok", model: req.model }; + }, + }, + }); + + const cfg = { + models: { + providers: { + deepgram: { + baseUrl: "https://provider.example", + apiKey: "test-key", + headers: { "X-Provider": "1" }, + models: [], + }, + }, + }, + tools: { + media: { + audio: { + enabled: true, + baseUrl: "https://config.example", + headers: { "X-Config": "2" }, + providerOptions: { + deepgram: { + detect_language: true, + punctuate: true, + }, + }, + deepgram: { smartFormat: 
true }, + models: [ + { + provider: "deepgram", + model: "nova-3", + baseUrl: "https://entry.example", + headers: { "X-Entry": "3" }, + providerOptions: { + deepgram: { + detectLanguage: false, + punctuate: false, + smart_format: true, + }, + }, + }, + ], + }, + }, + }, + } as unknown as ClawdbotConfig; + + try { + const result = await runCapability({ + capability: "audio", + cfg, + ctx, + attachments: cache, + media, + providerRegistry, + }); + expect(result.outputs[0]?.text).toBe("ok"); + expect(seenBaseUrl).toBe("https://entry.example"); + expect(seenHeaders).toMatchObject({ + "X-Provider": "1", + "X-Config": "2", + "X-Entry": "3", + }); + expect(seenQuery).toMatchObject({ + detect_language: false, + punctuate: false, + smart_format: true, + }); + expect((seenQuery as Record<string, unknown>)["detectLanguage"]).toBeUndefined(); + } finally { + await cache.cleanup(); + await fs.unlink(tmpPath).catch(() => {}); + } + }); +}); diff --git a/src/media-understanding/runner.ts b/src/media-understanding/runner.ts index 2ccd0aa58..4e1b192af 100644 --- a/src/media-understanding/runner.ts +++ b/src/media-understanding/runner.ts @@ -71,21 +71,73 @@ function trimOutput(text: string, maxChars?: number): string { return trimmed.slice(0, maxChars).trim(); } -function buildDeepgramQuery(options?: { +type ProviderQuery = Record<string, string | number | boolean>; + +function normalizeProviderQuery( + options?: Record<string, string | number | boolean | undefined>, +): ProviderQuery | undefined { + if (!options) return undefined; + const query: ProviderQuery = {}; + for (const [key, value] of Object.entries(options)) { + if (value === undefined) continue; + query[key] = value; + } + return Object.keys(query).length > 0 ? 
query : undefined; +} + +function buildDeepgramCompatQuery(options?: { detectLanguage?: boolean; punctuate?: boolean; smartFormat?: boolean; -}): Record<string, boolean> | undefined { +}): ProviderQuery | undefined { if (!options) return undefined; - const query: Record<string, boolean> = {}; - if (typeof options.detectLanguage === "boolean") { - query.detect_language = options.detectLanguage; + const query: ProviderQuery = {}; + if (typeof options.detectLanguage === "boolean") query.detect_language = options.detectLanguage; + if (typeof options.punctuate === "boolean") query.punctuate = options.punctuate; + if (typeof options.smartFormat === "boolean") query.smart_format = options.smartFormat; + return Object.keys(query).length > 0 ? query : undefined; +} + +function mergeProviderQuery( + base: ProviderQuery | undefined, + incoming: ProviderQuery | undefined, +): ProviderQuery | undefined { + if (!base && !incoming) return undefined; + return { ...(base ?? {}), ...(incoming ?? {}) }; +} + +function normalizeDeepgramQueryKeys(query: ProviderQuery): ProviderQuery { + const normalized = { ...query }; + if ("detectLanguage" in normalized) { + normalized.detect_language = normalized.detectLanguage as boolean; + delete normalized.detectLanguage; } - if (typeof options.punctuate === "boolean") { - query.punctuate = options.punctuate; + if ("smartFormat" in normalized) { + normalized.smart_format = normalized.smartFormat as boolean; + delete normalized.smartFormat; } - if (typeof options.smartFormat === "boolean") { - query.smart_format = options.smartFormat; + return normalized; +} + +function resolveProviderQuery(params: { + providerId: string; + config?: MediaUnderstandingConfig; + entry: MediaUnderstandingModelConfig; +}): ProviderQuery | undefined { + const { providerId, config, entry } = params; + const mergedOptions = normalizeProviderQuery({ + ...(config?.providerOptions?.[providerId] ?? {}), + ...(entry.providerOptions?.[providerId] ?? 
{}), + }); + if (providerId !== "deepgram") { + return mergedOptions; + } + let query = normalizeDeepgramQueryKeys(mergedOptions ?? {}); + const compat = buildDeepgramCompatQuery({ ...config?.deepgram, ...entry.deepgram }); + for (const [key, value] of Object.entries(compat ?? {})) { + if (query[key] === undefined) { + query[key] = value; + } } return Object.keys(query).length > 0 ? query : undefined; } @@ -246,13 +298,11 @@ async function runProviderEntry(params: { ...(entry.headers ?? {}), }; const headers = Object.keys(mergedHeaders).length > 0 ? mergedHeaders : undefined; - const deepgramQuery = - providerId === "deepgram" - ? buildDeepgramQuery({ - ...params.config?.deepgram, - ...entry.deepgram, - }) - : undefined; + const providerQuery = resolveProviderQuery({ + providerId, + config: params.config, + entry, + }); const model = entry.model?.trim() || DEFAULT_AUDIO_MODELS[providerId] || entry.model; const result = await provider.transcribeAudio({ buffer: media.buffer, @@ -264,7 +314,7 @@ async function runProviderEntry(params: { model, language: entry.language ?? params.config?.language ?? cfg.tools?.media?.audio?.language, prompt, - query: deepgramQuery, + query: providerQuery, timeoutMs, }); return {