From d66bc65ca6b9782b9f0ef6da5de0b2abbb9604c5 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sat, 17 Jan 2026 09:12:19 +0000 Subject: [PATCH] refactor: unify media provider options --- docs/nodes/audio.md | 2 +- docs/nodes/media-understanding.md | 4 +- docs/providers/deepgram.md | 16 +-- src/config/types.tools.ts | 8 +- src/config/zod-schema.core.ts | 7 ++ .../runner.deepgram.test.ts | 112 ++++++++++++++++++ src/media-understanding/runner.ts | 84 ++++++++++--- 7 files changed, 204 insertions(+), 29 deletions(-) create mode 100644 src/media-understanding/runner.deepgram.test.ts diff --git a/docs/nodes/audio.md b/docs/nodes/audio.md index 96a0ff8fe..4d011b3fd 100644 --- a/docs/nodes/audio.md +++ b/docs/nodes/audio.md @@ -80,7 +80,7 @@ read_when: - Provider auth follows the standard model auth order (auth profiles, env vars, `models.providers.*.apiKey`). - Deepgram picks up `DEEPGRAM_API_KEY` when `provider: "deepgram"` is used. - Deepgram setup details: [Deepgram (audio transcription)](/providers/deepgram). -- Audio providers can override `baseUrl`/`headers` via `tools.media.audio`. +- Audio providers can override `baseUrl`, `headers`, and `providerOptions` via `tools.media.audio`. - Default size cap is 20MB (`tools.media.audio.maxBytes`). Oversize audio is skipped for that model and the next entry is tried. - Default `maxChars` for audio is **unset** (full transcript). Set `tools.media.audio.maxChars` or per-entry `maxChars` to trim output. - Use `tools.media.audio.attachments` to process multiple voice notes (`mode: "all"` + `maxAttachments`). diff --git a/docs/nodes/media-understanding.md b/docs/nodes/media-understanding.md index 9c0e08a66..6d640cd53 100644 --- a/docs/nodes/media-understanding.md +++ b/docs/nodes/media-understanding.md @@ -32,8 +32,8 @@ If understanding fails or is disabled, **the reply flow continues** with the ori - `tools.media.models`: shared model list (use `capabilities` to gate). 
- `tools.media.image` / `tools.media.audio` / `tools.media.video`: - defaults (`prompt`, `maxChars`, `maxBytes`, `timeoutSeconds`, `language`) - - provider overrides (`baseUrl`, `headers`) - - Deepgram audio options (`deepgram` in `tools.media.audio`) + - provider overrides (`baseUrl`, `headers`, `providerOptions`) + - Deepgram audio options via `tools.media.audio.providerOptions.deepgram` - optional **per‑capability `models` list** (preferred before shared models) - `attachments` policy (`mode`, `maxAttachments`, `prefer`) - `scope` (optional gating by channel/chatType/session key) diff --git a/docs/providers/deepgram.md b/docs/providers/deepgram.md index d34a880dc..133bc132c 100644 --- a/docs/providers/deepgram.md +++ b/docs/providers/deepgram.md @@ -41,9 +41,9 @@ DEEPGRAM_API_KEY=dg_... - `model`: Deepgram model id (default: `nova-3`) - `language`: language hint (optional) -- `tools.media.audio.deepgram.detectLanguage`: enable language detection (optional) -- `tools.media.audio.deepgram.punctuate`: enable punctuation (optional) -- `tools.media.audio.deepgram.smartFormat`: enable smart formatting (optional) +- `tools.media.audio.providerOptions.deepgram.detect_language`: enable language detection (optional) +- `tools.media.audio.providerOptions.deepgram.punctuate`: enable punctuation (optional) +- `tools.media.audio.providerOptions.deepgram.smart_format`: enable smart formatting (optional) Example with language: ```json5 @@ -68,10 +68,12 @@ Example with Deepgram options: media: { audio: { enabled: true, - deepgram: { - detectLanguage: true, - punctuate: true, - smartFormat: true + providerOptions: { + deepgram: { + detect_language: true, + punctuate: true, + smart_format: true + } }, models: [{ provider: "deepgram", model: "nova-3" }] } diff --git a/src/config/types.tools.ts b/src/config/types.tools.ts index 22983b270..bbcf17ff1 100644 --- a/src/config/types.tools.ts +++ b/src/config/types.tools.ts @@ -51,7 +51,9 @@ export type MediaUnderstandingModelConfig = { 
timeoutSeconds?: number; /** Optional language hint for audio transcription. */ language?: string; - /** Optional Deepgram transcription options (audio only). */ + /** Optional provider-specific query params (merged into requests). */ + providerOptions?: Record<string, Record<string, string | number | boolean>>; + /** @deprecated Use providerOptions.deepgram instead. */ deepgram?: { detectLanguage?: boolean; punctuate?: boolean; @@ -82,7 +84,9 @@ export type MediaUnderstandingConfig = { timeoutSeconds?: number; /** Default language hint (audio). */ language?: string; - /** Optional Deepgram transcription options (audio only). */ + /** Optional provider-specific query params (merged into requests). */ + providerOptions?: Record<string, Record<string, string | number | boolean>>; + /** @deprecated Use providerOptions.deepgram instead. */ deepgram?: { detectLanguage?: boolean; punctuate?: boolean; diff --git a/src/config/zod-schema.core.ts b/src/config/zod-schema.core.ts index 47239d210..0e7bf0cbc 100644 --- a/src/config/zod-schema.core.ts +++ b/src/config/zod-schema.core.ts @@ -284,6 +284,11 @@ const DeepgramAudioSchema = z }) .optional(); +const ProviderOptionValueSchema = z.union([z.string(), z.number(), z.boolean()]); +const ProviderOptionsSchema = z + .record(z.string(), z.record(z.string(), ProviderOptionValueSchema)) + .optional(); + export const MediaUnderstandingModelSchema = z .object({ provider: z.string().optional(), @@ -297,6 +302,7 @@ export const MediaUnderstandingModelSchema = z maxBytes: z.number().int().positive().optional(), timeoutSeconds: z.number().int().positive().optional(), language: z.string().optional(), + providerOptions: ProviderOptionsSchema, deepgram: DeepgramAudioSchema, baseUrl: z.string().optional(), headers: z.record(z.string(), z.string()).optional(), @@ -314,6 +320,7 @@ export const ToolsMediaUnderstandingSchema = z prompt: z.string().optional(), timeoutSeconds: z.number().int().positive().optional(), language: z.string().optional(), + providerOptions: ProviderOptionsSchema, deepgram: DeepgramAudioSchema, baseUrl: 
z.string().optional(), headers: z.record(z.string(), z.string()).optional(), diff --git a/src/media-understanding/runner.deepgram.test.ts b/src/media-understanding/runner.deepgram.test.ts new file mode 100644 index 000000000..2a6cc5de7 --- /dev/null +++ b/src/media-understanding/runner.deepgram.test.ts @@ -0,0 +1,112 @@ +import fs from "node:fs/promises"; +import os from "node:os"; +import path from "node:path"; + +import { describe, expect, it } from "vitest"; + +import type { ClawdbotConfig } from "../config/config.js"; +import type { MsgContext } from "../auto-reply/templating.js"; +import { + buildProviderRegistry, + createMediaAttachmentCache, + normalizeMediaAttachments, + runCapability, +} from "./runner.js"; + +describe("runCapability deepgram provider options", () => { + it("merges provider options, headers, and baseUrl overrides", async () => { + const tmpPath = path.join(os.tmpdir(), `clawdbot-deepgram-${Date.now()}.wav`); + await fs.writeFile(tmpPath, Buffer.from("RIFF")); + const ctx: MsgContext = { MediaPath: tmpPath, MediaType: "audio/wav" }; + const media = normalizeMediaAttachments(ctx); + const cache = createMediaAttachmentCache(media); + + let seenQuery: Record<string, string | number | boolean> | undefined; + let seenBaseUrl: string | undefined; + let seenHeaders: Record<string, string> | undefined; + + const providerRegistry = buildProviderRegistry({ + deepgram: { + id: "deepgram", + capabilities: ["audio"], + transcribeAudio: async (req) => { + seenQuery = req.query; + seenBaseUrl = req.baseUrl; + seenHeaders = req.headers; + return { text: "ok", model: req.model }; + }, + }, + }); + + const cfg = { + models: { + providers: { + deepgram: { + baseUrl: "https://provider.example", + apiKey: "test-key", + headers: { "X-Provider": "1" }, + models: [], + }, + }, + }, + tools: { + media: { + audio: { + enabled: true, + baseUrl: "https://config.example", + headers: { "X-Config": "2" }, + providerOptions: { + deepgram: { + detect_language: true, + punctuate: true, + }, + }, + deepgram: { smartFormat: 
true }, + models: [ + { + provider: "deepgram", + model: "nova-3", + baseUrl: "https://entry.example", + headers: { "X-Entry": "3" }, + providerOptions: { + deepgram: { + detectLanguage: false, + punctuate: false, + smart_format: true, + }, + }, + }, + ], + }, + }, + }, + } as unknown as ClawdbotConfig; + + try { + const result = await runCapability({ + capability: "audio", + cfg, + ctx, + attachments: cache, + media, + providerRegistry, + }); + expect(result.outputs[0]?.text).toBe("ok"); + expect(seenBaseUrl).toBe("https://entry.example"); + expect(seenHeaders).toMatchObject({ + "X-Provider": "1", + "X-Config": "2", + "X-Entry": "3", + }); + expect(seenQuery).toMatchObject({ + detect_language: false, + punctuate: false, + smart_format: true, + }); + expect((seenQuery as Record<string, unknown>)["detectLanguage"]).toBeUndefined(); + } finally { + await cache.cleanup(); + await fs.unlink(tmpPath).catch(() => {}); + } + }); +}); diff --git a/src/media-understanding/runner.ts b/src/media-understanding/runner.ts index 2ccd0aa58..4e1b192af 100644 --- a/src/media-understanding/runner.ts +++ b/src/media-understanding/runner.ts @@ -71,21 +71,73 @@ function trimOutput(text: string, maxChars?: number): string { return trimmed.slice(0, maxChars).trim(); } -function buildDeepgramQuery(options?: { +type ProviderQuery = Record<string, string | number | boolean>; + +function normalizeProviderQuery( + options?: Record<string, string | number | boolean | undefined>, +): ProviderQuery | undefined { + if (!options) return undefined; + const query: ProviderQuery = {}; + for (const [key, value] of Object.entries(options)) { + if (value === undefined) continue; + query[key] = value; + } + return Object.keys(query).length > 0 ? 
query : undefined; +} + +function buildDeepgramCompatQuery(options?: { detectLanguage?: boolean; punctuate?: boolean; smartFormat?: boolean; -}): Record<string, boolean> | undefined { +}): ProviderQuery | undefined { if (!options) return undefined; - const query: Record<string, boolean> = {}; - if (typeof options.detectLanguage === "boolean") { - query.detect_language = options.detectLanguage; + const query: ProviderQuery = {}; + if (typeof options.detectLanguage === "boolean") query.detect_language = options.detectLanguage; + if (typeof options.punctuate === "boolean") query.punctuate = options.punctuate; + if (typeof options.smartFormat === "boolean") query.smart_format = options.smartFormat; + return Object.keys(query).length > 0 ? query : undefined; +} + +function mergeProviderQuery( + base: ProviderQuery | undefined, + incoming: ProviderQuery | undefined, +): ProviderQuery | undefined { + if (!base && !incoming) return undefined; + return { ...(base ?? {}), ...(incoming ?? {}) }; +} + +function normalizeDeepgramQueryKeys(query: ProviderQuery): ProviderQuery { + const normalized = { ...query }; + if ("detectLanguage" in normalized) { + normalized.detect_language = normalized.detectLanguage as boolean; + delete normalized.detectLanguage; } - if (typeof options.punctuate === "boolean") { - query.punctuate = options.punctuate; + if ("smartFormat" in normalized) { + normalized.smart_format = normalized.smartFormat as boolean; + delete normalized.smartFormat; } - if (typeof options.smartFormat === "boolean") { - query.smart_format = options.smartFormat; + return normalized; +} + +function resolveProviderQuery(params: { + providerId: string; + config?: MediaUnderstandingConfig; + entry: MediaUnderstandingModelConfig; +}): ProviderQuery | undefined { + const { providerId, config, entry } = params; + const mergedOptions = normalizeProviderQuery({ + ...(config?.providerOptions?.[providerId] ?? {}), + ...(entry.providerOptions?.[providerId] ?? 
{}), + }); + if (providerId !== "deepgram") { + return mergedOptions; + } + let query = normalizeDeepgramQueryKeys(mergedOptions ?? {}); + const compat = buildDeepgramCompatQuery({ ...config?.deepgram, ...entry.deepgram }); + for (const [key, value] of Object.entries(compat ?? {})) { + if (query[key] === undefined) { + query[key] = value; + } } return Object.keys(query).length > 0 ? query : undefined; } @@ -246,13 +298,11 @@ async function runProviderEntry(params: { ...(entry.headers ?? {}), }; const headers = Object.keys(mergedHeaders).length > 0 ? mergedHeaders : undefined; - const deepgramQuery = - providerId === "deepgram" - ? buildDeepgramQuery({ - ...params.config?.deepgram, - ...entry.deepgram, - }) - : undefined; + const providerQuery = resolveProviderQuery({ + providerId, + config: params.config, + entry, + }); const model = entry.model?.trim() || DEFAULT_AUDIO_MODELS[providerId] || entry.model; const result = await provider.transcribeAudio({ buffer: media.buffer, @@ -264,7 +314,7 @@ async function runProviderEntry(params: { model, language: entry.language ?? params.config?.language ?? cfg.tools?.media?.audio?.language, prompt, - query: deepgramQuery, + query: providerQuery, timeoutMs, }); return {