refactor: unify media provider options

Peter Steinberger
2026-01-17 09:12:19 +00:00
parent 89f85ddeab
commit d66bc65ca6
7 changed files with 204 additions and 29 deletions

View File

@@ -80,7 +80,7 @@ read_when:
- Provider auth follows the standard model auth order (auth profiles, env vars, `models.providers.*.apiKey`).
- Deepgram picks up `DEEPGRAM_API_KEY` when `provider: "deepgram"` is used.
- Deepgram setup details: [Deepgram (audio transcription)](/providers/deepgram).
- Audio providers can override `baseUrl`/`headers` via `tools.media.audio`.
- Audio providers can override `baseUrl`, `headers`, and `providerOptions` via `tools.media.audio`.
- Default size cap is 20MB (`tools.media.audio.maxBytes`). Oversize audio is skipped for that model and the next entry is tried.
- Default `maxChars` for audio is **unset** (full transcript). Set `tools.media.audio.maxChars` or per-entry `maxChars` to trim output.
- Use `tools.media.audio.attachments` to process multiple voice notes (`mode: "all"` + `maxAttachments`).
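Putting the audio overrides above together, a minimal config sketch (the endpoint, header, and option values are illustrative, not defaults):
```json5
{
  tools: {
    media: {
      audio: {
        enabled: true,
        baseUrl: "https://api.example.com",   // illustrative override
        headers: { "X-Team": "voice" },       // illustrative override
        providerOptions: {
          deepgram: { detect_language: true, punctuate: true }
        },
        maxBytes: 20971520,                   // the 20MB default, written out
        maxChars: 8000,                       // unset by default (full transcript)
        models: [{ provider: "deepgram", model: "nova-3" }]
      }
    }
  }
}
```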

View File

@@ -32,8 +32,8 @@ If understanding fails or is disabled, **the reply flow continues** with the ori
- `tools.media.models`: shared model list (use `capabilities` to gate).
- `tools.media.image` / `tools.media.audio` / `tools.media.video`:
- defaults (`prompt`, `maxChars`, `maxBytes`, `timeoutSeconds`, `language`)
- provider overrides (`baseUrl`, `headers`)
- Deepgram audio options (`deepgram` in `tools.media.audio`)
- provider overrides (`baseUrl`, `headers`, `providerOptions`)
- Deepgram audio options via `tools.media.audio.providerOptions.deepgram`
- optional **per-capability `models` list** (tried before the shared models)
- `attachments` policy (`mode`, `maxAttachments`, `prefer`)
- `scope` (optional gating by channel/chatType/session key)
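For orientation, a sketch of how these pieces nest (the shared-list provider/model names are placeholders; `prefer` and `scope` are omitted):
```json5
{
  tools: {
    media: {
      // Shared list, gated per entry via `capabilities`.
      models: [{ provider: "some-provider", model: "example-model", capabilities: ["image", "audio"] }],
      audio: {
        enabled: true,
        language: "en",
        providerOptions: { deepgram: { smart_format: true } },
        // Per-capability list, tried before the shared models above.
        models: [{ provider: "deepgram", model: "nova-3" }],
        attachments: { mode: "all", maxAttachments: 3 }
      }
    }
  }
}
```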

View File

@@ -41,9 +41,9 @@ DEEPGRAM_API_KEY=dg_...
- `model`: Deepgram model id (default: `nova-3`)
- `language`: language hint (optional)
- `tools.media.audio.deepgram.detectLanguage`: enable language detection (optional)
- `tools.media.audio.deepgram.punctuate`: enable punctuation (optional)
- `tools.media.audio.deepgram.smartFormat`: enable smart formatting (optional)
- `tools.media.audio.providerOptions.deepgram.detect_language`: enable language detection (optional)
- `tools.media.audio.providerOptions.deepgram.punctuate`: enable punctuation (optional)
- `tools.media.audio.providerOptions.deepgram.smart_format`: enable smart formatting (optional)
Example with language:
```json5
@@ -68,10 +68,12 @@ Example with Deepgram options:
media: {
audio: {
enabled: true,
deepgram: {
detectLanguage: true,
punctuate: true,
smartFormat: true
providerOptions: {
deepgram: {
detect_language: true,
punctuate: true,
smart_format: true
}
},
models: [{ provider: "deepgram", model: "nova-3" }]
}

View File

@@ -51,7 +51,9 @@ export type MediaUnderstandingModelConfig = {
timeoutSeconds?: number;
/** Optional language hint for audio transcription. */
language?: string;
/** Optional Deepgram transcription options (audio only). */
/** Optional provider-specific query params (merged into requests). */
providerOptions?: Record<string, Record<string, string | number | boolean>>;
/** @deprecated Use providerOptions.deepgram instead. */
deepgram?: {
detectLanguage?: boolean;
punctuate?: boolean;
@@ -82,7 +84,9 @@ export type MediaUnderstandingConfig = {
timeoutSeconds?: number;
/** Default language hint (audio). */
language?: string;
/** Optional Deepgram transcription options (audio only). */
/** Optional provider-specific query params (merged into requests). */
providerOptions?: Record<string, Record<string, string | number | boolean>>;
/** @deprecated Use providerOptions.deepgram instead. */
deepgram?: {
detectLanguage?: boolean;
punctuate?: boolean;

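As a shape sketch of the new field (the option keys are illustrative Deepgram query params):
```ts
// providerOptions maps a provider id to a flat bag of query params.
const providerOptions: Record<string, Record<string, string | number | boolean>> = {
  deepgram: { detect_language: true, punctuate: true, smart_format: true },
};

// The deprecated camelCase form, kept only for backward compatibility:
//   deepgram: { detectLanguage: true }
// migrates to:
//   providerOptions: { deepgram: { detect_language: true } }
```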
View File

@@ -284,6 +284,11 @@ const DeepgramAudioSchema = z
})
.optional();
const ProviderOptionValueSchema = z.union([z.string(), z.number(), z.boolean()]);
const ProviderOptionsSchema = z
.record(z.string(), z.record(z.string(), ProviderOptionValueSchema))
.optional();
export const MediaUnderstandingModelSchema = z
.object({
provider: z.string().optional(),
@@ -297,6 +302,7 @@ export const MediaUnderstandingModelSchema = z
maxBytes: z.number().int().positive().optional(),
timeoutSeconds: z.number().int().positive().optional(),
language: z.string().optional(),
providerOptions: ProviderOptionsSchema,
deepgram: DeepgramAudioSchema,
baseUrl: z.string().optional(),
headers: z.record(z.string(), z.string()).optional(),
@@ -314,6 +320,7 @@ export const ToolsMediaUnderstandingSchema = z
prompt: z.string().optional(),
timeoutSeconds: z.number().int().positive().optional(),
language: z.string().optional(),
providerOptions: ProviderOptionsSchema,
deepgram: DeepgramAudioSchema,
baseUrl: z.string().optional(),
headers: z.record(z.string(), z.string()).optional(),

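A quick standalone check of what `ProviderOptionsSchema` accepts, as a sketch mirroring the definitions above:
```ts
import { z } from "zod";

const ProviderOptionValueSchema = z.union([z.string(), z.number(), z.boolean()]);
const ProviderOptionsSchema = z
  .record(z.string(), z.record(z.string(), ProviderOptionValueSchema))
  .optional();

// Flat string/number/boolean values per provider id pass validation.
ProviderOptionsSchema.parse({ deepgram: { detect_language: true, punctuate: true } });

// Nested objects are rejected: option values must stay flat.
console.log(ProviderOptionsSchema.safeParse({ deepgram: { opts: { nested: true } } }).success); // false
```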
View File

@@ -0,0 +1,112 @@
import fs from "node:fs/promises";
import os from "node:os";
import path from "node:path";
import { describe, expect, it } from "vitest";

import type { ClawdbotConfig } from "../config/config.js";
import type { MsgContext } from "../auto-reply/templating.js";
import {
  buildProviderRegistry,
  createMediaAttachmentCache,
  normalizeMediaAttachments,
  runCapability,
} from "./runner.js";

describe("runCapability deepgram provider options", () => {
  it("merges provider options, headers, and baseUrl overrides", async () => {
    const tmpPath = path.join(os.tmpdir(), `clawdbot-deepgram-${Date.now()}.wav`);
    await fs.writeFile(tmpPath, Buffer.from("RIFF"));
    const ctx: MsgContext = { MediaPath: tmpPath, MediaType: "audio/wav" };
    const media = normalizeMediaAttachments(ctx);
    const cache = createMediaAttachmentCache(media);
    let seenQuery: Record<string, string | number | boolean> | undefined;
    let seenBaseUrl: string | undefined;
    let seenHeaders: Record<string, string> | undefined;
    const providerRegistry = buildProviderRegistry({
      deepgram: {
        id: "deepgram",
        capabilities: ["audio"],
        transcribeAudio: async (req) => {
          seenQuery = req.query;
          seenBaseUrl = req.baseUrl;
          seenHeaders = req.headers;
          return { text: "ok", model: req.model };
        },
      },
    });
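    // The config below layers overrides at three levels: models.providers.deepgram,
    // tools.media.audio, and the per-model entry; the entry-level values should win.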
    const cfg = {
      models: {
        providers: {
          deepgram: {
            baseUrl: "https://provider.example",
            apiKey: "test-key",
            headers: { "X-Provider": "1" },
            models: [],
          },
        },
      },
      tools: {
        media: {
          audio: {
            enabled: true,
            baseUrl: "https://config.example",
            headers: { "X-Config": "2" },
            providerOptions: {
              deepgram: {
                detect_language: true,
                punctuate: true,
              },
            },
            deepgram: { smartFormat: true },
            models: [
              {
                provider: "deepgram",
                model: "nova-3",
                baseUrl: "https://entry.example",
                headers: { "X-Entry": "3" },
                providerOptions: {
                  deepgram: {
                    detectLanguage: false,
                    punctuate: false,
                    smart_format: true,
                  },
                },
              },
            ],
          },
        },
      },
    } as unknown as ClawdbotConfig;

    try {
      const result = await runCapability({
        capability: "audio",
        cfg,
        ctx,
        attachments: cache,
        media,
        providerRegistry,
      });
      expect(result.outputs[0]?.text).toBe("ok");
      expect(seenBaseUrl).toBe("https://entry.example");
      expect(seenHeaders).toMatchObject({
        "X-Provider": "1",
        "X-Config": "2",
        "X-Entry": "3",
      });
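      // Entry-level providerOptions override the config-level values, and the legacy
      // camelCase key (detectLanguage) is normalized to snake_case for Deepgram.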
      expect(seenQuery).toMatchObject({
        detect_language: false,
        punctuate: false,
        smart_format: true,
      });
      expect((seenQuery as Record<string, unknown>)["detectLanguage"]).toBeUndefined();
    } finally {
      await cache.cleanup();
      await fs.unlink(tmpPath).catch(() => {});
    }
  });
});

View File

@@ -71,21 +71,73 @@ function trimOutput(text: string, maxChars?: number): string {
return trimmed.slice(0, maxChars).trim();
}
function buildDeepgramQuery(options?: {
type ProviderQuery = Record<string, string | number | boolean>;
function normalizeProviderQuery(
options?: Record<string, string | number | boolean>,
): ProviderQuery | undefined {
if (!options) return undefined;
const query: ProviderQuery = {};
for (const [key, value] of Object.entries(options)) {
if (value === undefined) continue;
query[key] = value;
}
return Object.keys(query).length > 0 ? query : undefined;
}
function buildDeepgramCompatQuery(options?: {
detectLanguage?: boolean;
punctuate?: boolean;
smartFormat?: boolean;
}): Record<string, string | number | boolean> | undefined {
}): ProviderQuery | undefined {
if (!options) return undefined;
const query: Record<string, string | number | boolean> = {};
if (typeof options.detectLanguage === "boolean") {
query.detect_language = options.detectLanguage;
const query: ProviderQuery = {};
if (typeof options.detectLanguage === "boolean") query.detect_language = options.detectLanguage;
if (typeof options.punctuate === "boolean") query.punctuate = options.punctuate;
if (typeof options.smartFormat === "boolean") query.smart_format = options.smartFormat;
return Object.keys(query).length > 0 ? query : undefined;
}
function mergeProviderQuery(
base: ProviderQuery | undefined,
incoming: ProviderQuery | undefined,
): ProviderQuery | undefined {
if (!base && !incoming) return undefined;
return { ...(base ?? {}), ...(incoming ?? {}) };
}
function normalizeDeepgramQueryKeys(query: ProviderQuery): ProviderQuery {
const normalized = { ...query };
if ("detectLanguage" in normalized) {
normalized.detect_language = normalized.detectLanguage as boolean;
delete normalized.detectLanguage;
}
if (typeof options.punctuate === "boolean") {
query.punctuate = options.punctuate;
if ("smartFormat" in normalized) {
normalized.smart_format = normalized.smartFormat as boolean;
delete normalized.smartFormat;
}
if (typeof options.smartFormat === "boolean") {
query.smart_format = options.smartFormat;
return normalized;
}
function resolveProviderQuery(params: {
providerId: string;
config?: MediaUnderstandingConfig;
entry: MediaUnderstandingModelConfig;
}): ProviderQuery | undefined {
const { providerId, config, entry } = params;
const mergedOptions = normalizeProviderQuery({
...(config?.providerOptions?.[providerId] ?? {}),
...(entry.providerOptions?.[providerId] ?? {}),
});
if (providerId !== "deepgram") {
return mergedOptions;
}
let query = normalizeDeepgramQueryKeys(mergedOptions ?? {});
const compat = buildDeepgramCompatQuery({ ...config?.deepgram, ...entry.deepgram });
for (const [key, value] of Object.entries(compat ?? {})) {
if (query[key] === undefined) {
query[key] = value;
}
}
return Object.keys(query).length > 0 ? query : undefined;
}
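A minimal sketch of the precedence `resolveProviderQuery` implements (the config and entry literals are illustrative):
```ts
// Entry-level options win over capability-level options; legacy camelCase
// `deepgram` flags only fill keys that are still unset after normalization.
const query = resolveProviderQuery({
  providerId: "deepgram",
  config: {
    providerOptions: { deepgram: { detect_language: true } },
    deepgram: { smartFormat: true }, // deprecated compat path
  },
  entry: {
    providerOptions: { deepgram: { detect_language: false, punctuate: true } },
  },
});
// => { detect_language: false, punctuate: true, smart_format: true }
```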
@@ -246,13 +298,11 @@ async function runProviderEntry(params: {
...(entry.headers ?? {}),
};
const headers = Object.keys(mergedHeaders).length > 0 ? mergedHeaders : undefined;
const deepgramQuery =
providerId === "deepgram"
? buildDeepgramQuery({
...params.config?.deepgram,
...entry.deepgram,
})
: undefined;
const providerQuery = resolveProviderQuery({
providerId,
config: params.config,
entry,
});
const model = entry.model?.trim() || DEFAULT_AUDIO_MODELS[providerId] || entry.model;
const result = await provider.transcribeAudio({
buffer: media.buffer,
@@ -264,7 +314,7 @@ async function runProviderEntry(params: {
model,
language: entry.language ?? params.config?.language ?? cfg.tools?.media?.audio?.language,
prompt,
query: deepgramQuery,
query: providerQuery,
timeoutMs,
});
return {