refactor: unify media provider options

Peter Steinberger
2026-01-17 09:12:19 +00:00
parent 89f85ddeab
commit d66bc65ca6
7 changed files with 204 additions and 29 deletions

View File

@@ -80,7 +80,7 @@ read_when:
- Provider auth follows the standard model auth order (auth profiles, env vars, `models.providers.*.apiKey`).
- Deepgram picks up `DEEPGRAM_API_KEY` when `provider: "deepgram"` is used.
- Deepgram setup details: [Deepgram (audio transcription)](/providers/deepgram).
- Audio providers can override `baseUrl`/`headers` via `tools.media.audio`.
- Audio providers can override `baseUrl`, `headers`, and `providerOptions` via `tools.media.audio`.
- Default size cap is 20MB (`tools.media.audio.maxBytes`). Oversize audio is skipped for that model and the next entry is tried.
- Default `maxChars` for audio is **unset** (full transcript). Set `tools.media.audio.maxChars` or per-entry `maxChars` to trim output.
- Use `tools.media.audio.attachments` to process multiple voice notes (`mode: "all"` + `maxAttachments`).
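Putting the audio overrides above together, a minimal config sketch (the endpoint, header, and option values are illustrative, not defaults):
```json5
{
  tools: {
    media: {
      audio: {
        enabled: true,
        baseUrl: "https://api.example.com",   // illustrative override
        headers: { "X-Team": "voice" },       // illustrative override
        providerOptions: {
          deepgram: { detect_language: true, punctuate: true }
        },
        maxBytes: 20971520,                   // the 20MB default, written out
        maxChars: 8000,                       // unset by default (full transcript)
        models: [{ provider: "deepgram", model: "nova-3" }]
      }
    }
  }
}
```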

View File

@@ -32,8 +32,8 @@ If understanding fails or is disabled, **the reply flow continues** with the ori
- `tools.media.models`: shared model list (use `capabilities` to gate).
- `tools.media.image` / `tools.media.audio` / `tools.media.video`:
- defaults (`prompt`, `maxChars`, `maxBytes`, `timeoutSeconds`, `language`)
- provider overrides (`baseUrl`, `headers`)
- Deepgram audio options (`deepgram` in `tools.media.audio`)
- provider overrides (`baseUrl`, `headers`, `providerOptions`)
- Deepgram audio options via `tools.media.audio.providerOptions.deepgram`
- optional **per-capability `models` list** (tried before the shared models)
- `attachments` policy (`mode`, `maxAttachments`, `prefer`)
- `scope` (optional gating by channel/chatType/session key)
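For orientation, a sketch of how these pieces nest (the shared-list provider/model names are placeholders; `prefer` and `scope` are omitted):
```json5
{
  tools: {
    media: {
      // Shared list, gated per entry via `capabilities`.
      models: [{ provider: "some-provider", model: "example-model", capabilities: ["image", "audio"] }],
      audio: {
        enabled: true,
        language: "en",
        providerOptions: { deepgram: { smart_format: true } },
        // Per-capability list, tried before the shared models above.
        models: [{ provider: "deepgram", model: "nova-3" }],
        attachments: { mode: "all", maxAttachments: 3 }
      }
    }
  }
}
```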

View File

@@ -41,9 +41,9 @@ DEEPGRAM_API_KEY=dg_...
- `model`: Deepgram model id (default: `nova-3`)
- `language`: language hint (optional)
- `tools.media.audio.deepgram.detectLanguage`: enable language detection (optional)
- `tools.media.audio.deepgram.punctuate`: enable punctuation (optional)
- `tools.media.audio.deepgram.smartFormat`: enable smart formatting (optional)
- `tools.media.audio.providerOptions.deepgram.detect_language`: enable language detection (optional)
- `tools.media.audio.providerOptions.deepgram.punctuate`: enable punctuation (optional)
- `tools.media.audio.providerOptions.deepgram.smart_format`: enable smart formatting (optional)
Example with language:
```json5
@@ -68,10 +68,12 @@ Example with Deepgram options:
media: {
audio: {
enabled: true,
deepgram: {
detectLanguage: true,
punctuate: true,
smartFormat: true
providerOptions: {
deepgram: {
detect_language: true,
punctuate: true,
smart_format: true
}
},
models: [{ provider: "deepgram", model: "nova-3" }]
}

View File

@@ -51,7 +51,9 @@ export type MediaUnderstandingModelConfig = {
timeoutSeconds?: number;
/** Optional language hint for audio transcription. */
language?: string;
/** Optional Deepgram transcription options (audio only). */
/** Optional provider-specific query params (merged into requests). */
providerOptions?: Record<string, Record<string, string | number | boolean>>;
/** @deprecated Use providerOptions.deepgram instead. */
deepgram?: {
detectLanguage?: boolean;
punctuate?: boolean;
@@ -82,7 +84,9 @@ export type MediaUnderstandingConfig = {
timeoutSeconds?: number;
/** Default language hint (audio). */
language?: string;
/** Optional Deepgram transcription options (audio only). */
/** Optional provider-specific query params (merged into requests). */
providerOptions?: Record<string, Record<string, string | number | boolean>>;
/** @deprecated Use providerOptions.deepgram instead. */
deepgram?: {
detectLanguage?: boolean;
punctuate?: boolean;

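As a shape sketch of the new field (the option keys are illustrative Deepgram query params):
```ts
// providerOptions maps a provider id to a flat bag of query params.
const providerOptions: Record<string, Record<string, string | number | boolean>> = {
  deepgram: { detect_language: true, punctuate: true, smart_format: true },
};

// The deprecated camelCase form, kept only for backward compatibility:
//   deepgram: { detectLanguage: true }
// migrates to:
//   providerOptions: { deepgram: { detect_language: true } }
```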
View File

@@ -284,6 +284,11 @@ const DeepgramAudioSchema = z
})
.optional();
const ProviderOptionValueSchema = z.union([z.string(), z.number(), z.boolean()]);
const ProviderOptionsSchema = z
.record(z.string(), z.record(z.string(), ProviderOptionValueSchema))
.optional();
export const MediaUnderstandingModelSchema = z
.object({
provider: z.string().optional(),
@@ -297,6 +302,7 @@ export const MediaUnderstandingModelSchema = z
maxBytes: z.number().int().positive().optional(),
timeoutSeconds: z.number().int().positive().optional(),
language: z.string().optional(),
providerOptions: ProviderOptionsSchema,
deepgram: DeepgramAudioSchema,
baseUrl: z.string().optional(),
headers: z.record(z.string(), z.string()).optional(),
@@ -314,6 +320,7 @@ export const ToolsMediaUnderstandingSchema = z
prompt: z.string().optional(),
timeoutSeconds: z.number().int().positive().optional(),
language: z.string().optional(),
providerOptions: ProviderOptionsSchema,
deepgram: DeepgramAudioSchema,
baseUrl: z.string().optional(),
headers: z.record(z.string(), z.string()).optional(),

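A quick standalone check of what `ProviderOptionsSchema` accepts, as a sketch mirroring the definitions above:
```ts
import { z } from "zod";

const ProviderOptionValueSchema = z.union([z.string(), z.number(), z.boolean()]);
const ProviderOptionsSchema = z
  .record(z.string(), z.record(z.string(), ProviderOptionValueSchema))
  .optional();

// Flat string/number/boolean values per provider id pass validation.
ProviderOptionsSchema.parse({ deepgram: { detect_language: true, punctuate: true } });

// Nested objects are rejected: option values must stay flat.
console.log(ProviderOptionsSchema.safeParse({ deepgram: { opts: { nested: true } } }).success); // false
```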
View File

@@ -0,0 +1,112 @@
import fs from "node:fs/promises";
import os from "node:os";
import path from "node:path";
import { describe, expect, it } from "vitest";

import type { ClawdbotConfig } from "../config/config.js";
import type { MsgContext } from "../auto-reply/templating.js";
import {
  buildProviderRegistry,
  createMediaAttachmentCache,
  normalizeMediaAttachments,
  runCapability,
} from "./runner.js";

describe("runCapability deepgram provider options", () => {
  it("merges provider options, headers, and baseUrl overrides", async () => {
    const tmpPath = path.join(os.tmpdir(), `clawdbot-deepgram-${Date.now()}.wav`);
    await fs.writeFile(tmpPath, Buffer.from("RIFF"));
    const ctx: MsgContext = { MediaPath: tmpPath, MediaType: "audio/wav" };
    const media = normalizeMediaAttachments(ctx);
    const cache = createMediaAttachmentCache(media);
    let seenQuery: Record<string, string | number | boolean> | undefined;
    let seenBaseUrl: string | undefined;
    let seenHeaders: Record<string, string> | undefined;
    const providerRegistry = buildProviderRegistry({
      deepgram: {
        id: "deepgram",
        capabilities: ["audio"],
        transcribeAudio: async (req) => {
          seenQuery = req.query;
          seenBaseUrl = req.baseUrl;
          seenHeaders = req.headers;
          return { text: "ok", model: req.model };
        },
      },
    });
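    // The config below layers overrides at three levels: models.providers.deepgram,
    // tools.media.audio, and the per-model entry; the entry-level values should win.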
    const cfg = {
      models: {
        providers: {
          deepgram: {
            baseUrl: "https://provider.example",
            apiKey: "test-key",
            headers: { "X-Provider": "1" },
            models: [],
          },
        },
      },
      tools: {
        media: {
          audio: {
            enabled: true,
            baseUrl: "https://config.example",
            headers: { "X-Config": "2" },
            providerOptions: {
              deepgram: {
                detect_language: true,
                punctuate: true,
              },
            },
            deepgram: { smartFormat: true },
            models: [
              {
                provider: "deepgram",
                model: "nova-3",
                baseUrl: "https://entry.example",
                headers: { "X-Entry": "3" },
                providerOptions: {
                  deepgram: {
                    detectLanguage: false,
                    punctuate: false,
                    smart_format: true,
                  },
                },
              },
            ],
          },
        },
      },
    } as unknown as ClawdbotConfig;

    try {
      const result = await runCapability({
        capability: "audio",
        cfg,
        ctx,
        attachments: cache,
        media,
        providerRegistry,
      });
      expect(result.outputs[0]?.text).toBe("ok");
      expect(seenBaseUrl).toBe("https://entry.example");
      expect(seenHeaders).toMatchObject({
        "X-Provider": "1",
        "X-Config": "2",
        "X-Entry": "3",
      });
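      // Entry-level providerOptions override the config-level values, and the legacy
      // camelCase key (detectLanguage) is normalized to snake_case for Deepgram.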
      expect(seenQuery).toMatchObject({
        detect_language: false,
        punctuate: false,
        smart_format: true,
      });
      expect((seenQuery as Record<string, unknown>)["detectLanguage"]).toBeUndefined();
    } finally {
      await cache.cleanup();
      await fs.unlink(tmpPath).catch(() => {});
    }
  });
});

View File

@@ -71,21 +71,73 @@ function trimOutput(text: string, maxChars?: number): string {
return trimmed.slice(0, maxChars).trim();
}
function buildDeepgramQuery(options?: {
type ProviderQuery = Record<string, string | number | boolean>;
function normalizeProviderQuery(
options?: Record<string, string | number | boolean>,
): ProviderQuery | undefined {
if (!options) return undefined;
const query: ProviderQuery = {};
for (const [key, value] of Object.entries(options)) {
if (value === undefined) continue;
query[key] = value;
}
return Object.keys(query).length > 0 ? query : undefined;
}
function buildDeepgramCompatQuery(options?: {
detectLanguage?: boolean;
punctuate?: boolean;
smartFormat?: boolean;
}): Record<string, string | number | boolean> | undefined {
}): ProviderQuery | undefined {
if (!options) return undefined;
const query: Record<string, string | number | boolean> = {};
if (typeof options.detectLanguage === "boolean") {
query.detect_language = options.detectLanguage;
const query: ProviderQuery = {};
if (typeof options.detectLanguage === "boolean") query.detect_language = options.detectLanguage;
if (typeof options.punctuate === "boolean") query.punctuate = options.punctuate;
if (typeof options.smartFormat === "boolean") query.smart_format = options.smartFormat;
return Object.keys(query).length > 0 ? query : undefined;
}
function mergeProviderQuery(
base: ProviderQuery | undefined,
incoming: ProviderQuery | undefined,
): ProviderQuery | undefined {
if (!base && !incoming) return undefined;
return { ...(base ?? {}), ...(incoming ?? {}) };
}
function normalizeDeepgramQueryKeys(query: ProviderQuery): ProviderQuery {
const normalized = { ...query };
if ("detectLanguage" in normalized) {
normalized.detect_language = normalized.detectLanguage as boolean;
delete normalized.detectLanguage;
}
if (typeof options.punctuate === "boolean") {
query.punctuate = options.punctuate;
if ("smartFormat" in normalized) {
normalized.smart_format = normalized.smartFormat as boolean;
delete normalized.smartFormat;
}
if (typeof options.smartFormat === "boolean") {
query.smart_format = options.smartFormat;
return normalized;
}
function resolveProviderQuery(params: {
providerId: string;
config?: MediaUnderstandingConfig;
entry: MediaUnderstandingModelConfig;
}): ProviderQuery | undefined {
const { providerId, config, entry } = params;
const mergedOptions = normalizeProviderQuery({
...(config?.providerOptions?.[providerId] ?? {}),
...(entry.providerOptions?.[providerId] ?? {}),
});
if (providerId !== "deepgram") {
return mergedOptions;
}
let query = normalizeDeepgramQueryKeys(mergedOptions ?? {});
const compat = buildDeepgramCompatQuery({ ...config?.deepgram, ...entry.deepgram });
for (const [key, value] of Object.entries(compat ?? {})) {
if (query[key] === undefined) {
query[key] = value;
}
}
return Object.keys(query).length > 0 ? query : undefined;
}
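A minimal sketch of the precedence `resolveProviderQuery` implements (the config and entry literals are illustrative):
```ts
// Entry-level options win over capability-level options; legacy camelCase
// `deepgram` flags only fill keys that are still unset after normalization.
const query = resolveProviderQuery({
  providerId: "deepgram",
  config: {
    providerOptions: { deepgram: { detect_language: true } },
    deepgram: { smartFormat: true }, // deprecated compat path
  },
  entry: {
    providerOptions: { deepgram: { detect_language: false, punctuate: true } },
  },
});
// => { detect_language: false, punctuate: true, smart_format: true }
```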
@@ -246,13 +298,11 @@ async function runProviderEntry(params: {
...(entry.headers ?? {}),
};
const headers = Object.keys(mergedHeaders).length > 0 ? mergedHeaders : undefined;
const deepgramQuery =
providerId === "deepgram"
? buildDeepgramQuery({
...params.config?.deepgram,
...entry.deepgram,
})
: undefined;
const providerQuery = resolveProviderQuery({
providerId,
config: params.config,
entry,
});
const model = entry.model?.trim() || DEFAULT_AUDIO_MODELS[providerId] || entry.model;
const result = await provider.transcribeAudio({
buffer: media.buffer,
@@ -264,7 +314,7 @@ async function runProviderEntry(params: {
model,
language: entry.language ?? params.config?.language ?? cfg.tools?.media?.audio?.language,
prompt,
query: deepgramQuery,
query: providerQuery,
timeoutMs,
});
return {