refactor: unify media provider options
This commit is contained in:
@@ -80,7 +80,7 @@ read_when:
|
||||
- Provider auth follows the standard model auth order (auth profiles, env vars, `models.providers.*.apiKey`).
|
||||
- Deepgram picks up `DEEPGRAM_API_KEY` when `provider: "deepgram"` is used.
|
||||
- Deepgram setup details: [Deepgram (audio transcription)](/providers/deepgram).
|
||||
- Audio providers can override `baseUrl`/`headers` via `tools.media.audio`.
|
||||
- Audio providers can override `baseUrl`, `headers`, and `providerOptions` via `tools.media.audio`.
|
||||
- Default size cap is 20MB (`tools.media.audio.maxBytes`). Oversize audio is skipped for that model and the next entry is tried.
|
||||
- Default `maxChars` for audio is **unset** (full transcript). Set `tools.media.audio.maxChars` or per-entry `maxChars` to trim output.
|
||||
- Use `tools.media.audio.attachments` to process multiple voice notes (`mode: "all"` + `maxAttachments`).
|
||||
|
||||
@@ -32,8 +32,8 @@ If understanding fails or is disabled, **the reply flow continues** with the ori
|
||||
- `tools.media.models`: shared model list (use `capabilities` to gate).
|
||||
- `tools.media.image` / `tools.media.audio` / `tools.media.video`:
|
||||
- defaults (`prompt`, `maxChars`, `maxBytes`, `timeoutSeconds`, `language`)
|
||||
- provider overrides (`baseUrl`, `headers`)
|
||||
- Deepgram audio options (`deepgram` in `tools.media.audio`)
|
||||
- provider overrides (`baseUrl`, `headers`, `providerOptions`)
|
||||
- Deepgram audio options via `tools.media.audio.providerOptions.deepgram`
|
||||
- optional **per‑capability `models` list** (preferred before shared models)
|
||||
- `attachments` policy (`mode`, `maxAttachments`, `prefer`)
|
||||
- `scope` (optional gating by channel/chatType/session key)
|
||||
|
||||
@@ -41,9 +41,9 @@ DEEPGRAM_API_KEY=dg_...
|
||||
|
||||
- `model`: Deepgram model id (default: `nova-3`)
|
||||
- `language`: language hint (optional)
|
||||
- `tools.media.audio.deepgram.detectLanguage`: enable language detection (optional)
|
||||
- `tools.media.audio.deepgram.punctuate`: enable punctuation (optional)
|
||||
- `tools.media.audio.deepgram.smartFormat`: enable smart formatting (optional)
|
||||
- `tools.media.audio.providerOptions.deepgram.detect_language`: enable language detection (optional)
|
||||
- `tools.media.audio.providerOptions.deepgram.punctuate`: enable punctuation (optional)
|
||||
- `tools.media.audio.providerOptions.deepgram.smart_format`: enable smart formatting (optional)
|
||||
|
||||
Example with language:
|
||||
```json5
|
||||
@@ -68,10 +68,12 @@ Example with Deepgram options:
|
||||
media: {
|
||||
audio: {
|
||||
enabled: true,
|
||||
deepgram: {
|
||||
detectLanguage: true,
|
||||
punctuate: true,
|
||||
smartFormat: true
|
||||
providerOptions: {
|
||||
deepgram: {
|
||||
detect_language: true,
|
||||
punctuate: true,
|
||||
smart_format: true
|
||||
}
|
||||
},
|
||||
models: [{ provider: "deepgram", model: "nova-3" }]
|
||||
}
|
||||
|
||||
@@ -51,7 +51,9 @@ export type MediaUnderstandingModelConfig = {
|
||||
timeoutSeconds?: number;
|
||||
/** Optional language hint for audio transcription. */
|
||||
language?: string;
|
||||
/** Optional Deepgram transcription options (audio only). */
|
||||
/** Optional provider-specific query params (merged into requests). */
|
||||
providerOptions?: Record<string, Record<string, string | number | boolean>>;
|
||||
/** @deprecated Use providerOptions.deepgram instead. */
|
||||
deepgram?: {
|
||||
detectLanguage?: boolean;
|
||||
punctuate?: boolean;
|
||||
@@ -82,7 +84,9 @@ export type MediaUnderstandingConfig = {
|
||||
timeoutSeconds?: number;
|
||||
/** Default language hint (audio). */
|
||||
language?: string;
|
||||
/** Optional Deepgram transcription options (audio only). */
|
||||
/** Optional provider-specific query params (merged into requests). */
|
||||
providerOptions?: Record<string, Record<string, string | number | boolean>>;
|
||||
/** @deprecated Use providerOptions.deepgram instead. */
|
||||
deepgram?: {
|
||||
detectLanguage?: boolean;
|
||||
punctuate?: boolean;
|
||||
|
||||
@@ -284,6 +284,11 @@ const DeepgramAudioSchema = z
|
||||
})
|
||||
.optional();
|
||||
|
||||
/** A single provider option value: a string, number, or boolean. */
const ProviderOptionValueSchema = z.union([z.string(), z.number(), z.boolean()]);
|
||||
/**
 * Optional two-level record: outer string key -> inner string key -> scalar option value.
 * Used for the `providerOptions` field of the media-understanding schemas.
 */
const ProviderOptionsSchema = z
|
||||
.record(z.string(), z.record(z.string(), ProviderOptionValueSchema))
|
||||
.optional();
|
||||
|
||||
export const MediaUnderstandingModelSchema = z
|
||||
.object({
|
||||
provider: z.string().optional(),
|
||||
@@ -297,6 +302,7 @@ export const MediaUnderstandingModelSchema = z
|
||||
maxBytes: z.number().int().positive().optional(),
|
||||
timeoutSeconds: z.number().int().positive().optional(),
|
||||
language: z.string().optional(),
|
||||
providerOptions: ProviderOptionsSchema,
|
||||
deepgram: DeepgramAudioSchema,
|
||||
baseUrl: z.string().optional(),
|
||||
headers: z.record(z.string(), z.string()).optional(),
|
||||
@@ -314,6 +320,7 @@ export const ToolsMediaUnderstandingSchema = z
|
||||
prompt: z.string().optional(),
|
||||
timeoutSeconds: z.number().int().positive().optional(),
|
||||
language: z.string().optional(),
|
||||
providerOptions: ProviderOptionsSchema,
|
||||
deepgram: DeepgramAudioSchema,
|
||||
baseUrl: z.string().optional(),
|
||||
headers: z.record(z.string(), z.string()).optional(),
|
||||
|
||||
112
src/media-understanding/runner.deepgram.test.ts
Normal file
112
src/media-understanding/runner.deepgram.test.ts
Normal file
@@ -0,0 +1,112 @@
|
||||
import fs from "node:fs/promises";
|
||||
import os from "node:os";
|
||||
import path from "node:path";
|
||||
|
||||
import { describe, expect, it } from "vitest";
|
||||
|
||||
import type { ClawdbotConfig } from "../config/config.js";
|
||||
import type { MsgContext } from "../auto-reply/templating.js";
|
||||
import {
|
||||
buildProviderRegistry,
|
||||
createMediaAttachmentCache,
|
||||
normalizeMediaAttachments,
|
||||
runCapability,
|
||||
} from "./runner.js";
|
||||
|
||||
describe("runCapability deepgram provider options", () => {
  it("merges provider options, headers, and baseUrl overrides", async () => {
    // Use a private temp directory so parallel test workers can never collide on
    // the fixture path (a Date.now()-based file name is not unique).
    const tmpDir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-deepgram-"));
    const tmpPath = path.join(tmpDir, "sample.wav");
    await fs.writeFile(tmpPath, Buffer.from("RIFF"));
    const ctx: MsgContext = { MediaPath: tmpPath, MediaType: "audio/wav" };
    const media = normalizeMediaAttachments(ctx);
    const cache = createMediaAttachmentCache(media);

    // Captured from the stub provider so the merged request can be asserted on.
    let seenQuery: Record<string, string | number | boolean> | undefined;
    let seenBaseUrl: string | undefined;
    let seenHeaders: Record<string, string> | undefined;

    const providerRegistry = buildProviderRegistry({
      deepgram: {
        id: "deepgram",
        capabilities: ["audio"],
        transcribeAudio: async (req) => {
          seenQuery = req.query;
          seenBaseUrl = req.baseUrl;
          seenHeaders = req.headers;
          return { text: "ok", model: req.model };
        },
      },
    });

    // Three override levels: models.providers.deepgram, tools.media.audio, and the
    // individual model entry. The legacy `deepgram` block is set alongside
    // `providerOptions` to exercise the compatibility path.
    const cfg = {
      models: {
        providers: {
          deepgram: {
            baseUrl: "https://provider.example",
            apiKey: "test-key",
            headers: { "X-Provider": "1" },
            models: [],
          },
        },
      },
      tools: {
        media: {
          audio: {
            enabled: true,
            baseUrl: "https://config.example",
            headers: { "X-Config": "2" },
            providerOptions: {
              deepgram: {
                detect_language: true,
                punctuate: true,
              },
            },
            deepgram: { smartFormat: true },
            models: [
              {
                provider: "deepgram",
                model: "nova-3",
                baseUrl: "https://entry.example",
                headers: { "X-Entry": "3" },
                providerOptions: {
                  deepgram: {
                    detectLanguage: false,
                    punctuate: false,
                    smart_format: true,
                  },
                },
              },
            ],
          },
        },
      },
    } as unknown as ClawdbotConfig;

    try {
      const result = await runCapability({
        capability: "audio",
        cfg,
        ctx,
        attachments: cache,
        media,
        providerRegistry,
      });
      expect(result.outputs[0]?.text).toBe("ok");
      // The entry-level baseUrl is the one the provider receives.
      expect(seenBaseUrl).toBe("https://entry.example");
      // Headers from all three levels are present.
      expect(seenHeaders).toMatchObject({
        "X-Provider": "1",
        "X-Config": "2",
        "X-Entry": "3",
      });
      // Entry-level options override config-level ones; camelCase aliases are
      // delivered under their snake_case names.
      expect(seenQuery).toMatchObject({
        detect_language: false,
        punctuate: false,
        smart_format: true,
      });
      expect((seenQuery as Record<string, unknown>)["detectLanguage"]).toBeUndefined();
    } finally {
      try {
        await cache.cleanup();
      } finally {
        // Always remove the fixture directory, even if cache cleanup rejects.
        await fs.rm(tmpDir, { recursive: true, force: true });
      }
    }
  });
});
|
||||
@@ -71,21 +71,73 @@ function trimOutput(text: string, maxChars?: number): string {
|
||||
return trimmed.slice(0, maxChars).trim();
|
||||
}
|
||||
|
||||
function buildDeepgramQuery(options?: {
|
||||
/** Flat map of provider query parameters (string, number, or boolean values). */
type ProviderQuery = Record<string, string | number | boolean>;

/**
 * Copies `options` into a fresh query object, dropping unset entries.
 *
 * The input type admits `undefined`/`null` values so that the guard below is
 * type-checked and callers may pass objects with optional fields directly.
 *
 * @param options - Raw provider options; may be omitted.
 * @returns A new object holding every entry with a set value, or `undefined`
 *   when `options` is missing or no entry remains.
 */
function normalizeProviderQuery(
  options?: Record<string, string | number | boolean | null | undefined>,
): ProviderQuery | undefined {
  if (!options) return undefined;
  const query: ProviderQuery = {};
  for (const [key, value] of Object.entries(options)) {
    // `== null` skips both undefined and null so neither reaches the query.
    if (value == null) continue;
    query[key] = value;
  }
  return Object.keys(query).length > 0 ? query : undefined;
}
|
||||
|
||||
/**
 * Translates the camelCase Deepgram option block (`detectLanguage`, `punctuate`,
 * `smartFormat`) into query keys (`detect_language`, `punctuate`, `smart_format`).
 *
 * Only fields whose value is a boolean are copied.
 *
 * @param options - Deepgram option block; may be omitted.
 * @returns The translated query, or `undefined` when `options` is missing or no
 *   boolean field was set.
 */
function buildDeepgramCompatQuery(options?: {
|
||||
detectLanguage?: boolean;
|
||||
punctuate?: boolean;
|
||||
smartFormat?: boolean;
|
||||
}): Record<string, string | number | boolean> | undefined {
|
||||
}): ProviderQuery | undefined {
|
||||
if (!options) return undefined;
|
||||
const query: Record<string, string | number | boolean> = {};
|
||||
if (typeof options.detectLanguage === "boolean") {
|
||||
query.detect_language = options.detectLanguage;
|
||||
const query: ProviderQuery = {};
|
||||
if (typeof options.detectLanguage === "boolean") query.detect_language = options.detectLanguage;
|
||||
if (typeof options.punctuate === "boolean") query.punctuate = options.punctuate;
|
||||
if (typeof options.smartFormat === "boolean") query.smart_format = options.smartFormat;
|
||||
// Collapse an empty result to undefined so callers can tell "nothing set" apart.
|
||||
return Object.keys(query).length > 0 ? query : undefined;
|
||||
}
|
||||
|
||||
/**
 * Shallow-merges two provider query maps into a new object.
 *
 * Keys present in `incoming` win over the same keys in `base`. Neither input
 * is mutated.
 *
 * @param base - Lower-precedence query; may be `undefined`.
 * @param incoming - Higher-precedence query; may be `undefined`.
 * @returns The merged query, or `undefined` when the result has no keys. This
 *   matches the other query helpers in this file, which also collapse an empty
 *   query to `undefined`.
 */
function mergeProviderQuery(
  base: ProviderQuery | undefined,
  incoming: ProviderQuery | undefined,
): ProviderQuery | undefined {
  // Spreading `undefined` contributes no keys, so no explicit fallbacks are needed.
  const merged: ProviderQuery = { ...base, ...incoming };
  return Object.keys(merged).length > 0 ? merged : undefined;
}
|
||||
|
||||
/**
 * Returns a shallow copy of `query` in which the camelCase keys `detectLanguage`
 * and `smartFormat` are renamed to `detect_language` and `smart_format`.
 *
 * The input object is not mutated. When both spellings of a key are present,
 * the camelCase value overwrites the snake_case one.
 *
 * @param query - Provider query that may contain camelCase aliases.
 * @returns A new query object without the camelCase alias keys.
 */
function normalizeDeepgramQueryKeys(query: ProviderQuery): ProviderQuery {
|
||||
const normalized = { ...query };
|
||||
if ("detectLanguage" in normalized) {
|
||||
// `as boolean` is a compile-time assertion only; the value is copied unchanged.
|
||||
normalized.detect_language = normalized.detectLanguage as boolean;
|
||||
delete normalized.detectLanguage;
|
||||
}
|
||||
if (typeof options.punctuate === "boolean") {
|
||||
query.punctuate = options.punctuate;
|
||||
if ("smartFormat" in normalized) {
|
||||
// `as boolean` is a compile-time assertion only; the value is copied unchanged.
|
||||
normalized.smart_format = normalized.smartFormat as boolean;
|
||||
delete normalized.smartFormat;
|
||||
}
|
||||
if (typeof options.smartFormat === "boolean") {
|
||||
query.smart_format = options.smartFormat;
|
||||
return normalized;
|
||||
}
|
||||
|
||||
/**
 * Builds the query parameters for one provider call from the capability-level
 * config and the per-model entry.
 *
 * Options found under `providerOptions[providerId]` are merged with the entry
 * taking precedence over the config.
 *
 * For `"deepgram"`:
 * - Each level's keys are normalized (camelCase aliases -> snake_case) BEFORE
 *   the levels are merged, so an entry-level value always overrides the
 *   config-level one regardless of which spelling either level used.
 * - The deprecated `deepgram` option blocks only fill keys that
 *   `providerOptions` did not already set.
 *
 * @param params.providerId - Provider id used to select the options bucket.
 * @param params.config - Capability-level config; may be omitted.
 * @param params.entry - Model entry being run.
 * @returns The resolved query, or `undefined` when no option is set.
 */
function resolveProviderQuery(params: {
  providerId: string;
  config?: MediaUnderstandingConfig;
  entry: MediaUnderstandingModelConfig;
}): ProviderQuery | undefined {
  const { providerId, config, entry } = params;
  const configOptions = config?.providerOptions?.[providerId];
  const entryOptions = entry.providerOptions?.[providerId];

  if (providerId !== "deepgram") {
    return normalizeProviderQuery({ ...configOptions, ...entryOptions });
  }

  // Drop unset values and canonicalize key spelling per level, so precedence is
  // decided by level (entry over config) rather than by key style.
  const normalizeLevel = (options: ProviderQuery | undefined): ProviderQuery =>
    normalizeDeepgramQueryKeys(normalizeProviderQuery(options) ?? {});
  const query: ProviderQuery = {
    ...normalizeLevel(configOptions),
    ...normalizeLevel(entryOptions),
  };

  // Legacy `deepgram` blocks act as fallbacks only; they never override providerOptions.
  const compat = buildDeepgramCompatQuery({ ...config?.deepgram, ...entry.deepgram });
  for (const [key, value] of Object.entries(compat ?? {})) {
    if (query[key] === undefined) {
      query[key] = value;
    }
  }
  return Object.keys(query).length > 0 ? query : undefined;
}
|
||||
@@ -246,13 +298,11 @@ async function runProviderEntry(params: {
|
||||
...(entry.headers ?? {}),
|
||||
};
|
||||
const headers = Object.keys(mergedHeaders).length > 0 ? mergedHeaders : undefined;
|
||||
const deepgramQuery =
|
||||
providerId === "deepgram"
|
||||
? buildDeepgramQuery({
|
||||
...params.config?.deepgram,
|
||||
...entry.deepgram,
|
||||
})
|
||||
: undefined;
|
||||
const providerQuery = resolveProviderQuery({
|
||||
providerId,
|
||||
config: params.config,
|
||||
entry,
|
||||
});
|
||||
const model = entry.model?.trim() || DEFAULT_AUDIO_MODELS[providerId] || entry.model;
|
||||
const result = await provider.transcribeAudio({
|
||||
buffer: media.buffer,
|
||||
@@ -264,7 +314,7 @@ async function runProviderEntry(params: {
|
||||
model,
|
||||
language: entry.language ?? params.config?.language ?? cfg.tools?.media?.audio?.language,
|
||||
prompt,
|
||||
query: deepgramQuery,
|
||||
query: providerQuery,
|
||||
timeoutMs,
|
||||
});
|
||||
return {
|
||||
|
||||
Reference in New Issue
Block a user