feat: add deepgram audio options
This commit is contained in:
@@ -80,6 +80,7 @@ read_when:
|
|||||||
- Provider auth follows the standard model auth order (auth profiles, env vars, `models.providers.*.apiKey`).
|
- Provider auth follows the standard model auth order (auth profiles, env vars, `models.providers.*.apiKey`).
|
||||||
- Deepgram picks up `DEEPGRAM_API_KEY` when `provider: "deepgram"` is used.
|
- Deepgram picks up `DEEPGRAM_API_KEY` when `provider: "deepgram"` is used.
|
||||||
- Deepgram setup details: [Deepgram (audio transcription)](/providers/deepgram).
|
- Deepgram setup details: [Deepgram (audio transcription)](/providers/deepgram).
|
||||||
|
- Audio providers can override `baseUrl`/`headers` via `tools.media.audio`.
|
||||||
- Default size cap is 20MB (`tools.media.audio.maxBytes`). Oversize audio is skipped for that model and the next entry is tried.
|
- Default size cap is 20MB (`tools.media.audio.maxBytes`). Oversize audio is skipped for that model and the next entry is tried.
|
||||||
- Default `maxChars` for audio is **unset** (full transcript). Set `tools.media.audio.maxChars` or per-entry `maxChars` to trim output.
|
- Default `maxChars` for audio is **unset** (full transcript). Set `tools.media.audio.maxChars` or per-entry `maxChars` to trim output.
|
||||||
- Use `tools.media.audio.attachments` to process multiple voice notes (`mode: "all"` + `maxAttachments`).
|
- Use `tools.media.audio.attachments` to process multiple voice notes (`mode: "all"` + `maxAttachments`).
|
||||||
|
|||||||
@@ -32,6 +32,8 @@ If understanding fails or is disabled, **the reply flow continues** with the ori
|
|||||||
- `tools.media.models`: shared model list (use `capabilities` to gate).
|
- `tools.media.models`: shared model list (use `capabilities` to gate).
|
||||||
- `tools.media.image` / `tools.media.audio` / `tools.media.video`:
|
- `tools.media.image` / `tools.media.audio` / `tools.media.video`:
|
||||||
- defaults (`prompt`, `maxChars`, `maxBytes`, `timeoutSeconds`, `language`)
|
- defaults (`prompt`, `maxChars`, `maxBytes`, `timeoutSeconds`, `language`)
|
||||||
|
- provider overrides (`baseUrl`, `headers`)
|
||||||
|
- Deepgram audio options (`deepgram` in `tools.media.audio`)
|
||||||
- optional **per‑capability `models` list** (preferred before shared models)
|
- optional **per‑capability `models` list** (preferred before shared models)
|
||||||
- `attachments` policy (`mode`, `maxAttachments`, `prefer`)
|
- `attachments` policy (`mode`, `maxAttachments`, `prefer`)
|
||||||
- `scope` (optional gating by channel/chatType/session key)
|
- `scope` (optional gating by channel/chatType/session key)
|
||||||
|
|||||||
@@ -41,6 +41,9 @@ DEEPGRAM_API_KEY=dg_...
|
|||||||
|
|
||||||
- `model`: Deepgram model id (default: `nova-3`)
|
- `model`: Deepgram model id (default: `nova-3`)
|
||||||
- `language`: language hint (optional)
|
- `language`: language hint (optional)
|
||||||
|
- `tools.media.audio.deepgram.detectLanguage`: enable language detection (optional)
|
||||||
|
- `tools.media.audio.deepgram.punctuate`: enable punctuation (optional)
|
||||||
|
- `tools.media.audio.deepgram.smartFormat`: enable smart formatting (optional)
|
||||||
|
|
||||||
Example with language:
|
Example with language:
|
||||||
```json5
|
```json5
|
||||||
@@ -58,7 +61,27 @@ Example with language:
|
|||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
|
Example with Deepgram options:
|
||||||
|
```json5
|
||||||
|
{
|
||||||
|
tools: {
|
||||||
|
media: {
|
||||||
|
audio: {
|
||||||
|
enabled: true,
|
||||||
|
deepgram: {
|
||||||
|
detectLanguage: true,
|
||||||
|
punctuate: true,
|
||||||
|
smartFormat: true
|
||||||
|
},
|
||||||
|
models: [{ provider: "deepgram", model: "nova-3" }]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
## Notes
|
## Notes
|
||||||
|
|
||||||
- Authentication follows the standard provider auth order; `DEEPGRAM_API_KEY` is the simplest path.
|
- Authentication follows the standard provider auth order; `DEEPGRAM_API_KEY` is the simplest path.
|
||||||
|
- Override endpoints or headers with `tools.media.audio.baseUrl` and `tools.media.audio.headers` when using a proxy.
|
||||||
- Output follows the same audio rules as other providers (size caps, timeouts, transcript injection).
|
- Output follows the same audio rules as other providers (size caps, timeouts, transcript injection).
|
||||||
|
|||||||
@@ -51,6 +51,16 @@ export type MediaUnderstandingModelConfig = {
|
|||||||
timeoutSeconds?: number;
|
timeoutSeconds?: number;
|
||||||
/** Optional language hint for audio transcription. */
|
/** Optional language hint for audio transcription. */
|
||||||
language?: string;
|
language?: string;
|
||||||
|
/** Optional Deepgram transcription options (audio only). */
|
||||||
|
deepgram?: {
|
||||||
|
detectLanguage?: boolean;
|
||||||
|
punctuate?: boolean;
|
||||||
|
smartFormat?: boolean;
|
||||||
|
};
|
||||||
|
/** Optional base URL override for provider requests. */
|
||||||
|
baseUrl?: string;
|
||||||
|
/** Optional headers merged into provider requests. */
|
||||||
|
headers?: Record<string, string>;
|
||||||
/** Auth profile id to use for this provider. */
|
/** Auth profile id to use for this provider. */
|
||||||
profile?: string;
|
profile?: string;
|
||||||
/** Preferred profile id if multiple are available. */
|
/** Preferred profile id if multiple are available. */
|
||||||
@@ -72,6 +82,16 @@ export type MediaUnderstandingConfig = {
|
|||||||
timeoutSeconds?: number;
|
timeoutSeconds?: number;
|
||||||
/** Default language hint (audio). */
|
/** Default language hint (audio). */
|
||||||
language?: string;
|
language?: string;
|
||||||
|
/** Optional Deepgram transcription options (audio only). */
|
||||||
|
deepgram?: {
|
||||||
|
detectLanguage?: boolean;
|
||||||
|
punctuate?: boolean;
|
||||||
|
smartFormat?: boolean;
|
||||||
|
};
|
||||||
|
/** Optional base URL override for provider requests. */
|
||||||
|
baseUrl?: string;
|
||||||
|
/** Optional headers merged into provider requests. */
|
||||||
|
headers?: Record<string, string>;
|
||||||
/** Attachment selection policy. */
|
/** Attachment selection policy. */
|
||||||
attachments?: MediaUnderstandingAttachmentsConfig;
|
attachments?: MediaUnderstandingAttachmentsConfig;
|
||||||
/** Ordered model list (fallbacks in order). */
|
/** Ordered model list (fallbacks in order). */
|
||||||
|
|||||||
@@ -276,6 +276,14 @@ export const MediaUnderstandingAttachmentsSchema = z
|
|||||||
})
|
})
|
||||||
.optional();
|
.optional();
|
||||||
|
|
||||||
|
/**
 * Schema for the optional `deepgram` options object.
 *
 * The object itself may be omitted, and each of its three fields
 * (`detectLanguage`, `punctuate`, `smartFormat`) is an optional boolean.
 */
const DeepgramAudioSchema = z.optional(
  z.object({
    detectLanguage: z.optional(z.boolean()),
    punctuate: z.optional(z.boolean()),
    smartFormat: z.optional(z.boolean()),
  }),
);
|
||||||
|
|
||||||
export const MediaUnderstandingModelSchema = z
|
export const MediaUnderstandingModelSchema = z
|
||||||
.object({
|
.object({
|
||||||
provider: z.string().optional(),
|
provider: z.string().optional(),
|
||||||
@@ -289,6 +297,9 @@ export const MediaUnderstandingModelSchema = z
|
|||||||
maxBytes: z.number().int().positive().optional(),
|
maxBytes: z.number().int().positive().optional(),
|
||||||
timeoutSeconds: z.number().int().positive().optional(),
|
timeoutSeconds: z.number().int().positive().optional(),
|
||||||
language: z.string().optional(),
|
language: z.string().optional(),
|
||||||
|
deepgram: DeepgramAudioSchema,
|
||||||
|
baseUrl: z.string().optional(),
|
||||||
|
headers: z.record(z.string(), z.string()).optional(),
|
||||||
profile: z.string().optional(),
|
profile: z.string().optional(),
|
||||||
preferredProfile: z.string().optional(),
|
preferredProfile: z.string().optional(),
|
||||||
})
|
})
|
||||||
@@ -303,6 +314,9 @@ export const ToolsMediaUnderstandingSchema = z
|
|||||||
prompt: z.string().optional(),
|
prompt: z.string().optional(),
|
||||||
timeoutSeconds: z.number().int().positive().optional(),
|
timeoutSeconds: z.number().int().positive().optional(),
|
||||||
language: z.string().optional(),
|
language: z.string().optional(),
|
||||||
|
deepgram: DeepgramAudioSchema,
|
||||||
|
baseUrl: z.string().optional(),
|
||||||
|
headers: z.record(z.string(), z.string()).optional(),
|
||||||
attachments: MediaUnderstandingAttachmentsSchema,
|
attachments: MediaUnderstandingAttachmentsSchema,
|
||||||
models: z.array(MediaUnderstandingModelSchema).optional(),
|
models: z.array(MediaUnderstandingModelSchema).optional(),
|
||||||
})
|
})
|
||||||
|
|||||||
@@ -65,12 +65,18 @@ describe("transcribeDeepgramAudio", () => {
|
|||||||
language: " en ",
|
language: " en ",
|
||||||
mime: "audio/wav",
|
mime: "audio/wav",
|
||||||
headers: { "X-Custom": "1" },
|
headers: { "X-Custom": "1" },
|
||||||
|
query: {
|
||||||
|
punctuate: false,
|
||||||
|
smart_format: true,
|
||||||
|
},
|
||||||
fetchFn,
|
fetchFn,
|
||||||
});
|
});
|
||||||
|
|
||||||
expect(result.model).toBe("nova-3");
|
expect(result.model).toBe("nova-3");
|
||||||
expect(result.text).toBe("hello");
|
expect(result.text).toBe("hello");
|
||||||
expect(seenUrl).toBe("https://api.example.com/v1/listen?model=nova-3&language=en");
|
expect(seenUrl).toBe(
|
||||||
|
"https://api.example.com/v1/listen?model=nova-3&language=en&punctuate=false&smart_format=true",
|
||||||
|
);
|
||||||
expect(seenInit?.method).toBe("POST");
|
expect(seenInit?.method).toBe("POST");
|
||||||
expect(seenInit?.signal).toBeInstanceOf(AbortSignal);
|
expect(seenInit?.signal).toBeInstanceOf(AbortSignal);
|
||||||
|
|
||||||
|
|||||||
@@ -29,6 +29,12 @@ export async function transcribeDeepgramAudio(
|
|||||||
const url = new URL(`${baseUrl}/listen`);
|
const url = new URL(`${baseUrl}/listen`);
|
||||||
url.searchParams.set("model", model);
|
url.searchParams.set("model", model);
|
||||||
if (params.language?.trim()) url.searchParams.set("language", params.language.trim());
|
if (params.language?.trim()) url.searchParams.set("language", params.language.trim());
|
||||||
|
if (params.query) {
|
||||||
|
for (const [key, value] of Object.entries(params.query)) {
|
||||||
|
if (value === undefined) continue;
|
||||||
|
url.searchParams.set(key, String(value));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
const headers = new Headers(params.headers);
|
const headers = new Headers(params.headers);
|
||||||
if (!headers.has("authorization")) {
|
if (!headers.has("authorization")) {
|
||||||
|
|||||||
@@ -71,6 +71,25 @@ function trimOutput(text: string, maxChars?: number): string {
|
|||||||
return trimmed.slice(0, maxChars).trim();
|
return trimmed.slice(0, maxChars).trim();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Convert camelCase Deepgram options into a record keyed by the snake_case
 * names `detect_language`, `punctuate` and `smart_format`.
 *
 * Only options whose value is an actual boolean are copied; missing or
 * `undefined` options are left out of the result.
 *
 * @param options - Optional Deepgram flags (`detectLanguage`, `punctuate`, `smartFormat`).
 * @returns The populated record, or `undefined` when `options` is absent or
 *   none of its flags is a boolean.
 */
function buildDeepgramQuery(options?: {
  detectLanguage?: boolean;
  punctuate?: boolean;
  smartFormat?: boolean;
}): Record<string, string | number | boolean> | undefined {
  if (!options) return undefined;

  // Pair each output key with its source value; order matches the emitted key order.
  const candidates: Array<[string, unknown]> = [
    ["detect_language", options.detectLanguage],
    ["punctuate", options.punctuate],
    ["smart_format", options.smartFormat],
  ];

  const query: Record<string, string | number | boolean> = {};
  for (const [name, flag] of candidates) {
    if (typeof flag === "boolean") {
      query[name] = flag;
    }
  }

  // An empty record is reported as "no query" rather than `{}`.
  return Object.keys(query).length > 0 ? query : undefined;
}
|
||||||
|
|
||||||
function buildModelDecision(params: {
|
function buildModelDecision(params: {
|
||||||
entry: MediaUnderstandingModelConfig;
|
entry: MediaUnderstandingModelConfig;
|
||||||
entryType: "provider" | "cli";
|
entryType: "provider" | "cli";
|
||||||
@@ -220,17 +239,32 @@ async function runProviderEntry(params: {
|
|||||||
agentDir: params.agentDir,
|
agentDir: params.agentDir,
|
||||||
});
|
});
|
||||||
const providerConfig = cfg.models?.providers?.[providerId];
|
const providerConfig = cfg.models?.providers?.[providerId];
|
||||||
|
const baseUrl = entry.baseUrl ?? params.config?.baseUrl ?? providerConfig?.baseUrl;
|
||||||
|
const mergedHeaders = {
|
||||||
|
...(providerConfig?.headers ?? {}),
|
||||||
|
...(params.config?.headers ?? {}),
|
||||||
|
...(entry.headers ?? {}),
|
||||||
|
};
|
||||||
|
const headers = Object.keys(mergedHeaders).length > 0 ? mergedHeaders : undefined;
|
||||||
|
const deepgramQuery =
|
||||||
|
providerId === "deepgram"
|
||||||
|
? buildDeepgramQuery({
|
||||||
|
...params.config?.deepgram,
|
||||||
|
...entry.deepgram,
|
||||||
|
})
|
||||||
|
: undefined;
|
||||||
const model = entry.model?.trim() || DEFAULT_AUDIO_MODELS[providerId] || entry.model;
|
const model = entry.model?.trim() || DEFAULT_AUDIO_MODELS[providerId] || entry.model;
|
||||||
const result = await provider.transcribeAudio({
|
const result = await provider.transcribeAudio({
|
||||||
buffer: media.buffer,
|
buffer: media.buffer,
|
||||||
fileName: media.fileName,
|
fileName: media.fileName,
|
||||||
mime: media.mime,
|
mime: media.mime,
|
||||||
apiKey: key.apiKey,
|
apiKey: key.apiKey,
|
||||||
baseUrl: providerConfig?.baseUrl,
|
baseUrl,
|
||||||
headers: providerConfig?.headers,
|
headers,
|
||||||
model,
|
model,
|
||||||
language: entry.language ?? params.config?.language ?? cfg.tools?.media?.audio?.language,
|
language: entry.language ?? params.config?.language ?? cfg.tools?.media?.audio?.language,
|
||||||
prompt,
|
prompt,
|
||||||
|
query: deepgramQuery,
|
||||||
timeoutMs,
|
timeoutMs,
|
||||||
});
|
});
|
||||||
return {
|
return {
|
||||||
|
|||||||
@@ -57,6 +57,7 @@ export type AudioTranscriptionRequest = {
|
|||||||
model?: string;
|
model?: string;
|
||||||
language?: string;
|
language?: string;
|
||||||
prompt?: string;
|
prompt?: string;
|
||||||
|
query?: Record<string, string | number | boolean>;
|
||||||
timeoutMs: number;
|
timeoutMs: number;
|
||||||
fetchFn?: typeof fetch;
|
fetchFn?: typeof fetch;
|
||||||
};
|
};
|
||||||
|
|||||||
Reference in New Issue
Block a user