feat: add deepgram audio options

Author: Peter Steinberger
Date:   2026-01-17 08:50:28 +00:00
Parent: e637bbdfb5
Commit: ae6792522d

9 changed files with 110 additions and 3 deletions

View File

@@ -51,6 +51,16 @@ export type MediaUnderstandingModelConfig = {
   timeoutSeconds?: number;
   /** Optional language hint for audio transcription. */
   language?: string;
+  /** Optional Deepgram transcription options (audio only). */
+  deepgram?: {
+    detectLanguage?: boolean;
+    punctuate?: boolean;
+    smartFormat?: boolean;
+  };
+  /** Optional base URL override for provider requests. */
+  baseUrl?: string;
+  /** Optional headers merged into provider requests. */
+  headers?: Record<string, string>;
   /** Auth profile id to use for this provider. */
   profile?: string;
   /** Preferred profile id if multiple are available. */
@@ -72,6 +82,16 @@ export type MediaUnderstandingConfig = {
   timeoutSeconds?: number;
   /** Default language hint (audio). */
   language?: string;
+  /** Optional Deepgram transcription options (audio only). */
+  deepgram?: {
+    detectLanguage?: boolean;
+    punctuate?: boolean;
+    smartFormat?: boolean;
+  };
+  /** Optional base URL override for provider requests. */
+  baseUrl?: string;
+  /** Optional headers merged into provider requests. */
+  headers?: Record<string, string>;
   /** Attachment selection policy. */
   attachments?: MediaUnderstandingAttachmentsConfig;
   /** Ordered model list (fallbacks in order). */
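
Note: for context, a minimal config sketch using the new fields (TypeScript). The nesting under tools.media.audio is inferred from the runner change further down and is an assumption, not something this hunk shows:

const audio: MediaUnderstandingConfig = {
  language: "en",
  // Shared Deepgram defaults applied to every audio entry.
  deepgram: { detectLanguage: true, punctuate: true, smartFormat: true },
  models: [
    {
      provider: "deepgram",
      model: "nova-3",
      // Per-entry options override the shared defaults (see runProviderEntry below).
      deepgram: { smartFormat: false },
    },
  ],
};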

View File

@@ -276,6 +276,14 @@ export const MediaUnderstandingAttachmentsSchema = z
   })
   .optional();
+const DeepgramAudioSchema = z
+  .object({
+    detectLanguage: z.boolean().optional(),
+    punctuate: z.boolean().optional(),
+    smartFormat: z.boolean().optional(),
+  })
+  .optional();
 export const MediaUnderstandingModelSchema = z
   .object({
     provider: z.string().optional(),
@@ -289,6 +297,9 @@ export const MediaUnderstandingModelSchema = z
     maxBytes: z.number().int().positive().optional(),
     timeoutSeconds: z.number().int().positive().optional(),
     language: z.string().optional(),
+    deepgram: DeepgramAudioSchema,
+    baseUrl: z.string().optional(),
+    headers: z.record(z.string(), z.string()).optional(),
     profile: z.string().optional(),
     preferredProfile: z.string().optional(),
   })
@@ -303,6 +314,9 @@ export const ToolsMediaUnderstandingSchema = z
     prompt: z.string().optional(),
     timeoutSeconds: z.number().int().positive().optional(),
     language: z.string().optional(),
+    deepgram: DeepgramAudioSchema,
+    baseUrl: z.string().optional(),
+    headers: z.record(z.string(), z.string()).optional(),
     attachments: MediaUnderstandingAttachmentsSchema,
     models: z.array(MediaUnderstandingModelSchema).optional(),
   })
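
Note: a quick validation sketch against the updated schema (TypeScript, using zod's safeParse; required fields outside this hunk, if any, are omitted):

const parsed = ToolsMediaUnderstandingSchema.safeParse({
  language: "en",
  deepgram: { detectLanguage: true, smartFormat: true },
  models: [{ provider: "deepgram", deepgram: { punctuate: false } }],
});
if (!parsed.success) console.error(parsed.error.issues);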

View File

@@ -65,12 +65,18 @@ describe("transcribeDeepgramAudio", () => {
       language: " en ",
       mime: "audio/wav",
       headers: { "X-Custom": "1" },
+      query: {
+        punctuate: false,
+        smart_format: true,
+      },
       fetchFn,
     });
     expect(result.model).toBe("nova-3");
     expect(result.text).toBe("hello");
-    expect(seenUrl).toBe("https://api.example.com/v1/listen?model=nova-3&language=en");
+    expect(seenUrl).toBe(
+      "https://api.example.com/v1/listen?model=nova-3&language=en&punctuate=false&smart_format=true",
+    );
     expect(seenInit?.method).toBe("POST");
     expect(seenInit?.signal).toBeInstanceOf(AbortSignal);

View File

@@ -29,6 +29,12 @@ export async function transcribeDeepgramAudio(
   const url = new URL(`${baseUrl}/listen`);
   url.searchParams.set("model", model);
   if (params.language?.trim()) url.searchParams.set("language", params.language.trim());
+  if (params.query) {
+    for (const [key, value] of Object.entries(params.query)) {
+      if (value === undefined) continue;
+      url.searchParams.set(key, String(value));
+    }
+  }
   const headers = new Headers(params.headers);
   if (!headers.has("authorization")) {
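
Note: the query handling in isolation, as a standalone sketch (the base URL is illustrative, taken from the test above):

const url = new URL("https://api.example.com/v1/listen");
url.searchParams.set("model", "nova-3");
const query: Record<string, string | number | boolean> = { punctuate: false, smart_format: true };
for (const [key, value] of Object.entries(query)) {
  if (value === undefined) continue;        // unset options are skipped
  url.searchParams.set(key, String(value)); // booleans/numbers are stringified
}
// url.toString() === "https://api.example.com/v1/listen?model=nova-3&punctuate=false&smart_format=true"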

View File

@@ -71,6 +71,25 @@ function trimOutput(text: string, maxChars?: number): string {
   return trimmed.slice(0, maxChars).trim();
 }
+function buildDeepgramQuery(options?: {
+  detectLanguage?: boolean;
+  punctuate?: boolean;
+  smartFormat?: boolean;
+}): Record<string, string | number | boolean> | undefined {
+  if (!options) return undefined;
+  const query: Record<string, string | number | boolean> = {};
+  if (typeof options.detectLanguage === "boolean") {
+    query.detect_language = options.detectLanguage;
+  }
+  if (typeof options.punctuate === "boolean") {
+    query.punctuate = options.punctuate;
+  }
+  if (typeof options.smartFormat === "boolean") {
+    query.smart_format = options.smartFormat;
+  }
+  return Object.keys(query).length > 0 ? query : undefined;
+}
 function buildModelDecision(params: {
   entry: MediaUnderstandingModelConfig;
   entryType: "provider" | "cli";
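
Note: expected behavior of buildDeepgramQuery, based on the helper above:

buildDeepgramQuery({ detectLanguage: true, punctuate: false });
// => { detect_language: true, punctuate: false }
buildDeepgramQuery({});        // => undefined (nothing set)
buildDeepgramQuery(undefined); // => undefined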
@@ -220,17 +239,32 @@ async function runProviderEntry(params: {
     agentDir: params.agentDir,
   });
   const providerConfig = cfg.models?.providers?.[providerId];
+  const baseUrl = entry.baseUrl ?? params.config?.baseUrl ?? providerConfig?.baseUrl;
+  const mergedHeaders = {
+    ...(providerConfig?.headers ?? {}),
+    ...(params.config?.headers ?? {}),
+    ...(entry.headers ?? {}),
+  };
+  const headers = Object.keys(mergedHeaders).length > 0 ? mergedHeaders : undefined;
+  const deepgramQuery =
+    providerId === "deepgram"
+      ? buildDeepgramQuery({
+          ...params.config?.deepgram,
+          ...entry.deepgram,
+        })
+      : undefined;
   const model = entry.model?.trim() || DEFAULT_AUDIO_MODELS[providerId] || entry.model;
   const result = await provider.transcribeAudio({
     buffer: media.buffer,
     fileName: media.fileName,
     mime: media.mime,
     apiKey: key.apiKey,
-    baseUrl: providerConfig?.baseUrl,
-    headers: providerConfig?.headers,
+    baseUrl,
+    headers,
     model,
     language: entry.language ?? params.config?.language ?? cfg.tools?.media?.audio?.language,
     prompt,
+    query: deepgramQuery,
     timeoutMs,
   });
   return {
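
Note: how the spread merge resolves precedence, as a sketch; per-entry deepgram options win over the shared config defaults:

const shared = { detectLanguage: true, smartFormat: true }; // params.config?.deepgram
const entry = { smartFormat: false };                       // entry.deepgram
const merged = { ...shared, ...entry };
// => { detectLanguage: true, smartFormat: false }
// buildDeepgramQuery(merged) => { detect_language: true, smart_format: false }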

View File

@@ -57,6 +57,7 @@ export type AudioTranscriptionRequest = {
   model?: string;
   language?: string;
   prompt?: string;
+  query?: Record<string, string | number | boolean>;
   timeoutMs: number;
   fetchFn?: typeof fetch;
 };
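
Note: an illustrative request literal using the extended type. Only query is new in this hunk; the other field names are taken from the runner call site above, the buffer contents are a placeholder, and sourcing the key from an env var is an assumption for the example:

const request: AudioTranscriptionRequest = {
  buffer: Buffer.from([]), // placeholder audio bytes
  fileName: "note.wav",
  mime: "audio/wav",
  apiKey: process.env.DEEPGRAM_API_KEY ?? "", // assumption: key read from env
  model: "nova-3",
  language: "en",
  query: { detect_language: true, smart_format: true },
  timeoutMs: 30_000,
};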