feat: add inbound media understanding

Co-authored-by: Tristan Manchester <tmanchester96@gmail.com>
This commit is contained in:
Peter Steinberger
2026-01-17 03:52:37 +00:00
parent 4b749f1b8f
commit 1b973f7506
42 changed files with 2547 additions and 101 deletions

View File

@@ -0,0 +1,86 @@
import { describe, expect, it } from "vitest";
import { transcribeOpenAiCompatibleAudio } from "./audio.js";
const resolveRequestUrl = (input: RequestInfo | URL) => {
if (typeof input === "string") return input;
if (input instanceof URL) return input.toString();
return input.url;
};
describe("transcribeOpenAiCompatibleAudio", () => {
it("respects lowercase authorization header overrides", async () => {
let seenAuth: string | null = null;
const fetchFn = async (_input: RequestInfo | URL, init?: RequestInit) => {
const headers = new Headers(init?.headers);
seenAuth = headers.get("authorization");
return new Response(JSON.stringify({ text: "ok" }), {
status: 200,
headers: { "content-type": "application/json" },
});
};
const result = await transcribeOpenAiCompatibleAudio({
buffer: Buffer.from("audio"),
fileName: "note.mp3",
apiKey: "test-key",
timeoutMs: 1000,
headers: { authorization: "Bearer override" },
fetchFn,
});
expect(seenAuth).toBe("Bearer override");
expect(result.text).toBe("ok");
});
it("builds the expected request payload", async () => {
let seenUrl: string | null = null;
let seenInit: RequestInit | undefined;
const fetchFn = async (input: RequestInfo | URL, init?: RequestInit) => {
seenUrl = resolveRequestUrl(input);
seenInit = init;
return new Response(JSON.stringify({ text: "hello" }), {
status: 200,
headers: { "content-type": "application/json" },
});
};
const result = await transcribeOpenAiCompatibleAudio({
buffer: Buffer.from("audio-bytes"),
fileName: "voice.wav",
apiKey: "test-key",
timeoutMs: 1234,
baseUrl: "https://api.example.com/v1/",
model: " ",
language: " en ",
prompt: " hello ",
mime: "audio/wav",
headers: { "X-Custom": "1" },
fetchFn,
});
expect(result.model).toBe("whisper-1");
expect(result.text).toBe("hello");
expect(seenUrl).toBe("https://api.example.com/v1/audio/transcriptions");
expect(seenInit?.method).toBe("POST");
expect(seenInit?.signal).toBeInstanceOf(AbortSignal);
const headers = new Headers(seenInit?.headers);
expect(headers.get("authorization")).toBe("Bearer test-key");
expect(headers.get("x-custom")).toBe("1");
const form = seenInit?.body as FormData;
expect(form).toBeInstanceOf(FormData);
expect(form.get("model")).toBe("whisper-1");
expect(form.get("language")).toBe("en");
expect(form.get("prompt")).toBe("hello");
const file = form.get("file") as Blob | { type?: string; name?: string } | null;
expect(file).not.toBeNull();
if (file) {
expect(file.type).toBe("audio/wav");
if ("name" in file && typeof file.name === "string") {
expect(file.name).toBe("voice.wav");
}
}
});
});

View File

@@ -0,0 +1,61 @@
import path from "node:path";
import type { AudioTranscriptionRequest, AudioTranscriptionResult } from "../../types.js";
import { fetchWithTimeout, normalizeBaseUrl, readErrorResponse } from "../shared.js";
export const DEFAULT_OPENAI_AUDIO_BASE_URL = "https://api.openai.com/v1";
const DEFAULT_OPENAI_AUDIO_MODEL = "whisper-1";
function resolveModel(model?: string): string {
const trimmed = model?.trim();
return trimmed || DEFAULT_OPENAI_AUDIO_MODEL;
}
export async function transcribeOpenAiCompatibleAudio(
params: AudioTranscriptionRequest,
): Promise<AudioTranscriptionResult> {
const fetchFn = params.fetchFn ?? fetch;
const baseUrl = normalizeBaseUrl(params.baseUrl, DEFAULT_OPENAI_AUDIO_BASE_URL);
const url = `${baseUrl}/audio/transcriptions`;
const model = resolveModel(params.model);
const form = new FormData();
const fileName = params.fileName?.trim() || path.basename(params.fileName) || "audio";
const bytes = new Uint8Array(params.buffer);
const blob = new Blob([bytes], {
type: params.mime ?? "application/octet-stream",
});
form.append("file", blob, fileName);
form.append("model", model);
if (params.language?.trim()) form.append("language", params.language.trim());
if (params.prompt?.trim()) form.append("prompt", params.prompt.trim());
const headers = new Headers(params.headers);
if (!headers.has("authorization")) {
headers.set("authorization", `Bearer ${params.apiKey}`);
}
const res = await fetchWithTimeout(
url,
{
method: "POST",
headers,
body: form,
},
params.timeoutMs,
fetchFn,
);
if (!res.ok) {
const detail = await readErrorResponse(res);
const suffix = detail ? `: ${detail}` : "";
throw new Error(`Audio transcription failed (HTTP ${res.status})${suffix}`);
}
const payload = (await res.json()) as { text?: string };
const text = payload.text?.trim();
if (!text) {
throw new Error("Audio transcription response missing text");
}
return { text, model };
}

View File

@@ -0,0 +1,7 @@
import type { MediaUnderstandingProvider } from "../../types.js";
import { transcribeOpenAiCompatibleAudio } from "./audio.js";
export const openaiProvider: MediaUnderstandingProvider = {
id: "openai",
transcribeAudio: transcribeOpenAiCompatibleAudio,
};