feat: add inbound media understanding
Co-authored-by: Tristan Manchester <tmanchester96@gmail.com>
This commit is contained in:
7
src/media-understanding/providers/google/index.ts
Normal file
7
src/media-understanding/providers/google/index.ts
Normal file
@@ -0,0 +1,7 @@
|
||||
import type { MediaUnderstandingProvider } from "../../types.js";
|
||||
import { describeGeminiVideo } from "./video.js";
|
||||
|
||||
/**
 * Media-understanding provider entry registered under the id "google".
 *
 * The only capability it exposes is video description, which is delegated
 * directly to `describeGeminiVideo`.
 */
export const googleProvider: MediaUnderstandingProvider = {
  id: "google",
  describeVideo: describeGeminiVideo,
};
|
||||
93
src/media-understanding/providers/google/video.test.ts
Normal file
93
src/media-understanding/providers/google/video.test.ts
Normal file
@@ -0,0 +1,93 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
|
||||
import { describeGeminiVideo } from "./video.js";
|
||||
|
||||
/**
 * Returns the URL string for any of the input shapes `fetch` accepts:
 * a plain string, a `URL` instance, or a request object carrying `url`.
 */
const resolveRequestUrl = (input: RequestInfo | URL): string => {
  if (typeof input === "string") return input;
  if (input instanceof URL) return input.toString();
  // Remaining case: a Request-like object that exposes its own `url`.
  return input.url;
};
|
||||
|
||||
describe("describeGeminiVideo", () => {
  it("respects case-insensitive x-goog-api-key overrides", async () => {
    let seenKey: string | null = null;
    const fetchFn = async (_input: RequestInfo | URL, init?: RequestInit) => {
      // Header lookups are case-insensitive, so this observes the caller's
      // "X-Goog-Api-Key" value if the implementation left it untouched.
      const headers = new Headers(init?.headers);
      seenKey = headers.get("x-goog-api-key");
      return new Response(
        JSON.stringify({
          candidates: [{ content: { parts: [{ text: "video ok" }] } }],
        }),
        { status: 200, headers: { "content-type": "application/json" } },
      );
    };

    const result = await describeGeminiVideo({
      buffer: Buffer.from("video"),
      fileName: "clip.mp4",
      apiKey: "test-key",
      timeoutMs: 1000,
      headers: { "X-Goog-Api-Key": "override" },
      fetchFn,
    });

    // The caller-supplied key must win over `apiKey`.
    expect(seenKey).toBe("override");
    expect(result.text).toBe("video ok");
  });

  it("builds the expected request payload", async () => {
    let seenUrl: string | null = null;
    let seenInit: RequestInit | undefined;
    const fetchFn = async (input: RequestInfo | URL, init?: RequestInit) => {
      seenUrl = resolveRequestUrl(input);
      seenInit = init;
      return new Response(
        JSON.stringify({
          candidates: [
            {
              content: {
                // Mixed parts: padded text and an empty part exercise trimming/filtering.
                parts: [{ text: "first" }, { text: " second " }, { text: "" }],
              },
            },
          ],
        }),
        { status: 200, headers: { "content-type": "application/json" } },
      );
    };

    const result = await describeGeminiVideo({
      buffer: Buffer.from("video-bytes"),
      fileName: "clip.mp4",
      apiKey: "test-key",
      timeoutMs: 1500,
      baseUrl: "https://example.com/v1beta/",
      model: "gemini-3-pro",
      headers: { "X-Other": "1" },
      fetchFn,
    });

    expect(result.model).toBe("gemini-3-pro-preview");
    expect(result.text).toBe("first\nsecond");
    // Trailing slash on baseUrl must not produce a double slash.
    expect(seenUrl).toBe("https://example.com/v1beta/models/gemini-3-pro-preview:generateContent");
    expect(seenInit?.method).toBe("POST");
    expect(seenInit?.signal).toBeInstanceOf(AbortSignal);

    const headers = new Headers(seenInit?.headers);
    expect(headers.get("x-goog-api-key")).toBe("test-key");
    expect(headers.get("content-type")).toBe("application/json");
    expect(headers.get("x-other")).toBe("1");

    const rawBody = seenInit?.body;
    const bodyText =
      typeof rawBody === "string"
        ? rawBody
        : Buffer.isBuffer(rawBody)
          ? rawBody.toString("utf8")
          : "";
    // Fail with a clear message instead of a JSON.parse("") syntax error.
    expect(bodyText).not.toBe("");
    const body = JSON.parse(bodyText);
    expect(body.contents?.[0]?.parts?.[0]?.text).toBe("Describe the video.");
    expect(body.contents?.[0]?.parts?.[1]?.inline_data?.mime_type).toBe("video/mp4");
    expect(body.contents?.[0]?.parts?.[1]?.inline_data?.data).toBe(
      Buffer.from("video-bytes").toString("base64"),
    );
  });
});
|
||||
84
src/media-understanding/providers/google/video.ts
Normal file
84
src/media-understanding/providers/google/video.ts
Normal file
@@ -0,0 +1,84 @@
|
||||
import type { VideoDescriptionRequest, VideoDescriptionResult } from "../../types.js";
|
||||
import { normalizeGoogleModelId } from "../../../agents/models-config.providers.js";
|
||||
import { fetchWithTimeout, normalizeBaseUrl, readErrorResponse } from "../shared.js";
|
||||
|
||||
/** API root used by describeGeminiVideo when the request has no usable `baseUrl`. */
export const DEFAULT_GOOGLE_VIDEO_BASE_URL = "https://generativelanguage.googleapis.com/v1beta";

/** Model id used when the request's `model` is missing or blank. */
const DEFAULT_GOOGLE_VIDEO_MODEL = "gemini-3-flash-preview";

/** Prompt sent alongside the video when the request's `prompt` is missing or blank. */
const DEFAULT_GOOGLE_VIDEO_PROMPT = "Describe the video.";
|
||||
|
||||
/**
 * Chooses the model id for a video description request.
 *
 * @param model - Optional caller-supplied model id; surrounding whitespace is ignored.
 * @returns DEFAULT_GOOGLE_VIDEO_MODEL when `model` is missing or blank,
 *   otherwise the trimmed id passed through `normalizeGoogleModelId`.
 */
function resolveModel(model?: string): string {
  const trimmed = model?.trim();
  if (!trimmed) return DEFAULT_GOOGLE_VIDEO_MODEL;
  return normalizeGoogleModelId(trimmed);
}
|
||||
|
||||
/**
 * Chooses the text prompt sent with the video.
 *
 * @param prompt - Optional caller-supplied prompt; surrounding whitespace is ignored.
 * @returns The trimmed prompt, or DEFAULT_GOOGLE_VIDEO_PROMPT when it is missing or blank.
 */
function resolvePrompt(prompt?: string): string {
  const trimmed = prompt?.trim();
  return trimmed || DEFAULT_GOOGLE_VIDEO_PROMPT;
}
|
||||
|
||||
/**
 * Sends a video to the `generateContent` endpoint and returns the model's text description.
 *
 * The video bytes are embedded in the JSON body as base64 `inline_data`, after a
 * text part holding the prompt.
 *
 * @param params - Request options. `baseUrl`, `model`, `prompt` and `mime` fall back to
 *   defaults when absent; `headers` may override `content-type` and `x-goog-api-key`;
 *   `fetchFn` replaces the global `fetch` (used by tests).
 * @returns The joined text of the first candidate's parts and the resolved model id.
 * @throws Error when the HTTP response is not OK (message includes the status and a
 *   trimmed excerpt of the body), or when the response carries no non-empty text.
 */
export async function describeGeminiVideo(
  params: VideoDescriptionRequest,
): Promise<VideoDescriptionResult> {
  const fetchFn = params.fetchFn ?? fetch;
  const baseUrl = normalizeBaseUrl(params.baseUrl, DEFAULT_GOOGLE_VIDEO_BASE_URL);
  const model = resolveModel(params.model);
  const url = `${baseUrl}/models/${model}:generateContent`;

  // Headers matches names case-insensitively, so caller-supplied values win
  // whatever casing they were given in.
  const headers = new Headers(params.headers);
  if (!headers.has("content-type")) {
    headers.set("content-type", "application/json");
  }
  if (!headers.has("x-goog-api-key")) {
    headers.set("x-goog-api-key", params.apiKey);
  }

  const body = {
    contents: [
      {
        role: "user",
        parts: [
          { text: resolvePrompt(params.prompt) },
          {
            inline_data: {
              mime_type: params.mime ?? "video/mp4",
              data: params.buffer.toString("base64"),
            },
          },
        ],
      },
    ],
  };

  const res = await fetchWithTimeout(
    url,
    {
      method: "POST",
      headers,
      body: JSON.stringify(body),
    },
    params.timeoutMs,
    fetchFn,
  );

  if (!res.ok) {
    const detail = await readErrorResponse(res);
    const suffix = detail ? `: ${detail}` : "";
    throw new Error(`Video description failed (HTTP ${res.status})${suffix}`);
  }

  // The payload is unvalidated JSON: narrow each level so a malformed response
  // ends in the "missing text" error below rather than a TypeError.
  const payload = (await res.json()) as {
    candidates?: Array<{
      content?: { parts?: Array<{ text?: string }> };
    }>;
  } | null;
  const rawParts = payload?.candidates?.[0]?.content?.parts;
  const parts = Array.isArray(rawParts) ? rawParts : [];
  const text = parts
    .map((part) => (typeof part?.text === "string" ? part.text.trim() : ""))
    .filter(Boolean)
    .join("\n");
  if (!text) {
    throw new Error("Video description response missing text");
  }
  return { text, model };
}
|
||||
13
src/media-understanding/providers/groq/index.ts
Normal file
13
src/media-understanding/providers/groq/index.ts
Normal file
@@ -0,0 +1,13 @@
|
||||
import type { MediaUnderstandingProvider } from "../../types.js";
|
||||
import { transcribeOpenAiCompatibleAudio } from "../openai/audio.js";
|
||||
|
||||
/** Base URL that groqProvider passes along when a request does not carry its own `baseUrl`. */
const DEFAULT_GROQ_AUDIO_BASE_URL = "https://api.groq.com/openai/v1";
|
||||
|
||||
/**
 * Media-understanding provider entry registered under the id "groq".
 *
 * Audio transcription reuses `transcribeOpenAiCompatibleAudio`, pointed at
 * DEFAULT_GROQ_AUDIO_BASE_URL unless the request supplies its own base URL.
 */
export const groqProvider: MediaUnderstandingProvider = {
  id: "groq",
  transcribeAudio: (req) =>
    transcribeOpenAiCompatibleAudio({
      ...req,
      // A blank baseUrl must count as "not provided": the shared transcriber treats
      // blank as absent and would otherwise fall back to its own OpenAI default,
      // sending the Groq API key to the wrong host.
      baseUrl: req.baseUrl?.trim() || DEFAULT_GROQ_AUDIO_BASE_URL,
    }),
};
|
||||
35
src/media-understanding/providers/index.ts
Normal file
35
src/media-understanding/providers/index.ts
Normal file
@@ -0,0 +1,35 @@
|
||||
import { normalizeProviderId } from "../../agents/model-selection.js";
|
||||
import type { MediaUnderstandingProvider } from "../types.js";
|
||||
import { googleProvider } from "./google/index.js";
|
||||
import { groqProvider } from "./groq/index.js";
|
||||
import { openaiProvider } from "./openai/index.js";
|
||||
|
||||
/** Built-in providers that seed every registry created by buildMediaUnderstandingRegistry. */
const PROVIDERS: MediaUnderstandingProvider[] = [groqProvider, openaiProvider, googleProvider];
|
||||
|
||||
/**
 * Normalizes a provider id for use as a media-understanding registry key.
 *
 * Applies `normalizeProviderId`, then maps the result "gemini" to "google" so
 * both names resolve to the same provider entry.
 *
 * @param id - Provider id as supplied by the caller.
 * @returns The normalized registry key.
 */
export function normalizeMediaProviderId(id: string): string {
  const normalized = normalizeProviderId(id);
  if (normalized === "gemini") return "google";
  return normalized;
}
|
||||
|
||||
/**
 * Builds a provider registry keyed by normalized provider id.
 *
 * @param overrides - Optional extra providers keyed by id. Keys are normalized the
 *   same way as built-in ids, and overrides are inserted after the built-ins, so an
 *   override replaces a built-in provider that normalizes to the same key.
 * @returns A new map from normalized id to provider.
 */
export function buildMediaUnderstandingRegistry(
  overrides?: Record<string, MediaUnderstandingProvider>,
): Map<string, MediaUnderstandingProvider> {
  const registry = new Map<string, MediaUnderstandingProvider>();
  for (const provider of PROVIDERS) {
    registry.set(normalizeMediaProviderId(provider.id), provider);
  }
  if (overrides) {
    for (const [key, provider] of Object.entries(overrides)) {
      registry.set(normalizeMediaProviderId(key), provider);
    }
  }
  return registry;
}
|
||||
|
||||
/**
 * Looks up a provider in a registry.
 *
 * @param id - Provider id; it is normalized with `normalizeMediaProviderId` before the lookup.
 * @param registry - Registry to search.
 * @returns The matching provider, or `undefined` when the registry has no entry for the id.
 */
export function getMediaUnderstandingProvider(
  id: string,
  registry: Map<string, MediaUnderstandingProvider>,
): MediaUnderstandingProvider | undefined {
  return registry.get(normalizeMediaProviderId(id));
}
|
||||
86
src/media-understanding/providers/openai/audio.test.ts
Normal file
86
src/media-understanding/providers/openai/audio.test.ts
Normal file
@@ -0,0 +1,86 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
|
||||
import { transcribeOpenAiCompatibleAudio } from "./audio.js";
|
||||
|
||||
/**
 * Returns the URL string for any of the input shapes `fetch` accepts:
 * a plain string, a `URL` instance, or a request object carrying `url`.
 */
const resolveRequestUrl = (input: RequestInfo | URL): string => {
  if (typeof input === "string") return input;
  if (input instanceof URL) return input.toString();
  // Remaining case: a Request-like object that exposes its own `url`.
  return input.url;
};
|
||||
|
||||
describe("transcribeOpenAiCompatibleAudio", () => {
  it("respects lowercase authorization header overrides", async () => {
    let seenAuth: string | null = null;
    const fetchFn = async (_input: RequestInfo | URL, init?: RequestInit) => {
      const headers = new Headers(init?.headers);
      seenAuth = headers.get("authorization");
      return new Response(JSON.stringify({ text: "ok" }), {
        status: 200,
        headers: { "content-type": "application/json" },
      });
    };

    const result = await transcribeOpenAiCompatibleAudio({
      buffer: Buffer.from("audio"),
      fileName: "note.mp3",
      apiKey: "test-key",
      timeoutMs: 1000,
      headers: { authorization: "Bearer override" },
      fetchFn,
    });

    // The caller-supplied header must win over the one derived from `apiKey`.
    expect(seenAuth).toBe("Bearer override");
    expect(result.text).toBe("ok");
  });

  it("builds the expected request payload", async () => {
    let seenUrl: string | null = null;
    let seenInit: RequestInit | undefined;
    const fetchFn = async (input: RequestInfo | URL, init?: RequestInit) => {
      seenUrl = resolveRequestUrl(input);
      seenInit = init;
      return new Response(JSON.stringify({ text: "hello" }), {
        status: 200,
        headers: { "content-type": "application/json" },
      });
    };

    // Blank model and padded language/prompt exercise defaulting and trimming.
    const result = await transcribeOpenAiCompatibleAudio({
      buffer: Buffer.from("audio-bytes"),
      fileName: "voice.wav",
      apiKey: "test-key",
      timeoutMs: 1234,
      baseUrl: "https://api.example.com/v1/",
      model: " ",
      language: " en ",
      prompt: " hello ",
      mime: "audio/wav",
      headers: { "X-Custom": "1" },
      fetchFn,
    });

    expect(result.model).toBe("whisper-1");
    expect(result.text).toBe("hello");
    // Trailing slash on baseUrl must not produce a double slash.
    expect(seenUrl).toBe("https://api.example.com/v1/audio/transcriptions");
    expect(seenInit?.method).toBe("POST");
    expect(seenInit?.signal).toBeInstanceOf(AbortSignal);

    const headers = new Headers(seenInit?.headers);
    expect(headers.get("authorization")).toBe("Bearer test-key");
    expect(headers.get("x-custom")).toBe("1");

    const form = seenInit?.body as FormData;
    expect(form).toBeInstanceOf(FormData);
    expect(form.get("model")).toBe("whisper-1");
    expect(form.get("language")).toBe("en");
    expect(form.get("prompt")).toBe("hello");
    const file = form.get("file") as Blob | { type?: string; name?: string } | null;
    expect(file).not.toBeNull();
    if (file) {
      expect(file.type).toBe("audio/wav");
      // `name` is only checked when the runtime's form entry exposes it.
      if ("name" in file && typeof file.name === "string") {
        expect(file.name).toBe("voice.wav");
      }
    }
  });
});
|
||||
61
src/media-understanding/providers/openai/audio.ts
Normal file
61
src/media-understanding/providers/openai/audio.ts
Normal file
@@ -0,0 +1,61 @@
|
||||
import path from "node:path";
|
||||
|
||||
import type { AudioTranscriptionRequest, AudioTranscriptionResult } from "../../types.js";
|
||||
import { fetchWithTimeout, normalizeBaseUrl, readErrorResponse } from "../shared.js";
|
||||
|
||||
/** API root used by transcribeOpenAiCompatibleAudio when the request has no usable `baseUrl`. */
export const DEFAULT_OPENAI_AUDIO_BASE_URL = "https://api.openai.com/v1";

/** Model id used when the request's `model` is missing or blank. */
const DEFAULT_OPENAI_AUDIO_MODEL = "whisper-1";
|
||||
|
||||
/**
 * Chooses the model id for a transcription request.
 *
 * @param model - Optional caller-supplied model id; surrounding whitespace is ignored.
 * @returns The trimmed id, or DEFAULT_OPENAI_AUDIO_MODEL when it is missing or blank.
 */
function resolveModel(model?: string): string {
  const trimmed = model?.trim();
  return trimmed || DEFAULT_OPENAI_AUDIO_MODEL;
}
|
||||
|
||||
/**
 * Uploads audio to an OpenAI-compatible `/audio/transcriptions` endpoint and
 * returns the transcript.
 *
 * The request is a multipart form with `file` and `model`, plus `language` and
 * `prompt` when they are non-blank.
 *
 * @param params - Request options. `baseUrl`, `model` and `mime` fall back to defaults
 *   when absent; `headers` may override `authorization`; `fetchFn` replaces the global
 *   `fetch` (used by tests).
 * @returns The trimmed transcript text and the resolved model id.
 * @throws Error when the HTTP response is not OK (message includes the status and a
 *   trimmed excerpt of the body), or when the response carries no non-empty text.
 */
export async function transcribeOpenAiCompatibleAudio(
  params: AudioTranscriptionRequest,
): Promise<AudioTranscriptionResult> {
  const fetchFn = params.fetchFn ?? fetch;
  const baseUrl = normalizeBaseUrl(params.baseUrl, DEFAULT_OPENAI_AUDIO_BASE_URL);
  const url = `${baseUrl}/audio/transcriptions`;

  const model = resolveModel(params.model);
  const form = new FormData();
  // Upload only the final path segment, and fall back to "audio" when the name is
  // missing or blank. Previously basename was applied only to an already-empty
  // name, so it could yield whitespace or throw on undefined.
  const trimmedName = params.fileName?.trim();
  const fileName = (trimmedName && path.basename(trimmedName)) || "audio";
  const bytes = new Uint8Array(params.buffer);
  const blob = new Blob([bytes], {
    type: params.mime ?? "application/octet-stream",
  });
  form.append("file", blob, fileName);
  form.append("model", model);
  const language = params.language?.trim();
  if (language) form.append("language", language);
  const prompt = params.prompt?.trim();
  if (prompt) form.append("prompt", prompt);

  // No content-type is set here: the body is a FormData instance.
  // Headers matches names case-insensitively, so a caller-supplied
  // authorization header wins whatever its casing.
  const headers = new Headers(params.headers);
  if (!headers.has("authorization")) {
    headers.set("authorization", `Bearer ${params.apiKey}`);
  }

  const res = await fetchWithTimeout(
    url,
    {
      method: "POST",
      headers,
      body: form,
    },
    params.timeoutMs,
    fetchFn,
  );

  if (!res.ok) {
    const detail = await readErrorResponse(res);
    const suffix = detail ? `: ${detail}` : "";
    throw new Error(`Audio transcription failed (HTTP ${res.status})${suffix}`);
  }

  // The payload is unvalidated JSON: narrow before use so a malformed response
  // ends in the "missing text" error below rather than a TypeError.
  const payload = (await res.json()) as { text?: string } | null;
  const text = typeof payload?.text === "string" ? payload.text.trim() : "";
  if (!text) {
    throw new Error("Audio transcription response missing text");
  }
  return { text, model };
}
|
||||
7
src/media-understanding/providers/openai/index.ts
Normal file
7
src/media-understanding/providers/openai/index.ts
Normal file
@@ -0,0 +1,7 @@
|
||||
import type { MediaUnderstandingProvider } from "../../types.js";
|
||||
import { transcribeOpenAiCompatibleAudio } from "./audio.js";
|
||||
|
||||
/**
 * Media-understanding provider entry registered under the id "openai".
 *
 * The only capability it exposes is audio transcription, which is delegated
 * directly to `transcribeOpenAiCompatibleAudio`.
 */
export const openaiProvider: MediaUnderstandingProvider = {
  id: "openai",
  transcribeAudio: transcribeOpenAiCompatibleAudio,
};
|
||||
33
src/media-understanding/providers/shared.ts
Normal file
33
src/media-understanding/providers/shared.ts
Normal file
@@ -0,0 +1,33 @@
|
||||
/** Maximum number of error-body characters readErrorResponse keeps before appending an ellipsis. */
const MAX_ERROR_CHARS = 300;
|
||||
|
||||
/**
 * Resolves the base URL for a provider request.
 *
 * @param baseUrl - Optional caller-supplied URL; it is trimmed, and a missing or
 *   blank value selects `fallback`.
 * @param fallback - URL used when `baseUrl` is not usable.
 * @returns The chosen URL with all trailing slashes removed, so callers can append
 *   `/path` without producing a double slash.
 */
export function normalizeBaseUrl(baseUrl: string | undefined, fallback: string): string {
  const raw = baseUrl?.trim() || fallback;
  return raw.replace(/\/+$/, "");
}
|
||||
|
||||
/**
 * Runs `fetchFn` with an abort signal that fires after `timeoutMs`.
 *
 * The signal passed to `fetchFn` always comes from an internal AbortController;
 * any `signal` present on `init` is replaced. The timer is cleared once the
 * fetch settles, whether it resolved or rejected.
 *
 * @param url - Request URL.
 * @param init - Request options forwarded to `fetchFn`.
 * @param timeoutMs - Milliseconds before the request is aborted. Values below 1
 *   (and NaN) are treated as 1; values above the largest delay `setTimeout`
 *   supports, including Infinity, are clamped to that maximum.
 * @param fetchFn - The fetch implementation to call.
 * @returns The response produced by `fetchFn`.
 */
export async function fetchWithTimeout(
  url: string,
  init: RequestInit,
  timeoutMs: number,
  fetchFn: typeof fetch,
): Promise<Response> {
  // Largest delay setTimeout honours (signed 32-bit). Larger or NaN delays are
  // coerced to 1 ms by the runtime, which would abort the request at once.
  const maxTimerDelayMs = 2147483647;
  const delayMs = Number.isNaN(timeoutMs)
    ? 1
    : Math.min(Math.max(1, timeoutMs), maxTimerDelayMs);

  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), delayMs);
  try {
    return await fetchFn(url, { ...init, signal: controller.signal });
  } finally {
    clearTimeout(timer);
  }
}
|
||||
|
||||
/**
 * Reads a response body and condenses it into a short, single-line error detail.
 *
 * Runs of whitespace are collapsed to single spaces and the result is trimmed.
 * Text longer than MAX_ERROR_CHARS is cut at that length and suffixed with "…".
 *
 * @param res - Response whose body should be read.
 * @returns The condensed text, or `undefined` when the body is blank or reading it fails.
 */
export async function readErrorResponse(res: Response): Promise<string | undefined> {
  try {
    const text = await res.text();
    const collapsed = text.replace(/\s+/g, " ").trim();
    if (!collapsed) return undefined;
    if (collapsed.length <= MAX_ERROR_CHARS) return collapsed;
    return `${collapsed.slice(0, MAX_ERROR_CHARS)}…`;
  } catch {
    // Best effort: the detail only decorates an error message, so a failed
    // body read is reported as "no detail" rather than thrown.
    return undefined;
  }
}
|
||||
Reference in New Issue
Block a user