feat: add inbound media understanding

Co-authored-by: Tristan Manchester <tmanchester96@gmail.com>
2026-01-17 03:52:37 +00:00
parent 4b749f1b8f
commit 1b973f7506
42 changed files with 2547 additions and 101 deletions
--- a/src/media-understanding/providers/google/index.ts
+++ b/src/media-understanding/providers/google/index.ts
@@ -0,0 +1,7 @@
+import type { MediaUnderstandingProvider } from "../../types.js";
+import { describeGeminiVideo } from "./video.js";
+
+export const googleProvider: MediaUnderstandingProvider = {
+  id: "google", // id under which this provider is registered and looked up
+  describeVideo: describeGeminiVideo, // video description is the only capability declared here
+};
--- a/src/media-understanding/providers/google/video.test.ts
+++ b/src/media-understanding/providers/google/video.test.ts
@@ -0,0 +1,93 @@
+import { describe, expect, it } from "vitest";
+
+import { describeGeminiVideo } from "./video.js";
+
+/** Reduces any `fetch` input (string, `URL`, or request object) to its URL string. */
+const resolveRequestUrl = (input: RequestInfo | URL): string => {
+  if (input instanceof URL) return input.toString();
+  return typeof input === "string" ? input : input.url;
+};
+
+// Exercises describeGeminiVideo against a stubbed fetch: header override precedence, and the
+// exact request it builds (URL, method, headers, JSON body) plus how response text is joined.
+describe("describeGeminiVideo", () => {
+  /** Wraps text parts in a 200 JSON response shaped like a single Gemini candidate. */
+  const candidateResponse = (texts: string[]) =>
+    new Response(
+      JSON.stringify({
+        candidates: [{ content: { parts: texts.map((text) => ({ text })) } }],
+      }),
+      { status: 200, headers: { "content-type": "application/json" } },
+    );
+
+  it("respects case-insensitive x-goog-api-key overrides", async () => {
+    let sentKey: string | null = null;
+    const fetchFn = async (_input: RequestInfo | URL, init?: RequestInit) => {
+      sentKey = new Headers(init?.headers).get("x-goog-api-key");
+      return candidateResponse(["video ok"]);
+    };
+
+    // The override uses different casing than the header name the provider checks for.
+    const result = await describeGeminiVideo({
+      buffer: Buffer.from("video"),
+      fileName: "clip.mp4",
+      apiKey: "test-key",
+      timeoutMs: 1000,
+      headers: { "X-Goog-Api-Key": "override" },
+      fetchFn,
+    });
+
+    expect(sentKey).toBe("override");
+    expect(result.text).toBe("video ok");
+  });
+
+  it("builds the expected request payload", async () => {
+    let sentUrl: string | null = null;
+    let sentInit: RequestInit | undefined;
+    const fetchFn = async (input: RequestInfo | URL, init?: RequestInit) => {
+      sentUrl = resolveRequestUrl(input);
+      sentInit = init;
+      // Padded and empty parts: the result below must be trimmed with blanks dropped.
+      return candidateResponse(["first", " second ", ""]);
+    };
+
+    const result = await describeGeminiVideo({
+      buffer: Buffer.from("video-bytes"),
+      fileName: "clip.mp4",
+      apiKey: "test-key",
+      timeoutMs: 1500,
+      baseUrl: "https://example.com/v1beta/",
+      model: "gemini-3-pro",
+      headers: { "X-Other": "1" },
+      fetchFn,
+    });
+
+    expect(result.model).toBe("gemini-3-pro-preview");
+    expect(result.text).toBe("first\nsecond");
+    expect(sentUrl).toBe("https://example.com/v1beta/models/gemini-3-pro-preview:generateContent");
+    expect(sentInit?.method).toBe("POST");
+    expect(sentInit?.signal).toBeInstanceOf(AbortSignal);
+
+    const sentHeaders = new Headers(sentInit?.headers);
+    expect(sentHeaders.get("x-goog-api-key")).toBe("test-key");
+    expect(sentHeaders.get("content-type")).toBe("application/json");
+    expect(sentHeaders.get("x-other")).toBe("1");
+
+    // Accept the body as either a string or a Buffer before decoding the JSON.
+    const rawBody = sentInit?.body;
+    let bodyText = "";
+    if (typeof rawBody === "string") {
+      bodyText = rawBody;
+    } else if (Buffer.isBuffer(rawBody)) {
+      bodyText = rawBody.toString("utf8");
+    }
+    const [promptPart, mediaPart] = JSON.parse(bodyText).contents?.[0]?.parts ?? [];
+    expect(promptPart?.text).toBe("Describe the video.");
+    expect(mediaPart?.inline_data?.mime_type).toBe("video/mp4");
+    expect(mediaPart?.inline_data?.data).toBe(Buffer.from("video-bytes").toString("base64"));
+  });
+});
--- a/src/media-understanding/providers/google/video.ts
+++ b/src/media-understanding/providers/google/video.ts
@@ -0,0 +1,84 @@
+import type { VideoDescriptionRequest, VideoDescriptionResult } from "../../types.js";
+import { normalizeGoogleModelId } from "../../../agents/models-config.providers.js";
+import { fetchWithTimeout, normalizeBaseUrl, readErrorResponse } from "../shared.js";
+
+// API root used by describeGeminiVideo when the request supplies no non-blank baseUrl.
+export const DEFAULT_GOOGLE_VIDEO_BASE_URL = "https://generativelanguage.googleapis.com/v1beta";
+const DEFAULT_GOOGLE_VIDEO_MODEL = "gemini-3-flash-preview"; // used when no model is requested
+const DEFAULT_GOOGLE_VIDEO_PROMPT = "Describe the video."; // used when the prompt is blank
+
+/**
+ * Picks the Gemini model id for a request.
+ *
+ * @param model - Caller-supplied model id; may be undefined or blank.
+ * @returns The default video model when `model` is missing or whitespace-only, otherwise
+ *   the trimmed id passed through `normalizeGoogleModelId`.
+ */
+function resolveModel(model?: string): string {
+  const requested = model?.trim() ?? "";
+  return requested === "" ? DEFAULT_GOOGLE_VIDEO_MODEL : normalizeGoogleModelId(requested);
+}
+
+/**
+ * Picks the text prompt sent alongside the video.
+ *
+ * @param prompt - Caller-supplied prompt; may be undefined or blank.
+ * @returns The trimmed prompt, or the default prompt when nothing but whitespace remains.
+ */
+function resolvePrompt(prompt?: string): string {
+  const cleaned = (prompt ?? "").trim();
+  return cleaned.length > 0 ? cleaned : DEFAULT_GOOGLE_VIDEO_PROMPT;
+}
+
+/**
+ * Describes a video by sending it inline to the Gemini `generateContent` endpoint.
+ *
+ * The video bytes are base64-encoded into the JSON request body next to a text prompt.
+ * Caller-supplied headers win: `content-type` and `x-goog-api-key` are only filled in when
+ * the caller has not already provided them.
+ *
+ * @param params - Video buffer plus request options (API key, timeout, and optional base
+ *   URL, model, prompt, MIME type, headers and `fetch` implementation).
+ * @returns The newline-joined text of the first candidate's parts and the model id used.
+ * @throws Error when the HTTP status is not OK, or when the response contains no text.
+ */
+export async function describeGeminiVideo(
+  params: VideoDescriptionRequest,
+): Promise<VideoDescriptionResult> {
+  const doFetch = params.fetchFn ?? fetch;
+  const apiRoot = normalizeBaseUrl(params.baseUrl, DEFAULT_GOOGLE_VIDEO_BASE_URL);
+  const model = resolveModel(params.model);
+  const endpoint = `${apiRoot}/models/${model}:generateContent`;
+
+  // Defaults are applied only for headers the caller has not set.
+  const requestHeaders = new Headers(params.headers);
+  const headerDefaults: Array<[string, string]> = [
+    ["content-type", "application/json"],
+    ["x-goog-api-key", params.apiKey],
+  ];
+  for (const [name, value] of headerDefaults) {
+    if (!requestHeaders.has(name)) requestHeaders.set(name, value);
+  }
+
+  const videoPart = {
+    inline_data: {
+      mime_type: params.mime ?? "video/mp4",
+      data: params.buffer.toString("base64"),
+    },
+  };
+  const requestBody = {
+    contents: [{ role: "user", parts: [{ text: resolvePrompt(params.prompt) }, videoPart] }],
+  };
+
+  const response = await fetchWithTimeout(
+    endpoint,
+    { method: "POST", headers: requestHeaders, body: JSON.stringify(requestBody) },
+    params.timeoutMs,
+    doFetch,
+  );
+
+  if (!response.ok) {
+    const detail = await readErrorResponse(response);
+    const message = `Video description failed (HTTP ${response.status})`;
+    throw new Error(detail ? `${message}: ${detail}` : message);
+  }
+
+  type GeminiPayload = {
+    candidates?: Array<{ content?: { parts?: Array<{ text?: string }> } }>;
+  };
+  const payload = (await response.json()) as GeminiPayload;
+
+  // Only the first candidate is read; blank parts are skipped and the rest are trimmed.
+  const texts: string[] = [];
+  for (const part of payload.candidates?.[0]?.content?.parts ?? []) {
+    const piece = part?.text?.trim();
+    if (piece) texts.push(piece);
+  }
+  if (texts.length === 0) {
+    throw new Error("Video description response missing text");
+  }
+  return { text: texts.join("\n"), model };
+}
--- a/src/media-understanding/providers/groq/index.ts
+++ b/src/media-understanding/providers/groq/index.ts
@@ -0,0 +1,13 @@
+import type { MediaUnderstandingProvider } from "../../types.js";
+import { transcribeOpenAiCompatibleAudio } from "../openai/audio.js";
+
+/** Groq endpoint used whenever the request carries no usable `baseUrl`. */
+const DEFAULT_GROQ_AUDIO_BASE_URL = "https://api.groq.com/openai/v1";
+
+/**
+ * Groq media-understanding provider.
+ *
+ * Audio transcription is delegated to `transcribeOpenAiCompatibleAudio`, with the Groq
+ * endpoint substituted as the default base URL.
+ */
+export const groqProvider: MediaUnderstandingProvider = {
+  id: "groq",
+  transcribeAudio: (req) =>
+    transcribeOpenAiCompatibleAudio({
+      ...req,
+      // `||` on the trimmed value, not `??`: an empty or whitespace-only baseUrl must pick
+      // the Groq default here. If it were forwarded, the shared transcriber would treat it
+      // as missing and fall back to the OpenAI base URL, sending the Groq key there.
+      baseUrl: req.baseUrl?.trim() || DEFAULT_GROQ_AUDIO_BASE_URL,
+    }),
+};
--- a/src/media-understanding/providers/index.ts
+++ b/src/media-understanding/providers/index.ts
@@ -0,0 +1,35 @@
+import { normalizeProviderId } from "../../agents/model-selection.js";
+import type { MediaUnderstandingProvider } from "../types.js";
+import { googleProvider } from "./google/index.js";
+import { groqProvider } from "./groq/index.js";
+import { openaiProvider } from "./openai/index.js";
+
+// Built-in providers; buildMediaUnderstandingRegistry registers them before any overrides.
+const PROVIDERS: MediaUnderstandingProvider[] = [groqProvider, openaiProvider, googleProvider];
+
+/**
+ * Canonicalizes a media provider id.
+ *
+ * @param id - Raw provider id.
+ * @returns The result of `normalizeProviderId`, with the `"gemini"` alias mapped to `"google"`.
+ */
+export function normalizeMediaProviderId(id: string): string {
+  const canonical = normalizeProviderId(id);
+  return canonical === "gemini" ? "google" : canonical;
+}
+
+/**
+ * Builds the provider lookup table, keyed by normalized provider id.
+ *
+ * @param overrides - Optional providers keyed by id; each key is normalized before use.
+ * @returns A new map holding the built-in providers, with any override replacing the entry
+ *   that shares its normalized key.
+ */
+export function buildMediaUnderstandingRegistry(
+  overrides?: Record<string, MediaUnderstandingProvider>,
+): Map<string, MediaUnderstandingProvider> {
+  const builtIns = PROVIDERS.map(
+    (provider): [string, MediaUnderstandingProvider] => [provider.id, provider],
+  );
+  const registry = new Map<string, MediaUnderstandingProvider>();
+  // Overrides are applied last so they win over a built-in with the same normalized key.
+  for (const [rawId, provider] of [...builtIns, ...Object.entries(overrides ?? {})]) {
+    registry.set(normalizeMediaProviderId(rawId), provider);
+  }
+  return registry;
+}
+
+/**
+ * Looks up a provider by id.
+ *
+ * @param id - Provider id; normalized with `normalizeMediaProviderId` before the lookup.
+ * @param registry - Map to search, keyed by normalized provider id.
+ * @returns The matching provider, or `undefined` when the registry has no such key.
+ */
+export function getMediaUnderstandingProvider(
+  id: string,
+  registry: Map<string, MediaUnderstandingProvider>,
+): MediaUnderstandingProvider | undefined {
+  const key = normalizeMediaProviderId(id);
+  return registry.get(key);
+}
--- a/src/media-understanding/providers/openai/audio.test.ts
+++ b/src/media-understanding/providers/openai/audio.test.ts
@@ -0,0 +1,86 @@
+import { describe, expect, it } from "vitest";
+
+import { transcribeOpenAiCompatibleAudio } from "./audio.js";
+
+/** Reduces any `fetch` input (string, `URL`, or request object) to its URL string. */
+const resolveRequestUrl = (input: RequestInfo | URL): string => {
+  if (input instanceof URL) return input.toString();
+  return typeof input === "string" ? input : input.url;
+};
+
+// Exercises transcribeOpenAiCompatibleAudio against a stubbed fetch: authorization override
+// precedence, and the exact multipart request it builds (URL, method, headers, form fields).
+describe("transcribeOpenAiCompatibleAudio", () => {
+  /** Builds the 200 JSON response the stubbed fetch returns for a transcript. */
+  const transcriptResponse = (text: string) =>
+    new Response(JSON.stringify({ text }), {
+      status: 200,
+      headers: { "content-type": "application/json" },
+    });
+
+  it("respects lowercase authorization header overrides", async () => {
+    let sentAuth: string | null = null;
+    const fetchFn = async (_input: RequestInfo | URL, init?: RequestInit) => {
+      sentAuth = new Headers(init?.headers).get("authorization");
+      return transcriptResponse("ok");
+    };
+
+    const result = await transcribeOpenAiCompatibleAudio({
+      buffer: Buffer.from("audio"),
+      fileName: "note.mp3",
+      apiKey: "test-key",
+      timeoutMs: 1000,
+      headers: { authorization: "Bearer override" },
+      fetchFn,
+    });
+
+    expect(sentAuth).toBe("Bearer override");
+    expect(result.text).toBe("ok");
+  });
+
+  it("builds the expected request payload", async () => {
+    let sentUrl: string | null = null;
+    let sentInit: RequestInit | undefined;
+    const fetchFn = async (input: RequestInfo | URL, init?: RequestInit) => {
+      sentUrl = resolveRequestUrl(input);
+      sentInit = init;
+      return transcriptResponse("hello");
+    };
+
+    // Blank model and padded language/prompt: defaults and trimming are asserted below.
+    const result = await transcribeOpenAiCompatibleAudio({
+      buffer: Buffer.from("audio-bytes"),
+      fileName: "voice.wav",
+      apiKey: "test-key",
+      timeoutMs: 1234,
+      baseUrl: "https://api.example.com/v1/",
+      model: " ",
+      language: " en ",
+      prompt: " hello ",
+      mime: "audio/wav",
+      headers: { "X-Custom": "1" },
+      fetchFn,
+    });
+
+    expect(result.model).toBe("whisper-1");
+    expect(result.text).toBe("hello");
+    expect(sentUrl).toBe("https://api.example.com/v1/audio/transcriptions");
+    expect(sentInit?.method).toBe("POST");
+    expect(sentInit?.signal).toBeInstanceOf(AbortSignal);
+
+    const sentHeaders = new Headers(sentInit?.headers);
+    expect(sentHeaders.get("authorization")).toBe("Bearer test-key");
+    expect(sentHeaders.get("x-custom")).toBe("1");
+
+    const form = sentInit?.body as FormData;
+    expect(form).toBeInstanceOf(FormData);
+    const expectedFields = { model: "whisper-1", language: "en", prompt: "hello" };
+    for (const [field, value] of Object.entries(expectedFields)) {
+      expect(form.get(field)).toBe(value);
+    }
+
+    const upload = form.get("file") as Blob | { type?: string; name?: string } | null;
+    expect(upload).not.toBeNull();
+    if (!upload) return;
+    expect(upload.type).toBe("audio/wav");
+    // The name is only checked when the form entry actually exposes one.
+    if ("name" in upload && typeof upload.name === "string") {
+      expect(upload.name).toBe("voice.wav");
+    }
+  });
+});
--- a/src/media-understanding/providers/openai/audio.ts
+++ b/src/media-understanding/providers/openai/audio.ts
@@ -0,0 +1,61 @@
+import path from "node:path";
+
+import type { AudioTranscriptionRequest, AudioTranscriptionResult } from "../../types.js";
+import { fetchWithTimeout, normalizeBaseUrl, readErrorResponse } from "../shared.js";
+
+export const DEFAULT_OPENAI_AUDIO_BASE_URL = "https://api.openai.com/v1"; // used when baseUrl is blank
+const DEFAULT_OPENAI_AUDIO_MODEL = "whisper-1"; // used when the request names no model
+
+/**
+ * Picks the transcription model id for a request.
+ *
+ * @param model - Caller-supplied model id; may be undefined or blank.
+ * @returns The trimmed id, or the default audio model when nothing but whitespace remains.
+ */
+function resolveModel(model?: string): string {
+  const requested = (model ?? "").trim();
+  return requested.length > 0 ? requested : DEFAULT_OPENAI_AUDIO_MODEL;
+}
+
+/**
+ * Transcribes audio through an OpenAI-compatible `/audio/transcriptions` endpoint.
+ *
+ * The audio is uploaded as multipart form data together with the model and, when they are
+ * non-blank, the language and prompt. A caller-supplied `authorization` header wins over
+ * the `Bearer <apiKey>` default.
+ *
+ * @param params - Audio buffer plus request options (file name, API key, timeout, and
+ *   optional base URL, model, language, prompt, MIME type, headers and `fetch`
+ *   implementation).
+ * @returns The trimmed transcript text and the model id that was sent.
+ * @throws Error when the HTTP status is not OK, or when the response has no text.
+ */
+export async function transcribeOpenAiCompatibleAudio(
+  params: AudioTranscriptionRequest,
+): Promise<AudioTranscriptionResult> {
+  const fetchFn = params.fetchFn ?? fetch;
+  const baseUrl = normalizeBaseUrl(params.baseUrl, DEFAULT_OPENAI_AUDIO_BASE_URL);
+  const url = `${baseUrl}/audio/transcriptions`;
+
+  const model = resolveModel(params.model);
+  const form = new FormData();
+  // Send only the base name of the trimmed file name. A missing, blank, or directory-only
+  // name falls back to "audio" instead of throwing or being uploaded as whitespace.
+  const fileName = path.basename(params.fileName?.trim() ?? "").trim() || "audio";
+  const bytes = new Uint8Array(params.buffer);
+  const blob = new Blob([bytes], {
+    type: params.mime ?? "application/octet-stream",
+  });
+  form.append("file", blob, fileName);
+  form.append("model", model);
+  // Optional fields are omitted entirely when blank rather than sent as empty strings.
+  if (params.language?.trim()) form.append("language", params.language.trim());
+  if (params.prompt?.trim()) form.append("prompt", params.prompt.trim());
+
+  const headers = new Headers(params.headers);
+  if (!headers.has("authorization")) {
+    headers.set("authorization", `Bearer ${params.apiKey}`);
+  }
+
+  const res = await fetchWithTimeout(
+    url,
+    {
+      method: "POST",
+      headers,
+      body: form,
+    },
+    params.timeoutMs,
+    fetchFn,
+  );
+
+  if (!res.ok) {
+    const detail = await readErrorResponse(res);
+    const suffix = detail ? `: ${detail}` : "";
+    throw new Error(`Audio transcription failed (HTTP ${res.status})${suffix}`);
+  }
+
+  const payload = (await res.json()) as { text?: string };
+  const text = payload.text?.trim();
+  if (!text) {
+    throw new Error("Audio transcription response missing text");
+  }
+  return { text, model };
+}
--- a/src/media-understanding/providers/openai/index.ts
+++ b/src/media-understanding/providers/openai/index.ts
@@ -0,0 +1,7 @@
+import type { MediaUnderstandingProvider } from "../../types.js";
+import { transcribeOpenAiCompatibleAudio } from "./audio.js";
+
+export const openaiProvider: MediaUnderstandingProvider = {
+  id: "openai", // id under which this provider is registered and looked up
+  transcribeAudio: transcribeOpenAiCompatibleAudio, // audio transcription is the only capability
+};
--- a/src/media-understanding/providers/shared.ts
+++ b/src/media-understanding/providers/shared.ts
@@ -0,0 +1,33 @@
+const MAX_ERROR_CHARS = 300; // longest error-body excerpt readErrorResponse returns untruncated
+
+/**
+ * Chooses a base URL and removes its trailing slashes.
+ *
+ * @param baseUrl - Caller-supplied base URL; used (trimmed) unless missing or blank.
+ * @param fallback - URL used as-is, apart from trailing-slash removal, when `baseUrl` is
+ *   missing or blank.
+ * @returns The chosen URL with every trailing `/` removed.
+ */
+export function normalizeBaseUrl(baseUrl: string | undefined, fallback: string): string {
+  const chosen = baseUrl?.trim() || fallback;
+  let end = chosen.length;
+  while (end > 0 && chosen[end - 1] === "/") end -= 1;
+  return chosen.slice(0, end);
+}
+
+/**
+ * Calls `fetchFn` with an abort signal that fires after `timeoutMs`.
+ *
+ * The timer is cleared as soon as `fetchFn` settles, so the timeout bounds the wait for the
+ * `Response` itself and not any later reading of its body. Any `signal` already present on
+ * `init` is replaced by the timeout signal.
+ *
+ * @param url - Request URL passed through to `fetchFn`.
+ * @param init - Request options; copied, with `signal` overwritten.
+ * @param timeoutMs - Delay before aborting; values below 1 are raised to 1.
+ * @param fetchFn - The `fetch` implementation to call.
+ * @returns Whatever `fetchFn` resolves to; rejections from `fetchFn` propagate unchanged.
+ */
+export async function fetchWithTimeout(
+  url: string,
+  init: RequestInit,
+  timeoutMs: number,
+  fetchFn: typeof fetch,
+): Promise<Response> {
+  const abortController = new AbortController();
+  const onTimeout = (): void => abortController.abort();
+  const timeoutHandle = setTimeout(onTimeout, Math.max(1, timeoutMs));
+  try {
+    const request: RequestInit = { ...init, signal: abortController.signal };
+    return await fetchFn(url, request);
+  } finally {
+    clearTimeout(timeoutHandle);
+  }
+}
+
+/**
+ * Reads an error response body as a short, single-line excerpt.
+ *
+ * @param res - Response whose body text is consumed.
+ * @returns The body with whitespace runs collapsed to single spaces and trimmed, cut to
+ *   `MAX_ERROR_CHARS` characters plus an ellipsis when longer; `undefined` when the body is
+ *   empty or cannot be read.
+ */
+export async function readErrorResponse(res: Response): Promise<string | undefined> {
+  let raw: string;
+  try {
+    raw = await res.text();
+  } catch {
+    // Best effort: an unreadable body simply yields no detail.
+    return undefined;
+  }
+  const oneLine = raw.replace(/\s+/g, " ").trim();
+  if (oneLine.length === 0) return undefined;
+  return oneLine.length > MAX_ERROR_CHARS ? `${oneLine.slice(0, MAX_ERROR_CHARS)}…` : oneLine;
+}