refactor: share responses input handling

src/media/input-files.ts (new file, 385 lines)

@@ -0,0 +1,385 @@
import { lookup } from "node:dns/promises";

import { createCanvas } from "@napi-rs/canvas";
import { getDocument } from "pdfjs-dist/legacy/build/pdf.mjs";

export type InputImageContent = {
  type: "image";
  data: string;
  mimeType: string;
};

export type InputFileExtractResult = {
  filename: string;
  text?: string;
  images?: InputImageContent[];
};

export type InputPdfLimits = {
  maxPages: number;
  maxPixels: number;
  minTextChars: number;
};

export type InputFileLimits = {
  allowUrl: boolean;
  allowedMimes: Set<string>;
  maxBytes: number;
  maxChars: number;
  maxRedirects: number;
  timeoutMs: number;
  pdf: InputPdfLimits;
};

export type InputImageLimits = {
  allowUrl: boolean;
  allowedMimes: Set<string>;
  maxBytes: number;
  maxRedirects: number;
  timeoutMs: number;
};

export type InputImageSource = {
  type: "base64" | "url";
  data?: string;
  url?: string;
  mediaType?: string;
};

export type InputFileSource = {
  type: "base64" | "url";
  data?: string;
  url?: string;
  mediaType?: string;
  filename?: string;
};

export type InputFetchResult = {
  buffer: Buffer;
  mimeType: string;
  contentType?: string;
};

export const DEFAULT_INPUT_IMAGE_MIMES = ["image/jpeg", "image/png", "image/gif", "image/webp"];
export const DEFAULT_INPUT_FILE_MIMES = [
  "text/plain",
  "text/markdown",
  "text/html",
  "text/csv",
  "application/json",
  "application/pdf",
];
export const DEFAULT_INPUT_IMAGE_MAX_BYTES = 10 * 1024 * 1024;
export const DEFAULT_INPUT_FILE_MAX_BYTES = 5 * 1024 * 1024;
export const DEFAULT_INPUT_FILE_MAX_CHARS = 200_000;
export const DEFAULT_INPUT_MAX_REDIRECTS = 3;
export const DEFAULT_INPUT_TIMEOUT_MS = 10_000;
export const DEFAULT_INPUT_PDF_MAX_PAGES = 4;
export const DEFAULT_INPUT_PDF_MAX_PIXELS = 4_000_000;
export const DEFAULT_INPUT_PDF_MIN_TEXT_CHARS = 200;

// Conservative SSRF guard: block loopback, RFC 1918, and "this network" IPv4 ranges, plus
// IPv6 loopback, link-local, site-local, and unique-local prefixes.
const PRIVATE_IPV4_PATTERNS = [
  /^127\./,
  /^10\./,
  /^192\.168\./,
  /^172\.(1[6-9]|2[0-9]|3[0-1])\./,
  /^0\./,
];
const PRIVATE_IPV6_PREFIXES = ["::1", "fe80:", "fec0:", "fc", "fd"];

function isPrivateIpAddress(address: string): boolean {
  if (address.includes(":")) {
    const lower = address.toLowerCase();
    if (lower === "::1") return true;
    return PRIVATE_IPV6_PREFIXES.some((prefix) => lower.startsWith(prefix));
  }
  return PRIVATE_IPV4_PATTERNS.some((pattern) => pattern.test(address));
}

function isBlockedHostname(hostname: string): boolean {
  const lower = hostname.toLowerCase();
  return (
    lower === "localhost" ||
    lower.endsWith(".localhost") ||
    lower.endsWith(".local") ||
    lower.endsWith(".internal")
  );
}

// Resolve the hostname and refuse the fetch if any resolved address is private.
async function assertPublicHostname(hostname: string): Promise<void> {
  if (isBlockedHostname(hostname)) {
    throw new Error(`Blocked hostname: ${hostname}`);
  }

  const results = await lookup(hostname, { all: true });
  if (results.length === 0) {
    throw new Error(`Unable to resolve hostname: ${hostname}`);
  }
  for (const entry of results) {
    if (isPrivateIpAddress(entry.address)) {
      throw new Error(`Private IP addresses are not allowed: ${entry.address}`);
    }
  }
}

function isRedirectStatus(status: number): boolean {
  return status === 301 || status === 302 || status === 303 || status === 307 || status === 308;
}

export function normalizeMimeType(value: string | undefined): string | undefined {
  if (!value) return undefined;
  const [raw] = value.split(";");
  const normalized = raw?.trim().toLowerCase();
  return normalized || undefined;
}

export function parseContentType(value: string | undefined): {
  mimeType?: string;
  charset?: string;
} {
  if (!value) return {};
  const parts = value.split(";").map((part) => part.trim());
  const mimeType = normalizeMimeType(parts[0]);
  const charset = parts
    .map((part) => part.match(/^charset=(.+)$/i)?.[1]?.trim())
    .find((part) => part && part.length > 0);
  return { mimeType, charset };
}

export function normalizeMimeList(values: string[] | undefined, fallback: string[]): Set<string> {
  const input = values && values.length > 0 ? values : fallback;
  return new Set(input.map((value) => normalizeMimeType(value)).filter(Boolean) as string[]);
}
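
// Illustrative behaviour of the MIME helpers above (a reader-facing sketch, not a documented
// contract; the expected values simply follow from the parsing code):
//
//   normalizeMimeType("Application/JSON; charset=utf-8")
//     // => "application/json"
//   parseContentType("text/html; charset=ISO-8859-1")
//     // => { mimeType: "text/html", charset: "ISO-8859-1" }
//   normalizeMimeList(undefined, DEFAULT_INPUT_FILE_MIMES)
//     // => Set of the default file MIME types, lowercased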

// Fetch a remote resource with protocol, SSRF, size, timeout, and redirect-count guards.
// Redirects are followed manually so every hop is re-validated against the hostname checks above.
export async function fetchWithGuard(params: {
  url: string;
  maxBytes: number;
  timeoutMs: number;
  maxRedirects: number;
}): Promise<InputFetchResult> {
  let currentUrl = params.url;
  let redirectCount = 0;

  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), params.timeoutMs);

  try {
    while (true) {
      const parsedUrl = new URL(currentUrl);
      if (!["http:", "https:"].includes(parsedUrl.protocol)) {
        throw new Error(`Invalid URL protocol: ${parsedUrl.protocol}. Only HTTP/HTTPS allowed.`);
      }
      await assertPublicHostname(parsedUrl.hostname);

      const response = await fetch(parsedUrl, {
        signal: controller.signal,
        headers: { "User-Agent": "Clawdbot-Gateway/1.0" },
        redirect: "manual",
      });

      if (isRedirectStatus(response.status)) {
        const location = response.headers.get("location");
        if (!location) {
          throw new Error(`Redirect missing location header (${response.status})`);
        }
        redirectCount += 1;
        if (redirectCount > params.maxRedirects) {
          throw new Error(`Too many redirects (limit: ${params.maxRedirects})`);
        }
        currentUrl = new URL(location, parsedUrl).toString();
        continue;
      }

      if (!response.ok) {
        throw new Error(`Failed to fetch: ${response.status} ${response.statusText}`);
      }

      const contentLength = response.headers.get("content-length");
      if (contentLength) {
        const size = parseInt(contentLength, 10);
        if (size > params.maxBytes) {
          throw new Error(`Content too large: ${size} bytes (limit: ${params.maxBytes} bytes)`);
        }
      }

      const buffer = Buffer.from(await response.arrayBuffer());
      if (buffer.byteLength > params.maxBytes) {
        throw new Error(
          `Content too large: ${buffer.byteLength} bytes (limit: ${params.maxBytes} bytes)`,
        );
      }

      const contentType = response.headers.get("content-type") || undefined;
      const parsed = parseContentType(contentType);
      const mimeType = parsed.mimeType ?? "application/octet-stream";
      return { buffer, mimeType, contentType };
    }
  } finally {
    clearTimeout(timeoutId);
  }
}
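
// Example call (a sketch only; the URL and the limit values shown are placeholders, not values the
// gateway necessarily uses):
//
//   const fetched = await fetchWithGuard({
//     url: "https://example.com/report.pdf",
//     maxBytes: DEFAULT_INPUT_FILE_MAX_BYTES,
//     timeoutMs: DEFAULT_INPUT_TIMEOUT_MS,
//     maxRedirects: DEFAULT_INPUT_MAX_REDIRECTS,
//   });
//   // fetched.mimeType is "application/pdf" when the server sends that Content-Type, otherwise
//   // it falls back to "application/octet-stream".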

function decodeTextContent(buffer: Buffer, charset: string | undefined): string {
  const encoding = charset?.trim().toLowerCase() || "utf-8";
  try {
    return new TextDecoder(encoding).decode(buffer);
  } catch {
    return new TextDecoder("utf-8").decode(buffer);
  }
}

function clampText(text: string, maxChars: number): string {
  if (text.length <= maxChars) return text;
  return text.slice(0, maxChars);
}

// Extract PDF text first; if too little text is found (e.g. a scanned document), rasterize the
// first pages to PNG within the configured pixel budget instead.
async function extractPdfContent(params: {
  buffer: Buffer;
  limits: InputFileLimits;
}): Promise<{ text: string; images: InputImageContent[] }> {
  const { buffer, limits } = params;
  const pdf = await getDocument({
    data: new Uint8Array(buffer),
    // @ts-expect-error pdfjs-dist legacy option not in current type defs.
    disableWorker: true,
  }).promise;
  const maxPages = Math.min(pdf.numPages, limits.pdf.maxPages);
  const textParts: string[] = [];

  for (let pageNum = 1; pageNum <= maxPages; pageNum += 1) {
    const page = await pdf.getPage(pageNum);
    const textContent = await page.getTextContent();
    const pageText = textContent.items
      .map((item) => ("str" in item ? String(item.str) : ""))
      .filter(Boolean)
      .join(" ");
    if (pageText) textParts.push(pageText);
  }

  const text = textParts.join("\n\n");
  if (text.trim().length >= limits.pdf.minTextChars) {
    return { text, images: [] };
  }

  const images: InputImageContent[] = [];
  for (let pageNum = 1; pageNum <= maxPages; pageNum += 1) {
    const page = await pdf.getPage(pageNum);
    const viewport = page.getViewport({ scale: 1 });
    const maxPixels = limits.pdf.maxPixels;
    const pixelBudget = Math.max(1, maxPixels);
    const pagePixels = viewport.width * viewport.height;
    const scale = Math.min(1, Math.sqrt(pixelBudget / pagePixels));
    const scaled = page.getViewport({ scale: Math.max(0.1, scale) });
    const canvas = createCanvas(Math.ceil(scaled.width), Math.ceil(scaled.height));
    await page.render({
      canvas: canvas as unknown as HTMLCanvasElement,
      viewport: scaled,
    }).promise;
    const png = canvas.toBuffer("image/png");
    images.push({ type: "image", data: png.toString("base64"), mimeType: "image/png" });
  }

  return { text, images };
}

// Resolve an input_image source (inline base64 or remote URL) into base64 image content,
// enforcing the MIME allow-list and size limit.
export async function extractImageContentFromSource(
  source: InputImageSource,
  limits: InputImageLimits,
): Promise<InputImageContent> {
  if (source.type === "base64") {
    if (!source.data) {
      throw new Error("input_image base64 source missing 'data' field");
    }
    const mimeType = normalizeMimeType(source.mediaType) ?? "image/png";
    if (!limits.allowedMimes.has(mimeType)) {
      throw new Error(`Unsupported image MIME type: ${mimeType}`);
    }
    const buffer = Buffer.from(source.data, "base64");
    if (buffer.byteLength > limits.maxBytes) {
      throw new Error(
        `Image too large: ${buffer.byteLength} bytes (limit: ${limits.maxBytes} bytes)`,
      );
    }
    return { type: "image", data: source.data, mimeType };
  }

  if (source.type === "url" && source.url) {
    if (!limits.allowUrl) {
      throw new Error("input_image URL sources are disabled by config");
    }
    const result = await fetchWithGuard({
      url: source.url,
      maxBytes: limits.maxBytes,
      timeoutMs: limits.timeoutMs,
      maxRedirects: limits.maxRedirects,
    });
    if (!limits.allowedMimes.has(result.mimeType)) {
      throw new Error(`Unsupported image MIME type from URL: ${result.mimeType}`);
    }
    return { type: "image", data: result.buffer.toString("base64"), mimeType: result.mimeType };
  }

  throw new Error("input_image must have 'source.url' or 'source.data'");
}

// Resolve an input_file source into text (and, for low-text PDFs, page images), enforcing the
// MIME allow-list plus byte and character limits.
export async function extractFileContentFromSource(params: {
  source: InputFileSource;
  limits: InputFileLimits;
}): Promise<InputFileExtractResult> {
  const { source, limits } = params;
  const filename = source.filename || "file";

  let buffer: Buffer;
  let mimeType: string | undefined;
  let charset: string | undefined;

  if (source.type === "base64") {
    if (!source.data) {
      throw new Error("input_file base64 source missing 'data' field");
    }
    const parsed = parseContentType(source.mediaType);
    mimeType = parsed.mimeType;
    charset = parsed.charset;
    buffer = Buffer.from(source.data, "base64");
  } else if (source.type === "url" && source.url) {
    if (!limits.allowUrl) {
      throw new Error("input_file URL sources are disabled by config");
    }
    const result = await fetchWithGuard({
      url: source.url,
      maxBytes: limits.maxBytes,
      timeoutMs: limits.timeoutMs,
      maxRedirects: limits.maxRedirects,
    });
    const parsed = parseContentType(result.contentType);
    mimeType = parsed.mimeType ?? normalizeMimeType(result.mimeType);
    charset = parsed.charset;
    buffer = result.buffer;
  } else {
    throw new Error("input_file must have 'source.url' or 'source.data'");
  }

  if (buffer.byteLength > limits.maxBytes) {
    throw new Error(`File too large: ${buffer.byteLength} bytes (limit: ${limits.maxBytes} bytes)`);
  }

  if (!mimeType) {
    throw new Error("input_file missing media type");
  }
  if (!limits.allowedMimes.has(mimeType)) {
    throw new Error(`Unsupported file MIME type: ${mimeType}`);
  }

  if (mimeType === "application/pdf") {
    const extracted = await extractPdfContent({ buffer, limits });
    const text = extracted.text ? clampText(extracted.text, limits.maxChars) : "";
    return {
      filename,
      text,
      images: extracted.images.length > 0 ? extracted.images : undefined,
    };
  }

  const text = clampText(decodeTextContent(buffer, charset), limits.maxChars);
  return { filename, text };
}
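
// End-to-end usage sketch (illustrative only; callers and config wiring live outside this file,
// and the limit values shown are just the module defaults):
//
//   const limits: InputFileLimits = {
//     allowUrl: true,
//     allowedMimes: normalizeMimeList(undefined, DEFAULT_INPUT_FILE_MIMES),
//     maxBytes: DEFAULT_INPUT_FILE_MAX_BYTES,
//     maxChars: DEFAULT_INPUT_FILE_MAX_CHARS,
//     maxRedirects: DEFAULT_INPUT_MAX_REDIRECTS,
//     timeoutMs: DEFAULT_INPUT_TIMEOUT_MS,
//     pdf: {
//       maxPages: DEFAULT_INPUT_PDF_MAX_PAGES,
//       maxPixels: DEFAULT_INPUT_PDF_MAX_PIXELS,
//       minTextChars: DEFAULT_INPUT_PDF_MIN_TEXT_CHARS,
//     },
//   };
//   const result = await extractFileContentFromSource({
//     source: { type: "url", url: "https://example.com/notes.md", filename: "notes.md" },
//     limits,
//   });
//   // result.text holds the decoded (and clamped) file text; result.images is set only for PDFs
//   // that had too little extractable text and were rasterized instead.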