import { logWarn } from "../logger.js"; import { closeDispatcher, createPinnedDispatcher, resolvePinnedHostname, } from "../infra/net/ssrf.js"; import type { Dispatcher } from "undici"; type CanvasModule = typeof import("@napi-rs/canvas"); type PdfJsModule = typeof import("pdfjs-dist/legacy/build/pdf.mjs"); let canvasModulePromise: Promise | null = null; let pdfJsModulePromise: Promise | null = null; // Lazy-load optional PDF/image deps so non-PDF paths don't require native installs. async function loadCanvasModule(): Promise { if (!canvasModulePromise) { canvasModulePromise = import("@napi-rs/canvas").catch((err) => { canvasModulePromise = null; throw new Error( `Optional dependency @napi-rs/canvas is required for PDF image extraction: ${String(err)}`, ); }); } return canvasModulePromise; } async function loadPdfJsModule(): Promise { if (!pdfJsModulePromise) { pdfJsModulePromise = import("pdfjs-dist/legacy/build/pdf.mjs").catch((err) => { pdfJsModulePromise = null; throw new Error( `Optional dependency pdfjs-dist is required for PDF extraction: ${String(err)}`, ); }); } return pdfJsModulePromise; } export type InputImageContent = { type: "image"; data: string; mimeType: string; }; export type InputFileExtractResult = { filename: string; text?: string; images?: InputImageContent[]; }; export type InputPdfLimits = { maxPages: number; maxPixels: number; minTextChars: number; }; export type InputFileLimits = { allowUrl: boolean; allowedMimes: Set; maxBytes: number; maxChars: number; maxRedirects: number; timeoutMs: number; pdf: InputPdfLimits; }; export type InputImageLimits = { allowUrl: boolean; allowedMimes: Set; maxBytes: number; maxRedirects: number; timeoutMs: number; }; export type InputImageSource = { type: "base64" | "url"; data?: string; url?: string; mediaType?: string; }; export type InputFileSource = { type: "base64" | "url"; data?: string; url?: string; mediaType?: string; filename?: string; }; export type InputFetchResult = { buffer: Buffer; mimeType: string; contentType?: string; }; export const DEFAULT_INPUT_IMAGE_MIMES = ["image/jpeg", "image/png", "image/gif", "image/webp"]; export const DEFAULT_INPUT_FILE_MIMES = [ "text/plain", "text/markdown", "text/html", "text/csv", "application/json", "application/pdf", ]; export const DEFAULT_INPUT_IMAGE_MAX_BYTES = 10 * 1024 * 1024; export const DEFAULT_INPUT_FILE_MAX_BYTES = 5 * 1024 * 1024; export const DEFAULT_INPUT_FILE_MAX_CHARS = 200_000; export const DEFAULT_INPUT_MAX_REDIRECTS = 3; export const DEFAULT_INPUT_TIMEOUT_MS = 10_000; export const DEFAULT_INPUT_PDF_MAX_PAGES = 4; export const DEFAULT_INPUT_PDF_MAX_PIXELS = 4_000_000; export const DEFAULT_INPUT_PDF_MIN_TEXT_CHARS = 200; function isRedirectStatus(status: number): boolean { return status === 301 || status === 302 || status === 303 || status === 307 || status === 308; } export function normalizeMimeType(value: string | undefined): string | undefined { if (!value) return undefined; const [raw] = value.split(";"); const normalized = raw?.trim().toLowerCase(); return normalized || undefined; } export function parseContentType(value: string | undefined): { mimeType?: string; charset?: string; } { if (!value) return {}; const parts = value.split(";").map((part) => part.trim()); const mimeType = normalizeMimeType(parts[0]); const charset = parts .map((part) => part.match(/^charset=(.+)$/i)?.[1]?.trim()) .find((part) => part && part.length > 0); return { mimeType, charset }; } export function normalizeMimeList(values: string[] | undefined, fallback: string[]): Set { const input = values && values.length > 0 ? values : fallback; return new Set(input.map((value) => normalizeMimeType(value)).filter(Boolean) as string[]); } export async function fetchWithGuard(params: { url: string; maxBytes: number; timeoutMs: number; maxRedirects: number; }): Promise { let currentUrl = params.url; let redirectCount = 0; const controller = new AbortController(); const timeoutId = setTimeout(() => controller.abort(), params.timeoutMs); try { while (true) { const parsedUrl = new URL(currentUrl); if (!["http:", "https:"].includes(parsedUrl.protocol)) { throw new Error(`Invalid URL protocol: ${parsedUrl.protocol}. Only HTTP/HTTPS allowed.`); } const pinned = await resolvePinnedHostname(parsedUrl.hostname); const dispatcher = createPinnedDispatcher(pinned); try { const response = await fetch(parsedUrl, { signal: controller.signal, headers: { "User-Agent": "Clawdbot-Gateway/1.0" }, redirect: "manual", dispatcher, } as RequestInit & { dispatcher: Dispatcher }); if (isRedirectStatus(response.status)) { const location = response.headers.get("location"); if (!location) { throw new Error(`Redirect missing location header (${response.status})`); } redirectCount += 1; if (redirectCount > params.maxRedirects) { throw new Error(`Too many redirects (limit: ${params.maxRedirects})`); } void response.body?.cancel(); currentUrl = new URL(location, parsedUrl).toString(); continue; } if (!response.ok) { throw new Error(`Failed to fetch: ${response.status} ${response.statusText}`); } const contentLength = response.headers.get("content-length"); if (contentLength) { const size = parseInt(contentLength, 10); if (size > params.maxBytes) { throw new Error(`Content too large: ${size} bytes (limit: ${params.maxBytes} bytes)`); } } const buffer = Buffer.from(await response.arrayBuffer()); if (buffer.byteLength > params.maxBytes) { throw new Error( `Content too large: ${buffer.byteLength} bytes (limit: ${params.maxBytes} bytes)`, ); } const contentType = response.headers.get("content-type") || undefined; const parsed = parseContentType(contentType); const mimeType = parsed.mimeType ?? "application/octet-stream"; return { buffer, mimeType, contentType }; } finally { await closeDispatcher(dispatcher); } } } finally { clearTimeout(timeoutId); } } function decodeTextContent(buffer: Buffer, charset: string | undefined): string { const encoding = charset?.trim().toLowerCase() || "utf-8"; try { return new TextDecoder(encoding).decode(buffer); } catch { return new TextDecoder("utf-8").decode(buffer); } } function clampText(text: string, maxChars: number): string { if (text.length <= maxChars) return text; return text.slice(0, maxChars); } async function extractPdfContent(params: { buffer: Buffer; limits: InputFileLimits; }): Promise<{ text: string; images: InputImageContent[] }> { const { buffer, limits } = params; const { getDocument } = await loadPdfJsModule(); const pdf = await getDocument({ data: new Uint8Array(buffer), disableWorker: true, }).promise; const maxPages = Math.min(pdf.numPages, limits.pdf.maxPages); const textParts: string[] = []; for (let pageNum = 1; pageNum <= maxPages; pageNum += 1) { const page = await pdf.getPage(pageNum); const textContent = await page.getTextContent(); const pageText = textContent.items .map((item) => ("str" in item ? String(item.str) : "")) .filter(Boolean) .join(" "); if (pageText) textParts.push(pageText); } const text = textParts.join("\n\n"); if (text.trim().length >= limits.pdf.minTextChars) { return { text, images: [] }; } let canvasModule: CanvasModule; try { canvasModule = await loadCanvasModule(); } catch (err) { logWarn(`media: PDF image extraction skipped; ${String(err)}`); return { text, images: [] }; } const { createCanvas } = canvasModule; const images: InputImageContent[] = []; for (let pageNum = 1; pageNum <= maxPages; pageNum += 1) { const page = await pdf.getPage(pageNum); const viewport = page.getViewport({ scale: 1 }); const maxPixels = limits.pdf.maxPixels; const pixelBudget = Math.max(1, maxPixels); const pagePixels = viewport.width * viewport.height; const scale = Math.min(1, Math.sqrt(pixelBudget / pagePixels)); const scaled = page.getViewport({ scale: Math.max(0.1, scale) }); const canvas = createCanvas(Math.ceil(scaled.width), Math.ceil(scaled.height)); await page.render({ canvas: canvas as unknown as HTMLCanvasElement, viewport: scaled, }).promise; const png = canvas.toBuffer("image/png"); images.push({ type: "image", data: png.toString("base64"), mimeType: "image/png" }); } return { text, images }; } export async function extractImageContentFromSource( source: InputImageSource, limits: InputImageLimits, ): Promise { if (source.type === "base64") { if (!source.data) { throw new Error("input_image base64 source missing 'data' field"); } const mimeType = normalizeMimeType(source.mediaType) ?? "image/png"; if (!limits.allowedMimes.has(mimeType)) { throw new Error(`Unsupported image MIME type: ${mimeType}`); } const buffer = Buffer.from(source.data, "base64"); if (buffer.byteLength > limits.maxBytes) { throw new Error( `Image too large: ${buffer.byteLength} bytes (limit: ${limits.maxBytes} bytes)`, ); } return { type: "image", data: source.data, mimeType }; } if (source.type === "url" && source.url) { if (!limits.allowUrl) { throw new Error("input_image URL sources are disabled by config"); } const result = await fetchWithGuard({ url: source.url, maxBytes: limits.maxBytes, timeoutMs: limits.timeoutMs, maxRedirects: limits.maxRedirects, }); if (!limits.allowedMimes.has(result.mimeType)) { throw new Error(`Unsupported image MIME type from URL: ${result.mimeType}`); } return { type: "image", data: result.buffer.toString("base64"), mimeType: result.mimeType }; } throw new Error("input_image must have 'source.url' or 'source.data'"); } export async function extractFileContentFromSource(params: { source: InputFileSource; limits: InputFileLimits; }): Promise { const { source, limits } = params; const filename = source.filename || "file"; let buffer: Buffer; let mimeType: string | undefined; let charset: string | undefined; if (source.type === "base64") { if (!source.data) { throw new Error("input_file base64 source missing 'data' field"); } const parsed = parseContentType(source.mediaType); mimeType = parsed.mimeType; charset = parsed.charset; buffer = Buffer.from(source.data, "base64"); } else if (source.type === "url" && source.url) { if (!limits.allowUrl) { throw new Error("input_file URL sources are disabled by config"); } const result = await fetchWithGuard({ url: source.url, maxBytes: limits.maxBytes, timeoutMs: limits.timeoutMs, maxRedirects: limits.maxRedirects, }); const parsed = parseContentType(result.contentType); mimeType = parsed.mimeType ?? normalizeMimeType(result.mimeType); charset = parsed.charset; buffer = result.buffer; } else { throw new Error("input_file must have 'source.url' or 'source.data'"); } if (buffer.byteLength > limits.maxBytes) { throw new Error(`File too large: ${buffer.byteLength} bytes (limit: ${limits.maxBytes} bytes)`); } if (!mimeType) { throw new Error("input_file missing media type"); } if (!limits.allowedMimes.has(mimeType)) { throw new Error(`Unsupported file MIME type: ${mimeType}`); } if (mimeType === "application/pdf") { const extracted = await extractPdfContent({ buffer, limits }); const text = extracted.text ? clampText(extracted.text, limits.maxChars) : ""; return { filename, text, images: extracted.images.length > 0 ? extracted.images : undefined, }; } const text = clampText(decodeTextContent(buffer, charset), limits.maxChars); return { filename, text }; }