From 5bd55037e450d1d3e77e1cd1baf724f6a3d4d6e1 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Wed, 21 Jan 2026 02:52:27 +0000 Subject: [PATCH] fix: harden web fetch SSRF and redirects Co-authored-by: Eli --- CHANGELOG.md | 1 + README.md | 46 +++---- docs/gateway/configuration.md | 1 + docs/tools/web.md | 2 + src/agents/tools/web-fetch.ssrf.test.ts | 160 ++++++++++++++++++++++++ src/agents/tools/web-fetch.ts | 102 ++++++++++++--- src/config/schema.ts | 2 + src/config/types.tools.ts | 2 + src/config/zod-schema.agent-runtime.ts | 1 + src/infra/net/ssrf.ts | 131 +++++++++++++++++++ src/media/input-files.ts | 46 +------ 11 files changed, 412 insertions(+), 82 deletions(-) create mode 100644 src/agents/tools/web-fetch.ssrf.test.ts create mode 100644 src/infra/net/ssrf.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index fbc5c50ad..9467dfcf4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -38,6 +38,7 @@ Docs: https://docs.clawd.bot - UI: preserve ordered list numbering in chat markdown. (#1341) — thanks @bradleypriest. - UI: allow Control UI to read gatewayUrl from URL params for remote WebSocket targets. (#1342) — thanks @ameno-. - Web search: infer Perplexity base URL from API key source (direct vs OpenRouter). +- Web fetch: harden SSRF protection with shared hostname checks and redirect limits. (#1346) — thanks @fogboots. - TUI: keep thinking blocks ordered before content during streaming and isolate per-run assembly. (#1202) — thanks @aaronveklabs. - TUI: align custom editor initialization with the latest pi-tui API. (#1298) — thanks @sibbl. - CLI: avoid duplicating --profile/--dev flags when formatting commands. diff --git a/README.md b/README.md index 28528313a..5c149af56 100644 --- a/README.md +++ b/README.md @@ -479,27 +479,27 @@ Core contributors: Thanks to all clawtributors:

- steipete bohdanpodvirnyi joaohlisboa mneves75 MatthieuBizien rahthakor vrknetha radek-paclt joshp123 mukhtharcm - maxsumrall xadenryan Tobias Bischoff juanpablodlc hsrvc magimetal meaningfool NicholasSpisak abhisekbasu1 sebslight - claude jamesgroat Hyaxia dantelex daveonkels mteam88 Eng. Juan Combetto dbhurley Mariano Belinky TSavo - julianengel benithors bradleypriest timolins nachx639 sreekaransrinath gupsammy cristip73 nachoiacovino Vasanth Rao Naik Sabavat - cpojer lc0rp scald gumadeiras andranik-sahakyan davidguttman sleontenko sircrumpet peschee rafaelreis-r - thewilloftheshadow ratulsarna lutr0 danielz1z emanuelst KristijanJovanovski CashWilliams rdev osolmaz joshrad-dev - kiranjd adityashaw2 sheeek artuskg onutc tyler6204 manuelhettich minghinmatthewlam myfunc buddyh - connorshea mcinteerj John-Rood timkrase zerone0x gerardward2007 obviyus tosh-hamburg azade-c roshanasingh4 - bjesuiter cheeeee Josh Phillips Whoaa512 YuriNachos chriseidhof ysqander superman32432432 vignesh07 Yurii Chukhlib - grp06 antons austinm911 blacksmith-sh[bot] dan-dr HeimdallStrategy imfing jalehman jarvis-medmatic kkarimi - mahmoudashraf93 petter-b pkrmf RandyVentures Ryan Lisse erikpr1994 Ghost jonasjancarik Keith the Silly Goose L36 Server - Marc mitschabaude-bot neist ngutman chrisrodz dougvk Friederike Seiler gabriel-trigo iamadig Kit - koala73 manmal ogulcancelik pasogott petradonka rubyrunsstuff sibbl suminhthanh VACInc wes-davis - zats 24601 Chris Taylor Django Navarro evalexpr henrino3 humanwritten larlyssa mkbehr oswalpalash - pcty-nextgen-service-account Syhids Aaron Konyer aaronveklabs adam91holt erik-agens fcatuhe ivanrvpereira jayhickey jeffersonwarrior - jeffersonwarrior Jonathan D. Rhyne (DJ-D) jverdi longmaba mickahouan mjrussell p6l-richard philipp-spiess robaxelsen Sash Catanzarite - T5-AndyML VAC zknicker alejandro maza andrewting19 anpoirier Asleep123 bolismauro cash-echo-bot Clawd - conhecendocontato Dimitrios Ploutarchos Drake Thomsen Felix Krause gtsifrikas HazAT hrdwdmrbl hugobarauna Jamie Openshaw Jarvis - Jefferson Nunn Kevin Lin kitze levifig Lloyd loukotal martinpucik Miles mrdbstn MSch - Mustafa Tag Eldeen ndraiman nexty5870 odysseus0 prathamdby reeltimeapps RLTCmpe rodrigouroz Rolf Fredheim Rony Kelner - Samrat Jha siraht snopoke The Admiral thesash Ubuntu voidserf wstock Zach Knickerbocker Alphonse-arianee - Azade carlulsoe ddyo Erik latitudeki5223 Manuel Maly Mourad Boustani odrobnik pcty-nextgen-ios-builder Quentin - Randy Torres rhjoh ronak-guliani William Stock + steipete bohdanpodvirnyi joaohlisboa mneves75 MatthieuBizien MaudeBot rahthakor vrknetha radek-paclt joshp123 + mukhtharcm maxsumrall xadenryan Tobias Bischoff juanpablodlc hsrvc magimetal meaningfool NicholasSpisak abhisekbasu1 + sebslight claude jamesgroat Hyaxia dantelex daveonkels mteam88 Eng. Juan Combetto dbhurley Mariano Belinky + TSavo julianengel benithors bradleypriest timolins nachx639 sreekaransrinath gupsammy cristip73 nachoiacovino + Vasanth Rao Naik Sabavat cpojer lc0rp scald gumadeiras andranik-sahakyan davidguttman sleontenko sircrumpet peschee + rafaelreis-r thewilloftheshadow ratulsarna lutr0 danielz1z emanuelst KristijanJovanovski CashWilliams rdev osolmaz + joshrad-dev kiranjd adityashaw2 sheeek artuskg onutc tyler6204 manuelhettich minghinmatthewlam myfunc + buddyh connorshea mcinteerj John-Rood timkrase zerone0x gerardward2007 obviyus tosh-hamburg azade-c + roshanasingh4 bjesuiter cheeeee Josh Phillips Whoaa512 YuriNachos chriseidhof vignesh07 ysqander superman32432432 + Yurii Chukhlib grp06 antons austinm911 blacksmith-sh[bot] dan-dr HeimdallStrategy imfing jalehman jarvis-medmatic + kkarimi mahmoudashraf93 petter-b pkrmf RandyVentures Ryan Lisse dougvk erikpr1994 Ghost jonasjancarik + Keith the Silly Goose L36 Server Marc mitschabaude-bot neist ngutman chrisrodz Friederike Seiler gabriel-trigo iamadig + Kit koala73 manmal ogulcancelik pasogott petradonka rubyrunsstuff sibbl suminhthanh VACInc + wes-davis zats 24601 Chris Taylor Django Navarro evalexpr henrino3 humanwritten larlyssa mkbehr + oswalpalash pcty-nextgen-service-account Syhids Aaron Konyer aaronveklabs adam91holt ClawdFx erik-agens fcatuhe ivanrvpereira + jayhickey jeffersonwarrior jeffersonwarrior Jonathan D. Rhyne (DJ-D) jverdi longmaba mickahouan mjrussell p6l-richard philipp-spiess + robaxelsen Sash Catanzarite T5-AndyML VAC zknicker alejandro maza ameno- andrewting19 anpoirier Asleep123 + bolismauro cash-echo-bot Clawd conhecendocontato Dimitrios Ploutarchos Drake Thomsen Felix Krause gtsifrikas HazAT hrdwdmrbl + hugobarauna Jamie Openshaw Jarvis Jefferson Nunn Kevin Lin kitze levifig Lloyd loukotal martinpucik + Miles mrdbstn MSch Mustafa Tag Eldeen ndraiman nexty5870 odysseus0 prathamdby reeltimeapps RLTCmpe + rodrigouroz Rolf Fredheim Rony Kelner Samrat Jha siraht snopoke The Admiral thesash Ubuntu voidserf + wstock Zach Knickerbocker Alphonse-arianee Azade carlulsoe ddyo Erik latitudeki5223 Manuel Maly Mourad Boustani + odrobnik pcty-nextgen-ios-builder Quentin Randy Torres rhjoh ronak-guliani William Stock

diff --git a/docs/gateway/configuration.md b/docs/gateway/configuration.md index 65d46932b..e26637029 100644 --- a/docs/gateway/configuration.md +++ b/docs/gateway/configuration.md @@ -1774,6 +1774,7 @@ Note: `applyPatch` is only under `tools.exec`. - `tools.web.fetch.maxChars` (default 50000) - `tools.web.fetch.timeoutSeconds` (default 30) - `tools.web.fetch.cacheTtlMinutes` (default 15) +- `tools.web.fetch.maxRedirects` (default 3) - `tools.web.fetch.userAgent` (optional override) - `tools.web.fetch.readability` (default true; disable to use basic HTML cleanup only) - `tools.web.fetch.firecrawl.enabled` (default true when an API key is set) diff --git a/docs/tools/web.md b/docs/tools/web.md index da73fe34e..cf1738570 100644 --- a/docs/tools/web.md +++ b/docs/tools/web.md @@ -215,6 +215,7 @@ Fetch a URL and extract readable content. maxChars: 50000, timeoutSeconds: 30, cacheTtlMinutes: 15, + maxRedirects: 3, userAgent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36", readability: true, firecrawl: { @@ -241,6 +242,7 @@ Notes: - `web_fetch` uses Readability (main-content extraction) first, then Firecrawl (if configured). If both fail, the tool returns an error. - Firecrawl requests use bot-circumvention mode and cache results by default. - `web_fetch` sends a Chrome-like User-Agent and `Accept-Language` by default; override `userAgent` if needed. +- `web_fetch` blocks private/internal hostnames and re-checks redirects (limit with `maxRedirects`). - `web_fetch` is best-effort extraction; some sites will need the browser tool. - See [Firecrawl](/tools/firecrawl) for key setup and service details. - Responses are cached (default 15 minutes) to reduce repeated fetches. diff --git a/src/agents/tools/web-fetch.ssrf.test.ts b/src/agents/tools/web-fetch.ssrf.test.ts new file mode 100644 index 000000000..24e4dfe41 --- /dev/null +++ b/src/agents/tools/web-fetch.ssrf.test.ts @@ -0,0 +1,160 @@ +import { afterEach, describe, expect, it, vi } from "vitest"; + +const lookupMock = vi.fn(); + +vi.mock("node:dns/promises", () => ({ + lookup: lookupMock, +})); + +function makeHeaders(map: Record): { get: (key: string) => string | null } { + return { + get: (key) => map[key.toLowerCase()] ?? null, + }; +} + +function redirectResponse(location: string): Response { + return { + ok: false, + status: 302, + headers: makeHeaders({ location }), + body: { cancel: vi.fn() }, + } as Response; +} + +function textResponse(body: string): Response { + return { + ok: true, + status: 200, + headers: makeHeaders({ "content-type": "text/plain" }), + text: async () => body, + } as Response; +} + +describe("web_fetch SSRF protection", () => { + const priorFetch = global.fetch; + + afterEach(() => { + // @ts-expect-error restore + global.fetch = priorFetch; + lookupMock.mockReset(); + vi.restoreAllMocks(); + }); + + it("blocks localhost hostnames before fetch/firecrawl", async () => { + const fetchSpy = vi.fn(); + // @ts-expect-error mock fetch + global.fetch = fetchSpy; + + const { createWebFetchTool } = await import("./web-tools.js"); + const tool = createWebFetchTool({ + config: { + tools: { + web: { + fetch: { + cacheTtlMinutes: 0, + firecrawl: { apiKey: "firecrawl-test" }, + }, + }, + }, + }, + }); + + await expect(tool?.execute?.("call", { url: "http://localhost/test" })).rejects.toThrow( + /Blocked hostname/i, + ); + expect(fetchSpy).not.toHaveBeenCalled(); + expect(lookupMock).not.toHaveBeenCalled(); + }); + + it("blocks private IP literals without DNS", async () => { + const fetchSpy = vi.fn(); + // @ts-expect-error mock fetch + global.fetch = fetchSpy; + + const { createWebFetchTool } = await import("./web-tools.js"); + const tool = createWebFetchTool({ + config: { + tools: { web: { fetch: { cacheTtlMinutes: 0, firecrawl: { enabled: false } } } }, + }, + }); + + await expect(tool?.execute?.("call", { url: "http://127.0.0.1/test" })).rejects.toThrow( + /private|internal|blocked/i, + ); + await expect(tool?.execute?.("call", { url: "http://[::ffff:127.0.0.1]/" })).rejects.toThrow( + /private|internal|blocked/i, + ); + expect(fetchSpy).not.toHaveBeenCalled(); + expect(lookupMock).not.toHaveBeenCalled(); + }); + + it("blocks when DNS resolves to private addresses", async () => { + lookupMock.mockImplementation(async (hostname: string) => { + if (hostname === "public.test") { + return [{ address: "93.184.216.34", family: 4 }]; + } + return [{ address: "10.0.0.5", family: 4 }]; + }); + + const fetchSpy = vi.fn(); + // @ts-expect-error mock fetch + global.fetch = fetchSpy; + + const { createWebFetchTool } = await import("./web-tools.js"); + const tool = createWebFetchTool({ + config: { + tools: { web: { fetch: { cacheTtlMinutes: 0, firecrawl: { enabled: false } } } }, + }, + }); + + await expect(tool?.execute?.("call", { url: "https://private.test/resource" })).rejects.toThrow( + /private|internal|blocked/i, + ); + expect(fetchSpy).not.toHaveBeenCalled(); + }); + + it("blocks redirects to private hosts", async () => { + lookupMock.mockResolvedValue([{ address: "93.184.216.34", family: 4 }]); + + const fetchSpy = vi.fn().mockResolvedValueOnce(redirectResponse("http://127.0.0.1/secret")); + // @ts-expect-error mock fetch + global.fetch = fetchSpy; + + const { createWebFetchTool } = await import("./web-tools.js"); + const tool = createWebFetchTool({ + config: { + tools: { + web: { + fetch: { cacheTtlMinutes: 0, firecrawl: { apiKey: "firecrawl-test" } }, + }, + }, + }, + }); + + await expect(tool?.execute?.("call", { url: "https://example.com" })).rejects.toThrow( + /private|internal|blocked/i, + ); + expect(fetchSpy).toHaveBeenCalledTimes(1); + }); + + it("allows public hosts", async () => { + lookupMock.mockResolvedValue([{ address: "93.184.216.34", family: 4 }]); + + const fetchSpy = vi.fn().mockResolvedValue(textResponse("ok")); + // @ts-expect-error mock fetch + global.fetch = fetchSpy; + + const { createWebFetchTool } = await import("./web-tools.js"); + const tool = createWebFetchTool({ + config: { + tools: { web: { fetch: { cacheTtlMinutes: 0, firecrawl: { enabled: false } } } }, + }, + }); + + const result = await tool?.execute?.("call", { url: "https://example.com" }); + expect(result?.details).toMatchObject({ + status: 200, + extractor: "raw", + }); + }); +}); diff --git a/src/agents/tools/web-fetch.ts b/src/agents/tools/web-fetch.ts index be766ed9d..c8bcaa609 100644 --- a/src/agents/tools/web-fetch.ts +++ b/src/agents/tools/web-fetch.ts @@ -1,6 +1,7 @@ import { Type } from "@sinclair/typebox"; import type { ClawdbotConfig } from "../../config/config.js"; +import { assertPublicHostname, SsrFBlockedError } from "../../infra/net/ssrf.js"; import { stringEnum } from "../schema/typebox.js"; import type { AnyAgentTool } from "./common.js"; import { jsonResult, readNumberParam, readStringParam } from "./common.js"; @@ -29,6 +30,7 @@ export { extractReadableContent } from "./web-fetch-utils.js"; const EXTRACT_MODES = ["markdown", "text"] as const; const DEFAULT_FETCH_MAX_CHARS = 50_000; +const DEFAULT_FETCH_MAX_REDIRECTS = 3; const DEFAULT_ERROR_MAX_CHARS = 4_000; const DEFAULT_FIRECRAWL_BASE_URL = "https://api.firecrawl.dev"; const DEFAULT_FIRECRAWL_MAX_AGE_MS = 172_800_000; @@ -144,6 +146,11 @@ function resolveMaxChars(value: unknown, fallback: number): number { return Math.max(100, Math.floor(parsed)); } +function resolveMaxRedirects(value: unknown, fallback: number): number { + const parsed = typeof value === "number" && Number.isFinite(value) ? value : fallback; + return Math.max(0, Math.floor(parsed)); +} + function looksLikeHtml(value: string): boolean { const trimmed = value.trimStart(); if (!trimmed) return false; @@ -151,6 +158,68 @@ function looksLikeHtml(value: string): boolean { return head.startsWith(" { + const signal = withTimeout(undefined, params.timeoutSeconds * 1000); + const visited = new Set(); + let currentUrl = params.url; + let redirectCount = 0; + + while (true) { + let parsedUrl: URL; + try { + parsedUrl = new URL(currentUrl); + } catch { + throw new Error("Invalid URL: must be http or https"); + } + if (!["http:", "https:"].includes(parsedUrl.protocol)) { + throw new Error("Invalid URL: must be http or https"); + } + + await assertPublicHostname(parsedUrl.hostname); + + const res = await fetch(parsedUrl.toString(), { + method: "GET", + headers: { + Accept: "*/*", + "User-Agent": params.userAgent, + "Accept-Language": "en-US,en;q=0.9", + }, + signal, + redirect: "manual", + }); + + if (isRedirectStatus(res.status)) { + const location = res.headers.get("location"); + if (!location) { + throw new Error(`Redirect missing location header (${res.status})`); + } + redirectCount += 1; + if (redirectCount > params.maxRedirects) { + throw new Error(`Too many redirects (limit: ${params.maxRedirects})`); + } + const nextUrl = new URL(location, parsedUrl).toString(); + if (visited.has(nextUrl)) { + throw new Error("Redirect loop detected"); + } + visited.add(nextUrl); + void res.body?.cancel(); + currentUrl = nextUrl; + continue; + } + + return { response: res, finalUrl: currentUrl }; + } +} + function formatWebFetchErrorDetail(params: { detail: string; contentType?: string | null; @@ -247,6 +316,7 @@ async function runWebFetch(params: { url: string; extractMode: ExtractMode; maxChars: number; + maxRedirects: number; timeoutSeconds: number; cacheTtlMs: number; userAgent: string; @@ -278,20 +348,23 @@ async function runWebFetch(params: { const start = Date.now(); let res: Response; + let finalUrl = params.url; try { - res = await fetch(parsedUrl.toString(), { - method: "GET", - headers: { - Accept: "*/*", - "User-Agent": params.userAgent, - "Accept-Language": "en-US,en;q=0.9", - }, - signal: withTimeout(undefined, params.timeoutSeconds * 1000), + const result = await fetchWithRedirects({ + url: params.url, + maxRedirects: params.maxRedirects, + timeoutSeconds: params.timeoutSeconds, + userAgent: params.userAgent, }); + res = result.response; + finalUrl = result.finalUrl; } catch (error) { + if (error instanceof SsrFBlockedError) { + throw error; + } if (params.firecrawlEnabled && params.firecrawlApiKey) { const firecrawl = await fetchFirecrawlContent({ - url: params.url, + url: finalUrl, extractMode: params.extractMode, apiKey: params.firecrawlApiKey, baseUrl: params.firecrawlBaseUrl, @@ -304,7 +377,7 @@ async function runWebFetch(params: { const truncated = truncateText(firecrawl.text, params.maxChars); const payload = { url: params.url, - finalUrl: firecrawl.finalUrl || params.url, + finalUrl: firecrawl.finalUrl || finalUrl, status: firecrawl.status ?? 200, contentType: "text/markdown", title: firecrawl.title, @@ -339,7 +412,7 @@ async function runWebFetch(params: { const truncated = truncateText(firecrawl.text, params.maxChars); const payload = { url: params.url, - finalUrl: firecrawl.finalUrl || params.url, + finalUrl: firecrawl.finalUrl || finalUrl, status: firecrawl.status ?? res.status, contentType: "text/markdown", title: firecrawl.title, @@ -374,7 +447,7 @@ async function runWebFetch(params: { if (params.readabilityEnabled) { const readable = await extractReadableContent({ html: body, - url: res.url || params.url, + url: finalUrl, extractMode: params.extractMode, }); if (readable?.text) { @@ -382,7 +455,7 @@ async function runWebFetch(params: { title = readable.title; extractor = "readability"; } else { - const firecrawl = await tryFirecrawlFallback(params); + const firecrawl = await tryFirecrawlFallback({ ...params, url: finalUrl }); if (firecrawl) { text = firecrawl.text; title = firecrawl.title; @@ -411,7 +484,7 @@ async function runWebFetch(params: { const truncated = truncateText(text, params.maxChars); const payload = { url: params.url, - finalUrl: res.url || params.url, + finalUrl, status: res.status, contentType, title, @@ -508,6 +581,7 @@ export function createWebFetchTool(options?: { url, extractMode, maxChars: resolveMaxChars(maxChars ?? fetch?.maxChars, DEFAULT_FETCH_MAX_CHARS), + maxRedirects: resolveMaxRedirects(fetch?.maxRedirects, DEFAULT_FETCH_MAX_REDIRECTS), timeoutSeconds: resolveTimeoutSeconds(fetch?.timeoutSeconds, DEFAULT_TIMEOUT_SECONDS), cacheTtlMs: resolveCacheTtlMs(fetch?.cacheTtlMinutes, DEFAULT_CACHE_TTL_MINUTES), userAgent, diff --git a/src/config/schema.ts b/src/config/schema.ts index e905b7ead..667072e66 100644 --- a/src/config/schema.ts +++ b/src/config/schema.ts @@ -178,6 +178,7 @@ const FIELD_LABELS: Record = { "tools.web.fetch.maxChars": "Web Fetch Max Chars", "tools.web.fetch.timeoutSeconds": "Web Fetch Timeout (sec)", "tools.web.fetch.cacheTtlMinutes": "Web Fetch Cache TTL (min)", + "tools.web.fetch.maxRedirects": "Web Fetch Max Redirects", "tools.web.fetch.userAgent": "Web Fetch User-Agent", "gateway.controlUi.basePath": "Control UI Base Path", "gateway.http.endpoints.chatCompletions.enabled": "OpenAI Chat Completions Endpoint", @@ -378,6 +379,7 @@ const FIELD_HELP: Record = { "tools.web.fetch.maxChars": "Max characters returned by web_fetch (truncated).", "tools.web.fetch.timeoutSeconds": "Timeout in seconds for web_fetch requests.", "tools.web.fetch.cacheTtlMinutes": "Cache TTL in minutes for web_fetch results.", + "tools.web.fetch.maxRedirects": "Maximum redirects allowed for web_fetch (default: 3).", "tools.web.fetch.userAgent": "Override User-Agent header for web_fetch requests.", "tools.web.fetch.readability": "Use Readability to extract main content from HTML (fallbacks to basic HTML cleanup).", diff --git a/src/config/types.tools.ts b/src/config/types.tools.ts index 2288a88f2..7d7b3a57d 100644 --- a/src/config/types.tools.ts +++ b/src/config/types.tools.ts @@ -309,6 +309,8 @@ export type ToolsConfig = { timeoutSeconds?: number; /** Cache TTL in minutes for fetched content. */ cacheTtlMinutes?: number; + /** Maximum number of redirects to follow (default: 3). */ + maxRedirects?: number; /** Override User-Agent header for fetch requests. */ userAgent?: string; /** Use Readability to extract main content (default: true). */ diff --git a/src/config/zod-schema.agent-runtime.ts b/src/config/zod-schema.agent-runtime.ts index 537129acc..716ba02d9 100644 --- a/src/config/zod-schema.agent-runtime.ts +++ b/src/config/zod-schema.agent-runtime.ts @@ -147,6 +147,7 @@ export const ToolsWebFetchSchema = z maxChars: z.number().int().positive().optional(), timeoutSeconds: z.number().int().positive().optional(), cacheTtlMinutes: z.number().nonnegative().optional(), + maxRedirects: z.number().int().nonnegative().optional(), userAgent: z.string().optional(), }) .strict() diff --git a/src/infra/net/ssrf.ts b/src/infra/net/ssrf.ts new file mode 100644 index 000000000..9b09cc4b1 --- /dev/null +++ b/src/infra/net/ssrf.ts @@ -0,0 +1,131 @@ +import { lookup as dnsLookup } from "node:dns/promises"; + +export class SsrFBlockedError extends Error { + constructor(message: string) { + super(message); + this.name = "SsrFBlockedError"; + } +} + +type LookupFn = typeof dnsLookup; + +const PRIVATE_IPV6_PREFIXES = ["fe80:", "fec0:", "fc", "fd"]; +const BLOCKED_HOSTNAMES = new Set(["localhost", "metadata.google.internal"]); + +function normalizeHostname(hostname: string): string { + const normalized = hostname.trim().toLowerCase().replace(/\.$/, ""); + if (normalized.startsWith("[") && normalized.endsWith("]")) { + return normalized.slice(1, -1); + } + return normalized; +} + +function parseIpv4(address: string): number[] | null { + const parts = address.split("."); + if (parts.length !== 4) return null; + const numbers = parts.map((part) => Number.parseInt(part, 10)); + if (numbers.some((value) => Number.isNaN(value) || value < 0 || value > 255)) return null; + return numbers; +} + +function parseIpv4FromMappedIpv6(mapped: string): number[] | null { + if (mapped.includes(".")) { + return parseIpv4(mapped); + } + const parts = mapped.split(":").filter(Boolean); + if (parts.length === 1) { + const value = Number.parseInt(parts[0], 16); + if (Number.isNaN(value) || value < 0 || value > 0xffff_ffff) return null; + return [(value >>> 24) & 0xff, (value >>> 16) & 0xff, (value >>> 8) & 0xff, value & 0xff]; + } + if (parts.length !== 2) return null; + const high = Number.parseInt(parts[0], 16); + const low = Number.parseInt(parts[1], 16); + if ( + Number.isNaN(high) || + Number.isNaN(low) || + high < 0 || + low < 0 || + high > 0xffff || + low > 0xffff + ) { + return null; + } + const value = (high << 16) + low; + return [(value >>> 24) & 0xff, (value >>> 16) & 0xff, (value >>> 8) & 0xff, value & 0xff]; +} + +function isPrivateIpv4(parts: number[]): boolean { + const [octet1, octet2] = parts; + if (octet1 === 0) return true; + if (octet1 === 10) return true; + if (octet1 === 127) return true; + if (octet1 === 169 && octet2 === 254) return true; + if (octet1 === 172 && octet2 >= 16 && octet2 <= 31) return true; + if (octet1 === 192 && octet2 === 168) return true; + if (octet1 === 100 && octet2 >= 64 && octet2 <= 127) return true; + return false; +} + +export function isPrivateIpAddress(address: string): boolean { + let normalized = address.trim().toLowerCase(); + if (normalized.startsWith("[") && normalized.endsWith("]")) { + normalized = normalized.slice(1, -1); + } + if (!normalized) return false; + + if (normalized.startsWith("::ffff:")) { + const mapped = normalized.slice("::ffff:".length); + const ipv4 = parseIpv4FromMappedIpv6(mapped); + if (ipv4) return isPrivateIpv4(ipv4); + } + + if (normalized.includes(":")) { + if (normalized === "::" || normalized === "::1") return true; + return PRIVATE_IPV6_PREFIXES.some((prefix) => normalized.startsWith(prefix)); + } + + const ipv4 = parseIpv4(normalized); + if (!ipv4) return false; + return isPrivateIpv4(ipv4); +} + +export function isBlockedHostname(hostname: string): boolean { + const normalized = normalizeHostname(hostname); + if (!normalized) return false; + if (BLOCKED_HOSTNAMES.has(normalized)) return true; + return ( + normalized.endsWith(".localhost") || + normalized.endsWith(".local") || + normalized.endsWith(".internal") + ); +} + +export async function assertPublicHostname( + hostname: string, + lookupFn: LookupFn = dnsLookup, +): Promise { + const normalized = normalizeHostname(hostname); + if (!normalized) { + throw new Error("Invalid hostname"); + } + + if (isBlockedHostname(normalized)) { + throw new SsrFBlockedError(`Blocked hostname: ${hostname}`); + } + + if (isPrivateIpAddress(normalized)) { + throw new SsrFBlockedError("Blocked: private/internal IP address"); + } + + const results = await lookupFn(normalized, { all: true }); + if (results.length === 0) { + throw new Error(`Unable to resolve hostname: ${hostname}`); + } + + for (const entry of results) { + if (isPrivateIpAddress(entry.address)) { + throw new SsrFBlockedError("Blocked: resolves to private/internal IP address"); + } + } +} diff --git a/src/media/input-files.ts b/src/media/input-files.ts index 0b131b93e..8b1d1945a 100644 --- a/src/media/input-files.ts +++ b/src/media/input-files.ts @@ -1,5 +1,5 @@ -import { lookup } from "node:dns/promises"; import { logWarn } from "../logger.js"; +import { assertPublicHostname } from "../infra/net/ssrf.js"; type CanvasModule = typeof import("@napi-rs/canvas"); type PdfJsModule = typeof import("pdfjs-dist/legacy/build/pdf.mjs"); @@ -107,50 +107,6 @@ export const DEFAULT_INPUT_PDF_MAX_PAGES = 4; export const DEFAULT_INPUT_PDF_MAX_PIXELS = 4_000_000; export const DEFAULT_INPUT_PDF_MIN_TEXT_CHARS = 200; -const PRIVATE_IPV4_PATTERNS = [ - /^127\./, - /^10\./, - /^192\.168\./, - /^172\.(1[6-9]|2[0-9]|3[0-1])\./, - /^0\./, -]; -const PRIVATE_IPV6_PREFIXES = ["::1", "fe80:", "fec0:", "fc", "fd"]; - -function isPrivateIpAddress(address: string): boolean { - if (address.includes(":")) { - const lower = address.toLowerCase(); - if (lower === "::1") return true; - return PRIVATE_IPV6_PREFIXES.some((prefix) => lower.startsWith(prefix)); - } - return PRIVATE_IPV4_PATTERNS.some((pattern) => pattern.test(address)); -} - -function isBlockedHostname(hostname: string): boolean { - const lower = hostname.toLowerCase(); - return ( - lower === "localhost" || - lower.endsWith(".localhost") || - lower.endsWith(".local") || - lower.endsWith(".internal") - ); -} - -async function assertPublicHostname(hostname: string): Promise { - if (isBlockedHostname(hostname)) { - throw new Error(`Blocked hostname: ${hostname}`); - } - - const results = await lookup(hostname, { all: true }); - if (results.length === 0) { - throw new Error(`Unable to resolve hostname: ${hostname}`); - } - for (const entry of results) { - if (isPrivateIpAddress(entry.address)) { - throw new Error(`Private IP addresses are not allowed: ${entry.address}`); - } - } -} - function isRedirectStatus(status: number): boolean { return status === 301 || status === 302 || status === 303 || status === 307 || status === 308; }