From c54c665f9777404ae1c4005105f9b4c4518392d4 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sat, 17 Jan 2026 00:00:15 +0000 Subject: [PATCH] feat: enhance web_fetch fallbacks --- CHANGELOG.md | 8 +- docs/gateway/configuration.md | 6 + docs/tools/firecrawl.md | 58 ++++ docs/tools/index.md | 1 + docs/tools/web.md | 18 +- scripts/firecrawl-compare.ts | 131 +++++++++ scripts/readability-basic-compare.ts | 60 ++++ src/agents/tools/web-tools.fetch.test.ts | 185 +++++++++++++ src/agents/tools/web-tools.ts | 337 +++++++++++++++++++++-- src/config/schema.ts | 11 + src/config/types.tools.ts | 14 + 11 files changed, 802 insertions(+), 27 deletions(-) create mode 100644 docs/tools/firecrawl.md create mode 100644 scripts/firecrawl-compare.ts create mode 100644 scripts/readability-basic-compare.ts create mode 100644 src/agents/tools/web-tools.fetch.test.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index 6f8f72a21..6aae7d417 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,7 +12,6 @@ - **BREAKING:** iOS minimum version is now 18.0 to support Textual markdown rendering in native chat. (#702) - **BREAKING:** Microsoft Teams is now a plugin; install `@clawdbot/msteams` via `clawdbot plugins install @clawdbot/msteams`. - **BREAKING:** Discord/Telegram channel tokens now prefer config over env (env is fallback only). -- **BREAKING:** Matrix channel credentials now prefer config over env (env is fallback only). ### Changes - CLI: set process titles to `clawdbot-` for clearer process listings. @@ -20,7 +19,9 @@ - Telegram: scope inline buttons with allowlist default + callback gating in DMs/groups. - Telegram: default reaction notifications to own. - Tools: improve `web_fetch` extraction using Readability (with fallback). -- Channels: inject only pending (mention-gated) group history; clear history on any processed message. +- Tools: add Firecrawl fallback for `web_fetch` when configured. 
+- Tools: send Chrome-like headers by default for `web_fetch` to improve extraction on bot-sensitive sites. +- Tools: Firecrawl fallback now uses bot-circumvention + cache by default; remove basic HTML fallback when extraction fails. - Heartbeat: tighten prompt guidance + suppress duplicate alerts for 24h. (#980) — thanks @voidserf. - Repo: ignore local identity files to avoid accidental commits. (#1001) — thanks @gerardward2007. - Sessions/Security: add `session.dmScope` for multi-user DM isolation and audit warnings. (#948) — thanks @Alphonse-arianee. @@ -64,9 +65,6 @@ ### Fixes - Messages: make `/stop` clear queued followups and pending session lane work for a hard abort. - Messages: make `/stop` abort active sub-agent runs spawned from the requester session and report how many were stopped. -- WhatsApp: report linked status consistently in channel status. (#1050) — thanks @YuriNachos. -- Sessions: keep per-session overrides when `/new` resets compaction counters. (#1050) — thanks @YuriNachos. -- Skills: allow OpenAI image-gen helper to handle URL or base64 responses. (#1050) — thanks @YuriNachos. - WhatsApp: default response prefix only for self-chat, using identity name when set. - Signal/iMessage: bound transport readiness waits to 30s with periodic logging. (#1014) — thanks @Szpadel. - Auth: merge main auth profiles into per-agent stores for sub-agents and document inheritance. (#1013) — thanks @marcmarg. diff --git a/docs/gateway/configuration.md b/docs/gateway/configuration.md index abf7dce4d..aeb5b953c 100644 --- a/docs/gateway/configuration.md +++ b/docs/gateway/configuration.md @@ -1715,6 +1715,12 @@ Legacy: `tools.bash` is still accepted as an alias. 
- `tools.web.fetch.cacheTtlMinutes` (default 15) - `tools.web.fetch.userAgent` (optional override) - `tools.web.fetch.readability` (default true; disable to use basic HTML cleanup only) +- `tools.web.fetch.firecrawl.enabled` (default true when an API key is set) +- `tools.web.fetch.firecrawl.apiKey` (optional; defaults to `FIRECRAWL_API_KEY`) +- `tools.web.fetch.firecrawl.baseUrl` (default https://api.firecrawl.dev) +- `tools.web.fetch.firecrawl.onlyMainContent` (default true) +- `tools.web.fetch.firecrawl.maxAgeMs` (optional; default 172800000 ms = 2 days) +- `tools.web.fetch.firecrawl.timeoutSeconds` (optional; defaults to `tools.web.fetch.timeoutSeconds`) `agents.defaults.subagents` configures sub-agent defaults: - `model`: default model for spawned sub-agents (string or `{ primary, fallbacks }`). If omitted, sub-agents inherit the caller’s model unless overridden per agent or per call. diff --git a/docs/tools/firecrawl.md b/docs/tools/firecrawl.md new file mode 100644 index 000000000..b6ed1ae2a --- /dev/null +++ b/docs/tools/firecrawl.md @@ -0,0 +1,58 @@ +--- +summary: "Firecrawl fallback for web_fetch (anti-bot + cached extraction)" +read_when: + - You want Firecrawl-backed web extraction + - You need a Firecrawl API key + - You want anti-bot extraction for web_fetch +--- + +# Firecrawl + +Clawdbot can use **Firecrawl** as a fallback extractor for `web_fetch`. It is a hosted +content extraction service that supports bot circumvention and caching, which helps +with JS-heavy sites or pages that block plain HTTP fetches. + +## Get an API key + +1) Create a Firecrawl account and generate an API key. +2) Store it in config or set `FIRECRAWL_API_KEY` in the gateway environment. + +## Configure Firecrawl + +```json5 +{ + tools: { + web: { + fetch: { + firecrawl: { + apiKey: "FIRECRAWL_API_KEY_HERE", + baseUrl: "https://api.firecrawl.dev", + onlyMainContent: true, + maxAgeMs: 172800000, + timeoutSeconds: 60 + } + } + } + } +} +``` + +Notes: +- `firecrawl.enabled` defaults to true when an API key is present. 
+- `maxAgeMs` controls how old cached results can be (ms). Default is 2 days. + +## Stealth / bot circumvention + +Firecrawl exposes a **proxy mode** parameter for bot circumvention (`basic`, `stealth`, or `auto`). +Clawdbot always uses `proxy: "auto"` plus `storeInCache: true` for Firecrawl requests. +If proxy is omitted, Firecrawl defaults to `auto`. `auto` retries with stealth proxies if a basic attempt fails, which may use more credits +than basic-only scraping. + +## How `web_fetch` uses Firecrawl + +`web_fetch` extraction order: +1) Readability (local) +2) Firecrawl (if configured) +If neither yields content, `web_fetch` returns an error (there is no basic HTML cleanup fallback). + +See [Web tools](/tools/web) for the full web tool setup. diff --git a/docs/tools/index.md b/docs/tools/index.md index 103c9ac26..25a1a8372 100644 --- a/docs/tools/index.md +++ b/docs/tools/index.md @@ -215,6 +215,7 @@ Notes: - Responses are cached (default 15 min). - For JS-heavy sites, prefer the browser tool. - See [Web tools](/tools/web) for setup. +- See [Firecrawl](/tools/firecrawl) for the optional anti-bot fallback. ### `browser` Control the dedicated clawd browser. diff --git a/docs/tools/web.md b/docs/tools/web.md index 4e44f72dd..a1c2dd5a0 100644 --- a/docs/tools/web.md +++ b/docs/tools/web.md @@ -104,6 +104,7 @@ Fetch a URL and extract readable content. ### Requirements - `tools.web.fetch.enabled` must not be `false` (default: enabled) +- Optional Firecrawl fallback: set `tools.web.fetch.firecrawl.apiKey` or `FIRECRAWL_API_KEY`. ### Config @@ -116,8 +117,16 @@ Fetch a URL and extract readable content. 
maxChars: 50000, timeoutSeconds: 30, cacheTtlMinutes: 15, - userAgent: "clawdbot/2026.1.15", - readability: true + userAgent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36", + readability: true, + firecrawl: { + enabled: true, + apiKey: "FIRECRAWL_API_KEY_HERE", // optional if FIRECRAWL_API_KEY is set + baseUrl: "https://api.firecrawl.dev", + onlyMainContent: true, + maxAgeMs: 86400000, // ms (1 day) + timeoutSeconds: 60 + } } } } @@ -131,8 +140,11 @@ Fetch a URL and extract readable content. - `maxChars` (truncate long pages) Notes: -- `web_fetch` uses Readability (main-content extraction) by default and falls back to basic HTML cleanup if it fails. +- `web_fetch` uses Readability (main-content extraction) first, then Firecrawl (if configured). If both fail, the tool returns an error. +- Firecrawl requests use bot-circumvention mode and cache results by default. +- `web_fetch` sends a Chrome-like User-Agent and `Accept-Language` by default; override `userAgent` if needed. - `web_fetch` is best-effort extraction; some sites will need the browser tool. +- See [Firecrawl](/tools/firecrawl) for key setup and service details. - Responses are cached (default 15 minutes) to reduce repeated fetches. - If you use tool profiles/allowlists, add `web_search`/`web_fetch` or `group:web`. - If the Brave key is missing, `web_search` returns a short setup hint with a docs link. 
diff --git a/scripts/firecrawl-compare.ts b/scripts/firecrawl-compare.ts new file mode 100644 index 000000000..2724aea48 --- /dev/null +++ b/scripts/firecrawl-compare.ts @@ -0,0 +1,131 @@ +import { extractReadableContent, fetchFirecrawlContent } from "../src/agents/tools/web-tools.js"; + +const DEFAULT_URLS = [ + "https://en.wikipedia.org/wiki/Web_scraping", + "https://news.ycombinator.com/", + "https://www.apple.com/iphone/", + "https://www.nytimes.com/", + "https://www.reddit.com/r/javascript/", +]; + +const urls = process.argv.slice(2); +const targets = urls.length > 0 ? urls : DEFAULT_URLS; +const apiKey = process.env.FIRECRAWL_API_KEY; +const baseUrl = process.env.FIRECRAWL_BASE_URL ?? "https://api.firecrawl.dev"; + +const userAgent = + "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"; +const timeoutMs = 30_000; + +function truncate(value: string, max = 180): string { + if (!value) return ""; + return value.length > max ? `${value.slice(0, max)}…` : value; +} + +async function fetchHtml(url: string): Promise<{ + ok: boolean; + status: number; + contentType: string; + finalUrl: string; + body: string; +}> { + const controller = new AbortController(); + const timer = setTimeout(() => controller.abort(), timeoutMs); + try { + const res = await fetch(url, { + method: "GET", + headers: { Accept: "*/*", "User-Agent": userAgent }, + signal: controller.signal, + }); + const contentType = res.headers.get("content-type") ?? "application/octet-stream"; + const body = await res.text(); + return { + ok: res.ok, + status: res.status, + contentType, + finalUrl: res.url || url, + body, + }; + } finally { + clearTimeout(timer); + } +} + +async function run() { + if (!apiKey) { + console.log("FIRECRAWL_API_KEY not set. 
Firecrawl comparisons will be skipped."); + } + + for (const url of targets) { + console.log(`\n=== ${url}`); + let localStatus = "skipped"; + let localTitle = ""; + let localText = ""; + let localError: string | undefined; + + try { + const res = await fetchHtml(url); + if (!res.ok) { + localStatus = `http ${res.status}`; + } else if (!res.contentType.includes("text/html")) { + localStatus = `non-html (${res.contentType})`; + } else { + const readable = await extractReadableContent({ + html: res.body, + url: res.finalUrl, + extractMode: "markdown", + }); + if (readable?.text) { + localStatus = "readability"; + localTitle = readable.title ?? ""; + localText = readable.text; + } else { + localStatus = "readability-empty"; + } + } + } catch (error) { + localStatus = "error"; + localError = error instanceof Error ? error.message : String(error); + } + + console.log( + `local: ${localStatus} len=${localText.length} title=${truncate(localTitle, 80)}` + ); + if (localError) console.log(`local error: ${localError}`); + if (localText) console.log(`local sample: ${truncate(localText)}`); + + if (apiKey) { + try { + const firecrawl = await fetchFirecrawlContent({ + url, + extractMode: "markdown", + apiKey, + baseUrl, + onlyMainContent: true, + maxAgeMs: 172_800_000, + proxy: "auto", + storeInCache: true, + timeoutSeconds: 60, + }); + console.log( + `firecrawl: ok len=${firecrawl.text.length} title=${truncate( + firecrawl.title ?? "", + 80, + )} status=${firecrawl.status ?? "n/a"}` + ); + if (firecrawl.warning) console.log(`firecrawl warning: ${firecrawl.warning}`); + if (firecrawl.text) console.log(`firecrawl sample: ${truncate(firecrawl.text)}`); + } catch (error) { + const message = error instanceof Error ? 
error.message : String(error); + console.log(`firecrawl: error ${message}`); + } + } + } + + process.exit(0); +} + +run().catch((error) => { + console.error(error); + process.exit(1); +}); diff --git a/scripts/readability-basic-compare.ts b/scripts/readability-basic-compare.ts new file mode 100644 index 000000000..d3c312f7b --- /dev/null +++ b/scripts/readability-basic-compare.ts @@ -0,0 +1,60 @@ +import { createWebFetchTool } from "../src/agents/tools/web-tools.js"; + +const DEFAULT_URLS = [ + "https://example.com/", + "https://news.ycombinator.com/", + "https://www.reddit.com/r/javascript/", + "https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent", + "https://httpbin.org/html", +]; + +const urls = process.argv.slice(2); +const targets = urls.length > 0 ? urls : DEFAULT_URLS; + +async function runFetch(url: string, readability: boolean) { + if (!readability) { + throw new Error("Basic extraction removed. Set readability=true or enable Firecrawl."); + } + const tool = createWebFetchTool({ + config: { + tools: { + web: { fetch: { readability, cacheTtlMinutes: 0, firecrawl: { enabled: false } } }, + }, + }, + sandboxed: false, + }); + if (!tool) throw new Error("web_fetch tool is disabled"); + const result = await tool.execute("test", { url, extractMode: "markdown" }); + return result.details as { + text?: string; + title?: string; + extractor?: string; + length?: number; + truncated?: boolean; + }; +} + +function truncate(value: string, max = 160): string { + if (!value) return ""; + return value.length > max ? `${value.slice(0, max)}…` : value; +} + +async function run() { + for (const url of targets) { + console.log(`\n=== ${url}`); + const readable = await runFetch(url, true); + + console.log( + `readability: ${readable.extractor ?? "unknown"} len=${readable.length ?? 0} title=${truncate( + readable.title ?? 
"", + 80, + )}`, + ); + if (readable.text) console.log(`readability sample: ${truncate(readable.text)}`); + } +} + +run().catch((error) => { + console.error(error); + process.exit(1); +}); diff --git a/src/agents/tools/web-tools.fetch.test.ts b/src/agents/tools/web-tools.fetch.test.ts new file mode 100644 index 000000000..d1e964833 --- /dev/null +++ b/src/agents/tools/web-tools.fetch.test.ts @@ -0,0 +1,185 @@ +import { afterEach, describe, expect, it, vi } from "vitest"; + +import { createWebFetchTool } from "./web-tools.js"; + +type MockResponse = { + ok: boolean; + status: number; + url?: string; + headers?: { get: (key: string) => string | null }; + text?: () => Promise; + json?: () => Promise; +}; + +function makeHeaders(map: Record): { get: (key: string) => string | null } { + return { + get: (key) => map[key.toLowerCase()] ?? null, + }; +} + +function htmlResponse(html: string, url = "https://example.com/"): MockResponse { + return { + ok: true, + status: 200, + url, + headers: makeHeaders({ "content-type": "text/html; charset=utf-8" }), + text: async () => html, + }; +} + +function firecrawlResponse(markdown: string, url = "https://example.com/"): MockResponse { + return { + ok: true, + status: 200, + json: async () => ({ + success: true, + data: { + markdown, + metadata: { title: "Firecrawl Title", sourceURL: url, statusCode: 200 }, + }, + }), + }; +} + +function firecrawlError(): MockResponse { + return { + ok: false, + status: 403, + json: async () => ({ success: false, error: "blocked" }), + }; +} + +function requestUrl(input: RequestInfo): string { + if (typeof input === "string") return input; + if (input instanceof URL) return input.toString(); + if ("url" in input && typeof input.url === "string") return input.url; + return ""; +} + +describe("web_fetch extraction fallbacks", () => { + const priorFetch = global.fetch; + + afterEach(() => { + // @ts-expect-error restore + global.fetch = priorFetch; + vi.restoreAllMocks(); + }); + + it("falls back to 
firecrawl when readability returns no content", async () => { + const mockFetch = vi.fn((input: RequestInfo) => { + const url = requestUrl(input); + if (url.includes("api.firecrawl.dev")) { + return Promise.resolve(firecrawlResponse("firecrawl content")) as Promise; + } + return Promise.resolve( + htmlResponse("", url), + ) as Promise; + }); + // @ts-expect-error mock fetch + global.fetch = mockFetch; + + const tool = createWebFetchTool({ + config: { + tools: { + web: { + fetch: { + cacheTtlMinutes: 0, + firecrawl: { apiKey: "firecrawl-test" }, + }, + }, + }, + }, + sandboxed: false, + }); + + const result = await tool?.execute?.("call", { url: "https://example.com/empty" }); + const details = result?.details as { extractor?: string; text?: string }; + expect(details.extractor).toBe("firecrawl"); + expect(details.text).toContain("firecrawl content"); + }); + + it("throws when readability is disabled and firecrawl is unavailable", async () => { + const mockFetch = vi.fn((input: RequestInfo) => + Promise.resolve(htmlResponse("hi", requestUrl(input))), + ); + // @ts-expect-error mock fetch + global.fetch = mockFetch; + + const tool = createWebFetchTool({ + config: { + tools: { + web: { + fetch: { readability: false, cacheTtlMinutes: 0, firecrawl: { enabled: false } }, + }, + }, + }, + sandboxed: false, + }); + + await expect( + tool?.execute?.("call", { url: "https://example.com/readability-off" }), + ).rejects.toThrow("Readability disabled"); + }); + + it("throws when readability is empty and firecrawl fails", async () => { + const mockFetch = vi.fn((input: RequestInfo) => { + const url = requestUrl(input); + if (url.includes("api.firecrawl.dev")) { + return Promise.resolve(firecrawlError()) as Promise; + } + return Promise.resolve( + htmlResponse("", url), + ) as Promise; + }); + // @ts-expect-error mock fetch + global.fetch = mockFetch; + + const tool = createWebFetchTool({ + config: { + tools: { + web: { + fetch: { cacheTtlMinutes: 0, firecrawl: { apiKey: 
"firecrawl-test" } }, + }, + }, + }, + sandboxed: false, + }); + + await expect( + tool?.execute?.("call", { url: "https://example.com/readability-empty" }), + ).rejects.toThrow("Readability and Firecrawl returned no content"); + }); + + it("uses firecrawl when direct fetch fails", async () => { + const mockFetch = vi.fn((input: RequestInfo) => { + const url = requestUrl(input); + if (url.includes("api.firecrawl.dev")) { + return Promise.resolve(firecrawlResponse("firecrawl fallback", url)) as Promise; + } + return Promise.resolve({ + ok: false, + status: 403, + headers: makeHeaders({ "content-type": "text/html" }), + text: async () => "blocked", + } as Response); + }); + // @ts-expect-error mock fetch + global.fetch = mockFetch; + + const tool = createWebFetchTool({ + config: { + tools: { + web: { + fetch: { cacheTtlMinutes: 0, firecrawl: { apiKey: "firecrawl-test" } }, + }, + }, + }, + sandboxed: false, + }); + + const result = await tool?.execute?.("call", { url: "https://example.com/blocked" }); + const details = result?.details as { extractor?: string; text?: string }; + expect(details.extractor).toBe("firecrawl"); + expect(details.text).toContain("firecrawl fallback"); + }); +}); diff --git a/src/agents/tools/web-tools.ts b/src/agents/tools/web-tools.ts index 596179558..133e42bae 100644 --- a/src/agents/tools/web-tools.ts +++ b/src/agents/tools/web-tools.ts @@ -1,7 +1,6 @@ import { Type } from "@sinclair/typebox"; import type { ClawdbotConfig } from "../../config/config.js"; -import { VERSION } from "../../version.js"; import { stringEnum } from "../schema/typebox.js"; import type { AnyAgentTool } from "./common.js"; import { jsonResult, readNumberParam, readStringParam } from "./common.js"; @@ -15,6 +14,10 @@ const DEFAULT_FETCH_MAX_CHARS = 50_000; const DEFAULT_TIMEOUT_SECONDS = 30; const DEFAULT_CACHE_TTL_MINUTES = 15; const DEFAULT_CACHE_MAX_ENTRIES = 100; +const DEFAULT_FIRECRAWL_BASE_URL = "https://api.firecrawl.dev"; +const DEFAULT_FIRECRAWL_MAX_AGE_MS 
= 172_800_000; +const DEFAULT_FETCH_USER_AGENT = + "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"; const BRAVE_SEARCH_ENDPOINT = "https://api.search.brave.com/res/v1/web/search"; @@ -30,6 +33,15 @@ type WebFetchConfig = NonNullable["web"] extends infer : undefined : undefined; +type FirecrawlFetchConfig = { + enabled?: boolean; + apiKey?: string; + baseUrl?: string; + onlyMainContent?: boolean; + maxAgeMs?: number; + timeoutSeconds?: number; +} | undefined; + type CacheEntry = { value: T; expiresAt: number; @@ -123,6 +135,13 @@ function resolveFetchReadabilityEnabled(fetch?: WebFetchConfig): boolean { return true; } +function resolveFirecrawlConfig(fetch?: WebFetchConfig): FirecrawlFetchConfig { + if (!fetch || typeof fetch !== "object") return undefined; + const firecrawl = "firecrawl" in fetch ? fetch.firecrawl : undefined; + if (!firecrawl || typeof firecrawl !== "object") return undefined; + return firecrawl as FirecrawlFetchConfig; +} + function resolveSearchApiKey(search?: WebSearchConfig): string | undefined { const fromConfig = search && "apiKey" in search && typeof search.apiKey === "string" ? search.apiKey.trim() : ""; @@ -130,6 +149,52 @@ function resolveSearchApiKey(search?: WebSearchConfig): string | undefined { return fromConfig || fromEnv || undefined; } +function resolveFirecrawlApiKey(firecrawl?: FirecrawlFetchConfig): string | undefined { + const fromConfig = + firecrawl && "apiKey" in firecrawl && typeof firecrawl.apiKey === "string" + ? firecrawl.apiKey.trim() + : ""; + const fromEnv = (process.env.FIRECRAWL_API_KEY ?? 
"").trim(); + return fromConfig || fromEnv || undefined; +} + +function resolveFirecrawlEnabled(params: { + firecrawl?: FirecrawlFetchConfig; + apiKey?: string; +}): boolean { + if (typeof params.firecrawl?.enabled === "boolean") return params.firecrawl.enabled; + return Boolean(params.apiKey); +} + +function resolveFirecrawlBaseUrl(firecrawl?: FirecrawlFetchConfig): string { + const raw = + firecrawl && "baseUrl" in firecrawl && typeof firecrawl.baseUrl === "string" + ? firecrawl.baseUrl.trim() + : ""; + return raw || DEFAULT_FIRECRAWL_BASE_URL; +} + +function resolveFirecrawlOnlyMainContent(firecrawl?: FirecrawlFetchConfig): boolean { + if (typeof firecrawl?.onlyMainContent === "boolean") return firecrawl.onlyMainContent; + return true; +} + +function resolveFirecrawlMaxAgeMs(firecrawl?: FirecrawlFetchConfig): number | undefined { + const raw = + firecrawl && "maxAgeMs" in firecrawl && typeof firecrawl.maxAgeMs === "number" + ? firecrawl.maxAgeMs + : undefined; + if (typeof raw !== "number" || !Number.isFinite(raw)) return undefined; + const parsed = Math.max(0, Math.floor(raw)); + return parsed > 0 ? 
parsed : undefined; +} + +function resolveFirecrawlMaxAgeMsOrDefault(firecrawl?: FirecrawlFetchConfig): number { + const resolved = resolveFirecrawlMaxAgeMs(firecrawl); + if (typeof resolved === "number") return resolved; + return DEFAULT_FIRECRAWL_MAX_AGE_MS; +} + function missingSearchKeyPayload() { return { error: "missing_brave_api_key", @@ -278,9 +343,18 @@ function htmlToMarkdown(html: string): { text: string; title?: string } { return { text, title }; } -function htmlToText(html: string): { text: string; title?: string } { - const { text, title } = htmlToMarkdown(html); - return { text, title }; +function markdownToText(markdown: string): string { + let text = markdown; + text = text.replace(/!\[[^\]]*]\([^)]+\)/g, ""); + text = text.replace(/\[([^\]]+)]\([^)]+\)/g, "$1"); + text = text.replace(/```[\s\S]*?```/g, (block) => + block.replace(/```[^\n]*\n?/g, "").replace(/```/g, ""), + ); + text = text.replace(/`([^`]+)`/g, "$1"); + text = text.replace(/^#{1,6}\s+/gm, ""); + text = text.replace(/^\s*[-*+]\s+/gm, ""); + text = text.replace(/^\s*\d+\.\s+/gm, ""); + return normalizeWhitespace(text); } function truncateText(value: string, maxChars: number): { text: string; truncated: boolean } { @@ -336,6 +410,81 @@ export async function extractReadableContent(params: { } } +export async function fetchFirecrawlContent(params: { + url: string; + extractMode: (typeof EXTRACT_MODES)[number]; + apiKey: string; + baseUrl: string; + onlyMainContent: boolean; + maxAgeMs: number; + proxy: "auto" | "basic" | "stealth"; + storeInCache: boolean; + timeoutSeconds: number; +}): Promise<{ + text: string; + title?: string; + finalUrl?: string; + status?: number; + warning?: string; +}> { + const endpoint = resolveFirecrawlEndpoint(params.baseUrl); + const body: Record = { + url: params.url, + formats: ["markdown"], + onlyMainContent: params.onlyMainContent, + timeout: params.timeoutSeconds * 1000, + maxAge: params.maxAgeMs, + proxy: params.proxy, + storeInCache: 
params.storeInCache, + }; + + const res = await fetch(endpoint, { + method: "POST", + headers: { + Authorization: `Bearer ${params.apiKey}`, + "Content-Type": "application/json", + }, + body: JSON.stringify(body), + signal: withTimeout(undefined, params.timeoutSeconds * 1000), + }); + + const payload = (await res.json()) as { + success?: boolean; + data?: { + markdown?: string; + content?: string; + metadata?: { + title?: string; + sourceURL?: string; + statusCode?: number; + }; + }; + warning?: string; + error?: string; + }; + + if (!res.ok || payload?.success === false) { + const detail = payload?.error || res.statusText; + throw new Error(`Firecrawl fetch failed (${res.status}): ${detail}`.trim()); + } + + const data = payload?.data ?? {}; + const rawText = + typeof data.markdown === "string" + ? data.markdown + : typeof data.content === "string" + ? data.content + : ""; + const text = params.extractMode === "text" ? markdownToText(rawText) : rawText; + return { + text, + title: data.metadata?.title, + finalUrl: data.metadata?.sourceURL, + status: data.metadata?.statusCode, + warning: payload?.warning, + }; +} + async function runWebSearch(params: { query: string; count: number; @@ -414,6 +563,14 @@ async function runWebFetch(params: { cacheTtlMs: number; userAgent: string; readabilityEnabled: boolean; + firecrawlEnabled: boolean; + firecrawlApiKey?: string; + firecrawlBaseUrl: string; + firecrawlOnlyMainContent: boolean; + firecrawlMaxAgeMs: number; + firecrawlProxy: "auto" | "basic" | "stealth"; + firecrawlStoreInCache: boolean; + firecrawlTimeoutSeconds: number; }): Promise> { const cacheKey = normalizeCacheKey( `fetch:${params.url}:${params.extractMode}:${params.maxChars}`, @@ -432,16 +589,84 @@ async function runWebFetch(params: { } const start = Date.now(); - const res = await fetch(parsedUrl.toString(), { - method: "GET", - headers: { - Accept: "*/*", - "User-Agent": params.userAgent, - }, - signal: withTimeout(undefined, params.timeoutSeconds * 1000), - 
}); + let res: Response; + try { + res = await fetch(parsedUrl.toString(), { + method: "GET", + headers: { + Accept: "*/*", + "User-Agent": params.userAgent, + "Accept-Language": "en-US,en;q=0.9", + }, + signal: withTimeout(undefined, params.timeoutSeconds * 1000), + }); + } catch (error) { + if (params.firecrawlEnabled && params.firecrawlApiKey) { + const firecrawl = await fetchFirecrawlContent({ + url: params.url, + extractMode: params.extractMode, + apiKey: params.firecrawlApiKey, + baseUrl: params.firecrawlBaseUrl, + onlyMainContent: params.firecrawlOnlyMainContent, + maxAgeMs: params.firecrawlMaxAgeMs, + proxy: params.firecrawlProxy, + storeInCache: params.firecrawlStoreInCache, + timeoutSeconds: params.firecrawlTimeoutSeconds, + }); + const truncated = truncateText(firecrawl.text, params.maxChars); + const payload = { + url: params.url, + finalUrl: firecrawl.finalUrl || params.url, + status: firecrawl.status ?? 200, + contentType: "text/markdown", + title: firecrawl.title, + extractMode: params.extractMode, + extractor: "firecrawl", + truncated: truncated.truncated, + length: truncated.text.length, + fetchedAt: new Date().toISOString(), + tookMs: Date.now() - start, + text: truncated.text, + warning: firecrawl.warning, + }; + writeCache(FETCH_CACHE, cacheKey, payload, params.cacheTtlMs); + return payload; + } + throw error; + } if (!res.ok) { + if (params.firecrawlEnabled && params.firecrawlApiKey) { + const firecrawl = await fetchFirecrawlContent({ + url: params.url, + extractMode: params.extractMode, + apiKey: params.firecrawlApiKey, + baseUrl: params.firecrawlBaseUrl, + onlyMainContent: params.firecrawlOnlyMainContent, + maxAgeMs: params.firecrawlMaxAgeMs, + proxy: params.firecrawlProxy, + storeInCache: params.firecrawlStoreInCache, + timeoutSeconds: params.firecrawlTimeoutSeconds, + }); + const truncated = truncateText(firecrawl.text, params.maxChars); + const payload = { + url: params.url, + finalUrl: firecrawl.finalUrl || params.url, + status: 
firecrawl.status ?? res.status, + contentType: "text/markdown", + title: firecrawl.title, + extractMode: params.extractMode, + extractor: "firecrawl", + truncated: truncated.truncated, + length: truncated.text.length, + fetchedAt: new Date().toISOString(), + tookMs: Date.now() - start, + text: truncated.text, + warning: firecrawl.warning, + }; + writeCache(FETCH_CACHE, cacheKey, payload, params.cacheTtlMs); + return payload; + } const detail = await readResponseText(res); throw new Error(`Web fetch failed (${res.status}): ${detail || res.statusText}`); } @@ -450,6 +675,7 @@ async function runWebFetch(params: { const body = await readResponseText(res); let title: string | undefined; + let extractor = "raw"; let text = body; if (contentType.includes("text/html")) { if (params.readabilityEnabled) { @@ -461,21 +687,29 @@ async function runWebFetch(params: { if (readable?.text) { text = readable.text; title = readable.title; + extractor = "readability"; } else { - const parsed = params.extractMode === "text" ? htmlToText(body) : htmlToMarkdown(body); - text = parsed.text; - title = parsed.title; + const firecrawl = await tryFirecrawlFallback(params); + if (firecrawl) { + text = firecrawl.text; + title = firecrawl.title; + extractor = "firecrawl"; + } else { + throw new Error( + "Web fetch extraction failed: Readability and Firecrawl returned no content.", + ); + } } } else { - const parsed = params.extractMode === "text" ? 
htmlToText(body) : htmlToMarkdown(body); - text = parsed.text; - title = parsed.title; + throw new Error("Web fetch extraction failed: Readability disabled and Firecrawl unavailable."); } } else if (contentType.includes("application/json")) { try { text = JSON.stringify(JSON.parse(body), null, 2); + extractor = "json"; } catch { text = body; + extractor = "raw"; } } @@ -487,6 +721,7 @@ async function runWebFetch(params: { contentType, title, extractMode: params.extractMode, + extractor, truncated: truncated.truncated, length: truncated.text.length, fetchedAt: new Date().toISOString(), @@ -497,6 +732,37 @@ async function runWebFetch(params: { return payload; } +async function tryFirecrawlFallback(params: { + url: string; + extractMode: (typeof EXTRACT_MODES)[number]; + firecrawlEnabled: boolean; + firecrawlApiKey?: string; + firecrawlBaseUrl: string; + firecrawlOnlyMainContent: boolean; + firecrawlMaxAgeMs: number; + firecrawlProxy: "auto" | "basic" | "stealth"; + firecrawlStoreInCache: boolean; + firecrawlTimeoutSeconds: number; +}): Promise<{ text: string; title?: string } | null> { + if (!params.firecrawlEnabled || !params.firecrawlApiKey) return null; + try { + const firecrawl = await fetchFirecrawlContent({ + url: params.url, + extractMode: params.extractMode, + apiKey: params.firecrawlApiKey, + baseUrl: params.firecrawlBaseUrl, + onlyMainContent: params.firecrawlOnlyMainContent, + maxAgeMs: params.firecrawlMaxAgeMs, + proxy: params.firecrawlProxy, + storeInCache: params.firecrawlStoreInCache, + timeoutSeconds: params.firecrawlTimeoutSeconds, + }); + return { text: firecrawl.text, title: firecrawl.title }; + } catch { + return null; + } +} + export function createWebSearchTool(options?: { config?: ClawdbotConfig; sandboxed?: boolean; @@ -537,6 +803,21 @@ export function createWebSearchTool(options?: { }; } +function resolveFirecrawlEndpoint(baseUrl: string): string { + const trimmed = baseUrl.trim(); + if (!trimmed) return 
`${DEFAULT_FIRECRAWL_BASE_URL}/v2/scrape`; + try { + const url = new URL(trimmed); + if (url.pathname && url.pathname !== "/") { + return url.toString(); + } + url.pathname = "/v2/scrape"; + return url.toString(); + } catch { + return `${DEFAULT_FIRECRAWL_BASE_URL}/v2/scrape`; + } +} + export function createWebFetchTool(options?: { config?: ClawdbotConfig; sandboxed?: boolean; @@ -544,9 +825,19 @@ export function createWebFetchTool(options?: { const fetch = resolveFetchConfig(options?.config); if (!resolveFetchEnabled({ fetch, sandboxed: options?.sandboxed })) return null; const readabilityEnabled = resolveFetchReadabilityEnabled(fetch); + const firecrawl = resolveFirecrawlConfig(fetch); + const firecrawlApiKey = resolveFirecrawlApiKey(firecrawl); + const firecrawlEnabled = resolveFirecrawlEnabled({ firecrawl, apiKey: firecrawlApiKey }); + const firecrawlBaseUrl = resolveFirecrawlBaseUrl(firecrawl); + const firecrawlOnlyMainContent = resolveFirecrawlOnlyMainContent(firecrawl); + const firecrawlMaxAgeMs = resolveFirecrawlMaxAgeMsOrDefault(firecrawl); + const firecrawlTimeoutSeconds = resolveTimeoutSeconds( + firecrawl?.timeoutSeconds ?? 
fetch?.timeoutSeconds, + DEFAULT_TIMEOUT_SECONDS, + ); const userAgent = (fetch && "userAgent" in fetch && typeof fetch.userAgent === "string" && fetch.userAgent) || - `clawdbot/${VERSION}`; + DEFAULT_FETCH_USER_AGENT; return { label: "Web Fetch", name: "web_fetch", @@ -566,6 +857,14 @@ export function createWebFetchTool(options?: { cacheTtlMs: resolveCacheTtlMs(fetch?.cacheTtlMinutes, DEFAULT_CACHE_TTL_MINUTES), userAgent, readabilityEnabled, + firecrawlEnabled, + firecrawlApiKey, + firecrawlBaseUrl, + firecrawlOnlyMainContent, + firecrawlMaxAgeMs, + firecrawlProxy: "auto", + firecrawlStoreInCache: true, + firecrawlTimeoutSeconds, }); return jsonResult(result); }, diff --git a/src/config/schema.ts b/src/config/schema.ts index c9ede8420..cacc100d4 100644 --- a/src/config/schema.ts +++ b/src/config/schema.ts @@ -264,6 +264,17 @@ const FIELD_HELP: Record = { "tools.web.fetch.userAgent": "Override User-Agent header for web_fetch requests.", "tools.web.fetch.readability": "Use Readability to extract main content from HTML (fallbacks to basic HTML cleanup).", + "tools.web.fetch.firecrawl.enabled": "Enable Firecrawl fallback for web_fetch (if configured).", + "tools.web.fetch.firecrawl.apiKey": + "Firecrawl API key (fallback: FIRECRAWL_API_KEY env var).", + "tools.web.fetch.firecrawl.baseUrl": + "Firecrawl base URL (e.g. 
https://api.firecrawl.dev or custom endpoint).", + "tools.web.fetch.firecrawl.onlyMainContent": + "When true, Firecrawl returns only the main content (default: true).", + "tools.web.fetch.firecrawl.maxAgeMs": + "Firecrawl maxAge (ms) for cached results when supported by the API.", + "tools.web.fetch.firecrawl.timeoutSeconds": + "Timeout in seconds for Firecrawl requests.", "channels.slack.allowBots": "Allow bot-authored messages to trigger Slack replies (default: false).", "channels.slack.thread.historyScope": diff --git a/src/config/types.tools.ts b/src/config/types.tools.ts index 562d9e593..7f5407300 100644 --- a/src/config/types.tools.ts +++ b/src/config/types.tools.ts @@ -111,6 +111,20 @@ export type ToolsConfig = { userAgent?: string; /** Use Readability to extract main content (default: true). */ readability?: boolean; + firecrawl?: { + /** Enable Firecrawl fallback (default: true when apiKey is set). */ + enabled?: boolean; + /** Firecrawl API key (optional; defaults to FIRECRAWL_API_KEY env var). */ + apiKey?: string; + /** Firecrawl base URL (default: https://api.firecrawl.dev). */ + baseUrl?: string; + /** Whether to keep only main content (default: true). */ + onlyMainContent?: boolean; + /** Max age (ms) for cached Firecrawl content. */ + maxAgeMs?: number; + /** Timeout in seconds for Firecrawl requests. */ + timeoutSeconds?: number; + }; }; }; audio?: {