feat: enhance web_fetch fallbacks

Peter Steinberger
2026-01-17 00:00:15 +00:00
parent a84000c6d9
commit c54c665f97
11 changed files with 802 additions and 27 deletions

View File

@@ -12,7 +12,6 @@
- **BREAKING:** iOS minimum version is now 18.0 to support Textual markdown rendering in native chat. (#702)
- **BREAKING:** Microsoft Teams is now a plugin; install `@clawdbot/msteams` via `clawdbot plugins install @clawdbot/msteams`.
- **BREAKING:** Discord/Telegram channel tokens now prefer config over env (env is fallback only).
- **BREAKING:** Matrix channel credentials now prefer config over env (env is fallback only).
### Changes
- CLI: set process titles to `clawdbot-<command>` for clearer process listings.
@@ -20,7 +19,9 @@
- Telegram: scope inline buttons with allowlist default + callback gating in DMs/groups.
- Telegram: default reaction notifications to own.
- Tools: improve `web_fetch` extraction using Readability (with fallback).
- Channels: inject only pending (mention-gated) group history; clear history on any processed message.
- Tools: add Firecrawl fallback for `web_fetch` when configured.
- Tools: send Chrome-like headers by default for `web_fetch` to improve extraction on bot-sensitive sites.
- Tools: Firecrawl fallback now uses bot-circumvention + cache by default; remove basic HTML fallback when extraction fails.
- Heartbeat: tighten prompt guidance + suppress duplicate alerts for 24h. (#980) — thanks @voidserf.
- Repo: ignore local identity files to avoid accidental commits. (#1001) — thanks @gerardward2007.
- Sessions/Security: add `session.dmScope` for multi-user DM isolation and audit warnings. (#948) — thanks @Alphonse-arianee.
@@ -64,9 +65,6 @@
### Fixes
- Messages: make `/stop` clear queued followups and pending session lane work for a hard abort.
- Messages: make `/stop` abort active sub-agent runs spawned from the requester session and report how many were stopped.
- WhatsApp: report linked status consistently in channel status. (#1050) — thanks @YuriNachos.
- Sessions: keep per-session overrides when `/new` resets compaction counters. (#1050) — thanks @YuriNachos.
- Skills: allow OpenAI image-gen helper to handle URL or base64 responses. (#1050) — thanks @YuriNachos.
- WhatsApp: default response prefix only for self-chat, using identity name when set.
- Signal/iMessage: bound transport readiness waits to 30s with periodic logging. (#1014) — thanks @Szpadel.
- Auth: merge main auth profiles into per-agent stores for sub-agents and document inheritance. (#1013) — thanks @marcmarg.

View File

@@ -1715,6 +1715,12 @@ Legacy: `tools.bash` is still accepted as an alias.
- `tools.web.fetch.cacheTtlMinutes` (default 15)
- `tools.web.fetch.userAgent` (optional override)
- `tools.web.fetch.readability` (default true; disable to use basic HTML cleanup only)
- `tools.web.fetch.firecrawl.enabled` (default true when an API key is set)
- `tools.web.fetch.firecrawl.apiKey` (optional; defaults to `FIRECRAWL_API_KEY`)
- `tools.web.fetch.firecrawl.baseUrl` (default https://api.firecrawl.dev)
- `tools.web.fetch.firecrawl.onlyMainContent` (default true)
- `tools.web.fetch.firecrawl.maxAgeMs` (optional)
- `tools.web.fetch.firecrawl.timeoutSeconds` (optional)
`agents.defaults.subagents` configures sub-agent defaults:
- `model`: default model for spawned sub-agents (string or `{ primary, fallbacks }`). If omitted, sub-agents inherit the caller's model unless overridden per agent or per call.

58
docs/tools/firecrawl.md Normal file
View File

@@ -0,0 +1,58 @@
---
summary: "Firecrawl fallback for web_fetch (anti-bot + cached extraction)"
read_when:
- You want Firecrawl-backed web extraction
- You need a Firecrawl API key
- You want anti-bot extraction for web_fetch
---
# Firecrawl
Clawdbot can use **Firecrawl** as a fallback extractor for `web_fetch`. It is a hosted
content extraction service that supports bot circumvention and caching, which helps
with JS-heavy sites or pages that block plain HTTP fetches.
## Get an API key
1) Create a Firecrawl account and generate an API key.
2) Store it in config or set `FIRECRAWL_API_KEY` in the gateway environment.
## Configure Firecrawl
```json5
{
  tools: {
    web: {
      fetch: {
        firecrawl: {
          apiKey: "FIRECRAWL_API_KEY_HERE",
          baseUrl: "https://api.firecrawl.dev",
          onlyMainContent: true,
          maxAgeMs: 172800000,
          timeoutSeconds: 60
        }
      }
    }
  }
}
```
Notes:
- `firecrawl.enabled` defaults to true when an API key is present.
- `maxAgeMs` controls how old cached results can be (ms). Default is 2 days.
## Stealth / bot circumvention
Firecrawl exposes a **proxy mode** parameter for bot circumvention (`basic`, `stealth`, or `auto`).
Clawdbot always uses `proxy: "auto"` plus `storeInCache: true` for Firecrawl requests.
If `proxy` is omitted, Firecrawl defaults to `auto`; `auto` retries with stealth proxies when a basic attempt fails, which may use more credits than basic-only scraping.
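Concretely, the request looks roughly like the sketch below (simplified from `fetchFirecrawlContent` in this commit; the endpoint and field names come from the source, the values are the defaults described above, and `FIRECRAWL_API_KEY` is assumed to be set):
```ts
// Simplified sketch of the Firecrawl scrape request clawdbot issues.
const apiKey = process.env.FIRECRAWL_API_KEY ?? "";

async function firecrawlScrape(url: string): Promise<string> {
  const res = await fetch("https://api.firecrawl.dev/v2/scrape", {
    method: "POST",
    headers: {
      Authorization: `Bearer ${apiKey}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify({
      url,
      formats: ["markdown"],
      onlyMainContent: true,
      timeout: 60_000,      // ms
      maxAge: 172_800_000,  // accept cached results up to 2 days old
      proxy: "auto",        // retry with stealth proxies if the basic attempt fails
      storeInCache: true,
    }),
  });
  const payload = (await res.json()) as {
    success?: boolean;
    data?: { markdown?: string };
    error?: string;
  };
  if (!res.ok || payload.success === false) {
    throw new Error(`Firecrawl scrape failed (${res.status}): ${payload.error ?? res.statusText}`);
  }
  return payload.data?.markdown ?? "";
}
```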
## How `web_fetch` uses Firecrawl
`web_fetch` extraction order:
1) Readability (local)
2) Firecrawl (if configured)
3) If both return no content, the tool reports an error (the basic-HTML fallback was removed); the chain is sketched below.
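The chain behaves roughly like this sketch (simplified; the real logic lives in `runWebFetch` and also handles JSON and other non-HTML bodies, and the import path assumes the repo's `scripts/` layout used by the comparison script in this commit):
```ts
import { extractReadableContent, fetchFirecrawlContent } from "../src/agents/tools/web-tools.js";

// Simplified sketch of the web_fetch fallback chain for an HTML body.
async function extractWithFallback(html: string, url: string): Promise<{ text: string; extractor: string }> {
  // 1) Local Readability pass.
  const readable = await extractReadableContent({ html, url, extractMode: "markdown" });
  if (readable?.text) return { text: readable.text, extractor: "readability" };

  // 2) Firecrawl, when an API key is available.
  const apiKey = process.env.FIRECRAWL_API_KEY;
  if (apiKey) {
    const firecrawl = await fetchFirecrawlContent({
      url,
      extractMode: "markdown",
      apiKey,
      baseUrl: "https://api.firecrawl.dev",
      onlyMainContent: true,
      maxAgeMs: 172_800_000, // 2 days
      proxy: "auto",
      storeInCache: true,
      timeoutSeconds: 60,
    });
    if (firecrawl.text) return { text: firecrawl.text, extractor: "firecrawl" };
  }

  // 3) No basic-HTML fallback remains; surface an error instead.
  throw new Error("Readability and Firecrawl returned no content.");
}
```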
See [Web tools](/tools/web) for the full web tool setup.

View File

@@ -215,6 +215,7 @@ Notes:
- Responses are cached (default 15 min).
- For JS-heavy sites, prefer the browser tool.
- See [Web tools](/tools/web) for setup.
- See [Firecrawl](/tools/firecrawl) for the optional anti-bot fallback.
### `browser`
Control the dedicated clawd browser.

View File

@@ -104,6 +104,7 @@ Fetch a URL and extract readable content.
### Requirements
- `tools.web.fetch.enabled` must not be `false` (default: enabled)
- Optional Firecrawl fallback: set `tools.web.fetch.firecrawl.apiKey` or `FIRECRAWL_API_KEY`.
### Config
@@ -116,8 +117,16 @@ Fetch a URL and extract readable content.
maxChars: 50000,
timeoutSeconds: 30,
cacheTtlMinutes: 15,
userAgent: "clawdbot/2026.1.15",
readability: true
userAgent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
readability: true,
firecrawl: {
enabled: true,
apiKey: "FIRECRAWL_API_KEY_HERE", // optional if FIRECRAWL_API_KEY is set
baseUrl: "https://api.firecrawl.dev",
onlyMainContent: true,
maxAgeMs: 86400000, // ms (1 day)
timeoutSeconds: 60
}
}
}
}
@@ -131,8 +140,11 @@ Fetch a URL and extract readable content.
- `maxChars` (truncate long pages)
Notes:
- `web_fetch` uses Readability (main-content extraction) by default and falls back to basic HTML cleanup if it fails.
- `web_fetch` uses Readability (main-content extraction) first, then Firecrawl (if configured). If both fail, the tool returns an error.
- Firecrawl requests use bot-circumvention mode and cache results by default.
- `web_fetch` sends a Chrome-like User-Agent and `Accept-Language` by default; override `userAgent` if needed.
- `web_fetch` is best-effort extraction; some sites will need the browser tool.
- See [Firecrawl](/tools/firecrawl) for key setup and service details.
- Responses are cached (default 15 minutes) to reduce repeated fetches.
- If you use tool profiles/allowlists, add `web_search`/`web_fetch` or `group:web`.
- If the Brave key is missing, `web_search` returns a short setup hint with a docs link.
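For a quick local check, the tool can also be exercised directly, mirroring the comparison scripts added alongside this change (a minimal sketch: the import path assumes the repo's `scripts/` layout, and `FIRECRAWL_API_KEY` is only needed if you want the fallback):
```ts
import { createWebFetchTool } from "../src/agents/tools/web-tools.js";

// Build the tool with an inline config (cache disabled so every run fetches fresh).
const tool = createWebFetchTool({
  config: {
    tools: {
      web: {
        fetch: {
          cacheTtlMinutes: 0,
          firecrawl: { apiKey: process.env.FIRECRAWL_API_KEY },
        },
      },
    },
  },
  sandboxed: false,
});

async function main() {
  if (!tool) throw new Error("web_fetch tool is disabled");
  const result = await tool.execute("check", { url: "https://example.com/", extractMode: "markdown" });
  const details = result.details as { extractor?: string; title?: string; length?: number };
  console.log(`${details.extractor ?? "unknown"} len=${details.length ?? 0} title=${details.title ?? ""}`);
}

main().catch((error) => {
  console.error(error);
  process.exit(1);
});
```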

View File

@@ -0,0 +1,131 @@
import { extractReadableContent, fetchFirecrawlContent } from "../src/agents/tools/web-tools.js";
const DEFAULT_URLS = [
"https://en.wikipedia.org/wiki/Web_scraping",
"https://news.ycombinator.com/",
"https://www.apple.com/iphone/",
"https://www.nytimes.com/",
"https://www.reddit.com/r/javascript/",
];
const urls = process.argv.slice(2);
const targets = urls.length > 0 ? urls : DEFAULT_URLS;
const apiKey = process.env.FIRECRAWL_API_KEY;
const baseUrl = process.env.FIRECRAWL_BASE_URL ?? "https://api.firecrawl.dev";
const userAgent =
"Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36";
const timeoutMs = 30_000;
function truncate(value: string, max = 180): string {
if (!value) return "";
return value.length > max ? `${value.slice(0, max)}…` : value;
}
async function fetchHtml(url: string): Promise<{
ok: boolean;
status: number;
contentType: string;
finalUrl: string;
body: string;
}> {
const controller = new AbortController();
const timer = setTimeout(() => controller.abort(), timeoutMs);
try {
const res = await fetch(url, {
method: "GET",
headers: { Accept: "*/*", "User-Agent": userAgent },
signal: controller.signal,
});
const contentType = res.headers.get("content-type") ?? "application/octet-stream";
const body = await res.text();
return {
ok: res.ok,
status: res.status,
contentType,
finalUrl: res.url || url,
body,
};
} finally {
clearTimeout(timer);
}
}
async function run() {
if (!apiKey) {
console.log("FIRECRAWL_API_KEY not set. Firecrawl comparisons will be skipped.");
}
for (const url of targets) {
console.log(`\n=== ${url}`);
let localStatus = "skipped";
let localTitle = "";
let localText = "";
let localError: string | undefined;
try {
const res = await fetchHtml(url);
if (!res.ok) {
localStatus = `http ${res.status}`;
} else if (!res.contentType.includes("text/html")) {
localStatus = `non-html (${res.contentType})`;
} else {
const readable = await extractReadableContent({
html: res.body,
url: res.finalUrl,
extractMode: "markdown",
});
if (readable?.text) {
localStatus = "readability";
localTitle = readable.title ?? "";
localText = readable.text;
} else {
localStatus = "readability-empty";
}
}
} catch (error) {
localStatus = "error";
localError = error instanceof Error ? error.message : String(error);
}
console.log(
`local: ${localStatus} len=${localText.length} title=${truncate(localTitle, 80)}`
);
if (localError) console.log(`local error: ${localError}`);
if (localText) console.log(`local sample: ${truncate(localText)}`);
if (apiKey) {
try {
const firecrawl = await fetchFirecrawlContent({
url,
extractMode: "markdown",
apiKey,
baseUrl,
onlyMainContent: true,
maxAgeMs: 172_800_000,
proxy: "auto",
storeInCache: true,
timeoutSeconds: 60,
});
console.log(
`firecrawl: ok len=${firecrawl.text.length} title=${truncate(
firecrawl.title ?? "",
80,
)} status=${firecrawl.status ?? "n/a"}`
);
if (firecrawl.warning) console.log(`firecrawl warning: ${firecrawl.warning}`);
if (firecrawl.text) console.log(`firecrawl sample: ${truncate(firecrawl.text)}`);
} catch (error) {
const message = error instanceof Error ? error.message : String(error);
console.log(`firecrawl: error ${message}`);
}
}
}
process.exit(0);
}
run().catch((error) => {
console.error(error);
process.exit(1);
});

View File

@@ -0,0 +1,60 @@
import { createWebFetchTool } from "../src/agents/tools/web-tools.js";
const DEFAULT_URLS = [
"https://example.com/",
"https://news.ycombinator.com/",
"https://www.reddit.com/r/javascript/",
"https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent",
"https://httpbin.org/html",
];
const urls = process.argv.slice(2);
const targets = urls.length > 0 ? urls : DEFAULT_URLS;
async function runFetch(url: string, readability: boolean) {
if (!readability) {
throw new Error("Basic extraction removed. Set readability=true or enable Firecrawl.");
}
const tool = createWebFetchTool({
config: {
tools: {
web: { fetch: { readability, cacheTtlMinutes: 0, firecrawl: { enabled: false } } },
},
},
sandboxed: false,
});
if (!tool) throw new Error("web_fetch tool is disabled");
const result = await tool.execute("test", { url, extractMode: "markdown" });
return result.details as {
text?: string;
title?: string;
extractor?: string;
length?: number;
truncated?: boolean;
};
}
function truncate(value: string, max = 160): string {
if (!value) return "";
return value.length > max ? `${value.slice(0, max)}…` : value;
}
async function run() {
for (const url of targets) {
console.log(`\n=== ${url}`);
const readable = await runFetch(url, true);
console.log(
`readability: ${readable.extractor ?? "unknown"} len=${readable.length ?? 0} title=${truncate(
readable.title ?? "",
80,
)}`,
);
if (readable.text) console.log(`readability sample: ${truncate(readable.text)}`);
}
}
run().catch((error) => {
console.error(error);
process.exit(1);
});

View File

@@ -0,0 +1,185 @@
import { afterEach, describe, expect, it, vi } from "vitest";
import { createWebFetchTool } from "./web-tools.js";
type MockResponse = {
ok: boolean;
status: number;
url?: string;
headers?: { get: (key: string) => string | null };
text?: () => Promise<string>;
json?: () => Promise<unknown>;
};
function makeHeaders(map: Record<string, string>): { get: (key: string) => string | null } {
return {
get: (key) => map[key.toLowerCase()] ?? null,
};
}
function htmlResponse(html: string, url = "https://example.com/"): MockResponse {
return {
ok: true,
status: 200,
url,
headers: makeHeaders({ "content-type": "text/html; charset=utf-8" }),
text: async () => html,
};
}
function firecrawlResponse(markdown: string, url = "https://example.com/"): MockResponse {
return {
ok: true,
status: 200,
json: async () => ({
success: true,
data: {
markdown,
metadata: { title: "Firecrawl Title", sourceURL: url, statusCode: 200 },
},
}),
};
}
function firecrawlError(): MockResponse {
return {
ok: false,
status: 403,
json: async () => ({ success: false, error: "blocked" }),
};
}
function requestUrl(input: RequestInfo): string {
if (typeof input === "string") return input;
if (input instanceof URL) return input.toString();
if ("url" in input && typeof input.url === "string") return input.url;
return "";
}
describe("web_fetch extraction fallbacks", () => {
const priorFetch = global.fetch;
afterEach(() => {
// @ts-expect-error restore
global.fetch = priorFetch;
vi.restoreAllMocks();
});
it("falls back to firecrawl when readability returns no content", async () => {
const mockFetch = vi.fn((input: RequestInfo) => {
const url = requestUrl(input);
if (url.includes("api.firecrawl.dev")) {
return Promise.resolve(firecrawlResponse("firecrawl content")) as Promise<Response>;
}
return Promise.resolve(
htmlResponse("<!doctype html><html><head></head><body></body></html>", url),
) as Promise<Response>;
});
// @ts-expect-error mock fetch
global.fetch = mockFetch;
const tool = createWebFetchTool({
config: {
tools: {
web: {
fetch: {
cacheTtlMinutes: 0,
firecrawl: { apiKey: "firecrawl-test" },
},
},
},
},
sandboxed: false,
});
const result = await tool?.execute?.("call", { url: "https://example.com/empty" });
const details = result?.details as { extractor?: string; text?: string };
expect(details.extractor).toBe("firecrawl");
expect(details.text).toContain("firecrawl content");
});
it("throws when readability is disabled and firecrawl is unavailable", async () => {
const mockFetch = vi.fn((input: RequestInfo) =>
Promise.resolve(htmlResponse("<html><body>hi</body></html>", requestUrl(input))),
);
// @ts-expect-error mock fetch
global.fetch = mockFetch;
const tool = createWebFetchTool({
config: {
tools: {
web: {
fetch: { readability: false, cacheTtlMinutes: 0, firecrawl: { enabled: false } },
},
},
},
sandboxed: false,
});
await expect(
tool?.execute?.("call", { url: "https://example.com/readability-off" }),
).rejects.toThrow("Readability disabled");
});
it("throws when readability is empty and firecrawl fails", async () => {
const mockFetch = vi.fn((input: RequestInfo) => {
const url = requestUrl(input);
if (url.includes("api.firecrawl.dev")) {
return Promise.resolve(firecrawlError()) as Promise<Response>;
}
return Promise.resolve(
htmlResponse("<!doctype html><html><head></head><body></body></html>", url),
) as Promise<Response>;
});
// @ts-expect-error mock fetch
global.fetch = mockFetch;
const tool = createWebFetchTool({
config: {
tools: {
web: {
fetch: { cacheTtlMinutes: 0, firecrawl: { apiKey: "firecrawl-test" } },
},
},
},
sandboxed: false,
});
await expect(
tool?.execute?.("call", { url: "https://example.com/readability-empty" }),
).rejects.toThrow("Readability and Firecrawl returned no content");
});
it("uses firecrawl when direct fetch fails", async () => {
const mockFetch = vi.fn((input: RequestInfo) => {
const url = requestUrl(input);
if (url.includes("api.firecrawl.dev")) {
return Promise.resolve(firecrawlResponse("firecrawl fallback", url)) as Promise<Response>;
}
return Promise.resolve({
ok: false,
status: 403,
headers: makeHeaders({ "content-type": "text/html" }),
text: async () => "blocked",
} as Response);
});
// @ts-expect-error mock fetch
global.fetch = mockFetch;
const tool = createWebFetchTool({
config: {
tools: {
web: {
fetch: { cacheTtlMinutes: 0, firecrawl: { apiKey: "firecrawl-test" } },
},
},
},
sandboxed: false,
});
const result = await tool?.execute?.("call", { url: "https://example.com/blocked" });
const details = result?.details as { extractor?: string; text?: string };
expect(details.extractor).toBe("firecrawl");
expect(details.text).toContain("firecrawl fallback");
});
});

View File

@@ -1,7 +1,6 @@
import { Type } from "@sinclair/typebox";
import type { ClawdbotConfig } from "../../config/config.js";
import { VERSION } from "../../version.js";
import { stringEnum } from "../schema/typebox.js";
import type { AnyAgentTool } from "./common.js";
import { jsonResult, readNumberParam, readStringParam } from "./common.js";
@@ -15,6 +14,10 @@ const DEFAULT_FETCH_MAX_CHARS = 50_000;
const DEFAULT_TIMEOUT_SECONDS = 30;
const DEFAULT_CACHE_TTL_MINUTES = 15;
const DEFAULT_CACHE_MAX_ENTRIES = 100;
const DEFAULT_FIRECRAWL_BASE_URL = "https://api.firecrawl.dev";
const DEFAULT_FIRECRAWL_MAX_AGE_MS = 172_800_000;
const DEFAULT_FETCH_USER_AGENT =
"Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36";
const BRAVE_SEARCH_ENDPOINT = "https://api.search.brave.com/res/v1/web/search";
@@ -30,6 +33,15 @@ type WebFetchConfig = NonNullable<ClawdbotConfig["tools"]>["web"] extends infer
: undefined
: undefined;
type FirecrawlFetchConfig = {
enabled?: boolean;
apiKey?: string;
baseUrl?: string;
onlyMainContent?: boolean;
maxAgeMs?: number;
timeoutSeconds?: number;
} | undefined;
type CacheEntry<T> = {
value: T;
expiresAt: number;
@@ -123,6 +135,13 @@ function resolveFetchReadabilityEnabled(fetch?: WebFetchConfig): boolean {
return true;
}
function resolveFirecrawlConfig(fetch?: WebFetchConfig): FirecrawlFetchConfig {
if (!fetch || typeof fetch !== "object") return undefined;
const firecrawl = "firecrawl" in fetch ? fetch.firecrawl : undefined;
if (!firecrawl || typeof firecrawl !== "object") return undefined;
return firecrawl as FirecrawlFetchConfig;
}
function resolveSearchApiKey(search?: WebSearchConfig): string | undefined {
const fromConfig =
search && "apiKey" in search && typeof search.apiKey === "string" ? search.apiKey.trim() : "";
@@ -130,6 +149,52 @@ function resolveSearchApiKey(search?: WebSearchConfig): string | undefined {
return fromConfig || fromEnv || undefined;
}
function resolveFirecrawlApiKey(firecrawl?: FirecrawlFetchConfig): string | undefined {
const fromConfig =
firecrawl && "apiKey" in firecrawl && typeof firecrawl.apiKey === "string"
? firecrawl.apiKey.trim()
: "";
const fromEnv = (process.env.FIRECRAWL_API_KEY ?? "").trim();
return fromConfig || fromEnv || undefined;
}
function resolveFirecrawlEnabled(params: {
firecrawl?: FirecrawlFetchConfig;
apiKey?: string;
}): boolean {
if (typeof params.firecrawl?.enabled === "boolean") return params.firecrawl.enabled;
return Boolean(params.apiKey);
}
function resolveFirecrawlBaseUrl(firecrawl?: FirecrawlFetchConfig): string {
const raw =
firecrawl && "baseUrl" in firecrawl && typeof firecrawl.baseUrl === "string"
? firecrawl.baseUrl.trim()
: "";
return raw || DEFAULT_FIRECRAWL_BASE_URL;
}
function resolveFirecrawlOnlyMainContent(firecrawl?: FirecrawlFetchConfig): boolean {
if (typeof firecrawl?.onlyMainContent === "boolean") return firecrawl.onlyMainContent;
return true;
}
function resolveFirecrawlMaxAgeMs(firecrawl?: FirecrawlFetchConfig): number | undefined {
const raw =
firecrawl && "maxAgeMs" in firecrawl && typeof firecrawl.maxAgeMs === "number"
? firecrawl.maxAgeMs
: undefined;
if (typeof raw !== "number" || !Number.isFinite(raw)) return undefined;
const parsed = Math.max(0, Math.floor(raw));
return parsed > 0 ? parsed : undefined;
}
function resolveFirecrawlMaxAgeMsOrDefault(firecrawl?: FirecrawlFetchConfig): number {
const resolved = resolveFirecrawlMaxAgeMs(firecrawl);
if (typeof resolved === "number") return resolved;
return DEFAULT_FIRECRAWL_MAX_AGE_MS;
}
function missingSearchKeyPayload() {
return {
error: "missing_brave_api_key",
@@ -278,9 +343,18 @@ function htmlToMarkdown(html: string): { text: string; title?: string } {
return { text, title };
}
function htmlToText(html: string): { text: string; title?: string } {
const { text, title } = htmlToMarkdown(html);
return { text, title };
function markdownToText(markdown: string): string {
let text = markdown;
text = text.replace(/!\[[^\]]*]\([^)]+\)/g, "");
text = text.replace(/\[([^\]]+)]\([^)]+\)/g, "$1");
text = text.replace(/```[\s\S]*?```/g, (block) =>
block.replace(/```[^\n]*\n?/g, "").replace(/```/g, ""),
);
text = text.replace(/`([^`]+)`/g, "$1");
text = text.replace(/^#{1,6}\s+/gm, "");
text = text.replace(/^\s*[-*+]\s+/gm, "");
text = text.replace(/^\s*\d+\.\s+/gm, "");
return normalizeWhitespace(text);
}
function truncateText(value: string, maxChars: number): { text: string; truncated: boolean } {
@@ -336,6 +410,81 @@ export async function extractReadableContent(params: {
}
}
export async function fetchFirecrawlContent(params: {
url: string;
extractMode: (typeof EXTRACT_MODES)[number];
apiKey: string;
baseUrl: string;
onlyMainContent: boolean;
maxAgeMs: number;
proxy: "auto" | "basic" | "stealth";
storeInCache: boolean;
timeoutSeconds: number;
}): Promise<{
text: string;
title?: string;
finalUrl?: string;
status?: number;
warning?: string;
}> {
const endpoint = resolveFirecrawlEndpoint(params.baseUrl);
const body: Record<string, unknown> = {
url: params.url,
formats: ["markdown"],
onlyMainContent: params.onlyMainContent,
timeout: params.timeoutSeconds * 1000,
maxAge: params.maxAgeMs,
proxy: params.proxy,
storeInCache: params.storeInCache,
};
const res = await fetch(endpoint, {
method: "POST",
headers: {
Authorization: `Bearer ${params.apiKey}`,
"Content-Type": "application/json",
},
body: JSON.stringify(body),
signal: withTimeout(undefined, params.timeoutSeconds * 1000),
});
const payload = (await res.json()) as {
success?: boolean;
data?: {
markdown?: string;
content?: string;
metadata?: {
title?: string;
sourceURL?: string;
statusCode?: number;
};
};
warning?: string;
error?: string;
};
if (!res.ok || payload?.success === false) {
const detail = payload?.error || res.statusText;
throw new Error(`Firecrawl fetch failed (${res.status}): ${detail}`.trim());
}
const data = payload?.data ?? {};
const rawText =
typeof data.markdown === "string"
? data.markdown
: typeof data.content === "string"
? data.content
: "";
const text = params.extractMode === "text" ? markdownToText(rawText) : rawText;
return {
text,
title: data.metadata?.title,
finalUrl: data.metadata?.sourceURL,
status: data.metadata?.statusCode,
warning: payload?.warning,
};
}
async function runWebSearch(params: {
query: string;
count: number;
@@ -414,6 +563,14 @@ async function runWebFetch(params: {
cacheTtlMs: number;
userAgent: string;
readabilityEnabled: boolean;
firecrawlEnabled: boolean;
firecrawlApiKey?: string;
firecrawlBaseUrl: string;
firecrawlOnlyMainContent: boolean;
firecrawlMaxAgeMs: number;
firecrawlProxy: "auto" | "basic" | "stealth";
firecrawlStoreInCache: boolean;
firecrawlTimeoutSeconds: number;
}): Promise<Record<string, unknown>> {
const cacheKey = normalizeCacheKey(
`fetch:${params.url}:${params.extractMode}:${params.maxChars}`,
@@ -432,16 +589,84 @@ async function runWebFetch(params: {
}
const start = Date.now();
const res = await fetch(parsedUrl.toString(), {
method: "GET",
headers: {
Accept: "*/*",
"User-Agent": params.userAgent,
},
signal: withTimeout(undefined, params.timeoutSeconds * 1000),
});
let res: Response;
try {
res = await fetch(parsedUrl.toString(), {
method: "GET",
headers: {
Accept: "*/*",
"User-Agent": params.userAgent,
"Accept-Language": "en-US,en;q=0.9",
},
signal: withTimeout(undefined, params.timeoutSeconds * 1000),
});
} catch (error) {
if (params.firecrawlEnabled && params.firecrawlApiKey) {
const firecrawl = await fetchFirecrawlContent({
url: params.url,
extractMode: params.extractMode,
apiKey: params.firecrawlApiKey,
baseUrl: params.firecrawlBaseUrl,
onlyMainContent: params.firecrawlOnlyMainContent,
maxAgeMs: params.firecrawlMaxAgeMs,
proxy: params.firecrawlProxy,
storeInCache: params.firecrawlStoreInCache,
timeoutSeconds: params.firecrawlTimeoutSeconds,
});
const truncated = truncateText(firecrawl.text, params.maxChars);
const payload = {
url: params.url,
finalUrl: firecrawl.finalUrl || params.url,
status: firecrawl.status ?? 200,
contentType: "text/markdown",
title: firecrawl.title,
extractMode: params.extractMode,
extractor: "firecrawl",
truncated: truncated.truncated,
length: truncated.text.length,
fetchedAt: new Date().toISOString(),
tookMs: Date.now() - start,
text: truncated.text,
warning: firecrawl.warning,
};
writeCache(FETCH_CACHE, cacheKey, payload, params.cacheTtlMs);
return payload;
}
throw error;
}
if (!res.ok) {
if (params.firecrawlEnabled && params.firecrawlApiKey) {
const firecrawl = await fetchFirecrawlContent({
url: params.url,
extractMode: params.extractMode,
apiKey: params.firecrawlApiKey,
baseUrl: params.firecrawlBaseUrl,
onlyMainContent: params.firecrawlOnlyMainContent,
maxAgeMs: params.firecrawlMaxAgeMs,
proxy: params.firecrawlProxy,
storeInCache: params.firecrawlStoreInCache,
timeoutSeconds: params.firecrawlTimeoutSeconds,
});
const truncated = truncateText(firecrawl.text, params.maxChars);
const payload = {
url: params.url,
finalUrl: firecrawl.finalUrl || params.url,
status: firecrawl.status ?? res.status,
contentType: "text/markdown",
title: firecrawl.title,
extractMode: params.extractMode,
extractor: "firecrawl",
truncated: truncated.truncated,
length: truncated.text.length,
fetchedAt: new Date().toISOString(),
tookMs: Date.now() - start,
text: truncated.text,
warning: firecrawl.warning,
};
writeCache(FETCH_CACHE, cacheKey, payload, params.cacheTtlMs);
return payload;
}
const detail = await readResponseText(res);
throw new Error(`Web fetch failed (${res.status}): ${detail || res.statusText}`);
}
@@ -450,6 +675,7 @@ async function runWebFetch(params: {
const body = await readResponseText(res);
let title: string | undefined;
let extractor = "raw";
let text = body;
if (contentType.includes("text/html")) {
if (params.readabilityEnabled) {
@@ -461,21 +687,29 @@ async function runWebFetch(params: {
if (readable?.text) {
text = readable.text;
title = readable.title;
extractor = "readability";
} else {
const parsed = params.extractMode === "text" ? htmlToText(body) : htmlToMarkdown(body);
text = parsed.text;
title = parsed.title;
const firecrawl = await tryFirecrawlFallback(params);
if (firecrawl) {
text = firecrawl.text;
title = firecrawl.title;
extractor = "firecrawl";
} else {
throw new Error(
"Web fetch extraction failed: Readability and Firecrawl returned no content.",
);
}
}
} else {
const parsed = params.extractMode === "text" ? htmlToText(body) : htmlToMarkdown(body);
text = parsed.text;
title = parsed.title;
throw new Error("Web fetch extraction failed: Readability disabled and Firecrawl unavailable.");
}
} else if (contentType.includes("application/json")) {
try {
text = JSON.stringify(JSON.parse(body), null, 2);
extractor = "json";
} catch {
text = body;
extractor = "raw";
}
}
@@ -487,6 +721,7 @@ async function runWebFetch(params: {
contentType,
title,
extractMode: params.extractMode,
extractor,
truncated: truncated.truncated,
length: truncated.text.length,
fetchedAt: new Date().toISOString(),
@@ -497,6 +732,37 @@ async function runWebFetch(params: {
return payload;
}
async function tryFirecrawlFallback(params: {
url: string;
extractMode: (typeof EXTRACT_MODES)[number];
firecrawlEnabled: boolean;
firecrawlApiKey?: string;
firecrawlBaseUrl: string;
firecrawlOnlyMainContent: boolean;
firecrawlMaxAgeMs: number;
firecrawlProxy: "auto" | "basic" | "stealth";
firecrawlStoreInCache: boolean;
firecrawlTimeoutSeconds: number;
}): Promise<{ text: string; title?: string } | null> {
if (!params.firecrawlEnabled || !params.firecrawlApiKey) return null;
try {
const firecrawl = await fetchFirecrawlContent({
url: params.url,
extractMode: params.extractMode,
apiKey: params.firecrawlApiKey,
baseUrl: params.firecrawlBaseUrl,
onlyMainContent: params.firecrawlOnlyMainContent,
maxAgeMs: params.firecrawlMaxAgeMs,
proxy: params.firecrawlProxy,
storeInCache: params.firecrawlStoreInCache,
timeoutSeconds: params.firecrawlTimeoutSeconds,
});
return { text: firecrawl.text, title: firecrawl.title };
} catch {
return null;
}
}
export function createWebSearchTool(options?: {
config?: ClawdbotConfig;
sandboxed?: boolean;
@@ -537,6 +803,21 @@ export function createWebSearchTool(options?: {
};
}
function resolveFirecrawlEndpoint(baseUrl: string): string {
const trimmed = baseUrl.trim();
if (!trimmed) return `${DEFAULT_FIRECRAWL_BASE_URL}/v2/scrape`;
try {
const url = new URL(trimmed);
if (url.pathname && url.pathname !== "/") {
return url.toString();
}
url.pathname = "/v2/scrape";
return url.toString();
} catch {
return `${DEFAULT_FIRECRAWL_BASE_URL}/v2/scrape`;
}
}
export function createWebFetchTool(options?: {
config?: ClawdbotConfig;
sandboxed?: boolean;
@@ -544,9 +825,19 @@ export function createWebFetchTool(options?: {
const fetch = resolveFetchConfig(options?.config);
if (!resolveFetchEnabled({ fetch, sandboxed: options?.sandboxed })) return null;
const readabilityEnabled = resolveFetchReadabilityEnabled(fetch);
const firecrawl = resolveFirecrawlConfig(fetch);
const firecrawlApiKey = resolveFirecrawlApiKey(firecrawl);
const firecrawlEnabled = resolveFirecrawlEnabled({ firecrawl, apiKey: firecrawlApiKey });
const firecrawlBaseUrl = resolveFirecrawlBaseUrl(firecrawl);
const firecrawlOnlyMainContent = resolveFirecrawlOnlyMainContent(firecrawl);
const firecrawlMaxAgeMs = resolveFirecrawlMaxAgeMsOrDefault(firecrawl);
const firecrawlTimeoutSeconds = resolveTimeoutSeconds(
firecrawl?.timeoutSeconds ?? fetch?.timeoutSeconds,
DEFAULT_TIMEOUT_SECONDS,
);
const userAgent =
(fetch && "userAgent" in fetch && typeof fetch.userAgent === "string" && fetch.userAgent) ||
`clawdbot/${VERSION}`;
DEFAULT_FETCH_USER_AGENT;
return {
label: "Web Fetch",
name: "web_fetch",
@@ -566,6 +857,14 @@ export function createWebFetchTool(options?: {
cacheTtlMs: resolveCacheTtlMs(fetch?.cacheTtlMinutes, DEFAULT_CACHE_TTL_MINUTES),
userAgent,
readabilityEnabled,
firecrawlEnabled,
firecrawlApiKey,
firecrawlBaseUrl,
firecrawlOnlyMainContent,
firecrawlMaxAgeMs,
firecrawlProxy: "auto",
firecrawlStoreInCache: true,
firecrawlTimeoutSeconds,
});
return jsonResult(result);
},

View File

@@ -264,6 +264,17 @@ const FIELD_HELP: Record<string, string> = {
"tools.web.fetch.userAgent": "Override User-Agent header for web_fetch requests.",
"tools.web.fetch.readability":
"Use Readability to extract main content from HTML (fallbacks to basic HTML cleanup).",
"tools.web.fetch.firecrawl.enabled": "Enable Firecrawl fallback for web_fetch (if configured).",
"tools.web.fetch.firecrawl.apiKey":
"Firecrawl API key (fallback: FIRECRAWL_API_KEY env var).",
"tools.web.fetch.firecrawl.baseUrl":
"Firecrawl base URL (e.g. https://api.firecrawl.dev or custom endpoint).",
"tools.web.fetch.firecrawl.onlyMainContent":
"When true, Firecrawl returns only the main content (default: true).",
"tools.web.fetch.firecrawl.maxAgeMs":
"Firecrawl maxAge (ms) for cached results when supported by the API.",
"tools.web.fetch.firecrawl.timeoutSeconds":
"Timeout in seconds for Firecrawl requests.",
"channels.slack.allowBots":
"Allow bot-authored messages to trigger Slack replies (default: false).",
"channels.slack.thread.historyScope":

View File

@@ -111,6 +111,20 @@ export type ToolsConfig = {
userAgent?: string;
/** Use Readability to extract main content (default: true). */
readability?: boolean;
firecrawl?: {
/** Enable Firecrawl fallback (default: true when apiKey is set). */
enabled?: boolean;
/** Firecrawl API key (optional; defaults to FIRECRAWL_API_KEY env var). */
apiKey?: string;
/** Firecrawl base URL (default: https://api.firecrawl.dev). */
baseUrl?: string;
/** Whether to keep only main content (default: true). */
onlyMainContent?: boolean;
/** Max age (ms) for cached Firecrawl content. */
maxAgeMs?: number;
/** Timeout in seconds for Firecrawl requests. */
timeoutSeconds?: number;
};
};
};
audio?: {