feat: enhance web_fetch fallbacks
@@ -12,7 +12,6 @@
- **BREAKING:** iOS minimum version is now 18.0 to support Textual markdown rendering in native chat. (#702)
- **BREAKING:** Microsoft Teams is now a plugin; install it via `clawdbot plugins install @clawdbot/msteams`.
- **BREAKING:** Discord/Telegram channel tokens now prefer config over env (env is fallback only).
- **BREAKING:** Matrix channel credentials now prefer config over env (env is fallback only).

### Changes

- CLI: set process titles to `clawdbot-<command>` for clearer process listings.
@@ -20,7 +19,9 @@
- Telegram: scope inline buttons with allowlist default + callback gating in DMs/groups.
- Telegram: default reaction notifications to own.
- Tools: improve `web_fetch` extraction using Readability (with fallback).
- Channels: inject only pending (mention-gated) group history; clear history on any processed message.
- Tools: add Firecrawl fallback for `web_fetch` when configured.
- Tools: send Chrome-like headers by default for `web_fetch` to improve extraction on bot-sensitive sites.
- Tools: Firecrawl fallback now uses bot-circumvention + cache by default; remove basic HTML fallback when extraction fails.
- Heartbeat: tighten prompt guidance + suppress duplicate alerts for 24h. (#980) — thanks @voidserf.
- Repo: ignore local identity files to avoid accidental commits. (#1001) — thanks @gerardward2007.
- Sessions/Security: add `session.dmScope` for multi-user DM isolation and audit warnings. (#948) — thanks @Alphonse-arianee.
@@ -64,9 +65,6 @@

### Fixes

- Messages: make `/stop` clear queued followups and pending session lane work for a hard abort.
- Messages: make `/stop` abort active sub-agent runs spawned from the requester session and report how many were stopped.
- WhatsApp: report linked status consistently in channel status. (#1050) — thanks @YuriNachos.
- Sessions: keep per-session overrides when `/new` resets compaction counters. (#1050) — thanks @YuriNachos.
- Skills: allow OpenAI image-gen helper to handle URL or base64 responses. (#1050) — thanks @YuriNachos.
- WhatsApp: default response prefix only for self-chat, using identity name when set.
- Signal/iMessage: bound transport readiness waits to 30s with periodic logging. (#1014) — thanks @Szpadel.
- Auth: merge main auth profiles into per-agent stores for sub-agents and document inheritance. (#1013) — thanks @marcmarg.
@@ -1715,6 +1715,12 @@ Legacy: `tools.bash` is still accepted as an alias.

- `tools.web.fetch.cacheTtlMinutes` (default 15)
- `tools.web.fetch.userAgent` (optional override)
- `tools.web.fetch.readability` (default true; if disabled, the Firecrawl fallback must be configured)
- `tools.web.fetch.firecrawl.enabled` (default true when an API key is set)
- `tools.web.fetch.firecrawl.apiKey` (optional; defaults to `FIRECRAWL_API_KEY`)
- `tools.web.fetch.firecrawl.baseUrl` (default `https://api.firecrawl.dev`)
- `tools.web.fetch.firecrawl.onlyMainContent` (default true)
- `tools.web.fetch.firecrawl.maxAgeMs` (optional)
- `tools.web.fetch.firecrawl.timeoutSeconds` (optional)
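As a sketch, the fetch-related options above combine like this (values shown are the documented defaults, not required settings):

```json5
{
  tools: {
    web: {
      fetch: {
        cacheTtlMinutes: 15,
        readability: true,
        firecrawl: {
          enabled: true, // default when an API key is set
          baseUrl: "https://api.firecrawl.dev",
          onlyMainContent: true
        }
      }
    }
  }
}
```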
`agents.defaults.subagents` configures sub-agent defaults:

- `model`: default model for spawned sub-agents (string or `{ primary, fallbacks }`). If omitted, sub-agents inherit the caller’s model unless overridden per agent or per call.
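For example, the `{ primary, fallbacks }` form might look like this (the model IDs here are placeholders, not recommendations):

```json5
{
  agents: {
    defaults: {
      subagents: {
        // string form: model: "some-model-id"
        model: { primary: "primary-model-id", fallbacks: ["fallback-model-id"] }
      }
    }
  }
}
```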
58
docs/tools/firecrawl.md
Normal file
@@ -0,0 +1,58 @@
---
summary: "Firecrawl fallback for web_fetch (anti-bot + cached extraction)"
read_when:
- You want Firecrawl-backed web extraction
- You need a Firecrawl API key
- You want anti-bot extraction for web_fetch
---

# Firecrawl

Clawdbot can use **Firecrawl** as a fallback extractor for `web_fetch`. It is a hosted
content extraction service that supports bot circumvention and caching, which helps
with JS-heavy sites or pages that block plain HTTP fetches.

## Get an API key

1) Create a Firecrawl account and generate an API key.
2) Store it in config or set `FIRECRAWL_API_KEY` in the gateway environment.

## Configure Firecrawl

```json5
{
  tools: {
    web: {
      fetch: {
        firecrawl: {
          apiKey: "FIRECRAWL_API_KEY_HERE",
          baseUrl: "https://api.firecrawl.dev",
          onlyMainContent: true,
          maxAgeMs: 172800000,
          timeoutSeconds: 60
        }
      }
    }
  }
}
```

Notes:
- `firecrawl.enabled` defaults to true when an API key is present.
- `maxAgeMs` controls how old cached results can be (in ms). Default is 2 days.

## Stealth / bot circumvention

Firecrawl exposes a **proxy mode** parameter for bot circumvention (`basic`, `stealth`, or `auto`).
Clawdbot always uses `proxy: "auto"` plus `storeInCache: true` for Firecrawl requests.
If proxy is omitted, Firecrawl defaults to `auto`. `auto` retries with stealth proxies if a basic
attempt fails, which may use more credits than basic-only scraping.
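For reference, the scrape request body built from these settings looks roughly like this (field names follow Firecrawl's scrape API; the URL is a placeholder and the values are the defaults described above):

```json5
{
  url: "https://example.com/some-page",
  formats: ["markdown"],
  onlyMainContent: true,
  timeout: 60000,     // timeoutSeconds * 1000
  maxAge: 172800000,  // maxAgeMs (2 days)
  proxy: "auto",
  storeInCache: true
}
```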
## How `web_fetch` uses Firecrawl

`web_fetch` extraction order:

1) Readability (local)
2) Firecrawl (if configured)
3) If both fail, `web_fetch` returns an error (the basic HTML cleanup fallback was removed).

See [Web tools](/tools/web) for the full web tool setup.
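The order above can be sketched as follows (a minimal illustration of the fallback chain, not the actual Clawdbot implementation; the function names and extractor stubs are hypothetical):

```typescript
// Sketch of the web_fetch extraction order: Readability first,
// then Firecrawl if configured, otherwise an error.
type Extraction = { extractor: "readability" | "firecrawl"; text: string };

async function extractWithFallbacks(
  tryReadability: () => Promise<string | null>,
  tryFirecrawl?: () => Promise<string | null>,
): Promise<Extraction> {
  const readable = await tryReadability();
  if (readable) return { extractor: "readability", text: readable };
  if (tryFirecrawl) {
    const viaFirecrawl = await tryFirecrawl();
    if (viaFirecrawl) return { extractor: "firecrawl", text: viaFirecrawl };
  }
  // No basic-HTML fallback: the caller surfaces this as a tool error.
  throw new Error("Readability and Firecrawl returned no content");
}
```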
@@ -215,6 +215,7 @@ Notes:
- Responses are cached (default 15 min).
- For JS-heavy sites, prefer the browser tool.
- See [Web tools](/tools/web) for setup.
- See [Firecrawl](/tools/firecrawl) for the optional anti-bot fallback.

### `browser`

Control the dedicated clawd browser.
@@ -104,6 +104,7 @@ Fetch a URL and extract readable content.
### Requirements

- `tools.web.fetch.enabled` must not be `false` (default: enabled)
- Optional Firecrawl fallback: set `tools.web.fetch.firecrawl.apiKey` or `FIRECRAWL_API_KEY`.

### Config
@@ -116,8 +117,16 @@ Fetch a URL and extract readable content.
        maxChars: 50000,
        timeoutSeconds: 30,
        cacheTtlMinutes: 15,
        userAgent: "clawdbot/2026.1.15",
        readability: true
        userAgent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
        readability: true,
        firecrawl: {
          enabled: true,
          apiKey: "FIRECRAWL_API_KEY_HERE", // optional if FIRECRAWL_API_KEY is set
          baseUrl: "https://api.firecrawl.dev",
          onlyMainContent: true,
          maxAgeMs: 86400000, // ms (1 day)
          timeoutSeconds: 60
        }
      }
    }
  }
@@ -131,8 +140,11 @@ Fetch a URL and extract readable content.
- `maxChars` (truncate long pages)

Notes:
- `web_fetch` uses Readability (main-content extraction) by default and falls back to basic HTML cleanup if it fails.
- `web_fetch` uses Readability (main-content extraction) first, then Firecrawl (if configured). If both fail, the tool returns an error.
- Firecrawl requests use bot-circumvention mode and cache results by default.
- `web_fetch` sends a Chrome-like User-Agent and `Accept-Language` by default; override `userAgent` if needed.
- `web_fetch` is best-effort extraction; some sites will need the browser tool.
- See [Firecrawl](/tools/firecrawl) for key setup and service details.
- Responses are cached (default 15 minutes) to reduce repeated fetches.
- If you use tool profiles/allowlists, add `web_search`/`web_fetch` or `group:web`.
- If the Brave key is missing, `web_search` returns a short setup hint with a docs link.
131
scripts/firecrawl-compare.ts
Normal file
@@ -0,0 +1,131 @@
import { extractReadableContent, fetchFirecrawlContent } from "../src/agents/tools/web-tools.js";

const DEFAULT_URLS = [
  "https://en.wikipedia.org/wiki/Web_scraping",
  "https://news.ycombinator.com/",
  "https://www.apple.com/iphone/",
  "https://www.nytimes.com/",
  "https://www.reddit.com/r/javascript/",
];

const urls = process.argv.slice(2);
const targets = urls.length > 0 ? urls : DEFAULT_URLS;
const apiKey = process.env.FIRECRAWL_API_KEY;
const baseUrl = process.env.FIRECRAWL_BASE_URL ?? "https://api.firecrawl.dev";

const userAgent =
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36";
const timeoutMs = 30_000;

function truncate(value: string, max = 180): string {
  if (!value) return "";
  return value.length > max ? `${value.slice(0, max)}…` : value;
}

async function fetchHtml(url: string): Promise<{
  ok: boolean;
  status: number;
  contentType: string;
  finalUrl: string;
  body: string;
}> {
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), timeoutMs);
  try {
    const res = await fetch(url, {
      method: "GET",
      headers: { Accept: "*/*", "User-Agent": userAgent },
      signal: controller.signal,
    });
    const contentType = res.headers.get("content-type") ?? "application/octet-stream";
    const body = await res.text();
    return {
      ok: res.ok,
      status: res.status,
      contentType,
      finalUrl: res.url || url,
      body,
    };
  } finally {
    clearTimeout(timer);
  }
}

async function run() {
  if (!apiKey) {
    console.log("FIRECRAWL_API_KEY not set. Firecrawl comparisons will be skipped.");
  }

  for (const url of targets) {
    console.log(`\n=== ${url}`);
    let localStatus = "skipped";
    let localTitle = "";
    let localText = "";
    let localError: string | undefined;

    try {
      const res = await fetchHtml(url);
      if (!res.ok) {
        localStatus = `http ${res.status}`;
      } else if (!res.contentType.includes("text/html")) {
        localStatus = `non-html (${res.contentType})`;
      } else {
        const readable = await extractReadableContent({
          html: res.body,
          url: res.finalUrl,
          extractMode: "markdown",
        });
        if (readable?.text) {
          localStatus = "readability";
          localTitle = readable.title ?? "";
          localText = readable.text;
        } else {
          localStatus = "readability-empty";
        }
      }
    } catch (error) {
      localStatus = "error";
      localError = error instanceof Error ? error.message : String(error);
    }

    console.log(
      `local: ${localStatus} len=${localText.length} title=${truncate(localTitle, 80)}`
    );
    if (localError) console.log(`local error: ${localError}`);
    if (localText) console.log(`local sample: ${truncate(localText)}`);

    if (apiKey) {
      try {
        const firecrawl = await fetchFirecrawlContent({
          url,
          extractMode: "markdown",
          apiKey,
          baseUrl,
          onlyMainContent: true,
          maxAgeMs: 172_800_000,
          proxy: "auto",
          storeInCache: true,
          timeoutSeconds: 60,
        });
        console.log(
          `firecrawl: ok len=${firecrawl.text.length} title=${truncate(
            firecrawl.title ?? "",
            80,
          )} status=${firecrawl.status ?? "n/a"}`
        );
        if (firecrawl.warning) console.log(`firecrawl warning: ${firecrawl.warning}`);
        if (firecrawl.text) console.log(`firecrawl sample: ${truncate(firecrawl.text)}`);
      } catch (error) {
        const message = error instanceof Error ? error.message : String(error);
        console.log(`firecrawl: error ${message}`);
      }
    }
  }

  process.exit(0);
}

run().catch((error) => {
  console.error(error);
  process.exit(1);
});
60
scripts/readability-basic-compare.ts
Normal file
@@ -0,0 +1,60 @@
import { createWebFetchTool } from "../src/agents/tools/web-tools.js";

const DEFAULT_URLS = [
  "https://example.com/",
  "https://news.ycombinator.com/",
  "https://www.reddit.com/r/javascript/",
  "https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent",
  "https://httpbin.org/html",
];

const urls = process.argv.slice(2);
const targets = urls.length > 0 ? urls : DEFAULT_URLS;

async function runFetch(url: string, readability: boolean) {
  if (!readability) {
    throw new Error("Basic extraction removed. Set readability=true or enable Firecrawl.");
  }
  const tool = createWebFetchTool({
    config: {
      tools: {
        web: { fetch: { readability, cacheTtlMinutes: 0, firecrawl: { enabled: false } } },
      },
    },
    sandboxed: false,
  });
  if (!tool) throw new Error("web_fetch tool is disabled");
  const result = await tool.execute("test", { url, extractMode: "markdown" });
  return result.details as {
    text?: string;
    title?: string;
    extractor?: string;
    length?: number;
    truncated?: boolean;
  };
}

function truncate(value: string, max = 160): string {
  if (!value) return "";
  return value.length > max ? `${value.slice(0, max)}…` : value;
}

async function run() {
  for (const url of targets) {
    console.log(`\n=== ${url}`);
    const readable = await runFetch(url, true);

    console.log(
      `readability: ${readable.extractor ?? "unknown"} len=${readable.length ?? 0} title=${truncate(
        readable.title ?? "",
        80,
      )}`,
    );
    if (readable.text) console.log(`readability sample: ${truncate(readable.text)}`);
  }
}

run().catch((error) => {
  console.error(error);
  process.exit(1);
});
185
src/agents/tools/web-tools.fetch.test.ts
Normal file
@@ -0,0 +1,185 @@
import { afterEach, describe, expect, it, vi } from "vitest";

import { createWebFetchTool } from "./web-tools.js";

type MockResponse = {
  ok: boolean;
  status: number;
  url?: string;
  headers?: { get: (key: string) => string | null };
  text?: () => Promise<string>;
  json?: () => Promise<unknown>;
};

function makeHeaders(map: Record<string, string>): { get: (key: string) => string | null } {
  return {
    get: (key) => map[key.toLowerCase()] ?? null,
  };
}

function htmlResponse(html: string, url = "https://example.com/"): MockResponse {
  return {
    ok: true,
    status: 200,
    url,
    headers: makeHeaders({ "content-type": "text/html; charset=utf-8" }),
    text: async () => html,
  };
}

function firecrawlResponse(markdown: string, url = "https://example.com/"): MockResponse {
  return {
    ok: true,
    status: 200,
    json: async () => ({
      success: true,
      data: {
        markdown,
        metadata: { title: "Firecrawl Title", sourceURL: url, statusCode: 200 },
      },
    }),
  };
}

function firecrawlError(): MockResponse {
  return {
    ok: false,
    status: 403,
    json: async () => ({ success: false, error: "blocked" }),
  };
}

function requestUrl(input: RequestInfo): string {
  if (typeof input === "string") return input;
  if (input instanceof URL) return input.toString();
  if ("url" in input && typeof input.url === "string") return input.url;
  return "";
}

describe("web_fetch extraction fallbacks", () => {
  const priorFetch = global.fetch;

  afterEach(() => {
    // @ts-expect-error restore
    global.fetch = priorFetch;
    vi.restoreAllMocks();
  });

  it("falls back to firecrawl when readability returns no content", async () => {
    const mockFetch = vi.fn((input: RequestInfo) => {
      const url = requestUrl(input);
      if (url.includes("api.firecrawl.dev")) {
        return Promise.resolve(firecrawlResponse("firecrawl content")) as Promise<Response>;
      }
      return Promise.resolve(
        htmlResponse("<!doctype html><html><head></head><body></body></html>", url),
      ) as Promise<Response>;
    });
    // @ts-expect-error mock fetch
    global.fetch = mockFetch;

    const tool = createWebFetchTool({
      config: {
        tools: {
          web: {
            fetch: {
              cacheTtlMinutes: 0,
              firecrawl: { apiKey: "firecrawl-test" },
            },
          },
        },
      },
      sandboxed: false,
    });

    const result = await tool?.execute?.("call", { url: "https://example.com/empty" });
    const details = result?.details as { extractor?: string; text?: string };
    expect(details.extractor).toBe("firecrawl");
    expect(details.text).toContain("firecrawl content");
  });

  it("throws when readability is disabled and firecrawl is unavailable", async () => {
    const mockFetch = vi.fn((input: RequestInfo) =>
      Promise.resolve(htmlResponse("<html><body>hi</body></html>", requestUrl(input))),
    );
    // @ts-expect-error mock fetch
    global.fetch = mockFetch;

    const tool = createWebFetchTool({
      config: {
        tools: {
          web: {
            fetch: { readability: false, cacheTtlMinutes: 0, firecrawl: { enabled: false } },
          },
        },
      },
      sandboxed: false,
    });

    await expect(
      tool?.execute?.("call", { url: "https://example.com/readability-off" }),
    ).rejects.toThrow("Readability disabled");
  });

  it("throws when readability is empty and firecrawl fails", async () => {
    const mockFetch = vi.fn((input: RequestInfo) => {
      const url = requestUrl(input);
      if (url.includes("api.firecrawl.dev")) {
        return Promise.resolve(firecrawlError()) as Promise<Response>;
      }
      return Promise.resolve(
        htmlResponse("<!doctype html><html><head></head><body></body></html>", url),
      ) as Promise<Response>;
    });
    // @ts-expect-error mock fetch
    global.fetch = mockFetch;

    const tool = createWebFetchTool({
      config: {
        tools: {
          web: {
            fetch: { cacheTtlMinutes: 0, firecrawl: { apiKey: "firecrawl-test" } },
          },
        },
      },
      sandboxed: false,
    });

    await expect(
      tool?.execute?.("call", { url: "https://example.com/readability-empty" }),
    ).rejects.toThrow("Readability and Firecrawl returned no content");
  });

  it("uses firecrawl when direct fetch fails", async () => {
    const mockFetch = vi.fn((input: RequestInfo) => {
      const url = requestUrl(input);
      if (url.includes("api.firecrawl.dev")) {
        return Promise.resolve(firecrawlResponse("firecrawl fallback", url)) as Promise<Response>;
      }
      return Promise.resolve({
        ok: false,
        status: 403,
        headers: makeHeaders({ "content-type": "text/html" }),
        text: async () => "blocked",
      } as Response);
    });
    // @ts-expect-error mock fetch
    global.fetch = mockFetch;

    const tool = createWebFetchTool({
      config: {
        tools: {
          web: {
            fetch: { cacheTtlMinutes: 0, firecrawl: { apiKey: "firecrawl-test" } },
          },
        },
      },
      sandboxed: false,
    });

    const result = await tool?.execute?.("call", { url: "https://example.com/blocked" });
    const details = result?.details as { extractor?: string; text?: string };
    expect(details.extractor).toBe("firecrawl");
    expect(details.text).toContain("firecrawl fallback");
  });
});
@@ -1,7 +1,6 @@
import { Type } from "@sinclair/typebox";

import type { ClawdbotConfig } from "../../config/config.js";
import { VERSION } from "../../version.js";
import { stringEnum } from "../schema/typebox.js";
import type { AnyAgentTool } from "./common.js";
import { jsonResult, readNumberParam, readStringParam } from "./common.js";
@@ -15,6 +14,10 @@ const DEFAULT_FETCH_MAX_CHARS = 50_000;
const DEFAULT_TIMEOUT_SECONDS = 30;
const DEFAULT_CACHE_TTL_MINUTES = 15;
const DEFAULT_CACHE_MAX_ENTRIES = 100;
const DEFAULT_FIRECRAWL_BASE_URL = "https://api.firecrawl.dev";
const DEFAULT_FIRECRAWL_MAX_AGE_MS = 172_800_000;
const DEFAULT_FETCH_USER_AGENT =
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36";

const BRAVE_SEARCH_ENDPOINT = "https://api.search.brave.com/res/v1/web/search";
@@ -30,6 +33,15 @@ type WebFetchConfig = NonNullable<ClawdbotConfig["tools"]>["web"] extends infer
      : undefined
  : undefined;

type FirecrawlFetchConfig = {
  enabled?: boolean;
  apiKey?: string;
  baseUrl?: string;
  onlyMainContent?: boolean;
  maxAgeMs?: number;
  timeoutSeconds?: number;
} | undefined;

type CacheEntry<T> = {
  value: T;
  expiresAt: number;
@@ -123,6 +135,13 @@ function resolveFetchReadabilityEnabled(fetch?: WebFetchConfig): boolean {
  return true;
}

function resolveFirecrawlConfig(fetch?: WebFetchConfig): FirecrawlFetchConfig {
  if (!fetch || typeof fetch !== "object") return undefined;
  const firecrawl = "firecrawl" in fetch ? fetch.firecrawl : undefined;
  if (!firecrawl || typeof firecrawl !== "object") return undefined;
  return firecrawl as FirecrawlFetchConfig;
}

function resolveSearchApiKey(search?: WebSearchConfig): string | undefined {
  const fromConfig =
    search && "apiKey" in search && typeof search.apiKey === "string" ? search.apiKey.trim() : "";
@@ -130,6 +149,52 @@ function resolveSearchApiKey(search?: WebSearchConfig): string | undefined {
  return fromConfig || fromEnv || undefined;
}

function resolveFirecrawlApiKey(firecrawl?: FirecrawlFetchConfig): string | undefined {
  const fromConfig =
    firecrawl && "apiKey" in firecrawl && typeof firecrawl.apiKey === "string"
      ? firecrawl.apiKey.trim()
      : "";
  const fromEnv = (process.env.FIRECRAWL_API_KEY ?? "").trim();
  return fromConfig || fromEnv || undefined;
}

function resolveFirecrawlEnabled(params: {
  firecrawl?: FirecrawlFetchConfig;
  apiKey?: string;
}): boolean {
  if (typeof params.firecrawl?.enabled === "boolean") return params.firecrawl.enabled;
  return Boolean(params.apiKey);
}

function resolveFirecrawlBaseUrl(firecrawl?: FirecrawlFetchConfig): string {
  const raw =
    firecrawl && "baseUrl" in firecrawl && typeof firecrawl.baseUrl === "string"
      ? firecrawl.baseUrl.trim()
      : "";
  return raw || DEFAULT_FIRECRAWL_BASE_URL;
}

function resolveFirecrawlOnlyMainContent(firecrawl?: FirecrawlFetchConfig): boolean {
  if (typeof firecrawl?.onlyMainContent === "boolean") return firecrawl.onlyMainContent;
  return true;
}

function resolveFirecrawlMaxAgeMs(firecrawl?: FirecrawlFetchConfig): number | undefined {
  const raw =
    firecrawl && "maxAgeMs" in firecrawl && typeof firecrawl.maxAgeMs === "number"
      ? firecrawl.maxAgeMs
      : undefined;
  if (typeof raw !== "number" || !Number.isFinite(raw)) return undefined;
  const parsed = Math.max(0, Math.floor(raw));
  return parsed > 0 ? parsed : undefined;
}

function resolveFirecrawlMaxAgeMsOrDefault(firecrawl?: FirecrawlFetchConfig): number {
  const resolved = resolveFirecrawlMaxAgeMs(firecrawl);
  if (typeof resolved === "number") return resolved;
  return DEFAULT_FIRECRAWL_MAX_AGE_MS;
}

function missingSearchKeyPayload() {
  return {
    error: "missing_brave_api_key",
@@ -278,9 +343,18 @@ function htmlToMarkdown(html: string): { text: string; title?: string } {
  return { text, title };
}

function htmlToText(html: string): { text: string; title?: string } {
  const { text, title } = htmlToMarkdown(html);
  return { text, title };
function markdownToText(markdown: string): string {
  let text = markdown;
  text = text.replace(/!\[[^\]]*]\([^)]+\)/g, "");
  text = text.replace(/\[([^\]]+)]\([^)]+\)/g, "$1");
  text = text.replace(/```[\s\S]*?```/g, (block) =>
    block.replace(/```[^\n]*\n?/g, "").replace(/```/g, ""),
  );
  text = text.replace(/`([^`]+)`/g, "$1");
  text = text.replace(/^#{1,6}\s+/gm, "");
  text = text.replace(/^\s*[-*+]\s+/gm, "");
  text = text.replace(/^\s*\d+\.\s+/gm, "");
  return normalizeWhitespace(text);
}

function truncateText(value: string, maxChars: number): { text: string; truncated: boolean } {
@@ -336,6 +410,81 @@ export async function extractReadableContent(params: {
  }
}

export async function fetchFirecrawlContent(params: {
  url: string;
  extractMode: (typeof EXTRACT_MODES)[number];
  apiKey: string;
  baseUrl: string;
  onlyMainContent: boolean;
  maxAgeMs: number;
  proxy: "auto" | "basic" | "stealth";
  storeInCache: boolean;
  timeoutSeconds: number;
}): Promise<{
  text: string;
  title?: string;
  finalUrl?: string;
  status?: number;
  warning?: string;
}> {
  const endpoint = resolveFirecrawlEndpoint(params.baseUrl);
  const body: Record<string, unknown> = {
    url: params.url,
    formats: ["markdown"],
    onlyMainContent: params.onlyMainContent,
    timeout: params.timeoutSeconds * 1000,
    maxAge: params.maxAgeMs,
    proxy: params.proxy,
    storeInCache: params.storeInCache,
  };

  const res = await fetch(endpoint, {
    method: "POST",
    headers: {
      Authorization: `Bearer ${params.apiKey}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify(body),
    signal: withTimeout(undefined, params.timeoutSeconds * 1000),
  });

  const payload = (await res.json()) as {
    success?: boolean;
    data?: {
      markdown?: string;
      content?: string;
      metadata?: {
        title?: string;
        sourceURL?: string;
        statusCode?: number;
      };
    };
    warning?: string;
    error?: string;
  };

  if (!res.ok || payload?.success === false) {
    const detail = payload?.error || res.statusText;
    throw new Error(`Firecrawl fetch failed (${res.status}): ${detail}`.trim());
  }

  const data = payload?.data ?? {};
  const rawText =
    typeof data.markdown === "string"
      ? data.markdown
      : typeof data.content === "string"
        ? data.content
        : "";
  const text = params.extractMode === "text" ? markdownToText(rawText) : rawText;
  return {
    text,
    title: data.metadata?.title,
    finalUrl: data.metadata?.sourceURL,
    status: data.metadata?.statusCode,
    warning: payload?.warning,
  };
}

async function runWebSearch(params: {
  query: string;
  count: number;
@@ -414,6 +563,14 @@ async function runWebFetch(params: {
  cacheTtlMs: number;
  userAgent: string;
  readabilityEnabled: boolean;
  firecrawlEnabled: boolean;
  firecrawlApiKey?: string;
  firecrawlBaseUrl: string;
  firecrawlOnlyMainContent: boolean;
  firecrawlMaxAgeMs: number;
  firecrawlProxy: "auto" | "basic" | "stealth";
  firecrawlStoreInCache: boolean;
  firecrawlTimeoutSeconds: number;
}): Promise<Record<string, unknown>> {
  const cacheKey = normalizeCacheKey(
    `fetch:${params.url}:${params.extractMode}:${params.maxChars}`,
@@ -432,16 +589,84 @@ async function runWebFetch(params: {
  }

  const start = Date.now();
  const res = await fetch(parsedUrl.toString(), {
    method: "GET",
    headers: {
      Accept: "*/*",
      "User-Agent": params.userAgent,
    },
    signal: withTimeout(undefined, params.timeoutSeconds * 1000),
  });
  let res: Response;
  try {
    res = await fetch(parsedUrl.toString(), {
      method: "GET",
      headers: {
        Accept: "*/*",
        "User-Agent": params.userAgent,
        "Accept-Language": "en-US,en;q=0.9",
      },
      signal: withTimeout(undefined, params.timeoutSeconds * 1000),
    });
  } catch (error) {
    if (params.firecrawlEnabled && params.firecrawlApiKey) {
      const firecrawl = await fetchFirecrawlContent({
        url: params.url,
        extractMode: params.extractMode,
        apiKey: params.firecrawlApiKey,
        baseUrl: params.firecrawlBaseUrl,
        onlyMainContent: params.firecrawlOnlyMainContent,
        maxAgeMs: params.firecrawlMaxAgeMs,
        proxy: params.firecrawlProxy,
        storeInCache: params.firecrawlStoreInCache,
        timeoutSeconds: params.firecrawlTimeoutSeconds,
      });
      const truncated = truncateText(firecrawl.text, params.maxChars);
      const payload = {
        url: params.url,
        finalUrl: firecrawl.finalUrl || params.url,
        status: firecrawl.status ?? 200,
        contentType: "text/markdown",
        title: firecrawl.title,
        extractMode: params.extractMode,
        extractor: "firecrawl",
        truncated: truncated.truncated,
        length: truncated.text.length,
        fetchedAt: new Date().toISOString(),
        tookMs: Date.now() - start,
        text: truncated.text,
        warning: firecrawl.warning,
      };
      writeCache(FETCH_CACHE, cacheKey, payload, params.cacheTtlMs);
      return payload;
    }
    throw error;
  }

  if (!res.ok) {
    if (params.firecrawlEnabled && params.firecrawlApiKey) {
      const firecrawl = await fetchFirecrawlContent({
        url: params.url,
        extractMode: params.extractMode,
        apiKey: params.firecrawlApiKey,
        baseUrl: params.firecrawlBaseUrl,
        onlyMainContent: params.firecrawlOnlyMainContent,
        maxAgeMs: params.firecrawlMaxAgeMs,
        proxy: params.firecrawlProxy,
        storeInCache: params.firecrawlStoreInCache,
        timeoutSeconds: params.firecrawlTimeoutSeconds,
      });
      const truncated = truncateText(firecrawl.text, params.maxChars);
      const payload = {
        url: params.url,
        finalUrl: firecrawl.finalUrl || params.url,
        status: firecrawl.status ?? res.status,
        contentType: "text/markdown",
        title: firecrawl.title,
        extractMode: params.extractMode,
        extractor: "firecrawl",
        truncated: truncated.truncated,
        length: truncated.text.length,
        fetchedAt: new Date().toISOString(),
        tookMs: Date.now() - start,
        text: truncated.text,
        warning: firecrawl.warning,
      };
      writeCache(FETCH_CACHE, cacheKey, payload, params.cacheTtlMs);
      return payload;
    }
    const detail = await readResponseText(res);
    throw new Error(`Web fetch failed (${res.status}): ${detail || res.statusText}`);
  }
@@ -450,6 +675,7 @@ async function runWebFetch(params: {
  const body = await readResponseText(res);

  let title: string | undefined;
  let extractor = "raw";
  let text = body;
  if (contentType.includes("text/html")) {
    if (params.readabilityEnabled) {
@@ -461,21 +687,29 @@ async function runWebFetch(params: {
      if (readable?.text) {
        text = readable.text;
        title = readable.title;
        extractor = "readability";
      } else {
        const firecrawl = await tryFirecrawlFallback(params);
        if (firecrawl) {
          text = firecrawl.text;
          title = firecrawl.title;
          extractor = "firecrawl";
        } else {
          throw new Error(
            "Web fetch extraction failed: Readability and Firecrawl returned no content.",
          );
        }
      }
    } else {
      const firecrawl = await tryFirecrawlFallback(params);
      if (firecrawl) {
        text = firecrawl.text;
        title = firecrawl.title;
        extractor = "firecrawl";
      } else {
        throw new Error(
          "Web fetch extraction failed: Readability disabled and Firecrawl unavailable.",
        );
      }
    }
  } else if (contentType.includes("application/json")) {
    try {
      text = JSON.stringify(JSON.parse(body), null, 2);
      extractor = "json";
    } catch {
      text = body;
      extractor = "raw";
    }
  }

@@ -487,6 +721,7 @@ async function runWebFetch(params: {
    contentType,
    title,
    extractMode: params.extractMode,
    extractor,
    truncated: truncated.truncated,
    length: truncated.text.length,
    fetchedAt: new Date().toISOString(),
@@ -497,6 +732,37 @@ async function runWebFetch(params: {
  return payload;
}

async function tryFirecrawlFallback(params: {
  url: string;
  extractMode: (typeof EXTRACT_MODES)[number];
  firecrawlEnabled: boolean;
  firecrawlApiKey?: string;
  firecrawlBaseUrl: string;
  firecrawlOnlyMainContent: boolean;
  firecrawlMaxAgeMs: number;
  firecrawlProxy: "auto" | "basic" | "stealth";
  firecrawlStoreInCache: boolean;
  firecrawlTimeoutSeconds: number;
}): Promise<{ text: string; title?: string } | null> {
  if (!params.firecrawlEnabled || !params.firecrawlApiKey) return null;
  try {
    const firecrawl = await fetchFirecrawlContent({
      url: params.url,
      extractMode: params.extractMode,
      apiKey: params.firecrawlApiKey,
      baseUrl: params.firecrawlBaseUrl,
      onlyMainContent: params.firecrawlOnlyMainContent,
      maxAgeMs: params.firecrawlMaxAgeMs,
      proxy: params.firecrawlProxy,
      storeInCache: params.firecrawlStoreInCache,
      timeoutSeconds: params.firecrawlTimeoutSeconds,
    });
    return { text: firecrawl.text, title: firecrawl.title };
  } catch {
    return null;
  }
}

export function createWebSearchTool(options?: {
  config?: ClawdbotConfig;
  sandboxed?: boolean;
@@ -537,6 +803,21 @@ export function createWebSearchTool(options?: {
  };
}

function resolveFirecrawlEndpoint(baseUrl: string): string {
  const trimmed = baseUrl.trim();
  if (!trimmed) return `${DEFAULT_FIRECRAWL_BASE_URL}/v2/scrape`;
  try {
    const url = new URL(trimmed);
    if (url.pathname && url.pathname !== "/") {
      return url.toString();
    }
    url.pathname = "/v2/scrape";
    return url.toString();
  } catch {
    return `${DEFAULT_FIRECRAWL_BASE_URL}/v2/scrape`;
  }
}

export function createWebFetchTool(options?: {
  config?: ClawdbotConfig;
  sandboxed?: boolean;
@@ -544,9 +825,19 @@ export function createWebFetchTool(options?: {
  const fetch = resolveFetchConfig(options?.config);
  if (!resolveFetchEnabled({ fetch, sandboxed: options?.sandboxed })) return null;
  const readabilityEnabled = resolveFetchReadabilityEnabled(fetch);
  const firecrawl = resolveFirecrawlConfig(fetch);
  const firecrawlApiKey = resolveFirecrawlApiKey(firecrawl);
  const firecrawlEnabled = resolveFirecrawlEnabled({ firecrawl, apiKey: firecrawlApiKey });
  const firecrawlBaseUrl = resolveFirecrawlBaseUrl(firecrawl);
  const firecrawlOnlyMainContent = resolveFirecrawlOnlyMainContent(firecrawl);
  const firecrawlMaxAgeMs = resolveFirecrawlMaxAgeMsOrDefault(firecrawl);
  const firecrawlTimeoutSeconds = resolveTimeoutSeconds(
    firecrawl?.timeoutSeconds ?? fetch?.timeoutSeconds,
    DEFAULT_TIMEOUT_SECONDS,
  );
  const userAgent =
    (fetch && "userAgent" in fetch && typeof fetch.userAgent === "string" && fetch.userAgent) ||
    DEFAULT_FETCH_USER_AGENT;
  return {
    label: "Web Fetch",
    name: "web_fetch",
@@ -566,6 +857,14 @@ export function createWebFetchTool(options?: {
        cacheTtlMs: resolveCacheTtlMs(fetch?.cacheTtlMinutes, DEFAULT_CACHE_TTL_MINUTES),
        userAgent,
        readabilityEnabled,
        firecrawlEnabled,
        firecrawlApiKey,
        firecrawlBaseUrl,
        firecrawlOnlyMainContent,
        firecrawlMaxAgeMs,
        firecrawlProxy: "auto",
        firecrawlStoreInCache: true,
        firecrawlTimeoutSeconds,
      });
      return jsonResult(result);
    },

@@ -264,6 +264,17 @@ const FIELD_HELP: Record<string, string> = {
  "tools.web.fetch.userAgent": "Override User-Agent header for web_fetch requests.",
  "tools.web.fetch.readability":
    "Use Readability to extract main content from HTML (falls back to basic HTML cleanup).",
  "tools.web.fetch.firecrawl.enabled": "Enable Firecrawl fallback for web_fetch (if configured).",
  "tools.web.fetch.firecrawl.apiKey":
    "Firecrawl API key (fallback: FIRECRAWL_API_KEY env var).",
  "tools.web.fetch.firecrawl.baseUrl":
    "Firecrawl base URL (e.g. https://api.firecrawl.dev or custom endpoint).",
  "tools.web.fetch.firecrawl.onlyMainContent":
    "When true, Firecrawl returns only the main content (default: true).",
  "tools.web.fetch.firecrawl.maxAgeMs":
    "Firecrawl maxAge (ms) for cached results when supported by the API.",
  "tools.web.fetch.firecrawl.timeoutSeconds":
    "Timeout in seconds for Firecrawl requests.",
  "channels.slack.allowBots":
    "Allow bot-authored messages to trigger Slack replies (default: false).",
  "channels.slack.thread.historyScope":

@@ -111,6 +111,20 @@ export type ToolsConfig = {
      userAgent?: string;
      /** Use Readability to extract main content (default: true). */
      readability?: boolean;
      firecrawl?: {
        /** Enable Firecrawl fallback (default: true when apiKey is set). */
        enabled?: boolean;
        /** Firecrawl API key (optional; defaults to FIRECRAWL_API_KEY env var). */
        apiKey?: string;
        /** Firecrawl base URL (default: https://api.firecrawl.dev). */
        baseUrl?: string;
        /** Whether to keep only main content (default: true). */
        onlyMainContent?: boolean;
        /** Max age (ms) for cached Firecrawl content. */
        maxAgeMs?: number;
        /** Timeout in seconds for Firecrawl requests. */
        timeoutSeconds?: number;
      };
    };
  };
  audio?: {

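For reference, a minimal config fragment that would enable the Firecrawl fallback, sketched from the `ToolsConfig` shape and `FIELD_HELP` keys above. The top-level file layout and the placeholder API key are assumptions, not taken from this commit:

```jsonc
{
  "tools": {
    "web": {
      "fetch": {
        "readability": true,
        "firecrawl": {
          "enabled": true,
          // apiKey is optional; the FIRECRAWL_API_KEY env var is the fallback.
          "apiKey": "fc-placeholder",
          "baseUrl": "https://api.firecrawl.dev",
          "onlyMainContent": true,
          "maxAgeMs": 3600000,
          "timeoutSeconds": 30
        }
      }
    }
  }
}
```

With this in place, `web_fetch` tries the direct fetch (with Readability when enabled) first and only falls back to Firecrawl when extraction fails or the HTTP response is not OK.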