feat: enhance web_fetch fallbacks
This commit is contained in:
131
scripts/firecrawl-compare.ts
Normal file
131
scripts/firecrawl-compare.ts
Normal file
@@ -0,0 +1,131 @@
|
||||
import { extractReadableContent, fetchFirecrawlContent } from "../src/agents/tools/web-tools.js";
|
||||
|
||||
/**
 * URLs compared when no targets are passed on the command line.
 */
const DEFAULT_URLS = [
  "https://en.wikipedia.org/wiki/Web_scraping",
  "https://news.ycombinator.com/",
  "https://www.apple.com/iphone/",
  "https://www.nytimes.com/",
  "https://www.reddit.com/r/javascript/",
];
|
||||
|
||||
// Everything after `node <script>` is treated as a URL to compare.
const [, , ...urls] = process.argv;
// Fall back to the built-in list when no URLs were given on the command line.
const targets = urls.length === 0 ? DEFAULT_URLS : urls;

// Firecrawl settings come from the environment; the key may be undefined.
const apiKey = process.env.FIRECRAWL_API_KEY;
const baseUrl = process.env.FIRECRAWL_BASE_URL ?? "https://api.firecrawl.dev";

// User-Agent header sent with the local fetch.
const userAgent =
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36";

// Abort the local fetch after 30 seconds.
const timeoutMs = 30 * 1000;
|
||||
|
||||
/**
 * Shortens `value` for log output.
 *
 * Strings no longer than `max` UTF-16 code units are returned unchanged;
 * longer ones are cut at `max` code units and suffixed with an ellipsis.
 * If the cut would land between the two halves of a surrogate pair, the
 * dangling high surrogate is dropped too so the result stays well-formed.
 *
 * @param value Text to shorten; an empty value yields "".
 * @param max Maximum number of code units kept before the ellipsis.
 * @returns The original string or its truncated form ending in "…".
 */
function truncate(value: string, max = 180): string {
  if (!value) return "";
  if (value.length <= max) return value;

  let end = max;
  if (end > 0) {
    const lastKept = value.charCodeAt(end - 1);
    const firstDropped = value.charCodeAt(end);
    const splitsPair =
      lastKept >= 0xd800 &&
      lastKept <= 0xdbff &&
      firstDropped >= 0xdc00 &&
      firstDropped <= 0xdfff;
    // Avoid emitting a lone high surrogate right before the ellipsis.
    if (splitsPair) end -= 1;
  }
  return `${value.slice(0, end)}…`;
}
|
||||
|
||||
/**
 * Performs a plain GET of `url` with the script's User-Agent and reads the
 * whole response body as text. The request is aborted once `timeoutMs`
 * elapses; the timer is always cleared, whether the fetch succeeds or throws.
 *
 * @param url Address to download.
 * @returns Response metadata (ok flag, status, content type, final URL) plus the body text.
 *          The content type falls back to "application/octet-stream" when the header
 *          is missing, and the final URL falls back to `url` when the response has none.
 */
async function fetchHtml(url: string): Promise<{
  ok: boolean;
  status: number;
  contentType: string;
  finalUrl: string;
  body: string;
}> {
  const abort = new AbortController();
  const deadline = setTimeout(() => {
    abort.abort();
  }, timeoutMs);

  try {
    const response = await fetch(url, {
      method: "GET",
      headers: { Accept: "*/*", "User-Agent": userAgent },
      signal: abort.signal,
    });
    const declaredType = response.headers.get("content-type");
    const text = await response.text();

    return {
      ok: response.ok,
      status: response.status,
      contentType: declaredType ?? "application/octet-stream",
      finalUrl: response.url || url,
      body: text,
    };
  } finally {
    clearTimeout(deadline);
  }
}
|
||||
|
||||
/**
 * Compares local extraction against Firecrawl for every target URL.
 *
 * For each URL it prints the outcome of the local fetch plus readability
 * extraction, then (only when an API key is configured) the outcome of the
 * Firecrawl request. Failures on either side are caught and logged per URL,
 * so one bad URL does not stop the loop. Exits the process with code 0 once
 * all targets have been handled.
 */
async function run() {
  if (!apiKey) {
    console.log("FIRECRAWL_API_KEY not set. Firecrawl comparisons will be skipped.");
  }

  for (const target of targets) {
    console.log(`\n=== ${target}`);

    // Outcome of the local path; stays "skipped" until something below sets it.
    const local: { status: string; title: string; text: string; error?: string } = {
      status: "skipped",
      title: "",
      text: "",
    };

    try {
      const page = await fetchHtml(target);
      if (!page.ok) {
        local.status = `http ${page.status}`;
      } else if (!page.contentType.includes("text/html")) {
        local.status = `non-html (${page.contentType})`;
      } else {
        const article = await extractReadableContent({
          html: page.body,
          url: page.finalUrl,
          extractMode: "markdown",
        });
        if (!article?.text) {
          local.status = "readability-empty";
        } else {
          local.status = "readability";
          local.title = article.title ?? "";
          local.text = article.text;
        }
      }
    } catch (cause) {
      local.status = "error";
      local.error = cause instanceof Error ? cause.message : String(cause);
    }

    const localTitle = truncate(local.title, 80);
    console.log(`local: ${local.status} len=${local.text.length} title=${localTitle}`);
    if (local.error) console.log(`local error: ${local.error}`);
    if (local.text) console.log(`local sample: ${truncate(local.text)}`);

    // Without an API key there is nothing to compare against for this URL.
    if (!apiKey) continue;

    try {
      const remote = await fetchFirecrawlContent({
        url: target,
        extractMode: "markdown",
        apiKey,
        baseUrl,
        onlyMainContent: true,
        maxAgeMs: 172_800_000, // 48 hours in milliseconds
        proxy: "auto",
        storeInCache: true,
        timeoutSeconds: 60,
      });
      const remoteLength = remote.text.length;
      const remoteTitle = truncate(remote.title ?? "", 80);
      const remoteStatus = remote.status ?? "n/a";
      console.log(`firecrawl: ok len=${remoteLength} title=${remoteTitle} status=${remoteStatus}`);
      if (remote.warning) console.log(`firecrawl warning: ${remote.warning}`);
      if (remote.text) console.log(`firecrawl sample: ${truncate(remote.text)}`);
    } catch (cause) {
      const reason = cause instanceof Error ? cause.message : String(cause);
      console.log(`firecrawl: error ${reason}`);
    }
  }

  process.exit(0);
}
|
||||
|
||||
// Script entry point: any rejection escaping run() is printed and turned
// into a non-zero exit code.
const exitOnFailure = (failure: unknown): never => {
  console.error(failure);
  process.exit(1);
};

run().catch(exitOnFailure);
|
||||
Reference in New Issue
Block a user