132 lines
3.8 KiB
TypeScript
132 lines
3.8 KiB
TypeScript
import { extractReadableContent, fetchFirecrawlContent } from "../src/agents/tools/web-tools.js";
|
|
|
|
/** Pages probed when no URLs are passed on the command line. */
const DEFAULT_URLS: string[] = [
  "https://en.wikipedia.org/wiki/Web_scraping",
  "https://news.ycombinator.com/",
  "https://www.apple.com/iphone/",
  "https://www.nytimes.com/",
  "https://www.reddit.com/r/javascript/",
];

// Everything after the script path is treated as a URL to test;
// with no arguments the built-in list above is used instead.
const urls = process.argv.slice(2);
const targets = urls.length === 0 ? DEFAULT_URLS : urls;

// Firecrawl settings are read from the environment. The key may be undefined;
// the base URL falls back to the public endpoint when the variable is unset.
const { FIRECRAWL_API_KEY: apiKey } = process.env;
const baseUrl = process.env.FIRECRAWL_BASE_URL ?? "https://api.firecrawl.dev";

/** User-Agent header sent with the local (non-Firecrawl) fetch. */
const userAgent =
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36";

/** Abort deadline for the local fetch, in milliseconds (30 seconds). */
const timeoutMs = 30 * 1_000;
|
|
|
|
/**
 * Shortens a string for single-line log output.
 *
 * Strings longer than `max` UTF-16 code units are cut and suffixed with "…".
 * If the cut would land between the two halves of a surrogate pair (e.g. in
 * the middle of an emoji), the cut moves back by one code unit so the result
 * never ends in a lone high surrogate.
 *
 * @param value Text to shorten; an empty value yields "".
 * @param max   Maximum number of code units kept before the ellipsis.
 * @returns `value` unchanged when it fits, otherwise the shortened text plus "…".
 */
function truncate(value: string, max = 180): string {
  if (!value) return "";
  if (value.length <= max) return value;

  let end = max;
  const lastKept = value.charCodeAt(end - 1);
  const firstDropped = value.charCodeAt(end);
  const splitsSurrogatePair =
    lastKept >= 0xd800 &&
    lastKept <= 0xdbff &&
    firstDropped >= 0xdc00 &&
    firstDropped <= 0xdfff;
  // Drop the dangling high surrogate instead of emitting half a character.
  if (splitsSurrogatePair) end -= 1;

  return `${value.slice(0, end)}…`;
}
|
|
|
|
/**
 * Performs a plain GET for `url` with the script's User-Agent and reads the
 * whole response body as text.
 *
 * An AbortController aborts the request once `timeoutMs` has elapsed; the
 * timer stays armed until the body has been read and is always cleared
 * afterwards, whether the request succeeded or threw.
 *
 * @param url Address to fetch.
 * @returns Response summary: `ok`/`status` from the response, the
 *   `content-type` header (or "application/octet-stream" when absent), the
 *   response URL (falling back to `url` when empty), and the body text.
 */
async function fetchHtml(url: string): Promise<{
  ok: boolean;
  status: number;
  contentType: string;
  finalUrl: string;
  body: string;
}> {
  const aborter = new AbortController();
  const deadline = setTimeout(() => {
    aborter.abort();
  }, timeoutMs);

  try {
    const response = await fetch(url, {
      method: "GET",
      headers: { Accept: "*/*", "User-Agent": userAgent },
      signal: aborter.signal,
    });
    const declaredType = response.headers.get("content-type");
    const text = await response.text();

    return {
      ok: response.ok,
      status: response.status,
      contentType: declaredType ?? "application/octet-stream",
      finalUrl: response.url || url,
      body: text,
    };
  } finally {
    clearTimeout(deadline);
  }
}
|
|
|
|
/**
 * Walks every target URL and prints a side-by-side report:
 *
 *  1. "local"     – fetch the page directly and run it through
 *                   `extractReadableContent` in markdown mode.
 *  2. "firecrawl" – request the same URL via `fetchFirecrawlContent`
 *                   (only when FIRECRAWL_API_KEY is set).
 *
 * Failures on either side are caught and logged per URL, so one bad page does
 * not stop the loop. The process exits with code 0 once all targets are done.
 */
async function run(): Promise<void> {
  if (!apiKey) {
    console.log("FIRECRAWL_API_KEY not set. Firecrawl comparisons will be skipped.");
  }

  for (const url of targets) {
    console.log(`\n=== ${url}`);

    // Outcome of the local pass; stays at these defaults unless extraction succeeds.
    const local: { status: string; title: string; text: string; error?: string } = {
      status: "skipped",
      title: "",
      text: "",
    };

    try {
      const page = await fetchHtml(url);
      const looksLikeHtml = page.contentType.includes("text/html");

      if (page.ok && looksLikeHtml) {
        const readable = await extractReadableContent({
          html: page.body,
          url: page.finalUrl,
          extractMode: "markdown",
        });
        if (!readable?.text) {
          local.status = "readability-empty";
        } else {
          local.status = "readability";
          local.title = readable.title ?? "";
          local.text = readable.text;
        }
      } else if (page.ok) {
        local.status = `non-html (${page.contentType})`;
      } else {
        local.status = `http ${page.status}`;
      }
    } catch (failure) {
      local.status = "error";
      local.error = failure instanceof Error ? failure.message : String(failure);
    }

    const localTitlePreview = truncate(local.title, 80);
    console.log(`local: ${local.status} len=${local.text.length} title=${localTitlePreview}`);
    if (local.error) {
      console.log(`local error: ${local.error}`);
    }
    if (local.text) {
      console.log(`local sample: ${truncate(local.text)}`);
    }

    // Without an API key there is nothing to compare against for this URL.
    if (!apiKey) {
      continue;
    }

    try {
      const remote = await fetchFirecrawlContent({
        url,
        extractMode: "markdown",
        apiKey,
        baseUrl,
        onlyMainContent: true,
        maxAgeMs: 172_800_000, // 48 hours expressed in milliseconds
        proxy: "auto",
        storeInCache: true,
        timeoutSeconds: 60,
      });

      const remoteTitlePreview = truncate(remote.title ?? "", 80);
      const remoteStatus = remote.status ?? "n/a";
      console.log(
        `firecrawl: ok len=${remote.text.length} title=${remoteTitlePreview} status=${remoteStatus}`
      );
      if (remote.warning) {
        console.log(`firecrawl warning: ${remote.warning}`);
      }
      if (remote.text) {
        console.log(`firecrawl sample: ${truncate(remote.text)}`);
      }
    } catch (failure) {
      const reason = failure instanceof Error ? failure.message : String(failure);
      console.log(`firecrawl: error ${reason}`);
    }
  }

  process.exit(0);
}
|
|
|
|
// Script entry point: start the comparison and, if run() rejects,
// print the failure to stderr and exit with a non-zero status code.
run().catch((failure: unknown) => {
  console.error(failure);
  process.exit(1);
});
|