feat: improve web_fetch readability extraction

This commit is contained in:
Peter Steinberger
2026-01-16 23:17:55 +00:00
parent 9aad6dfe1b
commit 37fa4f7eef
9 changed files with 242 additions and 8 deletions

View File

@@ -0,0 +1,49 @@
import { describe, expect, it } from "vitest";
import { extractReadableContent } from "./web-tools.js";
// Fixture page shared by the tests in this file: a <nav> with two links, an
// <article> inside <main> holding the heading and two paragraphs, and a
// <footer>. The <title> and the <h1> both read "Example Article".
const SAMPLE_HTML = `<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8" />
<title>Example Article</title>
</head>
<body>
<nav>
<ul>
<li><a href="/home">Home</a></li>
<li><a href="/about">About</a></li>
</ul>
</nav>
<main>
<article>
<h1>Example Article</h1>
<p>Main content starts here with enough words to satisfy readability.</p>
<p>Second paragraph for a bit more signal.</p>
</article>
</main>
<footer>Footer text</footer>
</body>
</html>`;
describe("web fetch readability", () => {
  // Shared driver: every case feeds the same fixture and URL and varies only
  // the extraction mode.
  const extractSample = (extractMode: "text" | "markdown") =>
    extractReadableContent({
      html: SAMPLE_HTML,
      url: "https://example.com/article",
      extractMode,
    });

  it("extracts readable text", async () => {
    const extracted = await extractSample("text");

    // The article paragraph must be present in the extracted text.
    expect(extracted?.text).toContain("Main content starts here");
    expect(extracted?.title).toBe("Example Article");
  });

  it("extracts readable markdown", async () => {
    const extracted = await extractSample("markdown");

    // Same expectations as text mode: paragraph present, title resolved.
    expect(extracted?.text).toContain("Main content starts here");
    expect(extracted?.title).toBe("Example Article");
  });
});

View File

@@ -118,6 +118,11 @@ function resolveFetchEnabled(params: { fetch?: WebFetchConfig; sandboxed?: boole
return true;
}
/**
 * Resolves whether Readability extraction is enabled for web fetch.
 *
 * @param fetch - Optional web fetch configuration.
 * @returns The configured `readability` value when it is a boolean; `true`
 *   when the config is absent or the value is anything other than a boolean.
 */
function resolveFetchReadabilityEnabled(fetch?: WebFetchConfig): boolean {
  const configured = fetch?.readability;
  // Strict typeof check: only a real boolean overrides the default.
  return typeof configured === "boolean" ? configured : true;
}
function resolveSearchApiKey(search?: WebSearchConfig): string | undefined {
const fromConfig =
search && "apiKey" in search && typeof search.apiKey === "string" ? search.apiKey.trim() : "";
@@ -300,6 +305,37 @@ async function readResponseText(res: Response): Promise<string> {
}
}
/**
 * Extracts the main content of an HTML document using Readability.
 *
 * `@mozilla/readability` and `linkedom` are loaded with dynamic imports inside
 * the function, the HTML is parsed into a document, and Readability is run
 * over it with `charThreshold: 0`.
 *
 * @param params.html - Raw HTML to extract from.
 * @param params.url - URL assigned (best-effort) as the document's `baseURI`.
 * @param params.extractMode - `"text"` returns whitespace-normalized text
 *   content; any other mode converts the extracted HTML with `htmlToMarkdown`.
 * @returns The extracted text plus an optional title, or `null` when
 *   Readability yields no content or when any step throws.
 */
export async function extractReadableContent(params: {
  html: string;
  url: string;
  extractMode: (typeof EXTRACT_MODES)[number];
}): Promise<{ text: string; title?: string } | null> {
  try {
    const [readabilityModule, linkedomModule] = await Promise.all([
      import("@mozilla/readability"),
      import("linkedom"),
    ]);
    const dom = linkedomModule.parseHTML(params.html);
    const document = dom.document;

    try {
      (document as { baseURI?: string }).baseURI = params.url;
    } catch {
      // Best-effort base URI for relative links; a failed assignment is ignored.
    }

    const article = new readabilityModule.Readability(document, { charThreshold: 0 }).parse();
    if (!article?.content) {
      return null;
    }

    // An empty title is treated as "no title".
    const readableTitle = article.title || undefined;

    if (params.extractMode !== "text") {
      const markdown = htmlToMarkdown(article.content);
      // Prefer Readability's title; otherwise use the one from the conversion.
      return { text: markdown.text, title: readableTitle ?? markdown.title };
    }

    const plainText = normalizeWhitespace(article.textContent ?? "");
    return { text: plainText, title: readableTitle };
  } catch {
    // Any import, parse, or extraction failure is reported as "no result".
    return null;
  }
}
async function runWebSearch(params: {
query: string;
count: number;
@@ -377,6 +413,7 @@ async function runWebFetch(params: {
timeoutSeconds: number;
cacheTtlMs: number;
userAgent: string;
readabilityEnabled: boolean;
}): Promise<Record<string, unknown>> {
const cacheKey = normalizeCacheKey(
`fetch:${params.url}:${params.extractMode}:${params.maxChars}`,
@@ -415,9 +452,25 @@ async function runWebFetch(params: {
let title: string | undefined;
let text = body;
if (contentType.includes("text/html")) {
const parsed = params.extractMode === "text" ? htmlToText(body) : htmlToMarkdown(body);
text = parsed.text;
title = parsed.title;
if (params.readabilityEnabled) {
const readable = await extractReadableContent({
html: body,
url: res.url || params.url,
extractMode: params.extractMode,
});
if (readable?.text) {
text = readable.text;
title = readable.title;
} else {
const parsed = params.extractMode === "text" ? htmlToText(body) : htmlToMarkdown(body);
text = parsed.text;
title = parsed.title;
}
} else {
const parsed = params.extractMode === "text" ? htmlToText(body) : htmlToMarkdown(body);
text = parsed.text;
title = parsed.title;
}
} else if (contentType.includes("application/json")) {
try {
text = JSON.stringify(JSON.parse(body), null, 2);
@@ -490,6 +543,7 @@ export function createWebFetchTool(options?: {
}): AnyAgentTool | null {
const fetch = resolveFetchConfig(options?.config);
if (!resolveFetchEnabled({ fetch, sandboxed: options?.sandboxed })) return null;
const readabilityEnabled = resolveFetchReadabilityEnabled(fetch);
const userAgent =
(fetch && "userAgent" in fetch && typeof fetch.userAgent === "string" && fetch.userAgent) ||
`clawdbot/${VERSION}`;
@@ -511,6 +565,7 @@ export function createWebFetchTool(options?: {
timeoutSeconds: resolveTimeoutSeconds(fetch?.timeoutSeconds, DEFAULT_TIMEOUT_SECONDS),
cacheTtlMs: resolveCacheTtlMs(fetch?.cacheTtlMinutes, DEFAULT_CACHE_TTL_MINUTES),
userAgent,
readabilityEnabled,
});
return jsonResult(result);
},

View File

@@ -262,6 +262,8 @@ const FIELD_HELP: Record<string, string> = {
"tools.web.fetch.timeoutSeconds": "Timeout in seconds for web_fetch requests.",
"tools.web.fetch.cacheTtlMinutes": "Cache TTL in minutes for web_fetch results.",
"tools.web.fetch.userAgent": "Override User-Agent header for web_fetch requests.",
"tools.web.fetch.readability":
"Use Readability to extract main content from HTML (fallbacks to basic HTML cleanup).",
"channels.slack.allowBots":
"Allow bot-authored messages to trigger Slack replies (default: false).",
"channels.slack.thread.historyScope":

View File

@@ -99,7 +99,7 @@ export type ToolsConfig = {
cacheTtlMinutes?: number;
};
fetch?: {
/** Enable web fetch tool (default: false). */
/** Enable web fetch tool (default: true). */
enabled?: boolean;
/** Max characters to return from fetched content. */
maxChars?: number;
@@ -109,6 +109,8 @@ export type ToolsConfig = {
cacheTtlMinutes?: number;
/** Override User-Agent header for fetch requests. */
userAgent?: string;
/** Use Readability to extract main content (default: true). */
readability?: boolean;
};
};
audio?: {