feat: improve web_fetch readability extraction
src/agents/tools/web-tools.readability.test.ts (new file, 49 lines)
@@ -0,0 +1,49 @@
import { describe, expect, it } from "vitest";

import { extractReadableContent } from "./web-tools.js";

const SAMPLE_HTML = `<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8" />
<title>Example Article</title>
</head>
<body>
<nav>
<ul>
<li><a href="/home">Home</a></li>
<li><a href="/about">About</a></li>
</ul>
</nav>
<main>
<article>
<h1>Example Article</h1>
<p>Main content starts here with enough words to satisfy readability.</p>
<p>Second paragraph for a bit more signal.</p>
</article>
</main>
<footer>Footer text</footer>
</body>
</html>`;

describe("web fetch readability", () => {
  it("extracts readable text", async () => {
    const result = await extractReadableContent({
      html: SAMPLE_HTML,
      url: "https://example.com/article",
      extractMode: "text",
    });
    expect(result?.text).toContain("Main content starts here");
    expect(result?.title).toBe("Example Article");
  });

  it("extracts readable markdown", async () => {
    const result = await extractReadableContent({
      html: SAMPLE_HTML,
      url: "https://example.com/article",
      extractMode: "markdown",
    });
    expect(result?.text).toContain("Main content starts here");
    expect(result?.title).toBe("Example Article");
  });
});
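The new test can be run on its own (assuming the repository's standard vitest setup, which the import above implies):

    npx vitest run src/agents/tools/web-tools.readability.test.ts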
@@ -118,6 +118,11 @@ function resolveFetchEnabled(params: { fetch?: WebFetchConfig; sandboxed?: boole
  return true;
}

function resolveFetchReadabilityEnabled(fetch?: WebFetchConfig): boolean {
  if (typeof fetch?.readability === "boolean") return fetch.readability;
  return true;
}

function resolveSearchApiKey(search?: WebSearchConfig): string | undefined {
  const fromConfig =
    search && "apiKey" in search && typeof search.apiKey === "string" ? search.apiKey.trim() : "";
@@ -300,6 +305,37 @@ async function readResponseText(res: Response): Promise<string> {
  }
}

export async function extractReadableContent(params: {
  html: string;
  url: string;
  extractMode: (typeof EXTRACT_MODES)[number];
}): Promise<{ text: string; title?: string } | null> {
  try {
    const [{ Readability }, { parseHTML }] = await Promise.all([
      import("@mozilla/readability"),
      import("linkedom"),
    ]);
    const { document } = parseHTML(params.html);
    try {
      (document as { baseURI?: string }).baseURI = params.url;
    } catch {
      // Best-effort base URI for relative links.
    }
    const reader = new Readability(document, { charThreshold: 0 });
    const parsed = reader.parse();
    if (!parsed?.content) return null;
    const title = parsed.title || undefined;
    if (params.extractMode === "text") {
      const text = normalizeWhitespace(parsed.textContent ?? "");
      return { text, title };
    }
    const rendered = htmlToMarkdown(parsed.content);
    return { text: rendered.text, title: title ?? rendered.title };
  } catch {
    return null;
  }
}

async function runWebSearch(params: {
  query: string;
  count: number;
@@ -377,6 +413,7 @@ async function runWebFetch(params: {
  timeoutSeconds: number;
  cacheTtlMs: number;
  userAgent: string;
  readabilityEnabled: boolean;
}): Promise<Record<string, unknown>> {
  const cacheKey = normalizeCacheKey(
    `fetch:${params.url}:${params.extractMode}:${params.maxChars}`,
@@ -415,9 +452,25 @@ async function runWebFetch(params: {
  let title: string | undefined;
  let text = body;
  if (contentType.includes("text/html")) {
-   const parsed = params.extractMode === "text" ? htmlToText(body) : htmlToMarkdown(body);
-   text = parsed.text;
-   title = parsed.title;
    if (params.readabilityEnabled) {
      const readable = await extractReadableContent({
        html: body,
        url: res.url || params.url,
        extractMode: params.extractMode,
      });
      if (readable?.text) {
        text = readable.text;
        title = readable.title;
      } else {
        const parsed = params.extractMode === "text" ? htmlToText(body) : htmlToMarkdown(body);
        text = parsed.text;
        title = parsed.title;
      }
    } else {
      const parsed = params.extractMode === "text" ? htmlToText(body) : htmlToMarkdown(body);
      text = parsed.text;
      title = parsed.title;
    }
  } else if (contentType.includes("application/json")) {
    try {
      text = JSON.stringify(JSON.parse(body), null, 2);
@@ -490,6 +543,7 @@ export function createWebFetchTool(options?: {
}): AnyAgentTool | null {
  const fetch = resolveFetchConfig(options?.config);
  if (!resolveFetchEnabled({ fetch, sandboxed: options?.sandboxed })) return null;
  const readabilityEnabled = resolveFetchReadabilityEnabled(fetch);
  const userAgent =
    (fetch && "userAgent" in fetch && typeof fetch.userAgent === "string" && fetch.userAgent) ||
    `clawdbot/${VERSION}`;
@@ -511,6 +565,7 @@ export function createWebFetchTool(options?: {
        timeoutSeconds: resolveTimeoutSeconds(fetch?.timeoutSeconds, DEFAULT_TIMEOUT_SECONDS),
        cacheTtlMs: resolveCacheTtlMs(fetch?.cacheTtlMinutes, DEFAULT_CACHE_TTL_MINUTES),
        userAgent,
        readabilityEnabled,
      });
      return jsonResult(result);
    },
@@ -262,6 +262,8 @@ const FIELD_HELP: Record<string, string> = {
  "tools.web.fetch.timeoutSeconds": "Timeout in seconds for web_fetch requests.",
  "tools.web.fetch.cacheTtlMinutes": "Cache TTL in minutes for web_fetch results.",
  "tools.web.fetch.userAgent": "Override User-Agent header for web_fetch requests.",
  "tools.web.fetch.readability":
    "Use Readability to extract main content from HTML (falls back to basic HTML cleanup).",
  "channels.slack.allowBots":
    "Allow bot-authored messages to trigger Slack replies (default: false).",
  "channels.slack.thread.historyScope":
@@ -99,7 +99,7 @@ export type ToolsConfig = {
      cacheTtlMinutes?: number;
    };
    fetch?: {
-     /** Enable web fetch tool (default: false). */
+     /** Enable web fetch tool (default: true). */
      enabled?: boolean;
      /** Max characters to return from fetched content. */
      maxChars?: number;
@@ -109,6 +109,8 @@ export type ToolsConfig = {
      cacheTtlMinutes?: number;
      /** Override User-Agent header for fetch requests. */
      userAgent?: string;
      /** Use Readability to extract main content (default: true). */
      readability?: boolean;
    };
  };
  audio?: {
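For reference, a minimal config sketch exercising the new option; the `web` nesting is inferred from the `tools.web.fetch.*` help keys above, so treat the exact shape as an assumption rather than the confirmed type:

    // Sketch only: nesting inferred from the "tools.web.fetch.*" FIELD_HELP keys.
    const toolsConfig: ToolsConfig = {
      web: {
        fetch: {
          enabled: true, // web_fetch tool on (default: true)
          readability: false, // skip Readability, use basic HTML cleanup
          maxChars: 20_000,
          timeoutSeconds: 30,
        },
      },
    };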