feat: improve web_fetch readability extraction
src/agents/tools/web-tools.readability.test.ts (new file, 49 lines)
@@ -0,0 +1,49 @@
import { describe, expect, it } from "vitest";

import { extractReadableContent } from "./web-tools.js";

const SAMPLE_HTML = `<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8" />
<title>Example Article</title>
</head>
<body>
<nav>
<ul>
<li><a href="/home">Home</a></li>
<li><a href="/about">About</a></li>
</ul>
</nav>
<main>
<article>
<h1>Example Article</h1>
<p>Main content starts here with enough words to satisfy readability.</p>
<p>Second paragraph for a bit more signal.</p>
</article>
</main>
<footer>Footer text</footer>
</body>
</html>`;

describe("web fetch readability", () => {
  it("extracts readable text", async () => {
    const result = await extractReadableContent({
      html: SAMPLE_HTML,
      url: "https://example.com/article",
      extractMode: "text",
    });
    expect(result?.text).toContain("Main content starts here");
    expect(result?.title).toBe("Example Article");
  });

  it("extracts readable markdown", async () => {
    const result = await extractReadableContent({
      html: SAMPLE_HTML,
      url: "https://example.com/article",
      extractMode: "markdown",
    });
    expect(result?.text).toContain("Main content starts here");
    expect(result?.title).toBe("Example Article");
  });
});
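The new test can be run on its own (assuming the repository's standard vitest setup, which the import above implies):

    npx vitest run src/agents/tools/web-tools.readability.test.ts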
@@ -118,6 +118,11 @@ function resolveFetchEnabled(params: { fetch?: WebFetchConfig; sandboxed?: boole
  return true;
}

function resolveFetchReadabilityEnabled(fetch?: WebFetchConfig): boolean {
  if (typeof fetch?.readability === "boolean") return fetch.readability;
  return true;
}

function resolveSearchApiKey(search?: WebSearchConfig): string | undefined {
  const fromConfig =
    search && "apiKey" in search && typeof search.apiKey === "string" ? search.apiKey.trim() : "";
@@ -300,6 +305,37 @@ async function readResponseText(res: Response): Promise<string> {
  }
}

export async function extractReadableContent(params: {
  html: string;
  url: string;
  extractMode: (typeof EXTRACT_MODES)[number];
}): Promise<{ text: string; title?: string } | null> {
  try {
    const [{ Readability }, { parseHTML }] = await Promise.all([
      import("@mozilla/readability"),
      import("linkedom"),
    ]);
    const { document } = parseHTML(params.html);
    try {
      (document as { baseURI?: string }).baseURI = params.url;
    } catch {
      // Best-effort base URI for relative links.
    }
    const reader = new Readability(document, { charThreshold: 0 });
    const parsed = reader.parse();
    if (!parsed?.content) return null;
    const title = parsed.title || undefined;
    if (params.extractMode === "text") {
      const text = normalizeWhitespace(parsed.textContent ?? "");
      return { text, title };
    }
    const rendered = htmlToMarkdown(parsed.content);
    return { text: rendered.text, title: title ?? rendered.title };
  } catch {
    return null;
  }
}

async function runWebSearch(params: {
  query: string;
  count: number;
@@ -377,6 +413,7 @@ async function runWebFetch(params: {
  timeoutSeconds: number;
  cacheTtlMs: number;
  userAgent: string;
  readabilityEnabled: boolean;
}): Promise<Record<string, unknown>> {
  const cacheKey = normalizeCacheKey(
    `fetch:${params.url}:${params.extractMode}:${params.maxChars}`,
@@ -415,9 +452,25 @@ async function runWebFetch(params: {
  let title: string | undefined;
  let text = body;
  if (contentType.includes("text/html")) {
-   const parsed = params.extractMode === "text" ? htmlToText(body) : htmlToMarkdown(body);
-   text = parsed.text;
-   title = parsed.title;
    if (params.readabilityEnabled) {
      const readable = await extractReadableContent({
        html: body,
        url: res.url || params.url,
        extractMode: params.extractMode,
      });
      if (readable?.text) {
        text = readable.text;
        title = readable.title;
      } else {
        const parsed = params.extractMode === "text" ? htmlToText(body) : htmlToMarkdown(body);
        text = parsed.text;
        title = parsed.title;
      }
    } else {
      const parsed = params.extractMode === "text" ? htmlToText(body) : htmlToMarkdown(body);
      text = parsed.text;
      title = parsed.title;
    }
  } else if (contentType.includes("application/json")) {
    try {
      text = JSON.stringify(JSON.parse(body), null, 2);
@@ -490,6 +543,7 @@ export function createWebFetchTool(options?: {
}): AnyAgentTool | null {
  const fetch = resolveFetchConfig(options?.config);
  if (!resolveFetchEnabled({ fetch, sandboxed: options?.sandboxed })) return null;
  const readabilityEnabled = resolveFetchReadabilityEnabled(fetch);
  const userAgent =
    (fetch && "userAgent" in fetch && typeof fetch.userAgent === "string" && fetch.userAgent) ||
    `clawdbot/${VERSION}`;
@@ -511,6 +565,7 @@ export function createWebFetchTool(options?: {
        timeoutSeconds: resolveTimeoutSeconds(fetch?.timeoutSeconds, DEFAULT_TIMEOUT_SECONDS),
        cacheTtlMs: resolveCacheTtlMs(fetch?.cacheTtlMinutes, DEFAULT_CACHE_TTL_MINUTES),
        userAgent,
        readabilityEnabled,
      });
      return jsonResult(result);
    },
@@ -262,6 +262,8 @@ const FIELD_HELP: Record<string, string> = {
  "tools.web.fetch.timeoutSeconds": "Timeout in seconds for web_fetch requests.",
  "tools.web.fetch.cacheTtlMinutes": "Cache TTL in minutes for web_fetch results.",
  "tools.web.fetch.userAgent": "Override User-Agent header for web_fetch requests.",
  "tools.web.fetch.readability":
    "Use Readability to extract main content from HTML (falls back to basic HTML cleanup).",
  "channels.slack.allowBots":
    "Allow bot-authored messages to trigger Slack replies (default: false).",
  "channels.slack.thread.historyScope":
@@ -99,7 +99,7 @@ export type ToolsConfig = {
      cacheTtlMinutes?: number;
    };
    fetch?: {
-     /** Enable web fetch tool (default: false). */
+     /** Enable web fetch tool (default: true). */
      enabled?: boolean;
      /** Max characters to return from fetched content. */
      maxChars?: number;
@@ -109,6 +109,8 @@ export type ToolsConfig = {
      cacheTtlMinutes?: number;
      /** Override User-Agent header for fetch requests. */
      userAgent?: string;
      /** Use Readability to extract main content (default: true). */
      readability?: boolean;
    };
  };
  audio?: {
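For reference, a minimal config sketch exercising the new option; the `web` nesting is inferred from the `tools.web.fetch.*` help keys above, so treat the exact shape as an assumption rather than the confirmed type:

    // Sketch only: nesting inferred from the "tools.web.fetch.*" FIELD_HELP keys.
    const toolsConfig: ToolsConfig = {
      web: {
        fetch: {
          enabled: true, // web_fetch tool on (default: true)
          readability: false, // skip Readability, use basic HTML cleanup
          maxChars: 20_000,
          timeoutSeconds: 30,
        },
      },
    };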