feat: improve web_fetch readability extraction

This commit is contained in:
Peter Steinberger
2026-01-16 23:17:55 +00:00
parent 9aad6dfe1b
commit 37fa4f7eef
9 changed files with 242 additions and 8 deletions

View File

@@ -18,6 +18,7 @@
- CLI/macOS: sync remote SSH target/identity to config and let `gateway status` auto-infer SSH targets (ssh-config aware).
- Telegram: scope inline buttons with allowlist default + callback gating in DMs/groups.
- Telegram: default reaction notifications to own.
- Tools: improve `web_fetch` extraction using Readability (with fallback).
- Heartbeat: tighten prompt guidance + suppress duplicate alerts for 24h. (#980) — thanks @voidserf.
- Repo: ignore local identity files to avoid accidental commits. (#1001) — thanks @gerardward2007.
- Sessions/Security: add `session.dmScope` for multi-user DM isolation and audit warnings. (#948) — thanks @Alphonse-arianee.
@@ -95,7 +96,7 @@
- macOS: resolve gateway token/password using config mode/remote URL, and warn when `launchctl setenv` overrides config. (#1022, #1021) — thanks @kkarimi.
- Telegram: allow reply-chain messages to bypass mention gating in groups. (#1038) — thanks @adityashaw2.
- Groups: treat replies to the bot as implicit mentions across supported channels.
- Security: bump `tar` to 7.5.3 to fix GHSA-8qq5-rm4j-mr97.
- Security: bump dependency `tar` to 7.5.3 to fix GHSA-8qq5-rm4j-mr97.
## 2026.1.14-1

View File

@@ -1709,11 +1709,12 @@ Legacy: `tools.bash` is still accepted as an alias.
- `tools.web.search.maxResults` (1–10, default 5)
- `tools.web.search.timeoutSeconds` (default 30)
- `tools.web.search.cacheTtlMinutes` (default 15)
- `tools.web.fetch.enabled` (default false; sandboxed sessions auto-enable unless set to false)
- `tools.web.fetch.enabled` (default true)
- `tools.web.fetch.maxChars` (default 50000)
- `tools.web.fetch.timeoutSeconds` (default 30)
- `tools.web.fetch.cacheTtlMinutes` (default 15)
- `tools.web.fetch.userAgent` (optional override)
- `tools.web.fetch.readability` (default true; disable to use basic HTML cleanup only)
`agents.defaults.subagents` configures sub-agent defaults:
- `model`: default model for spawned sub-agents (string or `{ primary, fallbacks }`). If omitted, sub-agents inherit the caller's model unless overridden per agent or per call.

View File

@@ -116,7 +116,8 @@ Fetch a URL and extract readable content.
maxChars: 50000,
timeoutSeconds: 30,
cacheTtlMinutes: 15,
userAgent: "clawdbot/2026.1.15"
userAgent: "clawdbot/2026.1.15",
readability: true
}
}
}
@@ -130,7 +131,8 @@ Fetch a URL and extract readable content.
- `maxChars` (truncate long pages)
Notes:
- `web_fetch` uses Readability (main-content extraction) by default and falls back to basic HTML cleanup if it fails.
- `web_fetch` is best-effort extraction; some sites will need the browser tool.
- Responses are cached (default 15 minutes) to reduce repeated fetches.
- If you use tool profiles/allowlists, add `web_search`/`web_fetch` or `group:web`.
- If the Brave key is missing, `web_search` returns a short setup hint with a docs link.
- If the Brave key is missing, `web_search` returns a short setup hint with a docs link.

View File

@@ -142,6 +142,7 @@
"@mariozechner/pi-ai": "0.46.0",
"@mariozechner/pi-coding-agent": "^0.46.0",
"@mariozechner/pi-tui": "^0.46.0",
"@mozilla/readability": "^0.6.0",
"@sinclair/typebox": "0.34.47",
"@slack/bolt": "^4.6.0",
"@slack/web-api": "^7.13.0",
@@ -162,6 +163,7 @@
"hono": "4.11.4",
"jiti": "^2.6.1",
"json5": "^2.2.3",
"linkedom": "^0.18.12",
"long": "5.3.2",
"markdown-it": "^14.1.0",
"osc-progress": "^0.2.0",

120
pnpm-lock.yaml generated
View File

@@ -40,6 +40,9 @@ importers:
'@mariozechner/pi-tui':
specifier: ^0.46.0
version: 0.46.0
'@mozilla/readability':
specifier: ^0.6.0
version: 0.6.0
'@sinclair/typebox':
specifier: 0.34.47
version: 0.34.47
@@ -100,6 +103,9 @@ importers:
json5:
specifier: ^2.2.3
version: 2.2.3
linkedom:
specifier: ^0.18.12
version: 0.18.12
long:
specifier: 5.3.2
version: 5.3.2
@@ -1036,6 +1042,10 @@ packages:
'@mistralai/mistralai@1.10.0':
resolution: {integrity: sha512-tdIgWs4Le8vpvPiUEWne6tK0qbVc+jMenujnvTqOjogrJUsCSQhus0tHTU1avDDh5//Rq2dFgP9mWRAdIEoBqg==}
'@mozilla/readability@0.6.0':
resolution: {integrity: sha512-juG5VWh4qAivzTAeMzvY9xs9HY5rAcr2E4I7tiSSCokRFi7XIZCAu92ZkSTsIj1OPceCifL3cpfteP3pDT9/QQ==}
engines: {node: '>=14.0.0'}
'@napi-rs/wasm-runtime@1.1.1':
resolution: {integrity: sha512-p64ah1M1ld8xjWv3qbvFwHiFVWrq1yFvV4f7w+mzaqiR4IlSgkqhcRdHwsGgomwzBH51sRY4NEowLxnaBjcW/A==}
@@ -2221,6 +2231,9 @@ packages:
resolution: {integrity: sha512-oP5VkATKlNwcgvxi0vM0p/D3n2C3EReYVX+DNYs5TjZFn/oQt2j+4sVJtSMr18pdRr8wjTcBl6LoV+FUwzPmNA==}
engines: {node: '>=18'}
boolbase@1.0.0:
resolution: {integrity: sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww==}
bottleneck@2.19.5:
resolution: {integrity: sha512-VHiNCbI1lKdl44tGrhNfU3lup0Tj/ZBMJB5/2ZbNXRCPuRCO7ed2mgcK4r17y+KB2EfuYuRaVlwNbAeaWGSpbw==}
@@ -2405,6 +2418,16 @@ packages:
resolution: {integrity: sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA==}
engines: {node: '>= 8'}
css-select@5.2.2:
resolution: {integrity: sha512-TizTzUddG/xYLA3NXodFM0fSbNizXjOKhqiQQwvhlspadZokn1KDy0NZFS0wuEubIYAV5/c1/lAr0TaaFXEXzw==}
css-what@6.2.2:
resolution: {integrity: sha512-u/O3vwbptzhMs3L1fQE82ZSLHQQfto5gyZzwteVIEyeaY5Fc7R4dapF/BvRoSYFeqfBk4m0V1Vafq5Pjv25wvA==}
engines: {node: '>= 6'}
cssom@0.5.0:
resolution: {integrity: sha512-iKuQcq+NdHqlAcwUY0o/HL69XQrUaQdMjmStJ8JFmUaiiQErlhrmuigkg/CU4E2J0IyUKUrMAgl36TvN67MqTw==}
curve25519-js@0.0.4:
resolution: {integrity: sha512-axn2UMEnkhyDUPWOwVKBMVIzSQy2ejH2xRGy1wq81dqRwApXfIzfbE3hIX0ZRFBIihf/KDqK158DLwESu4AK1w==}
@@ -2456,9 +2479,22 @@ packages:
docx-preview@0.3.7:
resolution: {integrity: sha512-Lav69CTA/IYZPJTsKH7oYeoZjyg96N0wEJMNslGJnZJ+dMUZK85Lt5ASC79yUlD48ecWjuv+rkcmFt6EVPV0Xg==}
dom-serializer@2.0.0:
resolution: {integrity: sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg==}
domelementtype@2.3.0:
resolution: {integrity: sha512-OLETBj6w0OsagBwdXnPdN0cnMfF9opN69co+7ZrbfPGrdpPVNBUj02spi6B1N7wChLQiPn4CSH/zJvXw56gmHw==}
domhandler@5.0.3:
resolution: {integrity: sha512-cgwlv/1iFQiFnU96XXgROh8xTeetsnJiDsTc7TYCLFd9+/WNkIqPTxiM/8pSd8VIrhXGTf1Ny1q1hquVqDJB5w==}
engines: {node: '>= 4'}
dompurify@3.3.1:
resolution: {integrity: sha512-qkdCKzLNtrgPFP1Vo+98FRzJnBRGe4ffyCea9IwHB1fyxPOeNTHpLKYGd4Uk9xvNoH0ZoOjwZxNptyMwqrId1Q==}
domutils@3.2.2:
resolution: {integrity: sha512-6kZKyUajlDuqlHKVX1w7gyslj9MPIXzIFiz/rGu35uC1wMi+kMhQwGhl4lt9unC9Vb9INnY9Z3/ZA3+FhASLaw==}
dotenv@17.2.3:
resolution: {integrity: sha512-JVUnt+DUIzu87TABbhPmNfVdBDt18BLOWjMUFJMSi/Qqg7NTYtabbvSNJGOJ7afbRuv9D/lngizHtP7QyLQ+9w==}
engines: {node: '>=12'}
@@ -2493,6 +2529,10 @@ packages:
resolution: {integrity: sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw==}
engines: {node: '>=0.12'}
entities@6.0.1:
resolution: {integrity: sha512-aN97NXWF6AWBTahfVOIrB/NShkzi5H7F9r1s9mD3cDj4Ko5f2qhhVoYMibXF7GlLveb/D2ioWay8lxI97Ven3g==}
engines: {node: '>=0.12'}
env-var@7.5.0:
resolution: {integrity: sha512-mKZOzLRN0ETzau2W2QXefbFjo5EF4yWq28OyKb9ICdeNhHJlOE/pHHnz4hdYJ9cNZXcJHo5xN4OT4pzuSHSNvA==}
engines: {node: '>=10'}
@@ -2767,9 +2807,15 @@ packages:
html-escaper@2.0.2:
resolution: {integrity: sha512-H2iMtd0I4Mt5eYiapRdIDjp+XzelXQ0tFE4JS7YFwFevXXMmOp9myNrUvCg0D6ws8iqkRPBfKHgbwig1SmlLfg==}
html-escaper@3.0.3:
resolution: {integrity: sha512-RuMffC89BOWQoY0WKGpIhn5gX3iI54O6nRA0yC124NYVtzjmFWBIiFd8M0x+ZdX0P9R4lADg1mgP8C7PxGOWuQ==}
html-parse-string@0.0.9:
resolution: {integrity: sha512-wyGnsOolHbNrcb8N6bdJF4EHyzd3zVGCb9/mBxeNjAYBDOZqD7YkqLBz7kXtdgHwNnV8lN/BpSDpsI1zm8Sd8g==}
htmlparser2@10.0.0:
resolution: {integrity: sha512-TwAZM+zE5Tq3lrEHvOlvwgj1XLWQCtaaibSN11Q+gGBAS7Y1uZSWwXXRe4iF6OXnaq1riyQAPFOBtYc77Mxq0g==}
http-errors@2.0.1:
resolution: {integrity: sha512-4FbRdAX+bSdmo4AUFuS0WNiPz8NgFt+r8ThgNWmlrjQjt1Q7ZR9+zTlce2859x4KSXrwIsaeTqDoKQmtP8pLmQ==}
engines: {node: '>= 0.8'}
@@ -3037,6 +3083,15 @@ packages:
limiter@1.1.5:
resolution: {integrity: sha512-FWWMIEOxz3GwUI4Ts/IvgVy6LPvoMPgjMdQ185nN6psJyBJ4yOpzqm695/h5umdLJg2vW3GR5iG11MAkR2AzJA==}
linkedom@0.18.12:
resolution: {integrity: sha512-jalJsOwIKuQJSeTvsgzPe9iJzyfVaEJiEXl+25EkKevsULHvMJzpNqwvj1jOESWdmgKDiXObyjOYwlUqG7wo1Q==}
engines: {node: '>=16'}
peerDependencies:
canvas: '>= 2'
peerDependenciesMeta:
canvas:
optional: true
linkify-it@5.0.0:
resolution: {integrity: sha512-5aHCbzQRADcdP+ATqnDuhhJ/MRIqDkZX5pyjFHRRysS8vZ5AbqGEoFIb6pYHPZ+L/OC2Lc+xT8uHVVR5CAK/wQ==}
@@ -3307,6 +3362,9 @@ packages:
engines: {node: ^12.13.0 || ^14.15.0 || >=16.0.0}
deprecated: This package is no longer supported.
nth-check@2.1.1:
resolution: {integrity: sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w==}
object-assign@4.1.1:
resolution: {integrity: sha512-rJgTQnkUnH1sFw8yT6VSU3zD3sWmu6sZhIseY8VX+GRu3P6F7Fu+JNDoXfklElbLJSnc3FUQHVe4cU5hj+BcUg==}
engines: {node: '>=0.10.0'}
@@ -3982,6 +4040,9 @@ packages:
uhtml@5.0.9:
resolution: {integrity: sha512-qPyu3vGilaLe6zrjOCD/xezWEHLwdevxmbY3hzyhT25KBDF4F7YYW3YZcL3kylD/6dMoVISHjn8ggV3+9FY+5g==}
uhyphen@0.2.0:
resolution: {integrity: sha512-qz3o9CHXmJJPGBdqzab7qAYuW8kQGKNEuoHFYrBwV6hWIMcpAmxDLXojcHfFr9US1Pe6zUswEIJIbLI610fuqA==}
uint8array-extras@1.5.0:
resolution: {integrity: sha512-rvKSBiC5zqCCiDZ9kAOszZcDvdAHwwIKJG33Ykj43OKcWsnmcBRL09YTU4nOeHZ8Y2a7l1MgTd08SBe9A8Qj6A==}
engines: {node: '>=18'}
@@ -5253,6 +5314,8 @@ snapshots:
zod: 3.25.76
zod-to-json-schema: 3.25.1(zod@3.25.76)
'@mozilla/readability@0.6.0': {}
'@napi-rs/wasm-runtime@1.1.1':
dependencies:
'@emnapi/core': 1.8.1
@@ -6544,6 +6607,8 @@ snapshots:
transitivePeerDependencies:
- supports-color
boolbase@1.0.0: {}
bottleneck@2.19.5: {}
bowser@2.13.1: {}
@@ -6745,6 +6810,18 @@ snapshots:
shebang-command: 2.0.0
which: 2.0.2
css-select@5.2.2:
dependencies:
boolbase: 1.0.0
css-what: 6.2.2
domhandler: 5.0.3
domutils: 3.2.2
nth-check: 2.1.1
css-what@6.2.2: {}
cssom@0.5.0: {}
curve25519-js@0.0.4: {}
data-uri-to-buffer@4.0.1: {}
@@ -6777,10 +6854,28 @@ snapshots:
dependencies:
jszip: 3.10.1
dom-serializer@2.0.0:
dependencies:
domelementtype: 2.3.0
domhandler: 5.0.3
entities: 4.5.0
domelementtype@2.3.0: {}
domhandler@5.0.3:
dependencies:
domelementtype: 2.3.0
dompurify@3.3.1:
optionalDependencies:
'@types/trusted-types': 2.0.7
domutils@3.2.2:
dependencies:
dom-serializer: 2.0.0
domelementtype: 2.3.0
domhandler: 5.0.3
dotenv@17.2.3: {}
dunder-proto@1.0.1:
@@ -6808,6 +6903,8 @@ snapshots:
entities@4.5.0: {}
entities@6.0.1: {}
env-var@7.5.0:
optional: true
@@ -7157,8 +7254,17 @@ snapshots:
html-escaper@2.0.2: {}
html-escaper@3.0.3: {}
html-parse-string@0.0.9: {}
htmlparser2@10.0.0:
dependencies:
domelementtype: 2.3.0
domhandler: 5.0.3
domutils: 3.2.2
entities: 6.0.1
http-errors@2.0.1:
dependencies:
depd: 2.0.0
@@ -7436,6 +7542,14 @@ snapshots:
limiter@1.1.5: {}
linkedom@0.18.12:
dependencies:
css-select: 5.2.2
cssom: 0.5.0
html-escaper: 3.0.3
htmlparser2: 10.0.0
uhyphen: 0.2.0
linkify-it@5.0.0:
dependencies:
uc.micro: 2.1.0
@@ -7741,6 +7855,10 @@ snapshots:
set-blocking: 2.0.0
optional: true
nth-check@2.1.1:
dependencies:
boolbase: 1.0.0
object-assign@4.1.1: {}
object-inspect@1.13.4: {}
@@ -8537,6 +8655,8 @@ snapshots:
dependencies:
'@webreflection/alien-signals': 0.3.2
uhyphen@0.2.0: {}
uint8array-extras@1.5.0: {}
undici-types@7.16.0: {}

View File

@@ -0,0 +1,49 @@
import { describe, expect, it } from "vitest";
import { extractReadableContent } from "./web-tools.js";
// Fixture page for the readability tests below: a <nav> link list and a <footer>
// surround a <main><article> holding the heading and the two paragraphs the
// assertions look for. The <title> and the <h1> carry the same text.
const SAMPLE_HTML = `<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8" />
<title>Example Article</title>
</head>
<body>
<nav>
<ul>
<li><a href="/home">Home</a></li>
<li><a href="/about">About</a></li>
</ul>
</nav>
<main>
<article>
<h1>Example Article</h1>
<p>Main content starts here with enough words to satisfy readability.</p>
<p>Second paragraph for a bit more signal.</p>
</article>
</main>
<footer>Footer text</footer>
</body>
</html>`;
describe("web fetch readability", () => {
  // Both cases feed the same fixture and URL; only the extract mode differs.
  const extractSample = (extractMode: "text" | "markdown") =>
    extractReadableContent({
      html: SAMPLE_HTML,
      url: "https://example.com/article",
      extractMode,
    });

  // Shared expectations: the article body is present and the title is picked up.
  const expectArticle = (extracted: Awaited<ReturnType<typeof extractSample>>) => {
    expect(extracted?.text).toContain("Main content starts here");
    expect(extracted?.title).toBe("Example Article");
  };

  it("extracts readable text", async () => {
    expectArticle(await extractSample("text"));
  });

  it("extracts readable markdown", async () => {
    expectArticle(await extractSample("markdown"));
  });
});

View File

@@ -118,6 +118,11 @@ function resolveFetchEnabled(params: { fetch?: WebFetchConfig; sandboxed?: boole
return true;
}
/**
 * Decide whether web_fetch should attempt Readability extraction.
 *
 * An explicit boolean `readability` value in the fetch config is returned
 * as-is; a missing config, a missing field, or a non-boolean value yields `true`.
 */
function resolveFetchReadabilityEnabled(fetch?: WebFetchConfig): boolean {
  const configured = fetch?.readability;
  return typeof configured === "boolean" ? configured : true;
}
function resolveSearchApiKey(search?: WebSearchConfig): string | undefined {
const fromConfig =
search && "apiKey" in search && typeof search.apiKey === "string" ? search.apiKey.trim() : "";
@@ -300,6 +305,37 @@ async function readResponseText(res: Response): Promise<string> {
}
}
/**
 * Extract the main content of an HTML page with Mozilla Readability.
 *
 * `@mozilla/readability` and `linkedom` are loaded through dynamic `import()`
 * inside the function. The HTML is parsed with linkedom's `parseHTML`, and the
 * resulting document is handed to Readability.
 *
 * @param params.html - Raw HTML to parse.
 * @param params.url - Page URL; assigned to the parsed document's `baseURI`
 *   on a best-effort basis (a failing assignment is ignored).
 * @param params.extractMode - `"text"` returns Readability's `textContent`
 *   passed through `normalizeWhitespace`; any other mode renders the extracted
 *   HTML through `htmlToMarkdown`.
 * @returns The extracted text plus an optional title, or `null` when
 *   Readability yields no content or any step throws. Errors are never
 *   propagated to the caller.
 */
export async function extractReadableContent(params: {
  html: string;
  url: string;
  extractMode: (typeof EXTRACT_MODES)[number];
}): Promise<{ text: string; title?: string } | null> {
  try {
    const [readabilityModule, linkedomModule] = await Promise.all([
      import("@mozilla/readability"),
      import("linkedom"),
    ]);
    const doc = linkedomModule.parseHTML(params.html).document;
    try {
      (doc as { baseURI?: string }).baseURI = params.url;
    } catch {
      // Best-effort base URI for relative links.
    }

    // NOTE(review): charThreshold 0 presumably lets short pages still produce
    // an article — confirm against the Readability options documentation.
    const article = new readabilityModule.Readability(doc, { charThreshold: 0 }).parse();
    if (!article?.content) return null;

    // An empty title is reported as undefined rather than "".
    const articleTitle = article.title || undefined;
    if (params.extractMode !== "text") {
      const markdown = htmlToMarkdown(article.content);
      // Prefer Readability's title; fall back to the one htmlToMarkdown reports.
      return { text: markdown.text, title: articleTitle ?? markdown.title };
    }
    return { text: normalizeWhitespace(article.textContent ?? ""), title: articleTitle };
  } catch {
    return null;
  }
}
async function runWebSearch(params: {
query: string;
count: number;
@@ -377,6 +413,7 @@ async function runWebFetch(params: {
timeoutSeconds: number;
cacheTtlMs: number;
userAgent: string;
readabilityEnabled: boolean;
}): Promise<Record<string, unknown>> {
const cacheKey = normalizeCacheKey(
`fetch:${params.url}:${params.extractMode}:${params.maxChars}`,
@@ -415,9 +452,25 @@ async function runWebFetch(params: {
let title: string | undefined;
let text = body;
if (contentType.includes("text/html")) {
const parsed = params.extractMode === "text" ? htmlToText(body) : htmlToMarkdown(body);
text = parsed.text;
title = parsed.title;
if (params.readabilityEnabled) {
const readable = await extractReadableContent({
html: body,
url: res.url || params.url,
extractMode: params.extractMode,
});
if (readable?.text) {
text = readable.text;
title = readable.title;
} else {
const parsed = params.extractMode === "text" ? htmlToText(body) : htmlToMarkdown(body);
text = parsed.text;
title = parsed.title;
}
} else {
const parsed = params.extractMode === "text" ? htmlToText(body) : htmlToMarkdown(body);
text = parsed.text;
title = parsed.title;
}
} else if (contentType.includes("application/json")) {
try {
text = JSON.stringify(JSON.parse(body), null, 2);
@@ -490,6 +543,7 @@ export function createWebFetchTool(options?: {
}): AnyAgentTool | null {
const fetch = resolveFetchConfig(options?.config);
if (!resolveFetchEnabled({ fetch, sandboxed: options?.sandboxed })) return null;
const readabilityEnabled = resolveFetchReadabilityEnabled(fetch);
const userAgent =
(fetch && "userAgent" in fetch && typeof fetch.userAgent === "string" && fetch.userAgent) ||
`clawdbot/${VERSION}`;
@@ -511,6 +565,7 @@ export function createWebFetchTool(options?: {
timeoutSeconds: resolveTimeoutSeconds(fetch?.timeoutSeconds, DEFAULT_TIMEOUT_SECONDS),
cacheTtlMs: resolveCacheTtlMs(fetch?.cacheTtlMinutes, DEFAULT_CACHE_TTL_MINUTES),
userAgent,
readabilityEnabled,
});
return jsonResult(result);
},

View File

@@ -262,6 +262,8 @@ const FIELD_HELP: Record<string, string> = {
"tools.web.fetch.timeoutSeconds": "Timeout in seconds for web_fetch requests.",
"tools.web.fetch.cacheTtlMinutes": "Cache TTL in minutes for web_fetch results.",
"tools.web.fetch.userAgent": "Override User-Agent header for web_fetch requests.",
"tools.web.fetch.readability":
"Use Readability to extract main content from HTML (fallbacks to basic HTML cleanup).",
"channels.slack.allowBots":
"Allow bot-authored messages to trigger Slack replies (default: false).",
"channels.slack.thread.historyScope":

View File

@@ -99,7 +99,7 @@ export type ToolsConfig = {
cacheTtlMinutes?: number;
};
fetch?: {
/** Enable web fetch tool (default: false). */
/** Enable web fetch tool (default: true). */
enabled?: boolean;
/** Max characters to return from fetched content. */
maxChars?: number;
@@ -109,6 +109,8 @@ export type ToolsConfig = {
cacheTtlMinutes?: number;
/** Override User-Agent header for fetch requests. */
userAgent?: string;
/** Use Readability to extract main content (default: true). */
readability?: boolean;
};
};
audio?: {