diff --git a/CHANGELOG.md b/CHANGELOG.md index 35f9c7b2a..f40342fcf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,7 @@ - CLI/macOS: sync remote SSH target/identity to config and let `gateway status` auto-infer SSH targets (ssh-config aware). - Telegram: scope inline buttons with allowlist default + callback gating in DMs/groups. - Telegram: default reaction notifications to own. +- Tools: improve `web_fetch` extraction using Readability (with fallback). - Heartbeat: tighten prompt guidance + suppress duplicate alerts for 24h. (#980) — thanks @voidserf. - Repo: ignore local identity files to avoid accidental commits. (#1001) — thanks @gerardward2007. - Sessions/Security: add `session.dmScope` for multi-user DM isolation and audit warnings. (#948) — thanks @Alphonse-arianee. @@ -95,7 +96,7 @@ - macOS: resolve gateway token/password using config mode/remote URL, and warn when `launchctl setenv` overrides config. (#1022, #1021) — thanks @kkarimi. - Telegram: allow reply-chain messages to bypass mention gating in groups. (#1038) — thanks @adityashaw2. - Groups: treat replies to the bot as implicit mentions across supported channels. -- Security: bump `tar` to 7.5.3 to fix GHSA-8qq5-rm4j-mr97. +- Security: bump dependency `tar` to 7.5.3 to fix GHSA-8qq5-rm4j-mr97. ## 2026.1.14-1 diff --git a/docs/gateway/configuration.md b/docs/gateway/configuration.md index 5b9a900fd..abf7dce4d 100644 --- a/docs/gateway/configuration.md +++ b/docs/gateway/configuration.md @@ -1709,11 +1709,12 @@ Legacy: `tools.bash` is still accepted as an alias. 
- `tools.web.search.maxResults` (1–10, default 5) - `tools.web.search.timeoutSeconds` (default 30) - `tools.web.search.cacheTtlMinutes` (default 15) -- `tools.web.fetch.enabled` (default false; sandboxed sessions auto-enable unless set to false) +- `tools.web.fetch.enabled` (default true) - `tools.web.fetch.maxChars` (default 50000) - `tools.web.fetch.timeoutSeconds` (default 30) - `tools.web.fetch.cacheTtlMinutes` (default 15) - `tools.web.fetch.userAgent` (optional override) +- `tools.web.fetch.readability` (default true; disable to use basic HTML cleanup only) `agents.defaults.subagents` configures sub-agent defaults: - `model`: default model for spawned sub-agents (string or `{ primary, fallbacks }`). If omitted, sub-agents inherit the caller’s model unless overridden per agent or per call. diff --git a/docs/tools/web.md b/docs/tools/web.md index 5de9381c0..4e44f72dd 100644 --- a/docs/tools/web.md +++ b/docs/tools/web.md @@ -116,7 +116,8 @@ Fetch a URL and extract readable content. maxChars: 50000, timeoutSeconds: 30, cacheTtlMinutes: 15, - userAgent: "clawdbot/2026.1.15" + userAgent: "clawdbot/2026.1.15", + readability: true } } } @@ -130,7 +131,8 @@ Fetch a URL and extract readable content. - `maxChars` (truncate long pages) Notes: +- `web_fetch` uses Readability (main-content extraction) by default and falls back to basic HTML cleanup if it fails. - `web_fetch` is best-effort extraction; some sites will need the browser tool. - Responses are cached (default 15 minutes) to reduce repeated fetches. - If you use tool profiles/allowlists, add `web_search`/`web_fetch` or `group:web`. - - If the Brave key is missing, `web_search` returns a short setup hint with a docs link. +- If the Brave key is missing, `web_search` returns a short setup hint with a docs link. 
diff --git a/package.json b/package.json index 9edc69d36..331438b30 100644 --- a/package.json +++ b/package.json @@ -142,6 +142,7 @@ "@mariozechner/pi-ai": "0.46.0", "@mariozechner/pi-coding-agent": "^0.46.0", "@mariozechner/pi-tui": "^0.46.0", + "@mozilla/readability": "^0.6.0", "@sinclair/typebox": "0.34.47", "@slack/bolt": "^4.6.0", "@slack/web-api": "^7.13.0", @@ -162,6 +163,7 @@ "hono": "4.11.4", "jiti": "^2.6.1", "json5": "^2.2.3", + "linkedom": "^0.18.12", "long": "5.3.2", "markdown-it": "^14.1.0", "osc-progress": "^0.2.0", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index bab486595..2b7b18eba 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -40,6 +40,9 @@ importers: '@mariozechner/pi-tui': specifier: ^0.46.0 version: 0.46.0 + '@mozilla/readability': + specifier: ^0.6.0 + version: 0.6.0 '@sinclair/typebox': specifier: 0.34.47 version: 0.34.47 @@ -100,6 +103,9 @@ importers: json5: specifier: ^2.2.3 version: 2.2.3 + linkedom: + specifier: ^0.18.12 + version: 0.18.12 long: specifier: 5.3.2 version: 5.3.2 @@ -1036,6 +1042,10 @@ packages: '@mistralai/mistralai@1.10.0': resolution: {integrity: sha512-tdIgWs4Le8vpvPiUEWne6tK0qbVc+jMenujnvTqOjogrJUsCSQhus0tHTU1avDDh5//Rq2dFgP9mWRAdIEoBqg==} + '@mozilla/readability@0.6.0': + resolution: {integrity: sha512-juG5VWh4qAivzTAeMzvY9xs9HY5rAcr2E4I7tiSSCokRFi7XIZCAu92ZkSTsIj1OPceCifL3cpfteP3pDT9/QQ==} + engines: {node: '>=14.0.0'} + '@napi-rs/wasm-runtime@1.1.1': resolution: {integrity: sha512-p64ah1M1ld8xjWv3qbvFwHiFVWrq1yFvV4f7w+mzaqiR4IlSgkqhcRdHwsGgomwzBH51sRY4NEowLxnaBjcW/A==} @@ -2221,6 +2231,9 @@ packages: resolution: {integrity: sha512-oP5VkATKlNwcgvxi0vM0p/D3n2C3EReYVX+DNYs5TjZFn/oQt2j+4sVJtSMr18pdRr8wjTcBl6LoV+FUwzPmNA==} engines: {node: '>=18'} + boolbase@1.0.0: + resolution: {integrity: sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww==} + bottleneck@2.19.5: resolution: {integrity: 
sha512-VHiNCbI1lKdl44tGrhNfU3lup0Tj/ZBMJB5/2ZbNXRCPuRCO7ed2mgcK4r17y+KB2EfuYuRaVlwNbAeaWGSpbw==} @@ -2405,6 +2418,16 @@ packages: resolution: {integrity: sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA==} engines: {node: '>= 8'} + css-select@5.2.2: + resolution: {integrity: sha512-TizTzUddG/xYLA3NXodFM0fSbNizXjOKhqiQQwvhlspadZokn1KDy0NZFS0wuEubIYAV5/c1/lAr0TaaFXEXzw==} + + css-what@6.2.2: + resolution: {integrity: sha512-u/O3vwbptzhMs3L1fQE82ZSLHQQfto5gyZzwteVIEyeaY5Fc7R4dapF/BvRoSYFeqfBk4m0V1Vafq5Pjv25wvA==} + engines: {node: '>= 6'} + + cssom@0.5.0: + resolution: {integrity: sha512-iKuQcq+NdHqlAcwUY0o/HL69XQrUaQdMjmStJ8JFmUaiiQErlhrmuigkg/CU4E2J0IyUKUrMAgl36TvN67MqTw==} + curve25519-js@0.0.4: resolution: {integrity: sha512-axn2UMEnkhyDUPWOwVKBMVIzSQy2ejH2xRGy1wq81dqRwApXfIzfbE3hIX0ZRFBIihf/KDqK158DLwESu4AK1w==} @@ -2456,9 +2479,22 @@ packages: docx-preview@0.3.7: resolution: {integrity: sha512-Lav69CTA/IYZPJTsKH7oYeoZjyg96N0wEJMNslGJnZJ+dMUZK85Lt5ASC79yUlD48ecWjuv+rkcmFt6EVPV0Xg==} + dom-serializer@2.0.0: + resolution: {integrity: sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg==} + + domelementtype@2.3.0: + resolution: {integrity: sha512-OLETBj6w0OsagBwdXnPdN0cnMfF9opN69co+7ZrbfPGrdpPVNBUj02spi6B1N7wChLQiPn4CSH/zJvXw56gmHw==} + + domhandler@5.0.3: + resolution: {integrity: sha512-cgwlv/1iFQiFnU96XXgROh8xTeetsnJiDsTc7TYCLFd9+/WNkIqPTxiM/8pSd8VIrhXGTf1Ny1q1hquVqDJB5w==} + engines: {node: '>= 4'} + dompurify@3.3.1: resolution: {integrity: sha512-qkdCKzLNtrgPFP1Vo+98FRzJnBRGe4ffyCea9IwHB1fyxPOeNTHpLKYGd4Uk9xvNoH0ZoOjwZxNptyMwqrId1Q==} + domutils@3.2.2: + resolution: {integrity: sha512-6kZKyUajlDuqlHKVX1w7gyslj9MPIXzIFiz/rGu35uC1wMi+kMhQwGhl4lt9unC9Vb9INnY9Z3/ZA3+FhASLaw==} + dotenv@17.2.3: resolution: {integrity: sha512-JVUnt+DUIzu87TABbhPmNfVdBDt18BLOWjMUFJMSi/Qqg7NTYtabbvSNJGOJ7afbRuv9D/lngizHtP7QyLQ+9w==} engines: {node: '>=12'} @@ -2493,6 +2529,10 @@ packages: resolution: 
{integrity: sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw==} engines: {node: '>=0.12'} + entities@6.0.1: + resolution: {integrity: sha512-aN97NXWF6AWBTahfVOIrB/NShkzi5H7F9r1s9mD3cDj4Ko5f2qhhVoYMibXF7GlLveb/D2ioWay8lxI97Ven3g==} + engines: {node: '>=0.12'} + env-var@7.5.0: resolution: {integrity: sha512-mKZOzLRN0ETzau2W2QXefbFjo5EF4yWq28OyKb9ICdeNhHJlOE/pHHnz4hdYJ9cNZXcJHo5xN4OT4pzuSHSNvA==} engines: {node: '>=10'} @@ -2767,9 +2807,15 @@ packages: html-escaper@2.0.2: resolution: {integrity: sha512-H2iMtd0I4Mt5eYiapRdIDjp+XzelXQ0tFE4JS7YFwFevXXMmOp9myNrUvCg0D6ws8iqkRPBfKHgbwig1SmlLfg==} + html-escaper@3.0.3: + resolution: {integrity: sha512-RuMffC89BOWQoY0WKGpIhn5gX3iI54O6nRA0yC124NYVtzjmFWBIiFd8M0x+ZdX0P9R4lADg1mgP8C7PxGOWuQ==} + html-parse-string@0.0.9: resolution: {integrity: sha512-wyGnsOolHbNrcb8N6bdJF4EHyzd3zVGCb9/mBxeNjAYBDOZqD7YkqLBz7kXtdgHwNnV8lN/BpSDpsI1zm8Sd8g==} + htmlparser2@10.0.0: + resolution: {integrity: sha512-TwAZM+zE5Tq3lrEHvOlvwgj1XLWQCtaaibSN11Q+gGBAS7Y1uZSWwXXRe4iF6OXnaq1riyQAPFOBtYc77Mxq0g==} + http-errors@2.0.1: resolution: {integrity: sha512-4FbRdAX+bSdmo4AUFuS0WNiPz8NgFt+r8ThgNWmlrjQjt1Q7ZR9+zTlce2859x4KSXrwIsaeTqDoKQmtP8pLmQ==} engines: {node: '>= 0.8'} @@ -3037,6 +3083,15 @@ packages: limiter@1.1.5: resolution: {integrity: sha512-FWWMIEOxz3GwUI4Ts/IvgVy6LPvoMPgjMdQ185nN6psJyBJ4yOpzqm695/h5umdLJg2vW3GR5iG11MAkR2AzJA==} + linkedom@0.18.12: + resolution: {integrity: sha512-jalJsOwIKuQJSeTvsgzPe9iJzyfVaEJiEXl+25EkKevsULHvMJzpNqwvj1jOESWdmgKDiXObyjOYwlUqG7wo1Q==} + engines: {node: '>=16'} + peerDependencies: + canvas: '>= 2' + peerDependenciesMeta: + canvas: + optional: true + linkify-it@5.0.0: resolution: {integrity: sha512-5aHCbzQRADcdP+ATqnDuhhJ/MRIqDkZX5pyjFHRRysS8vZ5AbqGEoFIb6pYHPZ+L/OC2Lc+xT8uHVVR5CAK/wQ==} @@ -3307,6 +3362,9 @@ packages: engines: {node: ^12.13.0 || ^14.15.0 || >=16.0.0} deprecated: This package is no longer supported. 
+ nth-check@2.1.1: + resolution: {integrity: sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w==} + object-assign@4.1.1: resolution: {integrity: sha512-rJgTQnkUnH1sFw8yT6VSU3zD3sWmu6sZhIseY8VX+GRu3P6F7Fu+JNDoXfklElbLJSnc3FUQHVe4cU5hj+BcUg==} engines: {node: '>=0.10.0'} @@ -3982,6 +4040,9 @@ packages: uhtml@5.0.9: resolution: {integrity: sha512-qPyu3vGilaLe6zrjOCD/xezWEHLwdevxmbY3hzyhT25KBDF4F7YYW3YZcL3kylD/6dMoVISHjn8ggV3+9FY+5g==} + uhyphen@0.2.0: + resolution: {integrity: sha512-qz3o9CHXmJJPGBdqzab7qAYuW8kQGKNEuoHFYrBwV6hWIMcpAmxDLXojcHfFr9US1Pe6zUswEIJIbLI610fuqA==} + uint8array-extras@1.5.0: resolution: {integrity: sha512-rvKSBiC5zqCCiDZ9kAOszZcDvdAHwwIKJG33Ykj43OKcWsnmcBRL09YTU4nOeHZ8Y2a7l1MgTd08SBe9A8Qj6A==} engines: {node: '>=18'} @@ -5253,6 +5314,8 @@ snapshots: zod: 3.25.76 zod-to-json-schema: 3.25.1(zod@3.25.76) + '@mozilla/readability@0.6.0': {} + '@napi-rs/wasm-runtime@1.1.1': dependencies: '@emnapi/core': 1.8.1 @@ -6544,6 +6607,8 @@ snapshots: transitivePeerDependencies: - supports-color + boolbase@1.0.0: {} + bottleneck@2.19.5: {} bowser@2.13.1: {} @@ -6745,6 +6810,18 @@ snapshots: shebang-command: 2.0.0 which: 2.0.2 + css-select@5.2.2: + dependencies: + boolbase: 1.0.0 + css-what: 6.2.2 + domhandler: 5.0.3 + domutils: 3.2.2 + nth-check: 2.1.1 + + css-what@6.2.2: {} + + cssom@0.5.0: {} + curve25519-js@0.0.4: {} data-uri-to-buffer@4.0.1: {} @@ -6777,10 +6854,28 @@ snapshots: dependencies: jszip: 3.10.1 + dom-serializer@2.0.0: + dependencies: + domelementtype: 2.3.0 + domhandler: 5.0.3 + entities: 4.5.0 + + domelementtype@2.3.0: {} + + domhandler@5.0.3: + dependencies: + domelementtype: 2.3.0 + dompurify@3.3.1: optionalDependencies: '@types/trusted-types': 2.0.7 + domutils@3.2.2: + dependencies: + dom-serializer: 2.0.0 + domelementtype: 2.3.0 + domhandler: 5.0.3 + dotenv@17.2.3: {} dunder-proto@1.0.1: @@ -6808,6 +6903,8 @@ snapshots: entities@4.5.0: {} + entities@6.0.1: {} + env-var@7.5.0: optional: true @@ 
-7157,8 +7254,17 @@ snapshots: html-escaper@2.0.2: {} + html-escaper@3.0.3: {} + html-parse-string@0.0.9: {} + htmlparser2@10.0.0: + dependencies: + domelementtype: 2.3.0 + domhandler: 5.0.3 + domutils: 3.2.2 + entities: 6.0.1 + http-errors@2.0.1: dependencies: depd: 2.0.0 @@ -7436,6 +7542,14 @@ snapshots: limiter@1.1.5: {} + linkedom@0.18.12: + dependencies: + css-select: 5.2.2 + cssom: 0.5.0 + html-escaper: 3.0.3 + htmlparser2: 10.0.0 + uhyphen: 0.2.0 + linkify-it@5.0.0: dependencies: uc.micro: 2.1.0 @@ -7741,6 +7855,10 @@ snapshots: set-blocking: 2.0.0 optional: true + nth-check@2.1.1: + dependencies: + boolbase: 1.0.0 + object-assign@4.1.1: {} object-inspect@1.13.4: {} @@ -8537,6 +8655,8 @@ snapshots: dependencies: '@webreflection/alien-signals': 0.3.2 + uhyphen@0.2.0: {} + uint8array-extras@1.5.0: {} undici-types@7.16.0: {} diff --git a/src/agents/tools/web-tools.readability.test.ts b/src/agents/tools/web-tools.readability.test.ts new file mode 100644 index 000000000..75728bdee --- /dev/null +++ b/src/agents/tools/web-tools.readability.test.ts @@ -0,0 +1,49 @@ +import { describe, expect, it } from "vitest"; + +import { extractReadableContent } from "./web-tools.js"; + +const SAMPLE_HTML = ` + + + + Example Article + + + +
+
+

Example Article

+

Main content starts here with enough words to satisfy readability.

+

Second paragraph for a bit more signal.

+
+
+ + +`; + +describe("web fetch readability", () => { + it("extracts readable text", async () => { + const result = await extractReadableContent({ + html: SAMPLE_HTML, + url: "https://example.com/article", + extractMode: "text", + }); + expect(result?.text).toContain("Main content starts here"); + expect(result?.title).toBe("Example Article"); + }); + + it("extracts readable markdown", async () => { + const result = await extractReadableContent({ + html: SAMPLE_HTML, + url: "https://example.com/article", + extractMode: "markdown", + }); + expect(result?.text).toContain("Main content starts here"); + expect(result?.title).toBe("Example Article"); + }); +}); diff --git a/src/agents/tools/web-tools.ts b/src/agents/tools/web-tools.ts index d12e49c20..596179558 100644 --- a/src/agents/tools/web-tools.ts +++ b/src/agents/tools/web-tools.ts @@ -118,6 +118,11 @@ function resolveFetchEnabled(params: { fetch?: WebFetchConfig; sandboxed?: boole return true; } +/** Resolves whether Readability extraction is enabled: an explicit boolean `fetch.readability` is returned as-is; any other value (including a missing config) yields true. */ function resolveFetchReadabilityEnabled(fetch?: WebFetchConfig): boolean { + if (typeof fetch?.readability === "boolean") return fetch.readability; + return true; +} + function resolveSearchApiKey(search?: WebSearchConfig): string | undefined { const fromConfig = search && "apiKey" in search && typeof search.apiKey === "string" ? search.apiKey.trim() : ""; @@ -300,6 +305,37 @@ async function readResponseText(res: Response): Promise { } } +/** Extracts the main article content from an HTML string by running Readability over a linkedom document. `params.html` is the markup to parse, `params.url` is applied as the document base URI on a best-effort basis, and `params.extractMode` selects whitespace-normalized plain text ("text") or markdown rendered via htmlToMarkdown (any other mode). Resolves to `{ text, title? }`, or to null when Readability yields no content or anything in the pipeline throws. */ export async function extractReadableContent(params: { + html: string; + url: string; + extractMode: (typeof EXTRACT_MODES)[number]; +}): Promise<{ text: string; title?: string } | null> { + try { /* Both libraries are loaded with dynamic import(), in parallel, only when this function runs. */ + const [{ Readability }, { parseHTML }] = await Promise.all([ + import("@mozilla/readability"), + import("linkedom"), + ]); + const { document } = parseHTML(params.html); + try { + (document as { baseURI?: string }).baseURI = params.url; + } catch { + // Best-effort base URI for relative links; a failed assignment is ignored.
+ } + const reader = new Readability(document, { charThreshold: 0 }); /* NOTE(review): charThreshold 0 presumably removes Readability's minimum article length so short pages still parse — confirm against the Readability docs. */ + const parsed = reader.parse(); + if (!parsed?.content) return null; /* No article content was extracted. */ + const title = parsed.title || undefined; /* An empty title string becomes undefined. */ + if (params.extractMode === "text") { + const text = normalizeWhitespace(parsed.textContent ?? ""); + return { text, title }; + } + const rendered = htmlToMarkdown(parsed.content); /* Non-text modes: convert the extracted article HTML to markdown. */ + return { text: rendered.text, title: title ?? rendered.title }; /* Prefer Readability's title; otherwise use the one htmlToMarkdown reports. */ + } catch { /* Any failure (import, parse, conversion) is reported as null instead of being thrown. */ + return null; + } +} + async function runWebSearch(params: { query: string; count: number; @@ -377,6 +413,7 @@ async function runWebFetch(params: { timeoutSeconds: number; cacheTtlMs: number; userAgent: string; + readabilityEnabled: boolean; }): Promise> { const cacheKey = normalizeCacheKey( `fetch:${params.url}:${params.extractMode}:${params.maxChars}`, @@ -415,9 +452,25 @@ async function runWebFetch(params: { let title: string | undefined; let text = body; if (contentType.includes("text/html")) { - const parsed = params.extractMode === "text" ? htmlToText(body) : htmlToMarkdown(body); - text = parsed.text; - title = parsed.title; + if (params.readabilityEnabled) { + const readable = await extractReadableContent({ + html: body, + url: res.url || params.url, + extractMode: params.extractMode, + }); + if (readable?.text) { + text = readable.text; + title = readable.title; + } else { + const parsed = params.extractMode === "text" ? htmlToText(body) : htmlToMarkdown(body); + text = parsed.text; + title = parsed.title; + } + } else { + const parsed = params.extractMode === "text" ? 
htmlToText(body) : htmlToMarkdown(body); + text = parsed.text; + title = parsed.title; + } } else if (contentType.includes("application/json")) { try { text = JSON.stringify(JSON.parse(body), null, 2); @@ -490,6 +543,7 @@ export function createWebFetchTool(options?: { }): AnyAgentTool | null { const fetch = resolveFetchConfig(options?.config); if (!resolveFetchEnabled({ fetch, sandboxed: options?.sandboxed })) return null; + const readabilityEnabled = resolveFetchReadabilityEnabled(fetch); const userAgent = (fetch && "userAgent" in fetch && typeof fetch.userAgent === "string" && fetch.userAgent) || `clawdbot/${VERSION}`; @@ -511,6 +565,7 @@ export function createWebFetchTool(options?: { timeoutSeconds: resolveTimeoutSeconds(fetch?.timeoutSeconds, DEFAULT_TIMEOUT_SECONDS), cacheTtlMs: resolveCacheTtlMs(fetch?.cacheTtlMinutes, DEFAULT_CACHE_TTL_MINUTES), userAgent, + readabilityEnabled, }); return jsonResult(result); }, diff --git a/src/config/schema.ts b/src/config/schema.ts index 25bd2c4d1..c9ede8420 100644 --- a/src/config/schema.ts +++ b/src/config/schema.ts @@ -262,6 +262,8 @@ const FIELD_HELP: Record = { "tools.web.fetch.timeoutSeconds": "Timeout in seconds for web_fetch requests.", "tools.web.fetch.cacheTtlMinutes": "Cache TTL in minutes for web_fetch results.", "tools.web.fetch.userAgent": "Override User-Agent header for web_fetch requests.", + "tools.web.fetch.readability": + "Use Readability to extract main content from HTML (fallbacks to basic HTML cleanup).", "channels.slack.allowBots": "Allow bot-authored messages to trigger Slack replies (default: false).", "channels.slack.thread.historyScope": diff --git a/src/config/types.tools.ts b/src/config/types.tools.ts index 51fa53391..562d9e593 100644 --- a/src/config/types.tools.ts +++ b/src/config/types.tools.ts @@ -99,7 +99,7 @@ export type ToolsConfig = { cacheTtlMinutes?: number; }; fetch?: { - /** Enable web fetch tool (default: false). */ + /** Enable web fetch tool (default: true). 
*/ enabled?: boolean; /** Max characters to return from fetched content. */ maxChars?: number; @@ -109,6 +109,8 @@ export type ToolsConfig = { cacheTtlMinutes?: number; /** Override User-Agent header for fetch requests. */ userAgent?: string; + /** Use Readability to extract main content (default: true). */ + readability?: boolean; }; }; audio?: {