Example Article
+Main content starts here with enough words to satisfy readability.
+Second paragraph for a bit more signal.
+diff --git a/CHANGELOG.md b/CHANGELOG.md index 35f9c7b2a..f40342fcf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,7 @@ - CLI/macOS: sync remote SSH target/identity to config and let `gateway status` auto-infer SSH targets (ssh-config aware). - Telegram: scope inline buttons with allowlist default + callback gating in DMs/groups. - Telegram: default reaction notifications to own. +- Tools: improve `web_fetch` extraction using Readability (with fallback). - Heartbeat: tighten prompt guidance + suppress duplicate alerts for 24h. (#980) — thanks @voidserf. - Repo: ignore local identity files to avoid accidental commits. (#1001) — thanks @gerardward2007. - Sessions/Security: add `session.dmScope` for multi-user DM isolation and audit warnings. (#948) — thanks @Alphonse-arianee. @@ -95,7 +96,7 @@ - macOS: resolve gateway token/password using config mode/remote URL, and warn when `launchctl setenv` overrides config. (#1022, #1021) — thanks @kkarimi. - Telegram: allow reply-chain messages to bypass mention gating in groups. (#1038) — thanks @adityashaw2. - Groups: treat replies to the bot as implicit mentions across supported channels. -- Security: bump `tar` to 7.5.3 to fix GHSA-8qq5-rm4j-mr97. +- Security: bump dependency `tar` to 7.5.3 to fix GHSA-8qq5-rm4j-mr97. ## 2026.1.14-1 diff --git a/docs/gateway/configuration.md b/docs/gateway/configuration.md index 5b9a900fd..abf7dce4d 100644 --- a/docs/gateway/configuration.md +++ b/docs/gateway/configuration.md @@ -1709,11 +1709,12 @@ Legacy: `tools.bash` is still accepted as an alias. - `tools.web.search.maxResults` (1–10, default 5) - `tools.web.search.timeoutSeconds` (default 30) - `tools.web.search.cacheTtlMinutes` (default 15) -- `tools.web.fetch.enabled` (default false; sandboxed sessions auto-enable unless set to false) +- `tools.web.fetch.enabled` (default true) - `tools.web.fetch.maxChars` (default 50000) - `tools.web.fetch.timeoutSeconds` (default 30) - `tools.web.fetch.cacheTtlMinutes` (default 15) - `tools.web.fetch.userAgent` (optional override) +- `tools.web.fetch.readability` (default true; disable to use basic HTML cleanup only) `agents.defaults.subagents` configures sub-agent defaults: - `model`: default model for spawned sub-agents (string or `{ primary, fallbacks }`). If omitted, sub-agents inherit the caller’s model unless overridden per agent or per call. diff --git a/docs/tools/web.md b/docs/tools/web.md index 5de9381c0..4e44f72dd 100644 --- a/docs/tools/web.md +++ b/docs/tools/web.md @@ -116,7 +116,8 @@ Fetch a URL and extract readable content. maxChars: 50000, timeoutSeconds: 30, cacheTtlMinutes: 15, - userAgent: "clawdbot/2026.1.15" + userAgent: "clawdbot/2026.1.15", + readability: true } } } @@ -130,7 +131,8 @@ Fetch a URL and extract readable content. - `maxChars` (truncate long pages) Notes: +- `web_fetch` uses Readability (main-content extraction) by default and falls back to basic HTML cleanup if it fails. - `web_fetch` is best-effort extraction; some sites will need the browser tool. - Responses are cached (default 15 minutes) to reduce repeated fetches. - If you use tool profiles/allowlists, add `web_search`/`web_fetch` or `group:web`. - - If the Brave key is missing, `web_search` returns a short setup hint with a docs link. +- If the Brave key is missing, `web_search` returns a short setup hint with a docs link. diff --git a/package.json b/package.json index 9edc69d36..331438b30 100644 --- a/package.json +++ b/package.json @@ -142,6 +142,7 @@ "@mariozechner/pi-ai": "0.46.0", "@mariozechner/pi-coding-agent": "^0.46.0", "@mariozechner/pi-tui": "^0.46.0", + "@mozilla/readability": "^0.6.0", "@sinclair/typebox": "0.34.47", "@slack/bolt": "^4.6.0", "@slack/web-api": "^7.13.0", @@ -162,6 +163,7 @@ "hono": "4.11.4", "jiti": "^2.6.1", "json5": "^2.2.3", + "linkedom": "^0.18.12", "long": "5.3.2", "markdown-it": "^14.1.0", "osc-progress": "^0.2.0", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index bab486595..2b7b18eba 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -40,6 +40,9 @@ importers: '@mariozechner/pi-tui': specifier: ^0.46.0 version: 0.46.0 + '@mozilla/readability': + specifier: ^0.6.0 + version: 0.6.0 '@sinclair/typebox': specifier: 0.34.47 version: 0.34.47 @@ -100,6 +103,9 @@ importers: json5: specifier: ^2.2.3 version: 2.2.3 + linkedom: + specifier: ^0.18.12 + version: 0.18.12 long: specifier: 5.3.2 version: 5.3.2 @@ -1036,6 +1042,10 @@ packages: '@mistralai/mistralai@1.10.0': resolution: {integrity: sha512-tdIgWs4Le8vpvPiUEWne6tK0qbVc+jMenujnvTqOjogrJUsCSQhus0tHTU1avDDh5//Rq2dFgP9mWRAdIEoBqg==} + '@mozilla/readability@0.6.0': + resolution: {integrity: sha512-juG5VWh4qAivzTAeMzvY9xs9HY5rAcr2E4I7tiSSCokRFi7XIZCAu92ZkSTsIj1OPceCifL3cpfteP3pDT9/QQ==} + engines: {node: '>=14.0.0'} + '@napi-rs/wasm-runtime@1.1.1': resolution: {integrity: sha512-p64ah1M1ld8xjWv3qbvFwHiFVWrq1yFvV4f7w+mzaqiR4IlSgkqhcRdHwsGgomwzBH51sRY4NEowLxnaBjcW/A==} @@ -2221,6 +2231,9 @@ packages: resolution: {integrity: sha512-oP5VkATKlNwcgvxi0vM0p/D3n2C3EReYVX+DNYs5TjZFn/oQt2j+4sVJtSMr18pdRr8wjTcBl6LoV+FUwzPmNA==} engines: {node: '>=18'} + boolbase@1.0.0: + resolution: {integrity: sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww==} + bottleneck@2.19.5: resolution: {integrity: sha512-VHiNCbI1lKdl44tGrhNfU3lup0Tj/ZBMJB5/2ZbNXRCPuRCO7ed2mgcK4r17y+KB2EfuYuRaVlwNbAeaWGSpbw==} @@ -2405,6 +2418,16 @@ packages: resolution: {integrity: sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA==} engines: {node: '>= 8'} + css-select@5.2.2: + resolution: {integrity: sha512-TizTzUddG/xYLA3NXodFM0fSbNizXjOKhqiQQwvhlspadZokn1KDy0NZFS0wuEubIYAV5/c1/lAr0TaaFXEXzw==} + + css-what@6.2.2: + resolution: {integrity: sha512-u/O3vwbptzhMs3L1fQE82ZSLHQQfto5gyZzwteVIEyeaY5Fc7R4dapF/BvRoSYFeqfBk4m0V1Vafq5Pjv25wvA==} + engines: {node: '>= 6'} + + cssom@0.5.0: + resolution: {integrity: sha512-iKuQcq+NdHqlAcwUY0o/HL69XQrUaQdMjmStJ8JFmUaiiQErlhrmuigkg/CU4E2J0IyUKUrMAgl36TvN67MqTw==} + curve25519-js@0.0.4: resolution: {integrity: sha512-axn2UMEnkhyDUPWOwVKBMVIzSQy2ejH2xRGy1wq81dqRwApXfIzfbE3hIX0ZRFBIihf/KDqK158DLwESu4AK1w==} @@ -2456,9 +2479,22 @@ packages: docx-preview@0.3.7: resolution: {integrity: sha512-Lav69CTA/IYZPJTsKH7oYeoZjyg96N0wEJMNslGJnZJ+dMUZK85Lt5ASC79yUlD48ecWjuv+rkcmFt6EVPV0Xg==} + dom-serializer@2.0.0: + resolution: {integrity: sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg==} + + domelementtype@2.3.0: + resolution: {integrity: sha512-OLETBj6w0OsagBwdXnPdN0cnMfF9opN69co+7ZrbfPGrdpPVNBUj02spi6B1N7wChLQiPn4CSH/zJvXw56gmHw==} + + domhandler@5.0.3: + resolution: {integrity: sha512-cgwlv/1iFQiFnU96XXgROh8xTeetsnJiDsTc7TYCLFd9+/WNkIqPTxiM/8pSd8VIrhXGTf1Ny1q1hquVqDJB5w==} + engines: {node: '>= 4'} + dompurify@3.3.1: resolution: {integrity: sha512-qkdCKzLNtrgPFP1Vo+98FRzJnBRGe4ffyCea9IwHB1fyxPOeNTHpLKYGd4Uk9xvNoH0ZoOjwZxNptyMwqrId1Q==} + domutils@3.2.2: + resolution: {integrity: sha512-6kZKyUajlDuqlHKVX1w7gyslj9MPIXzIFiz/rGu35uC1wMi+kMhQwGhl4lt9unC9Vb9INnY9Z3/ZA3+FhASLaw==} + dotenv@17.2.3: resolution: {integrity: sha512-JVUnt+DUIzu87TABbhPmNfVdBDt18BLOWjMUFJMSi/Qqg7NTYtabbvSNJGOJ7afbRuv9D/lngizHtP7QyLQ+9w==} engines: {node: '>=12'} @@ -2493,6 +2529,10 @@ packages: resolution: {integrity: sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw==} engines: {node: '>=0.12'} + entities@6.0.1: + resolution: {integrity: sha512-aN97NXWF6AWBTahfVOIrB/NShkzi5H7F9r1s9mD3cDj4Ko5f2qhhVoYMibXF7GlLveb/D2ioWay8lxI97Ven3g==} + engines: {node: '>=0.12'} + env-var@7.5.0: resolution: {integrity: sha512-mKZOzLRN0ETzau2W2QXefbFjo5EF4yWq28OyKb9ICdeNhHJlOE/pHHnz4hdYJ9cNZXcJHo5xN4OT4pzuSHSNvA==} engines: {node: '>=10'} @@ -2767,9 +2807,15 @@ packages: html-escaper@2.0.2: resolution: {integrity: sha512-H2iMtd0I4Mt5eYiapRdIDjp+XzelXQ0tFE4JS7YFwFevXXMmOp9myNrUvCg0D6ws8iqkRPBfKHgbwig1SmlLfg==} + html-escaper@3.0.3: + resolution: {integrity: sha512-RuMffC89BOWQoY0WKGpIhn5gX3iI54O6nRA0yC124NYVtzjmFWBIiFd8M0x+ZdX0P9R4lADg1mgP8C7PxGOWuQ==} + html-parse-string@0.0.9: resolution: {integrity: sha512-wyGnsOolHbNrcb8N6bdJF4EHyzd3zVGCb9/mBxeNjAYBDOZqD7YkqLBz7kXtdgHwNnV8lN/BpSDpsI1zm8Sd8g==} + htmlparser2@10.0.0: + resolution: {integrity: sha512-TwAZM+zE5Tq3lrEHvOlvwgj1XLWQCtaaibSN11Q+gGBAS7Y1uZSWwXXRe4iF6OXnaq1riyQAPFOBtYc77Mxq0g==} + http-errors@2.0.1: resolution: {integrity: sha512-4FbRdAX+bSdmo4AUFuS0WNiPz8NgFt+r8ThgNWmlrjQjt1Q7ZR9+zTlce2859x4KSXrwIsaeTqDoKQmtP8pLmQ==} engines: {node: '>= 0.8'} @@ -3037,6 +3083,15 @@ packages: limiter@1.1.5: resolution: {integrity: sha512-FWWMIEOxz3GwUI4Ts/IvgVy6LPvoMPgjMdQ185nN6psJyBJ4yOpzqm695/h5umdLJg2vW3GR5iG11MAkR2AzJA==} + linkedom@0.18.12: + resolution: {integrity: sha512-jalJsOwIKuQJSeTvsgzPe9iJzyfVaEJiEXl+25EkKevsULHvMJzpNqwvj1jOESWdmgKDiXObyjOYwlUqG7wo1Q==} + engines: {node: '>=16'} + peerDependencies: + canvas: '>= 2' + peerDependenciesMeta: + canvas: + optional: true + linkify-it@5.0.0: resolution: {integrity: sha512-5aHCbzQRADcdP+ATqnDuhhJ/MRIqDkZX5pyjFHRRysS8vZ5AbqGEoFIb6pYHPZ+L/OC2Lc+xT8uHVVR5CAK/wQ==} @@ -3307,6 +3362,9 @@ packages: engines: {node: ^12.13.0 || ^14.15.0 || >=16.0.0} deprecated: This package is no longer supported. + nth-check@2.1.1: + resolution: {integrity: sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w==} + object-assign@4.1.1: resolution: {integrity: sha512-rJgTQnkUnH1sFw8yT6VSU3zD3sWmu6sZhIseY8VX+GRu3P6F7Fu+JNDoXfklElbLJSnc3FUQHVe4cU5hj+BcUg==} engines: {node: '>=0.10.0'} @@ -3982,6 +4040,9 @@ packages: uhtml@5.0.9: resolution: {integrity: sha512-qPyu3vGilaLe6zrjOCD/xezWEHLwdevxmbY3hzyhT25KBDF4F7YYW3YZcL3kylD/6dMoVISHjn8ggV3+9FY+5g==} + uhyphen@0.2.0: + resolution: {integrity: sha512-qz3o9CHXmJJPGBdqzab7qAYuW8kQGKNEuoHFYrBwV6hWIMcpAmxDLXojcHfFr9US1Pe6zUswEIJIbLI610fuqA==} + uint8array-extras@1.5.0: resolution: {integrity: sha512-rvKSBiC5zqCCiDZ9kAOszZcDvdAHwwIKJG33Ykj43OKcWsnmcBRL09YTU4nOeHZ8Y2a7l1MgTd08SBe9A8Qj6A==} engines: {node: '>=18'} @@ -5253,6 +5314,8 @@ snapshots: zod: 3.25.76 zod-to-json-schema: 3.25.1(zod@3.25.76) + '@mozilla/readability@0.6.0': {} + '@napi-rs/wasm-runtime@1.1.1': dependencies: '@emnapi/core': 1.8.1 @@ -6544,6 +6607,8 @@ snapshots: transitivePeerDependencies: - supports-color + boolbase@1.0.0: {} + bottleneck@2.19.5: {} bowser@2.13.1: {} @@ -6745,6 +6810,18 @@ snapshots: shebang-command: 2.0.0 which: 2.0.2 + css-select@5.2.2: + dependencies: + boolbase: 1.0.0 + css-what: 6.2.2 + domhandler: 5.0.3 + domutils: 3.2.2 + nth-check: 2.1.1 + + css-what@6.2.2: {} + + cssom@0.5.0: {} + curve25519-js@0.0.4: {} data-uri-to-buffer@4.0.1: {} @@ -6777,10 +6854,28 @@ snapshots: dependencies: jszip: 3.10.1 + dom-serializer@2.0.0: + dependencies: + domelementtype: 2.3.0 + domhandler: 5.0.3 + entities: 4.5.0 + + domelementtype@2.3.0: {} + + domhandler@5.0.3: + dependencies: + domelementtype: 2.3.0 + dompurify@3.3.1: optionalDependencies: '@types/trusted-types': 2.0.7 + domutils@3.2.2: + dependencies: + dom-serializer: 2.0.0 + domelementtype: 2.3.0 + domhandler: 5.0.3 + dotenv@17.2.3: {} dunder-proto@1.0.1: @@ -6808,6 +6903,8 @@ snapshots: entities@4.5.0: {} + entities@6.0.1: {} + env-var@7.5.0: optional: true @@ -7157,8 +7254,17 @@ snapshots: html-escaper@2.0.2: {} + html-escaper@3.0.3: {} + html-parse-string@0.0.9: {} + htmlparser2@10.0.0: + dependencies: + domelementtype: 2.3.0 + domhandler: 5.0.3 + domutils: 3.2.2 + entities: 6.0.1 + http-errors@2.0.1: dependencies: depd: 2.0.0 @@ -7436,6 +7542,14 @@ snapshots: limiter@1.1.5: {} + linkedom@0.18.12: + dependencies: + css-select: 5.2.2 + cssom: 0.5.0 + html-escaper: 3.0.3 + htmlparser2: 10.0.0 + uhyphen: 0.2.0 + linkify-it@5.0.0: dependencies: uc.micro: 2.1.0 @@ -7741,6 +7855,10 @@ snapshots: set-blocking: 2.0.0 optional: true + nth-check@2.1.1: + dependencies: + boolbase: 1.0.0 + object-assign@4.1.1: {} object-inspect@1.13.4: {} @@ -8537,6 +8655,8 @@ snapshots: dependencies: '@webreflection/alien-signals': 0.3.2 + uhyphen@0.2.0: {} + uint8array-extras@1.5.0: {} undici-types@7.16.0: {} diff --git a/src/agents/tools/web-tools.readability.test.ts b/src/agents/tools/web-tools.readability.test.ts new file mode 100644 index 000000000..75728bdee --- /dev/null +++ b/src/agents/tools/web-tools.readability.test.ts @@ -0,0 +1,49 @@ +import { describe, expect, it } from "vitest"; + +import { extractReadableContent } from "./web-tools.js"; + +const SAMPLE_HTML = ` + +
+ +Main content starts here with enough words to satisfy readability.
+Second paragraph for a bit more signal.
+